Re: [PATCH 2/2] vhost-use-blk: convert to new virtio_delete_queue

2020-02-21 Thread Pan Nengyuan



On 2/21/2020 7:31 PM, Stefan Hajnoczi wrote:
> On Thu, Feb 13, 2020 at 09:28:07AM +0800, pannengy...@huawei.com wrote:
>> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
>> index 2eba8b9db0..ed6a5cc03b 100644
>> --- a/hw/block/vhost-user-blk.c
>> +++ b/hw/block/vhost-user-blk.c
>> @@ -420,9 +420,10 @@ static void vhost_user_blk_device_realize(DeviceState 
>> *dev, Error **errp)
>>  virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
>>  sizeof(struct virtio_blk_config));
>>  
>> +s->virtqs = g_new0(VirtQueue *, s->num_queues);
> 
> Minor point, up to you if you want to change it: the array is fully
> initialized by the for loop in the next line.  There is no need to clear
> the memory first:
> 
> s/g_new0/g_new/
OK, it's fine, I will change it.

Thanks.

> 
>> diff --git a/include/hw/virtio/vhost-user-blk.h 
>> b/include/hw/virtio/vhost-user-blk.h
>> index 108bfadeeb..f68911f6f0 100644
>> --- a/include/hw/virtio/vhost-user-blk.h
>> +++ b/include/hw/virtio/vhost-user-blk.h
>> @@ -37,6 +37,7 @@ typedef struct VHostUserBlk {
>>  struct vhost_inflight *inflight;
>>  VhostUserState vhost_user;
>>  struct vhost_virtqueue *vqs;
>> +VirtQueue **virtqs;
> 
> Both vqs and virtqs exist and are easily confused.  Please rename vqs to
> vhost_vqs.

OK, I will do it.

Thanks.

> 



[PATCH v2 1/1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
This patch introduces support for PMR that has been defined as part of NVMe 1.4
spec. User can now specify a pmr_file which will be mmap'ed into qemu address
space and subsequently in PCI BAR 2. Guest OS can perform mmio read and writes
to the PMR region that will stay persistent accross system reboot.

Signed-off-by: Andrzej Jakowski 
---
 hw/block/nvme.c   | 165 +++-
 hw/block/nvme.h   |   5 ++
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 346 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d28335cbf3..ff7e74d765 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,10 +19,14 @@
  *  -drive file=,if=none,id=
  *  -device nvme,drive=,serial=,id=, \
  *  cmb_size_mb=, \
+ *  [pmr_file=,] \
  *  num_queues=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
+ *
+ * Either cmb or pmr - due to limitation in avaialbe BAR indexes.
+ * pmr_file file needs to be preallocated and power of two in size.
  */
 
 #include "qemu/osdep.h"
@@ -1141,6 +1145,28 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
 NVME_GUEST_ERR(nvme_ub_mmiowr_cmbsz_readonly,
"invalid write to read only CMBSZ, ignored");
 return;
+#ifndef _WIN32
+case 0xE00: /* PMRCAP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrcap_readonly,
+   "invalid write to PMRCAP register, ignored");
+return;
+case 0xE04: /* TODO PMRCTL */
+break;
+case 0xE08: /* PMRSTS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrsts_readonly,
+   "invalid write to PMRSTS register, ignored");
+return;
+case 0xE0C: /* PMREBS */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrebs_readonly,
+   "invalid write to PMREBS register, ignored");
+return;
+case 0xE10: /* PMRSWTP */
+NVME_GUEST_ERR(nvme_ub_mmiowr_pmrswtp_readonly,
+   "invalid write to PMRSWTP register, ignored");
+return;
+case 0xE14: /* TODO PMRMSC */
+ break;
+#endif /* !_WIN32 */
 default:
 NVME_GUEST_ERR(nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -1169,6 +1195,22 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr 
addr, unsigned size)
 }
 
 if (addr < sizeof(n->bar)) {
+#ifndef _WIN32
+/*
+ * When PMRWBM bit 1 is set then read from
+ * from PMRSTS should ensure prior writes
+ * made it to persistent media
+ */
+if (addr == 0xE08 &&
+(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02) >> 1) {
+int ret;
+ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
+if (!ret) {
+NVME_GUEST_ERR(nvme_ub_mmiord_pmrread_barrier,
+   "error while persisting data");
+}
+}
+#endif /* !_WIN32 */
 memcpy(, ptr + addr, size);
 } else {
 NVME_GUEST_ERR(nvme_ub_mmiord_invalid_ofs,
@@ -1303,6 +1345,31 @@ static const MemoryRegionOps nvme_cmb_ops = {
 },
 };
 
+#ifndef _WIN32
+static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
+unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+stn_le_p(>pmrbuf[addr], size, data);
+}
+
+static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
+{
+NvmeCtrl *n = (NvmeCtrl *)opaque;
+return ldn_le_p(>pmrbuf[addr], size);
+}
+
+static const MemoryRegionOps nvme_pmr_ops = {
+.read = nvme_pmr_read,
+.write = nvme_pmr_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.impl = {
+.min_access_size = 1,
+.max_access_size = 8,
+},
+};
+#endif /* !_WIN32 */
+
 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
 {
 NvmeCtrl *n = NVME(pci_dev);
@@ -1332,6 +1399,39 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
 error_setg(errp, "serial property not set");
 return;
 }
+
+#ifndef _WIN32
+if (!n->cmb_size_mb && n->pmr_file) {
+int fd;
+
+n->f_pmr = fopen(n->pmr_file, "r+b");
+if (!n->f_pmr) {
+error_setg(errp, "pmr backend file open error");
+return;
+}
+
+fseek(n->f_pmr, 0L, SEEK_END);
+n->f_pmr_size = ftell(n->f_pmr);
+fseek(n->f_pmr, 0L, SEEK_SET);
+
+/* pmr file size needs to be power of 2 in size */
+if (!n->f_pmr_size || !is_power_of_2(n->f_pmr_size)) {
+error_setg(errp, "pmr backend file size needs to be greater than 0"
+ "and power of 2 in size");
+return;
+}
+
+fd = fileno(n->f_pmr);
+n->pmrbuf = mmap(NULL, n->f_pmr_size,
+ (PROT_READ | PROT_WRITE), MAP_SHARED, fd, 0);
+
+

[PATCH v2 0/1] Enable PMR feature from NVMe 1.4 spec to NVMe driver

2020-02-21 Thread Andrzej Jakowski
Changes since v1:
 - provided support for Bit 1 from PMRWBM register instead of Bit 0 to ensure
   improved performance in virtualized environment [1] (Stefan)

 - added check if pmr size is power of two in size (David)

 - addressed cross compilation build problems reported by CI environment

[1]: 
https://nvmexpress.org/wp-content/uploads/NVM-Express-1_4-2019.06.10-Ratified.pdf
[2]: 
https://lore.kernel.org/qemu-devel/20200218224811.30050-1-andrzej.jakow...@linux.intel.com/
 
---

Persistent Memory Region (PMR) is a new optional feature provided in NVMe 1.4
specification. This patch implements initial support for it in NVMe driver.

Andrzej Jakowski (1):
  block/nvme: introduce PMR support from NVMe 1.4 spec

 hw/block/nvme.c   | 165 +++-
 hw/block/nvme.h   |   5 ++
 hw/block/trace-events |   5 ++
 include/block/nvme.h  | 172 ++
 4 files changed, 346 insertions(+), 1 deletion(-)

-- 
2.21.1




Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
On 2/21/20 12:32 PM, Stefan Hajnoczi wrote:
> Hi Andrzej,
> After having looked at the PMRWBM part of the spec, I think that the
> Bit 1 mode should be implemented for slightly better performance.  Bit
> 0 mode is not well-suited to virtualization for the reasons I
> mentioned in the previous email.
> 
> The spec describes Bit 1 mode as "The completion of a read to the
> PMRSTS register shall ensure that all prior writes to the Persistent
> Memory Region have completed and are persistent".
> 
> Stefan

Make sense -- will incorporate that feedback in second version of patch.



Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
On 2/21/20 11:45 AM, Dr. David Alan Gilbert wrote:
> Isn't there also a requirement that BARs are powers of two? Wouldn't
> you need to ensure the PMR file is a power of 2 in size?
> 
> Dave

Agree, need to make sure it is power of 2.



Re: [PATCH v7 01/11] qapi/error: add (Error **errp) cleaning APIs

2020-02-21 Thread Eric Blake

On 2/21/20 3:20 AM, Vladimir Sementsov-Ogievskiy wrote:


+static inline void warn_report_errp(Error **errp)
+{
+    assert(errp && *errp);
+    warn_report_err(*errp);
+    *errp = NULL;
+}
+
+
  /*
   * Just like error_setg(), except you get to specify the error class.
   * Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is


These appear to be unused apart from the Coccinelle script in PATCH 03.

They are used in the full "[RFC v5 000/126] error: auto propagated
local_err" series.  Options:

1. Pick a few more patches into this part I series, so these guys come
    with users.


It needs some additional effort for this series.. But it's possible. Still,
I think that we at least should not pull out patches which should be in
future series (for example from ppc or block/)..




2. Punt this patch to the first part that has users, along with the
    part of the Coccinelle script that deals with them.


But coccinelle script would be wrong, if we drop this part from it. I 
think,
that after commit which adds coccinelle script, it should work with any 
file,

not only subset of these series.

So, it's probably OK for now to drop these functions, forcing their 
addition if
coccinelle script will be applied where these functions are needed. We 
can, for

example comment these three functions.

Splitting coccinelle script into two parts, which will be in different 
series will

not help any patch-porting processes.


Splitting the coccinelle script across multiple patches is actually 
quite reviewable, and still easy to backport.  Consider this series by 
Philippe:


https://lists.gnu.org/archive/html/qemu-devel/2020-02/msg05554.html

which makes multiple additions to scripts/coccinelle/exec_rw_const.cocci 
over the course of the series.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v7 02/11] error: auto propagated local_err

2020-02-21 Thread Eric Blake

On 2/21/20 3:42 AM, Vladimir Sementsov-Ogievskiy wrote:


+#define ERRP_AUTO_PROPAGATE()  \
+    g_auto(ErrorPropagator) _auto_errp_prop = {.errp = errp};  \
+    errp = ((errp == NULL || *errp == error_fatal) \
+    ? &_auto_errp_prop.local_err : errp)
+
  /*
   * Special error destination to abort on error.
   * See error_setg() and error_propagate() for details.


*errp == error_fatal tests *errp == NULL, which is not what you want.
You need to test errp == _fatal, just like error_handle_fatal().


Oops, great bug) And nobody noticed before) Of course, you are right.


Sorry I missed it in my earlier looks.





Superfluous parenthesis around the first operand of ?:.

Wouldn't

    #define ERRP_AUTO_PROPAGATE()  \
    g_auto(ErrorPropagator) _auto_errp_prop = {.errp = errp};  \
    if (!errp || errp == _fatal) {   \
    errp = &_auto_errp_prop.local_err; \
    }

be clearer?



Hmm, notation with "if" will allow omitting ';' after macro invocation, 
which seems not good..


Then wrap it:

g_auto(ErrorPropagator) _auto_errp_prop = {.errp = errp}; \
do { \
  if (!errp || errp == _fata) {
errp = &_auto_errp_prop.local_err; \
  } \
while (0)


And if I'm not wrong we've already discussed it somewhere in previous 
versions.


The original use of ?: stems from my suggestion on an earlier revision 
when we were still trying to pack everything into two consecutive 
declaration lines, rather than a declaration and a statement (as ?: is 
necessary for conditionals in declarations).  But since then, we decided 
to go with a statement (we require a C99 compiler, so declaration after 
statement is supported by our compiler, even if our coding style 
currently avoids it where possible), so as long as we support 
statements, we might as well go with a legible statement instead of 
insisting on the compact ?: form.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Stefan Hajnoczi
Hi Andrzej,
After having looked at the PMRWBM part of the spec, I think that the
Bit 1 mode should be implemented for slightly better performance.  Bit
0 mode is not well-suited to virtualization for the reasons I
mentioned in the previous email.

The spec describes Bit 1 mode as "The completion of a read to the
PMRSTS register shall ensure that all prior writes to the Persistent
Memory Region have completed and are persistent".

Stefan



Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Dr. David Alan Gilbert
* Stefan Hajnoczi (stefa...@gmail.com) wrote:
> On Tue, Feb 18, 2020 at 03:48:11PM -0700, Andrzej Jakowski wrote:
> > This patch introduces support for PMR that has been defined as part of NVMe 
> > 1.4
> > spec. User can now specify a pmr_file which will be mmap'ed into qemu 
> > address
> > space and subsequently in PCI BAR 2. Guest OS can perform mmio read and 
> > writes
> > to the PMR region that will stay persistent accross system reboot.
> > 
> > Signed-off-by: Andrzej Jakowski 
> > ---
> >  hw/block/nvme.c   | 145 ++-
> >  hw/block/nvme.h   |   5 ++
> >  hw/block/trace-events |   5 ++
> >  include/block/nvme.h  | 172 ++
> >  4 files changed, 326 insertions(+), 1 deletion(-)
> 
> NVDIMM folks, please take a look.  There seems to be commonality here.
> 
> Can this use -object memory-backend-file instead of manually opening and
> mapping a file?
> 
> Also CCing David Gilbert because there is some similarity with the
> vhost-user-fs's DAX Window feature where QEMU mmaps regions of files
> into a BAR.

I guess the biggest difference here is that the read can have the side
effect; in my world I don't have to set read/write/endian ops - I just
map a chunk of memory and use memory_region_add_subregion, so all the
read/writes are native read/writes - I assume that would be a lot
faster - I guess it depends if NVME_PMRCAP_PMRWBM is something constant
you know early on; if you know that you don't need to do side effects in
the read you could do the same trick and avoid the IO ops altogether.


Isn't there also a requirement that BARs are powers of two? Wouldn't
you need to ensure the PMR file is a power of 2 in size?

Dave

> > @@ -1303,6 +1327,38 @@ static const MemoryRegionOps nvme_cmb_ops = {
> >  },
> >  };
> >  
> > +static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
> > +unsigned size)
> > +{
> > +NvmeCtrl *n = (NvmeCtrl *)opaque;
> > +stn_le_p(>pmrbuf[addr], size, data);
> > +}
> > +
> > +static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
> > +{
> > +NvmeCtrl *n = (NvmeCtrl *)opaque;
> > +if (!NVME_PMRCAP_PMRWBM(n->bar.pmrcap)) {
> > +int ret;
> > +ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
> > +if (!ret) {
> > +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrread_barrier,
> > +   "error while persisting data");
> > +}
> > +}
> 
> Why is msync(2) done on memory loads instead of stores?


--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020, 17:50 Dr. David Alan Gilbert 
wrote:

> * Stefan Hajnoczi (stefa...@gmail.com) wrote:
> > On Fri, Feb 21, 2020 at 3:36 PM Andrzej Jakowski
> >  wrote:
> > > On 2/21/20 6:45 AM, Stefan Hajnoczi wrote:
> > > > Why is msync(2) done on memory loads instead of stores?
> > >
> > > This is my interpretation of NVMe spec wording with regards to PMRWBM
> field
> > > which says:
> > >
> > > "The completion of a memory read from any Persistent
> > > Memory Region address ensures that all prior writes to the
> > > Persistent Memory Region have completed and are
> > > persistent."
> >
> > Thanks, I haven't read the PMR section of the spec :).
> >
> > A synchronous operation is bad for virtualization performance.  While
> > the sync may be a cheap operation in hardware, it can be arbitrarily
> > expensive with msync(2).  The vCPU will be stuck until msync(2)
> > completes on the host.
> >
> > It's also a strange design choice since performance will suffer when
> > an unrelated read has to wait for writes to complete.  This is
> > especially problematic for multi-threaded applications or multi-core
> > systems where I guess this case is hit frequently.  Maybe it's so
> > cheap in hardware that it doesn't matter?  But then why didn't NVDIMM
> > use this mechanism?
> >
> > If anyone knows the answer I'd be interested in learning.  But this
> > isn't a criticism of the patch - of course it needs to implement the
> > hardware spec and we can't change it.
>
> Is this coming from the underlying PCIe spec ?
> In PCIe Base 4.0 Rev 0.3 Feb19-2014, section 2.4.1 Transaction ordering,
> there's a Table 2-39 and entry B2a in that table is:
>
>
>   'A Read Request must not pass a Posted Request unless B2b applies.'
>
> and a posted request is defined as a 'Memory Write Request or a Message
> Request'.
>
> ???
>

No, that relates to transaction ordering in PCI, not data persistence in
PMR.  PMR can define whatever persistence semantics it wants.

The completion of the write transaction at the PCI level does not mean data
has to be persistent at the PMR level.

Stefan

>


Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Dr. David Alan Gilbert
* Stefan Hajnoczi (stefa...@gmail.com) wrote:
> On Fri, Feb 21, 2020 at 3:36 PM Andrzej Jakowski
>  wrote:
> > On 2/21/20 6:45 AM, Stefan Hajnoczi wrote:
> > > Why is msync(2) done on memory loads instead of stores?
> >
> > This is my interpretation of NVMe spec wording with regards to PMRWBM field
> > which says:
> >
> > "The completion of a memory read from any Persistent
> > Memory Region address ensures that all prior writes to the
> > Persistent Memory Region have completed and are
> > persistent."
> 
> Thanks, I haven't read the PMR section of the spec :).
> 
> A synchronous operation is bad for virtualization performance.  While
> the sync may be a cheap operation in hardware, it can be arbitrarily
> expensive with msync(2).  The vCPU will be stuck until msync(2)
> completes on the host.
> 
> It's also a strange design choice since performance will suffer when
> an unrelated read has to wait for writes to complete.  This is
> especially problematic for multi-threaded applications or multi-core
> systems where I guess this case is hit frequently.  Maybe it's so
> cheap in hardware that it doesn't matter?  But then why didn't NVDIMM
> use this mechanism?
> 
> If anyone knows the answer I'd be interested in learning.  But this
> isn't a criticism of the patch - of course it needs to implement the
> hardware spec and we can't change it.

Is this coming from the underlying PCIe spec ?
In PCIe Base 4.0 Rev 0.3 Feb19-2014, section 2.4.1 Transaction ordering,
there's a Table 2-39 and entry B2a in that table is:


  'A Read Request must not pass a Posted Request unless B2b applies.'

and a posted request is defined as a 'Memory Write Request or a Message
Request'.

???

Dave

> Stefan
> 
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020 at 3:36 PM Andrzej Jakowski
 wrote:
> On 2/21/20 6:45 AM, Stefan Hajnoczi wrote:
> > Why is msync(2) done on memory loads instead of stores?
>
> This is my interpretation of NVMe spec wording with regards to PMRWBM field
> which says:
>
> "The completion of a memory read from any Persistent
> Memory Region address ensures that all prior writes to the
> Persistent Memory Region have completed and are
> persistent."

Thanks, I haven't read the PMR section of the spec :).

A synchronous operation is bad for virtualization performance.  While
the sync may be a cheap operation in hardware, it can be arbitrarily
expensive with msync(2).  The vCPU will be stuck until msync(2)
completes on the host.

It's also a strange design choice since performance will suffer when
an unrelated read has to wait for writes to complete.  This is
especially problematic for multi-threaded applications or multi-core
systems where I guess this case is hit frequently.  Maybe it's so
cheap in hardware that it doesn't matter?  But then why didn't NVDIMM
use this mechanism?

If anyone knows the answer I'd be interested in learning.  But this
isn't a criticism of the patch - of course it needs to implement the
hardware spec and we can't change it.

Stefan



Re: [PATCH v7 01/11] qapi/error: add (Error **errp) cleaning APIs

2020-02-21 Thread Vladimir Sementsov-Ogievskiy

21.02.2020 19:34, Markus Armbruster wrote:

Vladimir Sementsov-Ogievskiy  writes:


21.02.2020 10:38, Markus Armbruster wrote:

Vladimir Sementsov-Ogievskiy  writes:


Add functions to clean Error **errp: call corresponding Error *err
cleaning function an set pointer to NULL.

New functions:
error_free_errp
error_report_errp
warn_report_errp

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Greg Kurz 
Reviewed-by: Eric Blake 
---

CC: Eric Blake 
CC: Kevin Wolf 
CC: Max Reitz 
CC: Greg Kurz 
CC: Stefano Stabellini 
CC: Anthony Perard 
CC: Paul Durrant 
CC: Stefan Hajnoczi 
CC: "Philippe Mathieu-Daudé" 
CC: Laszlo Ersek 
CC: Gerd Hoffmann 
CC: Stefan Berger 
CC: Markus Armbruster 
CC: Michael Roth 
CC: qemu-block@nongnu.org
CC: xen-de...@lists.xenproject.org

   include/qapi/error.h | 26 ++
   1 file changed, 26 insertions(+)

diff --git a/include/qapi/error.h b/include/qapi/error.h
index ad5b6e896d..d34987148d 100644
--- a/include/qapi/error.h
+++ b/include/qapi/error.h
@@ -309,6 +309,32 @@ void warn_reportf_err(Error *err, const char *fmt, ...)
   void error_reportf_err(Error *err, const char *fmt, ...)
   GCC_FMT_ATTR(2, 3);
   +/*
+ * Functions to clean Error **errp: call corresponding Error *err cleaning
+ * function, then set pointer to NULL.
+ */
+static inline void error_free_errp(Error **errp)
+{
+assert(errp && *errp);
+error_free(*errp);
+*errp = NULL;
+}
+
+static inline void error_report_errp(Error **errp)
+{
+assert(errp && *errp);
+error_report_err(*errp);
+*errp = NULL;
+}
+
+static inline void warn_report_errp(Error **errp)
+{
+assert(errp && *errp);
+warn_report_err(*errp);
+*errp = NULL;
+}
+
+
   /*
* Just like error_setg(), except you get to specify the error class.
* Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is


These appear to be unused apart from the Coccinelle script in PATCH 03.

They are used in the full "[RFC v5 000/126] error: auto propagated
local_err" series.  Options:

1. Pick a few more patches into this part I series, so these guys come
 with users.


It needs some additional effort for this series.. But it's possible. Still,
I think that we at least should not pull out patches which should be in
future series (for example from ppc or block/)..


Yes, we want to keep related stuff together.


Grepping through v5:
  for x in {warn_report_errp,error_report_errp,error_free_errp}; do echo == $x 
==; git grep -l $x | grep -v coccinelle | grep -v 'error\.h'; echo; done
== warn_report_errp ==
block/file-posix.c
hw/ppc/spapr.c
hw/ppc/spapr_caps.c
hw/ppc/spapr_irq.c
hw/vfio/pci.c
net/tap.c
qom/object.c

== error_report_errp ==
hw/block/vhost-user-blk.c
util/oslib-posix.c

== error_free_errp ==
block.c
block/qapi.c
block/sheepdog.c
block/snapshot.c
blockdev.c
chardev/char-socket.c
hw/audio/intel-hda.c
hw/core/qdev-properties.c
hw/pci-bridge/pci_bridge_dev.c
hw/pci-bridge/pcie_pci_bridge.c
hw/scsi/megasas.c
hw/scsi/mptsas.c
hw/usb/hcd-xhci.c
io/net-listener.c
migration/colo.c
qga/commands-posix.c
qga/commands-win32.c
util/qemu-sockets.c

What do you want to add?


PATCH v5 032 uses both error_report_errp() and error_free_errp().
Adding warn_report_errp() without a user is okay with me.  What do you
think?

If there are patches you consider related to 032, feel free to throw
them in.


032 is qga/commands-win32.c and util/oslib-posix.c

Seems that they are wrongly grouped into one patch.

qga/commands-win32.c matches qga/ (Michael Roth)
and  util/oslib-posix.c matches POSIX (Paolo Bonzini)

So, it should be two separate patches anyway.

For [1.] I only afraid that we'll have to wait for maintainers, who were
not interested in previous iterations, to review these new patches..





2. Punt this patch to the first part that has users, along with the
 part of the Coccinelle script that deals with them.


But coccinelle script would be wrong, if we drop this part from it. I think,
that after commit which adds coccinelle script, it should work with any file,
not only subset of these series.

So, it's probably OK for now to drop these functions, forcing their addition if
coccinelle script will be applied where these functions are needed. We can, for
example comment these three functions.

Splitting coccinelle script into two parts, which will be in different series 
will
not help any patch-porting processes.

Moreover, this will create dependencies between future series updating other 
files.

So, I don't like [2.]..


And it's likely more work than 1.


3. Do nothing: accept the functions without users.


OK for me)



I habitually dislike 3., but reviewing the rest of this series might
make me override that dislike.

[...]




--
Best regards,
Vladimir



Re: [RFC PATCH v3 00/27] Add subcluster allocation to qcow2

2020-02-21 Thread Max Reitz
On 22.12.19 12:36, Alberto Garcia wrote:
> Hi,
> 
> here's the new version of the patches to add subcluster allocation
> support to qcow2.
> 
> Please refer to the cover letter of the first version for a full
> description of the patches:
> 
>https://lists.gnu.org/archive/html/qemu-block/2019-10/msg00983.html
> 
> This version fixes many of the problems highlighted by Max. I decided
> not to replace completely the cluster logic with subcluster logic in
> all cases because I felt that sometimes it only complicated the code.
> Let's see what you think :-)

Looks good overall. :)

So now I wonder on what your plans are after this series.  Here are some
things that come to my mind, and I wonder whether you plan to address
them or whether there are more things to do still:

- In v2, you had a patch for preallocation support with backing files.
It didn’t quite work, which is why I think you dropped it for now (why
not, it isn’t crucial).

- There is a TODO on subcluster zeroing.

- I think adding support to amend for switching extended_l2 on or off
would make sense.  But maybe it’s too complicated to be worth the effort.

- As I noted in v2, I think it’d be great if it were possible to run the
iotests with -o extended_l2=on.  But I suppose this kind of depends on
me adding data_file support to the Python tests first...

Max



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 27/27] iotests: Add tests for qcow2 images with extended L2 entries

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> Signed-off-by: Alberto Garcia 
> ---
>  tests/qemu-iotests/271 | 256 +
>  tests/qemu-iotests/271.out | 208 ++
>  tests/qemu-iotests/group   |   1 +
>  3 files changed, 465 insertions(+)
>  create mode 100755 tests/qemu-iotests/271
>  create mode 100644 tests/qemu-iotests/271.out

Currently, you’re using the reference output to verify the results.  I
find this rather difficult.

Can this not be written in a way that the test itself verifies the
results?  I realize bit manipulation in bash is hard, which is why I
wonder whether Python may be better suited for the job.

Or maybe at least there could be some way to produce a hexdump-like
result from some more abstract description on what to expect and then
compare the strings.

I suppose I can live with how it is, but I feel like I’d have to do
something in my head that could be better done by a script.

Max



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 26/27] qcow2: Add subcluster support to qcow2_measure()

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> Extended L2 entries are bigger than normal L2 entries so this has an
> impact on the amount of metadata needed for a qcow2 file.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2.c | 19 ---
>  1 file changed, 12 insertions(+), 7 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 25/27] qcow2: Add the 'extended_l2' option and the QCOW2_INCOMPAT_EXTL2 bit

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> Now that the implementation of subclusters is complete we can finally
> add the necessary options to create and read images with this feature,
> which we call "extended L2 entries".
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2.c|  65 ++--
>  block/qcow2.h|   8 ++-
>  include/block/block_int.h|   1 +
>  qapi/block-core.json |   7 +++
>  tests/qemu-iotests/031.out   |   8 +--
>  tests/qemu-iotests/036.out   |   4 +-
>  tests/qemu-iotests/049.out   | 102 +++
>  tests/qemu-iotests/060.out   |   1 +
>  tests/qemu-iotests/061.out   |  20 +++---
>  tests/qemu-iotests/065   |  18 --
>  tests/qemu-iotests/082.out   |  48 ---
>  tests/qemu-iotests/085.out   |  38 ++--
>  tests/qemu-iotests/144.out   |   4 +-
>  tests/qemu-iotests/182.out   |   2 +-
>  tests/qemu-iotests/185.out   |   8 +--
>  tests/qemu-iotests/198.out   |   2 +
>  tests/qemu-iotests/206.out   |   4 ++
>  tests/qemu-iotests/242.out   |   5 ++
>  tests/qemu-iotests/255.out   |   8 +--
>  tests/qemu-iotests/273.out   |   9 ++-
>  tests/qemu-iotests/common.filter |   1 +
>  21 files changed, 245 insertions(+), 118 deletions(-)

With the .qapi versions adjusted to match $next_release, and with the
bit fixed to be at index 4 instead of 3 (and with the iotests rebases
that always become necessary[1]):

Reviewed-by: Max Reitz 

[1] e.g. 280 fails now – I suppose qemu_img_log should filter just like
the bash tests do, but then again, I’d rather drop that function
altogether anyway
(https://lists.nongnu.org/archive/html/qemu-block/2019-10/msg00136.html)



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v7 01/11] qapi/error: add (Error **errp) cleaning APIs

2020-02-21 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 21.02.2020 10:38, Markus Armbruster wrote:
>> Vladimir Sementsov-Ogievskiy  writes:
>>
>>> Add functions to clean Error **errp: call corresponding Error *err
>>> cleaning function an set pointer to NULL.
>>>
>>> New functions:
>>>error_free_errp
>>>error_report_errp
>>>warn_report_errp
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> Reviewed-by: Greg Kurz 
>>> Reviewed-by: Eric Blake 
>>> ---
>>>
>>> CC: Eric Blake 
>>> CC: Kevin Wolf 
>>> CC: Max Reitz 
>>> CC: Greg Kurz 
>>> CC: Stefano Stabellini 
>>> CC: Anthony Perard 
>>> CC: Paul Durrant 
>>> CC: Stefan Hajnoczi 
>>> CC: "Philippe Mathieu-Daudé" 
>>> CC: Laszlo Ersek 
>>> CC: Gerd Hoffmann 
>>> CC: Stefan Berger 
>>> CC: Markus Armbruster 
>>> CC: Michael Roth 
>>> CC: qemu-block@nongnu.org
>>> CC: xen-de...@lists.xenproject.org
>>>
>>>   include/qapi/error.h | 26 ++
>>>   1 file changed, 26 insertions(+)
>>>
>>> diff --git a/include/qapi/error.h b/include/qapi/error.h
>>> index ad5b6e896d..d34987148d 100644
>>> --- a/include/qapi/error.h
>>> +++ b/include/qapi/error.h
>>> @@ -309,6 +309,32 @@ void warn_reportf_err(Error *err, const char *fmt, ...)
>>>   void error_reportf_err(Error *err, const char *fmt, ...)
>>>   GCC_FMT_ATTR(2, 3);
>>>   +/*
>>> + * Functions to clean Error **errp: call corresponding Error *err cleaning
>>> + * function, then set pointer to NULL.
>>> + */
>>> +static inline void error_free_errp(Error **errp)
>>> +{
>>> +assert(errp && *errp);
>>> +error_free(*errp);
>>> +*errp = NULL;
>>> +}
>>> +
>>> +static inline void error_report_errp(Error **errp)
>>> +{
>>> +assert(errp && *errp);
>>> +error_report_err(*errp);
>>> +*errp = NULL;
>>> +}
>>> +
>>> +static inline void warn_report_errp(Error **errp)
>>> +{
>>> +assert(errp && *errp);
>>> +warn_report_err(*errp);
>>> +*errp = NULL;
>>> +}
>>> +
>>> +
>>>   /*
>>>* Just like error_setg(), except you get to specify the error class.
>>>* Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is
>>
>> These appear to be unused apart from the Coccinelle script in PATCH 03.
>>
>> They are used in the full "[RFC v5 000/126] error: auto propagated
>> local_err" series.  Options:
>>
>> 1. Pick a few more patches into this part I series, so these guys come
>> with users.
>
> It needs some additional effort for this series.. But it's possible. Still,
> I think that we at least should not pull out patches which should be in
> future series (for example from ppc or block/)..

Yes, we want to keep related stuff together.

> Grepping through v5:
>  for x in {warn_report_errp,error_report_errp,error_free_errp}; do echo == $x 
> ==; git grep -l $x | grep -v coccinelle | grep -v 'error\.h'; echo; done
> == warn_report_errp ==
> block/file-posix.c
> hw/ppc/spapr.c
> hw/ppc/spapr_caps.c
> hw/ppc/spapr_irq.c
> hw/vfio/pci.c
> net/tap.c
> qom/object.c
>
> == error_report_errp ==
> hw/block/vhost-user-blk.c
> util/oslib-posix.c
>
> == error_free_errp ==
> block.c
> block/qapi.c
> block/sheepdog.c
> block/snapshot.c
> blockdev.c
> chardev/char-socket.c
> hw/audio/intel-hda.c
> hw/core/qdev-properties.c
> hw/pci-bridge/pci_bridge_dev.c
> hw/pci-bridge/pcie_pci_bridge.c
> hw/scsi/megasas.c
> hw/scsi/mptsas.c
> hw/usb/hcd-xhci.c
> io/net-listener.c
> migration/colo.c
> qga/commands-posix.c
> qga/commands-win32.c
> util/qemu-sockets.c
>
> What do you want to add?

PATCH v5 032 uses both error_report_errp() and error_free_errp().
Adding warn_report_errp() without a user is okay with me.  What do you
think?

If there are patches you consider related to 032, feel free to throw
them in.

>>
>> 2. Punt this patch to the first part that has users, along with the
>> part of the Coccinelle script that deals with them.
>
> But coccinelle script would be wrong, if we drop this part from it. I think,
> that after commit which adds coccinelle script, it should work with any file,
> not only subset of these series.
>
> So, it's probably OK for now to drop these functions, forcing their addition 
> if
> coccinelle script will be applied where these functions are needed. We can, 
> for
> example comment these three functions.
>
> Splitting coccinelle script into two parts, which will be in different series 
> will
> not help any patch-porting processes.
>
> Moreover, this will create dependencies between future series updating other 
> files.
>
> So, I don't like [2.]..

And it's likely more work than 1.

>> 3. Do nothing: accept the functions without users.
>
> OK for me)
>
>>
>> I habitually dislike 3., but reviewing the rest of this series might
>> make me override that dislike.
[...]




[PATCH v2 0/2] block/curl: Improve HTTP header parsing

2020-02-21 Thread David Edmondson
An HTTP object store of my acquaintance returns "accept-ranges: bytes"
(all lower case) as a header, causing the QEMU curl backend to refuse
to talk to it. RFC 7230 says that HTTP headers are case insensitive,
so update the curl backend accordingly.

At the same time, allow for arbitrary white space around the HTTP
header field value, as required by the RFC.

v2:
- strncasecmp -> g_ascii_strncasecmp (Philippe).
- isspace -> g_ascii_isspace, for good measure.

David Edmondson (2):
  block/curl: HTTP header fields allow whitespace around values
  block/curl: HTTP header field names are case insensitive

 block/curl.c | 31 +++
 1 file changed, 27 insertions(+), 4 deletions(-)

-- 
2.24.1




[PATCH v2 2/2] block/curl: HTTP header field names are case insensitive

2020-02-21 Thread David Edmondson
RFC 7230 section 3.2 indicates that HTTP header field names are case
insensitive.

Signed-off-by: David Edmondson 
---
 block/curl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index f9ffb7f4e2bf..1421e8fb9815 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -216,11 +216,11 @@ static size_t curl_header_cb(void *ptr, size_t size, 
size_t nmemb, void *opaque)
 size_t realsize = size * nmemb;
 const char *header = (char *)ptr;
 const char *end = header + realsize;
-const char *accept_ranges = "Accept-Ranges:";
+const char *accept_ranges = "accept-ranges:";
 const char *bytes = "bytes";
 
 if (realsize >= strlen(accept_ranges)
-&& strncmp(header, accept_ranges, strlen(accept_ranges)) == 0) {
+&& g_ascii_strncasecmp(header, accept_ranges, strlen(accept_ranges)) 
== 0) {
 
 char *p = strchr(header, ':') + 1;
 
-- 
2.24.1




[PATCH v2 1/2] block/curl: HTTP header fields allow whitespace around values

2020-02-21 Thread David Edmondson
RFC 7230 section 3.2 indicates that whitespace is permitted between
the field name and field value and after the field value.

Signed-off-by: David Edmondson 
---
 block/curl.c | 31 +++
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/block/curl.c b/block/curl.c
index f86299378e38..f9ffb7f4e2bf 100644
--- a/block/curl.c
+++ b/block/curl.c
@@ -214,11 +214,34 @@ static size_t curl_header_cb(void *ptr, size_t size, 
size_t nmemb, void *opaque)
 {
 BDRVCURLState *s = opaque;
 size_t realsize = size * nmemb;
-const char *accept_line = "Accept-Ranges: bytes";
+const char *header = (char *)ptr;
+const char *end = header + realsize;
+const char *accept_ranges = "Accept-Ranges:";
+const char *bytes = "bytes";
 
-if (realsize >= strlen(accept_line)
-&& strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) {
-s->accept_range = true;
+if (realsize >= strlen(accept_ranges)
+&& strncmp(header, accept_ranges, strlen(accept_ranges)) == 0) {
+
+char *p = strchr(header, ':') + 1;
+
+/* Skip whitespace between the header name and value. */
+while (p < end && *p && g_ascii_isspace(*p)) {
+p++;
+}
+
+if (end - p >= strlen(bytes)
+&& strncmp(p, bytes, strlen(bytes)) == 0) {
+
+/* Check that there is nothing but whitespace after the value. */
+p += strlen(bytes);
+while (p < end && *p && g_ascii_isspace(*p)) {
+p++;
+}
+
+if (p == end || !*p) {
+s->accept_range = true;
+}
+}
 }
 
 return realsize;
-- 
2.24.1




Re: [PATCH v7 02/11] error: auto propagated local_err

2020-02-21 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> 21.02.2020 12:19, Markus Armbruster wrote:
>> Vladimir Sementsov-Ogievskiy  writes:
>>
>>> Here is introduced ERRP_AUTO_PROPAGATE macro, to be used at start of
>>> functions with an errp OUT parameter.
>>>
>>> It has three goals:
>>>
>>> 1. Fix issue with error_fatal and error_prepend/error_append_hint: user
>>> can't see this additional information, because exit() happens in
>>> error_setg earlier than information is added. [Reported by Greg Kurz]
>>>
>>> 2. Fix issue with error_abort and error_propagate: when we wrap
>>> error_abort by local_err+error_propagate, the resulting coredump will
>>> refer to error_propagate and not to the place where error happened.
>>> (the macro itself doesn't fix the issue, but it allows us to [3.] drop
>>> the local_err+error_propagate pattern, which will definitely fix the
>>> issue) [Reported by Kevin Wolf]
>>>
>>> 3. Drop local_err+error_propagate pattern, which is used to workaround
>>> void functions with errp parameter, when caller wants to know resulting
>>> status. (Note: actually these functions could be merely updated to
>>> return int error code).
>>>
>>> To achieve these goals, later patches will add invocations
>>> of this macro at the start of functions with either use
>>> error_prepend/error_append_hint (solving 1) or which use
>>> local_err+error_propagate to check errors, switching those
>>> functions to use *errp instead (solving 2 and 3).
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> Reviewed-by: Greg Kurz 
>>> Reviewed-by: Eric Blake 
>>> ---
>>>
>>> CC: Eric Blake 
>>> CC: Kevin Wolf 
>>> CC: Max Reitz 
>>> CC: Greg Kurz 
>>> CC: Stefano Stabellini 
>>> CC: Anthony Perard 
>>> CC: Paul Durrant 
>>> CC: Stefan Hajnoczi 
>>> CC: "Philippe Mathieu-Daudé" 
>>> CC: Laszlo Ersek 
>>> CC: Gerd Hoffmann 
>>> CC: Stefan Berger 
>>> CC: Markus Armbruster 
>>> CC: Michael Roth 
>>> CC: qemu-block@nongnu.org
>>> CC: xen-de...@lists.xenproject.org
>>>
>>>   include/qapi/error.h | 83 +++-
>>>   1 file changed, 82 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/qapi/error.h b/include/qapi/error.h
>>> index d34987148d..b9452d4806 100644
>>> --- a/include/qapi/error.h
>>> +++ b/include/qapi/error.h
>>> @@ -78,7 +78,7 @@
>>>* Call a function treating errors as fatal:
>>>* foo(arg, _fatal);
>>>*
>>> - * Receive an error and pass it on to the caller:
>>> + * Receive an error and pass it on to the caller (DEPRECATED*):
>>>* Error *err = NULL;
>>>* foo(arg, );
>>>* if (err) {
>>> @@ -98,6 +98,50 @@
>>>* foo(arg, errp);
>>>* for readability.
>>>*
>>> + * DEPRECATED* This pattern is deprecated now, the use ERRP_AUTO_PROPAGATE 
>>> macro
>>> + * instead (defined below).
>>> + * It's deprecated because of two things:
>>> + *
>>> + * 1. Issue with error_abort and error_propagate: when we wrap error_abort 
>>> by
>>> + * local_err+error_propagate, the resulting coredump will refer to
>>> + * error_propagate and not to the place where error happened.
>>> + *
>>> + * 2. A lot of extra code of the same pattern
>>> + *
>>> + * How to update old code to use ERRP_AUTO_PROPAGATE?
>>> + *
>>> + * All you need is to add ERRP_AUTO_PROPAGATE() invocation at function 
>>> start,
>>> + * than you may safely dereference errp to check errors and do not need any
>>> + * additional local Error variables or calls to error_propagate().
>>> + *
>>> + * Example:
>>> + *
>>> + * old code
>>> + *
>>> + * void fn(..., Error **errp) {
>>> + * Error *err = NULL;
>>> + * foo(arg, );
>>> + * if (err) {
>>> + * handle the error...
>>> + * error_propagate(errp, err);
>>> + * return;
>>> + * }
>>> + * ...
>>> + * }
>>> + *
>>> + * updated code
>>> + *
>>> + * void fn(..., Error **errp) {
>>> + * ERRP_AUTO_PROPAGATE();
>>> + * foo(arg, errp);
>>> + * if (*errp) {
>>> + * handle the error...
>>> + * return;
>>> + * }
>>> + * ...
>>> + * }
>>> + *
>>> + *
>>>* Receive and accumulate multiple errors (first one wins):
>>>* Error *err = NULL, *local_err = NULL;
>>>* foo(arg, );
>>
>> Let's explain what should be done *first*, and only then talk about the
>> deprecated pattern and how to convert it to current usage.
>>
>>> @@ -348,6 +392,43 @@ void error_set_internal(Error **errp,
>>>   ErrorClass err_class, const char *fmt, ...)
>>>   GCC_FMT_ATTR(6, 7);
>>>   +typedef struct ErrorPropagator {
>>> +Error *local_err;
>>> +Error **errp;
>>> +} ErrorPropagator;
>>> +
>>> +static inline void error_propagator_cleanup(ErrorPropagator *prop)
>>> +{
>>> +error_propagate(prop->errp, prop->local_err);
>>> +}
>>> +
>>> +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(ErrorPropagator, 
>>> error_propagator_cleanup);
>>> +
>>> +/*
>>> + * ERRP_AUTO_PROPAGATE
>>> + *
>>> + * 

Re: [RFC PATCH v3 24/27] qcow2: Restrict qcow2_co_pwrite_zeroes() to full clusters only

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> Ideally it should be possible to zero individual subclusters using
> this function, but this is currently not implemented.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2.c | 6 ++
>  1 file changed, 6 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 23/27] qcow2: Add subcluster support to handle_alloc_space()

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> The bdrv_co_pwrite_zeroes() call here fills complete clusters with
> zeroes, but it can happen that some subclusters are not part of the
> write request or the copy-on-write. This patch makes sure that only
> the affected subclusters are overwritten.
> 
> A potential improvement would be to also fill with zeroes the other
> subclusters if we can guarantee that we are not overwriting existing
> data. However this would waste more disk space, so we should first
> evaluate if it's really worth doing.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 22/27] qcow2: Clear the L2 bitmap when allocating a compressed cluster

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> Compressed clusters always have the bitmap part of the extended L2
> entry set to 0.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 3 +++
>  1 file changed, 3 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 21/27] qcow2: Update L2 bitmap in qcow2_alloc_cluster_link_l2()

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> The L2 bitmap needs to be updated after each write to indicate what
> new subclusters are now allocated.
> 
> This needs to happen even if the cluster was already allocated and the
> L2 entry was otherwise valid.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 17 +
>  1 file changed, 17 insertions(+)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 0a40944667..ed291a4042 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -986,6 +986,23 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
> QCowL2Meta *m)
>  
>  set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_COPIED |
>   (cluster_offset + (i << s->cluster_bits)));
> +
> +/* Update bitmap with the subclusters that were just written */
> +if (has_subclusters(s)) {
> +unsigned written_from = m->cow_start.offset;
> +unsigned written_to = m->cow_end.offset + m->cow_end.nb_bytes ?:
> +m->nb_clusters << s->cluster_bits;

I suppose we could also calculate both at the beginning of the function
(I’m not sure whether the compiler can optimize these calculations to
happen only once if we don’t).

> +uint64_t l2_bitmap = get_l2_bitmap(s, l2_slice, l2_index + i);
> +int sc;
> +for (sc = 0; sc < s->subclusters_per_cluster; sc++) {
> +int sc_off = i * s->cluster_size + sc * s->subcluster_size;
> +if (sc_off >= written_from && sc_off < written_to) {
> +l2_bitmap |= QCOW_OFLAG_SUB_ALLOC(sc);
> +l2_bitmap &= ~QCOW_OFLAG_SUB_ZERO(sc);

Works, but maybe a QCOW_OFLAG_SUB_MASK(sc) would be better for:

l2_bitmap &= ~QCOW_OFLAG_SUB_MASK(sc);
l2_bitmap |= QCOW_OFLAG_SUB_ALLOC(sc);

Nothing wrong though, so:

Reviewed-by: Max Reitz 

> +}
> +}
> +set_l2_bitmap(s, l2_slice, l2_index + i, l2_bitmap);
> +}
>   }
>  
>  
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Andrzej Jakowski
On 2/21/20 6:45 AM, Stefan Hajnoczi wrote:
> Why is msync(2) done on memory loads instead of stores?

This is my interpretation of NVMe spec wording with regards to PMRWBM field
which says:

"The completion of a memory read from any Persistent
Memory Region address ensures that all prior writes to the
Persistent Memory Region have completed and are
persistent."



Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Paolo Bonzini
On 21/02/20 16:29, Stefan Hajnoczi wrote:
>> Yes.  Let's keep the Q*_REMOVE cleanup on the todo list.  I'd keep
>> Q*_SAFE_REMOVE, but clear the pointer unconditionally in Q*_REMOVE so
>> that we can have something like Q*_IN_LIST too.
> QLIST_IS_INSERTED() is part of this patch series, although I can rename
> it to Q*_IN_LIST() and cover all linked list variants. :)

Yes I meant having it for all variants.  I remembered you adding it but
not the exact spelling!

Paolo




Re: [PATCH 0/5] aio-posix: towards an O(1) event loop

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 14, 2020 at 05:17:07PM +, Stefan Hajnoczi wrote:
> This patch series makes AioHandler deletion and dispatch O(1) with respect to
> the total number of registered handlers.  The event loop has scalability
> problems when many AioHandlers are registered because it is O(n).  Linux
> epoll(7) is used to avoid scanning over all pollfds but parts of the code 
> still
> scan all AioHandlers.
> 
> This series reduces QEMU CPU utilization and therefore increases IOPS,
> especially for guests that have many devices.  It was tested with 32 vCPUs, 2
> virtio-blk,num-queues=1,iothread=iothread1, and 99
> virtio-blk,num-queues=32,iothread=iothread1 devices.  Using an IOThread is
> necessary because this series does not improve the glib main loop, a non-goal
> since the glib API is inherently O(n).
> 
> AioContext polling remains O(n) and will be addressed in a separate patch
> series.  This patch series increases IOPS from 260k to 300k when AioContext
> polling is commented out
> (rw=randread,bs=4k,iodepth=32,ioengine=libaio,direct=1).
> 
> Stefan Hajnoczi (5):
>   aio-posix: fix use after leaving scope in aio_poll()
>   aio-posix: don't pass ns timeout to epoll_wait()
>   qemu/queue.h: add QLIST_SAFE_REMOVE()
>   aio-posix: make AioHandler deletion O(1)
>   aio-posix: make AioHandler dispatch O(1) with epoll
> 
>  block.c  |   5 +-
>  chardev/spice.c  |   4 +-
>  include/block/aio.h  |   6 +-
>  include/qemu/queue.h |  17 +
>  util/aio-posix.c | 172 +--
>  5 files changed, 141 insertions(+), 63 deletions(-)
> 
> -- 
> 2.24.1
> 

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020 at 04:04:10PM +0100, Paolo Bonzini wrote:
> On 21/02/20 15:47, Stefan Hajnoczi wrote:
> >>>   QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's 
> >>> list */
> >>>   ^ would cause corruption if node->node_ready was stale!
> >>>
> >>> Would you like me to add a comment?
> >> No, it's okay.
> > Are you happy with this series?
> 
> Yes.  Let's keep the Q*_REMOVE cleanup on the todo list.  I'd keep
> Q*_SAFE_REMOVE, but clear the pointer unconditionally in Q*_REMOVE so
> that we can have something like Q*_IN_LIST too.

QLIST_IS_INSERTED() is part of this patch series, although I can rename
it to Q*_IN_LIST() and cover all linked list variants. :)

> > Shall I include it in my next pull request or do you want to merge it?
> 
> No, it's yours.

Thanks!

Stefan


signature.asc
Description: PGP signature


Re: [RFC PATCH v3 12/27] qcow2: Replace QCOW2_CLUSTER_* with QCOW2_SUBCLUSTER_*

2020-02-21 Thread Alberto Garcia
On Fri 21 Feb 2020 12:35:55 PM CET, Max Reitz wrote:
>> @@ -2223,22 +2227,23 @@ static coroutine_fn int 
>> qcow2_co_preadv_part(BlockDriverState *bs,
>>  }
>>  
>>  qemu_co_mutex_lock(>lock);
>> -ret = qcow2_get_cluster_offset(bs, offset, _bytes, 
>> _offset);
>> +ret = qcow2_get_cluster_offset(bs, offset, _bytes,
>> +   _offset, );
>
> I wonder whether this is kind of a bug fix here.  It’s entirely possible
> that @ret isn’t set after this, and then we get to the “out” label,
> which has a check on “if (ret == 0)”.

I think that in order to get to "if (ret == 0)" you would first need to
run aio_task_pool_new(), and that codepath guarantees that @ret is set.

Berto



Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Paolo Bonzini
On 21/02/20 15:47, Stefan Hajnoczi wrote:
>>>   QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's 
>>> list */
>>>   ^ would cause corruption if node->node_ready was stale!
>>>
>>> Would you like me to add a comment?
>> No, it's okay.
> Are you happy with this series?

Yes.  Let's keep the Q*_REMOVE cleanup on the todo list.  I'd keep
Q*_SAFE_REMOVE, but clear the pointer unconditionally in Q*_REMOVE so
that we can have something like Q*_IN_LIST too.

> Shall I include it in my next pull request or do you want to merge it?

No, it's yours.

Paolo




Re: [RFC PATCH v3 20/27] qcow2: Fix offset calculation in handle_dependencies()

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> l2meta_cow_start() and l2meta_cow_end() are not necessarily
> cluster-aligned if the image has subclusters, so update the
> calculation of old_start and old_end to guarantee that no two requests
> try to write on the same cluster.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 19/27] qcow2: Add subcluster support to expand_zero_clusters_in_l1()

2020-02-21 Thread Max Reitz
On 22.12.19 12:37, Alberto Garcia wrote:
> Two changes are needed in order to add subcluster support to this
> function: deallocated clusters must have their bitmaps cleared, and
> expanded clusters must have all the "subcluster allocated" bits set.

Not really, to have real subcluster support it would need to be
expand_zero_subclusters_in_l1().  Right now it can only deal with full
zero clusters, which will actually never happen for images with subclusters.

As noted in v2, this function is only called when downgrading qcow2
images to v2.  It kind of made sense to just call set_l2_bitmap() in v2,
but now with the if () conditional...  I suppose it may make more sense
to assert that the image does not have subclusters at the beginning of
the function and be done with it.

OTOH, well, this does make ensuring that we have subcluster “support”
everywhere a bit easier because this way all set_l2_entry() calls are
accompanied by an “if (subclusters) { set_l2_bitmap() }” part.

But it is dead code.

Max

> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
> index 207f670c94..ede75138d2 100644
> --- a/block/qcow2-cluster.c
> +++ b/block/qcow2-cluster.c
> @@ -2054,6 +2054,9 @@ static int expand_zero_clusters_in_l1(BlockDriverState 
> *bs, uint64_t *l1_table,
>  /* not backed; therefore we can simply deallocate the
>   * cluster */
>  set_l2_entry(s, l2_slice, j, 0);
> +if (has_subclusters(s)) {
> +set_l2_bitmap(s, l2_slice, j, 0);
> +}
>  l2_dirty = true;
>  continue;
>  }
> @@ -2120,6 +2123,9 @@ static int expand_zero_clusters_in_l1(BlockDriverState 
> *bs, uint64_t *l1_table,
>  } else {
>  set_l2_entry(s, l2_slice, j, offset);
>  }
> +if (has_subclusters(s)) {
> +set_l2_bitmap(s, l2_slice, j, QCOW_L2_BITMAP_ALL_ALLOC);
> +}
>  l2_dirty = true;
>  }
>  
> 




signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 18/27] qcow2: Add subcluster support to check_refcounts_l2()

2020-02-21 Thread Max Reitz
On 22.12.19 12:36, Alberto Garcia wrote:
> Setting the QCOW_OFLAG_ZERO bit of the L2 entry is forbidden if an
> image has subclusters. Instead, the individual 'all zeroes' bits must
> be used.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-refcount.c | 9 +++--
>  1 file changed, 7 insertions(+), 2 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020 at 02:06:26PM +0100, Paolo Bonzini wrote:
> On 21/02/20 13:59, Stefan Hajnoczi wrote:
> >   /* Add a handler to a ready list */
> >   static void add_ready_handler(AioHandlerList *ready_list,
> > AioHandler *node,
> > int revents)
> >   {
> >   QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's 
> > list */
> >   ^ would cause corruption if node->node_ready was stale!
> > 
> > Would you like me to add a comment?
> No, it's okay.

Are you happy with this series?

Shall I include it in my next pull request or do you want to merge it?

Thanks,
Stefan


signature.asc
Description: PGP signature


Re: [RFC PATCH v3 17/27] qcow2: Add subcluster support to discard_in_l2_slice()

2020-02-21 Thread Max Reitz
On 22.12.19 12:36, Alberto Garcia wrote:
> Setting the QCOW_OFLAG_ZERO bit of the L2 entry is forbidden if an
> image has subclusters. Instead, the individual 'all zeroes' bits must
> be used.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020 at 02:06:26PM +0100, Paolo Bonzini wrote:
> On 21/02/20 13:59, Stefan Hajnoczi wrote:
> > 1. It doesn't crash if the node is currently not on a list.
> > 2. It clears the node's linked list pointers so that future linked
> >list operations (like QLIST_SAFE_REMOVE()) aren't accidentally
> >performed on stale pointers.
> >
> > The node has a long lifespan and will be inserted into ready_lists
> > multiple times.  We need to safely remove it from ready_list to protect
> > against a corruption the next time the node is inserted into a
> > ready_list again:
> 
> Ah, so the one I singled out is for (2) (we know the node is currently
> on a list), while the one below is for (1).  Would it make sense to move
> (2) to Q*_REMOVE_*?  We can do it separately after this pull request.

Extending all Q*_REMOVE*() macros to clear the linked list pointers is
nice.  I'll send a follow-up patch.

Stefan


signature.asc
Description: PGP signature


Re: [RFC PATCH v3 16/27] qcow2: Add subcluster support to zero_in_l2_slice()

2020-02-21 Thread Max Reitz
On 22.12.19 12:36, Alberto Garcia wrote:
> Setting the QCOW_OFLAG_ZERO bit of the L2 entry is forbidden if an
> image has subclusters. Instead, the individual 'all zeroes' bits must
> be used.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 14 ++
>  1 file changed, 10 insertions(+), 4 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 15/27] qcow2: Add subcluster support to qcow2_get_cluster_offset()

2020-02-21 Thread Max Reitz
On 22.12.19 12:36, Alberto Garcia wrote:
> The logic of this function remains pretty much the same, except that
> it uses count_contiguous_subclusters(), which combines the logic of
> count_contiguous_clusters() / count_contiguous_clusters_unallocated()
> and checks individual subclusters.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 136 --
>  block/qcow2.h |  36 +--
>  2 files changed, 80 insertions(+), 92 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [PULL 00/18] Block patches

2020-02-21 Thread Peter Maydell
On Thu, 20 Feb 2020 at 16:07, Max Reitz  wrote:
>
> The following changes since commit 672f9d0df10a68a5c5f2b32cbc8284abf9f5ee18:
>
>   Merge remote-tracking branch 'remotes/kevin/tags/for-upstream' into staging 
> (2020-02-18 14:23:43 +)
>
> are available in the Git repository at:
>
>   https://github.com/XanClic/qemu.git tags/pull-block-2020-02-20
>
> for you to fetch changes up to dff8d44c96f128480430b0c59ed8760917dbd427:
>
>   iotests: Test snapshot -l field separation (2020-02-20 16:43:42 +0100)
>
> 
> Block patches:
> - qemu-img convert: New --target-is-zero parameter
> - qcow2: Specify non-default compression type flag
> - optionally flat output for query-named-block-nodes
> - some fixes
> - pseudo-creation of images on block devices is now done by a generic
>   block layer function
>
> 



Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/5.0
for any user-visible changes.

-- PMM



Re: [RFC PATCH v3 10/27] qcow2: Update get/set_l2_entry() and add get/set_l2_bitmap()

2020-02-21 Thread Alberto Garcia
On Thu 20 Feb 2020 05:27:28 PM CET, Max Reitz wrote:
>> +static inline uint64_t get_l2_bitmap(BDRVQcow2State *s, uint64_t *l2_slice,
>> + int idx)
>> +{
>> +if (has_subclusters(s)) {
>> +idx *= l2_entry_size(s) / sizeof(uint64_t);
>> +return be64_to_cpu(l2_slice[idx + 1]);
>> +} else {
>> +/* For convenience only; the caller should ignore this value. */
>> +return 0;
>
> Is there a reason you decided not to return the first subcluster as
> allocated?  (As you had proposed in v2)

Yeah, I thought that it would not make much sense to return a meaningful
value after a comment saying that the caller should ignore it.

If there was a situation in which something depends on that value then
it would be a bug in QEMU.

Berto



Re: [PATCH v3] util/async: make bh_aio_poll() O(1)

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020 at 09:39:51AM +, Stefan Hajnoczi wrote:
> The ctx->first_bh list contains all created BHs, including those that
> are not scheduled.  The list is iterated by the event loop and therefore
> has O(n) time complexity with respected to the number of created BHs.
> 
> Rewrite BHs so that only scheduled or deleted BHs are enqueued.
> Only BHs that actually require action will be iterated.
> 
> One semantic change is required: qemu_bh_delete() enqueues the BH and
> therefore invokes aio_notify().  The
> tests/test-aio.c:test_source_bh_delete_from_cb() test case assumed that
> g_main_context_iteration(NULL, false) returns false after
> qemu_bh_delete() but it now returns true for one iteration.  Fix up the
> test case.
> 
> This patch makes aio_compute_timeout() and aio_bh_poll() drop from a CPU
> profile reported by perf-top(1).  Previously they combined to 9% CPU
> utilization when AioContext polling is commented out and the guest has 2
> virtio-blk,num-queues=1 and 99 virtio-blk,num-queues=32 devices.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
> v3:
>  * Use QSLIST_FOREACH_RCU() and QSLIST_FIRST_RCU() [Paolo]
> v2:
>  * Use QSLIST for BHs and QSIMPLEQ for BHListSlices [Paolo]
>(Note that I replaced bh = atomic_rcu_read(_bh) with
> QSLIST_FOREACH(_list) so there is no memory ordering but I think
> this is safe.)
>  * Comment clarifications [Paolo]
> 
> Based-on: 20200220103828.24525-1-pbonz...@redhat.com
>   ("[PATCH] rcu_queue: add QSLIST functions")
> ---
>  include/block/aio.h |  20 +++-
>  tests/test-aio.c|   3 +-
>  util/async.c| 237 ++--
>  3 files changed, 158 insertions(+), 102 deletions(-)

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v1] block/nvme: introduce PMR support from NVMe 1.4 spec

2020-02-21 Thread Stefan Hajnoczi
On Tue, Feb 18, 2020 at 03:48:11PM -0700, Andrzej Jakowski wrote:
> This patch introduces support for PMR that has been defined as part of NVMe 
> 1.4
> spec. User can now specify a pmr_file which will be mmap'ed into qemu address
> space and subsequently in PCI BAR 2. Guest OS can perform mmio read and writes
> to the PMR region that will stay persistent accross system reboot.
> 
> Signed-off-by: Andrzej Jakowski 
> ---
>  hw/block/nvme.c   | 145 ++-
>  hw/block/nvme.h   |   5 ++
>  hw/block/trace-events |   5 ++
>  include/block/nvme.h  | 172 ++
>  4 files changed, 326 insertions(+), 1 deletion(-)

NVDIMM folks, please take a look.  There seems to be commonality here.

Can this use -object memory-backend-file instead of manually opening and
mapping a file?

Also CCing David Gilbert because there is some similarity with the
vhost-user-fs's DAX Window feature where QEMU mmaps regions of files
into a BAR.

> @@ -1303,6 +1327,38 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> +static void nvme_pmr_write(void *opaque, hwaddr addr, uint64_t data,
> +unsigned size)
> +{
> +NvmeCtrl *n = (NvmeCtrl *)opaque;
> +stn_le_p(>pmrbuf[addr], size, data);
> +}
> +
> +static uint64_t nvme_pmr_read(void *opaque, hwaddr addr, unsigned size)
> +{
> +NvmeCtrl *n = (NvmeCtrl *)opaque;
> +if (!NVME_PMRCAP_PMRWBM(n->bar.pmrcap)) {
> +int ret;
> +ret = msync(n->pmrbuf, n->f_pmr_size, MS_SYNC);
> +if (!ret) {
> +NVME_GUEST_ERR(nvme_ub_mmiowr_pmrread_barrier,
> +   "error while persisting data");
> +}
> +}

Why is msync(2) done on memory loads instead of stores?


signature.asc
Description: PGP signature


Re: [PATCH] util/async: make bh_aio_poll() O(1)

2020-02-21 Thread Stefan Hajnoczi
On Wed, Feb 19, 2020 at 08:05:12PM +0100, Paolo Bonzini wrote:
> Il mer 19 feb 2020, 18:58 Stefan Hajnoczi  ha scritto:
> 
> > On Wed, Feb 19, 2020 at 12:09:48PM +0100, Paolo Bonzini wrote:
> > > Really a great idea, though I have some remarks on the implementation
> > below.
> > >
> > > On 19/02/20 11:00, Stefan Hajnoczi wrote:
> > > > + * Each aio_bh_poll() call carves off a slice of the BH list.  This
> > way newly
> > > > + * scheduled BHs are not processed until the next aio_bh_poll()
> > call.  This
> > > > + * concept extends to nested aio_bh_poll() calls because slices are
> > chained
> > > > + * together.
> > >
> > > This is the tricky part so I would expand a bit on why it's needed:
> > >
> > > /*
> > >  * Each aio_bh_poll() call carves off a slice of the BH list, so that
> > >  * newly scheduled BHs are not processed until the next aio_bh_poll()
> > >  * call.  All active aio_bh_poll() calls chained their slices together
> > >  * in a list, so that nested aio_bh_poll() calls process all scheduled
> > >  * bottom halves.
> > >  */
> >
> > Thanks, will fix in v2.
> >
> > > > +struct BHListSlice {
> > > > +QEMUBH *first_bh;
> > > > +BHListSlice *next;
> > > > +};
> > > > +
> > >
> > > Using QLIST and QSLIST removes the need to create your own lists, since
> > > you can use QSLIST_MOVE_ATOMIC and QSLIST_INSERT_HEAD_ATOMIC.  For
> > example:
> > >
> > > struct BHListSlice {
> > > QSLIST_HEAD(, QEMUBH) first_bh;
> > > QLIST_ENTRY(BHListSlice) next;
> > > };
> > >
> > > ...
> > >
> > > QSLIST_HEAD(, QEMUBH) active_bh;
> > > QLIST_HEAD(, BHListSlice) bh_list;
> >
> > I thought about this but chose the explicit tail pointer approach
> > because it lets aio_compute_timeout() and aio_ctx_check() iterate over
> > both the active BH list and slices in a single for loop :).  But
> > thinking about it more, maybe it can still be done by replacing
> > active_bh with a permanently present first BHListSlice element.
> >
> 
> Probably not so easy since the idea was to empty the slices list entirely
> via the FIFO order.
> 
> But since you are rewriting everything anyway, can you add a flag for
> whether there are any non-idle bottom halves anywhere in the list? It need
> not be computed perfectly, because any non-idle bh will cause the idle
> bottom halves to be triggered as well; you can just set in qemu_bh_schedule
> and clear it at the end of aio_bh_poll.
> 
> Then if there is any active bh or slice you consult the flag and use it to
> return the timeout, which will be either 0 or 10ms depending on the flag.
> That's truly O(1). (More precisely, this patch goes from O(#created-bh) to
> O(#scheduled-bh), which of course is optimal for aio_bh_poll but not for
> aio_compute_timeout; making timeout computation O(1) can lower latency a
> bit by decreasing the constant factor).

Yes, good idea.  I'll send a follow-up patch.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH] aio-posix: avoid reacquiring rcu_read_lock() when polling

2020-02-21 Thread Stefan Hajnoczi
On Tue, Feb 18, 2020 at 06:27:08PM +, Stefan Hajnoczi wrote:
> The first rcu_read_lock/unlock() is expensive.  Nested calls are cheap.
> 
> This optimization increases IOPS from 73k to 162k with a Linux guest
> that has 2 virtio-blk,num-queues=1 and 99 virtio-blk,num-queues=32
> devices.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  util/aio-posix.c | 11 +++
>  1 file changed, 11 insertions(+)

Thanks, applied to my block tree:
https://github.com/stefanha/qemu/commits/block

Stefan


signature.asc
Description: PGP signature


Re: [RFC PATCH v3 14/27] qcow2: Add subcluster support to calculate_l2_meta()

2020-02-21 Thread Max Reitz
On 22.12.19 12:36, Alberto Garcia wrote:
> If an image has subclusters then there are more copy-on-write
> scenarios that we need to consider. Let's say we have a write request
> from the middle of subcluster #3 until the end of the cluster:
> 
>- If the cluster is new, then subclusters #0 to #3 from the old
>  cluster must be copied into the new one.
> 
>- If the cluster is new but the old cluster was unallocated, then
>  only subcluster #3 needs copy-on-write. #0 to #2 are marked as
>  unallocated in the bitmap of the new L2 entry.
> 
>- If we are overwriting an old cluster and subcluster #3 is
>  unallocated or has the all-zeroes bit set then we need
>  copy-on-write on subcluster #3.
> 
>- If we are overwriting an old cluster and subcluster #3 was
>  allocated then there is no need to copy-on-write.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  block/qcow2-cluster.c | 140 +-
>  1 file changed, 110 insertions(+), 30 deletions(-)

It’s all a bit tough to wrap my head around.  One thing I got
particularly hung up is how we ensure that for new clusters the
head/tail subcluster bits that do not need COW are initialized to the
correct value.  Then I realized that we just have to keep them as they
are (unallocated or zero, respectively), because this path is only for
when we already have L2 entries, it’s just that they point to normal
non-COPIED clusters.  (So only the L2 offset entry has to be changed,
not the bitmap.  At least not for the subclusters that aren’t touched by
the write.)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [RFC PATCH v3 07/27] qcow2: Add subcluster-related fields to BDRVQcow2State

2020-02-21 Thread Alberto Garcia
On Thu 20 Feb 2020 05:48:25 PM CET, Eric Blake wrote:
>>> The qcow2 spec changes earlier in the series made it sound like your
>>> choices are exactly 1 or 32,
>> 
 +#define QCOW_MAX_SUBCLUSTERS_PER_CLUSTER 32
 +
>>>
>>> ...but this name sounds like other values (2, 4, 8, 16) might be
>>> possible?
>> 
>> I guess I didn't want to call it QCOW_SUBCLUSTERS_PER_CLUSTER because
>> there's already BDRVQcow2State.subclusters_per_cluster. And that one can
>> have two possible values (1 and 32) so 32 would be the maximum.
>> 
>> I get your point, however, and I'm open to suggestions.
>
> Maybe QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER
>
> since it is a hard-coded property of the EXTL2 feature.

Sounds good.

Berto



Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Paolo Bonzini
On 21/02/20 13:59, Stefan Hajnoczi wrote:
> 1. It doesn't crash if the node is currently not on a list.
> 2. It clears the node's linked list pointers so that future linked
>list operations (like QLIST_SAFE_REMOVE()) aren't accidentally
>performed on stale pointers.
>
> The node has a long lifespan and will be inserted into ready_lists
> multiple times.  We need to safely remove it from ready_list to protect
> against a corruption the next time the node is inserted into a
> ready_list again:

Ah, so the one I singled out is for (2) (we know the node is currently
on a list), while the one below is for (1).  Would it make sense to move
(2) to Q*_REMOVE_*?  We can do it separately after this pull request.

>   /* Add a handler to a ready list */
>   static void add_ready_handler(AioHandlerList *ready_list,
> AioHandler *node,
> int revents)
>   {
>   QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's 
> list */
>   ^ would cause corruption if node->node_ready was stale!
> 
> Would you like me to add a comment?
No, it's okay.

Paolo




Re: [PATCH 5/5] aio-posix: make AioHandler dispatch O(1) with epoll

2020-02-21 Thread Stefan Hajnoczi
On Wed, Feb 19, 2020 at 12:13:40PM +0100, Paolo Bonzini wrote:
> On 14/02/20 18:17, Stefan Hajnoczi wrote:
> > +while ((node = QLIST_FIRST(ready_list))) {
> > +QLIST_SAFE_REMOVE(node, node_ready);
> 
> Why does this need safe remove?

Yes, it's necessary.  QLIST_SAFE_REMOVE() has two properties that make
it "safe":
1. It doesn't crash if the node is currently not on a list.
2. It clears the node's linked list pointers so that future linked
   list operations (like QLIST_SAFE_REMOVE()) aren't accidentally
   performed on stale pointers.

The node has a long lifespan and will be inserted into ready_lists
multiple times.  We need to safely remove it from ready_list to protect
against a corruption the next time the node is inserted into a
ready_list again:

  /* Add a handler to a ready list */
  static void add_ready_handler(AioHandlerList *ready_list,
AioHandler *node,
int revents)
  {
  QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list 
*/
  ^ would cause corruption if node->node_ready was stale!

Would you like me to add a comment?

Stefan


signature.asc
Description: PGP signature


Re: [PATCH 1/2] vhost-user-blk: delete virtioqueues in unrealize to fix memleaks

2020-02-21 Thread Stefan Hajnoczi
On Thu, Feb 13, 2020 at 09:28:06AM +0800, pannengy...@huawei.com wrote:
> From: Pan Nengyuan 
> 
> virtio queues forgot to delete in unrealize, and aslo error path in
> realize, this patch fix these memleaks, the leak stack is as follow:
> 
> Direct leak of 114688 byte(s) in 16 object(s) allocated from:
> #0 0x7f24024fdbf0 in calloc (/lib64/libasan.so.3+0xcabf0)
> #1 0x7f2401642015 in g_malloc0 (/lib64/libglib-2.0.so.0+0x50015)
> #2 0x55ad175a6447 in virtio_add_queue 
> /mnt/sdb/qemu/hw/virtio/virtio.c:2327
> #3 0x55ad17570cf9 in vhost_user_blk_device_realize 
> /mnt/sdb/qemu/hw/block/vhost-user-blk.c:419
> #4 0x55ad175a3707 in virtio_device_realize 
> /mnt/sdb/qemu/hw/virtio/virtio.c:3509
> #5 0x55ad176ad0d1 in device_set_realized /mnt/sdb/qemu/hw/core/qdev.c:876
> #6 0x55ad1781ff9d in property_set_bool /mnt/sdb/qemu/qom/object.c:2080
> #7 0x55ad178245ae in object_property_set_qobject 
> /mnt/sdb/qemu/qom/qom-qobject.c:26
> #8 0x55ad17821eb4 in object_property_set_bool 
> /mnt/sdb/qemu/qom/object.c:1338
> #9 0x55ad177aeed7 in virtio_pci_realize 
> /mnt/sdb/qemu/hw/virtio/virtio-pci.c:1801
> 
> Reported-by: Euler Robot 
> Signed-off-by: Pan Nengyuan 
> ---
>  hw/block/vhost-user-blk.c | 8 
>  1 file changed, 8 insertions(+)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature


Re: [PATCH 2/2] vhost-use-blk: convert to new virtio_delete_queue

2020-02-21 Thread Stefan Hajnoczi
On Thu, Feb 13, 2020 at 09:28:07AM +0800, pannengy...@huawei.com wrote:
> diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
> index 2eba8b9db0..ed6a5cc03b 100644
> --- a/hw/block/vhost-user-blk.c
> +++ b/hw/block/vhost-user-blk.c
> @@ -420,9 +420,10 @@ static void vhost_user_blk_device_realize(DeviceState 
> *dev, Error **errp)
>  virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
>  sizeof(struct virtio_blk_config));
>  
> +s->virtqs = g_new0(VirtQueue *, s->num_queues);

Minor point, up to you if you want to change it: the array is fully
initialized by the for loop in the next line.  There is no need to clear
the memory first:

s/g_new0/g_new/

> diff --git a/include/hw/virtio/vhost-user-blk.h 
> b/include/hw/virtio/vhost-user-blk.h
> index 108bfadeeb..f68911f6f0 100644
> --- a/include/hw/virtio/vhost-user-blk.h
> +++ b/include/hw/virtio/vhost-user-blk.h
> @@ -37,6 +37,7 @@ typedef struct VHostUserBlk {
>  struct vhost_inflight *inflight;
>  VhostUserState vhost_user;
>  struct vhost_virtqueue *vqs;
> +VirtQueue **virtqs;

Both vqs and virtqs exist and are easily confused.  Please rename vqs to
vhost_vqs.


signature.asc
Description: PGP signature


[PATCH v4 4/4] iotests: add 282 luks qemu-img measure test

2020-02-21 Thread Stefan Hajnoczi
This test exercises the block/crypto.c "luks" block driver
.bdrv_measure() code.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Max Reitz 
---
 tests/qemu-iotests/282 | 93 ++
 tests/qemu-iotests/282.out | 30 
 tests/qemu-iotests/group   |  1 +
 3 files changed, 124 insertions(+)
 create mode 100755 tests/qemu-iotests/282
 create mode 100644 tests/qemu-iotests/282.out

diff --git a/tests/qemu-iotests/282 b/tests/qemu-iotests/282
new file mode 100755
index 00..6c62065aef
--- /dev/null
+++ b/tests/qemu-iotests/282
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+#
+# qemu-img measure tests for LUKS images
+#
+# Copyright (C) 2020 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+# creator
+owner=stefa...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+status=1# failure is the default!
+
+_cleanup()
+{
+_cleanup_test_img
+rm -f "$TEST_IMG.converted"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.pattern
+
+_supported_fmt luks
+_supported_proto file
+_supported_os Linux
+
+SECRET=secret,id=sec0,data=passphrase
+
+echo "== measure 1G image file =="
+echo
+
+$QEMU_IMG measure --object "$SECRET" \
+ -O "$IMGFMT" \
+ -o key-secret=sec0,iter-time=10 \
+ --size 1G
+
+echo
+echo "== create 1G image file (size should be no greater than measured) =="
+echo
+
+_make_test_img 1G
+stat -c "image file size in bytes: %s" "$TEST_IMG_FILE"
+
+echo
+echo "== modified 1G image file (size should be no greater than measured) =="
+echo
+
+$QEMU_IO --object "$SECRET" --image-opts "$TEST_IMG" -c "write -P 0x51 0x1 
0x400" | _filter_qemu_io | _filter_testdir
+stat -c "image file size in bytes: %s" "$TEST_IMG_FILE"
+
+echo
+echo "== measure preallocation=falloc 1G image file =="
+echo
+
+$QEMU_IMG measure --object "$SECRET" \
+ -O "$IMGFMT" \
+ -o key-secret=sec0,iter-time=10,preallocation=falloc \
+ --size 1G
+
+echo
+echo "== measure with input image file =="
+echo
+
+IMGFMT=raw IMGKEYSECRET= IMGOPTS= _make_test_img 1G | _filter_imgfmt
+QEMU_IO_OPTIONS= IMGOPTSSYNTAX= $QEMU_IO -f raw -c "write -P 0x51 0x1 
0x400" "$TEST_IMG_FILE" | _filter_qemu_io | _filter_testdir
+$QEMU_IMG measure --object "$SECRET" \
+ -O "$IMGFMT" \
+ -o key-secret=sec0,iter-time=10 \
+ -f raw \
+ "$TEST_IMG_FILE"
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/282.out b/tests/qemu-iotests/282.out
new file mode 100644
index 00..996cc44a67
--- /dev/null
+++ b/tests/qemu-iotests/282.out
@@ -0,0 +1,30 @@
+QA output created by 282
+== measure 1G image file ==
+
+required size: 1075810304
+fully allocated size: 1075810304
+
+== create 1G image file (size should be no greater than measured) ==
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
+image file size in bytes: 1075810304
+
+== modified 1G image file (size should be no greater than measured) ==
+
+wrote 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+image file size in bytes: 1075810304
+
+== measure preallocation=falloc 1G image file ==
+
+required size: 1075810304
+fully allocated size: 1075810304
+
+== measure with input image file ==
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
+wrote 1024/1024 bytes at offset 65536
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+required size: 1075810304
+fully allocated size: 1075810304
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 1904223020..6a25efea4d 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -289,4 +289,5 @@
 279 rw backing quick
 280 rw migration quick
 281 rw quick
+282 quick
 283 auto quick
-- 
2.24.1



[PATCH v4 2/4] luks: implement .bdrv_measure()

2020-02-21 Thread Stefan Hajnoczi
Add qemu-img measure support in the "luks" block driver.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Max Reitz 
---
 block/crypto.c | 62 ++
 1 file changed, 62 insertions(+)

diff --git a/block/crypto.c b/block/crypto.c
index 24823835c1..23e9c74d6f 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -484,6 +484,67 @@ static int64_t block_crypto_getlength(BlockDriverState *bs)
 }
 
 
+static BlockMeasureInfo *block_crypto_measure(QemuOpts *opts,
+  BlockDriverState *in_bs,
+  Error **errp)
+{
+g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
+Error *local_err = NULL;
+BlockMeasureInfo *info;
+uint64_t size;
+size_t luks_payload_size;
+QDict *cryptoopts;
+
+/*
+ * Preallocation mode doesn't affect size requirements but we must consume
+ * the option.
+ */
+g_free(qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC));
+
+size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
+
+if (in_bs) {
+int64_t ssize = bdrv_getlength(in_bs);
+
+if (ssize < 0) {
+error_setg_errno(_err, -ssize,
+ "Unable to get image virtual_size");
+goto err;
+}
+
+size = ssize;
+}
+
+cryptoopts = qemu_opts_to_qdict_filtered(opts, NULL,
+_crypto_create_opts_luks, true);
+qdict_put_str(cryptoopts, "format", "luks");
+create_opts = block_crypto_create_opts_init(cryptoopts, _err);
+qobject_unref(cryptoopts);
+if (!create_opts) {
+goto err;
+}
+
+if (!qcrypto_block_calculate_payload_offset(create_opts, NULL,
+_payload_size,
+_err)) {
+goto err;
+}
+
+/*
+ * Unallocated blocks are still encrypted so allocation status makes no
+ * difference to the file size.
+ */
+info = g_new(BlockMeasureInfo, 1);
+info->fully_allocated = luks_payload_size + size;
+info->required = luks_payload_size + size;
+return info;
+
+err:
+error_propagate(errp, local_err);
+return NULL;
+}
+
+
 static int block_crypto_probe_luks(const uint8_t *buf,
int buf_size,
const char *filename) {
@@ -670,6 +731,7 @@ static BlockDriver bdrv_crypto_luks = {
 .bdrv_co_preadv = block_crypto_co_preadv,
 .bdrv_co_pwritev= block_crypto_co_pwritev,
 .bdrv_getlength = block_crypto_getlength,
+.bdrv_measure   = block_crypto_measure,
 .bdrv_get_info  = block_crypto_get_info_luks,
 .bdrv_get_specific_info = block_crypto_get_specific_info_luks,
 
-- 
2.24.1



[PATCH v4 0/4] luks: add qemu-img measure support

2020-02-21 Thread Stefan Hajnoczi
v4:
 * This revision is what German speakers call "das Tüpfelchen auf dem I".  "The
   icing on the cake" is the English equivalent.  Since I like cake and don't
   want it to be half-baked, and because I like my metaphors shaken, not
   stirred, I went ahead with the extra revision so I could write this message.
 * Use g_autoptr(QCryptoBlock) to make the code more concise [Max]
 * Use local_err consistently [Max]
 * Folded in Max's Reviewed-by tags
v3:
 * Move payload offset calculation function to crypto/block.c [Max]
 * Zero/unallocated blocks always require disk space on encrypted files [Max]
 * Update qemu-iotests 178 output when changing qemu-img measure command-line
   options

v2:
 * Fix uint64_t <-> size_t type mismatch in block_crypto_measure() so that
   32-bit builds pass

This patch series adds qemu-img measure support to the "luks" block driver.  We
just need to take into account the LUKS header when sizing the image.

Stefan Hajnoczi (4):
  luks: extract qcrypto_block_calculate_payload_offset()
  luks: implement .bdrv_measure()
  qemu-img: allow qemu-img measure --object without a filename
  iotests: add 282 luks qemu-img measure test

 block/crypto.c   | 62 +
 block/qcow2.c| 74 +++--
 crypto/block.c   | 36 +
 include/crypto/block.h   | 22 
 qemu-img.c   |  6 +--
 tests/qemu-iotests/178   |  2 +-
 tests/qemu-iotests/178.out.qcow2 |  8 +--
 tests/qemu-iotests/178.out.raw   |  8 +--
 tests/qemu-iotests/282   | 93 
 tests/qemu-iotests/282.out   | 30 +++
 tests/qemu-iotests/group |  1 +
 11 files changed, 274 insertions(+), 68 deletions(-)
 create mode 100755 tests/qemu-iotests/282
 create mode 100644 tests/qemu-iotests/282.out

-- 
2.24.1



[PATCH v4 1/4] luks: extract qcrypto_block_calculate_payload_offset()

2020-02-21 Thread Stefan Hajnoczi
The qcow2 .bdrv_measure() code calculates the crypto payload offset.
This logic really belongs in crypto/block.c where it can be reused by
other image formats.

The "luks" block driver will need this same logic in order to implement
.bdrv_measure(), so extract the qcrypto_block_calculate_payload_offset()
function now.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Max Reitz 
---
 block/qcow2.c  | 74 +++---
 crypto/block.c | 36 
 include/crypto/block.h | 22 +
 3 files changed, 77 insertions(+), 55 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 8dcee5efec..59df7ec0ce 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -4601,60 +4601,6 @@ static coroutine_fn int 
qcow2_co_flush_to_os(BlockDriverState *bs)
 return ret;
 }
 
-static ssize_t qcow2_measure_crypto_hdr_init_func(QCryptoBlock *block,
-size_t headerlen, void *opaque, Error **errp)
-{
-size_t *headerlenp = opaque;
-
-/* Stash away the payload size */
-*headerlenp = headerlen;
-return 0;
-}
-
-static ssize_t qcow2_measure_crypto_hdr_write_func(QCryptoBlock *block,
-size_t offset, const uint8_t *buf, size_t buflen,
-void *opaque, Error **errp)
-{
-/* Discard the bytes, we're not actually writing to an image */
-return buflen;
-}
-
-/* Determine the number of bytes for the LUKS payload */
-static bool qcow2_measure_luks_headerlen(QemuOpts *opts, size_t *len,
- Error **errp)
-{
-QDict *opts_qdict;
-QDict *cryptoopts_qdict;
-QCryptoBlockCreateOptions *cryptoopts;
-QCryptoBlock *crypto;
-
-/* Extract "encrypt." options into a qdict */
-opts_qdict = qemu_opts_to_qdict(opts, NULL);
-qdict_extract_subqdict(opts_qdict, _qdict, "encrypt.");
-qobject_unref(opts_qdict);
-
-/* Build QCryptoBlockCreateOptions object from qdict */
-qdict_put_str(cryptoopts_qdict, "format", "luks");
-cryptoopts = block_crypto_create_opts_init(cryptoopts_qdict, errp);
-qobject_unref(cryptoopts_qdict);
-if (!cryptoopts) {
-return false;
-}
-
-/* Fake LUKS creation in order to determine the payload size */
-crypto = qcrypto_block_create(cryptoopts, "encrypt.",
-  qcow2_measure_crypto_hdr_init_func,
-  qcow2_measure_crypto_hdr_write_func,
-  len, errp);
-qapi_free_QCryptoBlockCreateOptions(cryptoopts);
-if (!crypto) {
-return false;
-}
-
-qcrypto_block_free(crypto);
-return true;
-}
-
 static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
Error **errp)
 {
@@ -4705,9 +4651,27 @@ static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, 
BlockDriverState *in_bs,
 g_free(optstr);
 
 if (has_luks) {
+g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
+QDict *opts_qdict;
+QDict *cryptoopts;
 size_t headerlen;
 
-if (!qcow2_measure_luks_headerlen(opts, , _err)) {
+opts_qdict = qemu_opts_to_qdict(opts, NULL);
+qdict_extract_subqdict(opts_qdict, , "encrypt.");
+qobject_unref(opts_qdict);
+
+qdict_put_str(cryptoopts, "format", "luks");
+
+create_opts = block_crypto_create_opts_init(cryptoopts, errp);
+qobject_unref(cryptoopts);
+if (!create_opts) {
+goto err;
+}
+
+if (!qcrypto_block_calculate_payload_offset(create_opts,
+"encrypt.",
+,
+_err)) {
 goto err;
 }
 
diff --git a/crypto/block.c b/crypto/block.c
index 325752871c..6f42b32f1e 100644
--- a/crypto/block.c
+++ b/crypto/block.c
@@ -115,6 +115,42 @@ QCryptoBlock 
*qcrypto_block_create(QCryptoBlockCreateOptions *options,
 }
 
 
+static ssize_t qcrypto_block_headerlen_hdr_init_func(QCryptoBlock *block,
+size_t headerlen, void *opaque, Error **errp)
+{
+size_t *headerlenp = opaque;
+
+/* Stash away the payload size */
+*headerlenp = headerlen;
+return 0;
+}
+
+
+static ssize_t qcrypto_block_headerlen_hdr_write_func(QCryptoBlock *block,
+size_t offset, const uint8_t *buf, size_t buflen,
+void *opaque, Error **errp)
+{
+/* Discard the bytes, we're not actually writing to an image */
+return buflen;
+}
+
+
+bool
+qcrypto_block_calculate_payload_offset(QCryptoBlockCreateOptions *create_opts,
+   const char *optprefix,
+   size_t *len,
+   Error **errp)
+{
+/* Fake LUKS creation in order to determine the payload size */
+g_autoptr(QCryptoBlock) crypto =
+qcrypto_block_create(create_opts, optprefix,
+ 

[PATCH v4 3/4] qemu-img: allow qemu-img measure --object without a filename

2020-02-21 Thread Stefan Hajnoczi
In most qemu-img sub-commands the --object option only makes sense when
there is a filename.  qemu-img measure is an exception because objects
may be referenced from the image creation options instead of an existing
image file.  Allow --object without a filename.

Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Max Reitz 
---
 qemu-img.c   | 6 ++
 tests/qemu-iotests/178   | 2 +-
 tests/qemu-iotests/178.out.qcow2 | 8 
 tests/qemu-iotests/178.out.raw   | 8 
 4 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 2b4562b9d9..0cee7bed8b 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -4912,10 +4912,8 @@ static int img_measure(int argc, char **argv)
 filename = argv[optind];
 }
 
-if (!filename &&
-(object_opts || image_opts || fmt || snapshot_name || sn_opts)) {
-error_report("--object, --image-opts, -f, and -l "
- "require a filename argument.");
+if (!filename && (image_opts || fmt || snapshot_name || sn_opts)) {
+error_report("--image-opts, -f, and -l require a filename argument.");
 goto out;
 }
 if (filename && img_size != UINT64_MAX) {
diff --git a/tests/qemu-iotests/178 b/tests/qemu-iotests/178
index 51a70fe669..7cf0e27154 100755
--- a/tests/qemu-iotests/178
+++ b/tests/qemu-iotests/178
@@ -50,7 +50,7 @@ _make_test_img 1G
 $QEMU_IMG measure # missing arguments
 $QEMU_IMG measure --size 2G "$TEST_IMG" # only one allowed
 $QEMU_IMG measure "$TEST_IMG" a # only one filename allowed
-$QEMU_IMG measure --object secret,id=sec0,data=MTIzNDU2,format=base64 # 
missing filename
+$QEMU_IMG measure --object secret,id=sec0,data=MTIzNDU2,format=base64 # size 
or filename needed
 $QEMU_IMG measure --image-opts # missing filename
 $QEMU_IMG measure -f qcow2 # missing filename
 $QEMU_IMG measure -l snap1 # missing filename
diff --git a/tests/qemu-iotests/178.out.qcow2 b/tests/qemu-iotests/178.out.qcow2
index 9e7d8c44df..f59bf4b2fb 100644
--- a/tests/qemu-iotests/178.out.qcow2
+++ b/tests/qemu-iotests/178.out.qcow2
@@ -5,10 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 qemu-img: Either --size N or one filename must be specified.
 qemu-img: --size N cannot be used together with a filename.
 qemu-img: At most one filename argument is allowed.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
+qemu-img: Either --size N or one filename must be specified.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
 qemu-img: Invalid option list: ,
 qemu-img: Invalid parameter 'snapshot.foo'
 qemu-img: Failed in parsing snapshot param 'snapshot.foo'
diff --git a/tests/qemu-iotests/178.out.raw b/tests/qemu-iotests/178.out.raw
index 6478365905..404ca908d8 100644
--- a/tests/qemu-iotests/178.out.raw
+++ b/tests/qemu-iotests/178.out.raw
@@ -5,10 +5,10 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824
 qemu-img: Either --size N or one filename must be specified.
 qemu-img: --size N cannot be used together with a filename.
 qemu-img: At most one filename argument is allowed.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
-qemu-img: --object, --image-opts, -f, and -l require a filename argument.
+qemu-img: Either --size N or one filename must be specified.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
+qemu-img: --image-opts, -f, and -l require a filename argument.
 qemu-img: Invalid option list: ,
 qemu-img: Invalid parameter 'snapshot.foo'
 qemu-img: Failed in parsing snapshot param 'snapshot.foo'
-- 
2.24.1



Re: [PATCH v3 2/4] luks: implement .bdrv_measure()

2020-02-21 Thread Stefan Hajnoczi
On Wed, Feb 19, 2020 at 04:46:34PM +0100, Max Reitz wrote:
> On 11.02.20 17:03, Stefan Hajnoczi wrote:
> > Add qemu-img measure support in the "luks" block driver.
> > 
> > Signed-off-by: Stefan Hajnoczi 
> > ---
> >  block/crypto.c | 62 ++
> >  1 file changed, 62 insertions(+)
> > 
> > diff --git a/block/crypto.c b/block/crypto.c
> > index 24823835c1..453119875e 100644
> > --- a/block/crypto.c
> > +++ b/block/crypto.c
> > @@ -484,6 +484,67 @@ static int64_t block_crypto_getlength(BlockDriverState 
> > *bs)
> 
> [...]
> 
> > +cryptoopts = qemu_opts_to_qdict_filtered(opts, NULL,
> > +_crypto_create_opts_luks, true);
> > +qdict_put_str(cryptoopts, "format", "luks");
> > +create_opts = block_crypto_create_opts_init(cryptoopts, errp);
> 
> It looks a bit weird to me to use errp here...
> 
> > +qobject_unref(cryptoopts);
> > +if (!create_opts) {
> > +goto err;
> > +}
> > +
> > +if (!qcrypto_block_calculate_payload_offset(create_opts, NULL,
> > +_payload_size,
> > +_err)) {
> 
> ...and local_err here.  Either works, but consistent style would be a
> bit nicer.
> 
> But not more correct, so:
> 
> Reviewed-by: Max Reitz 

Thanks, will fix!

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v3 1/4] luks: extract qcrypto_block_calculate_payload_offset()

2020-02-21 Thread Stefan Hajnoczi
On Wed, Feb 19, 2020 at 02:03:50PM +0100, Max Reitz wrote:
> On 11.02.20 17:03, Stefan Hajnoczi wrote:
> > The qcow2 .bdrv_measure() code calculates the crypto payload offset.
> > This logic really belongs in crypto/block.c where it can be reused by
> > other image formats.
> > 
> > The "luks" block driver will need this same logic in order to implement
> > .bdrv_measure(), so extract the qcrypto_block_calculate_payload_offset()
> > function now.
> > 
> > Signed-off-by: Stefan Hajnoczi 
> > ---
> >  block/qcow2.c  | 74 +++---
> >  crypto/block.c | 40 +++
> >  include/crypto/block.h | 22 +
> >  3 files changed, 81 insertions(+), 55 deletions(-)
> 
> [...]
> 
> > diff --git a/crypto/block.c b/crypto/block.c
> > index 325752871c..a9e1b8cc36 100644
> > --- a/crypto/block.c
> > +++ b/crypto/block.c
> > @@ -115,6 +115,46 @@ QCryptoBlock 
> > *qcrypto_block_create(QCryptoBlockCreateOptions *options,
> 
> [...]
> 
> > +bool
> > +qcrypto_block_calculate_payload_offset(QCryptoBlockCreateOptions 
> > *create_opts,
> > +   const char *optprefix,
> > +   size_t *len,
> > +   Error **errp)
> > +{
> > +QCryptoBlock *crypto;
> > +bool ok;
> > +
> > +/* Fake LUKS creation in order to determine the payload size */
> > +crypto = qcrypto_block_create(create_opts, optprefix,
> > +  qcrypto_block_headerlen_hdr_init_func,
> > +  qcrypto_block_headerlen_hdr_write_func,
> > +  len, errp);
> > +ok = crypto != NULL;
> > +qcrypto_block_free(crypto);
> > +return ok;
> 
> Speaking of g_autoptr...  Would g_autoptr(QCryptoBlock) crypto; suffice
> to contract these three lines into “return crypto != NULL;”?
> 
> Either way:
> 
> Reviewed-by: Max Reitz 

Yes, it can be simplified.  Will fix in v3.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v2 2/4] virtio-scsi: default num_queues to -smp N

2020-02-21 Thread Stefan Hajnoczi
On Wed, Feb 12, 2020 at 11:18:32AM +, Stefan Hajnoczi wrote:
> On Tue, Feb 11, 2020 at 11:31:17AM -0500, Michael S. Tsirkin wrote:
> > On Tue, Feb 11, 2020 at 04:20:41PM +, Stefan Hajnoczi wrote:
> > > On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> > > > On Mon, Feb 03, 2020 at 10:57:44AM +, Daniel P. Berrangé wrote:
> > > > > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > > > > On Thu, Jan 30, 2020 at 10:52:35AM +, Stefan Hajnoczi wrote:
> > > > > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > > > > >> On Fri, 24 Jan 2020 10:01:57 +
> > > > > > > > >> Stefan Hajnoczi  wrote:
> > > I will create a 32 vCPU guest with 100 virtio-blk devices and verify
> > > that enabling multi-queue is successful.
> > 
> > and that it's helpful for performance?
> 
> I may be a little while before the next revision of this patch series.
> Testing reveals scalability problems when creating so many virtqueues
> :).
> 
> I've measured boot time, memory consumption, and random read IOPS.  They
> are all significantly worse (32 vCPUs, 24 GB RAM, 101 virtio-blk
> devices, 32 queues/device).
> 
> Time to see what's going on and whether some general scalability
> improvements are possible here before we enable multi-queue by default.

Update:

Boot time has improved with "[PATCH] memory: batch allocate ioeventfds[]
in address_space_update_ioeventfds()".

IOPS looks a lot better with the O(1) QEMU event loop patches that I've
posted.  This work is not complete yet, I still need to make AioContext
polling O(1) too (it consumes too much CPU with many idle devices).

After this work is complete I'll measure boot time, memory consumption,
and IOPS again.  Then we can decide whether multiqueue by default is a
good idea.

Stefan


signature.asc
Description: PGP signature


Re: [PATCH v3] util/async: make bh_aio_poll() O(1)

2020-02-21 Thread Stefan Hajnoczi
On Fri, Feb 21, 2020 at 9:40 AM Stefan Hajnoczi  wrote:
>
> The ctx->first_bh list contains all created BHs, including those that
> are not scheduled.  The list is iterated by the event loop and therefore
> has O(n) time complexity with respected to the number of created BHs.
>
> Rewrite BHs so that only scheduled or deleted BHs are enqueued.
> Only BHs that actually require action will be iterated.
>
> One semantic change is required: qemu_bh_delete() enqueues the BH and
> therefore invokes aio_notify().  The
> tests/test-aio.c:test_source_bh_delete_from_cb() test case assumed that
> g_main_context_iteration(NULL, false) returns false after
> qemu_bh_delete() but it now returns true for one iteration.  Fix up the
> test case.
>
> This patch makes aio_compute_timeout() and aio_bh_poll() drop from a CPU
> profile reported by perf-top(1).  Previously they combined to 9% CPU
> utilization when AioContext polling is commented out and the guest has 2
> virtio-blk,num-queues=1 and 99 virtio-blk,num-queues=32 devices.
>
> Signed-off-by: Stefan Hajnoczi 
> ---
> v3:
>  * Use QSLIST_FOREACH_RCU() and QSLIST_FIRST_RCU() [Paolo]

I forgot to include Paolo's R-b that he gave conditional on making this change:

Reviewed-by: Paolo Bonzini 



Re: [PATCH v7 02/11] error: auto propagated local_err

2020-02-21 Thread Vladimir Sementsov-Ogievskiy

21.02.2020 12:19, Markus Armbruster wrote:

Vladimir Sementsov-Ogievskiy  writes:


Here is introduced ERRP_AUTO_PROPAGATE macro, to be used at start of
functions with an errp OUT parameter.

It has three goals:

1. Fix issue with error_fatal and error_prepend/error_append_hint: user
can't see this additional information, because exit() happens in
error_setg earlier than information is added. [Reported by Greg Kurz]

2. Fix issue with error_abort and error_propagate: when we wrap
error_abort by local_err+error_propagate, the resulting coredump will
refer to error_propagate and not to the place where error happened.
(the macro itself doesn't fix the issue, but it allows us to [3.] drop
the local_err+error_propagate pattern, which will definitely fix the
issue) [Reported by Kevin Wolf]

3. Drop local_err+error_propagate pattern, which is used to workaround
void functions with errp parameter, when caller wants to know resulting
status. (Note: actually these functions could be merely updated to
return int error code).

To achieve these goals, later patches will add invocations
of this macro at the start of functions with either use
error_prepend/error_append_hint (solving 1) or which use
local_err+error_propagate to check errors, switching those
functions to use *errp instead (solving 2 and 3).

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Greg Kurz 
Reviewed-by: Eric Blake 
---

CC: Eric Blake 
CC: Kevin Wolf 
CC: Max Reitz 
CC: Greg Kurz 
CC: Stefano Stabellini 
CC: Anthony Perard 
CC: Paul Durrant 
CC: Stefan Hajnoczi 
CC: "Philippe Mathieu-Daudé" 
CC: Laszlo Ersek 
CC: Gerd Hoffmann 
CC: Stefan Berger 
CC: Markus Armbruster 
CC: Michael Roth 
CC: qemu-block@nongnu.org
CC: xen-de...@lists.xenproject.org

  include/qapi/error.h | 83 +++-
  1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/include/qapi/error.h b/include/qapi/error.h
index d34987148d..b9452d4806 100644
--- a/include/qapi/error.h
+++ b/include/qapi/error.h
@@ -78,7 +78,7 @@
   * Call a function treating errors as fatal:
   * foo(arg, _fatal);
   *
- * Receive an error and pass it on to the caller:
+ * Receive an error and pass it on to the caller (DEPRECATED*):
   * Error *err = NULL;
   * foo(arg, );
   * if (err) {
@@ -98,6 +98,50 @@
   * foo(arg, errp);
   * for readability.
   *
+ * DEPRECATED* This pattern is deprecated now, the use ERRP_AUTO_PROPAGATE 
macro
+ * instead (defined below).
+ * It's deprecated because of two things:
+ *
+ * 1. Issue with error_abort and error_propagate: when we wrap error_abort by
+ * local_err+error_propagate, the resulting coredump will refer to
+ * error_propagate and not to the place where error happened.
+ *
+ * 2. A lot of extra code of the same pattern
+ *
+ * How to update old code to use ERRP_AUTO_PROPAGATE?
+ *
+ * All you need is to add ERRP_AUTO_PROPAGATE() invocation at function start,
+ * than you may safely dereference errp to check errors and do not need any
+ * additional local Error variables or calls to error_propagate().
+ *
+ * Example:
+ *
+ * old code
+ *
+ * void fn(..., Error **errp) {
+ * Error *err = NULL;
+ * foo(arg, );
+ * if (err) {
+ * handle the error...
+ * error_propagate(errp, err);
+ * return;
+ * }
+ * ...
+ * }
+ *
+ * updated code
+ *
+ * void fn(..., Error **errp) {
+ * ERRP_AUTO_PROPAGATE();
+ * foo(arg, errp);
+ * if (*errp) {
+ * handle the error...
+ * return;
+ * }
+ * ...
+ * }
+ *
+ *
   * Receive and accumulate multiple errors (first one wins):
   * Error *err = NULL, *local_err = NULL;
   * foo(arg, );


Let's explain what should be done *first*, and only then talk about the
deprecated pattern and how to convert it to current usage.


@@ -348,6 +392,43 @@ void error_set_internal(Error **errp,
  ErrorClass err_class, const char *fmt, ...)
  GCC_FMT_ATTR(6, 7);
  
+typedef struct ErrorPropagator {

+Error *local_err;
+Error **errp;
+} ErrorPropagator;
+
+static inline void error_propagator_cleanup(ErrorPropagator *prop)
+{
+error_propagate(prop->errp, prop->local_err);
+}
+
+G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(ErrorPropagator, error_propagator_cleanup);
+
+/*
+ * ERRP_AUTO_PROPAGATE
+ *
+ * This macro is created to be the first line of a function which use
+ * Error **errp parameter to report error. It's needed only in cases where we
+ * want to use error_prepend, error_append_hint or dereference *errp. It's
+ * still safe (but useless) in other cases.
+ *
+ * If errp is NULL or points to error_fatal, it is rewritten to point to a
+ * local Error object, which will be automatically propagated to the original
+ * errp on function exit (see error_propagator_cleanup).
+ *
+ * After invocation of this macro it is always safe to dereference errp
+ * (as it's not NULL anymore) and to 

[PATCH v3] util/async: make bh_aio_poll() O(1)

2020-02-21 Thread Stefan Hajnoczi
The ctx->first_bh list contains all created BHs, including those that
are not scheduled.  The list is iterated by the event loop and therefore
has O(n) time complexity with respected to the number of created BHs.

Rewrite BHs so that only scheduled or deleted BHs are enqueued.
Only BHs that actually require action will be iterated.

One semantic change is required: qemu_bh_delete() enqueues the BH and
therefore invokes aio_notify().  The
tests/test-aio.c:test_source_bh_delete_from_cb() test case assumed that
g_main_context_iteration(NULL, false) returns false after
qemu_bh_delete() but it now returns true for one iteration.  Fix up the
test case.

This patch makes aio_compute_timeout() and aio_bh_poll() drop from a CPU
profile reported by perf-top(1).  Previously they combined to 9% CPU
utilization when AioContext polling is commented out and the guest has 2
virtio-blk,num-queues=1 and 99 virtio-blk,num-queues=32 devices.

Signed-off-by: Stefan Hajnoczi 
---
v3:
 * Use QSLIST_FOREACH_RCU() and QSLIST_FIRST_RCU() [Paolo]
v2:
 * Use QSLIST for BHs and QSIMPLEQ for BHListSlices [Paolo]
   (Note that I replaced bh = atomic_rcu_read(_bh) with
QSLIST_FOREACH(_list) so there is no memory ordering but I think
this is safe.)
 * Comment clarifications [Paolo]

Based-on: 20200220103828.24525-1-pbonz...@redhat.com
  ("[PATCH] rcu_queue: add QSLIST functions")
---
 include/block/aio.h |  20 +++-
 tests/test-aio.c|   3 +-
 util/async.c| 237 ++--
 3 files changed, 158 insertions(+), 102 deletions(-)

diff --git a/include/block/aio.h b/include/block/aio.h
index 7ba9bd7874..1a2ce9ca26 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -51,6 +51,19 @@ struct ThreadPool;
 struct LinuxAioState;
 struct LuringState;
 
+/*
+ * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
+ * scheduled BHs are not processed until the next aio_bh_poll() call.  All
+ * active aio_bh_poll() calls chain their slices together in a list, so that
+ * nested aio_bh_poll() calls process all scheduled bottom halves.
+ */
+typedef QSLIST_HEAD(, QEMUBH) BHList;
+typedef struct BHListSlice BHListSlice;
+struct BHListSlice {
+BHList bh_list;
+QSIMPLEQ_ENTRY(BHListSlice) next;
+};
+
 struct AioContext {
 GSource source;
 
@@ -91,8 +104,11 @@ struct AioContext {
  */
 QemuLockCnt list_lock;
 
-/* Anchor of the list of Bottom Halves belonging to the context */
-struct QEMUBH *first_bh;
+/* Bottom Halves pending aio_bh_poll() processing */
+BHList bh_list;
+
+/* Chained BH list slices for each nested aio_bh_poll() call */
+QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
 
 /* Used by aio_notify.
  *
diff --git a/tests/test-aio.c b/tests/test-aio.c
index 86fb73b3d5..8a46078463 100644
--- a/tests/test-aio.c
+++ b/tests/test-aio.c
@@ -615,7 +615,8 @@ static void test_source_bh_delete_from_cb(void)
 g_assert_cmpint(data1.n, ==, data1.max);
 g_assert(data1.bh == NULL);
 
-g_assert(!g_main_context_iteration(NULL, false));
+assert(g_main_context_iteration(NULL, false));
+assert(!g_main_context_iteration(NULL, false));
 }
 
 static void test_source_bh_delete_from_cb_many(void)
diff --git a/util/async.c b/util/async.c
index c192a24a61..b94518b948 100644
--- a/util/async.c
+++ b/util/async.c
@@ -29,6 +29,7 @@
 #include "block/thread-pool.h"
 #include "qemu/main-loop.h"
 #include "qemu/atomic.h"
+#include "qemu/rcu_queue.h"
 #include "block/raw-aio.h"
 #include "qemu/coroutine_int.h"
 #include "trace.h"
@@ -36,16 +37,76 @@
 /***/
 /* bottom halves (can be seen as timers which expire ASAP) */
 
+/* QEMUBH::flags values */
+enum {
+/* Already enqueued and waiting for aio_bh_poll() */
+BH_PENDING   = (1 << 0),
+
+/* Invoke the callback */
+BH_SCHEDULED = (1 << 1),
+
+/* Delete without invoking callback */
+BH_DELETED   = (1 << 2),
+
+/* Delete after invoking callback */
+BH_ONESHOT   = (1 << 3),
+
+/* Schedule periodically when the event loop is idle */
+BH_IDLE  = (1 << 4),
+};
+
 struct QEMUBH {
 AioContext *ctx;
 QEMUBHFunc *cb;
 void *opaque;
-QEMUBH *next;
-bool scheduled;
-bool idle;
-bool deleted;
+QSLIST_ENTRY(QEMUBH) next;
+unsigned flags;
 };
 
+/* Called concurrently from any thread */
+static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
+{
+AioContext *ctx = bh->ctx;
+unsigned old_flags;
+
+/*
+ * The memory barrier implicit in atomic_fetch_or makes sure that:
+ * 1. idle & any writes needed by the callback are done before the
+ *locations are read in the aio_bh_poll.
+ * 2. ctx is loaded before the callback has a chance to execute and bh
+ *could be freed.
+ */
+old_flags = atomic_fetch_or(>flags, BH_PENDING | new_flags);
+if (!(old_flags & BH_PENDING)) {
+QSLIST_INSERT_HEAD_ATOMIC(>bh_list, 

Re: [PATCH v7 01/11] qapi/error: add (Error **errp) cleaning APIs

2020-02-21 Thread Vladimir Sementsov-Ogievskiy

21.02.2020 10:38, Markus Armbruster wrote:

Vladimir Sementsov-Ogievskiy  writes:


Add functions to clean Error **errp: call corresponding Error *err
cleaning function an set pointer to NULL.

New functions:
   error_free_errp
   error_report_errp
   warn_report_errp

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Greg Kurz 
Reviewed-by: Eric Blake 
---

CC: Eric Blake 
CC: Kevin Wolf 
CC: Max Reitz 
CC: Greg Kurz 
CC: Stefano Stabellini 
CC: Anthony Perard 
CC: Paul Durrant 
CC: Stefan Hajnoczi 
CC: "Philippe Mathieu-Daudé" 
CC: Laszlo Ersek 
CC: Gerd Hoffmann 
CC: Stefan Berger 
CC: Markus Armbruster 
CC: Michael Roth 
CC: qemu-block@nongnu.org
CC: xen-de...@lists.xenproject.org

  include/qapi/error.h | 26 ++
  1 file changed, 26 insertions(+)

diff --git a/include/qapi/error.h b/include/qapi/error.h
index ad5b6e896d..d34987148d 100644
--- a/include/qapi/error.h
+++ b/include/qapi/error.h
@@ -309,6 +309,32 @@ void warn_reportf_err(Error *err, const char *fmt, ...)
  void error_reportf_err(Error *err, const char *fmt, ...)
  GCC_FMT_ATTR(2, 3);
  
+/*

+ * Functions to clean Error **errp: call corresponding Error *err cleaning
+ * function, then set pointer to NULL.
+ */
+static inline void error_free_errp(Error **errp)
+{
+assert(errp && *errp);
+error_free(*errp);
+*errp = NULL;
+}
+
+static inline void error_report_errp(Error **errp)
+{
+assert(errp && *errp);
+error_report_err(*errp);
+*errp = NULL;
+}
+
+static inline void warn_report_errp(Error **errp)
+{
+assert(errp && *errp);
+warn_report_err(*errp);
+*errp = NULL;
+}
+
+
  /*
   * Just like error_setg(), except you get to specify the error class.
   * Note: use of error classes other than ERROR_CLASS_GENERIC_ERROR is


These appear to be unused apart from the Coccinelle script in PATCH 03.

They are used in the full "[RFC v5 000/126] error: auto propagated
local_err" series.  Options:

1. Pick a few more patches into this part I series, so these guys come
with users.


It needs some additional effort for this series.. But it's possible. Still,
I think that we at least should not pull out patches which should be in
future series (for example from ppc or block/)..

Grepping through v5:
 for x in {warn_report_errp,error_report_errp,error_free_errp}; do echo == $x 
==; git grep -l $x | grep -v coccinelle | grep -v 'error\.h'; echo; done
== warn_report_errp ==
block/file-posix.c
hw/ppc/spapr.c
hw/ppc/spapr_caps.c
hw/ppc/spapr_irq.c
hw/vfio/pci.c
net/tap.c
qom/object.c

== error_report_errp ==
hw/block/vhost-user-blk.c
util/oslib-posix.c

== error_free_errp ==
block.c
block/qapi.c
block/sheepdog.c
block/snapshot.c
blockdev.c
chardev/char-socket.c
hw/audio/intel-hda.c
hw/core/qdev-properties.c
hw/pci-bridge/pci_bridge_dev.c
hw/pci-bridge/pcie_pci_bridge.c
hw/scsi/megasas.c
hw/scsi/mptsas.c
hw/usb/hcd-xhci.c
io/net-listener.c
migration/colo.c
qga/commands-posix.c
qga/commands-win32.c
util/qemu-sockets.c

What do you want to add?



2. Punt this patch to the first part that has users, along with the
part of the Coccinelle script that deals with them.


But coccinelle script would be wrong, if we drop this part from it. I think,
that after commit which adds coccinelle script, it should work with any file,
not only subset of these series.

So, it's probably OK for now to drop these functions, forcing their addition if
coccinelle script will be applied where these functions are needed. We can, for
example comment these three functions.

Splitting coccinelle script into two parts, which will be in different series 
will
not help any patch-porting processes.

Moreover, this will create dependencies between future series updating other 
files.

So, I don't like [2.]..



3. Do nothing: accept the functions without users.


OK for me)



I habitually dislike 3., but reviewing the rest of this series might
make me override that dislike.





same grep with maintainers:
 for x in {warn_report_errp,error_report_errp,error_free_errp}; do echo == $x 
==; git grep -l $x | grep -v coccinelle | grep -v 'error\.h' | while read f; do 
echo $f; ./scripts/get_maintainer.pl -f --no-rolestats $f | grep -v 
'qemu-block\|qemu-devel' | sed -e 's/^/   /'; done; echo; done
== warn_report_errp ==
block/file-posix.c
   Kevin Wolf 
   Max Reitz 
hw/ppc/spapr.c
   David Gibson 
   qemu-...@nongnu.org
hw/ppc/spapr_caps.c
   David Gibson 
   qemu-...@nongnu.org
hw/ppc/spapr_irq.c
   David Gibson 
   qemu-...@nongnu.org
hw/vfio/pci.c
   Alex Williamson 
net/tap.c
   Jason Wang 
qom/object.c
   Paolo Bonzini 
   "Daniel P. Berrangé" 
   Eduardo Habkost 

== error_report_errp ==
hw/block/vhost-user-blk.c
   "Michael S. Tsirkin" 
   Kevin Wolf 
   Max Reitz 
util/oslib-posix.c
   Paolo Bonzini 

== error_free_errp ==
block.c
   Kevin Wolf 
   Max Reitz 
block/qapi.c
   Markus Armbruster 
   Kevin Wolf 
   Max Reitz 
block/sheepdog.c
   Liu Yuan 
   Kevin Wolf 
   Max Reitz 
   

Re: [PATCH v7 02/11] error: auto propagated local_err

2020-02-21 Thread Markus Armbruster
Vladimir Sementsov-Ogievskiy  writes:

> Here is introduced ERRP_AUTO_PROPAGATE macro, to be used at start of
> functions with an errp OUT parameter.
>
> It has three goals:
>
> 1. Fix issue with error_fatal and error_prepend/error_append_hint: user
> can't see this additional information, because exit() happens in
> error_setg earlier than information is added. [Reported by Greg Kurz]
>
> 2. Fix issue with error_abort and error_propagate: when we wrap
> error_abort by local_err+error_propagate, the resulting coredump will
> refer to error_propagate and not to the place where error happened.
> (the macro itself doesn't fix the issue, but it allows us to [3.] drop
> the local_err+error_propagate pattern, which will definitely fix the
> issue) [Reported by Kevin Wolf]
>
> 3. Drop local_err+error_propagate pattern, which is used to workaround
> void functions with errp parameter, when caller wants to know resulting
> status. (Note: actually these functions could be merely updated to
> return int error code).
>
> To achieve these goals, later patches will add invocations
> of this macro at the start of functions with either use
> error_prepend/error_append_hint (solving 1) or which use
> local_err+error_propagate to check errors, switching those
> functions to use *errp instead (solving 2 and 3).
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Greg Kurz 
> Reviewed-by: Eric Blake 
> ---
>
> CC: Eric Blake 
> CC: Kevin Wolf 
> CC: Max Reitz 
> CC: Greg Kurz 
> CC: Stefano Stabellini 
> CC: Anthony Perard 
> CC: Paul Durrant 
> CC: Stefan Hajnoczi 
> CC: "Philippe Mathieu-Daudé" 
> CC: Laszlo Ersek 
> CC: Gerd Hoffmann 
> CC: Stefan Berger 
> CC: Markus Armbruster 
> CC: Michael Roth 
> CC: qemu-block@nongnu.org
> CC: xen-de...@lists.xenproject.org
>
>  include/qapi/error.h | 83 +++-
>  1 file changed, 82 insertions(+), 1 deletion(-)
>
> diff --git a/include/qapi/error.h b/include/qapi/error.h
> index d34987148d..b9452d4806 100644
> --- a/include/qapi/error.h
> +++ b/include/qapi/error.h
> @@ -78,7 +78,7 @@
>   * Call a function treating errors as fatal:
>   * foo(arg, _fatal);
>   *
> - * Receive an error and pass it on to the caller:
> + * Receive an error and pass it on to the caller (DEPRECATED*):
>   * Error *err = NULL;
>   * foo(arg, );
>   * if (err) {
> @@ -98,6 +98,50 @@
>   * foo(arg, errp);
>   * for readability.
>   *
> + * DEPRECATED* This pattern is deprecated now, the use ERRP_AUTO_PROPAGATE 
> macro
> + * instead (defined below).
> + * It's deprecated because of two things:
> + *
> + * 1. Issue with error_abort and error_propagate: when we wrap error_abort by
> + * local_err+error_propagate, the resulting coredump will refer to
> + * error_propagate and not to the place where error happened.
> + *
> + * 2. A lot of extra code of the same pattern
> + *
> + * How to update old code to use ERRP_AUTO_PROPAGATE?
> + *
> + * All you need is to add ERRP_AUTO_PROPAGATE() invocation at function start,
> + * than you may safely dereference errp to check errors and do not need any
> + * additional local Error variables or calls to error_propagate().
> + *
> + * Example:
> + *
> + * old code
> + *
> + * void fn(..., Error **errp) {
> + * Error *err = NULL;
> + * foo(arg, );
> + * if (err) {
> + * handle the error...
> + * error_propagate(errp, err);
> + * return;
> + * }
> + * ...
> + * }
> + *
> + * updated code
> + *
> + * void fn(..., Error **errp) {
> + * ERRP_AUTO_PROPAGATE();
> + * foo(arg, errp);
> + * if (*errp) {
> + * handle the error...
> + * return;
> + * }
> + * ...
> + * }
> + *
> + *
>   * Receive and accumulate multiple errors (first one wins):
>   * Error *err = NULL, *local_err = NULL;
>   * foo(arg, );

Let's explain what should be done *first*, and only then talk about the
deprecated pattern and how to convert it to current usage.

> @@ -348,6 +392,43 @@ void error_set_internal(Error **errp,
>  ErrorClass err_class, const char *fmt, ...)
>  GCC_FMT_ATTR(6, 7);
>  
> +typedef struct ErrorPropagator {
> +Error *local_err;
> +Error **errp;
> +} ErrorPropagator;
> +
> +static inline void error_propagator_cleanup(ErrorPropagator *prop)
> +{
> +error_propagate(prop->errp, prop->local_err);
> +}
> +
> +G_DEFINE_AUTO_CLEANUP_CLEAR_FUNC(ErrorPropagator, error_propagator_cleanup);
> +
> +/*
> + * ERRP_AUTO_PROPAGATE
> + *
> + * This macro is created to be the first line of a function which use
> + * Error **errp parameter to report error. It's needed only in cases where we
> + * want to use error_prepend, error_append_hint or dereference *errp. It's
> + * still safe (but useless) in other cases.
> + *
> + * If errp is NULL or points to error_fatal, it is rewritten to point to a
> + * local Error object, which will be 

Re: [PATCH v3 19/20] Let cpu_[physical]_memory() calls pass a boolean 'is_write' argument

2020-02-21 Thread Cornelia Huck
On Thu, 20 Feb 2020 14:05:47 +0100
Philippe Mathieu-Daudé  wrote:

> Use an explicit boolean type.
> 
> This commit was produced with the included Coccinelle script
> scripts/coccinelle/exec_rw_const.
> 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  scripts/coccinelle/exec_rw_const.cocci | 14 ++
>  include/exec/cpu-common.h  |  4 ++--
>  hw/display/exynos4210_fimd.c   |  3 ++-
>  hw/display/milkymist-tmu2.c|  8 
>  hw/display/omap_dss.c  |  2 +-
>  hw/display/ramfb.c |  2 +-
>  hw/misc/pc-testdev.c   |  2 +-
>  hw/nvram/spapr_nvram.c |  4 ++--
>  hw/ppc/ppc440_uc.c |  6 --
>  hw/ppc/spapr_hcall.c   |  4 ++--
>  hw/s390x/ipl.c |  2 +-
>  hw/s390x/s390-pci-bus.c|  2 +-
>  hw/s390x/virtio-ccw.c  |  2 +-
>  hw/xen/xen_pt_graphics.c   |  2 +-
>  target/i386/hax-all.c  |  4 ++--
>  target/s390x/excp_helper.c |  2 +-
>  target/s390x/helper.c  |  6 +++---
>  17 files changed, 43 insertions(+), 26 deletions(-)
> 

> diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
> index 7773499d7f..0817874b48 100644
> --- a/hw/s390x/ipl.c
> +++ b/hw/s390x/ipl.c
> @@ -626,7 +626,7 @@ static void s390_ipl_prepare_qipl(S390CPU *cpu)
>  uint8_t *addr;
>  uint64_t len = 4096;
>  
> -addr = cpu_physical_memory_map(cpu->env.psa, , 1);
> +addr = cpu_physical_memory_map(cpu->env.psa, , true);
>  if (!addr || len < QIPL_ADDRESS + sizeof(QemuIplParameters)) {
>  error_report("Cannot set QEMU IPL parameters");
>  return;
> diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
> index 7c6a2b3c63..ed8be124da 100644
> --- a/hw/s390x/s390-pci-bus.c
> +++ b/hw/s390x/s390-pci-bus.c
> @@ -641,7 +641,7 @@ static uint8_t set_ind_atomic(uint64_t ind_loc, uint8_t 
> to_be_set)
>  hwaddr len = 1;
>  uint8_t *ind_addr;
>  
> -ind_addr = cpu_physical_memory_map(ind_loc, , 1);
> +ind_addr = cpu_physical_memory_map(ind_loc, , true);
>  if (!ind_addr) {
>  s390_pci_generate_error_event(ERR_EVENT_AIRERR, 0, 0, 0, 0);
>  return -1;
> diff --git a/hw/s390x/virtio-ccw.c b/hw/s390x/virtio-ccw.c
> index 13f57e7b67..50cf95b781 100644
> --- a/hw/s390x/virtio-ccw.c
> +++ b/hw/s390x/virtio-ccw.c
> @@ -790,7 +790,7 @@ static uint8_t virtio_set_ind_atomic(SubchDev *sch, 
> uint64_t ind_loc,
>  hwaddr len = 1;
>  uint8_t *ind_addr;
>  
> -ind_addr = cpu_physical_memory_map(ind_loc, , 1);
> +ind_addr = cpu_physical_memory_map(ind_loc, , true);
>  if (!ind_addr) {
>  error_report("%s(%x.%x.%04x): unable to access indicator",
>   __func__, sch->cssid, sch->ssid, sch->schid);

> diff --git a/target/s390x/excp_helper.c b/target/s390x/excp_helper.c
> index 1e9d6f20c1..3b58d10df3 100644
> --- a/target/s390x/excp_helper.c
> +++ b/target/s390x/excp_helper.c
> @@ -393,7 +393,7 @@ static int mchk_store_vregs(CPUS390XState *env, uint64_t 
> mcesao)
>  MchkExtSaveArea *sa;
>  int i;
>  
> -sa = cpu_physical_memory_map(mcesao, , 1);
> +sa = cpu_physical_memory_map(mcesao, , true);
>  if (!sa) {
>  return -EFAULT;
>  }
> diff --git a/target/s390x/helper.c b/target/s390x/helper.c
> index a3a49164e4..b810ad431e 100644
> --- a/target/s390x/helper.c
> +++ b/target/s390x/helper.c
> @@ -151,7 +151,7 @@ LowCore *cpu_map_lowcore(CPUS390XState *env)
>  LowCore *lowcore;
>  hwaddr len = sizeof(LowCore);
>  
> -lowcore = cpu_physical_memory_map(env->psa, , 1);
> +lowcore = cpu_physical_memory_map(env->psa, , true);
>  
>  if (len < sizeof(LowCore)) {
>  cpu_abort(env_cpu(env), "Could not map lowcore\n");
> @@ -246,7 +246,7 @@ int s390_store_status(S390CPU *cpu, hwaddr addr, bool 
> store_arch)
>  hwaddr len = sizeof(*sa);
>  int i;
>  
> -sa = cpu_physical_memory_map(addr, , 1);
> +sa = cpu_physical_memory_map(addr, , true);
>  if (!sa) {
>  return -EFAULT;
>  }
> @@ -298,7 +298,7 @@ int s390_store_adtl_status(S390CPU *cpu, hwaddr addr, 
> hwaddr len)
>  hwaddr save = len;
>  int i;
>  
> -sa = cpu_physical_memory_map(addr, , 1);
> +sa = cpu_physical_memory_map(addr, , true);
>  if (!sa) {
>  return -EFAULT;
>  }

s390 parts
Acked-by: Cornelia Huck 




Re: [PATCH v3 08/20] Remove unnecessary cast when using the address_space API

2020-02-21 Thread Cornelia Huck
On Thu, 20 Feb 2020 14:05:36 +0100
Philippe Mathieu-Daudé  wrote:

> This commit was produced with the included Coccinelle script
> scripts/coccinelle/exec_rw_const.
> 
> Two lines in hw/net/dp8393x.c that Coccinelle produced that
> were over 80 characters were re-wrapped by hand.
> 
> Suggested-by: Stefan Weil 
> Signed-off-by: Philippe Mathieu-Daudé 
> ---
>  scripts/coccinelle/exec_rw_const.cocci | 15 +-
>  target/i386/hvf/vmx.h  |  2 +-
>  hw/arm/boot.c  |  6 ++
>  hw/dma/rc4030.c|  4 ++--
>  hw/dma/xlnx-zdma.c |  2 +-
>  hw/net/cadence_gem.c   | 21 +--
>  hw/net/dp8393x.c   | 28 +-
>  hw/s390x/css.c |  4 ++--
>  qtest.c| 12 +--
>  target/i386/hvf/x86_mmu.c  |  2 +-
>  target/i386/whpx-all.c |  2 +-
>  target/s390x/mmu_helper.c  |  2 +-
>  12 files changed, 54 insertions(+), 46 deletions(-)
> 

> diff --git a/hw/s390x/css.c b/hw/s390x/css.c
> index 844caab408..f27f8c45a5 100644
> --- a/hw/s390x/css.c
> +++ b/hw/s390x/css.c
> @@ -875,7 +875,7 @@ static inline int ida_read_next_idaw(CcwDataStream *cds)
>  return -EINVAL; /* channel program check */
>  }
>  ret = address_space_rw(_space_memory, idaw_addr,
> -   MEMTXATTRS_UNSPECIFIED, (void *) ,
> +   MEMTXATTRS_UNSPECIFIED, ,
> sizeof(idaw.fmt2), false);
>  cds->cda = be64_to_cpu(idaw.fmt2);
>  } else {
> @@ -884,7 +884,7 @@ static inline int ida_read_next_idaw(CcwDataStream *cds)
>  return -EINVAL; /* channel program check */
>  }
>  ret = address_space_rw(_space_memory, idaw_addr,
> -   MEMTXATTRS_UNSPECIFIED, (void *) ,
> +   MEMTXATTRS_UNSPECIFIED, ,
> sizeof(idaw.fmt1), false);
>  cds->cda = be64_to_cpu(idaw.fmt1);
>  if (cds->cda & 0x8000) {

> diff --git a/target/s390x/mmu_helper.c b/target/s390x/mmu_helper.c
> index c9f3f34750..0be2f300bb 100644
> --- a/target/s390x/mmu_helper.c
> +++ b/target/s390x/mmu_helper.c
> @@ -106,7 +106,7 @@ static inline bool read_table_entry(CPUS390XState *env, 
> hwaddr gaddr,
>   * We treat them as absolute addresses and don't wrap them.
>   */
>  if (unlikely(address_space_read(cs->as, gaddr, MEMTXATTRS_UNSPECIFIED,
> -(uint8_t *)entry, sizeof(*entry)) !=
> +entry, sizeof(*entry)) !=
>   MEMTX_OK)) {
>  return false;
>  }

s390 parts
Acked-by: Cornelia Huck