Re: [PATCH v1 0/2] Add timeout mechanism to qmp actions

2020-09-18 Thread Zhenyu Ye
On 2020/9/18 22:06, Fam Zheng wrote:
> 
> I can see how blocking in a slow io_submit can cause trouble for the main
> thread. I think one way to fix it (until it's made truly async in new
> kernels) is moving the io_submit call to the thread pool, wrapped in a
> coroutine, perhaps.
>

I'm not sure whether any operation other than io_submit() will block the
main thread.
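
For reference, here is a rough sketch of the approach Fam describes: offload
the potentially blocking io_submit() to a worker thread and wrap it in a
coroutine. It assumes QEMU's existing generic helpers aio_get_thread_pool()
and thread_pool_submit_co(); the SubmitJob struct and the function names are
made up for illustration and are not the actual linux-aio code:

    #include "qemu/osdep.h"
    #include "qemu/coroutine.h"
    #include "block/aio.h"
    #include "block/thread-pool.h"
    #include <libaio.h>

    typedef struct SubmitJob {
        io_context_t ctx;       /* libaio context */
        struct iocb **iocbs;    /* requests to submit */
        int niocbs;
    } SubmitJob;

    /* Runs in a thread-pool worker; may block without stalling the main loop. */
    static int submit_worker(void *opaque)
    {
        SubmitJob *job = opaque;

        return io_submit(job->ctx, job->niocbs, job->iocbs);
    }

    /* Called from coroutine context; yields until the worker has finished. */
    static int coroutine_fn laio_co_submit_offloaded(AioContext *aio_ctx,
                                                     SubmitJob *job)
    {
        ThreadPool *pool = aio_get_thread_pool(aio_ctx);

        return thread_pool_submit_co(pool, submit_worker, job);
    }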

> I'm not sure qmp timeout is a complete solution because we would still
> suffer from a blocked state for a period, in this exact situation before
> the timeout.

In any case, the QMP timeout may serve as a last-resort measure to prevent
the VM soft lockup.  Ideally, after your fix of io_submit, this mechanism
will never be triggered.

Thanks,
Zhenyu



RE: [PATCH v3 06/15] hw/block/nvme: Add support for Namespace Types

2020-09-18 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Friday, September 18, 2020 5:30 PM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-block@nongnu.org; qemu-de...@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v3 06/15] hw/block/nvme: Add support for Namespace
> Types
> 
> On Sep 14 07:14, Dmitry Fomichev wrote:
> > From: Niklas Cassel 
> >
> > Namespace Types introduce a new command set, "I/O Command Sets",
> > that allows the host to retrieve the command sets associated with
> > a namespace. Introduce support for the command set and enable
> > detection for the NVM Command Set.
> >
> > The new workflows for identify commands rely heavily on zero-filled
> > identify structs. E.g., certain CNS commands are defined to return
> > a zero-filled identify struct when an inactive namespace NSID
> > is supplied.
> >
> > Add a helper function in order to avoid code duplication when
> > reporting zero-filled identify structures.
> >
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Dmitry Fomichev 
> > ---
> >  hw/block/nvme.c | 235 +++-
> >  1 file changed, 215 insertions(+), 20 deletions(-)
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 4bd88f4046..d01c1c1e06 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -595,6 +595,33 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> >  return NVME_SUCCESS;
> >  }
> >
> > +static void nvme_fill_data(QEMUSGList *qsg, QEMUIOVector *iov,
> > +   uint64_t offset, uint8_t pattern)
> > +{
> > +ScatterGatherEntry *entry;
> > +uint32_t len, ent_len;
> > +
> > +if (qsg->nsg > 0) {
> > +entry = qsg->sg;
> > +for (len = qsg->size; len > 0; len -= ent_len) {
> > +ent_len = MIN(len, entry->len);
> > +if (offset > ent_len) {
> > +offset -= ent_len;
> > +} else if (offset != 0) {
> > +dma_memory_set(qsg->as, entry->base + offset,
> > +   pattern, ent_len - offset);
> > +offset = 0;
> > +} else {
> > +dma_memory_set(qsg->as, entry->base, pattern, ent_len);
> > +}
> > +entry++;
> > +}
> > +} else if (iov->iov) {
> > +qemu_iovec_memset(iov, offset, pattern,
> > +  iov_size(iov->iov, iov->niov) - offset);
> > +}
> > +}
> > +
> >  static void nvme_rw_cb(void *opaque, int ret)
> >  {
> >  NvmeRequest *req = opaque;
> > @@ -1153,6 +1180,19 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
> >  return NVME_SUCCESS;
> >  }
> >
> > +static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, uint64_t prp1,
> > + uint64_t prp2, NvmeRequest *req)
> > +{
> > +uint16_t status;
> > +
> > +status = nvme_map_prp(n, prp1, prp2, NVME_IDENTIFY_DATA_SIZE, req);
> > +if (status) {
> > +return status;
> > +}
> > +nvme_fill_data(&req->qsg, &req->iov, 0, 0);
> > +return NVME_SUCCESS;
> > +}
> > +
> 
> Instead of doing the filling, why not just directly call nvme_dma_prp
> with an empty NvmeIdCtrl/NvmeIdNs stack allocated variable?

Yes, this should work too; it will probably be simpler.
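
For illustration, a minimal sketch of that suggestion, reusing the
nvme_dma_prp() helper introduced elsewhere in this series and a
zero-initialized buffer on the stack instead of calling nvme_fill_data();
this is only a sketch of the idea, not the actual next revision:

    static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, uint64_t prp1,
                                             uint64_t prp2, NvmeRequest *req)
    {
        /* zero-filled buffer the size of an identify data structure */
        uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};

        return nvme_dma_prp(n, id, sizeof(id), prp1, prp2,
                            DMA_DIRECTION_FROM_DEVICE, req);
    }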


Re: [PATCH v3 06/15] hw/block/nvme: Add support for Namespace Types

2020-09-18 Thread Klaus Jensen
On Sep 14 07:14, Dmitry Fomichev wrote:
> From: Niklas Cassel 
> 
> Namespace Types introduce a new command set, "I/O Command Sets",
> that allows the host to retrieve the command sets associated with
> a namespace. Introduce support for the command set and enable
> detection for the NVM Command Set.
> 
> The new workflows for identify commands rely heavily on zero-filled
> identify structs. E.g., certain CNS commands are defined to return
> a zero-filled identify struct when an inactive namespace NSID
> is supplied.
> 
> Add a helper function in order to avoid code duplication when
> reporting zero-filled identify structures.
> 
> Signed-off-by: Niklas Cassel 
> Signed-off-by: Dmitry Fomichev 
> ---
>  hw/block/nvme.c | 235 +++-
>  1 file changed, 215 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 4bd88f4046..d01c1c1e06 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -595,6 +595,33 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, 
> NvmeNamespace *ns,
>  return NVME_SUCCESS;
>  }
>  
> +static void nvme_fill_data(QEMUSGList *qsg, QEMUIOVector *iov,
> +   uint64_t offset, uint8_t pattern)
> +{
> +ScatterGatherEntry *entry;
> +uint32_t len, ent_len;
> +
> +if (qsg->nsg > 0) {
> +entry = qsg->sg;
> +for (len = qsg->size; len > 0; len -= ent_len) {
> +ent_len = MIN(len, entry->len);
> +if (offset > ent_len) {
> +offset -= ent_len;
> +} else if (offset != 0) {
> +dma_memory_set(qsg->as, entry->base + offset,
> +   pattern, ent_len - offset);
> +offset = 0;
> +} else {
> +dma_memory_set(qsg->as, entry->base, pattern, ent_len);
> +}
> +entry++;
> +}
> +} else if (iov->iov) {
> +qemu_iovec_memset(iov, offset, pattern,
> +  iov_size(iov->iov, iov->niov) - offset);
> +}
> +}
> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>  NvmeRequest *req = opaque;
> @@ -1153,6 +1180,19 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, 
> NvmeRequest *req)
>  return NVME_SUCCESS;
>  }
>  
> +static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, uint64_t prp1,
> + uint64_t prp2, NvmeRequest *req)
> +{
> +uint16_t status;
> +
> +status = nvme_map_prp(n, prp1, prp2, NVME_IDENTIFY_DATA_SIZE, req);
> +if (status) {
> +return status;
> +}
> +nvme_fill_data(&req->qsg, &req->iov, 0, 0);
> +return NVME_SUCCESS;
> +}
> +

Instead of doing the filling, why not just directly call nvme_dma_prp
with an empty NvmeIdCtrl/NvmeIdNs stack allocated variable?




RE: [PATCH v3 10/15] hw/block/nvme: Support Zoned Namespace Command Set

2020-09-18 Thread Dmitry Fomichev
> -Original Message-
> From: Klaus Jensen 
> Sent: Friday, September 18, 2020 5:20 PM
> To: Dmitry Fomichev 
> Cc: Keith Busch ; Klaus Jensen
> ; Kevin Wolf ; Philippe
> Mathieu-Daudé ; Maxim Levitsky
> ; Fam Zheng ; Niklas Cassel
> ; Damien Le Moal ;
> qemu-block@nongnu.org; qemu-de...@nongnu.org; Alistair Francis
> ; Matias Bjorling 
> Subject: Re: [PATCH v3 10/15] hw/block/nvme: Support Zoned Namespace
> Command Set
> 
> On Sep 14 07:14, Dmitry Fomichev wrote:
> > The emulation code has been changed to advertise NVM Command Set
> when
> > "zoned" device property is not set (default) and Zoned Namespace
> > Command Set otherwise.
> >
> > Handlers for three new NVMe commands introduced in Zoned Namespace
> > Command Set specification are added, namely for Zone Management
> > Receive, Zone Management Send and Zone Append.
> >
> > Device initialization code has been extended to create a proper
> > configuration for zoned operation using device properties.
> >
> > Read/Write command handler is modified to only allow writes at the
> > write pointer if the namespace is zoned. For Zone Append command,
> > writes implicitly happen at the write pointer and the starting write
> > pointer value is returned as the result of the command. Write Zeroes
> > handler is modified to add zoned checks that are identical to those
> > done as a part of Write flow.
> >
> > The code to support for Zone Descriptor Extensions is not included in
> > this commit and ZDES 0 is always reported. A later commit in this
> > series will add ZDE support.
> >
> > This commit doesn't yet include checks for active and open zone
> > limits. It is assumed that there are no limits on either active or
> > open zones.
> >
> > Signed-off-by: Niklas Cassel 
> > Signed-off-by: Hans Holmberg 
> > Signed-off-by: Ajay Joshi 
> > Signed-off-by: Chaitanya Kulkarni 
> > Signed-off-by: Matias Bjorling 
> > Signed-off-by: Aravind Ramesh 
> > Signed-off-by: Shin'ichiro Kawasaki 
> > Signed-off-by: Adam Manzanares 
> > Signed-off-by: Dmitry Fomichev 
> > ---
> >  hw/block/nvme.c | 968 ++--
> >  1 file changed, 939 insertions(+), 29 deletions(-)
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index c740cbcf1f..0bf7471815 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -709,11 +965,77 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
> >  return status;
> >  }
> >
> > +if (n->params.zoned) {
> > +zone_idx = nvme_zone_idx(n, slba);
> > +assert(zone_idx < n->num_zones);
> > +zone = &ns->zone_array[zone_idx];
> > +
> > +if (is_write) {
> > +status = nvme_check_zone_write(zone, slba, nlb);
> > +if (status != NVME_SUCCESS) {
> > +trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
> > +return status | NVME_DNR;
> > +}
> > +
> > +assert(nvme_wp_is_valid(zone));
> > +if (append) {
> > +if (unlikely(slba != zone->d.zslba)) {
> > +trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
> > +return NVME_ZONE_INVALID_WRITE | NVME_DNR;
> > +}
> > +if (data_size > (n->page_size << n->zasl)) {
> > +trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +slba = zone->d.wp;
> 
> Your append is broken.
> 
> Since you moved the write pointer update to post aio completion (which I
> totally agree with), you are now assigning the same SLBA to all appends
> that are in queue when nvme_process_sq is invoked.

Yes, these newer changes to update the WP in the aio callback turned out to
be buggy. Will fix this in the next version.
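
One possible shape of the fix, sketched against the hunk quoted above: keep
updating zone->d.wp in the aio completion callback, but advance a separate
in-flight cursor (zone->w_ptr, a hypothetical field) synchronously in
nvme_rw(), so that each queued append is assigned a distinct SLBA:

    if (append) {
        if (unlikely(slba != zone->d.zslba)) {
            return NVME_ZONE_INVALID_WRITE | NVME_DNR;
        }
        if (data_size > (n->page_size << n->zasl)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
        /* use the in-flight cursor, not the committed write pointer */
        slba = zone->w_ptr;
    }

    /* reserve the LBAs before the aio is issued, so the next queued
     * append sees an advanced cursor */
    zone->w_ptr += nlb;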


Re: [PATCH v3 01/15] hw/block/nvme: Define 64 bit cqe.result

2020-09-18 Thread Keith Busch
On Tue, Sep 15, 2020 at 10:48:49PM +0200, Klaus Jensen wrote:
> On Sep 15 20:44, Dmitry Fomichev wrote:
> > 
> > It is not necessary to change it in NST patch since result64 field is not 
> > used
> > in that patch. But if you insist, I can move it to NST patch :)
> 
> You are right of course, but since it is a change to the "spec" related
> data structures that go into include/block/nvme.h, I think it belongs in
> "hw/block/nvme: Introduce the Namespace Types definitions".

Just my $.02: unless you're making external APIs, I really find it easier
to review internal changes inline with the patches that use them.

Another example: there are patches in this series that introduce trace
points, but the patches that use them come later. I find that harder to
review since I need to look at two different patches to understand their
value.

I realize this particular patch is implementing a spec feature, but I'd
prefer to see how it's used instead of making a round trip to the spec.



Re: [PATCH v3 10/15] hw/block/nvme: Support Zoned Namespace Command Set

2020-09-18 Thread Klaus Jensen
On Sep 14 07:14, Dmitry Fomichev wrote:
> The emulation code has been changed to advertise NVM Command Set when
> "zoned" device property is not set (default) and Zoned Namespace
> Command Set otherwise.
> 
> Handlers for three new NVMe commands introduced in Zoned Namespace
> Command Set specification are added, namely for Zone Management
> Receive, Zone Management Send and Zone Append.
> 
> Device initialization code has been extended to create a proper
> configuration for zoned operation using device properties.
> 
> Read/Write command handler is modified to only allow writes at the
> write pointer if the namespace is zoned. For Zone Append command,
> writes implicitly happen at the write pointer and the starting write
> pointer value is returned as the result of the command. Write Zeroes
> handler is modified to add zoned checks that are identical to those
> done as a part of Write flow.
> 
> The code to support for Zone Descriptor Extensions is not included in
> this commit and ZDES 0 is always reported. A later commit in this
> series will add ZDE support.
> 
> This commit doesn't yet include checks for active and open zone
> limits. It is assumed that there are no limits on either active or
> open zones.
> 
> Signed-off-by: Niklas Cassel 
> Signed-off-by: Hans Holmberg 
> Signed-off-by: Ajay Joshi 
> Signed-off-by: Chaitanya Kulkarni 
> Signed-off-by: Matias Bjorling 
> Signed-off-by: Aravind Ramesh 
> Signed-off-by: Shin'ichiro Kawasaki 
> Signed-off-by: Adam Manzanares 
> Signed-off-by: Dmitry Fomichev 
> ---
>  hw/block/nvme.c | 968 ++--
>  1 file changed, 939 insertions(+), 29 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index c740cbcf1f..0bf7471815 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -709,11 +965,77 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
>  return status;
>  }
>  
> +if (n->params.zoned) {
> +zone_idx = nvme_zone_idx(n, slba);
> +assert(zone_idx < n->num_zones);
> +zone = &ns->zone_array[zone_idx];
> +
> +if (is_write) {
> +status = nvme_check_zone_write(zone, slba, nlb);
> +if (status != NVME_SUCCESS) {
> +trace_pci_nvme_err_zone_write_not_ok(slba, nlb, status);
> +return status | NVME_DNR;
> +}
> +
> +assert(nvme_wp_is_valid(zone));
> +if (append) {
> +if (unlikely(slba != zone->d.zslba)) {
> +trace_pci_nvme_err_append_not_at_start(slba, 
> zone->d.zslba);
> +return NVME_ZONE_INVALID_WRITE | NVME_DNR;
> +}
> +if (data_size > (n->page_size << n->zasl)) {
> +trace_pci_nvme_err_append_too_large(slba, nlb, n->zasl);
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +slba = zone->d.wp;

Your append is broken.

Since you moved the write pointer update to post aio completion (which I
totally agree with), you are now assigning the same SLBA to all appends
that are in queue when nvme_process_sq is invoked.




[PATCH v2 11/17] hw/block/nvme: harden cmb access

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Since the controller has only supported PRPs so far, it has not been
necessary to check the ending address (addr + len - 1) of a CMB access
for validity, because it was guaranteed to be in range of the CMB.

This changes when the controller adds support for SGLs (next patch), so
add that check.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7e6cbd3f1058..a440f6617f70 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -142,7 +142,12 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr 
addr)
 
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
+hwaddr hi = addr + size - 1;
+if (hi < addr) {
+return 1;
+}
+
+if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
 return 0;
 }
-- 
2.28.0




[PATCH v2 13/17] hw/block/nvme: add support for sgl bit bucket descriptor

2020-09-18 Thread Klaus Jensen
From: Gollu Appalanaidu 

This adds support for SGL descriptor type 0x1 (bit bucket descriptor).
See the NVM Express v1.3d specification, Section 4.4 ("Scatter Gather
List (SGL)").

Signed-off-by: Gollu Appalanaidu 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e01b4b22882e..d2e211c7cbc2 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -430,6 +430,10 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 uint8_t type = NVME_SGL_TYPE(segment[i].type);
 
 switch (type) {
+case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
+if (req->cmd.opcode == NVME_CMD_WRITE) {
+continue;
+}
 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
 break;
 case NVME_SGL_DESCR_TYPE_SEGMENT:
@@ -440,6 +444,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 }
 
 dlen = le32_to_cpu(segment[i].len);
+
 if (!dlen) {
 continue;
 }
@@ -460,6 +465,11 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 }
 
 trans_len = MIN(*len, dlen);
+
+if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
+goto next;
+}
+
 addr = le64_to_cpu(segment[i].addr);
 
 if (UINT64_MAX - addr < dlen) {
@@ -471,6 +481,7 @@ static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList 
*qsg,
 return status;
 }
 
+next:
 *len -= trans_len;
 }
 
@@ -540,7 +551,8 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, 
QEMUIOVector *iov,
 seg_len = le32_to_cpu(sgld->len);
 
 /* check the length of the (Last) Segment descriptor */
-if (!seg_len || seg_len & 0xf) {
+if ((!seg_len || seg_len & 0xf) &&
+(NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
 }
 
@@ -577,19 +589,27 @@ static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList 
*qsg, QEMUIOVector *iov,
 
 last_sgld = &segment[nsgld - 1];
 
-/* if the segment ends with a Data Block, then we are done */
-if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+/*
+ * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
+ * then we are done.
+ */
+switch (NVME_SGL_TYPE(last_sgld->type)) {
+case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
 status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
 if (status) {
 goto unmap;
 }
 
 goto out;
+
+default:
+break;
 }
 
 /*
- * If the last descriptor was not a Data Block, then the current
- * segment must not be a Last Segment.
+ * If the last descriptor was not a Data Block or Bit Bucket, then the
+ * current segment must not be a Last Segment.
  */
 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
@@ -2654,7 +2674,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
*pci_dev)
 id->nn = cpu_to_le32(n->num_namespaces);
 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
NVME_ONCS_FEATURES);
-id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
+id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
+   NVME_CTRL_SGLS_BITBUCKET);
 
 subnqn = g_strdup_printf("nqn.2019-08.org.qemu:%s", n->params.serial);
 strpadcpy((char *)id->subnqn, sizeof(id->subnqn), subnqn, '\0');
-- 
2.28.0




[PATCH v2 07/17] hw/block/nvme: fix endian conversion

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

The raw NLB field is a 16 bit value, so use le16_to_cpu instead of
le32_to_cpu and cast to uint32_t before incrementing the value to not
wrap around.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 62db87460413..32267a3e4782 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -645,7 +645,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest 
*req)
 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
 uint64_t slba = le64_to_cpu(rw->slba);
-uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t offset = nvme_l2b(ns, slba);
 uint32_t count = nvme_l2b(ns, nlb);
 uint16_t status;
@@ -669,7 +669,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
-uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
+uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
 
 uint64_t data_size = nvme_l2b(ns, nlb);
-- 
2.28.0




[PATCH v2 17/17] hw/block/nvme: change controller pci id

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

There are two reasons for changing this:

  1. The nvme device currently uses an internal Intel device id.

  2. Since commits "nvme: fix write zeroes offset and count" and "nvme:
 support multiple namespaces" the controller device no longer has
 the quirks that the Linux kernel thinks it has.

 As the quirks are applied based on pci vendor and device id, change
 the ids to get rid of the quirks.

To keep backward compatibility, add a new 'use-intel-id' parameter to
the nvme device to force use of the Intel vendor and device id. This is
off by default but add a compat property to set this for 5.1 machines
and older. If a 5.1 machine is booted (or the use-intel-id parameter is
explicitly set to true), the Linux kernel will just apply these
unnecessary quirks:

  1. NVME_QUIRK_IDENTIFY_CNS which says that the device does not support
 anything other than values 0x0 and 0x1 for CNS (Identify Namespace
 and Identify Controller). With multiple namespace support, this just
 means that the kernel will "scan" namespaces instead of using
 "Active Namespace ID list" (CNS 0x2).

  2. NVME_QUIRK_DISABLE_WRITE_ZEROES. The nvme device started out with a
 broken Write Zeroes implementation which has since been fixed in
 commit 9d6459d21a6e ("nvme: fix write zeroes offset and count").

Signed-off-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.h   |  1 +
 hw/block/nvme.c   | 12 ++--
 hw/core/machine.c |  1 +
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index a8b98a1c1ea9..ccf52ac7bb82 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -15,6 +15,7 @@ typedef struct NvmeParams {
 uint8_t  aerl;
 uint32_t aer_max_queued;
 uint8_t  mdts;
+bool use_intel_id;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e733a2b1321..c00e78b85f51 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2681,6 +2681,15 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 
 pci_conf[PCI_INTERRUPT_PIN] = 1;
 pci_config_set_prog_interface(pci_conf, 0x2);
+
+if (n->params.use_intel_id) {
+pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
+pci_config_set_device_id(pci_conf, 0x5845);
+} else {
+pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
+pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
+}
+
 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
 pcie_endpoint_cap_init(pci_dev, 0x80);
 
@@ -2834,6 +2843,7 @@ static Property nvme_props[] = {
 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
+DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2850,8 +2860,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
 pc->realize = nvme_realize;
 pc->exit = nvme_exit;
 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
-pc->vendor_id = PCI_VENDOR_ID_INTEL;
-pc->device_id = 0x5845;
 pc->revision = 2;
 
 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/core/machine.c b/hw/core/machine.c
index ea26d612374d..944120cf19c8 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -34,6 +34,7 @@ GlobalProperty hw_compat_5_1[] = {
 { "vhost-user-scsi", "num_queues", "1"},
 { "virtio-blk-device", "num-queues", "1"},
 { "virtio-scsi-device", "num_queues", "1"},
+{ "nvme", "use-intel-id", "on"},
 };
 const size_t hw_compat_5_1_len = G_N_ELEMENTS(hw_compat_5_1);
 
-- 
2.28.0




[PATCH v2 10/17] hw/block/nvme: default request status to success

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Make the default request status NVME_SUCCESS so only error status codes
have to be set.

Signed-off-by: Klaus Jensen 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/block/nvme.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e6790c196d2b..7e6cbd3f1058 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -230,6 +230,7 @@ static void nvme_req_clear(NvmeRequest *req)
 {
 req->ns = NULL;
 memset(&req->cqe, 0x0, sizeof(req->cqe));
+req->status = NVME_SUCCESS;
 }
 
 static void nvme_req_exit(NvmeRequest *req)
@@ -546,8 +547,6 @@ static void nvme_process_aers(void *opaque)
 result->log_page = event->result.log_page;
 g_free(event);
 
-req->status = NVME_SUCCESS;
-
 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
 result->log_page);
 
@@ -626,7 +625,6 @@ static void nvme_rw_cb(void *opaque, int ret)
 
 if (!ret) {
 block_acct_done(stats, acct);
-req->status = NVME_SUCCESS;
 } else {
 uint16_t status;
 
-- 
2.28.0




[PATCH v2 14/17] hw/block/nvme: refactor identify active namespace id list

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Prepare to support inactive namespaces.

Signed-off-by: Klaus Jensen 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d2e211c7cbc2..f533eec55c5c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1471,7 +1471,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 uint32_t min_nsid = le32_to_cpu(c->nsid);
 uint32_t *list;
 uint16_t ret;
-int i, j = 0;
+int j = 0;
 
 trace_pci_nvme_identify_nslist(min_nsid);
 
@@ -1486,11 +1486,11 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req)
 }
 
 list = g_malloc0(data_len);
-for (i = 0; i < n->num_namespaces; i++) {
-if (i < min_nsid) {
+for (int i = 1; i <= n->num_namespaces; i++) {
+if (i <= min_nsid) {
 continue;
 }
-list[j++] = cpu_to_le32(i + 1);
+list[j++] = cpu_to_le32(i);
 if (j == data_len / sizeof(uint32_t)) {
 break;
 }
-- 
2.28.0




[PATCH v2 15/17] hw/block/nvme: support multiple namespaces

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

This adds support for multiple namespaces by introducing a new 'nvme-ns'
device model. The nvme device creates a bus named from the device name
('id'). The nvme-ns devices then connect to this bus and register
themselves with the nvme device.

This changes how an nvme device is created. Example with two namespaces:

  -drive file=nvme0n1.img,if=none,id=disk1
  -drive file=nvme0n2.img,if=none,id=disk2
  -device nvme,serial=deadbeef,id=nvme0
  -device nvme-ns,drive=disk1,bus=nvme0,nsid=1
  -device nvme-ns,drive=disk2,bus=nvme0,nsid=2

The drive property is kept on the nvme device to keep the change
backward compatible, but the property is now optional. Specifying a
drive for the nvme device will always create the namespace with nsid 1.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Reviewed-by: Keith Busch 
Reviewed-by: Minwoo Im 
---
 hw/block/nvme-ns.h|  74 +
 hw/block/nvme.h   |  46 
 hw/block/nvme-ns.c| 167 
 hw/block/nvme.c   | 249 +++---
 hw/block/meson.build  |   2 +-
 hw/block/trace-events |   6 +-
 6 files changed, 428 insertions(+), 116 deletions(-)
 create mode 100644 hw/block/nvme-ns.h
 create mode 100644 hw/block/nvme-ns.c

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
new file mode 100644
index ..83734f4606e1
--- /dev/null
+++ b/hw/block/nvme-ns.h
@@ -0,0 +1,74 @@
+/*
+ * QEMU NVM Express Virtual Namespace
+ *
+ * Copyright (c) 2019 CNEX Labs
+ * Copyright (c) 2020 Samsung Electronics
+ *
+ * Authors:
+ *  Klaus Jensen  
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See the
+ * COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef NVME_NS_H
+#define NVME_NS_H
+
+#define TYPE_NVME_NS "nvme-ns"
+#define NVME_NS(obj) \
+OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
+
+typedef struct NvmeNamespaceParams {
+uint32_t nsid;
+} NvmeNamespaceParams;
+
+typedef struct NvmeNamespace {
+DeviceState  parent_obj;
+BlockConfblkconf;
+int32_t  bootindex;
+int64_t  size;
+NvmeIdNs id_ns;
+
+NvmeNamespaceParams params;
+} NvmeNamespace;
+
+static inline uint32_t nvme_nsid(NvmeNamespace *ns)
+{
+if (ns) {
+return ns->params.nsid;
+}
+
+return -1;
+}
+
+static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
+{
+NvmeIdNs *id_ns = &ns->id_ns;
+return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
+}
+
+static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
+{
+return nvme_ns_lbaf(ns)->ds;
+}
+
+/* calculate the number of LBAs that the namespace can accommodate */
+static inline uint64_t nvme_ns_nlbas(NvmeNamespace *ns)
+{
+return ns->size >> nvme_ns_lbads(ns);
+}
+
+/* convert an LBA to the equivalent in bytes */
+static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
+{
+return lba << nvme_ns_lbads(ns);
+}
+
+typedef struct NvmeCtrl NvmeCtrl;
+
+int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
+void nvme_ns_drain(NvmeNamespace *ns);
+void nvme_ns_flush(NvmeNamespace *ns);
+
+#endif /* NVME_NS_H */
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 5741e93a0fb9..a8b98a1c1ea9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -2,6 +2,9 @@
 #define HW_NVME_H
 
 #include "block/nvme.h"
+#include "nvme-ns.h"
+
+#define NVME_MAX_NAMESPACES 256
 
 typedef struct NvmeParams {
 char *serial;
@@ -101,26 +104,12 @@ typedef struct NvmeCQueue {
 QTAILQ_HEAD(, NvmeRequest) req_list;
 } NvmeCQueue;
 
-typedef struct NvmeNamespace {
-NvmeIdNsid_ns;
-} NvmeNamespace;
+#define TYPE_NVME_BUS "nvme-bus"
+#define NVME_BUS(obj) OBJECT_CHECK(NvmeBus, (obj), TYPE_NVME_BUS)
 
-static inline NvmeLBAF *nvme_ns_lbaf(NvmeNamespace *ns)
-{
-NvmeIdNs *id_ns = &ns->id_ns;
-return &id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(id_ns->flbas)];
-}
-
-static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
-{
-return nvme_ns_lbaf(ns)->ds;
-}
-
-/* convert an LBA to the equivalent in bytes */
-static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
-{
-return lba << nvme_ns_lbads(ns);
-}
+typedef struct NvmeBus {
+BusState parent_bus;
+} NvmeBus;
 
 #define TYPE_NVME "nvme"
 #define NVME(obj) \
@@ -132,6 +121,7 @@ typedef struct NvmeFeatureVal {
 uint16_t temp_thresh_low;
 };
 uint32_tasync_config;
+uint32_tvwc;
 } NvmeFeatureVal;
 
 typedef struct NvmeCtrl {
@@ -139,8 +129,9 @@ typedef struct NvmeCtrl {
 MemoryRegion iomem;
 MemoryRegion ctrl_mem;
 NvmeBar  bar;
-BlockConfconf;
 NvmeParams   params;
+NvmeBus  bus;
+BlockConfconf;
 
 boolqs_created;
 uint32_tpage_size;
@@ -151,7 +142,6 @@ typedef struct NvmeCtrl {
 uint32_treg_size;
 uint32_tnum_namespaces;
 uint32_tmax_q_ents;
-uint64_tns_size;
 uint8_t outstanding_aers;
 uint8_t *cmbuf;
 uint32_tirq_status;
@@

[PATCH v2 12/17] hw/block/nvme: add support for scatter gather lists

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 include/block/nvme.h  |   6 +-
 hw/block/nvme.c   | 329 ++
 hw/block/trace-events |   4 +
 3 files changed, 279 insertions(+), 60 deletions(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index 65e68a82c897..58647bcdad0b 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -412,9 +412,9 @@ typedef union NvmeCmdDptr {
 } NvmeCmdDptr;
 
 enum NvmePsdt {
-PSDT_PRP = 0x0,
-PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
-PSDT_SGL_MPTR_SGL= 0x2,
+NVME_PSDT_PRP = 0x0,
+NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
+NVME_PSDT_SGL_MPTR_SGL= 0x2,
 };
 
 typedef struct QEMU_PACKED NvmeCmd {
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a440f6617f70..e01b4b22882e 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -413,13 +413,262 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 return NVME_SUCCESS;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
- uint64_t prp1, uint64_t prp2, DMADirection dir,
+/*
+ * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
+ * number of bytes mapped in len.
+ */
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+  QEMUIOVector *iov,
+  NvmeSglDescriptor *segment, uint64_t nsgld,
+  size_t *len, NvmeRequest *req)
+{
+dma_addr_t addr, trans_len;
+uint32_t dlen;
+uint16_t status;
+
+for (int i = 0; i < nsgld; i++) {
+uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+switch (type) {
+case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+break;
+case NVME_SGL_DESCR_TYPE_SEGMENT:
+case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+default:
+return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+}
+
+dlen = le32_to_cpu(segment[i].len);
+if (!dlen) {
+continue;
+}
+
+if (*len == 0) {
+/*
+ * All data has been mapped, but the SGL contains additional
+ * segments and/or descriptors. The controller might accept
+ * ignoring the rest of the SGL.
+ */
+uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+break;
+}
+
+trace_pci_nvme_err_invalid_sgl_excess_length(nvme_cid(req));
+return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+}
+
+trans_len = MIN(*len, dlen);
+addr = le64_to_cpu(segment[i].addr);
+
+if (UINT64_MAX - addr < dlen) {
+return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+}
+
+status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+if (status) {
+return status;
+}
+
+*len -= trans_len;
+}
+
+return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+ NvmeSglDescriptor sgl, size_t len,
  NvmeRequest *req)
+{
+/*
+ * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+ * dynamically allocating a potentially huge SGL. The spec allows the SGL
+ * to be larger (as in number of bytes required to describe the SGL
+ * descriptors and segment chain) than the command transfer size, so it is
+ * not bounded by MDTS.
+ */
+const int SEG_CHUNK_SIZE = 256;
+
+NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+uint64_t nsgld;
+uint32_t seg_len;
+uint16_t status;
+bool sgl_in_cmb = false;
+hwaddr addr;
+int ret;
+
+sgld = &sgl;
+addr = le64_to_cpu(sgl.addr);
+
+trace_pci_nvme_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), len);
+
+/*
+ * If the entire transfer can be described with a single data block it can
+ * be mapped directly.
+ */
+if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+if (status) {
+goto unmap;
+}
+
+goto out;
+}
+
+/*
+ * If the segment is located in the CMB, the submission queue of the
+ * request must also reside there.
+ */
+if (nvme_addr_is_cmb(n, addr)) {
+if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+}
+
+sgl_in_cmb = true;
+}
+
+for (;;) {
+switch (NVME_SGL_TYPE(sgld->type)) {
+case NVME_SGL_DESCR

[PATCH v2 09/17] hw/block/nvme: refactor aio submission

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

This pulls block layer aio submission/completion to common functions.

For completions, additionally map an AIO error to the Unrecovered Read
and Write Fault status codes.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h   |  25 
 hw/block/nvme.c   | 136 +-
 hw/block/trace-events |   4 +-
 3 files changed, 124 insertions(+), 41 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index ce9e931420d7..5741e93a0fb9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -32,6 +32,17 @@ typedef struct NvmeRequest {
 QTAILQ_ENTRY(NvmeRequest)entry;
 } NvmeRequest;
 
+static inline bool nvme_req_is_write(NvmeRequest *req)
+{
+switch (req->cmd.opcode) {
+case NVME_CMD_WRITE:
+case NVME_CMD_WRITE_ZEROES:
+return true;
+default:
+return false;
+}
+}
+
 static inline const char *nvme_adm_opc_str(uint8_t opc)
 {
 switch (opc) {
@@ -171,4 +182,18 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, 
NvmeNamespace *ns)
 return n->ns_size >> nvme_ns_lbads(ns);
 }
 
+static inline NvmeCQueue *nvme_cq(NvmeRequest *req)
+{
+NvmeSQueue *sq = req->sq;
+NvmeCtrl *n = sq->ctrl;
+
+return n->cq[sq->cqid];
+}
+
+static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
+{
+NvmeSQueue *sq = req->sq;
+return sq->ctrl;
+}
+
 #endif /* HW_NVME_H */
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0b2aa6f92e7f..e6790c196d2b 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -614,30 +614,108 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, 
NvmeNamespace *ns,
 static void nvme_rw_cb(void *opaque, int ret)
 {
 NvmeRequest *req = opaque;
-NvmeSQueue *sq = req->sq;
-NvmeCtrl *n = sq->ctrl;
-NvmeCQueue *cq = n->cq[sq->cqid];
+NvmeCtrl *n = nvme_ctrl(req);
 
-trace_pci_nvme_rw_cb(nvme_cid(req));
+BlockBackend *blk = n->conf.blk;
+BlockAcctCookie *acct = &req->acct;
+BlockAcctStats *stats = blk_get_stats(blk);
+
+Error *local_err = NULL;
+
+trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
 
 if (!ret) {
-block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
+block_acct_done(stats, acct);
 req->status = NVME_SUCCESS;
 } else {
-block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
-req->status = NVME_INTERNAL_DEV_ERROR;
+uint16_t status;
+
+block_acct_failed(stats, acct);
+
+switch (req->cmd.opcode) {
+case NVME_CMD_READ:
+status = NVME_UNRECOVERED_READ;
+break;
+case NVME_CMD_FLUSH:
+case NVME_CMD_WRITE:
+case NVME_CMD_WRITE_ZEROES:
+status = NVME_WRITE_FAULT;
+break;
+default:
+status = NVME_INTERNAL_DEV_ERROR;
+break;
+}
+
+trace_pci_nvme_err_aio(nvme_cid(req), strerror(ret), status);
+
+error_setg_errno(&local_err, -ret, "aio failed");
+error_report_err(local_err);
+
+req->status = status;
 }
 
-nvme_enqueue_req_completion(cq, req);
+nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
+static uint16_t nvme_do_aio(BlockBackend *blk, int64_t offset, size_t len,
+NvmeRequest *req)
+{
+BlockAcctCookie *acct = &req->acct;
+BlockAcctStats *stats = blk_get_stats(blk);
+
+bool is_write;
+
+trace_pci_nvme_do_aio(nvme_cid(req), req->cmd.opcode,
+  nvme_io_opc_str(req->cmd.opcode), blk_name(blk),
+  offset, len);
+
+switch (req->cmd.opcode) {
+case NVME_CMD_FLUSH:
+block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+req->aiocb = blk_aio_flush(blk, nvme_rw_cb, req);
+break;
+
+case NVME_CMD_WRITE_ZEROES:
+block_acct_start(stats, acct, len, BLOCK_ACCT_WRITE);
+req->aiocb = blk_aio_pwrite_zeroes(blk, offset, len,
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
+   req);
+break;
+
+case NVME_CMD_READ:
+case NVME_CMD_WRITE:
+is_write = nvme_req_is_write(req);
+
+block_acct_start(stats, acct, len,
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+if (req->qsg.sg) {
+if (is_write) {
+req->aiocb = dma_blk_write(blk, &req->qsg, offset,
+   BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+} else {
+req->aiocb = dma_blk_read(blk, &req->qsg, offset,
+  BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+}
+} else {
+if (is_write) {
+req->aiocb = blk_aio_pwritev(blk, offset, &req->iov, 0,
+ nvme_rw_cb, req);
+} else {
+req->aiocb = blk_aio_preadv(blk, offset, &req->iov, 0,
+   

[PATCH v2 16/17] pci: allocate pci id for nvme

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

The emulated nvme device (hw/block/nvme.c) is currently using an
internal Intel device id.

Prepare to change that by allocating a device id under the 1b36 (Red
Hat, Inc.) vendor id.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Acked-by: Gerd Hoffmann 
Reviewed-by: Maxim Levitsky 
---
 docs/specs/nvme.txt| 23 +++
 docs/specs/pci-ids.txt |  1 +
 include/hw/pci/pci.h   |  1 +
 MAINTAINERS|  1 +
 4 files changed, 26 insertions(+)
 create mode 100644 docs/specs/nvme.txt

diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
new file mode 100644
index ..56d393884e7a
--- /dev/null
+++ b/docs/specs/nvme.txt
@@ -0,0 +1,23 @@
+NVM Express Controller
+==
+
+The nvme device (-device nvme) emulates an NVM Express Controller.
+
+
+Reference Specifications
+
+
+The device currently implements most mandatory features of NVMe v1.3d, see
+
+  https://nvmexpress.org/resources/specifications/
+
+for the specification.
+
+
+Known issues
+
+
+* The accounting numbers in the SMART/Health log are reset across power cycles
+
+* Interrupt Coalescing is not supported and is disabled by default in violation
+  of the specification.
diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt
index 4d53e5c7d9d5..abbdbca6be38 100644
--- a/docs/specs/pci-ids.txt
+++ b/docs/specs/pci-ids.txt
@@ -63,6 +63,7 @@ PCI devices (other than virtio):
 1b36:000b  PCIe Expander Bridge (-device pxb-pcie)
 1b36:000d  PCI xhci usb host adapter
 1b36:000f  mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c
+1b36:0010  PCIe NVMe device (-device nvme)
 
 All these devices are documented in docs/specs.
 
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 0ff3feec1573..fb3ea21ee1d6 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -106,6 +106,7 @@ extern bool pci_available;
 #define PCI_DEVICE_ID_REDHAT_XHCI0x000d
 #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e
 #define PCI_DEVICE_ID_REDHAT_MDPY0x000f
+#define PCI_DEVICE_ID_REDHAT_NVME0x0010
 #define PCI_DEVICE_ID_REDHAT_QXL 0x0100
 
 #define FMT_PCIBUS  PRIx64
diff --git a/MAINTAINERS b/MAINTAINERS
index 3d17cad19aa0..3f3babeca663 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1880,6 +1880,7 @@ L: qemu-block@nongnu.org
 S: Supported
 F: hw/block/nvme*
 F: tests/qtest/nvme-test.c
+F: docs/specs/nvme.txt
 T: git git://git.infradead.org/qemu-nvme.git nvme-next
 
 megasas
-- 
2.28.0




[PATCH v2 08/17] hw/block/nvme: add symbolic command name to trace events

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Add the symbolic command name to the pci_nvme_{io,admin}_cmd and
pci_nvme_rw trace events.

Signed-off-by: Klaus Jensen 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/block/nvme.h   | 28 
 hw/block/nvme.c   |  8 +---
 hw/block/trace-events |  6 +++---
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 1675c1e0755c..ce9e931420d7 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -32,6 +32,34 @@ typedef struct NvmeRequest {
 QTAILQ_ENTRY(NvmeRequest)entry;
 } NvmeRequest;
 
+static inline const char *nvme_adm_opc_str(uint8_t opc)
+{
+switch (opc) {
+case NVME_ADM_CMD_DELETE_SQ:return "NVME_ADM_CMD_DELETE_SQ";
+case NVME_ADM_CMD_CREATE_SQ:return "NVME_ADM_CMD_CREATE_SQ";
+case NVME_ADM_CMD_GET_LOG_PAGE: return "NVME_ADM_CMD_GET_LOG_PAGE";
+case NVME_ADM_CMD_DELETE_CQ:return "NVME_ADM_CMD_DELETE_CQ";
+case NVME_ADM_CMD_CREATE_CQ:return "NVME_ADM_CMD_CREATE_CQ";
+case NVME_ADM_CMD_IDENTIFY: return "NVME_ADM_CMD_IDENTIFY";
+case NVME_ADM_CMD_ABORT:return "NVME_ADM_CMD_ABORT";
+case NVME_ADM_CMD_SET_FEATURES: return "NVME_ADM_CMD_SET_FEATURES";
+case NVME_ADM_CMD_GET_FEATURES: return "NVME_ADM_CMD_GET_FEATURES";
+case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
+default:return "NVME_ADM_CMD_UNKNOWN";
+}
+}
+
+static inline const char *nvme_io_opc_str(uint8_t opc)
+{
+switch (opc) {
+case NVME_CMD_FLUSH:return "NVME_NVM_CMD_FLUSH";
+case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
+case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
+case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+default:return "NVME_NVM_CMD_UNKNOWN";
+}
+}
+
 typedef struct NvmeSQueue {
 struct NvmeCtrl *ctrl;
 uint16_tsqid;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 32267a3e4782..0b2aa6f92e7f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -678,7 +678,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 uint16_t status;
 
-trace_pci_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+trace_pci_nvme_rw(nvme_cid(req), nvme_io_opc_str(rw->opcode), nlb,
+  data_size, slba);
 
 status = nvme_check_mdts(n, data_size);
 if (status) {
@@ -727,7 +728,7 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
 
 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
-  req->cmd.opcode);
+  req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
 
 if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
 trace_pci_nvme_err_invalid_ns(nsid, n->num_namespaces);
@@ -1584,7 +1585,8 @@ static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
-trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode);
+trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
+ nvme_adm_opc_str(req->cmd.opcode));
 
 switch (req->cmd.opcode) {
 case NVME_ADM_CMD_DELETE_SQ:
diff --git a/hw/block/trace-events b/hw/block/trace-events
index c506ae8b69ef..63d8750dd96e 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -36,9 +36,9 @@ pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, 
prp1=0x%"PRIx64" prp2
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len 
%"PRIu64""
 pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t 
prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 
0x%"PRIx64" num_prps %d"
-pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) 
"cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
-pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" 
sqid %"PRIu16" opc 0x%"PRIx8""
-pci_nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, 
uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+pci_nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode, 
const char *opname) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8" 
opname '%s'"
+pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char 
*opname) "cid %"PRIu16" sqid %"PRIu16" opc 0x%"PRIx8" opname '%s'"
+pci_nvme_rw(uint16_t cid, const char *verb, uint32_t nlb, uint64_t count, 
uint64_t lba) "cid %"PRIu16" '%s' nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
 pci_nvme_rw_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_write_zer

[PATCH v2 04/17] hw/block/nvme: commonize nvme_rw error handling

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Move common error handling to a label.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 8ffe407ef658..78c3ac80fd4c 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -687,20 +687,18 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 status = nvme_check_mdts(n, data_size);
 if (status) {
 trace_pci_nvme_err_mdts(nvme_cid(req), data_size);
-block_acct_invalid(blk_get_stats(n->conf.blk), acct);
-return status;
+goto invalid;
 }
 
 status = nvme_check_bounds(n, ns, slba, nlb);
 if (status) {
 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
-block_acct_invalid(blk_get_stats(n->conf.blk), acct);
-return status;
+goto invalid;
 }
 
-if (nvme_map_dptr(n, data_size, req)) {
-block_acct_invalid(blk_get_stats(n->conf.blk), acct);
-return NVME_INVALID_FIELD | NVME_DNR;
+status = nvme_map_dptr(n, data_size, req);
+if (status) {
+goto invalid;
 }
 
 if (req->qsg.nsg > 0) {
@@ -722,6 +720,10 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 }
 
 return NVME_NO_COMPLETE;
+
+invalid:
+block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+return status;
 }
 
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
-- 
2.28.0




[PATCH v2 06/17] hw/block/nvme: add a lba to bytes helper

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Add the nvme_l2b helper and use it for converting NLB and SLBA to byte
counts and offsets.

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h |  6 ++
 hw/block/nvme.c | 12 
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 52ba794f2e9a..1675c1e0755c 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -77,6 +77,12 @@ static inline uint8_t nvme_ns_lbads(NvmeNamespace *ns)
 return nvme_ns_lbaf(ns)->ds;
 }
 
+/* convert an LBA to the equivalent in bytes */
+static inline size_t nvme_l2b(NvmeNamespace *ns, uint64_t lba)
+{
+return lba << nvme_ns_lbads(ns);
+}
+
 #define TYPE_NVME "nvme"
 #define NVME(obj) \
 OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 10568cbf8761..62db87460413 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -644,12 +644,10 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, 
NvmeRequest *req)
 {
 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
 NvmeNamespace *ns = req->ns;
-const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
-const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
 uint64_t slba = le64_to_cpu(rw->slba);
 uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
-uint64_t offset = slba << data_shift;
-uint32_t count = nlb << data_shift;
+uint64_t offset = nvme_l2b(ns, slba);
+uint32_t count = nvme_l2b(ns, nlb);
 uint16_t status;
 
 trace_pci_nvme_write_zeroes(nvme_cid(req), slba, nlb);
@@ -674,10 +672,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
 uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
 uint64_t slba = le64_to_cpu(rw->slba);
 
-uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
-uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
-uint64_t data_size = (uint64_t)nlb << data_shift;
-uint64_t data_offset = slba << data_shift;
+uint64_t data_size = nvme_l2b(ns, nlb);
+uint64_t data_offset = nvme_l2b(ns, slba);
 int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
 enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 uint16_t status;
-- 
2.28.0




[PATCH v2 00/17] hw/block/nvme: multiple namespaces support

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

This is the next round of my patches for the nvme device.

This includes a bit of cleanup and two new features:

  * support for scatter/gather lists

  * multiple namespaces support through a new nvme-ns device

Finally, the series wraps up by changing the PCI vendor and device ID to get
rid of the internal Intel id and, as a side effect, get rid of some Linux kernel
quirks that no longer apply.

"pci: pass along the return value of dma_memory_rw" has already been posted by
Philippe in another series, but since it is not applied yet, I am including it
here.

Changes for v2
~~

  * Added a new patch ("hw/block/nvme: fix typo in trace event") that does what
it says on the tin.

  * Dropped the "hw/block/nvme: support multiple parallel aios per request"
patch (Keith).

  * hw/block/nvme: add symbolic command name to trace events
Changed to single quote (Philippe)

  * hw/block/nvme: default request status to success
Commit message typo fixed (Philippe)

  * hw/block/nvme: change controller pci id
Do NOT bump the device id for the legacy Intel id (David)

Gollu Appalanaidu (1):
  hw/block/nvme: add support for sgl bit bucket descriptor

Klaus Jensen (16):
  hw/block/nvme: fix typo in trace event
  pci: pass along the return value of dma_memory_rw
  hw/block/nvme: handle dma errors
  hw/block/nvme: commonize nvme_rw error handling
  hw/block/nvme: alignment style fixes
  hw/block/nvme: add a lba to bytes helper
  hw/block/nvme: fix endian conversion
  hw/block/nvme: add symbolic command name to trace events
  hw/block/nvme: refactor aio submission
  hw/block/nvme: default request status to success
  hw/block/nvme: harden cmb access
  hw/block/nvme: add support for scatter gather lists
  hw/block/nvme: refactor identify active namespace id list
  hw/block/nvme: support multiple namespaces
  pci: allocate pci id for nvme
  hw/block/nvme: change controller pci id

 docs/specs/nvme.txt|  23 ++
 docs/specs/pci-ids.txt |   1 +
 hw/block/nvme-ns.h |  74 
 hw/block/nvme.h|  94 -
 include/block/nvme.h   |   6 +-
 include/hw/pci/pci.h   |   4 +-
 hw/block/nvme-ns.c | 167 
 hw/block/nvme.c| 846 ++---
 hw/core/machine.c  |   1 +
 MAINTAINERS|   1 +
 hw/block/meson.build   |   2 +-
 hw/block/trace-events  |  22 +-
 12 files changed, 986 insertions(+), 255 deletions(-)
 create mode 100644 docs/specs/nvme.txt
 create mode 100644 hw/block/nvme-ns.h
 create mode 100644 hw/block/nvme-ns.c

-- 
2.28.0




[PATCH v2 03/17] hw/block/nvme: handle dma errors

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Handling DMA errors gracefully is required for the device to pass the
block/011 test ("disable PCI device while doing I/O") in the blktests
suite.

With this patch the device passes the test by retrying "critical"
transfers (posting of completion entries and processing of submission
queue entries).

If DMA errors occur at any other point in the execution of the command
(say, while mapping the PRPs), the command is aborted with a Data
Transfer Error status code.

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
Reviewed-by: Maxim Levitsky 
---
 hw/block/nvme.c   | 41 +++--
 hw/block/trace-events |  2 ++
 2 files changed, 33 insertions(+), 10 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 63078f600920..8ffe407ef658 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -140,14 +140,14 @@ static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr 
addr)
 return &n->cmbuf[addr - n->ctrl_mem.addr];
 }
 
-static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
+static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
-return;
+return 0;
 }
 
-pci_dma_read(&n->parent_obj, addr, buf, size);
+return pci_dma_read(&n->parent_obj, addr, buf, size);
 }
 
 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
@@ -307,6 +307,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 int num_prps = (len >> n->page_bits) + 1;
 uint16_t status;
 bool prp_list_in_cmb = false;
+int ret;
 
 QEMUSGList *qsg = &req->qsg;
 QEMUIOVector *iov = &req->iov;
@@ -347,7 +348,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
+ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
+if (ret) {
+trace_pci_nvme_err_addr_read(prp2);
+return NVME_DATA_TRAS_ERROR;
+}
 while (len != 0) {
 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
 
@@ -364,8 +369,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, uint64_t prp1, 
uint64_t prp2,
 i = 0;
 nents = (len + n->page_size - 1) >> n->page_bits;
 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
-nvme_addr_read(n, prp_ent, (void *)prp_list,
-prp_trans);
+ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
+ prp_trans);
+if (ret) {
+trace_pci_nvme_err_addr_read(prp_ent);
+return NVME_DATA_TRAS_ERROR;
+}
 prp_ent = le64_to_cpu(prp_list[i]);
 }
 
@@ -457,6 +466,7 @@ static void nvme_post_cqes(void *opaque)
 NvmeCQueue *cq = opaque;
 NvmeCtrl *n = cq->ctrl;
 NvmeRequest *req, *next;
+int ret;
 
 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
 NvmeSQueue *sq;
@@ -466,15 +476,21 @@ static void nvme_post_cqes(void *opaque)
 break;
 }
 
-QTAILQ_REMOVE(&cq->req_list, req, entry);
 sq = req->sq;
 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
 req->cqe.sq_id = cpu_to_le16(sq->sqid);
 req->cqe.sq_head = cpu_to_le16(sq->head);
 addr = cq->dma_addr + cq->tail * n->cqe_size;
+ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
+sizeof(req->cqe));
+if (ret) {
+trace_pci_nvme_err_addr_write(addr);
+timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+  500 * SCALE_MS);
+break;
+}
+QTAILQ_REMOVE(&cq->req_list, req, entry);
 nvme_inc_cq_tail(cq);
-pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
-sizeof(req->cqe));
 nvme_req_exit(req);
 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
 }
@@ -1611,7 +1627,12 @@ static void nvme_process_sq(void *opaque)
 
 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
 addr = sq->dma_addr + sq->head * n->sqe_size;
-nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
+if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
+trace_pci_nvme_err_addr_read(addr);
+timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
+  500 * SCALE_MS);
+break;
+}
 nvme_inc_sq_head(sq);
 
 req = QTAILQ_FIRST(&sq->req_list);
diff --git a/hw/block/trace-events b/hw/bl

[PATCH v2 02/17] pci: pass along the return value of dma_memory_rw

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Some devices might want to know the return value of dma_memory_rw, so
pass it along instead of ignoring it.

There are no existing users of the return value, so this patch should be
safe.

Signed-off-by: Klaus Jensen 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Michael S. Tsirkin 
Acked-by: Keith Busch 
---
 include/hw/pci/pci.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index c13ae1f8580b..0ff3feec1573 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -785,8 +785,7 @@ static inline AddressSpace *pci_get_address_space(PCIDevice 
*dev)
 static inline int pci_dma_rw(PCIDevice *dev, dma_addr_t addr,
  void *buf, dma_addr_t len, DMADirection dir)
 {
-dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
-return 0;
+return dma_memory_rw(pci_get_address_space(dev), addr, buf, len, dir);
 }
 
 static inline int pci_dma_read(PCIDevice *dev, dma_addr_t addr,
-- 
2.28.0




[PATCH v2 05/17] hw/block/nvme: alignment style fixes

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Style fixes.

Signed-off-by: Klaus Jensen 
Reviewed-by: Philippe Mathieu-Daudé 
---
 hw/block/nvme.c | 25 +
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 78c3ac80fd4c..10568cbf8761 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -634,7 +634,7 @@ static void nvme_rw_cb(void *opaque, int ret)
 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
 {
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
- BLOCK_ACCT_FLUSH);
+ BLOCK_ACCT_FLUSH);
 req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
 
 return NVME_NO_COMPLETE;
@@ -663,7 +663,7 @@ static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest 
*req)
 block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
  BLOCK_ACCT_WRITE);
 req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
-BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
+   BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
 return NVME_NO_COMPLETE;
 }
 
@@ -803,7 +803,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
 }
 
 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
-uint16_t sqid, uint16_t cqid, uint16_t size)
+ uint16_t sqid, uint16_t cqid, uint16_t size)
 {
 int i;
 NvmeCQueue *cq;
@@ -1058,7 +1058,8 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
 }
 
 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
-uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
+ uint16_t cqid, uint16_t vector, uint16_t size,
+ uint16_t irq_enabled)
 {
 int ret;
 
@@ -1118,7 +1119,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest 
*req)
 
 cq = g_malloc0(sizeof(*cq));
 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
-NVME_CQ_FLAGS_IEN(qflags));
+ NVME_CQ_FLAGS_IEN(qflags));
 
 /*
  * It is only required to set qs_created when creating a completion queue;
@@ -1520,7 +1521,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 }
 
 if (((n->temperature >= n->features.temp_thresh_hi) ||
-(n->temperature <= n->features.temp_thresh_low)) &&
+ (n->temperature <= n->features.temp_thresh_low)) &&
 NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) 
{
 nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
NVME_AER_INFO_SMART_TEMP_THRESH,
@@ -1770,9 +1771,9 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
-NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
+ NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
-NVME_AQA_ASQS(n->bar.aqa) + 1);
+ NVME_AQA_ASQS(n->bar.aqa) + 1);
 
 nvme_set_timestamp(n, 0ULL);
 
@@ -1782,7 +1783,7 @@ static int nvme_start_ctrl(NvmeCtrl *n)
 }
 
 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
-unsigned size)
+   unsigned size)
 {
 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
@@ -1925,7 +1926,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
"invalid write to PMRSWTP register, ignored");
 return;
 case 0xE14: /* TODO PMRMSC */
- break;
+break;
 default:
 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
"invalid MMIO write,"
@@ -2101,7 +2102,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int 
val)
 }
 
 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
-unsigned size)
+unsigned size)
 {
 NvmeCtrl *n = (NvmeCtrl *)opaque;
 
@@ -2125,7 +2126,7 @@ static const MemoryRegionOps nvme_mmio_ops = {
 };
 
 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
-unsigned size)
+   unsigned size)
 {
 NvmeCtrl *n = (NvmeCtrl *)opaque;
 stn_le_p(&n->cmbuf[addr], size, data);
-- 
2.28.0




[PATCH v2 01/17] hw/block/nvme: fix typo in trace event

2020-09-18 Thread Klaus Jensen
From: Klaus Jensen 

Fix a typo in the sq doorbell trace event.

Signed-off-by: Klaus Jensen 
---
 hw/block/trace-events | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/block/trace-events b/hw/block/trace-events
index ec94c56a4165..8ff4cbc4932c 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -70,7 +70,7 @@ pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, 
uint16_t status) "c
 pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 
0x%"PRIx64""
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" 
new_head %"PRIu16""
-pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" 
new_tail %"PRIu16""
+pci_nvme_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "sqid %"PRIu16" 
new_tail %"PRIu16""
 pci_nvme_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, 
interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 pci_nvme_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, 
interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
 pci_nvme_mmio_cfg(uint64_t data) "wrote MMIO, config controller 
config=0x%"PRIx64""
-- 
2.28.0




Re: [PATCH v2 04/20] block/block-copy: More explicit call_state

2020-09-18 Thread Vladimir Sementsov-Ogievskiy

17.07.2020 16:45, Max Reitz wrote:

On 01.06.20 20:11, Vladimir Sementsov-Ogievskiy wrote:

Refactor common path to use BlockCopyCallState pointer as parameter, to
prepare it for use in asynchronous block-copy (at least, we'll need to
run block-copy in a coroutine, passing the whole parameters as one
pointer).

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/block-copy.c | 51 ++
  1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 43a018d190..75882a094c 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c


[...]


@@ -646,16 +653,16 @@ out:
   * it means that some I/O operation failed in context of _this_ block_copy 
call,
   * not some parallel operation.
   */
-int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
-bool *error_is_read)
+static int coroutine_fn block_copy_common(BlockCopyCallState *call_state)
  {
  int ret;
  
  do {

-ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);
+ret = block_copy_dirty_clusters(call_state);


It’s possible that much of this code will change in a future patch of
this series, but as it is, it might be nice to make
block_copy_dirty_clusters’s argument a const pointer so it’s clear that
the call to block_copy_wait_one() below will use the original @offset
and @bytes values.


Hm. I'm trying this, and it doesn't work:

block_copy_task_entry() wants to change call_state:

   t->call_state->failed = true;



*shrug*

Reviewed-by: Max Reitz 

  
  if (ret == 0) {

-ret = block_copy_wait_one(s, offset, bytes);
+ret = block_copy_wait_one(call_state->s, call_state->offset,
+  call_state->bytes);
  }
  
  /*





--
Best regards,
Vladimir



[PATCH v6 15/15] scripts/simplebench: add bench_prealloc.py

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Benchmark for new preallocate filter.

Example usage:
./bench_prealloc.py ../../build/qemu-img \
ssd-ext4:/path/to/mount/point \
ssd-xfs:/path2 hdd-ext4:/path3 hdd-xfs:/path4

The benchmark shows the performance improvement (or degradation) when
using the new preallocate filter with a qcow2 image.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 scripts/simplebench/bench_prealloc.py | 128 ++
 1 file changed, 128 insertions(+)
 create mode 100755 scripts/simplebench/bench_prealloc.py

diff --git a/scripts/simplebench/bench_prealloc.py 
b/scripts/simplebench/bench_prealloc.py
new file mode 100755
index 00..fda4b3410e
--- /dev/null
+++ b/scripts/simplebench/bench_prealloc.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+#
+# Benchmark preallocate filter
+#
+# Copyright (c) 2020 Virtuozzo International GmbH.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+
+import sys
+import os
+import subprocess
+import re
+
+import simplebench
+
+
+def qemu_img_bench(args):
+p = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+   universal_newlines=True)
+
+if p.returncode == 0:
+try:
+m = re.search(r'Run completed in (\d+.\d+) seconds.', p.stdout)
+return {'seconds': float(m.group(1))}
+except Exception:
+return {'error': f'failed to parse qemu-img output: {p.stdout}'}
+else:
+return {'error': f'qemu-img failed: {p.returncode}: {p.stdout}'}
+
+
+def bench_func(env, case):
+fname = f"{case['dir']}/prealloc-test.qcow2"
+try:
+os.remove(fname)
+except OSError:
+pass
+
+subprocess.run([env['qemu-img-binary'], 'create', '-f', 'qcow2', fname,
+   '16G'], stdout=subprocess.DEVNULL,
+   stderr=subprocess.DEVNULL, check=True)
+
+args = [env['qemu-img-binary'], 'bench', '-c', str(case['count']),
+'-d', '64', '-s', case['block-size'], '-t', 'none', '-n', '-w']
+if env['prealloc']:
+args += ['--image-opts',
+ 'driver=qcow2,file.driver=preallocate,file.file.driver=file,'
+ f'file.file.filename={fname}']
+else:
+args += ['-f', 'qcow2', fname]
+
+return qemu_img_bench(args)
+
+
+def auto_count_bench_func(env, case):
+case['count'] = 100
+while True:
+res = bench_func(env, case)
+if 'error' in res:
+return res
+
+if res['seconds'] >= 1:
+break
+
+case['count'] *= 10
+
+if res['seconds'] < 5:
+case['count'] = round(case['count'] * 5 / res['seconds'])
+res = bench_func(env, case)
+if 'error' in res:
+return res
+
+res['iops'] = case['count'] / res['seconds']
+return res
+
+
+if __name__ == '__main__':
+if len(sys.argv) < 2:
+print(f'USAGE: {sys.argv[0]}  '
+  'DISK_NAME:DIR_PATH ...')
+exit(1)
+
+qemu_img = sys.argv[1]
+
+envs = [
+{
+'id': 'no-prealloc',
+'qemu-img-binary': qemu_img,
+'prealloc': False
+},
+{
+'id': 'prealloc',
+'qemu-img-binary': qemu_img,
+'prealloc': True
+}
+]
+
+aligned_cases = []
+unaligned_cases = []
+
+for disk in sys.argv[2:]:
+name, path = disk.split(':')
+aligned_cases.append({
+'id': f'{name}, aligned sequential 16k',
+'block-size': '16k',
+'dir': path
+})
+unaligned_cases.append({
+'id': f'{name}, unaligned sequential 64k',
+'block-size': '16k',
+'dir': path
+})
+
+result = simplebench.bench(auto_count_bench_func, envs,
+   aligned_cases + unaligned_cases, count=5)
+print(simplebench.ascii(result))
-- 
2.21.3




Re: [PATCH 0/4] nbd reconnect new fixes

2020-09-18 Thread Vladimir Sementsov-Ogievskiy

ping

03.09.2020 22:02, Vladimir Sementsov-Ogievskiy wrote:

Hi! Let's continue fixing the NBD reconnect feature.

This series is based on "[PULL 0/6] NBD patches for 2020-09-02"
Based-on: <20200902215400.2673028-1-ebl...@redhat.com>

Vladimir Sementsov-Ogievskiy (4):
   block/nbd: fix drain dead-lock because of nbd reconnect-delay
   block/nbd: correctly use qio_channel_detach_aio_context when needed
   block/nbd: fix reconnect-delay
   block/nbd: nbd_co_reconnect_loop(): don't connect if drained

  block/nbd.c | 71 -
  1 file changed, 60 insertions(+), 11 deletions(-)




--
Best regards,
Vladimir



[PATCH v6 13/15] scripts/simplebench: improve view of ascii table

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Introduce dynamic float precision and use percentage to show delta.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 scripts/simplebench/simplebench.py | 26 +-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/scripts/simplebench/simplebench.py 
b/scripts/simplebench/simplebench.py
index 716d7fe9b2..56d3a91ea2 100644
--- a/scripts/simplebench/simplebench.py
+++ b/scripts/simplebench/simplebench.py
@@ -79,10 +79,34 @@ def bench_one(test_func, test_env, test_case, count=5, 
initial_run=True):
 return result
 
 
+def format_float(x):
+res = round(x)
+if res >= 100:
+return str(res)
+
+res = f'{x:.1f}'
+if len(res) >= 4:
+return res
+
+return f'{x:.2f}'
+
+
+def format_percent(x):
+x *= 100
+
+res = round(x)
+if res >= 10:
+return str(res)
+
+return f'{x:.1f}' if res >= 1 else f'{x:.2f}'
+
+
 def ascii_one(result):
 """Return ASCII representation of bench_one() returned dict."""
 if 'average' in result:
-s = '{:.2f} +- {:.2f}'.format(result['average'], result['delta'])
+avg = result['average']
+delta_pr = result['delta'] / avg
+s = f'{format_float(avg)}±{format_percent(delta_pr)}%'
 if 'n-failed' in result:
 s += '\n({} failed)'.format(result['n-failed'])
 return s
-- 
2.21.3




[PATCH v6 14/15] scripts/simplebench: improve ascii table: add difference line

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Performance improvements / degradations are usually discussed in
percentages. Let's make the script calculate them for us.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 scripts/simplebench/simplebench.py | 46 +++---
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/scripts/simplebench/simplebench.py 
b/scripts/simplebench/simplebench.py
index 56d3a91ea2..0ff05a38b8 100644
--- a/scripts/simplebench/simplebench.py
+++ b/scripts/simplebench/simplebench.py
@@ -153,14 +153,22 @@ def bench(test_func, test_envs, test_cases, *args, 
**vargs):
 
 def ascii(results):
 """Return ASCII representation of bench() returned dict."""
-from tabulate import tabulate
+import tabulate
+
+# We want leading whitespace for difference row cells (see below)
+tabulate.PRESERVE_WHITESPACE = True
 
 dim = None
-tab = [[""] + [c['id'] for c in results['envs']]]
+tab = [
+# Environment columns are named A, B, ...
+[""] + [chr(ord('A') + i) for i in range(len(results['envs']))],
+[""] + [c['id'] for c in results['envs']]
+]
 for case in results['cases']:
 row = [case['id']]
+case_results = results['tab'][case['id']]
 for env in results['envs']:
-res = results['tab'][case['id']][env['id']]
+res = case_results[env['id']]
 if dim is None:
 dim = res['dimension']
 else:
@@ -168,4 +176,34 @@ def ascii(results):
 row.append(ascii_one(res))
 tab.append(row)
 
-return f'All results are in {dim}\n\n' + tabulate(tab)
+# Add row of difference between column. For each column starting from
+# B we calculate difference with all previous columns.
+row = ['', '']  # case name and first column
+for i in range(1, len(results['envs'])):
+cell = ''
+env = results['envs'][i]
+res = case_results[env['id']]
+
+if 'average' not in res:
+# Failed result
+row.append(cell)
+continue
+
+for j in range(0, i):
+env_j = results['envs'][j]
+res_j = case_results[env_j['id']]
+
+if 'average' not in res_j:
+# Failed result
+cell += ' --'
+continue
+
+col_j = chr(ord('A') + j)
+avg_j = res_j['average']
+delta = (res['average'] - avg_j) / avg_j * 100
+delta_delta = (res['delta'] + res_j['delta']) / avg_j * 100
+cell += f' {col_j}{round(delta):+}±{round(delta_delta)}%'
+row.append(cell)
+tab.append(row)
+
+return f'All results are in {dim}\n\n' + tabulate.tabulate(tab)
-- 
2.21.3




[PATCH v6 09/15] qemu-io: add preallocate mode parameter for truncate command

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
This will be used in a later test in this series.
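
For example, with this change one can request full preallocation while
growing an image from qemu-io (the image name is a placeholder):

  qemu-io -c 'truncate -m full 1G' test.img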

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qemu-io-cmds.c | 46 --
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index baeae86d8c..64f0246a71 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1698,13 +1698,42 @@ static const cmdinfo_t flush_cmd = {
 .oneline= "flush all in-core file state to disk",
 };
 
+static int truncate_f(BlockBackend *blk, int argc, char **argv);
+static const cmdinfo_t truncate_cmd = {
+.name   = "truncate",
+.altname= "t",
+.cfunc  = truncate_f,
+.perm   = BLK_PERM_WRITE | BLK_PERM_RESIZE,
+.argmin = 1,
+.argmax = 3,
+.args   = "[-m prealloc_mode] off",
+.oneline= "truncates the current file at the given offset",
+};
+
 static int truncate_f(BlockBackend *blk, int argc, char **argv)
 {
 Error *local_err = NULL;
 int64_t offset;
-int ret;
+int c, ret;
+PreallocMode prealloc = PREALLOC_MODE_OFF;
 
-offset = cvtnum(argv[1]);
+while ((c = getopt(argc, argv, "m:")) != -1) {
+switch (c) {
+case 'm':
+prealloc = qapi_enum_parse(&PreallocMode_lookup, optarg,
+   PREALLOC_MODE__MAX, NULL);
+if (prealloc == PREALLOC_MODE__MAX) {
+error_report("Invalid preallocation mode '%s'", optarg);
+return -EINVAL;
+}
+break;
+default:
+qemuio_command_usage(&truncate_cmd);
+return -EINVAL;
+}
+}
+
+offset = cvtnum(argv[optind]);
 if (offset < 0) {
 print_cvtnum_err(offset, argv[1]);
 return offset;
@@ -1715,7 +1744,7 @@ static int truncate_f(BlockBackend *blk, int argc, char 
**argv)
  * exact=true.  It is better to err on the "emit more errors" side
  * than to be overly permissive.
  */
-ret = blk_truncate(blk, offset, false, PREALLOC_MODE_OFF, 0, &local_err);
+ret = blk_truncate(blk, offset, false, prealloc, 0, &local_err);
 if (ret < 0) {
 error_report_err(local_err);
 return ret;
@@ -1724,17 +1753,6 @@ static int truncate_f(BlockBackend *blk, int argc, char 
**argv)
 return 0;
 }
 
-static const cmdinfo_t truncate_cmd = {
-.name   = "truncate",
-.altname= "t",
-.cfunc  = truncate_f,
-.perm   = BLK_PERM_WRITE | BLK_PERM_RESIZE,
-.argmin = 1,
-.argmax = 1,
-.args   = "off",
-.oneline= "truncates the current file at the given offset",
-};
-
 static int length_f(BlockBackend *blk, int argc, char **argv)
 {
 int64_t size;
-- 
2.21.3




[PATCH v6 12/15] scripts/simplebench: support iops

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Support benchmarks that return iops rather than seconds. We'll use this
for a new test later in this series.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 scripts/simplebench/simplebench.py | 35 +++---
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/scripts/simplebench/simplebench.py 
b/scripts/simplebench/simplebench.py
index 59e7314ff6..716d7fe9b2 100644
--- a/scripts/simplebench/simplebench.py
+++ b/scripts/simplebench/simplebench.py
@@ -24,9 +24,12 @@ def bench_one(test_func, test_env, test_case, count=5, 
initial_run=True):
 
 test_func   -- benchmarking function with prototype
test_func(env, case), which takes test_env and test_case
-   arguments and returns {'seconds': int} (which is benchmark
-   result) on success and {'error': str} on error. Returned
-   dict may contain any other additional fields.
+   arguments and on success returns dict with 'seconds' or
+   'iops' (or both) fields, specifying the benchmark result.
+   If both 'iops' and 'seconds' provided, the 'iops' is
+   considered the main, and 'seconds' is just an additional
+   info. On failure test_func should return {'error': str}.
+   Returned dict may contain any other additional fields.
 test_env-- test environment - opaque first argument for test_func
 test_case   -- test case - opaque second argument for test_func
 count   -- how many times to call test_func, to calculate average
@@ -34,6 +37,7 @@ def bench_one(test_func, test_env, test_case, count=5, 
initial_run=True):
 
 Returns dict with the following fields:
 'runs': list of test_func results
+'dimension': dimension of results, may be 'seconds' or 'iops'
 'average':  average seconds per run (exists only if at least one run
 succeeded)
 'delta':maximum delta between test_func result and the average
@@ -54,11 +58,20 @@ def bench_one(test_func, test_env, test_case, count=5, 
initial_run=True):
 
 result = {'runs': runs}
 
-successed = [r for r in runs if ('seconds' in r)]
+successed = [r for r in runs if ('seconds' in r or 'iops' in r)]
 if successed:
-avg = sum(r['seconds'] for r in successed) / len(successed)
+dim = 'iops' if ('iops' in successed[0]) else 'seconds'
+if 'iops' in successed[0]:
+assert all('iops' in r for r in successed)
+dim = 'iops'
+else:
+assert all('seconds' in r for r in successed)
+assert all('iops' not in r for r in successed)
+dim = 'seconds'
+avg = sum(r[dim] for r in successed) / len(successed)
+result['dimension'] = dim
 result['average'] = avg
-result['delta'] = max(abs(r['seconds'] - avg) for r in successed)
+result['delta'] = max(abs(r[dim] - avg) for r in successed)
 
 if len(successed) < count:
 result['n-failed'] = count - len(successed)
@@ -118,11 +131,17 @@ def ascii(results):
 """Return ASCII representation of bench() returned dict."""
 from tabulate import tabulate
 
+dim = None
 tab = [[""] + [c['id'] for c in results['envs']]]
 for case in results['cases']:
 row = [case['id']]
 for env in results['envs']:
-row.append(ascii_one(results['tab'][case['id']][env['id']]))
+res = results['tab'][case['id']][env['id']]
+if dim is None:
+dim = res['dimension']
+else:
+assert dim == res['dimension']
+row.append(ascii_one(res))
 tab.append(row)
 
-return tabulate(tab)
+return f'All results are in {dim}\n\n' + tabulate(tab)
-- 
2.21.3




[PATCH v6 07/15] block: bdrv_check_perm(): process children anyway

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Do the generic processing even for drivers which define a
.bdrv_check_perm handler. It's needed for the upcoming preallocate
filter: it will need to do an additional action on bdrv_check_perm, but
we don't want to reimplement the generic logic.

The patch doesn't change existing behaviour: the only driver that
implements bdrv_check_perm is file-posix, and it never has any
children.

Also, bdrv_set_perm() doesn't stop processing if the driver has a
.bdrv_set_perm handler as well.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index 9538af4884..165c2d3cb2 100644
--- a/block.c
+++ b/block.c
@@ -1964,8 +1964,7 @@ static void bdrv_child_perm(BlockDriverState *bs, 
BlockDriverState *child_bs,
 /*
  * Check whether permissions on this node can be changed in a way that
  * @cumulative_perms and @cumulative_shared_perms are the new cumulative
- * permissions of all its parents. This involves checking whether all necessary
- * permission changes to child nodes can be performed.
+ * permissions of all its parents.
  *
  * Will set *tighten_restrictions to true if and only if new permissions have 
to
  * be taken or currently shared permissions are to be unshared.  Otherwise,
@@ -2047,8 +2046,11 @@ static int bdrv_check_perm(BlockDriverState *bs, 
BlockReopenQueue *q,
 }
 
 if (drv->bdrv_check_perm) {
-return drv->bdrv_check_perm(bs, cumulative_perms,
-cumulative_shared_perms, errp);
+ret = drv->bdrv_check_perm(bs, cumulative_perms,
+   cumulative_shared_perms, errp);
+if (ret < 0) {
+return ret;
+}
 }
 
 /* Drivers that never have children can omit .bdrv_child_perm() */
-- 
2.21.3




[PATCH v6 08/15] block: introduce preallocate filter

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
It's intended to be inserted between format and protocol nodes to
preallocate additional space (expanding the protocol file) on writes
crossing EOF. This improves performance on file systems with slow
allocation.
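
For illustration, the filter can be inserted with --image-opts (the
qcow2 file name here is a placeholder), e.g.:

  --image-opts driver=qcow2,file.driver=preallocate,file.file.driver=file,file.file.filename=test.qcow2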

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 docs/system/qemu-block-drivers.rst.inc |  26 ++
 qapi/block-core.json   |  20 +-
 block/preallocate.c| 556 +
 block/meson.build  |   1 +
 4 files changed, 602 insertions(+), 1 deletion(-)
 create mode 100644 block/preallocate.c

diff --git a/docs/system/qemu-block-drivers.rst.inc 
b/docs/system/qemu-block-drivers.rst.inc
index b052a6d14e..60a064b232 100644
--- a/docs/system/qemu-block-drivers.rst.inc
+++ b/docs/system/qemu-block-drivers.rst.inc
@@ -952,3 +952,29 @@ on host and see if there are locks held by the QEMU 
process on the image file.
 More than one byte could be locked by the QEMU instance, each byte of which
 reflects a particular permission that is acquired or protected by the running
 block driver.
+
+Filter drivers
+~~
+
+QEMU supports several filter drivers, which don't store any data, but perform
+some additional tasks, hooking io requests.
+
+.. program:: filter-drivers
+.. option:: preallocate
+
+  The preallocate filter driver is intended to be inserted between format
+  and protocol nodes and preallocates some additional space
+  (expanding the protocol file) when writing past the file’s end. This can be
+  useful for file-systems with slow allocation.
+
+  Supported options:
+
+  .. program:: preallocate
+  .. option:: prealloc-align
+
+On preallocation, align the file length to this value (in bytes), default 
1M.
+
+  .. program:: preallocate
+  .. option:: prealloc-size
+
+How much to preallocate (in bytes), default 128M.
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 2d94873ca0..c8030e19b4 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2819,7 +2819,7 @@
 'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', 'ftps',
 'gluster', 'host_cdrom', 'host_device', 'http', 'https', 'iscsi',
 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
-'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
+'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
 { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
 'sheepdog',
 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
@@ -3088,6 +3088,23 @@
   'data': { 'aes': 'QCryptoBlockOptionsQCow',
 'luks': 'QCryptoBlockOptionsLUKS'} }
 
+##
+# @BlockdevOptionsPreallocate:
+#
+# Filter driver intended to be inserted between format and protocol node
+# and do preallocation in protocol node on write.
+#
+# @prealloc-align: on preallocation, align file length to this number,
+#  default 1048576 (1M)
+#
+# @prealloc-size: how much to preallocate, default 134217728 (128M)
+#
+# Since: 5.2
+##
+{ 'struct': 'BlockdevOptionsPreallocate',
+  'base': 'BlockdevOptionsGenericFormat',
+  'data': { '*prealloc-align': 'int', '*prealloc-size': 'int' } }
+
 ##
 # @BlockdevOptionsQcow2:
 #
@@ -3993,6 +4010,7 @@
   'null-co':'BlockdevOptionsNull',
   'nvme':   'BlockdevOptionsNVMe',
   'parallels':  'BlockdevOptionsGenericFormat',
+  'preallocate':'BlockdevOptionsPreallocate',
   'qcow2':  'BlockdevOptionsQcow2',
   'qcow':   'BlockdevOptionsQcow',
   'qed':'BlockdevOptionsGenericCOWFormat',
diff --git a/block/preallocate.c b/block/preallocate.c
new file mode 100644
index 00..6510ad0149
--- /dev/null
+++ b/block/preallocate.c
@@ -0,0 +1,556 @@
+/*
+ * preallocate filter driver
+ *
+ * The driver performs preallocate operation: it is injected above
+ * some node, and before each write over EOF it does additional preallocating
+ * write-zeroes request.
+ *
+ * Copyright (c) 2020 Virtuozzo International GmbH.
+ *
+ * Author:
+ *  Sementsov-Ogievskiy Vladimir 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see .
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/module.h"
+#include "qemu/option.h"
+#include "qemu/units.h"
+#include "block/block_int.h"
+
+
+typedef struct PreallocateOpts {
+int64_t prealloc_size;
+int64_t prealloc_align;
+} Pr

[PATCH v6 10/15] iotests: qemu_io_silent: support --image-opts

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/iotests.py | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 91e4a57126..3d48108f3a 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -197,7 +197,12 @@ def qemu_io_log(*args):
 
 def qemu_io_silent(*args):
 '''Run qemu-io and return the exit code, suppressing stdout'''
-args = qemu_io_args + list(args)
+if '-f' in args or '--image-opts' in args:
+default_args = qemu_io_args_no_fmt
+else:
+default_args = qemu_io_args
+
+args = default_args + list(args)
 exitcode = subprocess.call(args, stdout=open('/dev/null', 'w'))
 if exitcode < 0:
 sys.stderr.write('qemu-io received signal %i: %s\n' %
-- 
2.21.3




[PATCH v6 11/15] iotests: add 298 to test new preallocate filter driver

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/298 | 186 +
 tests/qemu-iotests/298.out |   5 +
 tests/qemu-iotests/group   |   1 +
 3 files changed, 192 insertions(+)
 create mode 100644 tests/qemu-iotests/298
 create mode 100644 tests/qemu-iotests/298.out

diff --git a/tests/qemu-iotests/298 b/tests/qemu-iotests/298
new file mode 100644
index 00..fef10f6a7a
--- /dev/null
+++ b/tests/qemu-iotests/298
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+#
+# Test for preallocate filter
+#
+# Copyright (c) 2020 Virtuozzo International GmbH.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import os
+import iotests
+
+MiB = 1024 * 1024
+disk = os.path.join(iotests.test_dir, 'disk')
+overlay = os.path.join(iotests.test_dir, 'overlay')
+refdisk = os.path.join(iotests.test_dir, 'refdisk')
+drive_opts = f'node-name=disk,driver={iotests.imgfmt},' \
+f'file.node-name=filter,file.driver=preallocate,' \
+f'file.file.node-name=file,file.file.filename={disk}'
+
+
+class TestPreallocateBase(iotests.QMPTestCase):
+def setUp(self):
+iotests.qemu_img_create('-f', iotests.imgfmt, disk, str(10 * MiB))
+
+def tearDown(self):
+try:
+self.check_small()
+check = iotests.qemu_img_check(disk)
+self.assertFalse('leaks' in check)
+self.assertFalse('corruptions' in check)
+self.assertEqual(check['check-errors'], 0)
+finally:
+os.remove(disk)
+
+def check_big(self):
+self.assertTrue(os.path.getsize(disk) > 100 * MiB)
+
+def check_small(self):
+self.assertTrue(os.path.getsize(disk) < 10 * MiB)
+
+
+class TestQemuImg(TestPreallocateBase):
+def test_qemu_img(self):
+p = iotests.QemuIoInteractive('--image-opts', drive_opts)
+
+p.cmd('write 0 1M')
+p.cmd('flush')
+
+self.check_big()
+
+p.close()
+
+
+class TestPreallocateFilter(TestPreallocateBase):
+def setUp(self):
+super().setUp()
+self.vm = iotests.VM().add_drive(path=None, opts=drive_opts)
+self.vm.launch()
+
+def tearDown(self):
+self.vm.shutdown()
+super().tearDown()
+
+def test_prealloc(self):
+self.vm.hmp_qemu_io('drive0', 'write 0 1M')
+self.check_big()
+
+def test_external_snapshot(self):
+self.test_prealloc()
+
+result = self.vm.qmp('blockdev-snapshot-sync', node_name='disk',
+ snapshot_file=overlay,
+ snapshot_node_name='overlay')
+self.assert_qmp(result, 'return', {})
+
+# on reopen to  r-o base preallocation should be dropped
+self.check_small()
+
+self.vm.hmp_qemu_io('drive0', 'write 1M 1M')
+
+result = self.vm.qmp('block-commit', device='overlay')
+self.assert_qmp(result, 'return', {})
+self.complete_and_wait()
+
+# commit of new megabyte should trigger preallocation
+self.check_big()
+
+def test_reopen_opts(self):
+result = self.vm.qmp('x-blockdev-reopen', **{
+'node-name': 'disk',
+'driver': iotests.imgfmt,
+'file': {
+'node-name': 'filter',
+'driver': 'preallocate',
+'prealloc-size': 20 * MiB,
+'prealloc-align': 5 * MiB,
+'file': {
+'node-name': 'file',
+'driver': 'file',
+'filename': disk
+}
+}
+})
+self.assert_qmp(result, 'return', {})
+
+self.vm.hmp_qemu_io('drive0', 'write 0 1M')
+self.assertTrue(os.path.getsize(disk) == 25 * MiB)
+
+
+class TestTruncate(iotests.QMPTestCase):
+def setUp(self):
+iotests.qemu_img_create('-f', iotests.imgfmt, disk, str(10 * MiB))
+iotests.qemu_img_create('-f', iotests.imgfmt, refdisk, str(10 * MiB))
+
+def tearDown(self):
+os.remove(disk)
+os.remove(refdisk)
+
+def do_test(self, prealloc_mode, new_size):
+ret = iotests.qemu_io_silent('--image-opts', '-c', 'write 0 10M', '-c',
+ f'truncate -m {prealloc_mode} {new_size}',
+ drive_opts)
+self.assertEqual(ret, 0)
+
+ret = iotests.qemu_io_silent('-f', iotests.imgfmt, '

[PATCH v6 06/15] block: introduce BDRV_REQ_NO_WAIT flag

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Add a flag to make a serialising request non-waiting: if there are
conflicting requests, just return an error immediately. It will be used
in the upcoming preallocate filter.
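
A minimal caller sketch (hypothetical; names and offsets are
placeholders), e.g. a filter trying to extend the file tail without
blocking on other requests:

    ret = bdrv_co_pwrite_zeroes(bs->file, old_end, new_end - old_end,
                                BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
    if (ret == -EBUSY) {
        /* a conflicting request is in flight: skip preallocation */
    }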

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 include/block/block.h |  9 -
 block/io.c| 11 ++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index ef948e3f34..e7188fea05 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -67,8 +67,15 @@ typedef enum {
  * written to qiov parameter which may be NULL.
  */
 BDRV_REQ_PREFETCH  = 0x200,
+
+/*
+ * If we need to wait for other requests, just fail immediately. Used
+ * only together with BDRV_REQ_SERIALISING.
+ */
+BDRV_REQ_NO_WAIT = 0x400,
+
 /* Mask of valid flags */
-BDRV_REQ_MASK   = 0x3ff,
+BDRV_REQ_MASK   = 0x7ff,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
diff --git a/block/io.c b/block/io.c
index 9b148bb8ea..fdcac4888e 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1912,9 +1912,18 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t 
offset, uint64_t bytes,
 assert(!(bs->open_flags & BDRV_O_INACTIVE));
 assert((bs->open_flags & BDRV_O_NO_IO) == 0);
 assert(!(flags & ~BDRV_REQ_MASK));
+assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
 
 if (flags & BDRV_REQ_SERIALISING) {
-bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
+QEMU_LOCK_GUARD(&bs->reqs_lock);
+
+tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
+
+if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
+return -EBUSY;
+}
+
+bdrv_wait_serialising_requests_locked(req);
 } else {
 bdrv_wait_serialising_requests(req);
 }
-- 
2.21.3




[PATCH v6 01/15] block: simplify comment to BDRV_REQ_SERIALISING

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
1. BDRV_REQ_NO_SERIALISING no longer exists, so don't mention it.

2. We are going to add one more user of BDRV_REQ_SERIALISING, so the
   comment about backup becomes a bit confusing here. The use case in
   backup is documented in block/backup.c, so let's just drop the
   duplication here.

3. The fact that BDRV_REQ_SERIALISING is only for write requests is
   omitted. Add a note.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Stefan Hajnoczi 
---
 include/block/block.h | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 981ab5b314..ef948e3f34 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -53,16 +53,7 @@ typedef enum {
  * content. */
 BDRV_REQ_WRITE_UNCHANGED= 0x40,
 
-/*
- * BDRV_REQ_SERIALISING forces request serialisation for writes.
- * It is used to ensure that writes to the backing file of a backup process
- * target cannot race with a read of the backup target that defers to the
- * backing file.
- *
- * Note, that BDRV_REQ_SERIALISING is _not_ opposite in meaning to
- * BDRV_REQ_NO_SERIALISING. A more descriptive name for the latter might be
- * _DO_NOT_WAIT_FOR_SERIALISING, except that is too long.
- */
+/* Forces request serialisation. Use only with write requests. */
 BDRV_REQ_SERIALISING= 0x80,
 
 /* Execute the request only if the operation can be offloaded or otherwise
-- 
2.21.3




[PATCH v6 04/15] block/io: bdrv_wait_serialising_requests_locked: drop extra bs arg

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
bs is linked in req, so there is no need to pass it separately. Most of
the tracked-requests API doesn't take a bs argument. Actually, after
this patch only tracked_request_begin has it, and that is on purpose.

While here, also add a comment about what "_locked" means.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Stefan Hajnoczi 
---
 block/io.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/io.c b/block/io.c
index c58fd36091..ab9ef7fd1a 100644
--- a/block/io.c
+++ b/block/io.c
@@ -761,16 +761,16 @@ bdrv_find_conflicting_request(BdrvTrackedRequest *self)
 return NULL;
 }
 
+/* Called with self->bs->reqs_lock held */
 static bool coroutine_fn
-bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
-  BdrvTrackedRequest *self)
+bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
 {
 BdrvTrackedRequest *req;
 bool waited = false;
 
 while ((req = bdrv_find_conflicting_request(self))) {
 self->waiting_for = req;
-qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
+qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
 self->waiting_for = NULL;
 waited = true;
 }
@@ -794,7 +794,7 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, 
uint64_t align)
 
 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
-waited = bdrv_wait_serialising_requests_locked(bs, req);
+waited = bdrv_wait_serialising_requests_locked(req);
 qemu_co_mutex_unlock(&bs->reqs_lock);
 return waited;
 }
@@ -876,7 +876,7 @@ static bool coroutine_fn 
bdrv_wait_serialising_requests(BdrvTrackedRequest *self
 }
 
 qemu_co_mutex_lock(&bs->reqs_lock);
-waited = bdrv_wait_serialising_requests_locked(bs, self);
+waited = bdrv_wait_serialising_requests_locked(self);
 qemu_co_mutex_unlock(&bs->reqs_lock);
 
 return waited;
-- 
2.21.3




[PATCH v6 05/15] block: bdrv_mark_request_serialising: split non-waiting function

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
We'll need a separate function which will only "mark" a request as
serialising with the specified alignment but not wait for conflicting
requests. So, it will be like the old bdrv_mark_request_serialising(),
before bdrv_wait_serialising_requests_locked() was merged into it.

To reduce the possible mess, let's do the following:

The public function that does both marking and waiting will be called
bdrv_make_request_serialising(), and the private function which will
only "mark" will be called tracked_request_set_serialising().

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 include/block/block_int.h |  3 ++-
 block/file-posix.c|  2 +-
 block/io.c| 35 +++
 3 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 38cad9d15c..887b0668d8 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1052,7 +1052,8 @@ extern unsigned int bdrv_drain_all_count;
 void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
 void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState 
*old_parent);
 
-bool coroutine_fn bdrv_mark_request_serialising(BdrvTrackedRequest *req, 
uint64_t align);
+bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
+uint64_t align);
 BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState 
*bs);
 
 int get_tmp_filename(char *filename, int size);
diff --git a/block/file-posix.c b/block/file-posix.c
index c63926d592..37d9266f6a 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2953,7 +2953,7 @@ raw_do_pwrite_zeroes(BlockDriverState *bs, int64_t 
offset, int bytes,
 req->bytes = end - req->offset;
 req->overlap_bytes = req->bytes;
 
-bdrv_mark_request_serialising(req, bs->bl.request_alignment);
+bdrv_make_request_serialising(req, bs->bl.request_alignment);
 }
 #endif
 
diff --git a/block/io.c b/block/io.c
index ab9ef7fd1a..9b148bb8ea 100644
--- a/block/io.c
+++ b/block/io.c
@@ -778,15 +778,14 @@ bdrv_wait_serialising_requests_locked(BdrvTrackedRequest 
*self)
 return waited;
 }
 
-bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
+/* Called with req->bs->reqs_lock held */
+static void tracked_request_set_serialising(BdrvTrackedRequest *req,
+uint64_t align)
 {
-BlockDriverState *bs = req->bs;
 int64_t overlap_offset = req->offset & ~(align - 1);
 uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
- overlap_offset;
-bool waited;
 
-qemu_co_mutex_lock(&bs->reqs_lock);
 if (!req->serialising) {
 atomic_inc(&req->bs->serialising_in_flight);
 req->serialising = true;
@@ -794,9 +793,6 @@ bool bdrv_mark_request_serialising(BdrvTrackedRequest *req, 
uint64_t align)
 
 req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
-waited = bdrv_wait_serialising_requests_locked(req);
-qemu_co_mutex_unlock(&bs->reqs_lock);
-return waited;
 }
 
 /**
@@ -882,6 +878,21 @@ static bool coroutine_fn 
bdrv_wait_serialising_requests(BdrvTrackedRequest *self
 return waited;
 }
 
+bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
+uint64_t align)
+{
+bool waited;
+
+qemu_co_mutex_lock(&req->bs->reqs_lock);
+
+tracked_request_set_serialising(req, align);
+waited = bdrv_wait_serialising_requests_locked(req);
+
+qemu_co_mutex_unlock(&req->bs->reqs_lock);
+
+return waited;
+}
+
 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
size_t size)
 {
@@ -1492,7 +1503,7 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild 
*child,
  * with each other for the same cluster.  For example, in copy-on-read
  * it ensures that the CoR read and write operations are atomic and
  * guest writes cannot interleave between them. */
-bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
+bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
 } else {
 bdrv_wait_serialising_requests(req);
 }
@@ -1903,7 +1914,7 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t 
offset, uint64_t bytes,
 assert(!(flags & ~BDRV_REQ_MASK));
 
 if (flags & BDRV_REQ_SERIALISING) {
-bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
+bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
 } else {
 bdrv_wait_serialising_requests(req);
 }
@@ -2069,7 +2080,7 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild 
*child,
 
 padding = bdrv_init_padding(bs, offset, bytes, &pad);
 if (padding) {
-bdrv_mark_request_seriali

[PATCH v6 02/15] block/io.c: drop assertion on double waiting for request serialisation

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
The comment states that on a misaligned request we should have already
been waiting. But for bdrv_padding_rmw_read, we called
bdrv_mark_request_serialising with align = request_alignment, and now
we serialise with align = cluster_size. So we may have to wait again
with the larger alignment.

Note that the only user of BDRV_REQ_SERIALISING is backup, which issues
cluster-aligned requests, so it seems the assertion should not fire for
now. But it's wrong anyway.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Paolo Bonzini 
---
 block/io.c | 11 +--
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/block/io.c b/block/io.c
index a2389bb38c..67617bb9b2 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1881,7 +1881,6 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t 
offset, uint64_t bytes,
   BdrvTrackedRequest *req, int flags)
 {
 BlockDriverState *bs = child->bs;
-bool waited;
 int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
 
 if (bs->read_only) {
@@ -1893,15 +1892,7 @@ bdrv_co_write_req_prepare(BdrvChild *child, int64_t 
offset, uint64_t bytes,
 assert(!(flags & ~BDRV_REQ_MASK));
 
 if (flags & BDRV_REQ_SERIALISING) {
-waited = bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
-/*
- * For a misaligned request we should have already waited earlier,
- * because we come after bdrv_padding_rmw_read which must be called
- * with the request already marked as serialising.
- */
-assert(!waited ||
-   (req->offset == req->overlap_offset &&
-req->bytes == req->overlap_bytes));
+bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
 } else {
 bdrv_wait_serialising_requests(req);
 }
-- 
2.21.3




[PATCH v6 03/15] block/io: split out bdrv_find_conflicting_request

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
To be reused separately in a following patch.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Stefan Hajnoczi 
---
 block/io.c | 71 +++---
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/block/io.c b/block/io.c
index 67617bb9b2..c58fd36091 100644
--- a/block/io.c
+++ b/block/io.c
@@ -727,43 +727,54 @@ static bool tracked_request_overlaps(BdrvTrackedRequest 
*req,
 return true;
 }
 
+/* Called with self->bs->reqs_lock held */
+static BdrvTrackedRequest *
+bdrv_find_conflicting_request(BdrvTrackedRequest *self)
+{
+BdrvTrackedRequest *req;
+
+QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
+if (req == self || (!req->serialising && !self->serialising)) {
+continue;
+}
+if (tracked_request_overlaps(req, self->overlap_offset,
+ self->overlap_bytes))
+{
+/*
+ * Hitting this means there was a reentrant request, for
+ * example, a block driver issuing nested requests.  This must
+ * never happen since it means deadlock.
+ */
+assert(qemu_coroutine_self() != req->co);
+
+/*
+ * If the request is already (indirectly) waiting for us, or
+ * will wait for us as soon as it wakes up, then just go on
+ * (instead of producing a deadlock in the former case).
+ */
+if (!req->waiting_for) {
+return req;
+}
+}
+}
+
+return NULL;
+}
+
 static bool coroutine_fn
 bdrv_wait_serialising_requests_locked(BlockDriverState *bs,
   BdrvTrackedRequest *self)
 {
 BdrvTrackedRequest *req;
-bool retry;
 bool waited = false;
 
-do {
-retry = false;
-QLIST_FOREACH(req, &bs->tracked_requests, list) {
-if (req == self || (!req->serialising && !self->serialising)) {
-continue;
-}
-if (tracked_request_overlaps(req, self->overlap_offset,
- self->overlap_bytes))
-{
-/* Hitting this means there was a reentrant request, for
- * example, a block driver issuing nested requests.  This must
- * never happen since it means deadlock.
- */
-assert(qemu_coroutine_self() != req->co);
-
-/* If the request is already (indirectly) waiting for us, or
- * will wait for us as soon as it wakes up, then just go on
- * (instead of producing a deadlock in the former case). */
-if (!req->waiting_for) {
-self->waiting_for = req;
-qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
-self->waiting_for = NULL;
-retry = true;
-waited = true;
-break;
-}
-}
-}
-} while (retry);
+while ((req = bdrv_find_conflicting_request(self))) {
+self->waiting_for = req;
+qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
+self->waiting_for = NULL;
+waited = true;
+}
+
 return waited;
 }
 
-- 
2.21.3




[PATCH v6 00/15] preallocate filter

2020-09-18 Thread Vladimir Sementsov-Ogievskiy
Hi all!

Here is a filter, which does preallocation on write.

In Virtuozzo we have to deal with a custom distributed storage
solution where allocation is a very expensive operation. We have to
work around it in QEMU, so here is a new filter.

Still, the filter shows good results for me even for xfs and ext4.

Here are results, produced by new benchmark (last 4 patches):

All results are in iops (larger means better)

--  ---  ---
AB
no-prealloc  prealloc
ssd-ext4, aligned sequential 16k19934±1.2%   27108±0.27%
  A+36±2%
ssd-xfs, aligned sequential 16k 15528±5.5%   25953±3.3%
  A+67±11%
hdd-ext4, aligned sequential 16k5079±29% 3165±11%
  A-38±36%
hdd-xfs, aligned sequential 16k 4096±95% 3321±7.6%
  A-19±101%
ssd-ext4, unaligned sequential 64k  19969±1.9%   27043±0.49%
  A+35±3%
ssd-xfs, unaligned sequential 64k   15403±2.8%   25725±6.4%
  A+67±13%
hdd-ext4, unaligned sequential 64k  5250±17% 3239±8.7%
  A-38±23%
hdd-xfs, unaligned sequential 64k   5291±8.2%3336±4.2%
  A-37±11%
--  ---  ---

Note: it's on Fedora 30, kernel 5.6.13-100.fc30.x86_64

The tests are actually qemu-img bench, run like:

  ./qemu-img create -f qcow2 $img 16G

aligned:
  ./qemu-img bench -c 1 -d 64 -f qcow2  -s 16k -t none -n -w $img

unaligned
  ./qemu-img bench -c 1 -d 64 -f qcow2 -o 1k -s 64k -t none -n -w $img

and for preallocation, you'll drop -f qcow2, add --image-opts, and
instead of just $img use
  
driver=qcow2,file.driver=preallocate,file.file.driver=file,file.file.filename=$img
 

v6:
05: add Max's r-b
06: add Max's r-b
07: new
08: Changed a lot. really. no .active now, support more use-cases.
Somehow, now I see performance benefit on xfs too :)
probably due to .zero_start feature.
09: new
10: new
11: mostly rewritten, a lot more cases, drop r-b
12-15: new, to produce final benchmark table

Vladimir Sementsov-Ogievskiy (15):
  block: simplify comment to BDRV_REQ_SERIALISING
  block/io.c: drop assertion on double waiting for request serialisation
  block/io: split out bdrv_find_conflicting_request
  block/io: bdrv_wait_serialising_requests_locked: drop extra bs arg
  block: bdrv_mark_request_serialising: split non-waiting function
  block: introduce BDRV_REQ_NO_WAIT flag
  block: bdrv_check_perm(): process children anyway
  block: introduce preallocate filter
  qemu-io: add preallocate mode parameter for truncate command
  iotests: qemu_io_silent: support --image-opts
  iotests: add 298 to test new preallocate filter driver
  scripts/simplebench: support iops
  scripts/simplebench: improve view of ascii table
  scripts/simplebench: improve ascii table: add difference line
  scripts/simplebench: add bench_prealloc.py

 docs/system/qemu-block-drivers.rst.inc |  26 ++
 qapi/block-core.json   |  20 +-
 include/block/block.h  |  20 +-
 include/block/block_int.h  |   3 +-
 block.c|  10 +-
 block/file-posix.c |   2 +-
 block/io.c | 130 +++---
 block/preallocate.c| 556 +
 qemu-io-cmds.c |  46 +-
 block/meson.build  |   1 +
 scripts/simplebench/bench_prealloc.py  | 128 ++
 scripts/simplebench/simplebench.py | 103 -
 tests/qemu-iotests/298 | 186 +
 tests/qemu-iotests/298.out |   5 +
 tests/qemu-iotests/group   |   1 +
 tests/qemu-iotests/iotests.py  |   7 +-
 16 files changed, 1146 insertions(+), 98 deletions(-)
 create mode 100644 block/preallocate.c
 create mode 100755 scripts/simplebench/bench_prealloc.py
 create mode 100644 tests/qemu-iotests/298
 create mode 100644 tests/qemu-iotests/298.out

-- 
2.21.3




Re: [PATCH 0/5] qom: Convert more declarations to OBJECT_DECLARE*

2020-09-18 Thread Eduardo Habkost
On Wed, Sep 16, 2020 at 02:25:14PM -0400, Eduardo Habkost wrote:
> This converts many QOM types to use OBJECT_DECLARE* instead of
> manually using DECLARE*_CHECKER*.
> 
> Before doing that, I'm simplifying the OBJECT_DECLARE* API to
> make it easier to use and more difficult to misuse.  The
> module_obj_name and ParentClassType parameters were removed
> because they are not needed.

I'm queueing this on machine-next.

> 
> Eduardo Habkost (5):
>   scripts/codeconverter: Update to latest version
>   qom: Remove ParentClassType argument from OBJECT_DECLARE_SIMPLE_TYPE
>   qom: Remove module_obj_name parameter from OBJECT_DECLARE* macros
>   [automated] Use OBJECT_DECLARE_TYPE when possible
>   [automated] Use OBJECT_DECLARE_SIMPLE_TYPE when possible
> 
>  hw/9pfs/virtio-9p.h   |   4 +-
>  hw/audio/intel-hda.h  |   6 +-
>  hw/display/ati_int.h  |   4 +-
>  hw/display/qxl.h  |   4 +-
>  hw/display/virtio-vga.h   |   2 +-
>  hw/i386/amd_iommu.h   |   4 +-
>  hw/misc/tmp105.h  |   4 +-
>  hw/net/fsl_etsec/etsec.h  |   3 +-
>  hw/net/tulip.h|   4 +-
>  hw/ppc/e500-ccsr.h|   4 +-
>  hw/ppc/e500.h |   5 +-
>  hw/ppc/mac.h  |   4 +-
>  hw/s390x/ccw-device.h |   4 +-
>  hw/s390x/ipl.h|   4 +-
>  hw/s390x/s390-pci-bus.h   |  16 +-
>  hw/s390x/virtio-ccw.h |  57 +-
>  hw/usb/ccid.h |   5 +-
>  hw/usb/hcd-dwc2.h |   3 +-
>  hw/usb/hcd-ehci.h |  13 +-
>  hw/usb/hcd-ohci.h |   4 +-
>  hw/usb/hcd-xhci.h |   4 +-
>  hw/vfio/pci.h |   4 +-
>  hw/virtio/virtio-pci.h|   5 +-
>  hw/xen/xen_pt.h   |   4 +-
>  include/authz/base.h  |   2 +-
>  include/authz/list.h  |   4 +-
>  include/authz/listfile.h  |   4 +-
>  include/authz/pamacct.h   |   4 +-
>  include/authz/simple.h|   4 +-
>  include/block/throttle-groups.h   |   4 +-
>  include/chardev/char.h|   4 +-
>  include/crypto/secret_common.h|   2 +-
>  include/crypto/secret_keyring.h   |   4 +-
>  include/hw/acpi/generic_event_device.h|   4 +-
>  include/hw/acpi/vmgenid.h |   4 +-
>  include/hw/adc/stm32f2xx_adc.h|   4 +-
>  include/hw/arm/allwinner-a10.h|   4 +-
>  include/hw/arm/allwinner-h3.h |   4 +-
>  include/hw/arm/armsse.h   |   2 +-
>  include/hw/arm/armv7m.h   |   8 +-
>  include/hw/arm/aspeed_soc.h   |   5 +-
>  include/hw/arm/bcm2835_peripherals.h  |   4 +-
>  include/hw/arm/bcm2836.h  |   5 +-
>  include/hw/arm/digic.h|   4 +-
>  include/hw/arm/exynos4210.h   |   4 +-
>  include/hw/arm/fsl-imx25.h|   4 +-
>  include/hw/arm/fsl-imx31.h|   4 +-
>  include/hw/arm/fsl-imx6.h |   4 +-
>  include/hw/arm/fsl-imx6ul.h   |   4 +-
>  include/hw/arm/fsl-imx7.h |   4 +-
>  include/hw/arm/msf2-soc.h |   4 +-
>  include/hw/arm/nrf51_soc.h|   4 +-
>  include/hw/arm/omap.h |   4 +-
>  include/hw/arm/pxa.h  |  15 +-
>  include/hw/arm/smmu-common.h  |   5 +-
>  include/hw/arm/smmuv3.h   |   5 +-
>  include/hw/arm/stm32f205_soc.h|   4 +-
>  include/hw/arm/stm32f405_soc.h|   4 +-
>  include/hw/arm/virt.h |   5 +-
>  include/hw/arm/xlnx-versal.h  |   4 +-
>  include/hw/arm/xlnx-zynqmp.h  |   4 +-
>  include/hw/block/flash.h  |   8 +-
>  include/hw/block/swim.h   |  12 +-
>  include/hw/boards.h   |   3 +-
>  include/hw/char/avr_usart.h   |   4 +-
>  include/hw/char/bcm2835_aux.h |   4 +-
>  include/hw/char/cadence_uart.h|   4 +-
>  include/hw/char/cmsdk-apb-uart.h  |   4 +-
>  include/hw/char/digic-uart.h  |   4 +-
>  include/hw/char/escc.h|   4 +-
>  include/hw/char/ibex_uart.h   |   4 +-
>  include/hw/char/imx_serial.h  |   4 +-
>  include/hw/char/nrf51_uart.h  |   4 +-
>  i

[RFC PATCH 4/6] hw/sd/sdcard: Reset both start/end addresses on error

2020-09-18 Thread Philippe Mathieu-Daudé
From the Spec "4.3.5 Erase":

  The host should adhere to the following command
  sequence: ERASE_WR_BLK_START, ERASE_WR_BLK_END and
  ERASE (CMD38).

  If an erase (CMD38) or address setting (CMD32, 33)
  command is received out of sequence, the card shall
  set the ERASE_SEQ_ERROR bit in the status register
  and reset the whole sequence.

Reset both addresses if the ERASE command occurred
out of sequence (one of the start/end addresses is
not set).

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 4c05152f189..ee7b64023aa 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -755,6 +755,8 @@ static void sd_erase(SDState *sd)
 if (sd->erase_start == INVALID_ADDRESS
 || sd->erase_end == INVALID_ADDRESS) {
 sd->card_status |= ERASE_SEQ_ERROR;
+sd->erase_start = INVALID_ADDRESS;
+sd->erase_end = INVALID_ADDRESS;
 return;
 }
 
-- 
2.26.2




[RFC PATCH 3/6] hw/sd/sdcard: Do not use legal address '0' for INVALID_ADDRESS

2020-09-18 Thread Philippe Mathieu-Daudé
As it is legal to WRITE/ERASE the address/block 0,
change the value of this definition to an illegal
address: UINT32_MAX.

Signed-off-by: Philippe Mathieu-Daudé 
---
Cc: Dr. David Alan Gilbert 
Cc: Markus Armbruster 

Same problem I had with the pflash device last year...
This breaks migration :(
What is the best way to do this?
---
 hw/sd/sd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 30ae435d669..4c05152f189 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -53,7 +53,7 @@
 
 #define SDSC_MAX_CAPACITY   (2 * GiB)
 
-#define INVALID_ADDRESS 0
+#define INVALID_ADDRESS UINT32_MAX
 
 typedef enum {
 sd_r0 = 0,/* no response */
@@ -666,8 +666,8 @@ static int sd_vmstate_pre_load(void *opaque)
 
 static const VMStateDescription sd_vmstate = {
 .name = "sd-card",
-.version_id = 1,
-.minimum_version_id = 1,
+.version_id = 2,
+.minimum_version_id = 2,
 .pre_load = sd_vmstate_pre_load,
 .fields = (VMStateField[]) {
 VMSTATE_UINT32(mode, SDState),
-- 
2.26.2




[RFC PATCH 6/6] hw/sd/sdcard: Assert if accessing an illegal group

2020-09-18 Thread Philippe Mathieu-Daudé
We cannot have more groups than 'wpgrps_size'.
Assert if we are accessing a group above this limit.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 4454d168e2f..c3febed2434 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -780,6 +780,7 @@ static void sd_erase(SDState *sd)
 sd->csd[14] |= 0x40;
 
 for (i = erase_start; i <= erase_end; i++) {
+assert(i < sd->wpgrps_size);
 if (test_bit(i, sd->wp_groups)) {
 sd->card_status |= WP_ERASE_SKIP;
 }
@@ -794,6 +795,7 @@ static uint32_t sd_wpbits(SDState *sd, uint64_t addr)
 wpnum = sd_addr_to_wpnum(addr);
 
 for (i = 0; i < 32; i++, wpnum++, addr += WPGROUP_SIZE) {
+assert(wpnum < sd->wpgrps_size);
 if (addr < sd->size && test_bit(wpnum, sd->wp_groups)) {
 ret |= (1 << i);
 }
-- 
2.26.2




[RFC PATCH 5/6] hw/sd/sdcard: Do not attempt to erase out of range addresses

2020-09-18 Thread Philippe Mathieu-Daudé
While the Spec v3 is not very clear, v6 states:

  If the host provides an out of range address as an argument
  to CMD32 or CMD33, the card shall indicate OUT_OF_RANGE error
  in R1 (ERX) for CMD38.

If an address is out of range, do not attempt to erase it:
return R1 with the error bit set.

Buglink: https://bugs.launchpad.net/bugs/1895310
Reported-by: Alexander Bulekov 
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index ee7b64023aa..4454d168e2f 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -766,6 +766,13 @@ static void sd_erase(SDState *sd)
 erase_end *= 512;
 }
 
+if (sd->erase_start > sd->size || sd->erase_end > sd->size) {
+sd->card_status |= OUT_OF_RANGE;
+sd->erase_start = INVALID_ADDRESS;
+sd->erase_end = INVALID_ADDRESS;
+return;
+}
+
 erase_start = sd_addr_to_wpnum(erase_start);
 erase_end = sd_addr_to_wpnum(erase_end);
 sd->erase_start = INVALID_ADDRESS;
-- 
2.26.2




[RFC PATCH 2/6] hw/sd/sdcard: Introduce the INVALID_ADDRESS definition

2020-09-18 Thread Philippe Mathieu-Daudé
'0' is used as a value to indicate an invalid (or unset)
address. Use a definition instead of a magic value.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 2606b969e34..30ae435d669 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -53,6 +53,8 @@
 
 #define SDSC_MAX_CAPACITY   (2 * GiB)
 
+#define INVALID_ADDRESS 0
+
 typedef enum {
 sd_r0 = 0,/* no response */
 sd_r1,/* normal response command */
@@ -575,8 +577,8 @@ static void sd_reset(DeviceState *dev)
 sd->wpgrps_size = sect;
 sd->wp_groups = bitmap_new(sd->wpgrps_size);
 memset(sd->function_group, 0, sizeof(sd->function_group));
-sd->erase_start = 0;
-sd->erase_end = 0;
+sd->erase_start = INVALID_ADDRESS;
+sd->erase_end = INVALID_ADDRESS;
 sd->size = size;
 sd->blk_len = 0x200;
 sd->pwd_len = 0;
@@ -750,7 +752,8 @@ static void sd_erase(SDState *sd)
 uint64_t erase_end = sd->erase_end;
 
 trace_sdcard_erase(sd->erase_start, sd->erase_end);
-if (!sd->erase_start || !sd->erase_end) {
+if (sd->erase_start == INVALID_ADDRESS
+|| sd->erase_end == INVALID_ADDRESS) {
 sd->card_status |= ERASE_SEQ_ERROR;
 return;
 }
@@ -763,8 +766,8 @@ static void sd_erase(SDState *sd)
 
 erase_start = sd_addr_to_wpnum(erase_start);
 erase_end = sd_addr_to_wpnum(erase_end);
-sd->erase_start = 0;
-sd->erase_end = 0;
+sd->erase_start = INVALID_ADDRESS;
+sd->erase_end = INVALID_ADDRESS;
 sd->csd[14] |= 0x40;
 
 for (i = erase_start; i <= erase_end; i++) {
-- 
2.26.2




[RFC PATCH 0/6] hw/sd/sdcard: Do not attempt to erase out of range addresses

2020-09-18 Thread Philippe Mathieu-Daudé
Yet another bug in the sdcard model found by libfuzzer:
https://bugs.launchpad.net/bugs/1895310

The bug is fixed, but there is a migration issue to
be resolved... so posting as RFC.

Philippe Mathieu-Daudé (6):
  hw/sd/sdcard: Add trace event for ERASE command (CMD38)
  hw/sd/sdcard: Introduce the INVALID_ADDRESS definition
  hw/sd/sdcard: Do not use legal address '0' for INVALID_ADDRESS
  hw/sd/sdcard: Reset both start/end addresses on error
  hw/sd/sdcard: Do not attempt to erase out of range addresses
  hw/sd/sdcard: Assert if accessing an illegal group

 hw/sd/sd.c | 30 ++
 hw/sd/trace-events |  2 +-
 2 files changed, 23 insertions(+), 9 deletions(-)

-- 
2.26.2




[RFC PATCH 1/6] hw/sd/sdcard: Add trace event for ERASE command (CMD38)

2020-09-18 Thread Philippe Mathieu-Daudé
Trace addresses provided to the ERASE command.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/sd/sd.c | 2 +-
 hw/sd/trace-events | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/sd/sd.c b/hw/sd/sd.c
index 0012884..2606b969e34 100644
--- a/hw/sd/sd.c
+++ b/hw/sd/sd.c
@@ -749,7 +749,7 @@ static void sd_erase(SDState *sd)
 uint64_t erase_start = sd->erase_start;
 uint64_t erase_end = sd->erase_end;
 
-trace_sdcard_erase();
+trace_sdcard_erase(sd->erase_start, sd->erase_end);
 if (!sd->erase_start || !sd->erase_end) {
 sd->card_status |= ERASE_SEQ_ERROR;
 return;
diff --git a/hw/sd/trace-events b/hw/sd/trace-events
index a87d7355fb8..96c7ea5e52f 100644
--- a/hw/sd/trace-events
+++ b/hw/sd/trace-events
@@ -46,7 +46,7 @@ sdcard_reset(void) ""
 sdcard_set_blocklen(uint16_t length) "0x%03x"
 sdcard_inserted(bool readonly) "read_only: %u"
 sdcard_ejected(void) ""
-sdcard_erase(void) ""
+sdcard_erase(uint32_t first, uint32_t last) "addr first 0x%" PRIx32" last 0x%" 
PRIx32
 sdcard_lock(void) ""
 sdcard_unlock(void) ""
 sdcard_read_block(uint64_t addr, uint32_t len) "addr 0x%" PRIx64 " size 0x%x"
-- 
2.26.2




[PULL v2 06/15] vhost: recheck dev state in the vhost_migration_log routine

2020-09-18 Thread Michael S. Tsirkin
From: Dima Stepanov 

vhost-user devices can get a disconnect in the middle of the VHOST-USER
handshake on the migration start. If disconnect event happened right
before sending next VHOST-USER command, then the vhost_dev_set_log()
call in the vhost_migration_log() function will return error. This error
will lead to the assert() and close the QEMU migration source process.
For the vhost-user devices the disconnect event should not break the
migration process, because:
  - the device will be in the stopped state, so it will not be changed
during migration
  - if reconnect will be made the migration log will be reinitialized as
part of reconnect/init process:
#0  vhost_log_global_start (listener=0x563989cf7be0)
at hw/virtio/vhost.c:920
#1  0x56398603d8bc in listener_add_address_space 
(listener=0x563989cf7be0,
as=0x563986ea4340 )
at softmmu/memory.c:2664
#2  0x56398603dd30 in memory_listener_register (listener=0x563989cf7be0,
as=0x563986ea4340 )
at softmmu/memory.c:2740
#3  0x563985fd6956 in vhost_dev_init (hdev=0x563989cf7bd8,
opaque=0x563989cf7e30, backend_type=VHOST_BACKEND_TYPE_USER,
busyloop_timeout=0)
at hw/virtio/vhost.c:1385
#4  0x563985f7d0b8 in vhost_user_blk_connect (dev=0x563989cf7990)
at hw/block/vhost-user-blk.c:315
#5  0x563985f7d3f6 in vhost_user_blk_event (opaque=0x563989cf7990,
event=CHR_EVENT_OPENED)
at hw/block/vhost-user-blk.c:379
Update the vhost-user-blk device with the internal started_vu field which
will be used for initialization (vhost_user_blk_start) and clean up
(vhost_user_blk_stop). This additional flag in the VhostUserBlk structure
will be used to track whether the device really needs to be stopped and
cleaned up on a vhost-user level.
The disconnect event will set the overall VHOST device (not vhost-user) to
the stopped state, so it can be used by the general vhost_migration_log
routine.
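Roughly, that recheck boils down to forgiving a failed log setup when
the device was stopped by a disconnect in the meantime. The helper below
is only an illustration of the idea (its name is made up), not the
literal hw/virtio/vhost.c hunk:

    static int vhost_set_log_checked(struct vhost_dev *dev, bool enable)
    {
        int r = vhost_dev_set_log(dev, enable);

        if (r < 0 && !dev->started) {
            /*
             * The vhost-user backend disconnected while logging was
             * being configured; the device is stopped, so migration
             * can proceed without the log.
             */
            r = 0;
        }
        return r;
    }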
Such an approach could be propagated to the other vhost-user devices, but a
better idea is just to make the same connect/disconnect code for all the
vhost-user devices.

This migration issue was slightly discussed earlier:
  - https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg01509.html
  - https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg05241.html

Signed-off-by: Dima Stepanov 
Reviewed-by: Raphael Norwitz 
Message-Id: 
<9fbfba06791a87813fcee3e2315f0b904cc6789a.1599813294.git.dimas...@yandex-team.ru>
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 include/hw/virtio/vhost-user-blk.h | 10 ++
 hw/block/vhost-user-blk.c  | 19 ---
 hw/virtio/vhost.c  | 27 ---
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/include/hw/virtio/vhost-user-blk.h 
b/include/hw/virtio/vhost-user-blk.h
index dc40ab6f11..5d86ff2e87 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -42,7 +42,17 @@ struct VHostUserBlk {
 VhostUserState vhost_user;
 struct vhost_virtqueue *vhost_vqs;
 VirtQueue **virtqs;
+
+/*
+ * There are at least two steps of initialization of the
+ * vhost-user device. The first is a "connect" step and
+ * second is a "start" step. Make a separation between
+ * those initialization phases by using two fields.
+ */
+/* vhost_user_blk_connect/vhost_user_blk_disconnect */
 bool connected;
+/* vhost_user_blk_start/vhost_user_blk_stop */
+bool started_vu;
 };
 
 #endif
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 39aec42dae..a076b1e54d 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -150,6 +150,7 @@ static int vhost_user_blk_start(VirtIODevice *vdev)
 error_report("Error starting vhost: %d", -ret);
 goto err_guest_notifiers;
 }
+s->started_vu = true;
 
 /* guest_notifier_mask/pending not used yet, so just unmask
  * everything here. virtio-pci will do the right thing by
@@ -175,6 +176,11 @@ static void vhost_user_blk_stop(VirtIODevice *vdev)
 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 int ret;
 
+if (!s->started_vu) {
+return;
+}
+s->started_vu = false;
+
 if (!k->set_guest_notifiers) {
 return;
 }
@@ -341,9 +347,7 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
 }
 s->connected = false;
 
-if (s->dev.started) {
-vhost_user_blk_stop(vdev);
-}
+vhost_user_blk_stop(vdev);
 
 vhost_dev_cleanup(&s->dev);
 }
@@ -399,6 +403,15 @@ static void vhost_user_blk_event(void *opaque, 
QEMUChrEvent event)
 NULL, NULL, false);
 aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque);
 }
+
+/*
+ * Move vhost device to the stopped state. The vhost-user device
+ * will be clean up and disconnected in BH. This can be useful in
+ * the vhost mi

Re: [PATCH v2 13/13] block/qed: bdrv_qed_do_open: deal with errp

2020-09-18 Thread Greg Kurz
On Thu, 17 Sep 2020 22:55:19 +0300
Vladimir Sementsov-Ogievskiy  wrote:

> Set errp always on failure. Generic bdrv_open_driver supports driver
> functions which can return negative value and forget to set errp.
> That's a strange thing.. Let's improve bdrv_qed_do_open to not behave
> this way. This allows to simplify code in
> bdrv_qed_co_invalidate_cache().
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Alberto Garcia 
> ---

Reviewed-by: Greg Kurz 

>  block/qed.c | 24 +++-
>  1 file changed, 15 insertions(+), 9 deletions(-)
> 
> diff --git a/block/qed.c b/block/qed.c
> index b27e7546ca..f45c640513 100644
> --- a/block/qed.c
> +++ b/block/qed.c
> @@ -393,6 +393,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>  
>  ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header));
>  if (ret < 0) {
> +error_setg(errp, "Failed to read QED header");
>  return ret;
>  }
>  qed_header_le_to_cpu(&le_header, &s->header);
> @@ -408,25 +409,30 @@ static int coroutine_fn 
> bdrv_qed_do_open(BlockDriverState *bs, QDict *options,
>  return -ENOTSUP;
>  }
>  if (!qed_is_cluster_size_valid(s->header.cluster_size)) {
> +error_setg(errp, "QED cluster size is invalid");
>  return -EINVAL;
>  }
>  
>  /* Round down file size to the last cluster */
>  file_size = bdrv_getlength(bs->file->bs);
>  if (file_size < 0) {
> +error_setg(errp, "Failed to get file length");
>  return file_size;
>  }
>  s->file_size = qed_start_of_cluster(s, file_size);
>  
>  if (!qed_is_table_size_valid(s->header.table_size)) {
> +error_setg(errp, "QED table size is invalid");
>  return -EINVAL;
>  }
>  if (!qed_is_image_size_valid(s->header.image_size,
>   s->header.cluster_size,
>   s->header.table_size)) {
> +error_setg(errp, "QED image size is invalid");
>  return -EINVAL;
>  }
>  if (!qed_check_table_offset(s, s->header.l1_table_offset)) {
> +error_setg(errp, "QED table offset is invalid");
>  return -EINVAL;
>  }
>  
> @@ -438,6 +444,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>  
>  /* Header size calculation must not overflow uint32_t */
>  if (s->header.header_size > UINT32_MAX / s->header.cluster_size) {
> +error_setg(errp, "QED header size is too large");
>  return -EINVAL;
>  }
>  
> @@ -445,6 +452,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>  if ((uint64_t)s->header.backing_filename_offset +
>  s->header.backing_filename_size >
>  s->header.cluster_size * s->header.header_size) {
> +error_setg(errp, "QED backing filename offset is invalid");
>  return -EINVAL;
>  }
>  
> @@ -453,6 +461,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>bs->auto_backing_file,
>sizeof(bs->auto_backing_file));
>  if (ret < 0) {
> +error_setg(errp, "Failed to read backing filename");
>  return ret;
>  }
>  pstrcpy(bs->backing_file, sizeof(bs->backing_file),
> @@ -475,6 +484,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>  
>  ret = qed_write_header_sync(s);
>  if (ret) {
> +error_setg(errp, "Failed to update header");
>  return ret;
>  }
>  
> @@ -487,6 +497,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>  
>  ret = qed_read_l1_table_sync(s);
>  if (ret) {
> +error_setg(errp, "Failed to read L1 table");
>  goto out;
>  }
>  
> @@ -503,6 +514,7 @@ static int coroutine_fn bdrv_qed_do_open(BlockDriverState 
> *bs, QDict *options,
>  
>  ret = qed_check(s, &result, true);
>  if (ret) {
> +error_setg(errp, "Image corrupted");
>  goto out;
>  }
>  }
> @@ -1537,22 +1549,16 @@ static void coroutine_fn 
> bdrv_qed_co_invalidate_cache(BlockDriverState *bs,
>Error **errp)
>  {
>  BDRVQEDState *s = bs->opaque;
> -Error *local_err = NULL;
>  int ret;
>  
>  bdrv_qed_close(bs);
>  
>  bdrv_qed_init_state(bs);
>  qemu_co_mutex_lock(&s->table_lock);
> -ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, &local_err);
> +ret = bdrv_qed_do_open(bs, NULL, bs->open_flags, errp);
>  qemu_co_mutex_unlock(&s->table_lock);
> -if (local_err) {
> -error_propagate_prepend(errp, local_err,
> -"Could not reopen qed layer: ");
> -return;
> -} else if (ret < 0) {
> -error_setg_

Re: [PATCH v2 12/13] block/qcow2: simplify qcow2_co_invalidate_cache()

2020-09-18 Thread Vladimir Sementsov-Ogievskiy

18.09.2020 18:51, Alberto Garcia wrote:

On Fri 18 Sep 2020 05:30:06 PM CEST, Greg Kurz wrote:

qcow2_do_open correctly sets errp on each failure path. So, we can
simplify code in qcow2_co_invalidate_cache() and drop explicit error
propagation. We should use ERRP_GUARD() (accordingly to comment in
include/qapi/error.h) together with error_append() call which we add to
avoid problems with error_fatal.



The wording gives the impression that we add error_append() to avoid problems
with error_fatal which is certainly not true. Also it isn't _append() but
_prepend() :)

What about ?

"Add ERRP_GUARD() as mandated by the documentation in include/qapi/error.h
  to avoid problems with the error_prepend() call if errp is
  &error_fatal."


OK for me.



I had to go to the individual error functions to see what "it doesn't
work with &error_fatal" actually means.

So in a case like qcow2_do_open() which has:

error_setg(errp, ...)
error_append_hint(errp, ...)

As far as I can see this works just fine without ERRP_GUARD() and with
error_fatal, the difference is that if we don't use the guard then the
process exits during error_setg(), and if we use the guard it exits
during the implicit error_propagate() call triggered by its destruction
at the end of the function. In this latter case the printed error
message would include the hint.



Yes the only problem is that without ERRP_GUARD we lose the hint in case of 
error_fatal.
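A small illustration of that corner case (the function below is made up;
error_setg(), error_append_hint() and ERRP_GUARD() are the real
qapi/error.h APIs):

    /*
     * Without ERRP_GUARD(): if errp == &error_fatal, error_setg() exits
     * the process right away, so the hint below is never attached or
     * printed.  With ERRP_GUARD(), errp points to a local Error and the
     * exit only happens when it is propagated on return, hint included.
     */
    static int blk_example_open(BlockDriverState *bs, Error **errp)
    {
        ERRP_GUARD();

        if (!bs->drv) {
            error_setg(errp, "No driver attached to node");
            error_append_hint(errp, "Did you forget to open the image?\n");
            return -EINVAL;
        }
        return 0;
    }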

--
Best regards,
Vladimir



Re: [PATCH v2 12/13] block/qcow2: simplify qcow2_co_invalidate_cache()

2020-09-18 Thread Greg Kurz
On Fri, 18 Sep 2020 19:01:34 +0300
Vladimir Sementsov-Ogievskiy  wrote:

> 18.09.2020 18:51, Alberto Garcia wrote:
> > On Fri 18 Sep 2020 05:30:06 PM CEST, Greg Kurz wrote:
> >>> qcow2_do_open correctly sets errp on each failure path. So, we can
> >>> simplify code in qcow2_co_invalidate_cache() and drop explicit error
> >>> propagation. We should use ERRP_GUARD() (accordingly to comment in
> >>> include/qapi/error.h) together with error_append() call which we add to
> >>> avoid problems with error_fatal.
> >>>
> >>
> >> The wording gives the impression that we add error_append() to avoid 
> >> problems
> >> with error_fatal which is certainly not true. Also it isn't _append() but
> >> _prepend() :)
> >>
> >> What about ?
> >>
> >> "Add ERRP_GUARD() as mandated by the documentation in include/qapi/error.h
> >>   to avoid problems with the error_prepend() call if errp is
> >>   &error_fatal."
> 
> OK for me.
> 
> > 
> > I had to go to the individual error functions to see what "it doesn't
> > work with &error_fatal" actually means.
> > 
> > So in a case like qcow2_do_open() which has:
> > 
> > error_setg(errp, ...)
> > error_append_hint(errp, ...)
> > 
> > As far as I can see this works just fine without ERRP_GUARD() and with
> > error_fatal, the difference is that if we don't use the guard then the
> > process exits during error_setg(), and if we use the guard it exits
> > during the implicit error_propagate() call triggered by its destruction
> > at the end of the function. In this latter case the printed error
> > message would include the hint.
> > 
> 
> Yes the only problem is that without ERRP_GUARD we lose the hint in case of 
> error_fatal.
> 

Yeah, so rather:

"Add ERRP_GUARD() as mandated by the documentation in include/qapi/error.h
 so that error_prepend() is actually called even if errp is &error_fatal."

Cheers,

--
Greg



Re: [PATCH v2] tests/check-block: Do not run the iotests with old versions of bash

2020-09-18 Thread no-reply
Patchew URL: https://patchew.org/QEMU/20200918153514.330705-1-th...@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

N/A. Internal error while reading log file



The full log is available at
http://patchew.org/logs/20200918153514.330705-1-th...@redhat.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v2 12/13] block/qcow2: simplify qcow2_co_invalidate_cache()

2020-09-18 Thread Alberto Garcia
On Fri 18 Sep 2020 05:30:06 PM CEST, Greg Kurz wrote:
>> qcow2_do_open correctly sets errp on each failure path. So, we can
>> simplify code in qcow2_co_invalidate_cache() and drop explicit error
>> propagation. We should use ERRP_GUARD() (accordingly to comment in
>> include/qapi/error.h) together with error_append() call which we add to
>> avoid problems with error_fatal.
>> 
>
> The wording gives the impression that we add error_append() to avoid problems
> with error_fatal which is certainly not true. Also it isn't _append() but
> _prepend() :)
>
> What about ?
>
> "Add ERRP_GUARD() as mandated by the documentation in include/qapi/error.h
>  to avoid problems with the error_prepend() call if errp is
>  &error_fatal."

I had to go to the individual error functions to see what "it doesn't
work with &error_fatal" actually means.

So in a case like qcow2_do_open() which has:

   error_setg(errp, ...)
   error_append_hint(errp, ...)

As far as I can see this works just fine without ERRP_GUARD() and with
error_fatal, the difference is that if we don't use the guard then the
process exits during error_setg(), and if we use the guard it exits
during the implicit error_propagate() call triggered by its destruction
at the end of the function. In this latter case the printed error
message would include the hint.

Berto



Re: [PATCH v2] tests/check-block: Do not run the iotests with old versions of bash

2020-09-18 Thread Daniel P . Berrangé
On Fri, Sep 18, 2020 at 05:35:14PM +0200, Thomas Huth wrote:
> macOS is shipped with a very old version of the bash (3.2), which
> is currently not suitable for running the iotests anymore (e.g.
> it is missing support for "readarray" which is used in the file
> tests/qemu-iotests/common.filter). Add a check to skip the iotests
> in this case - if someone still wants to run the iotests on macOS,
> they can install a newer version from homebrew, for example.
> 
> Signed-off-by: Thomas Huth 
> ---
>  v2: Use LANG=C and "-q"
> 
>  tests/check-block.sh | 5 +
>  1 file changed, 5 insertions(+)

Reviewed-by: Daniel P. Berrangé 


Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|




[PATCH v2] tests/check-block: Do not run the iotests with old versions of bash

2020-09-18 Thread Thomas Huth
macOS is shipped with a very old version of the bash (3.2), which
is currently not suitable for running the iotests anymore (e.g.
it is missing support for "readarray" which is used in the file
tests/qemu-iotests/common.filter). Add a check to skip the iotests
in this case - if someone still wants to run the iotests on macOS,
they can install a newer version from homebrew, for example.

Signed-off-by: Thomas Huth 
---
 v2: Use LANG=C and "-q"

 tests/check-block.sh | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tests/check-block.sh b/tests/check-block.sh
index a5a69060e1..f6b1bda7b9 100755
--- a/tests/check-block.sh
+++ b/tests/check-block.sh
@@ -46,6 +46,11 @@ if ! command -v bash >/dev/null 2>&1 ; then
 exit 0
 fi
 
+if LANG=C bash --version | grep -q 'GNU bash, version [123]' ; then
+echo "bash version too old ==> Not running the qemu-iotests."
+exit 0
+fi
+
 if ! (sed --version | grep 'GNU sed') > /dev/null 2>&1 ; then
 if ! command -v gsed >/dev/null 2>&1; then
 echo "GNU sed not available ==> Not running the qemu-iotests."
-- 
2.18.2




Re: [PATCH v2 12/13] block/qcow2: simplify qcow2_co_invalidate_cache()

2020-09-18 Thread Greg Kurz
On Thu, 17 Sep 2020 22:55:18 +0300
Vladimir Sementsov-Ogievskiy  wrote:

> qcow2_do_open correctly sets errp on each failure path. So, we can
> simplify code in qcow2_co_invalidate_cache() and drop explicit error
> propagation. We should use ERRP_GUARD() (accordingly to comment in
> include/qapi/error.h) together with error_append() call which we add to
> avoid problems with error_fatal.
> 

The wording gives the impression that we add error_append() to avoid problems
with error_fatal which is certainly not true. Also it isn't _append() but
_prepend() :)

What about ?

"Add ERRP_GUARD() as mandated by the documentation in include/qapi/error.h
 to avoid problems with the error_prepend() call if errp is &error_fatal."

With that fixed,

Reviewed-by: Greg Kurz 

> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/qcow2.c | 13 -
>  1 file changed, 4 insertions(+), 9 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 2b6ec4b757..cd5f48d3fb 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2702,11 +2702,11 @@ static void qcow2_close(BlockDriverState *bs)
>  static void coroutine_fn qcow2_co_invalidate_cache(BlockDriverState *bs,
> Error **errp)
>  {
> +ERRP_GUARD();
>  BDRVQcow2State *s = bs->opaque;
>  int flags = s->flags;
>  QCryptoBlock *crypto = NULL;
>  QDict *options;
> -Error *local_err = NULL;
>  int ret;
>  
>  /*
> @@ -2724,16 +2724,11 @@ static void coroutine_fn 
> qcow2_co_invalidate_cache(BlockDriverState *bs,
>  
>  flags &= ~BDRV_O_INACTIVE;
>  qemu_co_mutex_lock(&s->lock);
> -ret = qcow2_do_open(bs, options, flags, &local_err);
> +ret = qcow2_do_open(bs, options, flags, errp);
>  qemu_co_mutex_unlock(&s->lock);
>  qobject_unref(options);
> -if (local_err) {
> -error_propagate_prepend(errp, local_err,
> -"Could not reopen qcow2 layer: ");
> -bs->drv = NULL;
> -return;
> -} else if (ret < 0) {
> -error_setg_errno(errp, -ret, "Could not reopen qcow2 layer");
> +if (ret < 0) {
> +error_prepend(errp, "Could not reopen qcow2 layer: ");
>  bs->drv = NULL;
>  return;
>  }




[PATCH] iotests/046: Filter request length

2020-09-18 Thread Max Reitz
For its concurrent requests, 046 has always filtered the offset,
probably because concurrent requests may settle in any order.  However,
it did not filter the request length, and so if requests with different
lengths settle in an unexpected order (notably the longer request before
the shorter request), the test fails (for no good reason).

Filter the length, too.

Signed-off-by: Max Reitz 
---
This has annoyed me for quite some time now, but when rebasing (and
testing) my FUSE export series, it became apparent that on a FUSE export
qcow2 images with -o compat=0.10 always fail this test (because the
first 56k request settles before its accompanying 8k request), so now
I'm forced to do something about it.
---
 tests/qemu-iotests/046 |   3 +-
 tests/qemu-iotests/046.out | 104 ++---
 2 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/tests/qemu-iotests/046 b/tests/qemu-iotests/046
index ed6fae3529..17e71c4b5e 100755
--- a/tests/qemu-iotests/046
+++ b/tests/qemu-iotests/046
@@ -186,7 +186,8 @@ EOF
 }
 
 overlay_io | $QEMU_IO blkdebug::"$TEST_IMG" | _filter_qemu_io |\
-   sed -e 's/bytes at offset [0-9]*/bytes at offset XXX/g'
+sed -e 's/[0-9]*\/[0-9]* bytes at offset [0-9]*/XXX\/XXX bytes at offset 
XXX/g' \
+-e 's/^[0-9]* KiB/XXX KiB/g'
 
 echo
 echo "== Verify image content =="
diff --git a/tests/qemu-iotests/046.out b/tests/qemu-iotests/046.out
index 66ad987ab3..b1a03f4041 100644
--- a/tests/qemu-iotests/046.out
+++ b/tests/qemu-iotests/046.out
@@ -71,74 +71,74 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=6442450944 
backing_file=TEST_DIR
 == Some concurrent requests touching the same cluster ==
 blkdebug: Suspended request 'A'
 blkdebug: Resuming request 'A'
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 blkdebug: Suspended request 'A'
 blkdebug: Resuming request 'A'
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 blkdebug: Suspended request 'A'
 blkdebug: Resuming request 'A'
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 32768/32768 bytes at offset XXX
-32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 blkdebug: Suspended request 'A'
 blkdebug: Resuming request 'A'
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 57344/57344 bytes at offset XXX
-56 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 4096/4096 bytes at offset XXX
-4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 32768/32768 bytes at offset XXX
-32 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-discard 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+discard XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 blkdebug: Suspended request 'A'
 blkdebug: Resuming request 'A'
-wrote 8192/8192 bytes at offset XXX
-8 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 57344/57344 bytes at offset XXX
-56 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 4096/4096 bytes at offset XXX
-4 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-wrote 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-discard 65536/65536 bytes at offset XXX
-64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote XXX/XXX bytes at offset XXX
+XXX KiB, X ops; XX:XX:XX.X (X

Re: [PATCH v2 12/13] block/qcow2: simplify qcow2_co_invalidate_cache()

2020-09-18 Thread Alberto Garcia
On Thu 17 Sep 2020 09:55:18 PM CEST, Vladimir Sementsov-Ogievskiy wrote:
> qcow2_do_open correctly sets errp on each failure path. So, we can
> simplify code in qcow2_co_invalidate_cache() and drop explicit error
> propagation. We should use ERRP_GUARD() (accordingly to comment in
> include/qapi/error.h) together with error_append() call which we add to
> avoid problems with error_fatal.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

Berto



Re: [PATCH v2 10/13] block/qcow2-bitmap: return status from qcow2_store_persistent_dirty_bitmaps

2020-09-18 Thread Alberto Garcia
On Thu 17 Sep 2020 09:55:16 PM CEST, Vladimir Sementsov-Ogievskiy wrote:
> It's better to return status together with setting errp. It makes
> possible to avoid error propagation.
>
> While being here, put ERRP_GUARD() to fix error_prepend(errp, ...)
> usage inside qcow2_store_persistent_dirty_bitmaps() (see the comment
> above ERRP_GUARD() definition in include/qapi/error.h)
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Greg Kurz 

Reviewed-by: Alberto Garcia 

Berto



Re: [PATCH v2 09/13] block/qcow2-bitmap: improve qcow2_load_dirty_bitmaps() interface

2020-09-18 Thread Greg Kurz
On Thu, 17 Sep 2020 22:55:15 +0300
Vladimir Sementsov-Ogievskiy  wrote:

> It's recommended for bool functions with errp to return true on success
> and false on failure. Non-standard interfaces don't help to understand
> the code. The change is also needed to reduce error propagation.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---

With the documentation change suggested by Berto,

Reviewed-by: Greg Kurz 

>  block/qcow2.h|  3 ++-
>  block/qcow2-bitmap.c | 25 ++---
>  block/qcow2.c|  6 ++
>  3 files changed, 18 insertions(+), 16 deletions(-)
> 
> diff --git a/block/qcow2.h b/block/qcow2.h
> index 6eac088f1c..3c64dcda33 100644
> --- a/block/qcow2.h
> +++ b/block/qcow2.h
> @@ -972,7 +972,8 @@ void qcow2_cache_discard(Qcow2Cache *c, void *table);
>  int qcow2_check_bitmaps_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>void **refcount_table,
>int64_t *refcount_table_size);
> -bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp);
> +bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool *header_updated,
> +  Error **errp);
>  bool qcow2_get_bitmap_info_list(BlockDriverState *bs,
>  Qcow2BitmapInfoList **info_list, Error 
> **errp);
>  int qcow2_reopen_bitmaps_rw(BlockDriverState *bs, Error **errp);
> diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
> index 4f6138f544..500175f4e8 100644
> --- a/block/qcow2-bitmap.c
> +++ b/block/qcow2-bitmap.c
> @@ -962,25 +962,26 @@ static void set_readonly_helper(gpointer bitmap, 
> gpointer value)
>  bdrv_dirty_bitmap_set_readonly(bitmap, (bool)value);
>  }
>  
> -/* qcow2_load_dirty_bitmaps()
> - * Return value is a hint for caller: true means that the Qcow2 header was
> - * updated. (false doesn't mean that the header should be updated by the
> - * caller, it just means that updating was not needed or the image cannot be
> - * written to).
> - * On failure the function returns false.
> +/*
> + * Return true on success, false on failure. Anyway, if header_updated
> + * provided set it appropriately.
>   */
> -bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, Error **errp)
> +bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, bool *header_updated,
> +  Error **errp)
>  {
>  BDRVQcow2State *s = bs->opaque;
>  Qcow2BitmapList *bm_list;
>  Qcow2Bitmap *bm;
>  GSList *created_dirty_bitmaps = NULL;
> -bool header_updated = false;
>  bool needs_update = false;
>  
> +if (header_updated) {
> +*header_updated = false;
> +}
> +
>  if (s->nb_bitmaps == 0) {
>  /* No bitmaps - nothing to do */
> -return false;
> +return true;
>  }
>  
>  bm_list = bitmap_list_load(bs, s->bitmap_directory_offset,
> @@ -1036,7 +1037,9 @@ bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, 
> Error **errp)
>  error_setg_errno(errp, -ret, "Can't update bitmap directory");
>  goto fail;
>  }
> -header_updated = true;
> +if (header_updated) {
> +*header_updated = true;
> +}
>  }
>  
>  if (!can_write(bs)) {
> @@ -1047,7 +1050,7 @@ bool qcow2_load_dirty_bitmaps(BlockDriverState *bs, 
> Error **errp)
>  g_slist_free(created_dirty_bitmaps);
>  bitmap_list_free(bm_list);
>  
> -return header_updated;
> +return true;
>  
>  fail:
>  g_slist_foreach(created_dirty_bitmaps, release_dirty_bitmap_helper, bs);
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 8c89c98978..c4b86df7c0 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -1297,7 +1297,6 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
> *bs, QDict *options,
>  unsigned int len, i;
>  int ret = 0;
>  QCowHeader header;
> -Error *local_err = NULL;
>  uint64_t ext_end;
>  uint64_t l1_vm_state_index;
>  bool update_header = false;
> @@ -1785,9 +1784,8 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
> *bs, QDict *options,
>  
>  if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
>  /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
> -bool header_updated = qcow2_load_dirty_bitmaps(bs, &local_err);
> -if (local_err != NULL) {
> -error_propagate(errp, local_err);
> +bool header_updated;
> +if (!qcow2_load_dirty_bitmaps(bs, &header_updated, errp)) {
>  ret = -EINVAL;
>  goto fail;
>  }




[PULL 06/15] vhost: recheck dev state in the vhost_migration_log routine

2020-09-18 Thread Michael S. Tsirkin
From: Dima Stepanov 

vhost-user devices can get a disconnect in the middle of the VHOST-USER
handshake on the migration start. If disconnect event happened right
before sending next VHOST-USER command, then the vhost_dev_set_log()
call in the vhost_migration_log() function will return error. This error
will lead to the assert() and close the QEMU migration source process.
For the vhost-user devices the disconnect event should not break the
migration process, because:
  - the device will be in the stopped state, so it will not be changed
during migration
  - if reconnect will be made the migration log will be reinitialized as
part of reconnect/init process:
#0  vhost_log_global_start (listener=0x563989cf7be0)
at hw/virtio/vhost.c:920
#1  0x56398603d8bc in listener_add_address_space 
(listener=0x563989cf7be0,
as=0x563986ea4340 )
at softmmu/memory.c:2664
#2  0x56398603dd30 in memory_listener_register (listener=0x563989cf7be0,
as=0x563986ea4340 )
at softmmu/memory.c:2740
#3  0x563985fd6956 in vhost_dev_init (hdev=0x563989cf7bd8,
opaque=0x563989cf7e30, backend_type=VHOST_BACKEND_TYPE_USER,
busyloop_timeout=0)
at hw/virtio/vhost.c:1385
#4  0x563985f7d0b8 in vhost_user_blk_connect (dev=0x563989cf7990)
at hw/block/vhost-user-blk.c:315
#5  0x563985f7d3f6 in vhost_user_blk_event (opaque=0x563989cf7990,
event=CHR_EVENT_OPENED)
at hw/block/vhost-user-blk.c:379
Update the vhost-user-blk device with the internal started_vu field which
will be used for initialization (vhost_user_blk_start) and clean up
(vhost_user_blk_stop). This additional flag in the VhostUserBlk structure
will be used to track whether the device really needs to be stopped and
cleaned up on a vhost-user level.
The disconnect event will set the overall VHOST device (not vhost-user) to
the stopped state, so it can be used by the general vhost_migration_log
routine.
Such an approach could be propagated to the other vhost-user devices, but a
better idea is just to make the same connect/disconnect code for all the
vhost-user devices.

This migration issue was slightly discussed earlier:
  - https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg01509.html
  - https://lists.gnu.org/archive/html/qemu-devel/2020-05/msg05241.html

Signed-off-by: Dima Stepanov 
Message-Id: 

Reviewed-by: Raphael Norwitz 
Reviewed-by: Michael S. Tsirkin 
Signed-off-by: Michael S. Tsirkin 
---
 include/hw/virtio/vhost-user-blk.h | 10 ++
 hw/block/vhost-user-blk.c  | 19 ---
 hw/virtio/vhost.c  | 27 ---
 3 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/include/hw/virtio/vhost-user-blk.h 
b/include/hw/virtio/vhost-user-blk.h
index dc40ab6f11..5d86ff2e87 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -42,7 +42,17 @@ struct VHostUserBlk {
 VhostUserState vhost_user;
 struct vhost_virtqueue *vhost_vqs;
 VirtQueue **virtqs;
+
+/*
+ * There are at least two steps of initialization of the
+ * vhost-user device. The first is a "connect" step and
+ * second is a "start" step. Make a separation between
+ * those initialization phases by using two fields.
+ */
+/* vhost_user_blk_connect/vhost_user_blk_disconnect */
 bool connected;
+/* vhost_user_blk_start/vhost_user_blk_stop */
+bool started_vu;
 };
 
 #endif
diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 39aec42dae..a076b1e54d 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -150,6 +150,7 @@ static int vhost_user_blk_start(VirtIODevice *vdev)
 error_report("Error starting vhost: %d", -ret);
 goto err_guest_notifiers;
 }
+s->started_vu = true;
 
 /* guest_notifier_mask/pending not used yet, so just unmask
  * everything here. virtio-pci will do the right thing by
@@ -175,6 +176,11 @@ static void vhost_user_blk_stop(VirtIODevice *vdev)
 VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 int ret;
 
+if (!s->started_vu) {
+return;
+}
+s->started_vu = false;
+
 if (!k->set_guest_notifiers) {
 return;
 }
@@ -341,9 +347,7 @@ static void vhost_user_blk_disconnect(DeviceState *dev)
 }
 s->connected = false;
 
-if (s->dev.started) {
-vhost_user_blk_stop(vdev);
-}
+vhost_user_blk_stop(vdev);
 
 vhost_dev_cleanup(&s->dev);
 }
@@ -399,6 +403,15 @@ static void vhost_user_blk_event(void *opaque, 
QEMUChrEvent event)
 NULL, NULL, false);
 aio_bh_schedule_oneshot(ctx, vhost_user_blk_chr_closed_bh, opaque);
 }
+
+/*
+ * Move vhost device to the stopped state. The vhost-user device
+ * will be clean up and disconnected in BH. This can be useful in
+ * the vhost migration code. If disconnect was caught there is an
+ * option for the gen

Re: [PATCH v2 08/13] block/qcow2: qcow2_get_specific_info(): drop error propagation

2020-09-18 Thread Alberto Garcia
On Thu 17 Sep 2020 09:55:14 PM CEST, Vladimir Sementsov-Ogievskiy wrote:
> Don't use error propagation in qcow2_get_specific_info(). For this
> refactor qcow2_get_bitmap_info_list, its current interface is rather
> weird.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Greg Kurz 

> - * In case of no bitmaps, the function returns NULL and
> - * the @errp parameter is not set.
> - * When bitmap information can not be obtained, the function returns
> - * NULL and the @errp parameter is set.
> + * On success return true with bm_list set (probably to NULL, if no bitmaps),
> + * on failure return false with errp set.

I still find the 'probably' thing odd :-) I think the API documentation
should describe what the function does and under which conditions, not
the probability of the outcomes.

>  QSIMPLEQ_FOREACH(bm, bm_list, entry) {
>  Qcow2BitmapInfo *info = g_new0(Qcow2BitmapInfo, 1);
>  Qcow2BitmapInfoList *obj = g_new0(Qcow2BitmapInfoList, 1);
> @@ -1124,13 +1123,13 @@ Qcow2BitmapInfoList 
> *qcow2_get_bitmap_info_list(BlockDriverState *bs,
>  info->name = g_strdup(bm->name);
>  info->flags = get_bitmap_info_flags(bm->flags & ~BME_RESERVED_FLAGS);
>  obj->value = info;
> -*plist = obj;
> -plist = &obj->next;
> +*info_list = obj;
> +info_list = &obj->next;
>  }

You were right with this, I got confused by the fact that you are
modifying the pointer provided by the user.

Reviewed-by: Alberto Garcia 

Berto



Re: [PATCH] xen: rework pci_piix3_xen_ide_unplug

2020-09-18 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200918145256.1886250-1-anthony.per...@citrix.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

N/A. Internal error while reading log file



The full log is available at
http://patchew.org/logs/20200918145256.1886250-1-anthony.per...@citrix.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH v2 09/13] block/qcow2-bitmap: improve qcow2_load_dirty_bitmaps() interface

2020-09-18 Thread Vladimir Sementsov-Ogievskiy

18.09.2020 17:54, Alberto Garcia wrote:

On Thu 17 Sep 2020 09:55:15 PM CEST, Vladimir Sementsov-Ogievskiy wrote:

It's recommended for bool functions with errp to return true on success
and false on failure. Non-standard interfaces don't help to understand
the code. The change is also needed to reduce error propagation.

Signed-off-by: Vladimir Sementsov-Ogievskiy 



+/*
+ * Return true on success, false on failure. Anyway, if header_updated
+ * provided set it appropriately.
   */


I'm not a native speaker but it sounds a bit odd to me. Maybe "If
header_updated is not NULL then it is set appropriately regardless of
the return value".


That's better I think, thanks.



But I'm fine with your version, so

Reviewed-by: Alberto Garcia 

Berto




--
Best regards,
Vladimir



[PATCH] xen: rework pci_piix3_xen_ide_unplug

2020-09-18 Thread Anthony PERARD via
This is to allow IDE disks to be unplugged when added to QEMU via:
-drive file=/root/disk_file,if=none,id=ide-disk0,format=raw
-device ide-hd,drive=ide-disk0,bus=ide.0,unit=0

as the current code only works for disk added with:
-drive file=/root/disk_file,if=ide,index=0,media=disk,format=raw

Since the code already has the IDE controller as `dev`, we don't need
to use the legacy DriveInfo to find all the drives we want to unplug.
We can simply use `blk` from the controller, as it was kind of already
assumed to be the same, by setting it to NULL.

Signed-off-by: Anthony PERARD 
---
 hw/ide/piix.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/hw/ide/piix.c b/hw/ide/piix.c
index b402a936362b..16fcbe58d6fe 100644
--- a/hw/ide/piix.c
+++ b/hw/ide/piix.c
@@ -164,30 +164,29 @@ static void pci_piix_ide_realize(PCIDevice *dev, Error 
**errp)
 int pci_piix3_xen_ide_unplug(DeviceState *dev, bool aux)
 {
 PCIIDEState *pci_ide;
-DriveInfo *di;
 int i;
 IDEDevice *idedev;
+IDEBus *idebus;
+BlockBackend *blk;
 
 pci_ide = PCI_IDE(dev);
 
 for (i = aux ? 1 : 0; i < 4; i++) {
-di = drive_get_by_index(IF_IDE, i);
-if (di != NULL && !di->media_cd) {
-BlockBackend *blk = blk_by_legacy_dinfo(di);
-DeviceState *ds = blk_get_attached_dev(blk);
+idebus = &pci_ide->bus[i / 2];
+blk = idebus->ifs[i % 2].blk;
 
-blk_drain(blk);
-blk_flush(blk);
-
-if (ds) {
-blk_detach_dev(blk, ds);
-}
-pci_ide->bus[di->bus].ifs[di->unit].blk = NULL;
+if (blk && idebus->ifs[i%2].drive_kind != IDE_CD) {
 if (!(i % 2)) {
-idedev = pci_ide->bus[di->bus].master;
+idedev = idebus->master;
 } else {
-idedev = pci_ide->bus[di->bus].slave;
+idedev = idebus->slave;
 }
+
+blk_drain(blk);
+blk_flush(blk);
+
+blk_detach_dev(blk, DEVICE(idedev));
+idebus->ifs[i % 2].blk = NULL;
 idedev->conf.blk = NULL;
 monitor_remove_blk(blk);
 blk_unref(blk);
-- 
Anthony PERARD




Re: [PATCH v2 09/13] block/qcow2-bitmap: improve qcow2_load_dirty_bitmaps() interface

2020-09-18 Thread Alberto Garcia
On Thu 17 Sep 2020 09:55:15 PM CEST, Vladimir Sementsov-Ogievskiy wrote:
> It's recommended for bool functions with errp to return true on success
> and false on failure. Non-standard interfaces don't help to understand
> the code. The change is also needed to reduce error propagation.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

> +/*
> + * Return true on success, false on failure. Anyway, if header_updated
> + * provided set it appropriately.
>   */

I'm not a native speaker but it sounds a bit odd to me. Maybe "If
header_updated is not NULL then it is set appropriately regardless of
the return value".

But I'm fine with your version, so

Reviewed-by: Alberto Garcia 

Berto



Re: [PATCH v1 0/2] Add timeout mechanism to qmp actions

2020-09-18 Thread Fam Zheng
On 2020-09-18 19:23, Zhenyu Ye wrote:
>   Thread 5 (LWP 4802):
>   #0  0x83086b54 in syscall () at /lib64/libc.so.6
>   #1  0x834598b8 in io_submit () at /lib64/libaio.so.1
>   #2  0xe851e89c in ioq_submit (s=0xfffd3c001bb0) at 
> ../block/linux-aio.c:299
>   #3  0xe851eb50 in laio_io_unplug (bs=0xef0f2340, 
> s=0xfffd3c001bb0)
>   at ../block/linux-aio.c:344
>   #4  0xe8559f1c in raw_aio_unplug (bs=0xef0f2340) at 
> ../block/file-posix.c:2063
>   #5  0xe8538344 in bdrv_io_unplug (bs=0xef0f2340) at 
> ../block/io.c:3135
>   #6  0xe8538360 in bdrv_io_unplug (bs=0xef0eb020) at 
> ../block/io.c:3140
>   #7  0xe8496104 in blk_io_unplug (blk=0xef0e8f20)
>   at ../block/block-backend.c:2147
>   #8  0xe830e1a4 in virtio_blk_handle_vq (s=0xf0374280, 
> vq=0x700fc1d8)
>   at ../hw/block/virtio-blk.c:796
>   #9  0xe82e6b68 in virtio_blk_data_plane_handle_output
>   (vdev=0xf0374280, vq=0x700fc1d8) at 
> ../hw/block/dataplane/virtio-blk.c:165
>   #10 0xe83878fc in virtio_queue_notify_aio_vq (vq=0x700fc1d8)
>   at ../hw/virtio/virtio.c:2325
>   #11 0xe838ab50 in virtio_queue_host_notifier_aio_poll 
> (opaque=0x700fc250)
>   at ../hw/virtio/virtio.c:3545
>   #12 0xe85fab3c in run_poll_handlers_once
>   (ctx=0xef0a87b0, now=77604310618960, timeout=0x73ffdf78)
>   at ../util/aio-posix.c:398
>   #13 0xe85fae5c in run_poll_handlers
>   (ctx=0xef0a87b0, max_ns=4000, timeout=0x73ffdf78) at 
> ../util/aio-posix.c:492
>   #14 0xe85fb078 in try_poll_mode (ctx=0xef0a87b0, 
> timeout=0x73ffdf78)
>   at ../util/aio-posix.c:535
>   #15 0xe85fb180 in aio_poll (ctx=0xef0a87b0, blocking=true)
>   at ../util/aio-posix.c:571
>   #16 0xe8027004 in iothread_run (opaque=0xeee79a00) at 
> ../iothread.c:73
>   #17 0xe85f269c in qemu_thread_start (args=0xef0a8d10)
>   at ../util/qemu-thread-posix.c:521
>   #18 0x831428bc in  () at /lib64/libpthread.so.0
>   #19 0x8308aa1c in  () at /lib64/libc.so.6

I can see how blocking in a slow io_submit can cause trouble for main
thread. I think one way to fix it (until it's made truly async in new
kernels) is moving the io_submit call to thread pool, and wrapped in a
coroutine, perhaps.
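For illustration, that could look roughly like the sketch below.
SubmitJob and the two helpers are made up; thread_pool_submit_co(),
aio_get_thread_pool() and io_submit() are the existing APIs:

    typedef struct SubmitJob {
        io_context_t aio_ctx;   /* kernel AIO context */
        struct iocb **iocbs;    /* requests gathered from the plugged queue */
        int niocb;
    } SubmitJob;

    static int submit_worker(void *opaque)
    {
        SubmitJob *job = opaque;

        /* Runs on a worker thread, so it may block without stalling
         * the main loop or the iothread owning the AioContext. */
        return io_submit(job->aio_ctx, job->niocb, job->iocbs);
    }

    static int coroutine_fn submit_via_thread_pool(AioContext *ctx,
                                                   SubmitJob *job)
    {
        ThreadPool *pool = aio_get_thread_pool(ctx);

        /* Yields the calling coroutine until the worker finishes. */
        return thread_pool_submit_co(pool, submit_worker, job);
    }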

I'm not sure qmp timeout is a complete solution because we would still
suffer from a blocked state for a period, in this exact situation before
the timeout.

Fam



Re: [PATCH v2] hw/ide: check null block before _cancel_dma_sync

2020-09-18 Thread Li Qiang
P J P  wrote on Fri, 18 Sep 2020 at 18:26:
>
> +-- On Fri, 18 Sep 2020, Li Qiang wrote --+
> | Update v2: use an assert() call
> |   ->https://lists.nongnu.org/archive/html/qemu-devel/2020-08/msg08336.html
> ...
> | I think it is better to defer this check to 'ide_cancel_dma_sync'.
> | 'ide_cancel_dma_sync' is also called by 'cmd_device_reset' and all of the
> | handlers of 'ide_cmd_table' will check whether the 's->blk' is NULL in the
> | beginning of 'ide_exec_cmd'.
> |
> | So I think it is reasonable to check 's->blk' at the begining of
> | 'ide_cancel_dma_sync'.
>
> * Yes, earlier patch v1 above does the same.
>
> * From Peter's reply in another thread of similar issue I gather, issue is
>   setting 'blk' to NULL, even erroneously. So (blk == NULL) check should be
>   done where 'blk' is set to null, rather than where it is dereferenced.
>

I don't see anywhere that sets 'blk' to NULL.
I think this NULL dereference is caused by the following:

In 'pci_ide_create_devs' we call 'ide_create_drive'; the latter
function creates 's->blk'.
However, it does not do so for every IDEBus.

IDEBus[0]: ifs[2]
IDEBus[1]: ifs[2]

With the reproducer command line, 'ide_create_drive' will only
initialize 'IDEBus[0].ifs[0]' and 'IDEBus[1].ifs[0]'.
So the blk of 'IDEBus[0].ifs[1]' and 'IDEBus[1].ifs[1]' will be NULL.

In 'ide_ioport_write' the guest can set 'bus->unit' to 0 or 1 by issuing
'ATA_IOPORT_WR_DEVICE_HEAD'. So in this case the guest
can select the active ifs. If the guest sets this to 1,
then 'idebus_active_if' will return 'IDEBus.ifs[1]' and thus
's->blk' will be NULL.

So following your (Peter's) suggestion, we would need to check the value in
the 'ATA_IOPORT_WR_DEVICE_HEAD' handler, i.e. whether the guest
set a valid 'bus->unit'. That could also work, I think.

As 'ide_exec_cmd' and other functions in 'hw/ide/core.c' check
's->blk' directly, I think checking it in
'ide_cancel_dma_sync' is enough, and this is also more consistent with
the other functions. 'ide_cancel_dma_sync' is
also called by 'cmd_device_reset', which is one of the 'ide_cmd_table' handlers.
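As a sketch, the guard being discussed sits at the top of the function
(the rest of the body is unchanged and elided here):

    void ide_cancel_dma_sync(IDEState *s)
    {
        /*
         * v1 approach: tolerate an empty drive slot and return early:
         *     if (s->blk == NULL) {
         *         return;
         *     }
         * v2 approach: treat a NULL blk on this path as a QEMU bug.
         */
        assert(s->blk);

        /* ... drain and cancel any in-flight DMA on s->blk ... */
    }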


BTW, where is Peter's email saying this? I just want to learn something. :)

Thanks,
Li Qiang


> * At the dereference point, assert(3) is good.
>
>
> Thank you.
> --
> Prasad J Pandit / Red Hat Product Security Team
> 8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D
>



Re: [PATCH v3 5/6] qemu: Introduce to handle transient disk

2020-09-18 Thread Peter Krempa
On Thu, Sep 17, 2020 at 09:30:44 -0400, Masayoshi Mizuma wrote:
> From: Masayoshi Mizuma 
> 
> Here is the implementation of the transient option for qcow2 and raw format
> disks. This makes the <transient/> directive available in the domain xml file,
> like:
> 
> 
>   
>   
>   
>   
> 
> 
> When the qemu command line options are built, a new qcow2 image is
> created with a backing qcow2 by using the blockdev-snapshot command.
> The backing image is the qcow2 file which is set as <source>.
> The filename of the new qcow2 image is original-source-file.TRANSIENT.
> 
> Signed-off-by: Masayoshi Mizuma 
> ---
>  src/qemu/qemu_snapshot.c  | 103 +++---
>  src/qemu/qemu_snapshot.h  |   5 ++
>  src/util/virstoragefile.c |   2 +
>  src/util/virstoragefile.h |   4 ++
>  4 files changed, 107 insertions(+), 7 deletions(-)
> 
> diff --git a/src/qemu/qemu_snapshot.c b/src/qemu/qemu_snapshot.c
> index 1e8ea80b22..67fdc488e0 100644
> --- a/src/qemu/qemu_snapshot.c
> +++ b/src/qemu/qemu_snapshot.c
> @@ -1158,7 +1158,8 @@ qemuSnapshotCreateDiskActive(virQEMUDriverPtr driver,
>   virHashTablePtr blockNamedNodeData,
>   unsigned int flags,
>   virQEMUDriverConfigPtr cfg,
> - qemuDomainAsyncJob asyncJob)
> + qemuDomainAsyncJob asyncJob,
> + bool domSave)
>  {
>  qemuDomainObjPrivatePtr priv = vm->privateData;
>  g_autoptr(virJSONValue) actions = NULL;
> @@ -1201,17 +1202,26 @@ qemuSnapshotCreateDiskActive(virQEMUDriverPtr driver,
>  
>  virDomainAuditDisk(vm, dd->disk->src, dd->src, "snapshot", rc >= 0);
>  
> -if (rc == 0)
> +if (rc == 0) {
>  qemuSnapshotDiskUpdateSource(driver, vm, dd, blockdev);
> +
> +if (dd->disk->transient) {
> +/* Mark the transient working is completed to make sure we 
> can */
> +/* remove the transient disk when the guest is shutdown. */
> +dd->disk->src->transientEstablished = true;
> +}
> +}
>  }
>  
>  if (rc < 0)
>  goto cleanup;
>  
> -if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0 ||
> -(vm->newDef && virDomainDefSave(vm->newDef, driver->xmlopt,
> -cfg->configDir) < 0))
> -goto cleanup;
> +if (domSave) {
> +if (virDomainObjSave(vm, driver->xmlopt, cfg->stateDir) < 0 ||
> +(vm->newDef && virDomainDefSave(vm->newDef, driver->xmlopt,
> +cfg->configDir) < 0))
> +goto cleanup;
> +}
>  
>  ret = 0;
>  
> @@ -1349,7 +1359,7 @@ qemuSnapshotCreateActiveExternal(virQEMUDriverPtr 
> driver,
>  
>  if ((ret = qemuSnapshotCreateDiskActive(driver, vm, snap,
>  blockNamedNodeData, flags, cfg,
> -QEMU_ASYNC_JOB_SNAPSHOT)) < 0)
> +QEMU_ASYNC_JOB_SNAPSHOT, true)) 
> < 0)
>  goto cleanup;
>  
>  /* the snapshot is complete now */
> @@ -2264,3 +2274,82 @@ qemuSnapshotDelete(virDomainObjPtr vm,
>   cleanup:
>  return ret;
>  }
> +
> +
> +static int
> +qemuSnapshotSetupTransientDisk(virDomainSnapshotDiskDefPtr def,
> +   virStorageSourcePtr src)
> +{
> +g_autoptr(virStorageSource) trans = NULL;
> +
> +if (!(trans = virStorageSourceNew()))
> +return -1;
> +
> +trans->path = g_strdup_printf("%s.TRANSIENT", src->path);
> +if (virFileExists(trans->path)) {
> +virReportError(VIR_ERR_INVALID_ARG,
> +   _("Transient disk '%s' for '%s' exists"),
> +   trans->path, src->path);
> +return -1;
> +}
> +
> +trans->type = VIR_STORAGE_TYPE_FILE;
> +trans->format = VIR_STORAGE_FILE_QCOW2;
> +
> +def->src = g_steal_pointer(&trans);
> +
> +return 0;
> +}
> +
> +
> +int
> +qemuSnapshotCreateTransientDisk(virQEMUDriverPtr driver,
> +virDomainObjPtr vm,
> +int asyncJob)
> +{
> +int rc;
> +size_t i;
> +virDomainMomentObjPtr snap = NULL;
> +g_autoptr(virDomainSnapshotDef) snapdef = NULL;
> +g_autoptr(virHashTable) blockNamedNodeData = NULL;
> +qemuDomainObjPrivatePtr priv = vm->privateData;
> +g_autoptr(virQEMUDriverConfig) cfg = 
> virQEMUDriverGetConfig(priv->driver);
> +
> +if (!(snapdef = virDomainSnapshotDefNew()))
> +return -1;

This shouldn't be necessary if you factor stuff out of
qemuSnapshotCreateDiskActive. I'll send a prerequisite patch doing the
refactor as I've requested last time.

> +
> +snapdef->parent.name = g_strdup_printf("transient");
> +
> +snapdef->ndisks = vm->def->ndisks;
> +if (VIR_ALLOC_N(snapdef->disks, snapdef->ndisks) < 0)
> +retur

Re: [PATCH v3 4/6] qemu: update the validation for transient disk option

2020-09-18 Thread Peter Krempa
On Thu, Sep 17, 2020 at 09:30:43 -0400, Masayoshi Mizuma wrote:
> From: Masayoshi Mizuma 
> 
> Update validation of the transient disk option. The option for qemu is supported
> under the following conditions.
> 
> - qemu has blockdev feature 
> - the type is file and the format is qcow2 or raw
> - writable disk
> 
> Signed-off-by: Masayoshi Mizuma 
> ---
>  src/qemu/qemu_validate.c | 25 -
>  1 file changed, 24 insertions(+), 1 deletion(-)
> 
> diff --git a/src/qemu/qemu_validate.c b/src/qemu/qemu_validate.c
> index 070f1c962b..9cf78ca0c9 100644
> --- a/src/qemu/qemu_validate.c
> +++ b/src/qemu/qemu_validate.c
> @@ -2002,6 +2002,28 @@ qemuValidateDomainDeviceDefDiskSerial(const char 
> *value)
>  }
>  
>  
> +static int

This is declared as returning 'int'

> +qemuValidateDomainDeviceDefDiskTransient(const virDomainDiskDef *disk,

If this function is called qemuValidateDomainDeviceDefDiskTransient it
should also do all of the validation including reporting errors.

> + virQEMUCapsPtr qemuCaps)
> +{
> +if ((!qemuCaps) || !virQEMUCapsGet(qemuCaps, QEMU_CAPS_BLOCKDEV))

Parentheses are not required around negation

> +return false;

but returns booleans instead

> +
> +if (disk->src->readonly)
> +return false;
> +
> +if ((disk->src->format != VIR_STORAGE_FILE_QCOW2) &&
> +(disk->src->format != VIR_STORAGE_FILE_RAW) &&
> +(disk->src->type != VIR_STORAGE_TYPE_FILE)) {
> +return false;
> +}
> +
> +if (virStorageSourceIsEmpty(disk->src))
> +return false;
> +
> +return true;
> +}
> +
>  static int
>  qemuValidateDomainDeviceDefDiskFrontend(const virDomainDiskDef *disk,
>  virQEMUCapsPtr qemuCaps)
> @@ -2186,7 +2208,8 @@ qemuValidateDomainDeviceDefDiskFrontend(const 
> virDomainDiskDef *disk,
>  }
>  }
>  
> -if (disk->transient) {
> +if ((disk->transient) &&
> +!qemuValidateDomainDeviceDefDiskTransient(disk, qemuCaps)) {
>  virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
> _("transient disks not supported yet"));

The error messages this produces are really suboptimal. You don't know
whether your qemu is old, whether incorrect format is used or whether
it's a network disk.

>  return -1;

I'll rewrite this patch to produce better error messages and fix the
problems above.
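For illustration, reporting one distinct error per failed condition
could look like this sketch (libvirt API names as used elsewhere in
qemu_validate.c; the messages themselves are made up):

    static int
    qemuValidateDomainDeviceDefDiskTransient(const virDomainDiskDef *disk,
                                             virQEMUCapsPtr qemuCaps)
    {
        if (!qemuCaps || !virQEMUCapsGet(qemuCaps, QEMU_CAPS_BLOCKDEV)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s",
                           _("transient disks are not supported by this QEMU binary"));
            return -1;
        }

        if (disk->src->type != VIR_STORAGE_TYPE_FILE ||
            (disk->src->format != VIR_STORAGE_FILE_QCOW2 &&
             disk->src->format != VIR_STORAGE_FILE_RAW)) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("transient disk '%s' must be a local qcow2 or raw file"),
                           disk->dst);
            return -1;
        }

        if (disk->src->readonly) {
            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
                           _("transient disk '%s' must be writable"),
                           disk->dst);
            return -1;
        }

        return 0;
    }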




Re: [PATCH v3 3/6] qemu: Block migration when transient disk option is enabled

2020-09-18 Thread Peter Krempa
On Thu, Sep 17, 2020 at 09:30:42 -0400, Masayoshi Mizuma wrote:
> From: Masayoshi Mizuma 
> 
> Block migration when transient disk option is enabled because migration
> requires some blockjobs.
> 
> Signed-off-by: Masayoshi Mizuma 
> ---
>  src/qemu/qemu_migration.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/src/qemu/qemu_migration.c b/src/qemu/qemu_migration.c
> index a530c17582..7316d74677 100644
> --- a/src/qemu/qemu_migration.c
> +++ b/src/qemu/qemu_migration.c
> @@ -1397,6 +1397,16 @@ qemuMigrationSrcIsAllowed(virQEMUDriverPtr driver,
> _("cannot migrate this domain without 
> dbus-vmstate support"));
>  return false;
>  }
> +
> +for (i = 0; i < vm->def->ndisks; i++) {
> +virDomainDiskDefPtr disk = vm->def->disks[i];
> +
> +if (disk->transient) {
> +virReportError(VIR_ERR_OPERATION_INVALID, "%s",
> +   _("migration with transient disk is not 
> supported"));

transient disk '%s' is not supported

and use disk->dst as '%s'
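
I.e. roughly (adjust the wording as you see fit):

    virReportError(VIR_ERR_OPERATION_INVALID,
                   _("migration with transient disk '%s' is not supported"),
                   disk->dst);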

> +return false;
> +}
> +}

Reviewed-by: Peter Krempa 




Re: [PATCH v3 1/6] qemu: Block blockjobs when transient disk option is enabled

2020-09-18 Thread Peter Krempa
On Fri, Sep 18, 2020 at 14:53:53 +0200, Peter Krempa wrote:
> On Thu, Sep 17, 2020 at 09:30:40 -0400, Masayoshi Mizuma wrote:
> > From: Masayoshi Mizuma 
> > 
> > Signed-off-by: Masayoshi Mizuma 
> > ---
> >  src/qemu/qemu_domain.c | 7 +++
> >  1 file changed, 7 insertions(+)
> 
> Reviewed-by: Peter Krempa 

Actually, I'd prefer some explanation in the commit message. No need to
re-send for now as I'll probably be fixing some things in other patches
as well.




Re: [PATCH v3 2/6] qemu: Block disk hotplug when transient disk option is enabled

2020-09-18 Thread Peter Krempa
On Thu, Sep 17, 2020 at 09:30:41 -0400, Masayoshi Mizuma wrote:
> From: Masayoshi Mizuma 
> 
> Signed-off-by: Masayoshi Mizuma 
> ---
>  src/qemu/qemu_hotplug.c | 6 ++
>  1 file changed, 6 insertions(+)

Similarly to the previous commit, the code is good but the commit message
could actually describe the reasons.

Reviewed-by: Peter Krempa 




Re: [PATCH v3 1/6] qemu: Block blockjobs when transient disk option is enabled

2020-09-18 Thread Peter Krempa
On Thu, Sep 17, 2020 at 09:30:40 -0400, Masayoshi Mizuma wrote:
> From: Masayoshi Mizuma 
> 
> Signed-off-by: Masayoshi Mizuma 
> ---
>  src/qemu/qemu_domain.c | 7 +++
>  1 file changed, 7 insertions(+)

Reviewed-by: Peter Krempa 




Re: [PATCH v2 00/13] block: deal with errp: part I

2020-09-18 Thread Vladimir Sementsov-Ogievskiy

17.09.2020 23:15, no-re...@patchew.org wrote:

Patchew URL: 
https://patchew.org/QEMU/20200917195519.19589-1-vsement...@virtuozzo.com/



Hi,

This series failed build test on FreeBSD host. Please find the details below.






The full log is available at
http://patchew.org/logs/20200917195519.19589-1-vsement...@virtuozzo.com/testing.FreeBSD/?type=message.


Link is broken, it shows: "N/A. Internal error while reading log file"


--
Best regards,
Vladimir



Re: [PATCH v1 0/2] Add timeout mechanism to qmp actions

2020-09-18 Thread Zhenyu Ye
Hi Stefan, Fam,

On 2020/9/18 0:01, Fam Zheng wrote:
> On 2020-09-17 16:44, Stefan Hajnoczi wrote:
>> On Thu, Sep 17, 2020 at 03:36:57PM +0800, Zhenyu Ye wrote:
>>> When the hang occurs, the QEMU is blocked at:
>>>
>>> #0  0x95762b64 in ?? () from target:/usr/lib64/libpthread.so.0
>>> #1  0x9575bd88 in pthread_mutex_lock () from 
>>> target:/usr/lib64/libpthread.so.0
>>> #2  0xbb1f5948 in qemu_mutex_lock_impl (mutex=0xcc8e1860,
>>> file=0xbb4e1bd0 
>>> "/Images/eillon/CODE/5-opensource/qemu/util/async.c", line=605)
>>> #3  0xbb20acd4 in aio_context_acquire (ctx=0xcc8e1800)
>>> #4  0xbb105e90 in bdrv_query_image_info (bs=0xcc934620,
>>> p_info=0xccc41e18, errp=0xca669118)
>>> #5  0xbb105968 in bdrv_block_device_info (blk=0xcdca19f0, 
>>> bs=0xcc934620,
>>> flat=false, errp=0xca6692b8)
>>> #6  0xbb1063dc in bdrv_query_info (blk=0xcdca19f0, 
>>> p_info=0xcd29c9a8,
>>> errp=0xca6692b8)
>>> #7  0xbb106c14 in qmp_query_block (errp=0x0)
>>> #8  0xbacb8e6c in hmp_info_block (mon=0xca6693d0, 
>>> qdict=0xcd089790)
>>
>> Great, this shows that the main loop thread is stuck waiting for the
>> AioContext lock.
>>
>> Please post backtraces from all QEMU threads ((gdb) thread apply all bt)
>> so we can figure out which thread is holding up the main loop.
> 
> I think that is reflected in the perf backtrace posted by Zhenyu already:
> 
> And in the host, the information of sys_enter_io_submit() is:
> 
> Samples: 3K of event 'syscalls:sys_enter_io_submit', Event count
> (approx.): 3150
>Children  Self  Trace output
>-   66.70%66.70%  ctx_id: 0x9c044000,
>nr: 0x0001, iocbpp: 0x9f7fad28
>0xae7f871c
>0xae8a27c4
>qemu_thread_start
>iothread_run
>aio_poll
>aio_dispatch_ready_handlers
>aio_dispatch_handler
>virtio_queue_host_notifier_aio_read
>virtio_queue_notify_aio_vq
>virtio_blk_data_plane_handle_output
>virtio_blk_handle_vq
>blk_io_unplug
>bdrv_io_unplug
>bdrv_io_unplug
>raw_aio_unplug
>laio_io_unplug
>syscall
> 
> So the iothread is blocked by a slow io_submit holding the AioContext
> lock.
> 
> It would be interesting to know what in kernel is blocking io_submit
> from returning.
> 

Sorry, I only got the backtraces of all QEMU threads, as below:

(gdb) thread apply all bt

Thread 35 (LWP 49700):
#0  0x83148fe4 in pthread_cond_timedwait () at 
/lib64/libpthread.so.0
#1  0xe85f2080 in qemu_sem_timedwait (sem=0xef12e748, 
ms=1)
at ../util/qemu-thread-posix.c:282
#2  0xe861c600 in worker_thread (opaque=0xef12e6c0)
at ../util/thread-pool.c:91
#3  0xe85f269c in qemu_thread_start (args=0xf0204be0)
at ../util/qemu-thread-posix.c:521
#4  0x831428bc in  () at /lib64/libpthread.so.0
#5  0x8308aa1c in  () at /lib64/libc.so.6

Thread 34 (LWP 45337):
#0  0x83148fe4 in pthread_cond_timedwait () at 
/lib64/libpthread.so.0
#1  0xe85f2080 in qemu_sem_timedwait (sem=0xef12e748, 
ms=1)
at ../util/qemu-thread-posix.c:282
#2  0xe861c600 in worker_thread (opaque=0xef12e6c0)
at ../util/thread-pool.c:91
#3  0xe85f269c in qemu_thread_start (args=0xfffcc8000b20)
at ../util/qemu-thread-posix.c:521
#4  0x831428bc in  () at /lib64/libpthread.so.0
#5  0x8308aa1c in  () at /lib64/libc.so.6

Thread 33 (LWP 45336):
#0  0x83148fe4 in pthread_cond_timedwait () at 
/lib64/libpthread.so.0
#1  0xe85f2080 in qemu_sem_timedwait (sem=0xef12e748, 
ms=1)
at ../util/qemu-thread-posix.c:282
#2  0xe861c600 in worker_thread (opaque=0xef12e6c0)
at ../util/thread-pool.c:91
#3  0xe85f269c in qemu_thread_start (args=0xfffd14000b20)
at ../util/qemu-thread-posix.c:521
#4  0x831428bc in  () at /lib64/libpthread.so.0
#5  0x8308aa1c in  () at /lib64/libc.so.6

Thread 32 (LWP 45335):
#0  0x83148fe4 in pthread_cond_timedwait () at 
/lib64/libpthread.so.0
#1  0xe85f2080 in qemu_sem_timedwait (sem=0xef12e748, 
ms=1)
at ../util/qemu-thread-posix.c:282
#2  0xe861c600 in worker_thread (opaque=0xef12e6c0)
at ../util/thread-pool.c:91
#3  0xe85f269c in qemu_thread_start (args=0xefb00f00)
at ../util/qemu-thread-posix.c:521
#4  0x831428bc in  () at /lib64/libpthread.so.0
#5  0x8308aa1c in  () at /lib64/libc.so.6

Thread 12 (LWP 4849):
#0  0x83082a0c in i

Re: [PATCH] fdc: check null block pointer before blk_pwrite

2020-09-18 Thread Li Qiang
P J P  于2020年8月27日周四 下午7:41写道:
>
> From: Prasad J Pandit 
>
> While transferring data via fdctrl_write_data(), check that
> current drive does not have a null block pointer. Avoid
> null pointer dereference.
>
>  -> https://ruhr-uni-bochum.sciebo.de/s/NNWP2GfwzYKeKwE?path=%2Ffdc_nullptr1
> ==1658854==Hint: address points to the zero page.
> #0 blk_inc_in_flight block/block-backend.c:1327
> #1 blk_prw block/block-backend.c:1299
> #2 blk_pwrite block/block-backend.c:1464
> #3 fdctrl_write_data hw/block/fdc.c:2418
> #4 fdctrl_write hw/block/fdc.c:962
> #5 portio_write ioport.c:205
> #6 memory_region_write_accessor memory.c:483
> #7 access_with_adjusted_size memory.c:544
> #8 memory_region_dispatch_write memory.c:1476
>
> Reported-by: Ruhr-University 
> Signed-off-by: Prasad J Pandit 
> ---
>  hw/block/fdc.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/hw/block/fdc.c b/hw/block/fdc.c
> index e9ed3eef45..dedadac68a 100644
> --- a/hw/block/fdc.c
> +++ b/hw/block/fdc.c
> @@ -2419,7 +2419,8 @@ static void fdctrl_write_data(FDCtrl *fdctrl, uint32_t 
> value)
>  if (pos == FD_SECTOR_LEN - 1 ||
>  fdctrl->data_pos == fdctrl->data_len) {
>  cur_drv = get_cur_drv(fdctrl);
> -if (blk_pwrite(cur_drv->blk, fd_offset(cur_drv), fdctrl->fifo,
> +if (cur_drv->blk
> +&& blk_pwrite(cur_drv->blk, fd_offset(cur_drv), fdctrl->fifo,
> BDRV_SECTOR_SIZE, 0) < 0) {
>  FLOPPY_DPRINTF("error writing sector %d\n",
> fd_sector(cur_drv));
> --

Hello Prasad,

There are several patterns for addressing this NULL check in fdc.c.
I think it is better to treat this case as an error, just like the check
in 'fdctrl_format_sector':

if (cur_drv->blk == NULL ||
blk_pwrite(cur_drv->blk, fd_offset(cur_drv), fdctrl->fifo,
BDRV_SECTOR_SIZE, 0) < 0) {
FLOPPY_DPRINTF("error formatting sector %d\n", fd_sector(cur_drv));
fdctrl_stop_transfer(fdctrl, FD_SR0_ABNTERM | FD_SR0_SEEK, 0x00, 0x00);
} else {

Also, the same issue seems to exist in 'fdctrl_read_data'.
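
For 'fdctrl_write_data' that would mean treating a NULL 'blk' as a write
error too, roughly like this (just an untested sketch following the pattern
above):

cur_drv = get_cur_drv(fdctrl);
if (cur_drv->blk == NULL ||
    blk_pwrite(cur_drv->blk, fd_offset(cur_drv), fdctrl->fifo,
               BDRV_SECTOR_SIZE, 0) < 0) {
    FLOPPY_DPRINTF("error writing sector %d\n",
                   fd_sector(cur_drv));
}

Whether it should also abort the transfer like 'fdctrl_format_sector' does
is up to you and the maintainers.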

Thanks,
Li Qiang

> 2.26.2
>
>



Re: [PATCH v2] hw/ide: check null block before _cancel_dma_sync

2020-09-18 Thread P J P
+-- On Fri, 18 Sep 2020, Li Qiang wrote --+
| Update v2: use an assert() call
|   ->https://lists.nongnu.org/archive/html/qemu-devel/2020-08/msg08336.html
...
| I think it is better to defer this check to 'ide_cancel_dma_sync'. 
| 'ide_cancel_dma_sync' is also called by 'cmd_device_reset' and all of the 
| handlers of 'ide_cmd_table' will check whether the 's->blk' is NULL in the 
| beginning of 'ide_exec_cmd'.
| 
| So I think it is reasonable to check 's->blk' at the beginning of 
| 'ide_cancel_dma_sync'.

* Yes, earlier patch v1 above does the same.

* From Peter's reply in another thread about a similar issue, I gather the
  issue is 'blk' being set to NULL, even if erroneously. So the (blk == NULL)
  check should be done where 'blk' is set to NULL, rather than where it is
  dereferenced.

* At the dereference point, assert(3) is good.
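
* Ie. something like this at the start of ide_cancel_dma_sync() (sketch):

      assert(s->blk);   /* 'blk' must not be NULL here; catch bogus callers early */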


Thank you.
--
Prasad J Pandit / Red Hat Product Security Team
8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D




Re: [PATCH] util: include the target address in socket connect failures

2020-09-18 Thread Philippe Mathieu-Daudé
On 9/18/20 11:34 AM, Daniel P. Berrangé wrote:
> Reporting "Failed to connect socket" is essentially useless for a user
> attempting to diagnose failure. It needs to include the target addres
> details. Simiarly when failing to create a socket we should include the

Typos "address", "Similarly".

> socket family info, so the user understands what particular feature was
> missing in their kernel build (IPv6, VSock in particular).
> 
> Signed-off-by: Daniel P. Berrangé 
> ---
>  tests/qemu-iotests/162.out |  4 ++--
>  util/qemu-sockets.c| 32 +++-
>  2 files changed, 21 insertions(+), 15 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH v10 5/7] block/export: vhost-user block device backend server

2020-09-18 Thread Stefan Hajnoczi
On Fri, Sep 18, 2020 at 04:09:10PM +0800, Coiby Xu wrote:
> +case VIRTIO_BLK_T_GET_ID: {
> +size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
> +  VIRTIO_BLK_ID_BYTES);
> +snprintf(elem->in_sg[0].iov_base, size, "%s", 
> "vhost_user_blk_server");

gcc (GCC) 10.2.1 says:

../block/export/vhost-user-blk-server.c:178:50: error: ‘%s’ directive output 
truncated writing 21 bytes into a region of size 20 [-Werror=format-truncation=]
  178 | snprintf(elem->in_sg[0].iov_base, size, "%s", 
"vhost_user_blk_server");
  |  ^~   
~~~

I suggest changing this string constant to "vhost_user_blk" for now.
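
I.e.:

    snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");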




[PATCH] util: include the target address in socket connect failures

2020-09-18 Thread Daniel P . Berrangé
Reporting "Failed to connect socket" is essentially useless for a user
attempting to diagnose failure. It needs to include the target addres
details. Simiarly when failing to create a socket we should include the
socket family info, so the user understands what particular feature was
missing in their kernel build (IPv6, VSock in particular).

Signed-off-by: Daniel P. Berrangé 
---
 tests/qemu-iotests/162.out |  4 ++--
 util/qemu-sockets.c| 32 +++-
 2 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/tests/qemu-iotests/162.out b/tests/qemu-iotests/162.out
index 5a00d36d17..f8714cb0d2 100644
--- a/tests/qemu-iotests/162.out
+++ b/tests/qemu-iotests/162.out
@@ -6,8 +6,8 @@ image: nbd://localhost:PORT
 image: nbd+unix://?socket=42
 
 === SSH ===
-qemu-img: Could not open 'json:{"driver": "ssh", "host": "localhost", "port": 
"0", "path": "/foo"}': Failed to connect socket: Connection refused
-qemu-img: Could not open 'driver=ssh,host=localhost,port=0,path=/foo': Failed 
to connect socket: Connection refused
+qemu-img: Could not open 'json:{"driver": "ssh", "host": "localhost", "port": 
"0", "path": "/foo"}': Failed to connect to 'localhost:0': Connection refused
+qemu-img: Could not open 'driver=ssh,host=localhost,port=0,path=/foo': Failed 
to connect to 'localhost:0': Connection refused
 qemu-img: Could not open 'json:{"driver": "ssh", "host": "localhost", "port": 
0.42, "path": "/foo"}': Parameter 'port' expects a number
 qemu-img: Could not open 'driver=ssh,host=localhost,port=0.42,path=/foo': 
Parameter 'port' expects a number
 
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index b37d288866..dbf8e4dd0c 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -354,15 +354,15 @@ listen_ok:
 ((rc) == -EINPROGRESS)
 #endif
 
-static int inet_connect_addr(struct addrinfo *addr, Error **errp);
-
-static int inet_connect_addr(struct addrinfo *addr, Error **errp)
+static int inet_connect_addr(const InetSocketAddress *saddr,
+ struct addrinfo *addr, Error **errp)
 {
 int sock, rc;
 
 sock = qemu_socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol);
 if (sock < 0) {
-error_setg_errno(errp, errno, "Failed to create socket");
+error_setg_errno(errp, errno, "Failed to create socket family %d",
+ addr->ai_family);
 return -1;
 }
 socket_set_fast_reuse(sock);
@@ -376,7 +376,8 @@ static int inet_connect_addr(struct addrinfo *addr, Error 
**errp)
 } while (rc == -EINTR);
 
 if (rc < 0) {
-error_setg_errno(errp, errno, "Failed to connect socket");
+error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ saddr->host, saddr->port);
 closesocket(sock);
 return -1;
 }
@@ -455,7 +456,7 @@ int inet_connect_saddr(InetSocketAddress *saddr, Error 
**errp)
 for (e = res; e != NULL; e = e->ai_next) {
 error_free(local_err);
 local_err = NULL;
-sock = inet_connect_addr(e, &local_err);
+sock = inet_connect_addr(saddr, e, &local_err);
 if (sock >= 0) {
 break;
 }
@@ -549,7 +550,8 @@ static int inet_dgram_saddr(InetSocketAddress *sraddr,
 /* create socket */
 sock = qemu_socket(peer->ai_family, peer->ai_socktype, peer->ai_protocol);
 if (sock < 0) {
-error_setg_errno(errp, errno, "Failed to create socket");
+error_setg_errno(errp, errno, "Failed to create socket family %d",
+ peer->ai_family);
 goto err;
 }
 socket_set_fast_reuse(sock);
@@ -562,7 +564,8 @@ static int inet_dgram_saddr(InetSocketAddress *sraddr,
 
 /* connect to peer */
 if (connect(sock,peer->ai_addr,peer->ai_addrlen) < 0) {
-error_setg_errno(errp, errno, "Failed to connect socket");
+error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ addr, port);
 goto err;
 }
 
@@ -735,13 +738,15 @@ static bool vsock_parse_vaddr_to_sockaddr(const 
VsockSocketAddress *vaddr,
 return true;
 }
 
-static int vsock_connect_addr(const struct sockaddr_vm *svm, Error **errp)
+static int vsock_connect_addr(const VsockSocketAddress *vaddr,
+  const struct sockaddr_vm *svm, Error **errp)
 {
 int sock, rc;
 
 sock = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
 if (sock < 0) {
-error_setg_errno(errp, errno, "Failed to create socket");
+error_setg_errno(errp, errno, "Failed to create socket family %d",
+ AF_VSOCK);
 return -1;
 }
 
@@ -754,7 +759,8 @@ static int vsock_connect_addr(const struct sockaddr_vm 
*svm, Error **errp)
 } while (rc == -EINTR);
 
 if (rc < 0) {
-error_setg_errno(errp, errno, "Failed to connect socket");
+error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ vaddr->cid, vaddr->port);
  

Re: [PATCH 0/4] hw/sd/sdhci: Strengthen multiple DMA transfers

2020-09-18 Thread Philippe Mathieu-Daudé
On 9/3/20 7:28 PM, Philippe Mathieu-Daudé wrote:
> Still trying to fix the bugs reported by Aleksander...
> 
> - Do not send 0 block count
> - Reduce DMA to MMIO re-entrancy by yielding when pending IRQ
> 
> Based-on: <20200901140411.112150-1-f4...@amsat.org>
> 
> Philippe Mathieu-Daudé (4):
>   hw/sd/sdhci: Stop multiple transfers when block count is cleared
>   hw/sd/sdhci: Resume pending DMA transfers on MMIO accesses
>   hw/sd/sdhci: Let sdhci_update_irq() return if IRQ was delivered
>   hw/sd/sdhci: Yield if interrupt delivered during multiple transfer

Thanks, series applied to my sd-next tree.




Re: [PATCH v2 0/3] hw/sd/sdhci: Fix DMA Transfer Block Size field width

2020-09-18 Thread Philippe Mathieu-Daudé
On 9/1/20 4:04 PM, Philippe Mathieu-Daudé wrote:
> Fix the SDHCI issue reported last week by Alexander:
> https://bugs.launchpad.net/qemu/+bug/1892960
> 
> The field is 12-bit (4KiB) but the guest can set
> up to 16-bit (64KiB), leading to OOB access.
> 
> since v1:
> commited unstaged change in patch #3...
> 
> Philippe Mathieu-Daudé (3):
>   hw/sd/sdhci: Fix qemu_log_mask() format string
>   hw/sd/sdhci: Document the datasheet used
>   hw/sd/sdhci: Fix DMA Transfer Block Size field

Thanks, series applied to my sd-next tree.



[PATCH v10 5/7] block/export: vhost-user block device backend server

2020-09-18 Thread Coiby Xu
By making use of libvhost-user, a block device can be shared with the
connected vhost-user client. Only one client can connect to the server
at a time.

Since vhost-user-server needs a block drive to be created first, the
creation of this object is delayed.

Suggested-by: Kevin Wolf 
Signed-off-by: Stefan Hajnoczi 
Signed-off-by: Coiby Xu 
---
 block/export/vhost-user-blk-server.c | 661 +++
 block/export/vhost-user-blk-server.h |  36 ++
 block/meson.build|   1 +
 softmmu/vl.c |   4 +
 4 files changed, 702 insertions(+)
 create mode 100644 block/export/vhost-user-blk-server.c
 create mode 100644 block/export/vhost-user-blk-server.h

diff --git a/block/export/vhost-user-blk-server.c 
b/block/export/vhost-user-blk-server.c
new file mode 100644
index 00..ec78130f09
--- /dev/null
+++ b/block/export/vhost-user-blk-server.c
@@ -0,0 +1,661 @@
+/*
> + * Sharing QEMU block devices via vhost-user protocol
+ *
+ * Parts of the code based on nbd/server.c.
+ *
+ * Copyright (c) Coiby Xu .
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "vhost-user-blk-server.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/block-backend.h"
+#include "util/block-helpers.h"
+
+enum {
+VHOST_USER_BLK_MAX_QUEUES = 1,
+};
+struct virtio_blk_inhdr {
+unsigned char status;
+};
+
+typedef struct VuBlockReq {
+VuVirtqElement *elem;
+int64_t sector_num;
+size_t size;
+struct virtio_blk_inhdr *in;
+struct virtio_blk_outhdr out;
+VuServer *server;
+struct VuVirtq *vq;
+} VuBlockReq;
+
+static void vu_block_req_complete(VuBlockReq *req)
+{
+VuDev *vu_dev = &req->server->vu_dev;
+
+/* IO size with 1 extra status byte */
+vu_queue_push(vu_dev, req->vq, req->elem, req->size + 1);
+vu_queue_notify(vu_dev, req->vq);
+
+if (req->elem) {
+free(req->elem);
+}
+
+g_free(req);
+}
+
+static VuBlockDev *get_vu_block_device_by_server(VuServer *server)
+{
+return container_of(server, VuBlockDev, vu_server);
+}
+
+static int coroutine_fn
+vu_block_discard_write_zeroes(VuBlockReq *req, struct iovec *iov,
+  uint32_t iovcnt, uint32_t type)
+{
+struct virtio_blk_discard_write_zeroes desc;
+ssize_t size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
+if (unlikely(size != sizeof(desc))) {
+error_report("Invalid size %ld, expect %ld", size, sizeof(desc));
+return -EINVAL;
+}
+
+VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
+uint64_t range[2] = { le64_to_cpu(desc.sector) << 9,
+  le32_to_cpu(desc.num_sectors) << 9 };
+if (type == VIRTIO_BLK_T_DISCARD) {
+if (blk_co_pdiscard(vdev_blk->backend, range[0], range[1]) == 0) {
+return 0;
+}
+} else if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+if (blk_co_pwrite_zeroes(vdev_blk->backend,
+ range[0], range[1], 0) == 0) {
+return 0;
+}
+}
+
+return -EINVAL;
+}
+
+static void coroutine_fn vu_block_flush(VuBlockReq *req)
+{
+VuBlockDev *vdev_blk = get_vu_block_device_by_server(req->server);
+BlockBackend *backend = vdev_blk->backend;
+blk_co_flush(backend);
+}
+
+struct req_data {
+VuServer *server;
+VuVirtq *vq;
+VuVirtqElement *elem;
+};
+
+static void coroutine_fn vu_block_virtio_process_req(void *opaque)
+{
+struct req_data *data = opaque;
+VuServer *server = data->server;
+VuVirtq *vq = data->vq;
+VuVirtqElement *elem = data->elem;
+uint32_t type;
+VuBlockReq *req;
+
+VuBlockDev *vdev_blk = get_vu_block_device_by_server(server);
+BlockBackend *backend = vdev_blk->backend;
+
+struct iovec *in_iov = elem->in_sg;
+struct iovec *out_iov = elem->out_sg;
+unsigned in_num = elem->in_num;
+unsigned out_num = elem->out_num;
+/* refer to hw/block/virtio_blk.c */
+if (elem->out_num < 1 || elem->in_num < 1) {
+error_report("virtio-blk request missing headers");
+free(elem);
+return;
+}
+
+req = g_new0(VuBlockReq, 1);
+req->server = server;
+req->vq = vq;
+req->elem = elem;
+
+if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
+sizeof(req->out)) != sizeof(req->out))) {
+error_report("virtio-blk request outhdr too short");
+goto err;
+}
+
+iov_discard_front(&out_iov, &out_num, sizeof(req->out));
+
+if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+error_report("virtio-blk request inhdr too short");
+goto err;
+}
+
+/* We always touch the last byte, so just see how big in_iov is.  */
+req->in = (void *)in_iov[in_num - 1].iov_base
+