Re: [PATCH v5 17/26] nvme: allow multiple aios per command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:53 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 13:48, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > This refactors how the device issues asynchronous block backend
> > > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > > associated with the command. This allows multiple aios to be issued for
> > > a command. Only when all requests have been completed will the device
> > > post a completion queue entry.
> > > 
> > > Because the device is currently guaranteed to only issue a single aio
> > > request per command, the benefit is not immediately obvious. But this
> > > functionality is required to support metadata, the dataset management
> > > command and other features.
> > 
> > I don't know which strategy will be chosen for supporting metadata
> > (qemu doesn't have any notion of metadata in the block layer), but for
> > dataset management you are right. The Dataset Management command can
> > contain a table of areas to discard (although in reality I have seen no
> > driver putting more than one entry there).
> > 
> 
> The strategy is different depending on how the metadata is transferred
> between host and device. For the "separate buffer" case, metadata is
> transferred using a separate memory pointer in the nvme command (MPTR).
> In this case the metadata is kept separately on a new blockdev attached
> to the namespace.
Looks reasonable.
> 


> In the other case, metadata is transferred as part of an extended lba
> (say 512 + 8 bytes) and kept inline on the main namespace blockdev. This
> is challenging for QEMU as it breaks interoperability of the image with
> other devices. But that is a discussion for a fresh RFC ;)

Yes, this one is quite problematic. IMHO even the kernel opted not to
support this kind of metadata (I know that because I played with one of
Intel's enterprise SSDs while developing nvme-mdev, and sadly this is the
only kind of metadata it supports).
I guess that if we have to support this format (for the sake of making our
virtual nvme device as feature complete as possible for driver development),
I would emulate it with a separate drive as well.
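
Something like the sketch below is what I have in mind (just an
illustration on my side, not code from this series; the helper and the
lbads/ms parameters are made up): deinterleave the extended-LBA buffer and
send each half to its own blockdev.

/*
 * Sketch in the context of hw/block/nvme.c (not part of this series):
 * deinterleave a host buffer in extended-LBA layout (data + metadata per
 * LBA) into two iovecs, so the data can go to the main blockdev and the
 * metadata to a separate one. 'lbads' is the data size shift and 'ms' the
 * number of metadata bytes per LBA; both are assumed namespace parameters.
 */
static void nvme_split_extended_lba(uint8_t *buf, uint32_t nlb,
                                    uint32_t lbads, uint32_t ms,
                                    QEMUIOVector *data, QEMUIOVector *mdata)
{
    size_t stride = (1 << lbads) + ms;

    for (uint32_t i = 0; i < nlb; i++) {
        uint8_t *lba = buf + i * stride;

        qemu_iovec_add(data, lba, 1 << lbads);
        qemu_iovec_add(mdata, lba + (1 << lbads), ms);
    }
}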

> 
> Note that the support for multiple AIOs is also used for DULBE support
Is this a typo? I don't recall anything like that from the spec.

> down the line when I get around to posting those patches. So this is
> preparatory for a lot of features that require persistent state across
> device power off.
All right. Thanks again for your work. I wish I had all these features
when I developed nvme-mdev; it would have made my life much easier.

> 
> > 
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 449 +-
> > >  hw/block/nvme.h   | 134 +++--
> > >  hw/block/trace-events |   8 +
> > >  3 files changed, 480 insertions(+), 111 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 334265efb21e..e97da35c4ca1 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -19,7 +19,8 @@
> > >   *  -drive file=,if=none,id=
> > >   *  -device nvme,drive=,serial=,id=, \
> > >   *  cmb_size_mb=, \
> > > - *  num_queues=
> > > + *  num_queues=, \
> > > + *  mdts=
> > 
> > Could you split mdts checks into a separate patch? This is not related to 
> > the series.
> 
> Absolutely. Done.
Perfect, thanks!
> 
> > 
> > >   *
> > >   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
> > >   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> > > @@ -57,6 +58,7 @@
> > >  } while (0)
> > >  
> > >  static void nvme_process_sq(void *opaque);
> > > +static void nvme_aio_cb(void *opaque, int ret);
> > >  
> > >  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> > >  {
> > > @@ -341,6 +343,107 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t 
> > > *ptr, uint32_t len,
> > >  return status;
> > >  }
> > >  
> > > +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > > +{
> > > +NvmeNamespace *ns = req->ns;
> > > +
> > > +uint32_t len = req->nlb << nvme_ns_lbads(ns);
> > > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > > +
> > > +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> > > +}
> > 
> > Same here, this is another nice refactoring and it should be in a
> > separate patch.
> 
> Done.
> 
> > 
> > > +
> > > +static void nvme_aio_destroy(NvmeAIO *aio)
> > > +{
> > > +g_free(aio);
> > > +}
> > > +
> > > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> > > +NvmeAIOOp opc)
> > > +{
> > > +aio->opc = opc;
> > > +
> > > +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, 
> > > blk_name(aio->blk),
> > > +aio->offset, aio->len, nvme_aio_opc_str(aio), 

Re: [PATCH v5 17/26] nvme: allow multiple aios per command

2020-03-16 Thread Klaus Birkelund Jensen
On Feb 12 13:48, Maxim Levitsky wrote:
> On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > This refactors how the device issues asynchronous block backend
> > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > associated with the command. This allows multiple aios to be issued for
> > a command. Only when all requests have been completed will the device
> > post a completion queue entry.
> > 
> > Because the device is currently guaranteed to only issue a single aio
> > request per command, the benefit is not immediately obvious. But this
> > functionality is required to support metadata, the dataset management
> > command and other features.
> 
> I don't know which strategy will be chosen for supporting metadata
> (qemu doesn't have any notion of metadata in the block layer), but for
> dataset management you are right. The Dataset Management command can
> contain a table of areas to discard (although in reality I have seen no
> driver putting more than one entry there).
> 

The strategy is different depending on how the metadata is transferred
between host and device. For the "separate buffer" case, metadata is
transferred using a separate memory pointer in the nvme command (MPTR).
In this case the metadata is kept separately on a new blockdev attached
to the namespace.
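
To make that concrete, here is a rough sketch of the separate-buffer case
(the ns->blk_md field, nvme_ns_ms() and the helper are placeholder names,
not necessarily what the actual patches will end up using). The point is
just that the metadata for an LBA range lives contiguously at slba * ms on
the metadata blockdev, so it can be handled with an ordinary
blk_aio_preadv/blk_aio_pwritev.

/*
 * Sketch in the context of hw/block/nvme.c: read the metadata for a range
 * of LBAs from a dedicated metadata blockdev attached to the namespace.
 * 'iov' is assumed to already describe the host buffer pointed to by MPTR.
 */
static BlockAIOCB *nvme_read_mdata(NvmeNamespace *ns, uint64_t slba,
                                   uint32_t nlb, QEMUIOVector *iov,
                                   BlockCompletionFunc *cb, void *opaque)
{
    size_t ms = nvme_ns_ms(ns);       /* metadata bytes per LBA (assumed) */
    int64_t offset = slba * ms;       /* metadata is contiguous on blk_md */

    assert(iov->size == (size_t)nlb * ms);

    return blk_aio_preadv(ns->blk_md, offset, iov, 0, cb, opaque);
}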

In the other case, metadata is transferred as part of an extended lba
(say 512 + 8 bytes) and kept inline on the main namespace blockdev. This
is challenging for QEMU as it breaks interoperability of the image with
other devices. But that is a discussion for a fresh RFC ;)

Note that the support for multiple AIOs is also used for DULBE support
down the line when I get around to posting those patches. So this is
preparatory for a lot of features that require persistent state across
device power off.

> 
> > 
> > Signed-off-by: Klaus Jensen 
> > Signed-off-by: Klaus Jensen 
> > ---
> >  hw/block/nvme.c   | 449 +-
> >  hw/block/nvme.h   | 134 +++--
> >  hw/block/trace-events |   8 +
> >  3 files changed, 480 insertions(+), 111 deletions(-)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 334265efb21e..e97da35c4ca1 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -19,7 +19,8 @@
> >   *  -drive file=,if=none,id=
> >   *  -device nvme,drive=,serial=,id=, \
> >   *  cmb_size_mb=, \
> > - *  num_queues=
> > + *  num_queues=, \
> > + *  mdts=
> 
> Could you split mdts checks into a separate patch? This is not related to the 
> series.

Absolutely. Done.

> 
> >   *
> >   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
> >   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> > @@ -57,6 +58,7 @@
> >  } while (0)
> >  
> >  static void nvme_process_sq(void *opaque);
> > +static void nvme_aio_cb(void *opaque, int ret);
> >  
> >  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> >  {
> > @@ -341,6 +343,107 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t 
> > *ptr, uint32_t len,
> >  return status;
> >  }
> >  
> > +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > +{
> > +NvmeNamespace *ns = req->ns;
> > +
> > +uint32_t len = req->nlb << nvme_ns_lbads(ns);
> > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > +
> > +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> > +}
> 
> Same here, this is another nice refactoring and it should be in a
> separate patch.

Done.

> 
> > +
> > +static void nvme_aio_destroy(NvmeAIO *aio)
> > +{
> > +g_free(aio);
> > +}
> > +
> > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> > +NvmeAIOOp opc)
> > +{
> > +aio->opc = opc;
> > +
> > +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> > +aio->offset, aio->len, nvme_aio_opc_str(aio), req);
> > +
> > +if (req) {
> > +QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> > +}
> > +}
> > +
> > +static void nvme_aio(NvmeAIO *aio)
> The function name is not clear to me. Maybe change this to something like
> nvme_submit_aio.

Fixed.

> > +{
> > +BlockBackend *blk = aio->blk;
> > +BlockAcctCookie *acct = &aio->acct;
> > +BlockAcctStats *stats = blk_get_stats(blk);
> > +
> > +bool is_write, dma;
> > +
> > +switch (aio->opc) {
> > +case NVME_AIO_OPC_NONE:
> > +break;
> > +
> > +case NVME_AIO_OPC_FLUSH:
> > +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> > +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> > +break;
> > +
> > +case NVME_AIO_OPC_WRITE_ZEROES:
> > +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> > +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> > +BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
> > +break;
> > +
> > +case 

Re: [PATCH v5 17/26] nvme: allow multiple aios per command

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
> 
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.

I don't know which strategy will be chosen for supporting metadata
(qemu doesn't have any notion of metadata in the block layer), but for
dataset management you are right. The Dataset Management command can
contain a table of areas to discard (although in reality I have seen no
driver putting more than one entry there).
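
To illustrate where the multiple-AIO support would pay off for DSM: each
entry of the range table could become its own NvmeAIO on req->aio_tailq,
roughly as in the sketch below, with the CQE only being posted once the
tailq has drained. This is only an illustration on my side -
NVME_AIO_OPC_DISCARD and the helper don't exist in this series, and such an
opcode would of course need a blk_aio_pdiscard() case in nvme_aio().

/*
 * Sketch in the context of hw/block/nvme.c (not code from this patch):
 * issue one discard AIO per Dataset Management range.
 */
static void nvme_dsm_discard(BlockBackend *blk, NvmeNamespace *ns,
                             NvmeDsmRange *range, int nr, NvmeRequest *req)
{
    for (int i = 0; i < nr; i++) {
        NvmeAIO *aio = g_new0(NvmeAIO, 1);

        *aio = (NvmeAIO) {
            .blk = blk,
            .offset = le64_to_cpu(range[i].slba) << nvme_ns_lbads(ns),
            .len = (uint64_t)le32_to_cpu(range[i].nlb) << nvme_ns_lbads(ns),
            .req = req,
        };

        /* hypothetical discard opcode; queued on req->aio_tailq */
        nvme_req_register_aio(req, aio, NVME_AIO_OPC_DISCARD);
        nvme_aio(aio);
    }
}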


> 
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 449 +-
>  hw/block/nvme.h   | 134 +++--
>  hw/block/trace-events |   8 +
>  3 files changed, 480 insertions(+), 111 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 334265efb21e..e97da35c4ca1 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,7 +19,8 @@
>   *  -drive file=,if=none,id=
>   *  -device nvme,drive=,serial=,id=, \
>   *  cmb_size_mb=, \
> - *  num_queues=
> + *  num_queues=, \
> + *  mdts=

Could you split mdts checks into a separate patch? This is not related to the 
series.

>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> @@ -57,6 +58,7 @@
>  } while (0)
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>  
>  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
>  {
> @@ -341,6 +343,107 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  return status;
>  }
>  
> +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +NvmeNamespace *ns = req->ns;
> +
> +uint32_t len = req->nlb << nvme_ns_lbads(ns);
> +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +
> +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> +}

Same here, this is another nice refactoring and it should be in a separate patch.

> +
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> +g_free(aio);
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> +NvmeAIOOp opc)
> +{
> +aio->opc = opc;
> +
> +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> +aio->offset, aio->len, nvme_aio_opc_str(aio), req);
> +
> +if (req) {
> +QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> +}
> +}
> +
> +static void nvme_aio(NvmeAIO *aio)
The function name is not clear to me. Maybe change this to something like
nvme_submit_aio.
> +{
> +BlockBackend *blk = aio->blk;
> +BlockAcctCookie *acct = &aio->acct;
> +BlockAcctStats *stats = blk_get_stats(blk);
> +
> +bool is_write, dma;
> +
> +switch (aio->opc) {
> +case NVME_AIO_OPC_NONE:
> +break;
> +
> +case NVME_AIO_OPC_FLUSH:
> +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_WRITE_ZEROES:
> +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> +BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_READ:
> +case NVME_AIO_OPC_WRITE:

> +dma = aio->qsg != NULL;

This doesn't work.
aio->qsg is never NULL, since nvme_rw_aio sets this to &req->qsg,
which is then written to aio->qsg by nvme_aio_new.

That is yet another reason I really don't like these parallel QEMUSGList
and QEMUIOVector. However, I see that a few other qemu drivers do this,
so it is probably a necessary evil.

What we could do, maybe, is dma_memory_map the SG list up front and then
deal only with a QEMUIOVector. Virtio does this
(virtqueue_pop/virtqueue_push).
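
Something along these lines (just a sketch of the idea; error handling and
the unmapping on completion are omitted, but dma_memory_map() and
qemu_iovec_add() are existing APIs):

/*
 * Sketch in the context of hw/block/nvme.c: map the QEMUSGList up front so
 * that the rest of the code only deals with a QEMUIOVector, similar to what
 * virtio does in virtqueue_pop()/virtqueue_push(). The mapped regions must
 * be unmapped again (dma_memory_unmap()) in the completion path.
 */
static int nvme_map_qsg_to_iov(QEMUSGList *qsg, QEMUIOVector *iov,
                               bool is_write)
{
    DMADirection dir = is_write ? DMA_DIRECTION_TO_DEVICE
                                : DMA_DIRECTION_FROM_DEVICE;

    for (int i = 0; i < qsg->nsg; i++) {
        dma_addr_t len = qsg->sg[i].len;
        void *p = dma_memory_map(qsg->as, qsg->sg[i].base, &len, dir);

        if (!p || len != qsg->sg[i].len) {
            /* bounce-buffer exhaustion or partial mapping */
            return -ENOMEM;
        }

        qemu_iovec_add(iov, p, len);
    }

    return 0;
}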


> +is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> +block_acct_start(stats, acct, aio->len,
> +is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +if (dma) {
> +aio->aiocb = is_write ?
> +dma_blk_write(blk, aio->qsg, aio->offset,
> +BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
> +dma_blk_read(blk, aio->qsg, aio->offset,
> +BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> +
Extra space
> +return;
> +}
> +
> +aio->aiocb = is_write ?
> +   

[PATCH v5 17/26] nvme: allow multiple aios per command

2020-02-04 Thread Klaus Jensen
This refactors how the device issues asynchronous block backend
requests. The NvmeRequest now holds a queue of NvmeAIOs that are
associated with the command. This allows multiple aios to be issued for
a command. Only when all requests have been completed will the device
post a completion queue entry.

Because the device is currently guaranteed to only issue a single aio
request per command, the benefit is not immediately obvious. But this
functionality is required to support metadata, the dataset management
command and other features.

Signed-off-by: Klaus Jensen 
Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.c   | 449 +-
 hw/block/nvme.h   | 134 +++--
 hw/block/trace-events |   8 +
 3 files changed, 480 insertions(+), 111 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 334265efb21e..e97da35c4ca1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -19,7 +19,8 @@
  *  -drive file=,if=none,id=
  *  -device nvme,drive=,serial=,id=, \
  *  cmb_size_mb=, \
- *  num_queues=
+ *  num_queues=, \
+ *  mdts=
  *
  * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
  * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
@@ -57,6 +58,7 @@
 } while (0)
 
 static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
 
 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
 {
@@ -341,6 +343,107 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
uint32_t len,
 return status;
 }
 
+static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+NvmeNamespace *ns = req->ns;
+
+uint32_t len = req->nlb << nvme_ns_lbads(ns);
+uint64_t prp1 = le64_to_cpu(cmd->prp1);
+uint64_t prp2 = le64_to_cpu(cmd->prp2);
+
+return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
+}
+
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+g_free(aio);
+}
+
+static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
+NvmeAIOOp opc)
+{
+aio->opc = opc;
+
+trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
+aio->offset, aio->len, nvme_aio_opc_str(aio), req);
+
+if (req) {
+QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
+}
+}
+
+static void nvme_aio(NvmeAIO *aio)
+{
+BlockBackend *blk = aio->blk;
+BlockAcctCookie *acct = &aio->acct;
+BlockAcctStats *stats = blk_get_stats(blk);
+
+bool is_write, dma;
+
+switch (aio->opc) {
+case NVME_AIO_OPC_NONE:
+break;
+
+case NVME_AIO_OPC_FLUSH:
+block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+break;
+
+case NVME_AIO_OPC_WRITE_ZEROES:
+block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
+break;
+
+case NVME_AIO_OPC_READ:
+case NVME_AIO_OPC_WRITE:
+dma = aio->qsg != NULL;
+is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+block_acct_start(stats, acct, aio->len,
+is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+if (dma) {
+aio->aiocb = is_write ?
+dma_blk_write(blk, aio->qsg, aio->offset,
+BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
+dma_blk_read(blk, aio->qsg, aio->offset,
+BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+
+return;
+}
+
+aio->aiocb = is_write ?
+blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
+nvme_aio_cb, aio) :
+blk_aio_preadv(blk, aio->offset, aio->iov, 0,
+nvme_aio_cb, aio);
+
+break;
+}
+}
+
+static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
+{
+NvmeAIO *aio;
+size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
+
+aio = g_new0(NvmeAIO, 1);
+
+*aio = (NvmeAIO) {
+.blk = blk,
+.offset = offset,
+.len = len,
+.req = req,
+.qsg = &req->qsg,
+.iov = &req->iov,
+};
+
+nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
+NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
+nvme_aio(aio);
+}
+
 static void nvme_post_cqes(void *opaque)
 {
 NvmeCQueue *cq = opaque;
@@ -364,6 +467,7 @@ static void nvme_post_cqes(void *opaque)
 nvme_inc_cq_tail(cq);
pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
 sizeof(req->cqe));
+nvme_req_clear(req);
QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
 }
 if (cq->tail != cq->head) {
@@ -374,8 +478,8 @@ static void nvme_post_cqes(void *opaque)
 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
 {
 assert(cq->cqid == req->sq->cqid);
-trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
-