Re: [PATCH v6 36/42] nvme: add support for scatter gather lists

2020-03-31 Thread Maxim Levitsky
On Tue, 2020-03-31 at 07:48 +0200, Klaus Birkelund Jensen wrote:
> On Mar 25 12:58, Maxim Levitsky wrote:
> > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > From: Klaus Jensen 
> > > 
> > > For now, support the Data Block, Segment and Last Segment descriptor
> > > types.
> > > 
> > > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > Acked-by: Keith Busch 
> > > ---
> > >  hw/block/nvme.c   | 310 +++---
> > >  hw/block/trace-events |   4 +
> > >  2 files changed, 262 insertions(+), 52 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 49d323566393..b89b96990f52 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> > >  
> > >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> > >  {
> > > -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> > > +hwaddr hi = addr + size;
> > > +if (hi < addr) {
> > > +return 1;
> > > +}
> > > +
> > > +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
> > 
> > I would suggest splitting this into a separate patch as well, since it
> > contains not one but two bugfixes for this function, and they are not
> > related to sg lists.
> > Or at least move this to 'nvme: refactor nvme_addr_read' and rename this
> > patch to something like 'nvme: fix and refactor nvme_addr_read'.
> > 
> 
> I've split it into a separate patch.
> 
> > 
> > >  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> > >  return 0;
> > >  }
> > > @@ -328,13 +333,242 @@ unmap:
> > >  return status;
> > >  }
> > >  
> > > -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> > > - uint64_t prp1, uint64_t prp2, DMADirection dir,
> > > +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> > > +  QEMUIOVector *iov,
> > > +  NvmeSglDescriptor *segment, uint64_t nsgld,
> > > +  size_t *len, NvmeRequest *req)
> > > +{
> > > +dma_addr_t addr, trans_len;
> > > +uint32_t blk_len;
> > > +uint16_t status;
> > > +
> > > +for (int i = 0; i < nsgld; i++) {
> > > +uint8_t type = NVME_SGL_TYPE(segment[i].type);
> > > +
> > > +if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> > > +switch (type) {
> > > +case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> > > +case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> > > +return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> > > +default:
> > 
> > To be honest I don't like that 'default'.
> > I would explicitly state which segment types remain
> > (I think segment list and last segment list, and various reserved types).
> > In fact, for the reserved types you probably also want to return
> > NVME_SGL_DESCR_TYPE_INVALID.
> > 
> 
> I "negated" the logic which I think is more readable. I still really
> want to keep the default, for instance, nvme v1.4 adds a new type that
> we do not support (the Transport SGL Data Block descriptor).
OK, I'll take a look at that in the next version of the patches.

> 
> > Also, this function really begs to have a description prior to it,
> > something like 'map an sg list segment, assuming that it only contains
> > SGL data descriptors; the caller has to ensure this'.
> > 
> 
> Done.
Thanks a lot!
> 
> > 
> > > +return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> > > +}
> > > +}
> > > +
> > > +if (*len == 0) {
> > > +uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
> > 
> > Nitpick: I would add a small comment here as well describing
> > what this does (we reach this point if the sg list covers more than what
> > was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH
> > controller capability indicates that we support just throwing the extra
> > data away).
> > 
> 
> Adding a comment. It's the other way around. The size as indicated by
> NLB (or whatever, depending on the command) is the "authoritative" source
> of information for the size of the payload. We will never accept an SGL
> that is too short such that we lose or throw away data, but we might
> accept ignoring parts of the SGL.
Yes, that is what I meant. Thanks!

> 
> > > +if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> > > +break;
> > > +}
> > > +
> > > +trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> > > +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > > +}
> > > +
> > > +addr = le64_to_cpu(segment[i].addr);
> > > +blk_len = le32_to_cpu(segment[i].len);
> > > +
> > > +if (!blk_len) {
> > > +continue;
> > > +}
> > > +

Re: [PATCH v6 36/42] nvme: add support for scatter gather lists

2020-03-30 Thread Klaus Birkelund Jensen
On Mar 25 12:58, Maxim Levitsky wrote:
> On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > From: Klaus Jensen 
> > 
> > For now, support the Data Block, Segment and Last Segment descriptor
> > types.
> > 
> > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> > 
> > Signed-off-by: Klaus Jensen 
> > Acked-by: Keith Busch 
> > ---
> >  hw/block/nvme.c   | 310 +++---
> >  hw/block/trace-events |   4 +
> >  2 files changed, 262 insertions(+), 52 deletions(-)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 49d323566393..b89b96990f52 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> >  
> >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> >  {
> > -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> > +hwaddr hi = addr + size;
> > +if (hi < addr) {
> > +return 1;
> > +}
> > +
> > +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
> 
> I would suggest splitting this into a separate patch as well, since it
> contains not one but two bugfixes for this function, and they are not
> related to sg lists.
> Or at least move this to 'nvme: refactor nvme_addr_read' and rename this
> patch to something like 'nvme: fix and refactor nvme_addr_read'.
> 

I've split it into a separate patch.

> 
> >  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> >  return 0;
> >  }
> > @@ -328,13 +333,242 @@ unmap:
> >  return status;
> >  }
> >  
> > -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> > - uint64_t prp1, uint64_t prp2, DMADirection dir,
> > +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> > +  QEMUIOVector *iov,
> > +  NvmeSglDescriptor *segment, uint64_t nsgld,
> > +  size_t *len, NvmeRequest *req)
> > +{
> > +dma_addr_t addr, trans_len;
> > +uint32_t blk_len;
> > +uint16_t status;
> > +
> > +for (int i = 0; i < nsgld; i++) {
> > +uint8_t type = NVME_SGL_TYPE(segment[i].type);
> > +
> > +if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> > +switch (type) {
> > +case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> > +case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> > +return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> > +default:
> To be honest I don't like that 'default'.
> I would explicitly state which segment types remain
> (I think segment list and last segment list, and various reserved types).
> In fact, for the reserved types you probably also want to return
> NVME_SGL_DESCR_TYPE_INVALID.
> 

I "negated" the logic which I think is more readable. I still really
want to keep the default, for instance, nvme v1.4 adds a new type that
we do not support (the Transport SGL Data Block descriptor).
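
For reference, the "negated" check could look something like this (just a
sketch of the idea, not necessarily the exact code in the next version):

    switch (NVME_SGL_TYPE(segment[i].type)) {
    case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
        break;
    case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
    case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
        return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
    default:
        /* segment and last segment descriptors are invalid in a data
         * mapping call, and any reserved or unsupported type (such as the
         * v1.4 Transport SGL Data Block descriptor) also ends up here */
        return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
    }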

> Also, this function really begs to have a description prior to it,
> something like 'map an sg list segment, assuming that it only contains
> SGL data descriptors; the caller has to ensure this'.
> 

Done.

> 
> > +return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> > +}
> > +}
> > +
> > +if (*len == 0) {
> > +uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
> Nitpick: I would add a small comment here as well describing
> what this does (we reach this point if the sg list covers more than what
> was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH
> controller capability indicates that we support just throwing the extra
> data away).
> 

Adding a comment. It's the other way around. The size as indicated by
NLB (or whatever, depending on the command) is the "authoritative" source
of information for the size of the payload. We will never accept an SGL
that is too short such that we lose or throw away data, but we might
accept ignoring parts of the SGL.
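
Roughly along these lines (a sketch of the comment; the exact wording may
change in the next version):

    if (*len == 0) {
        /*
         * The command has already indicated (e.g. via NLB) how much data
         * to transfer, and that amount has been fully mapped, but there
         * are descriptors left in the SGL. This is only acceptable if the
         * controller advertises support for SGLs of excess length (the
         * "SGL length larger than amount of data to be transferred"
         * capability in the SGLS field of Identify Controller).
         */
        uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
        if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
            break;
        }
        ...
    }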

> > +if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> > +break;
> > +}
> > +
> > +trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> > +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > +}
> > +
> > +addr = le64_to_cpu(segment[i].addr);
> > +blk_len = le32_to_cpu(segment[i].len);
> > +
> > +if (!blk_len) {
> > +continue;
> > +}
> > +
> > +if (UINT64_MAX - addr < blk_len) {
> > +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> > +}
> Good!
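
(For the record: this is the overflow-safe way of checking that addr +
blk_len does not wrap around the 64-bit address space; computing the sum
first and comparing, as in "addr + blk_len < addr", would rely on the
unsigned wraparound itself.)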
> > +
> > +trans_len = MIN(*len, blk_len);
> > +
> > +status = nvme_map_addr(n, qsg, iov, addr, trans_len);
> > +if (status) {
> > +return status;
> > +}
> > +
> > +*len -= trans_len;
> > +}
> > +
> > +return NVME_SUCCESS;
> > +}
> > +
> > +static uint16_t 

Re: [PATCH v6 36/42] nvme: add support for scatter gather lists

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> For now, support the Data Block, Segment and Last Segment descriptor
> types.
> 
> See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 310 +++---
>  hw/block/trace-events |   4 +
>  2 files changed, 262 insertions(+), 52 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 49d323566393..b89b96990f52 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>  
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> +hwaddr hi = addr + size;
> +if (hi < addr) {
> +return 1;
> +}
> +
> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {

I would suggest splitting this into a separate patch as well, since it
contains not one but two bugfixes for this function, and they are not
related to sg lists.
Or at least move this to 'nvme: refactor nvme_addr_read' and rename this
patch to something like 'nvme: fix and refactor nvme_addr_read'.
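
To spell out the two fixes (a sketch annotating the new code above, not a
proposed change):

    static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
    {
        hwaddr hi = addr + size;

        /* fix 1: reject a range whose end wraps around the 64-bit
         * address space */
        if (hi < addr) {
            return 1;
        }

        /* fix 2: take the CMB fast path only if the entire range lies
         * within the CMB, instead of checking just the start address */
        if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) &&
            nvme_addr_is_cmb(n, hi)) {
            memcpy(buf, nvme_addr_to_cmb(n, addr), size);
            return 0;
        }
        ...
    }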


>  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>  return 0;
>  }
> @@ -328,13 +333,242 @@ unmap:
>  return status;
>  }
>  
> -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> - uint64_t prp1, uint64_t prp2, DMADirection dir,
> +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> +  QEMUIOVector *iov,
> +  NvmeSglDescriptor *segment, uint64_t nsgld,
> +  size_t *len, NvmeRequest *req)
> +{
> +dma_addr_t addr, trans_len;
> +uint32_t blk_len;
> +uint16_t status;
> +
> +for (int i = 0; i < nsgld; i++) {
> +uint8_t type = NVME_SGL_TYPE(segment[i].type);
> +
> +if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> +switch (type) {
> +case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> +case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> +return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> +default:
To be honest I don't like that 'default'.
I would explicitly state which segment types remain
(I think segment list and last segment list, and various reserved types).
In fact, for the reserved types you probably also want to return
NVME_SGL_DESCR_TYPE_INVALID.

Also, this function really begs to have a description prior to it,
something like 'map an sg list segment, assuming that it only contains
SGL data descriptors; the caller has to ensure this'.


> +return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> +}
> +}
> +
> +if (*len == 0) {
> +uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
Nitpick: I would add a small comment here as well describing
what this does (we reach this point if the sg list covers more than what
was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH
controller capability indicates that we support just throwing the extra
data away).

> +if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> +break;
> +}
> +
> +trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +}
> +
> +addr = le64_to_cpu(segment[i].addr);
> +blk_len = le32_to_cpu(segment[i].len);
> +
> +if (!blk_len) {
> +continue;
> +}
> +
> +if (UINT64_MAX - addr < blk_len) {
> +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +}
Good!
> +
> +trans_len = MIN(*len, blk_len);
> +
> +status = nvme_map_addr(n, qsg, iov, addr, trans_len);
> +if (status) {
> +return status;
> +}
> +
> +*len -= trans_len;
> +}
> +
> +return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> + NvmeSglDescriptor sgl, size_t len,
>   NvmeRequest *req)
> +{
> +/*
> + * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
> + * dynamically allocating a potentially large SGL. The spec allows the SGL
> + * to be larger than the command transfer size, so it is not bounded by
> + * MDTS.
> + */
Now this is a very good comment!

However, I don't fully understand the note about the SGL. I assume you
mean that the data the SGL covers still has to be less than MDTS, but the
actual SGL chain, if assembled in a really inefficient way (like 1 byte
per data descriptor), might be larger.
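
(As a worked example of the inefficient case: a 4 KiB transfer described
with 1-byte Data Block descriptors needs 4096 descriptors at 16 bytes
each, i.e. 64 KiB of SGL segments, so the SGL itself is 16 times larger
than the data it maps.)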


> +const int SEG_CHUNK_SIZE = 256;
> +
> +NvmeSglDescriptor segment[SEG_CHUNK_SIZE], 

[PATCH v6 36/42] nvme: add support for scatter gather lists

2020-03-16 Thread Klaus Jensen
From: Klaus Jensen 

For now, support the Data Block, Segment and Last Segment descriptor
types.

See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").

Signed-off-by: Klaus Jensen 
Acked-by: Keith Busch 
---
 hw/block/nvme.c   | 310 +++---
 hw/block/trace-events |   4 +
 2 files changed, 262 insertions(+), 52 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 49d323566393..b89b96990f52 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 
 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
 {
-    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
+    hwaddr hi = addr + size;
+    if (hi < addr) {
+        return 1;
+    }
+
+    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
         memcpy(buf, nvme_addr_to_cmb(n, addr), size);
         return 0;
     }
@@ -328,13 +333,242 @@ unmap:
     return status;
 }
 
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
-                             uint64_t prp1, uint64_t prp2, DMADirection dir,
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+                                  QEMUIOVector *iov,
+                                  NvmeSglDescriptor *segment, uint64_t nsgld,
+                                  size_t *len, NvmeRequest *req)
+{
+    dma_addr_t addr, trans_len;
+    uint32_t blk_len;
+    uint16_t status;
+
+    for (int i = 0; i < nsgld; i++) {
+        uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
+        if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+            switch (type) {
+            case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
+            case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
+                return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+            default:
+                return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+            }
+        }
+
+        if (*len == 0) {
+            uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
+            if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+                break;
+            }
+
+            trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        addr = le64_to_cpu(segment[i].addr);
+        blk_len = le32_to_cpu(segment[i].len);
+
+        if (!blk_len) {
+            continue;
+        }
+
+        if (UINT64_MAX - addr < blk_len) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        trans_len = MIN(*len, blk_len);
+
+        status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+        if (status) {
+            return status;
+        }
+
+        *len -= trans_len;
+    }
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+                             NvmeSglDescriptor sgl, size_t len,
                              NvmeRequest *req)
+{
+    /*
+     * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+     * dynamically allocating a potentially large SGL. The spec allows the SGL
+     * to be larger than the command transfer size, so it is not bounded by
+     * MDTS.
+     */
+    const int SEG_CHUNK_SIZE = 256;
+
+    NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+    uint64_t nsgld;
+    uint32_t seg_len;
+    uint16_t status;
+    bool sgl_in_cmb = false;
+    hwaddr addr;
+    int ret;
+
+    sgld = &sgl;
+    addr = le64_to_cpu(sgl.addr);
+
+    trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
+                           len);
+
+    /*
+     * If the entire transfer can be described with a single data block it can
+     * be mapped directly.
+     */
+    if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+        status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+        if (status) {
+            goto unmap;
+        }
+
+        goto out;
+    }
+
+    /*
+     * If the segment is located in the CMB, the submission queue of the
+     * request must also reside there.
+     */
+    if (nvme_addr_is_cmb(n, addr)) {
+        if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+            return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+        }
+
+        sgl_in_cmb = true;
+    }
+
+    for (;;) {
+        seg_len = le32_to_cpu(sgld->len);
+
+        if (!seg_len || seg_len & 0xf) {
+            return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+        }
+
+        if (UINT64_MAX - addr < seg_len) {
+            return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+        }
+
+        nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+        while (nsgld > SEG_CHUNK_SIZE) {
+            if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
+                trace_nvme_dev_err_addr_read(addr);
+                status = NVME_DATA_TRANSFER_ERROR;
+                goto unmap;
+            }
+
+            status =