Re: [PATCH v5 2/2] hw/block/nvme: add the dataset management command

2020-10-23 Thread Klaus Jensen
On Oct 23 07:25, Klaus Jensen wrote:
> On Oct 22 23:02, Philippe Mathieu-Daudé wrote:
> > On 10/22/20 8:49 PM, Klaus Jensen wrote:
> > > -/* support DULBE */
> > > -id_ns->nsfeat |= 0x4;
> > > +/* support DULBE and I/O optimization fields */
> > > +id_ns->nsfeat |= (0x4 | 0x10);
> > 
> > The comment helps, but isn't needed if you use explicit definitions
> > for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE
> > and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits.
> > This is why I personally prefer the registerfields API (see
> > "hw/registerfields.h").
> > 
> 
> I've been wanting to fix those constants - but the convention that they
> only extract bits pre-dates the nvme device and is from when the nvme
> block driver was introduced - I have just been following the precedence
> by defining them like that.
> 

I did not know about the hw/registerfields.h API. Looks promising!


signature.asc
Description: PGP signature


Re: [PATCH v5 2/2] hw/block/nvme: add the dataset management command

2020-10-22 Thread Klaus Jensen
On Oct 22 23:02, Philippe Mathieu-Daudé wrote:
> Hi Klaus,
> 

Hi Philippe,

Thanks for your comments!

> On 10/22/20 8:49 PM, Klaus Jensen wrote:
> > From: Klaus Jensen 
> > 
> > Add support for the Dataset Management command and the Deallocate
> > attribute. Deallocation results in discards being sent to the underlying
> > block device. Whether of not the blocks are actually deallocated is
> > affected by the same factors as Write Zeroes (see previous commit).
> > 
> >   format | discard | dsm (512b)  dsm (4kb)  dsm (64kb)
> 
> Please use B/KiB units which are unambiguous (kb is for kbits)
> (if you queue this yourself, you can fix when applying, no need
> to repost).
> 

Thanks, I'll change it.

> >  --
> >qcow2ignore   n   n  n
> >qcow2unmapn   n  y
> >raw  ignore   n   n  n
> >raw  unmapn   y  y
> > 
> > Again, a raw format and 4kb LBAs are preferable.
> > 
> > In order to set the Namespace Preferred Deallocate Granularity and
> > Alignment fields (NPDG and NPDA), choose a sane minimum discard
> > granularity of 4kb. If we are using a passthru device supporting discard
> > at a 512b granularity, user should set the discard_granularity property
> 
> Ditto.
> 
> > explicitly. NPDG and NPDA will also account for the cluster_size of the
> > block driver if required (i.e. for QCOW2).
> > 
> > See NVM Express 1.3d, Section 6.7 ("Dataset Management command").
> > 
> > Signed-off-by: Klaus Jensen 
> > ---
> >   hw/block/nvme.h  |   2 +
> >   include/block/nvme.h |   7 ++-
> >   hw/block/nvme-ns.c   |  36 +--
> >   hw/block/nvme.c  | 101 ++-
> >   4 files changed, 140 insertions(+), 6 deletions(-)
> > 
> > diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> > index e080a2318a50..574333caa3f9 100644
> > --- a/hw/block/nvme.h
> > +++ b/hw/block/nvme.h
> > @@ -28,6 +28,7 @@ typedef struct NvmeRequest {
> >   struct NvmeNamespace*ns;
> >   BlockAIOCB  *aiocb;
> >   uint16_tstatus;
> > +void*opaque;
> >   NvmeCqe cqe;
> >   NvmeCmd cmd;
> >   BlockAcctCookie acct;
> > @@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
> >   case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
> >   case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
> >   case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
> > +case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
> >   default:return "NVME_NVM_CMD_UNKNOWN";
> >   }
> >   }
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 966c3bb304bd..e95ff6ca9b37 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
> >   uint16_tnabspf;
> >   uint16_tnoiob;
> >   uint8_t nvmcap[16];
> > -uint8_t rsvd64[40];
> > +uint16_tnpwg;
> > +uint16_tnpwa;
> > +uint16_tnpdg;
> > +uint16_tnpda;
> > +uint16_tnows;
> > +uint8_t rsvd74[30];
> >   uint8_t nguid[16];
> >   uint64_teui64;
> >   NvmeLBAFlbaf[16];
> 
> If you consider "block/nvme.h" shared by 2 different subsystems,
> it is better to add the changes in a separate patch. That way
> the changes can be acked individually.
> 

Sure. Some other stuff here warrents a v6 I think, so I will split it.

> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index f1cc734c60f5..840651db7256 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -28,10 +28,14 @@
> >   #include "nvme.h"
> >   #include "nvme-ns.h"
> > -static void nvme_ns_init(NvmeNamespace *ns)
> > +#define MIN_DISCARD_GRANULARITY (4 * KiB)
> > +
> > +static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
> 
> Hmm the Error* argument could be squashed in "hw/block/nvme:
> support multiple namespaces". Else better split patch in dumb
> units IMHO (maybe a reviewer's taste).
> 

Yeah, I guess I can squash that in.

> >   {
> > +BlockDriverInfo bdi;
> >   NvmeIdNs *id_ns = >id_ns;
> >   int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> > +int npdg, ret;
> >   ns->id_ns.dlfeat = 0x9;
> > @@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >   id_ns->ncap = id_ns->nsze;
> >   id_ns->nuse = id_ns->ncap;
> > -/* support DULBE */
> > -id_ns->nsfeat |= 0x4;
> > +/* support DULBE and I/O optimization fields */
> > +id_ns->nsfeat |= (0x4 | 0x10);
> 
> The comment helps, but isn't needed if you use explicit definitions
> for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE
> and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits.
> 

Re: [PATCH v5 2/2] hw/block/nvme: add the dataset management command

2020-10-22 Thread Keith Busch
On Thu, Oct 22, 2020 at 11:02:04PM +0200, Philippe Mathieu-Daudé wrote:
> On 10/22/20 8:49 PM, Klaus Jensen wrote:
> > +static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > +NvmeNamespace *ns = req->ns;
> > +NvmeDsmCmd *dsm = (NvmeDsmCmd *) >cmd;
> > +NvmeDsmRange *range = NULL;
> 
> g_autofree?

Or just allocate the array on the stack. The upper limit is just 512
entries.



Re: [PATCH v5 2/2] hw/block/nvme: add the dataset management command

2020-10-22 Thread Philippe Mathieu-Daudé

Hi Klaus,

On 10/22/20 8:49 PM, Klaus Jensen wrote:

From: Klaus Jensen 

Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether of not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).

  format | discard | dsm (512b)  dsm (4kb)  dsm (64kb)


Please use B/KiB units which are unambiguous (kb is for kbits)
(if you queue this yourself, you can fix when applying, no need
to repost).


 --
   qcow2ignore   n   n  n
   qcow2unmapn   n  y
   raw  ignore   n   n  n
   raw  unmapn   y  y

Again, a raw format and 4kb LBAs are preferable.

In order to set the Namespace Preferred Deallocate Granularity and
Alignment fields (NPDG and NPDA), choose a sane minimum discard
granularity of 4kb. If we are using a passthru device supporting discard
at a 512b granularity, user should set the discard_granularity property


Ditto.


explicitly. NPDG and NPDA will also account for the cluster_size of the
block driver if required (i.e. for QCOW2).

See NVM Express 1.3d, Section 6.7 ("Dataset Management command").

Signed-off-by: Klaus Jensen 
---
  hw/block/nvme.h  |   2 +
  include/block/nvme.h |   7 ++-
  hw/block/nvme-ns.c   |  36 +--
  hw/block/nvme.c  | 101 ++-
  4 files changed, 140 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
  struct NvmeNamespace*ns;
  BlockAIOCB  *aiocb;
  uint16_tstatus;
+void*opaque;
  NvmeCqe cqe;
  NvmeCmd cmd;
  BlockAcctCookie acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
  case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
  case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
  case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
  default:return "NVME_NVM_CMD_UNKNOWN";
  }
  }
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 966c3bb304bd..e95ff6ca9b37 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
  uint16_tnabspf;
  uint16_tnoiob;
  uint8_t nvmcap[16];
-uint8_t rsvd64[40];
+uint16_tnpwg;
+uint16_tnpwa;
+uint16_tnpdg;
+uint16_tnpda;
+uint16_tnows;
+uint8_t rsvd74[30];
  uint8_t nguid[16];
  uint64_teui64;
  NvmeLBAFlbaf[16];


If you consider "block/nvme.h" shared by 2 different subsystems,
it is better to add the changes in a separate patch. That way
the changes can be acked individually.


diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index f1cc734c60f5..840651db7256 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -28,10 +28,14 @@
  #include "nvme.h"
  #include "nvme-ns.h"
  
-static void nvme_ns_init(NvmeNamespace *ns)

+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)


Hmm the Error* argument could be squashed in "hw/block/nvme:
support multiple namespaces". Else better split patch in dumb
units IMHO (maybe a reviewer's taste).


  {
+BlockDriverInfo bdi;
  NvmeIdNs *id_ns = >id_ns;
  int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+int npdg, ret;
  
  ns->id_ns.dlfeat = 0x9;
  
@@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)

  id_ns->ncap = id_ns->nsze;
  id_ns->nuse = id_ns->ncap;
  
-/* support DULBE */

-id_ns->nsfeat |= 0x4;
+/* support DULBE and I/O optimization fields */
+id_ns->nsfeat |= (0x4 | 0x10);


The comment helps, but isn't needed if you use explicit definitions
for these flags. You already introduced the NVME_ID_NS_NSFEAT_DULBE
and NVME_ID_NS_FLBAS_EXTENDED but they are restricted to extract bits.
This is why I personally prefer the registerfields API (see
"hw/registerfields.h").


+
+npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+ret = bdrv_get_info(blk_bs(ns->blkconf.blk), );
+if (ret < 0) {
+error_setg_errno(errp, -ret, "could not get block driver info");
+return ret;
+}
+
+if (bdi.cluster_size &&
+bdi.cluster_size > ns->blkconf.discard_granularity) {
+npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+}
+
+id_ns->npda = id_ns->npdg = npdg - 1;
+
+return 0;
  }
  
  static int nvme_ns_init_blk(NvmeCtrl *n, 

[PATCH v5 2/2] hw/block/nvme: add the dataset management command

2020-10-22 Thread Klaus Jensen
From: Klaus Jensen 

Add support for the Dataset Management command and the Deallocate
attribute. Deallocation results in discards being sent to the underlying
block device. Whether of not the blocks are actually deallocated is
affected by the same factors as Write Zeroes (see previous commit).

 format | discard | dsm (512b)  dsm (4kb)  dsm (64kb)
--
  qcow2ignore   n   n  n
  qcow2unmapn   n  y
  raw  ignore   n   n  n
  raw  unmapn   y  y

Again, a raw format and 4kb LBAs are preferable.

In order to set the Namespace Preferred Deallocate Granularity and
Alignment fields (NPDG and NPDA), choose a sane minimum discard
granularity of 4kb. If we are using a passthru device supporting discard
at a 512b granularity, user should set the discard_granularity property
explicitly. NPDG and NPDA will also account for the cluster_size of the
block driver if required (i.e. for QCOW2).

See NVM Express 1.3d, Section 6.7 ("Dataset Management command").

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme.h  |   2 +
 include/block/nvme.h |   7 ++-
 hw/block/nvme-ns.c   |  36 +--
 hw/block/nvme.c  | 101 ++-
 4 files changed, 140 insertions(+), 6 deletions(-)

diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index e080a2318a50..574333caa3f9 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -28,6 +28,7 @@ typedef struct NvmeRequest {
 struct NvmeNamespace*ns;
 BlockAIOCB  *aiocb;
 uint16_tstatus;
+void*opaque;
 NvmeCqe cqe;
 NvmeCmd cmd;
 BlockAcctCookie acct;
@@ -60,6 +61,7 @@ static inline const char *nvme_io_opc_str(uint8_t opc)
 case NVME_CMD_WRITE:return "NVME_NVM_CMD_WRITE";
 case NVME_CMD_READ: return "NVME_NVM_CMD_READ";
 case NVME_CMD_WRITE_ZEROES: return "NVME_NVM_CMD_WRITE_ZEROES";
+case NVME_CMD_DSM:  return "NVME_NVM_CMD_DSM";
 default:return "NVME_NVM_CMD_UNKNOWN";
 }
 }
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 966c3bb304bd..e95ff6ca9b37 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -990,7 +990,12 @@ typedef struct QEMU_PACKED NvmeIdNs {
 uint16_tnabspf;
 uint16_tnoiob;
 uint8_t nvmcap[16];
-uint8_t rsvd64[40];
+uint16_tnpwg;
+uint16_tnpwa;
+uint16_tnpdg;
+uint16_tnpda;
+uint16_tnows;
+uint8_t rsvd74[30];
 uint8_t nguid[16];
 uint64_teui64;
 NvmeLBAFlbaf[16];
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index f1cc734c60f5..840651db7256 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -28,10 +28,14 @@
 #include "nvme.h"
 #include "nvme-ns.h"
 
-static void nvme_ns_init(NvmeNamespace *ns)
+#define MIN_DISCARD_GRANULARITY (4 * KiB)
+
+static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
 {
+BlockDriverInfo bdi;
 NvmeIdNs *id_ns = >id_ns;
 int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+int npdg, ret;
 
 ns->id_ns.dlfeat = 0x9;
 
@@ -43,8 +47,25 @@ static void nvme_ns_init(NvmeNamespace *ns)
 id_ns->ncap = id_ns->nsze;
 id_ns->nuse = id_ns->ncap;
 
-/* support DULBE */
-id_ns->nsfeat |= 0x4;
+/* support DULBE and I/O optimization fields */
+id_ns->nsfeat |= (0x4 | 0x10);
+
+npdg = ns->blkconf.discard_granularity / ns->blkconf.logical_block_size;
+
+ret = bdrv_get_info(blk_bs(ns->blkconf.blk), );
+if (ret < 0) {
+error_setg_errno(errp, -ret, "could not get block driver info");
+return ret;
+}
+
+if (bdi.cluster_size &&
+bdi.cluster_size > ns->blkconf.discard_granularity) {
+npdg = bdi.cluster_size / ns->blkconf.logical_block_size;
+}
+
+id_ns->npda = id_ns->npdg = npdg - 1;
+
+return 0;
 }
 
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -59,6 +80,11 @@ static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, 
Error **errp)
 return -1;
 }
 
+if (ns->blkconf.discard_granularity == -1) {
+ns->blkconf.discard_granularity =
+MAX(ns->blkconf.logical_block_size, MIN_DISCARD_GRANULARITY);
+}
+
 ns->size = blk_getlength(ns->blkconf.blk);
 if (ns->size < 0) {
 error_setg_errno(errp, -ns->size, "could not get blockdev size");
@@ -92,7 +118,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
 return -1;
 }
 
-nvme_ns_init(ns);
+if (nvme_ns_init(ns, errp)) {
+return -1;
+}
 
 if (nvme_register_namespace(n, ns, errp)) {
 return -1;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 4ab0705f5a92..7acb9e9dc38a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -959,6