Re: [PATCH v8 3/5] hw/block/nvme: add dulbe support

2020-11-16 Thread Keith Busch
On Thu, Nov 12, 2020 at 08:59:43PM +0100, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add support for reporting the Deallocated or Unwritten Logical Block
> Error (DULBE).
> 
> Rely on the block status flags reported by the block layer and consider
> any block with the BDRV_BLOCK_ZERO flag to be deallocated.
> 
> Multiple factors affect when a Write Zeroes command result in
> deallocation of blocks.
> 
>   * the underlying file system block size
>   * the blockdev format
>   * the 'discard' and 'logical_block_size' parameters
> 
>  format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
> -
>   qcow2ignore   n  n  y
>   qcow2unmapn  n  y
>   raw  ignore   n  y  y
>   raw  unmapn  y  y
> 
> So, this works best with an image in raw format and 4KiB LBAs, since
> holes can then be punched on a per-block basis (this assumes a file
> system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
> of 64KiB by default and blocks will only be marked deallocated if a full
> cluster is zeroed or discarded. However, this *is* consistent with the
> spec since Write Zeroes "should" deallocate the block if the Deallocate
> attribute is set and "may" deallocate if the Deallocate attribute is not
> set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
> always set).

These all sound like reasonable constraints and consistent with the
spec.

Reviewed-by: Keith Busch 



Re: [PATCH v8 3/5] hw/block/nvme: add dulbe support

2020-11-16 Thread Klaus Jensen
On Nov 16 20:43, Minwoo Im wrote:
> On 11/12 20:59, Klaus Jensen wrote:
> > From: Klaus Jensen 
> > 
> > Add support for reporting the Deallocated or Unwritten Logical Block
> > Error (DULBE).
> > 
> > Rely on the block status flags reported by the block layer and consider
> > any block with the BDRV_BLOCK_ZERO flag to be deallocated.
> > 
> > Multiple factors affect when a Write Zeroes command result in
> > deallocation of blocks.
> > 
> >   * the underlying file system block size
> >   * the blockdev format
> >   * the 'discard' and 'logical_block_size' parameters
> > 
> >  format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
> > -
> >   qcow2ignore   n  n  y
> >   qcow2unmapn  n  y
> >   raw  ignore   n  y  y
> >   raw  unmapn  y  y
> > 
> > So, this works best with an image in raw format and 4KiB LBAs, since
> > holes can then be punched on a per-block basis (this assumes a file
> > system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
> > of 64KiB by default and blocks will only be marked deallocated if a full
> > cluster is zeroed or discarded. However, this *is* consistent with the
> > spec since Write Zeroes "should" deallocate the block if the Deallocate
> > attribute is set and "may" deallocate if the Deallocate attribute is not
> > set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
> > always set).
> > 
> > Signed-off-by: Klaus Jensen 
> > ---
> >  hw/block/nvme-ns.h|  4 ++
> >  include/block/nvme.h  |  5 +++
> >  hw/block/nvme-ns.c|  8 ++--
> >  hw/block/nvme.c   | 91 ++-
> >  hw/block/trace-events |  4 ++
> >  5 files changed, 107 insertions(+), 5 deletions(-)
> > 
> > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > index 83734f4606e1..44bf6271b744 100644
> > --- a/hw/block/nvme-ns.h
> > +++ b/hw/block/nvme-ns.h
> > @@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
> >  NvmeIdNs id_ns;
> >  
> >  NvmeNamespaceParams params;
> > +
> > +struct {
> > +uint32_t err_rec;
> > +} features;
> >  } NvmeNamespace;
> >  
> >  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 8a46d9cf015f..966c3bb304bd 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -687,6 +687,7 @@ enum NvmeStatusCodes {
> >  NVME_E2E_REF_ERROR  = 0x0284,
> >  NVME_CMP_FAILURE= 0x0285,
> >  NVME_ACCESS_DENIED  = 0x0286,
> > +NVME_DULB   = 0x0287,
> >  NVME_MORE   = 0x2000,
> >  NVME_DNR= 0x4000,
> >  NVME_NO_COMPLETE= 0x,
> > @@ -903,6 +904,9 @@ enum NvmeIdCtrlLpa {
> >  #define NVME_AEC_NS_ATTR(aec)   ((aec >> 8) & 0x1)
> >  #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
> >  
> > +#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0x)
> > +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x1)
> > +
> >  enum NvmeFeatureIds {
> >  NVME_ARBITRATION= 0x1,
> >  NVME_POWER_MANAGEMENT   = 0x2,
> > @@ -1023,6 +1027,7 @@ enum NvmeNsIdentifierType {
> >  
> >  
> >  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)  ((nsfeat & 0x1))
> > +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
> >  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)((flbas >> 4) & 0x1)
> >  #define NVME_ID_NS_FLBAS_INDEX(flbas)   ((flbas & 0xf))
> >  #define NVME_ID_NS_MC_SEPARATE(mc)  ((mc >> 1) & 0x1)
> > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > index 31c80cdf5b5f..f1cc734c60f5 100644
> > --- a/hw/block/nvme-ns.c
> > +++ b/hw/block/nvme-ns.c
> > @@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >  NvmeIdNs *id_ns = >id_ns;
> >  int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> >  
> > -if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> > -ns->id_ns.dlfeat = 0x9;
> > -}
> > +ns->id_ns.dlfeat = 0x9;
> >  
> >  id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
> >  
> > @@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
> >  /* no thin provisioning */
> >  id_ns->ncap = id_ns->nsze;
> >  id_ns->nuse = id_ns->ncap;
> > +
> > +/* support DULBE */
> > +id_ns->nsfeat |= 0x4;
> >  }
> >  
> >  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> > @@ -92,6 +93,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
> > **errp)
> >  }
> >  
> >  nvme_ns_init(ns);
> > +
> >  if (nvme_register_namespace(n, ns, errp)) {
> >  return -1;
> >  }
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index a96a996ddc0d..8d75a89fd872 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -105,6 +105,7 @@ static const bool 

Re: [PATCH v8 3/5] hw/block/nvme: add dulbe support

2020-11-16 Thread Minwoo Im
On 11/12 20:59, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add support for reporting the Deallocated or Unwritten Logical Block
> Error (DULBE).
> 
> Rely on the block status flags reported by the block layer and consider
> any block with the BDRV_BLOCK_ZERO flag to be deallocated.
> 
> Multiple factors affect when a Write Zeroes command result in
> deallocation of blocks.
> 
>   * the underlying file system block size
>   * the blockdev format
>   * the 'discard' and 'logical_block_size' parameters
> 
>  format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
> -
>   qcow2ignore   n  n  y
>   qcow2unmapn  n  y
>   raw  ignore   n  y  y
>   raw  unmapn  y  y
> 
> So, this works best with an image in raw format and 4KiB LBAs, since
> holes can then be punched on a per-block basis (this assumes a file
> system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
> of 64KiB by default and blocks will only be marked deallocated if a full
> cluster is zeroed or discarded. However, this *is* consistent with the
> spec since Write Zeroes "should" deallocate the block if the Deallocate
> attribute is set and "may" deallocate if the Deallocate attribute is not
> set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
> always set).
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme-ns.h|  4 ++
>  include/block/nvme.h  |  5 +++
>  hw/block/nvme-ns.c|  8 ++--
>  hw/block/nvme.c   | 91 ++-
>  hw/block/trace-events |  4 ++
>  5 files changed, 107 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 83734f4606e1..44bf6271b744 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
>  NvmeIdNs id_ns;
>  
>  NvmeNamespaceParams params;
> +
> +struct {
> +uint32_t err_rec;
> +} features;
>  } NvmeNamespace;
>  
>  static inline uint32_t nvme_nsid(NvmeNamespace *ns)
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8a46d9cf015f..966c3bb304bd 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -687,6 +687,7 @@ enum NvmeStatusCodes {
>  NVME_E2E_REF_ERROR  = 0x0284,
>  NVME_CMP_FAILURE= 0x0285,
>  NVME_ACCESS_DENIED  = 0x0286,
> +NVME_DULB   = 0x0287,
>  NVME_MORE   = 0x2000,
>  NVME_DNR= 0x4000,
>  NVME_NO_COMPLETE= 0x,
> @@ -903,6 +904,9 @@ enum NvmeIdCtrlLpa {
>  #define NVME_AEC_NS_ATTR(aec)   ((aec >> 8) & 0x1)
>  #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
>  
> +#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0x)
> +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x1)
> +
>  enum NvmeFeatureIds {
>  NVME_ARBITRATION= 0x1,
>  NVME_POWER_MANAGEMENT   = 0x2,
> @@ -1023,6 +1027,7 @@ enum NvmeNsIdentifierType {
>  
>  
>  #define NVME_ID_NS_NSFEAT_THIN(nsfeat)  ((nsfeat & 0x1))
> +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
>  #define NVME_ID_NS_FLBAS_EXTENDED(flbas)((flbas >> 4) & 0x1)
>  #define NVME_ID_NS_FLBAS_INDEX(flbas)   ((flbas & 0xf))
>  #define NVME_ID_NS_MC_SEPARATE(mc)  ((mc >> 1) & 0x1)
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 31c80cdf5b5f..f1cc734c60f5 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
>  NvmeIdNs *id_ns = >id_ns;
>  int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
>  
> -if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
> -ns->id_ns.dlfeat = 0x9;
> -}
> +ns->id_ns.dlfeat = 0x9;
>  
>  id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
>  
> @@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
>  /* no thin provisioning */
>  id_ns->ncap = id_ns->nsze;
>  id_ns->nuse = id_ns->ncap;
> +
> +/* support DULBE */
> +id_ns->nsfeat |= 0x4;
>  }
>  
>  static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> @@ -92,6 +93,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
> **errp)
>  }
>  
>  nvme_ns_init(ns);
> +
>  if (nvme_register_namespace(n, ns, errp)) {
>  return -1;
>  }
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index a96a996ddc0d..8d75a89fd872 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
>  
>  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
>  [NVME_TEMPERATURE_THRESHOLD]= NVME_FEAT_CAP_CHANGE,
> +[NVME_ERROR_RECOVERY]   = NVME_FEAT_CAP_CHANGE | 
> NVME_FEAT_CAP_NS,
>  [NVME_VOLATILE_WRITE_CACHE] = 

[PATCH v8 3/5] hw/block/nvme: add dulbe support

2020-11-12 Thread Klaus Jensen
From: Klaus Jensen 

Add support for reporting the Deallocated or Unwritten Logical Block
Error (DULBE).

Rely on the block status flags reported by the block layer and consider
any block with the BDRV_BLOCK_ZERO flag to be deallocated.

Multiple factors affect when a Write Zeroes command result in
deallocation of blocks.

  * the underlying file system block size
  * the blockdev format
  * the 'discard' and 'logical_block_size' parameters

 format | discard | wz (512B)  wz (4KiB)  wz (64KiB)
-
  qcow2ignore   n  n  y
  qcow2unmapn  n  y
  raw  ignore   n  y  y
  raw  unmapn  y  y

So, this works best with an image in raw format and 4KiB LBAs, since
holes can then be punched on a per-block basis (this assumes a file
system with a 4kb block size, YMMV). A qcow2 image, uses a cluster size
of 64KiB by default and blocks will only be marked deallocated if a full
cluster is zeroed or discarded. However, this *is* consistent with the
spec since Write Zeroes "should" deallocate the block if the Deallocate
attribute is set and "may" deallocate if the Deallocate attribute is not
set. Thus, we always try to deallocate (the BDRV_REQ_MAY_UNMAP flag is
always set).

Signed-off-by: Klaus Jensen 
---
 hw/block/nvme-ns.h|  4 ++
 include/block/nvme.h  |  5 +++
 hw/block/nvme-ns.c|  8 ++--
 hw/block/nvme.c   | 91 ++-
 hw/block/trace-events |  4 ++
 5 files changed, 107 insertions(+), 5 deletions(-)

diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
index 83734f4606e1..44bf6271b744 100644
--- a/hw/block/nvme-ns.h
+++ b/hw/block/nvme-ns.h
@@ -31,6 +31,10 @@ typedef struct NvmeNamespace {
 NvmeIdNs id_ns;
 
 NvmeNamespaceParams params;
+
+struct {
+uint32_t err_rec;
+} features;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8a46d9cf015f..966c3bb304bd 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -687,6 +687,7 @@ enum NvmeStatusCodes {
 NVME_E2E_REF_ERROR  = 0x0284,
 NVME_CMP_FAILURE= 0x0285,
 NVME_ACCESS_DENIED  = 0x0286,
+NVME_DULB   = 0x0287,
 NVME_MORE   = 0x2000,
 NVME_DNR= 0x4000,
 NVME_NO_COMPLETE= 0x,
@@ -903,6 +904,9 @@ enum NvmeIdCtrlLpa {
 #define NVME_AEC_NS_ATTR(aec)   ((aec >> 8) & 0x1)
 #define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
 
+#define NVME_ERR_REC_TLER(err_rec)  (err_rec & 0x)
+#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x1)
+
 enum NvmeFeatureIds {
 NVME_ARBITRATION= 0x1,
 NVME_POWER_MANAGEMENT   = 0x2,
@@ -1023,6 +1027,7 @@ enum NvmeNsIdentifierType {
 
 
 #define NVME_ID_NS_NSFEAT_THIN(nsfeat)  ((nsfeat & 0x1))
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
 #define NVME_ID_NS_FLBAS_EXTENDED(flbas)((flbas >> 4) & 0x1)
 #define NVME_ID_NS_FLBAS_INDEX(flbas)   ((flbas & 0xf))
 #define NVME_ID_NS_MC_SEPARATE(mc)  ((mc >> 1) & 0x1)
diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 31c80cdf5b5f..f1cc734c60f5 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -33,9 +33,7 @@ static void nvme_ns_init(NvmeNamespace *ns)
 NvmeIdNs *id_ns = >id_ns;
 int lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
 
-if (blk_get_flags(ns->blkconf.blk) & BDRV_O_UNMAP) {
-ns->id_ns.dlfeat = 0x9;
-}
+ns->id_ns.dlfeat = 0x9;
 
 id_ns->lbaf[lba_index].ds = 31 - clz32(ns->blkconf.logical_block_size);
 
@@ -44,6 +42,9 @@ static void nvme_ns_init(NvmeNamespace *ns)
 /* no thin provisioning */
 id_ns->ncap = id_ns->nsze;
 id_ns->nuse = id_ns->ncap;
+
+/* support DULBE */
+id_ns->nsfeat |= 0x4;
 }
 
 static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
@@ -92,6 +93,7 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error 
**errp)
 }
 
 nvme_ns_init(ns);
+
 if (nvme_register_namespace(n, ns, errp)) {
 return -1;
 }
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index a96a996ddc0d..8d75a89fd872 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -105,6 +105,7 @@ static const bool nvme_feature_support[NVME_FID_MAX] = {
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
 [NVME_TEMPERATURE_THRESHOLD]= NVME_FEAT_CAP_CHANGE,
+[NVME_ERROR_RECOVERY]   = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
 [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
@@ -878,6 +879,49 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace 
*ns, uint64_t slba,
 return NVME_SUCCESS;
 }
 
+static uint16_t