Re: [PATCH rfcv1 23/23] intel_iommu: modify x-scalable-mode to be string option

2024-02-04 Thread Joel Granados
On Wed, Jan 31, 2024 at 11:24:18PM +0800, Yi Liu wrote:
> On 2024/1/31 22:40, Joel Granados wrote:
> > On Mon, Jan 15, 2024 at 06:37:35PM +0800, Zhenzhong Duan wrote:
> >> From: Yi Liu 
> >>
> >> Intel VT-d 3.0 introduces scalable mode, and it has a bunch of capabilities
> >> related to scalable mode translation, thus there are multiple combinations.
> >> While this vIOMMU implementation wants to simplify it for user by providing
> >> typical combinations. User could config it by "x-scalable-mode" option. The
> >> usage is as below:
> >>
> >> "-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"
> >>
> >>   - "legacy": gives support for stage-2 page table
> >>   - "modern": gives support for stage-1 page table
> >>   - "off": no scalable mode support
> >>   -  if not configured, means no scalable mode support, if not proper
> >>  configured, will throw error
> >>
> >> Signed-off-by: Yi Liu 
> >> Signed-off-by: Yi Sun 
> >> Signed-off-by: Zhenzhong Duan 
> >> ---
> >>   include/hw/i386/intel_iommu.h |  1 +
> >>   hw/i386/intel_iommu.c | 25 +++--
> >>   2 files changed, 24 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> >> index f3e75263b7..9cbd568171 100644
> >> --- a/include/hw/i386/intel_iommu.h
> >> +++ b/include/hw/i386/intel_iommu.h
> >> @@ -320,6 +320,7 @@ struct IntelIOMMUState {
> >>   
> >>   bool caching_mode;  /* RO - is cap CM enabled? */
> >>   bool scalable_mode; /* RO - is Scalable Mode supported? 
> >> */
> >> +char *scalable_mode_str;/* RO - admin's Scalable Mode config 
> >> */
> >>   bool scalable_modern;   /* RO - is modern SM supported? */
> >>   bool snoop_control; /* RO - is SNP filed supported? */
> >>   
> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> >> index e418305f6e..b507112069 100644
> >> --- a/hw/i386/intel_iommu.c
> >> +++ b/hw/i386/intel_iommu.c
> >> @@ -5111,7 +5111,7 @@ static Property vtd_properties[] = {
> >>   DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
> >> VTD_HOST_ADDRESS_WIDTH),
> >>   DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, 
> >> FALSE),
> >> -DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, 
> >> FALSE),
> >> +DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, 
> >> scalable_mode_str),
> >>   DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, 
> >> false),
> >>   DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
> >>   DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
> >> @@ -6122,7 +6122,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, 
> >> Error **errp)
> >>   }
> >>   }
> >>   
> >> -/* Currently only address widths supported are 39 and 48 bits */
> >> +if (s->scalable_mode_str &&
> >> +(strcmp(s->scalable_mode_str, "off") &&
> >> + strcmp(s->scalable_mode_str, "modern") &&
> >> + strcmp(s->scalable_mode_str, "legacy"))) {
> >> +error_setg(errp, "Invalid x-scalable-mode config,"
> >> + "Please use \"modern\", \"legacy\" or \"off\"");
> >> +return false;
> >> +}
> >> +
> >> +if (s->scalable_mode_str &&
> >> +!strcmp(s->scalable_mode_str, "legacy")) {
> >> +s->scalable_mode = true;
> >> +s->scalable_modern = false;
> >> +} else if (s->scalable_mode_str &&
> >> +!strcmp(s->scalable_mode_str, "modern")) {
> >> +s->scalable_mode = true;
> >> +s->scalable_modern = true;
> >> +} else {
> >> +s->scalable_mode = false;
> >> +s->scalable_modern = false;
> >> +}
> >> +
> >>   if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
> >>   (s->aw_bits != VTD_HOST_AW_48BIT) &&
> >>   !s->scalable_modern) {
> >> -- 
> >> 2.34.1
> >>
> >>
> > 
> > I noticed that this patch changes quite a bit from the previous version
> > that you had. I Specifically noticed that you dropped VTD_ECAP_PRS from
> > intel_iommu_internal.h. I was under the impression that this set the
> > Page Request Servicves capability in the IOMMU effectively enabling PRI
> > in the iommu.
> > 
> > Why did you remove it from your original patch?
> 
> It is because this series does not cover the PRS support. It should come
> in a future series.

ok. Thx for getting back to me.

Best
> 
> Regards,
> Yi Liu

-- 

Joel Granados


signature.asc
Description: PGP signature


Re: [PATCH rfcv1 23/23] intel_iommu: modify x-scalable-mode to be string option

2024-01-31 Thread Joel Granados
On Mon, Jan 15, 2024 at 06:37:35PM +0800, Zhenzhong Duan wrote:
> From: Yi Liu 
> 
> Intel VT-d 3.0 introduces scalable mode, and it has a bunch of capabilities
> related to scalable mode translation, thus there are multiple combinations.
> While this vIOMMU implementation wants to simplify it for user by providing
> typical combinations. User could config it by "x-scalable-mode" option. The
> usage is as below:
> 
> "-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"
> 
>  - "legacy": gives support for stage-2 page table
>  - "modern": gives support for stage-1 page table
>  - "off": no scalable mode support
>  -  if not configured, means no scalable mode support, if not proper
> configured, will throw error
> 
> Signed-off-by: Yi Liu 
> Signed-off-by: Yi Sun 
> Signed-off-by: Zhenzhong Duan 
> ---
>  include/hw/i386/intel_iommu.h |  1 +
>  hw/i386/intel_iommu.c | 25 +++--
>  2 files changed, 24 insertions(+), 2 deletions(-)
> 
> diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
> index f3e75263b7..9cbd568171 100644
> --- a/include/hw/i386/intel_iommu.h
> +++ b/include/hw/i386/intel_iommu.h
> @@ -320,6 +320,7 @@ struct IntelIOMMUState {
>  
>  bool caching_mode;  /* RO - is cap CM enabled? */
>  bool scalable_mode; /* RO - is Scalable Mode supported? */
> +char *scalable_mode_str;/* RO - admin's Scalable Mode config */
>  bool scalable_modern;   /* RO - is modern SM supported? */
>  bool snoop_control; /* RO - is SNP filed supported? */
>  
> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> index e418305f6e..b507112069 100644
> --- a/hw/i386/intel_iommu.c
> +++ b/hw/i386/intel_iommu.c
> @@ -5111,7 +5111,7 @@ static Property vtd_properties[] = {
>  DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
>VTD_HOST_ADDRESS_WIDTH),
>  DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
> -DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, 
> FALSE),
> +DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, 
> scalable_mode_str),
>  DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false),
>  DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false),
>  DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
> @@ -6122,7 +6122,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, 
> Error **errp)
>  }
>  }
>  
> -/* Currently only address widths supported are 39 and 48 bits */
> +if (s->scalable_mode_str &&
> +(strcmp(s->scalable_mode_str, "off") &&
> + strcmp(s->scalable_mode_str, "modern") &&
> + strcmp(s->scalable_mode_str, "legacy"))) {
> +error_setg(errp, "Invalid x-scalable-mode config,"
> + "Please use \"modern\", \"legacy\" or \"off\"");
> +return false;
> +}
> +
> +if (s->scalable_mode_str &&
> +!strcmp(s->scalable_mode_str, "legacy")) {
> +s->scalable_mode = true;
> +s->scalable_modern = false;
> +} else if (s->scalable_mode_str &&
> +!strcmp(s->scalable_mode_str, "modern")) {
> +s->scalable_mode = true;
> +s->scalable_modern = true;
> +} else {
> +s->scalable_mode = false;
> +s->scalable_modern = false;
> +}
> +
>  if ((s->aw_bits != VTD_HOST_AW_39BIT) &&
>  (s->aw_bits != VTD_HOST_AW_48BIT) &&
>  !s->scalable_modern) {
> -- 
> 2.34.1
> 
> 

I noticed that this patch changes quite a bit from the previous version
that you had. I Specifically noticed that you dropped VTD_ECAP_PRS from
intel_iommu_internal.h. I was under the impression that this set the
Page Request Servicves capability in the IOMMU effectively enabling PRI
in the iommu.

Why did you remove it from your original patch?

Thx in advance

Best

-- 

Joel Granados


signature.asc
Description: PGP signature


Re: Questions regarding the still unpublished qemu series https://github.com/yiliu1765/qemu/tree/zhenzhong/wip/iommufd_nesting_rfcv1

2023-12-23 Thread Joel Granados
On Thu, Dec 21, 2023 at 03:33:47AM +, Duan, Zhenzhong wrote:
> Hi Joel,
> 
> >-Original Message-----
> >From: Joel Granados 
> >Subject: Questions regarding the still unpublished qemu series
> >https://github.com/yiliu1765/qemu/tree/zhenzhong/wip/iommufd_nesting
> >_rfcv1
> >
> >Hello Everyone
> >
> >While running
> >https://github.com/yiliu1765/qemu/tree/zhenzhong/wip/iommufd_nesting
> >_rfcv1
> >I have come across particular code path that seems odd:
> >
> >I'm hitting an assert in softmmu/memory.c:1994 after calling a
> >notification from vtd_flt_page_walk_level. The code in memory.c:1994
> >makes sure that when type==IOMMU_NOTIFIER_UNMAP, the permissions
> >are
> >IOMMU_NONE. But the code in vtd_flt_page_walk_level sets the
> >permissions to read|write. This is part of the "intel_iommu: piotlb
> >invalidation should notify unmap" commit in the iommufd_nesting_rfcv1
> >series.
> >
> >Question is: Why assert on the permissions being NONE if they might be
> >read|write?
> >
> >Hope this makes sense. Don't hesitate to get back to me if you see that
> >there is something missing in my explanation.
> 
> Thanks for your report, you are right. We had seen the same issue with
> vhost device and have it fixed internally. Did you also use a vhost device
> or not?
> 
> The link you used is a bit old, could you try with 
> https//:github.com/yiliu1765/qemu/commits/zhenzhong/iommufd_nesting_rfcv1/
I thought it was the same branch, but I see that it does not have the
"wip" in the path.

Great!. I'll try this as soon as I get some free cycles (they are
difficult to find in holiday season:) and get back to you if I run into
anything else.

Thx for the link.

Best

> 
> Thanks
> Zhenzhong

-- 

Joel Granados


signature.asc
Description: PGP signature


Questions regarding the still unpublished qemu series https://github.com/yiliu1765/qemu/tree/zhenzhong/wip/iommufd_nesting_rfcv1

2023-12-20 Thread Joel Granados
Hello Everyone

While running 
https://github.com/yiliu1765/qemu/tree/zhenzhong/wip/iommufd_nesting_rfcv1
I have come across particular code path that seems odd:

I'm hitting an assert in softmmu/memory.c:1994 after calling a
notification from vtd_flt_page_walk_level. The code in memory.c:1994
makes sure that when type==IOMMU_NOTIFIER_UNMAP, the permissions are
IOMMU_NONE. But the code in vtd_flt_page_walk_level sets the
permissions to read|write. This is part of the "intel_iommu: piotlb
invalidation should notify unmap" commit in the iommufd_nesting_rfcv1
series.

Question is: Why assert on the permissions being NONE if they might be
read|write?

Hope this makes sense. Don't hesitate to get back to me if you see that
there is something missing in my explanation.

Best

-- 

Joel Granados


signature.asc
Description: PGP signature


Re: [RFC PATCH v4 20/24] vfio/iommufd: Implement the iommufd backend

2023-07-14 Thread Joel Granados
 +__vfio_device_detach_container(vbasedev, container, );
> +if (err) {
> +error_report_err(err);
> +}
> +if (QLIST_EMPTY(>hwpt_list)) {
> +VFIOAddressSpace *space = bcontainer->space;
> +
> +memory_listener_unregister(>listener);
> +iommufd_backend_put_ioas(container->be, container->ioas_id);
> +vfio_iommufd_container_destroy(container);
> +vfio_put_address_space(space);
> +}
> +vbasedev->container = NULL;
> +out:
> +close(vbasedev->fd);
> +g_free(vbasedev->name);
> +}
> +
> +static void vfio_iommu_backend_iommufd_ops_class_init(ObjectClass *oc,
> + void *data) {
> +VFIOIOMMUBackendOpsClass *ops = VFIO_IOMMU_BACKEND_OPS_CLASS(oc);
> +
> +ops->dma_map = iommufd_map;
> +ops->dma_unmap = iommufd_unmap;
> +ops->attach_device = iommufd_attach_device;
> +ops->detach_device = iommufd_detach_device;
> +}
> +
> +static const TypeInfo vfio_iommu_backend_iommufd_ops_type = {
> +.name = TYPE_VFIO_IOMMU_BACKEND_IOMMUFD_OPS,
> +
> +.parent = TYPE_VFIO_IOMMU_BACKEND_OPS,
> +.class_init = vfio_iommu_backend_iommufd_ops_class_init,
> +.abstract = true,
> +};
> +static void vfio_iommu_backend_iommufd_ops_register_types(void)
> +{
> +type_register_static(_iommu_backend_iommufd_ops_type);
> +}
> +type_init(vfio_iommu_backend_iommufd_ops_register_types);
> +
> diff --git a/hw/vfio/meson.build b/hw/vfio/meson.build
> index fd0dfd198a..326ff35749 100644
> --- a/hw/vfio/meson.build
> +++ b/hw/vfio/meson.build
> @@ -7,6 +7,9 @@ vfio_ss.add(files(
>'spapr.c',
>'migration.c',
>  ))
> +if have_iommufd
> +  vfio_ss.add(files('iommufd.c'))
> +endif
>  vfio_ss.add(when: 'CONFIG_VFIO_PCI', if_true: files(
>'display.c',
>'pci-quirks.c',
> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> index 3f1a7e1c3e..4412c510e4 100644
> --- a/hw/vfio/trace-events
> +++ b/hw/vfio/trace-events
> @@ -167,3 +167,14 @@ vfio_save_setup(const char *name, uint64_t 
> data_buffer_size) " (%s) data buffer
>  vfio_state_pending_estimate(const char *name, uint64_t precopy, uint64_t 
> postcopy, uint64_t precopy_init_size, uint64_t precopy_dirty_size) " (%s) 
> precopy 0x%"PRIx64" postcopy 0x%"PRIx64" precopy initial size 0x%"PRIx64" 
> precopy dirty size 0x%"PRIx64
>  vfio_state_pending_exact(const char *name, uint64_t precopy, uint64_t 
> postcopy, uint64_t stopcopy_size, uint64_t precopy_init_size, uint64_t 
> precopy_dirty_size) " (%s) precopy 0x%"PRIx64" postcopy 0x%"PRIx64" stopcopy 
> size 0x%"PRIx64" precopy initial size 0x%"PRIx64" precopy dirty size 
> 0x%"PRIx64
>  vfio_vmstate_change(const char *name, int running, const char *reason, const 
> char *dev_state) " (%s) running %d reason %s device state %s"
> +
> +#iommufd.c
> +
> +vfio_iommufd_get_devicefd(const char *dev, int devfd) " %s (fd=%d)"
> +vfio_iommufd_bind_device(int iommufd, const char *name, int devfd, int 
> devid) " [iommufd=%d] Succesfully bound device %s (fd=%d): output devid=%d"
> +vfio_iommufd_attach_device(int iommufd, const char *name, int devfd, int 
> ioasid, int hwptid) " [iommufd=%d] Succesfully attached device %s (%d) to 
> ioasid=%d: output hwptd=%d"
> +vfio_iommufd_detach_device(int iommufd, const char *name, int ioasid) " 
> [iommufd=%d] Detached %s from ioasid=%d"
> +vfio_iommufd_alloc_ioas(int iommufd, int ioas_id) " [iommufd=%d] new IOMMUFD 
> container with ioasid=%d"
> +vfio_iommufd_device_info(char *name, int devfd, int num_irqs, int 
> num_regions, int flags) " %s (%d) num_irqs=%d num_regions=%d flags=%d"
> +vfio_iommufd_fail_attach_existing_container(const char *msg) " %s"
> +vfio_iommufd_container_reset(char *name) " Successfully reset %s"
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 81a87d88b6..6434a442fd 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -85,6 +85,26 @@ typedef struct VFIOLegacyContainer {
>  QLIST_HEAD(, VFIOGroup) group_list;
>  } VFIOLegacyContainer;
>  
> +#ifdef CONFIG_IOMMUFD
> +typedef struct VFIOIOASHwpt {
> +uint32_t hwpt_id;
> +QLIST_HEAD(, VFIODevice) device_list;
> +QLIST_ENTRY(VFIOIOASHwpt) next;
> +} VFIOIOASHwpt;
> +
> +typedef struct IOMMUFDBackend IOMMUFDBackend;
> +
> +typedef struct VFIOIOMMUFDContainer {
> +VFIOContainer bcontainer;
> +IOMMUFDBackend *be;
> +uint32_t ioas_id;
> +QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
> +} VFIOIOMMUFDContainer;
> +#endif
> +
> +typedef QLIST_HEAD(VFIOAddressSpaceList, VFIOAddressSpace) 
> VFIOAddressSpaceList;
> +extern VFIOAddressSpaceList vfio_address_spaces;
> +
>  typedef struct VFIODeviceOps VFIODeviceOps;
>  
>  typedef struct VFIODevice {
> @@ -110,6 +130,10 @@ typedef struct VFIODevice {
>  OnOffAuto pre_copy_dirty_page_tracking;
>  bool dirty_pages_supported;
>  bool dirty_tracking;
> +#ifdef CONFIG_IOMMUFD
> +int devid;
> +IOMMUFDBackend *iommufd;
> +#endif
>  } VFIODevice;
>  
>  struct VFIODeviceOps {
> diff --git a/include/hw/vfio/vfio-container-base.h 
> b/include/hw/vfio/vfio-container-base.h
> index b18fa92146..51aff4af05 100644
> --- a/include/hw/vfio/vfio-container-base.h
> +++ b/include/hw/vfio/vfio-container-base.h
> @@ -117,6 +117,9 @@ void vfio_container_init(VFIOContainer *container,
>  void vfio_container_destroy(VFIOContainer *container);
>  
>  #define TYPE_VFIO_IOMMU_BACKEND_LEGACY_OPS "vfio-iommu-backend-legacy-ops"
> +#ifdef CONFIG_IOMMUFD
> +#define TYPE_VFIO_IOMMU_BACKEND_IOMMUFD_OPS "vfio-iommu-backend-iommufd-ops"
> +#endif
>  #define TYPE_VFIO_IOMMU_BACKEND_OPS "vfio-iommu-backend-ops"
>  
>  DECLARE_CLASS_CHECKERS(VFIOIOMMUBackendOpsClass,
> -- 
> 2.34.1
> 
> 

-- 

Joel Granados


signature.asc
Description: PGP signature


Re: [PULL 1/5] hw/nvme: move adjustment of data_units{read,written}

2023-03-07 Thread Joel Granados
On Mon, Mar 06, 2023 at 03:34:29PM +0100, Klaus Jensen wrote:
> From: Joel Granados 
> 
> Move the rounding of bytes read/written into nvme_smart_log which
> reports in units of 512 bytes, rounded up in thousands. This is in
> preparation for adding the Endurance Group Information log page which
> reports in units of billions, rounded up.
Awesome. the message now makes more sense. I see that it already has my
signed off tag :)

Joel

> 
> Reviewed-by: Keith Busch 
> Reviewed-by: Klaus Jensen 
> Signed-off-by: Joel Granados 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/nvme/ctrl.c | 14 --
>  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index f25cc2c235e9..99b92ff20b9a 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -4386,8 +4386,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, 
> struct nvme_stats *stats)
>  {
>  BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
>  
> -stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
> -stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> 
> BDRV_SECTOR_BITS;
> +stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
> +stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
>  stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
>  stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
>  }
> @@ -4401,6 +4401,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
> rae, uint32_t buf_len,
>  uint32_t trans_len;
>  NvmeNamespace *ns;
>  time_t current_ms;
> +uint64_t u_read, u_written;
>  
>  if (off >= sizeof(smart)) {
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -4427,10 +4428,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
> rae, uint32_t buf_len,
>  trans_len = MIN(sizeof(smart) - off, buf_len);
>  smart.critical_warning = n->smart_critical_warning;
>  
> -smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
> -1000));
> -smart.data_units_written[0] = 
> cpu_to_le64(DIV_ROUND_UP(stats.units_written,
> -   1000));
> +u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
> +u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
> +
> +smart.data_units_read[0] = cpu_to_le64(u_read);
> +smart.data_units_written[0] = cpu_to_le64(u_written);
>  smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
>  smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
>  
> -- 
> 2.39.2
> 


signature.asc
Description: PGP signature


Re: [PATCH v3 3/5] hw/nvme: add basic endurance group support

2023-02-24 Thread Joel Granados
On Mon, Feb 20, 2023 at 12:59:24PM +0100, Jesper Devantier wrote:
> From: Klaus Jensen 
> 
> Add the mandatory Endurance Group identify data structures and log
> pages.
> 
> For now, all namespaces in a subsystem belongs to a single Endurance
> Group.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/nvme/ctrl.c   | 48 +++-
>  hw/nvme/ns.c |  3 +++
>  hw/nvme/nvme.h   |  4 
>  include/block/nvme.h | 42 +++---
>  4 files changed, 89 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index 99b92ff20b..729ed9adc5 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -4454,6 +4454,44 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
> rae, uint32_t buf_len,
>  return nvme_c2h(n, (uint8_t *)  + off, trans_len, req);
>  }
>  
> +static uint16_t nvme_endgrp_info(NvmeCtrl *n,  uint8_t rae, uint32_t buf_len,
> + uint64_t off, NvmeRequest *req)
> +{
> +uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
> +uint16_t endgrpid = (dw11 >> 16) & 0x;
> +struct nvme_stats stats = {};
> +NvmeEndGrpLog info = {};
> +int i;
> +
> +if (!n->subsys || endgrpid != 0x1) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +if (off >= sizeof(info)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> +NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
> +if (!ns) {
> +continue;
> +}
> +
> +nvme_set_blk_stats(ns, );
> +}
> +
> +info.data_units_read[0] = cpu_to_le64(stats.units_read / 10);
> +info.data_units_written[0] = cpu_to_le64(stats.units_written / 
> 10);
> +info.media_units_written[0] = cpu_to_le64(stats.units_written / 
> 10);

This division is a bit strange for me. Maybe I'm missing something:

NVMe spec states this about Data Units {Written,Read}: "... Contains the
number of 512 byte data units the host has  This value is reported
in thousands (i.e., a value of 1 corresponds to 1,000 units of 512 bytes
read) and is rounded up (e.g., one indicates that the number of 512 byte
data units {read,written} is from 1 to 1,000..."

1. The way I understand this text from the spec is that if 512 were
   written, then the data_units_written should contain 1; but as I see
   it now, it will contain 0. Am I missing something?

2. And where is the 512 units represented? I was expecting a bit shift by
   9.

3. And where is it rounding up to thousands?

Shouldn't it be like this:

 +info.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read >> 
BDRV_SECTOR_BITS, 1000));
 +info.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written 
>> BDRV_SECTOR_BITS, 1000));
 +info.media_units_written[0] = 
cpu_to_le64(DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000));


> +info.host_read_commands[0] = cpu_to_le64(stats.read_commands);
> +info.host_write_commands[0] = cpu_to_le64(stats.write_commands);
> +
> +buf_len = MIN(sizeof(info) - off, buf_len);
> +
> +return nvme_c2h(n, (uint8_t *) + off, buf_len, req);
> +}
> +
> +
>  static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
>   NvmeRequest *req)
>  {
> @@ -4626,6 +4664,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
> *req)
>  return nvme_changed_nslist(n, rae, len, off, req);
>  case NVME_LOG_CMD_EFFECTS:
>  return nvme_cmd_effects(n, csi, len, off, req);
> +case NVME_LOG_ENDGRP:
> +return nvme_endgrp_info(n, rae, len, off, req);
>  default:
>  trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -7382,6 +7422,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  uint8_t *pci_conf = pci_dev->config;
>  uint64_t cap = ldq_le_p(>bar.cap);
>  NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
> +uint32_t ctratt;
>  
>  id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
>  id->ssvid = cpu_to_le16(pci_get_word(pci_conf + 
> PCI_SUBSYSTEM_VENDOR_ID));
> @@ -7392,7 +7433,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  id->cntlid = cpu_to_le16(n->cntlid);
>  
>  id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
> -id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
> +ctratt = NVME_CTRATT_ELBAS;
>  
>  id->rab = 6;
>  
> @@ -7459,8 +7500,13 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  
>  if (n->subsys) {
>  id->cmic |= NVME_CMIC_MULTI_CTRL;
> +ctratt |= NVME_CTRATT_ENDGRPS;
> +
> +id->endgidmax = cpu_to_le16(0x1);
>  }
>  
> +id->ctratt = cpu_to_le32(ctratt);
> +
>  NVME_CAP_SET_MQES(cap, 0x7ff);
>  NVME_CAP_SET_CQR(cap, 1);
>  NVME_CAP_SET_TO(cap, 0xf);
> diff --git a/hw/nvme/ns.c 

Re: [PATCH v3 1/5] hw/nvme: move adjustment of data_units{read,written}

2023-02-24 Thread Joel Granados
On Mon, Feb 20, 2023 at 12:59:22PM +0100, Jesper Devantier wrote:
> From: Joel Granados 
> 
> In order to return the units_{read/written} required by the SMART log we
> need to shift the number of bytes value by BDRV_SECTORS_BITS and multiply
> by 1000. This is a prep patch that moves this adjustment to where the SMART
> log is calculated in order to use the stats struct for calculating OCP
> extended smart log values.

This was originally part of another patchset that looked at adding OCP
to qemu. Why is it needed for this patch set? Should we change the
wording of the commit to remove the OCP specific stuff? or should we
just remove the patch all together?

best
Joel

> 
> Reviewed-by: Klaus Jensen 
> Signed-off-by: Joel Granados 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/nvme/ctrl.c | 14 --
>  1 file changed, 8 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index f25cc2c235..99b92ff20b 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -4386,8 +4386,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, 
> struct nvme_stats *stats)
>  {
>  BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
>  
> -stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
> -stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> 
> BDRV_SECTOR_BITS;
> +stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
> +stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
>  stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
>  stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
>  }
> @@ -4401,6 +4401,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
> rae, uint32_t buf_len,
>  uint32_t trans_len;
>  NvmeNamespace *ns;
>  time_t current_ms;
> +uint64_t u_read, u_written;
>  
>  if (off >= sizeof(smart)) {
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -4427,10 +4428,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
> rae, uint32_t buf_len,
>  trans_len = MIN(sizeof(smart) - off, buf_len);
>  smart.critical_warning = n->smart_critical_warning;
>  
> -smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
> -1000));
> -smart.data_units_written[0] = 
> cpu_to_le64(DIV_ROUND_UP(stats.units_written,
> -   1000));
> +u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
> +u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
> +
> +smart.data_units_read[0] = cpu_to_le64(u_read);
> +smart.data_units_written[0] = cpu_to_le64(u_written);
>  smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
>  smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
>  
> -- 
> 2.39.2
> 


signature.asc
Description: PGP signature


Re: [PATCH v2 0/3] Add OCP extended log to nvme QEMU

2023-01-04 Thread Joel Granados
On Thu, Dec 22, 2022 at 11:39:54PM -0800, Christoph Hellwig wrote:
> Please don't do this.  OCP is acting as a counter standard to the
> proper NVMe standard here and should in absolutely no way be supported
> by open source projects that needs to stick to the actual standards.
> 
> Please work with the NVMe technical working group to add this (very
> useful) functionality to NVMe proper first.

This is a very good point. Regardless of what OCP's ultimate objective,
having this in the NVMe specification would reach more cases. We can
even use existing values like the "Media Bytes with Metadata Written" in
the statistics log page of the newly ratified FDP TP.

Thx for the review

Best

Joel

> 
> On Mon, Nov 14, 2022 at 02:50:40PM +0100, Joel Granados wrote:
> > The motivation and description are contained in the last patch in this set.
> > Will copy paste it here for convenience:
> > 
> > In order to evaluate write amplification factor (WAF) within the storage
> > stack it is important to know the number of bytes written to the
> > controller. The existing SMART log value of Data Units Written is too
> > coarse (given in units of 500 Kb) and so we add the SMART health
> > information extended from the OCP specification (given in units of 
> > bytes).
> > 
> > To accommodate different vendor specific specifications like OCP, we 
> > add a
> > multiplexing function (nvme_vendor_specific_log) which will route to the
> > different log functions based on arguments and log ids. We only return 
> > the
> > OCP extended smart log when the command is 0xC0 and ocp has been turned 
> > on
> > in the args.
> > 
> > Though we add the whole nvme smart log extended structure, we only 
> > populate
> > the physical_media_units_{read,written}, log_page_version and
> > log_page_uuid.
> > 
> > V1 changes:
> > 1. I moved the ocp parameter from the namespace to the subsystem as it is
> >defined there in the OCP specification
> > 2. I now accumulate statistics from all namespaces and report them back on
> >the extended log as per the spec.
> > 3. I removed the default case in the switch in nvme_vendor_specific_log as
> >it does not have any special function.
> > 
> > Joel Granados (3):
> >   nvme: Move adjustment of data_units{read,written}
> >   nvme: Add ocp to the subsys
> >   nvme: Add physical writes/reads from OCP log
> > 
> >  hw/nvme/ctrl.c   | 70 
> >  hw/nvme/nvme.h   |  1 +
> >  hw/nvme/subsys.c |  4 +--
> >  include/block/nvme.h | 36 +++
> >  4 files changed, 103 insertions(+), 8 deletions(-)
> > 
> > -- 
> > 2.30.2
> > 
> > 
> ---end quoted text---


signature.asc
Description: PGP signature


Re: [PATCH v4 0/2] Add OCP extended log to nvme QEMU

2022-12-08 Thread Joel Granados
ping.

Is the solution to the guid constant ok?

Best

On Fri, Nov 25, 2022 at 10:48:06AM +0100, Joel Granados wrote:
> The motivation and description are contained in the last patch in this set.
> Will copy paste it here for convenience:
> 
> In order to evaluate write amplification factor (WAF) within the storage
> stack it is important to know the number of bytes written to the
> controller. The existing SMART log value of Data Units Written is too
> coarse (given in units of 500 Kb) and so we add the SMART health
> information extended from the OCP specification (given in units of bytes).
> 
> To accommodate different vendor specific specifications like OCP, we add a
> multiplexing function (nvme_vendor_specific_log) which will route to the
> different log functions based on arguments and log ids. We only return the
> OCP extended smart log when the command is 0xC0 and ocp has been turned on
> in the args.
> 
> Though we add the whole nvme smart log extended structure, we only 
> populate
> the physical_media_units_{read,written}, log_page_version and
> log_page_uuid.
> 
> V4 changes:
> 1. Fixed cpu_to_le64 instead of cpu_to_le32
> 2. Variable naming : uuid -> guid
> 3. Changed how the guid value appears in the code:
>Used to be:
> smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
> smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
> 
>Now is:
> static const uint8_t guid[16] = {
> 0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
> 0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
> };
> 
>This is different from what @klaus suggested because I want to keep it
>consistent to what nvme-cli currently implements. I think here we can
>either change both nvme-cli and this patch or leave the order of the
>bytes as they are here. This all depends on how you interpret the Spec
>(which is ambiguous)
> 
> V3 changes:
> 1. Corrected a bunch of checkpatch issues. Since I changed the first patch
>I did not include the reviewed-by.
> 2. Included some documentation in nvme.rst for the ocp argument
> 3. Squashed the ocp arg changes into the main patch.
> 4. Fixed several comments and an open parenthesis
> 5. Hex values are now in lower case.
> 6. Change the reserved format to rsvd
> 7. Made sure that NvmeCtrl is the first arg in all the functions.
> 8. Fixed comment on commit of main patch
> 
> V2 changes:
> 1. I moved the ocp parameter from the namespace to the subsystem as it is
>defined there in the OCP specification
> 2. I now accumulate statistics from all namespaces and report them back on
>the extended log as per the spec.
> 3. I removed the default case in the switch in nvme_vendor_specific_log as
>it does not have any special function.
> 
> Joel Granados (2):
>   nvme: Move adjustment of data_units{read,written}
>   nvme: Add physical writes/reads from OCP log
> 
>  docs/system/devices/nvme.rst |  7 
>  hw/nvme/ctrl.c   | 73 +---
>  hw/nvme/nvme.h   |  1 +
>  include/block/nvme.h | 36 ++
>  4 files changed, 111 insertions(+), 6 deletions(-)
> 
> -- 
> 2.30.2
> 


signature.asc
Description: PGP signature


[PATCH v4 0/2] Add OCP extended log to nvme QEMU

2022-11-25 Thread Joel Granados
The motivation and description are contained in the last patch in this set.
Will copy paste it here for convenience:

In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes).

To accommodate different vendor specific specifications like OCP, we add a
multiplexing function (nvme_vendor_specific_log) which will route to the
different log functions based on arguments and log ids. We only return the
OCP extended smart log when the command is 0xC0 and ocp has been turned on
in the args.

Though we add the whole nvme smart log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

V4 changes:
1. Fixed cpu_to_le64 instead of cpu_to_le32
2. Variable naming : uuid -> guid
3. Changed how the guid value appears in the code:
   Used to be:
smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;

   Now is:
static const uint8_t guid[16] = {
0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
};

   This is different from what @klaus suggested because I want to keep it
   consistent to what nvme-cli currently implements. I think here we can
   either change both nvme-cli and this patch or leave the order of the
   bytes as they are here. This all depends on how you interpret the Spec
   (which is ambiguous)

V3 changes:
1. Corrected a bunch of checkpatch issues. Since I changed the first patch
   I did not include the reviewed-by.
2. Included some documentation in nvme.rst for the ocp argument
3. Squashed the ocp arg changes into the main patch.
4. Fixed several comments and an open parenthesis
5. Hex values are now in lower case.
6. Change the reserved format to rsvd
7. Made sure that NvmeCtrl is the first arg in all the functions.
8. Fixed comment on commit of main patch

V2 changes:
1. I moved the ocp parameter from the namespace to the subsystem as it is
   defined there in the OCP specification
2. I now accumulate statistics from all namespaces and report them back on
   the extended log as per the spec.
3. I removed the default case in the switch in nvme_vendor_specific_log as
   it does not have any special function.

Joel Granados (2):
  nvme: Move adjustment of data_units{read,written}
  nvme: Add physical writes/reads from OCP log

 docs/system/devices/nvme.rst |  7 
 hw/nvme/ctrl.c   | 73 +---
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h | 36 ++
 4 files changed, 111 insertions(+), 6 deletions(-)

-- 
2.30.2




[PATCH v4 2/2] nvme: Add physical writes/reads from OCP log

2022-11-25 Thread Joel Granados
In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes)

We add a controller argument (ocp) that toggles on/off the SMART log
extended structure.  To accommodate different vendor specific specifications
like OCP, we add a multiplexing function (nvme_vendor_specific_log) which
will route to the different log functions based on arguments and log ids.
We only return the OCP extended SMART log when the command is 0xC0 and ocp
has been turned on in the args.

Though we add the whole nvme SMART log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

Signed-off-by: Joel Granados 
---
 docs/system/devices/nvme.rst |  7 +
 hw/nvme/ctrl.c   | 59 
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h | 36 ++
 4 files changed, 103 insertions(+)

diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index 30f841ef62..1cc5e52c00 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -53,6 +53,13 @@ parameters.
   Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
   previously used.
 
+``ocp`` (default: ``off``)
+  The Open Compute Project defines the Datacenter NVMe SSD Specification that
+  sits on top of NVMe. It describes additional commands and NVMe behaviors
+  specific for the Datacenter. When this option is ``on`` OCP features such as
+  the SMART / Health information extended log become available in the
+  controller.
+
 Additional Namespaces
 -
 
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index bf291f7ffe..264d9cb220 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4455,6 +4455,45 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct 
nvme_stats *stats)
 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
 
+static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req)
+{
+NvmeNamespace *ns = NULL;
+NvmeSmartLogExtended smart_l = { 0 };
+struct nvme_stats stats = { 0 };
+uint32_t trans_len;
+
+if (off >= sizeof(smart_l)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+/* accumulate all stats from all namespaces */
+for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ns = nvme_ns(n, i);
+if (ns) {
+nvme_set_blk_stats(ns, );
+}
+}
+
+smart_l.physical_media_units_written[0] = cpu_to_le64(stats.units_written);
+smart_l.physical_media_units_read[0] = cpu_to_le64(stats.units_read);
+smart_l.log_page_version = 0x0003;
+
+static const uint8_t guid[16] = {
+0xC5, 0xAF, 0x10, 0x28, 0xEA, 0xBF, 0xF2, 0xA4,
+0x9C, 0x4F, 0x6F, 0x7C, 0xC9, 0x14, 0xD5, 0xAF
+};
+memcpy(smart_l.log_page_guid, guid, sizeof(smart_l.log_page_guid));
+
+if (!rae) {
+nvme_clear_events(n, NVME_AER_TYPE_SMART);
+}
+
+trans_len = MIN(sizeof(smart_l) - off, buf_len);
+return nvme_c2h(n, (uint8_t *) _l + off, trans_len, req);
+}
+
 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
 uint64_t off, NvmeRequest *req)
 {
@@ -4642,6 +4681,23 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t 
csi, uint32_t buf_len,
 return nvme_c2h(n, ((uint8_t *)) + off, trans_len, req);
 }
 
+static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req, uint8_t lid)
+{
+switch (lid) {
+case 0xc0:
+if (n->params.ocp) {
+return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
+}
+break;
+/* add a case for each additional vendor specific log id */
+}
+
+trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = >cmd;
@@ -4683,6 +4739,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_error_info(n, rae, len, off, req);
 case NVME_LOG_SMART_INFO:
 return nvme_smart_info(n, rae, len, off, req);
+case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
+return nvme_vendor_specific_log(n, rae, len, off, req, lid);
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
 case NVME_LOG_CHANGED_NSLIST:
@@ -7685,6 +7743,7 @@ static P

[PATCH v4 1/2] nvme: Move adjustment of data_units{read,written}

2022-11-25 Thread Joel Granados
In order to return the units_{read/written} required by the SMART log we
need to shift the number of bytes value by BDRV_SECTORS_BITS and multiply
by 1000. This is a prep patch that moves this adjustment to where the SMART
log is calculated in order to use the stats struct for calculating OCP
extended smart log values.

Signed-off-by: Joel Granados 
---
 hw/nvme/ctrl.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 87aeba0564..bf291f7ffe 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4449,8 +4449,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct 
nvme_stats *stats)
 {
 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
 
-stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
-stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
+stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
+stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
@@ -4464,6 +4464,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 uint32_t trans_len;
 NvmeNamespace *ns;
 time_t current_ms;
+uint64_t u_read, u_written;
 
 if (off >= sizeof(smart)) {
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -4490,10 +4491,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 trans_len = MIN(sizeof(smart) - off, buf_len);
 smart.critical_warning = n->smart_critical_warning;
 
-smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
-1000));
-smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
-   1000));
+u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
+u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
+
+smart.data_units_read[0] = cpu_to_le64(u_read);
+smart.data_units_written[0] = cpu_to_le64(u_written);
 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
 
-- 
2.30.2




Re: [PATCH v3 2/2] nvme: Add physical writes/reads from OCP log

2022-11-17 Thread Joel Granados
On Thu, Nov 17, 2022 at 08:30:46AM +0100, Klaus Jensen wrote:
> On Nov 16 18:14, Joel Granados wrote:
> > In order to evaluate write amplification factor (WAF) within the storage
> > stack it is important to know the number of bytes written to the
> > controller. The existing SMART log value of Data Units Written is too
> > coarse (given in units of 500 Kb) and so we add the SMART health
> > information extended from the OCP specification (given in units of bytes)
> > 
> > We add a controller argument (ocp) that toggles on/off the SMART log
> > extended structure.  To accommodate different vendor specific specifications
> > like OCP, we add a multiplexing function (nvme_vendor_specific_log) which
> > will route to the different log functions based on arguments and log ids.
> > We only return the OCP extended SMART log when the command is 0xC0 and ocp
> > has been turned on in the args.
> > 
> > Though we add the whole nvme SMART log extended structure, we only populate
> > the physical_media_units_{read,written}, log_page_version and
> > log_page_uuid.
> > 
> > Signed-off-by: Joel Granados 
> > ---
> >  docs/system/devices/nvme.rst |  7 +
> >  hw/nvme/ctrl.c   | 55 
> >  hw/nvme/nvme.h   |  1 +
> >  include/block/nvme.h | 36 +++
> >  4 files changed, 99 insertions(+)
> > 
> > diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
> > index 30f841ef62..1cc5e52c00 100644
> > --- a/docs/system/devices/nvme.rst
> > +++ b/docs/system/devices/nvme.rst
> > @@ -53,6 +53,13 @@ parameters.
> >Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
> >previously used.
> >  
> > +``ocp`` (default: ``off``)
> > +  The Open Compute Project defines the Datacenter NVMe SSD Specification 
> > that
> > +  sits on top of NVMe. It describes additional commands and NVMe behaviors
> > +  specific for the Datacenter. When this option is ``on`` OCP features 
> > such as
> > +  the SMART / Health information extended log become available in the
> > +  controller.
> > +
> >  Additional Namespaces
> >  -
> >  
> > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > index bf291f7ffe..c7215a4ed1 100644
> > --- a/hw/nvme/ctrl.c
> > +++ b/hw/nvme/ctrl.c
> > @@ -4455,6 +4455,41 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, 
> > struct nvme_stats *stats)
> >  stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
> >  }
> >  
> > +static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
> > + uint32_t buf_len, uint64_t 
> > off,
> > + NvmeRequest *req)
> > +{
> > +NvmeNamespace *ns = NULL;
> > +NvmeSmartLogExtended smart_l = { 0 };
> > +struct nvme_stats stats = { 0 };
> > +uint32_t trans_len;
> > +
> > +if (off >= sizeof(smart_l)) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > +/* accumulate all stats from all namespaces */
> > +for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> > +ns = nvme_ns(n, i);
> > +if (ns) {
> > +nvme_set_blk_stats(ns, );
> > +}
> > +}
> > +
> > +smart_l.physical_media_units_written[0] = 
> > cpu_to_le32(stats.units_written);
> > +smart_l.physical_media_units_read[0] = cpu_to_le32(stats.units_read);
> 
> These are uint64s, so should be cpu_to_le64().
Good catch

> 
> > +smart_l.log_page_version = 0x0003;
> > +smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
> > +smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
> 
> Technically the field is called the "Log Page GUID", not the UUID.
Yep, that was all me. My brain just automatically completed UUID. Sorry
about that.
> Perhaps this is a bit of Microsoft leaking in, or it is to differentiate
> it from the UUID Index functionality, who knows.
> 
> It looks like you byte swapped the two 64 bit parts, but not the
> individual bytes. It's super confusing when the spec just says "shall be
Individual bytes are also swapped. I actually tested this with nvme cli
and it successfully checks these 128 bytes. I got inspired by nvme-cli
(plugins/ocp/ocp-nvme.c) where the opc number appears.

> set to VALUE". Is that VALUE already in little endian? Sigh.
The spec says "All values in logs shall be little endian format". Which
still does not say if the value th

[PATCH v3 0/2] Add OCP extended log to nvme QEMU

2022-11-16 Thread Joel Granados
The motivation and description are contained in the last patch in this set.
Will copy paste it here for convenience:

In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes).

To accommodate different vendor specific specifications like OCP, we add a
multiplexing function (nvme_vendor_specific_log) which will route to the
different log functions based on arguments and log ids. We only return the
OCP extended smart log when the command is 0xC0 and ocp has been turned on
in the args.

Though we add the whole nvme smart log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

V3 changes:
1. Corrected a bunch of checkpatch issues. Since I changed the first patch
   I did not include the reviewed-by.
2. Included some documentation in nvme.rst for the ocp argument
3. Squashed the ocp arg changes into the main patch.
4. Fixed several comments and an open parenthesis
5. Hex values are now in lower case.
6. Change the reserved format to rsvd
7. Made sure that NvmeCtrl is the first arg in all the functions.
8. Fixed comment on commit of main patch

V2 changes:
1. I moved the ocp parameter from the namespace to the subsystem as it is
   defined there in the OCP specification
2. I now accumulate statistics from all namespaces and report them back on
   the extended log as per the spec.
3. I removed the default case in the switch in nvme_vendor_specific_log as
   it does not have any special function.

Joel Granados (2):
  nvme: Move adjustment of data_units{read,written}
  nvme: Add physical writes/reads from OCP log

 docs/system/devices/nvme.rst |  7 
 hw/nvme/ctrl.c   | 69 
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h | 36 +++
 4 files changed, 107 insertions(+), 6 deletions(-)

-- 
2.30.2




[PATCH v3 2/2] nvme: Add physical writes/reads from OCP log

2022-11-16 Thread Joel Granados
In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes)

We add a controller argument (ocp) that toggles on/off the SMART log
extended structure.  To accommodate different vendor specific specifications
like OCP, we add a multiplexing function (nvme_vendor_specific_log) which
will route to the different log functions based on arguments and log ids.
We only return the OCP extended SMART log when the command is 0xC0 and ocp
has been turned on in the args.

Though we add the whole nvme SMART log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

Signed-off-by: Joel Granados 
---
 docs/system/devices/nvme.rst |  7 +
 hw/nvme/ctrl.c   | 55 
 hw/nvme/nvme.h   |  1 +
 include/block/nvme.h | 36 +++
 4 files changed, 99 insertions(+)

diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index 30f841ef62..1cc5e52c00 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -53,6 +53,13 @@ parameters.
   Vendor ID. Set this to ``on`` to revert to the unallocated Intel ID
   previously used.
 
+``ocp`` (default: ``off``)
+  The Open Compute Project defines the Datacenter NVMe SSD Specification that
+  sits on top of NVMe. It describes additional commands and NVMe behaviors
+  specific for the Datacenter. When this option is ``on`` OCP features such as
+  the SMART / Health information extended log become available in the
+  controller.
+
 Additional Namespaces
 -
 
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index bf291f7ffe..c7215a4ed1 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4455,6 +4455,41 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct 
nvme_stats *stats)
 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
 
+static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req)
+{
+NvmeNamespace *ns = NULL;
+NvmeSmartLogExtended smart_l = { 0 };
+struct nvme_stats stats = { 0 };
+uint32_t trans_len;
+
+if (off >= sizeof(smart_l)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+/* accumulate all stats from all namespaces */
+for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ns = nvme_ns(n, i);
+if (ns) {
+nvme_set_blk_stats(ns, );
+}
+}
+
+smart_l.physical_media_units_written[0] = cpu_to_le32(stats.units_written);
+smart_l.physical_media_units_read[0] = cpu_to_le32(stats.units_read);
+smart_l.log_page_version = 0x0003;
+smart_l.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
+smart_l.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
+
+if (!rae) {
+nvme_clear_events(n, NVME_AER_TYPE_SMART);
+}
+
+trans_len = MIN(sizeof(smart_l) - off, buf_len);
+return nvme_c2h(n, (uint8_t *) _l + off, trans_len, req);
+}
+
 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
 uint64_t off, NvmeRequest *req)
 {
@@ -4642,6 +4677,23 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t 
csi, uint32_t buf_len,
 return nvme_c2h(n, ((uint8_t *)) + off, trans_len, req);
 }
 
+static uint16_t nvme_vendor_specific_log(NvmeCtrl *n, uint8_t rae,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req, uint8_t lid)
+{
+switch (lid) {
+case 0xc0:
+if (n->params.ocp) {
+return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
+}
+break;
+/* add a case for each additional vendor specific log id */
+}
+
+trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = >cmd;
@@ -4683,6 +4735,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_error_info(n, rae, len, off, req);
 case NVME_LOG_SMART_INFO:
 return nvme_smart_info(n, rae, len, off, req);
+case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
+return nvme_vendor_specific_log(n, rae, len, off, req, lid);
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
 case NVME_LOG_CHANGED_NSLIST:
@@ -7685,6 +7739,7 @@ static Property nvme_props[] = {
   params.sriov_max_vi_per_vf, 0),
 DEFINE_PROP_UINT8("sriov_m

[PATCH v3 1/2] nvme: Move adjustment of data_units{read,written}

2022-11-16 Thread Joel Granados
In order to return the units_{read/written} required by the SMART log we
need to shift the number of bytes value by BDRV_SECTORS_BITS and multiply
by 1000. This is a prep patch that moves this adjustment to where the SMART
log is calculated in order to use the stats struct for calculating OCP
extended smart log values.

Signed-off-by: Joel Granados 
---
 hw/nvme/ctrl.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 87aeba0564..bf291f7ffe 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4449,8 +4449,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct 
nvme_stats *stats)
 {
 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
 
-stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
-stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
+stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
+stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
@@ -4464,6 +4464,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, 
uint32_t buf_len,
 uint32_t trans_len;
 NvmeNamespace *ns;
 time_t current_ms;
+uint64_t u_read, u_written;
 
 if (off >= sizeof(smart)) {
 return NVME_INVALID_FIELD | NVME_DNR;
@@ -4490,10 +4491,11 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 trans_len = MIN(sizeof(smart) - off, buf_len);
 smart.critical_warning = n->smart_critical_warning;
 
-smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
-1000));
-smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
-   1000));
+u_read = DIV_ROUND_UP(stats.units_read >> BDRV_SECTOR_BITS, 1000);
+u_written = DIV_ROUND_UP(stats.units_written >> BDRV_SECTOR_BITS, 1000);
+
+smart.data_units_read[0] = cpu_to_le64(u_read);
+smart.data_units_written[0] = cpu_to_le64(u_written);
 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
 
-- 
2.30.2




Re: [PATCH v2 3/3] nvme: Add physical writes/reads from OCP log

2022-11-16 Thread Joel Granados
On Tue, Nov 15, 2022 at 12:26:17PM +0100, Klaus Jensen wrote:
> On Nov 14 14:50, Joel Granados wrote:
> > In order to evaluate write amplification factor (WAF) within the storage
> > stack it is important to know the number of bytes written to the
> > controller. The existing SMART log value of Data Units Written is too
> > coarse (given in units of 500 Kb) and so we add the SMART health
> > information extended from the OCP specification (given in units of bytes).
> > 
> > To accomodate different vendor specific specifications like OCP, we add a
> > multiplexing function (nvme_vendor_specific_log) which will route to the
> > different log functions based on arguments and log ids. We only return the
> > OCP extended smart log when the command is 0xC0 and ocp has been turned on
> > in the args.
> > 
> > Though we add the whole nvme smart log extended structure, we only populate
> > the physical_media_units_{read,written}, log_page_version and
> > log_page_uuid.
> > 
> > Signed-off-by: Joel Granados 
> > 
> > squash with main
> > 
> > Signed-off-by: Joel Granados 
> 
> Looks like you slightly messed up the squash ;)
oops. that is my bad

> 
> Also, squash the previous patch (adding the ocp parameter) into this.
Here I wanted to keep the introduction of the argument separate. In any
case, I'll squash it with the other one.

> Please add a note in the documentation (docs/system/devices/nvme.rst)
> about this parameter.
Of course. I always forget documentation. I'll add it under the
"Controller Emulation" section and I'll call it ``ocp``

> 
> > ---
> >  hw/nvme/ctrl.c   | 56 
> >  include/block/nvme.h | 36 
> >  2 files changed, 92 insertions(+)
> > 
> > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > index 220683201a..5e6a8150a2 100644
> > --- a/hw/nvme/ctrl.c
> > +++ b/hw/nvme/ctrl.c
> > @@ -4455,6 +4455,42 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, 
> > struct nvme_stats *stats)
> >  stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
> >  }
> >  
> > +static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
> > + uint32_t buf_len, uint64_t 
> > off,
> > + NvmeRequest *req)
> > +{
> > +NvmeNamespace *ns = NULL;
> > +NvmeSmartLogExtended smart_ext = { 0 };
> > +struct nvme_stats stats = { 0 };
> > +uint32_t trans_len;
> > +
> > +if (off >= sizeof(smart_ext)) {
> > +return NVME_INVALID_FIELD | NVME_DNR;
> > +}
> > +
> > +// Accumulate all stats from all namespaces
> 
> Use /* lower-case and no period */ for one sentence, one line comments.
> 
> I think scripts/checkpatch.pl picks this up.
There is a checkpatch like in the kernel. Fantastic! I'll make a note to
use it from now on.


> 
> > +for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> > +ns = nvme_ns(n, i);
> > +if (ns)
> > +{
> 
> Paranthesis go on the same line as the `if`.
of course

> 
> > +nvme_set_blk_stats(ns, );
> > +}
> > +}
> > +
> > +smart_ext.physical_media_units_written[0] = 
> > cpu_to_le32(stats.units_written);
> > +smart_ext.physical_media_units_read[0] = cpu_to_le32(stats.units_read);
> > +smart_ext.log_page_version = 0x0003;
> > +smart_ext.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
> > +smart_ext.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
> > +
> > +if (!rae) {
> > +nvme_clear_events(n, NVME_AER_TYPE_SMART);
> > +}
> > +
> > +trans_len = MIN(sizeof(smart_ext) - off, buf_len);
> > +return nvme_c2h(n, (uint8_t *) _ext + off, trans_len, req);
> > +}
> > +
> >  static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
> >  uint64_t off, NvmeRequest *req)
> >  {
> > @@ -4642,6 +4678,24 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, 
> > uint8_t csi, uint32_t buf_len,
> >  return nvme_c2h(n, ((uint8_t *)) + off, trans_len, req);
> >  }
> >  
> > +static uint16_t nvme_vendor_specific_log(uint8_t lid, NvmeCtrl *n, uint8_t 
> > rae,
> > + uint32_t buf_len, uint64_t off,
> > + NvmeRequest *req)
> 
> `NvmeCtrl *n` must be first parameter.
Any reason why this is the case? I'll change it in my code, but would be
nice to unders

Re: [PATCH v2 2/3] nvme: Add ocp to the subsys

2022-11-16 Thread Joel Granados
On Tue, Nov 15, 2022 at 12:11:50PM +0100, Klaus Jensen wrote:
> On Nov 14 14:50, Joel Granados wrote:
> > The Open Compute Project defines a Datacenter NVMe SSD Spec that sits on
> > top of the NVMe spec. Additional commands and NVMe behaviors specific for
> > the Datacenter. This is a preparation patch that introduces an argument to
> > activate OCP in nvme.
> > 
> > Signed-off-by: Joel Granados 
> > ---
> >  hw/nvme/nvme.h   | 1 +
> >  hw/nvme/subsys.c | 4 ++--
> >  2 files changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> > index 79f5c281c2..aa99c0c57c 100644
> > --- a/hw/nvme/nvme.h
> > +++ b/hw/nvme/nvme.h
> > @@ -56,6 +56,7 @@ typedef struct NvmeSubsystem {
> >  
> >  struct {
> >  char *nqn;
> > +bool ocp;
> >  } params;
> >  } NvmeSubsystem;
> >  
> > diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
> > index 9d2643678b..ecca28449c 100644
> > --- a/hw/nvme/subsys.c
> > +++ b/hw/nvme/subsys.c
> > @@ -129,8 +129,8 @@ static void nvme_subsys_realize(DeviceState *dev, Error 
> > **errp)
> >  
> >  static Property nvme_subsystem_props[] = {
> >  DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn),
> > -DEFINE_PROP_END_OF_LIST(),
> > -};
> > +DEFINE_PROP_BOOL("ocp", NvmeSubsystem, params.ocp, false),
> 
> It is the controller that implements the OCP specification, not the
> namespace or the subsystem. The parameter should be on the controller
> device.
Makes sense. I'll put the option in hw/nvme/ctrl.c

> 
> We discussed that the Get Log Page was subsystem scoped and not
> namespace scoped, but that is unrelated to this.
Yep, this was the confusion. Thx for clarifying.

> 
> > +DEFINE_PROP_END_OF_LIST(), };
> >  
> >  static void nvme_subsys_class_init(ObjectClass *oc, void *data)
> >  {
> > -- 
> > 2.30.2
> > 
> > 




signature.asc
Description: PGP signature


[PATCH v2 0/3] Add OCP extended log to nvme QEMU

2022-11-14 Thread Joel Granados
The motivation and description are contained in the last patch in this set.
Will copy paste it here for convenience:

In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes).

To accommodate different vendor specific specifications like OCP, we add a
multiplexing function (nvme_vendor_specific_log) which will route to the
different log functions based on arguments and log ids. We only return the
OCP extended smart log when the command is 0xC0 and ocp has been turned on
in the args.

Though we add the whole nvme smart log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

V1 changes:
1. I moved the ocp parameter from the namespace to the subsystem as it is
   defined there in the OCP specification
2. I now accumulate statistics from all namespaces and report them back on
   the extended log as per the spec.
3. I removed the default case in the switch in nvme_vendor_specific_log as
   it does not have any special function.

Joel Granados (3):
  nvme: Move adjustment of data_units{read,written}
  nvme: Add ocp to the subsys
  nvme: Add physical writes/reads from OCP log

 hw/nvme/ctrl.c   | 70 
 hw/nvme/nvme.h   |  1 +
 hw/nvme/subsys.c |  4 +--
 include/block/nvme.h | 36 +++
 4 files changed, 103 insertions(+), 8 deletions(-)

-- 
2.30.2




[PATCH v2 3/3] nvme: Add physical writes/reads from OCP log

2022-11-14 Thread Joel Granados
In order to evaluate write amplification factor (WAF) within the storage
stack it is important to know the number of bytes written to the
controller. The existing SMART log value of Data Units Written is too
coarse (given in units of 500 Kb) and so we add the SMART health
information extended from the OCP specification (given in units of bytes).

To accomodate different vendor specific specifications like OCP, we add a
multiplexing function (nvme_vendor_specific_log) which will route to the
different log functions based on arguments and log ids. We only return the
OCP extended smart log when the command is 0xC0 and ocp has been turned on
in the args.

Though we add the whole nvme smart log extended structure, we only populate
the physical_media_units_{read,written}, log_page_version and
log_page_uuid.

Signed-off-by: Joel Granados 

squash with main

Signed-off-by: Joel Granados 
---
 hw/nvme/ctrl.c   | 56 
 include/block/nvme.h | 36 
 2 files changed, 92 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 220683201a..5e6a8150a2 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4455,6 +4455,42 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct 
nvme_stats *stats)
 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
 
+static uint16_t nvme_ocp_extended_smart_info(NvmeCtrl *n, uint8_t rae,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req)
+{
+NvmeNamespace *ns = NULL;
+NvmeSmartLogExtended smart_ext = { 0 };
+struct nvme_stats stats = { 0 };
+uint32_t trans_len;
+
+if (off >= sizeof(smart_ext)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+// Accumulate all stats from all namespaces
+for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+ns = nvme_ns(n, i);
+if (ns)
+{
+nvme_set_blk_stats(ns, );
+}
+}
+
+smart_ext.physical_media_units_written[0] = 
cpu_to_le32(stats.units_written);
+smart_ext.physical_media_units_read[0] = cpu_to_le32(stats.units_read);
+smart_ext.log_page_version = 0x0003;
+smart_ext.log_page_uuid[0] = 0xA4F2BFEA2810AFC5;
+smart_ext.log_page_uuid[1] = 0xAFD514C97C6F4F9C;
+
+if (!rae) {
+nvme_clear_events(n, NVME_AER_TYPE_SMART);
+}
+
+trans_len = MIN(sizeof(smart_ext) - off, buf_len);
+return nvme_c2h(n, (uint8_t *) _ext + off, trans_len, req);
+}
+
 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
 uint64_t off, NvmeRequest *req)
 {
@@ -4642,6 +4678,24 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t 
csi, uint32_t buf_len,
 return nvme_c2h(n, ((uint8_t *)) + off, trans_len, req);
 }
 
+static uint16_t nvme_vendor_specific_log(uint8_t lid, NvmeCtrl *n, uint8_t rae,
+ uint32_t buf_len, uint64_t off,
+ NvmeRequest *req)
+{
+NvmeSubsystem *subsys = n->subsys;
+switch (lid) {
+case NVME_LOG_VENDOR_START:
+if (subsys->params.ocp) {
+return nvme_ocp_extended_smart_info(n, rae, buf_len, off, req);
+}
+break;
+/* Add a case for each additional vendor specific log id */
+}
+
+trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
 NvmeCmd *cmd = >cmd;
@@ -4683,6 +4737,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest 
*req)
 return nvme_error_info(n, rae, len, off, req);
 case NVME_LOG_SMART_INFO:
 return nvme_smart_info(n, rae, len, off, req);
+case NVME_LOG_VENDOR_START...NVME_LOG_VENDOR_END:
+return nvme_vendor_specific_log(lid, n, rae, len, off, req);
 case NVME_LOG_FW_SLOT_INFO:
 return nvme_fw_log_info(n, len, off, req);
 case NVME_LOG_CHANGED_NSLIST:
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8027b7126b..2ab0dca529 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -978,6 +978,40 @@ typedef struct QEMU_PACKED NvmeSmartLog {
 uint8_t reserved2[320];
 } NvmeSmartLog;
 
+typedef struct QEMU_PACKED NvmeSmartLogExtended {
+uint64_tphysical_media_units_written[2];
+uint64_tphysical_media_units_read[2];
+uint64_tbad_user_blocks;
+uint64_tbad_system_nand_blocks;
+uint64_txor_recovery_count;
+uint64_tuncorrectable_read_error_count;
+uint64_tsoft_ecc_error_count;
+uint64_tend2end_correction_counts;
+uint8_t system_data_percent_used;
+uint8_t refresh_counts[7];
+uint64_tuser_data_erase_counts;
+uint16_tthermal_throttling_stat_and_count;
+uint16_tdssd_spec_version[3];
+uint64_tpcie_correctable_error

[PATCH v2 1/3] nvme: Move adjustment of data_units{read,written}

2022-11-14 Thread Joel Granados
In order to return the units_{read/written} required by the SMART log we
need to shift the number of bytes value by BDRV_SECTORS_BITS and multiply
by 1000. This is a prep patch that moves this adjustment to where the SMART
log is calculated in order to use the stats struct for calculating OCP
extended smart log values.

Signed-off-by: Joel Granados 
---
 hw/nvme/ctrl.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 87aeba0564..220683201a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -4449,8 +4449,8 @@ static void nvme_set_blk_stats(NvmeNamespace *ns, struct 
nvme_stats *stats)
 {
 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
 
-stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
-stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
+stats->units_read += s->nr_bytes[BLOCK_ACCT_READ];
+stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE];
 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
 }
@@ -4490,10 +4490,12 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 trans_len = MIN(sizeof(smart) - off, buf_len);
 smart.critical_warning = n->smart_critical_warning;
 
-smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
-1000));
-smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
-   1000));
+smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(
+   stats.units_read >> 
BDRV_SECTOR_BITS,
+   1000));
+smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(
+  stats.units_written >> 
BDRV_SECTOR_BITS,
+  1000));
 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
 
-- 
2.30.2




[PATCH v2 2/3] nvme: Add ocp to the subsys

2022-11-14 Thread Joel Granados
The Open Compute Project defines a Datacenter NVMe SSD Spec that sits on
top of the NVMe spec. Additional commands and NVMe behaviors specific for
the Datacenter. This is a preparation patch that introduces an argument to
activate OCP in nvme.

Signed-off-by: Joel Granados 
---
 hw/nvme/nvme.h   | 1 +
 hw/nvme/subsys.c | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 79f5c281c2..aa99c0c57c 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -56,6 +56,7 @@ typedef struct NvmeSubsystem {
 
 struct {
 char *nqn;
+bool ocp;
 } params;
 } NvmeSubsystem;
 
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 9d2643678b..ecca28449c 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -129,8 +129,8 @@ static void nvme_subsys_realize(DeviceState *dev, Error 
**errp)
 
 static Property nvme_subsystem_props[] = {
 DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn),
-DEFINE_PROP_END_OF_LIST(),
-};
+DEFINE_PROP_BOOL("ocp", NvmeSubsystem, params.ocp, false),
+DEFINE_PROP_END_OF_LIST(), };
 
 static void nvme_subsys_class_init(ObjectClass *oc, void *data)
 {
-- 
2.30.2