Re: [PATCH v5 08/26] nvme: refactor device realization

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:43 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 11:27, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > This patch splits up nvme_realize into multiple individual functions,
> > > each initializing a different subset of the device.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c | 175 +++-
> > >  hw/block/nvme.h |  21 ++
> > >  2 files changed, 133 insertions(+), 63 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index e1810260d40b..81514eaef63a 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -44,6 +44,7 @@
> > >  #include "nvme.h"
> > >  
> > >  #define NVME_SPEC_VER 0x00010201
> > > +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> > >  
> > >  #define NVME_GUEST_ERR(trace, fmt, ...) \
> > >  do { \
> > > @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
> > >  },
> > >  };
> > >  
> > > -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> > > +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
> > >  {
> > > -NvmeCtrl *n = NVME(pci_dev);
> > > -NvmeIdCtrl *id = &n->id_ctrl;
> > > -
> > > -int i;
> > > -int64_t bs_size;
> > > -uint8_t *pci_conf;
> > > -
> > > -if (!n->params.num_queues) {
> > > -error_setg(errp, "num_queues can't be zero");
> > > -return;
> > > -}
> > > +NvmeParams *params = &n->params;
> > >  
> > >  if (!n->conf.blk) {
> > > -error_setg(errp, "drive property not set");
> > > -return;
> > > +error_setg(errp, "nvme: block backend not configured");
> > > +return 1;
> > 
> > As a matter of taste, negative values indicate error, and 0 is the success 
> > value.
> > In Linux kernel this is even an official rule.
> > >  }
> 
> Fixed.
> 
> > >  
> > > -bs_size = blk_getlength(n->conf.blk);
> > > -if (bs_size < 0) {
> > > -error_setg(errp, "could not get backing file size");
> > > -return;
> > > +if (!params->serial) {
> > > +error_setg(errp, "nvme: serial not configured");
> > > +return 1;
> > >  }
> > >  
> > > -if (!n->params.serial) {
> > > -error_setg(errp, "serial property not set");
> > > -return;
> > > +if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> > > +error_setg(errp, "nvme: invalid queue configuration");
> > 
> > Maybe something like "nvme: invalid queue count specified, should be 
> > between 1 and ..."?
> > > +return 1;
> > >  }
> 
> Fixed.
Thanks
> 
> > > +
> > > +return 0;
> > > +}
> > > +
> > > +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> > > +{
> > >  blkconf_blocksizes(&n->conf);
> > >  if (!blkconf_apply_backend_options(&n->conf, 
> > > blk_is_read_only(n->conf.blk),
> > > -   false, errp)) {
> > > -return;
> > > +false, errp)) {
> > > +return 1;
> > >  }
> > >  
> > > -pci_conf = pci_dev->config;
> > > -pci_conf[PCI_INTERRUPT_PIN] = 1;
> > > -pci_config_set_prog_interface(pci_dev->config, 0x2);
> > > -pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> > > -pcie_endpoint_cap_init(pci_dev, 0x80);
> > > +return 0;
> > > +}
> > >  
> > > +static void nvme_init_state(NvmeCtrl *n)
> > > +{
> > >  n->num_namespaces = 1;
> > >  n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
> > 
> > Isn't that wrong?
> > First 4K of mmio (0x1000) is the registers, and that is followed by the 
> > doorbells,
> > and each doorbell takes 8 bytes (assuming regular doorbell stride).
> > so n->params.num_queues + 1 should be total number of queues, thus the 
> > 0x1004 should be 0x1000 IMHO.
> > I might miss some rounding magic here though.
> > 
> 
> Yeah. I think you are right. It all becomes slightly more fishy due to
> the num_queues device parameter being 1's based and accounts for the
> admin queue pair.
> 
> But in get/set features, the value has to be 0's based and only account
> for the I/O queues, so we need to subtract 2 from the value. It's
> confusing all around.
Yea, I can't agree more on that. The zero based values had bitten
me few times while I developed nvme-mdev as well.

> 
> Since the admin queue pair isn't really optional I think it would be
> better that we introduces a new max_ioqpairs parameter that is 1's
> based, counts number of pairs and obviously only accounts for the io
> queues.
> 
> I guess we need to keep the num_queues parameter around for
> compatibility.
> 
> The doorbells are only 4 bytes btw, but the calculation still looks
I don't understand that. Each doorbell is indeed 4 bytes, but they come
in pairs so each doorbell pair is 8 bytes.

BTW, the spec has a so-called doorbell stride, which allows artificially 
increasing
each doorbell by a power of two. This was intended for software 

Re: [PATCH v5 08/26] nvme: refactor device realization

2020-03-16 Thread Klaus Birkelund Jensen
On Feb 12 11:27, Maxim Levitsky wrote:
> On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > This patch splits up nvme_realize into multiple individual functions,
> > each initializing a different subset of the device.
> > 
> > Signed-off-by: Klaus Jensen 
> > ---
> >  hw/block/nvme.c | 175 +++-
> >  hw/block/nvme.h |  21 ++
> >  2 files changed, 133 insertions(+), 63 deletions(-)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index e1810260d40b..81514eaef63a 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -44,6 +44,7 @@
> >  #include "nvme.h"
> >  
> >  #define NVME_SPEC_VER 0x00010201
> > +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> >  
> >  #define NVME_GUEST_ERR(trace, fmt, ...) \
> >  do { \
> > @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
> >  },
> >  };
> >  
> > -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> > +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
> >  {
> > -NvmeCtrl *n = NVME(pci_dev);
> > -NvmeIdCtrl *id = &n->id_ctrl;
> > -
> > -int i;
> > -int64_t bs_size;
> > -uint8_t *pci_conf;
> > -
> > -if (!n->params.num_queues) {
> > -error_setg(errp, "num_queues can't be zero");
> > -return;
> > -}
> > +NvmeParams *params = &n->params;
> >  
> >  if (!n->conf.blk) {
> > -error_setg(errp, "drive property not set");
> > -return;
> > +error_setg(errp, "nvme: block backend not configured");
> > +return 1;
> As a matter of taste, negative values indicate error, and 0 is the success 
> value.
> In Linux kernel this is even an official rule.
> >  }

Fixed.

> >  
> > -bs_size = blk_getlength(n->conf.blk);
> > -if (bs_size < 0) {
> > -error_setg(errp, "could not get backing file size");
> > -return;
> > +if (!params->serial) {
> > +error_setg(errp, "nvme: serial not configured");
> > +return 1;
> >  }
> >  
> > -if (!n->params.serial) {
> > -error_setg(errp, "serial property not set");
> > -return;
> > +if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> > +error_setg(errp, "nvme: invalid queue configuration");
> Maybe something like "nvme: invalid queue count specified, should be between 
> 1 and ..."?
> > +return 1;
> >  }

Fixed.

> > +
> > +return 0;
> > +}
> > +
> > +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> > +{
> >  blkconf_blocksizes(&n->conf);
> >  if (!blkconf_apply_backend_options(&n->conf, 
> > blk_is_read_only(n->conf.blk),
> > -   false, errp)) {
> > -return;
> > +false, errp)) {
> > +return 1;
> >  }
> >  
> > -pci_conf = pci_dev->config;
> > -pci_conf[PCI_INTERRUPT_PIN] = 1;
> > -pci_config_set_prog_interface(pci_dev->config, 0x2);
> > -pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> > -pcie_endpoint_cap_init(pci_dev, 0x80);
> > +return 0;
> > +}
> >  
> > +static void nvme_init_state(NvmeCtrl *n)
> > +{
> >  n->num_namespaces = 1;
> >  n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
> 
> Isn't that wrong?
> First 4K of mmio (0x1000) is the registers, and that is followed by the 
> doorbells,
> and each doorbell takes 8 bytes (assuming regular doorbell stride).
> so n->params.num_queues + 1 should be total number of queues, thus the 0x1004 
> should be 0x1000 IMHO.
> I might miss some rounding magic here though.
> 

Yeah. I think you are right. It all becomes slightly more fishy due to
the num_queues device parameter being 1's based and accounts for the
admin queue pair.

But in get/set features, the value has to be 0's based and only account
for the I/O queues, so we need to subtract 2 from the value. It's
confusing all around.

Since the admin queue pair isn't really optional I think it would be
better that we introduces a new max_ioqpairs parameter that is 1's
based, counts number of pairs and obviously only accounts for the io
queues.

I guess we need to keep the num_queues parameter around for
compatibility.

The doorbells are only 4 bytes btw, but the calculation still looks
wrong. With a max_ioqpairs parameter in place, the reg_size should be

pow2ceil(0x1008 + 2 * (n->params.max_ioqpairs) * 4)

Right? Thats 0x1000 for the core registers, 8 bytes for the sq/cq
doorbells for the admin queue pair, and then room for the i/o queue
pairs.

I added a patch for this in v6.

> > -n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> > -
> >  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> >  n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
> >  n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> > +}
> >  
> > -memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
> > -  "nvme", n->reg_size);
> > +static void nvme_init_cmb(NvmeCtrl *n, 

Re: [PATCH v5 08/26] nvme: refactor device realization

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> This patch splits up nvme_realize into multiple individual functions,
> each initializing a different subset of the device.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 175 +++-
>  hw/block/nvme.h |  21 ++
>  2 files changed, 133 insertions(+), 63 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e1810260d40b..81514eaef63a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -44,6 +44,7 @@
>  #include "nvme.h"
>  
>  #define NVME_SPEC_VER 0x00010201
> +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
>  
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
> @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
>  {
> -NvmeCtrl *n = NVME(pci_dev);
> -NvmeIdCtrl *id = &n->id_ctrl;
> -
> -int i;
> -int64_t bs_size;
> -uint8_t *pci_conf;
> -
> -if (!n->params.num_queues) {
> -error_setg(errp, "num_queues can't be zero");
> -return;
> -}
> +NvmeParams *params = &n->params;
>  
>  if (!n->conf.blk) {
> -error_setg(errp, "drive property not set");
> -return;
> +error_setg(errp, "nvme: block backend not configured");
> +return 1;
As a matter of taste, negative values indicate error, and 0 is the success 
value.
In Linux kernel this is even an official rule.
>  }
>  
> -bs_size = blk_getlength(n->conf.blk);
> -if (bs_size < 0) {
> -error_setg(errp, "could not get backing file size");
> -return;
> +if (!params->serial) {
> +error_setg(errp, "nvme: serial not configured");
> +return 1;
>  }
>  
> -if (!n->params.serial) {
> -error_setg(errp, "serial property not set");
> -return;
> +if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> +error_setg(errp, "nvme: invalid queue configuration");
Maybe something like "nvme: invalid queue count specified, should be between 1 
and ..."?
> +return 1;
>  }
> +
> +return 0;
> +}
> +
> +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> +{
>  blkconf_blocksizes(&n->conf);
>  if (!blkconf_apply_backend_options(&n->conf, 
> blk_is_read_only(n->conf.blk),
> -   false, errp)) {
> -return;
> +false, errp)) {
> +return 1;
>  }
>  
> -pci_conf = pci_dev->config;
> -pci_conf[PCI_INTERRUPT_PIN] = 1;
> -pci_config_set_prog_interface(pci_dev->config, 0x2);
> -pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> -pcie_endpoint_cap_init(pci_dev, 0x80);
> +return 0;
> +}
>  
> +static void nvme_init_state(NvmeCtrl *n)
> +{
>  n->num_namespaces = 1;
>  n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);

Isn't that wrong?
First 4K of mmio (0x1000) is the registers, and that is followed by the 
doorbells,
and each doorbell takes 8 bytes (assuming regular doorbell stride).
so n->params.num_queues + 1 should be total number of queues, thus the 0x1004 
should be 0x1000 IMHO.
I might miss some rounding magic here though.

> -n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> -
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>  n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
>  n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> +}
>  
> -memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
> -  "nvme", n->reg_size);
> +static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
It would be nice to have #define for CMB bar number
> +NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
> +
> +NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);
> +NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
> +
> +n->cmbloc = n->bar.cmbloc;
> +n->cmbsz = n->bar.cmbsz;
> +
> +n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> +memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
> +"nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> +pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
Same here although since you read it here from the controller register,
then maybe leave it as is. I prefer though for this kind of thing
to have a #define and use it everywhere. 

> +PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
> +PCI_BASE_ADDRESS_MEM_PREFETCH, >ctrl_mem);
> +}
> +
> +static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +uint8_t *pci_conf =