Re: [PATCH v5 08/26] nvme: refactor device realization
On Mon, 2020-03-16 at 00:43 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 11:27, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > This patch splits up nvme_realize into multiple individual functions,
> > > each initializing a different subset of the device.
> > >
> > > Signed-off-by: Klaus Jensen
> > > ---
> > >  hw/block/nvme.c | 175 +++-
> > >  hw/block/nvme.h |  21 ++
> > >  2 files changed, 133 insertions(+), 63 deletions(-)
> > >
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index e1810260d40b..81514eaef63a 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -44,6 +44,7 @@
> > >  #include "nvme.h"
> > >
> > >  #define NVME_SPEC_VER 0x00010201
> > > +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> > >
> > >  #define NVME_GUEST_ERR(trace, fmt, ...) \
> > >      do { \
> > > @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
> > >      },
> > >  };
> > >
> > > -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> > > +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
> > >  {
> > > -    NvmeCtrl *n = NVME(pci_dev);
> > > -    NvmeIdCtrl *id = &n->id_ctrl;
> > > -
> > > -    int i;
> > > -    int64_t bs_size;
> > > -    uint8_t *pci_conf;
> > > -
> > > -    if (!n->params.num_queues) {
> > > -        error_setg(errp, "num_queues can't be zero");
> > > -        return;
> > > -    }
> > > +    NvmeParams *params = &n->params;
> > >
> > >      if (!n->conf.blk) {
> > > -        error_setg(errp, "drive property not set");
> > > -        return;
> > > +        error_setg(errp, "nvme: block backend not configured");
> > > +        return 1;
> >
> > As a matter of taste, negative values indicate error, and 0 is the
> > success value. In the Linux kernel this is even an official rule.
> >
> > >      }
>
> Fixed.
>
> > >
> > > -    bs_size = blk_getlength(n->conf.blk);
> > > -    if (bs_size < 0) {
> > > -        error_setg(errp, "could not get backing file size");
> > > -        return;
> > > +    if (!params->serial) {
> > > +        error_setg(errp, "nvme: serial not configured");
> > > +        return 1;
> > >      }
> > >
> > > -    if (!n->params.serial) {
> > > -        error_setg(errp, "serial property not set");
> > > -        return;
> > > +    if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> > > +        error_setg(errp, "nvme: invalid queue configuration");
> >
> > Maybe something like "nvme: invalid queue count specified, should be
> > between 1 and ..."?
> >
> > > +        return 1;
> > >      }
>
> Fixed. Thanks.
>
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> > > +{
> > >      blkconf_blocksizes(&n->conf);
> > >      if (!blkconf_apply_backend_options(&n->conf,
> > >                                         blk_is_read_only(n->conf.blk),
> > > -                                       false, errp)) {
> > > -        return;
> > > +        false, errp)) {
> > > +        return 1;
> > >      }
> > >
> > > -    pci_conf = pci_dev->config;
> > > -    pci_conf[PCI_INTERRUPT_PIN] = 1;
> > > -    pci_config_set_prog_interface(pci_dev->config, 0x2);
> > > -    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> > > -    pcie_endpoint_cap_init(pci_dev, 0x80);
> > > +    return 0;
> > > +}
> > >
> > > +static void nvme_init_state(NvmeCtrl *n)
> > > +{
> > >      n->num_namespaces = 1;
> > >      n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
> >
> > Isn't that wrong?
> > First 4K of mmio (0x1000) is the registers, and that is followed by
> > the doorbells, and each doorbell takes 8 bytes (assuming regular
> > doorbell stride). So n->params.num_queues + 1 should be the total
> > number of queues, thus the 0x1004 should be 0x1000 IMHO.
> > I might be missing some rounding magic here though.
>
> Yeah. I think you are right.
>
> It all becomes slightly more fishy due to the num_queues device
> parameter being 1's based and accounting for the admin queue pair.
>
> But in get/set features, the value has to be 0's based and only
> account for the I/O queues, so we need to subtract 2 from the value.
> It's confusing all around.

Yeah, I can't agree more on that. The zero-based values had bitten me a
few times while I developed nvme-mdev as well.

> Since the admin queue pair isn't really optional, I think it would be
> better to introduce a new max_ioqpairs parameter that is 1's based,
> counts the number of pairs, and obviously only accounts for the I/O
> queues.
>
> I guess we need to keep the num_queues parameter around for
> compatibility.
>
> The doorbells are only 4 bytes btw, but the calculation still looks
> wrong.

I don't understand that. Each doorbell is indeed 4 bytes, but they come
in pairs, so each doorbell pair is 8 bytes.

BTW, the spec has a so-called doorbell stride, which allows the distance
between doorbells to be artificially increased by a power of two. This
was intended for software
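For reference, the doorbell locations the register map in the spec
mandates are (a quick sketch; dstrd is the CAP.DSTRD field, and the
helper names are mine, not something from the tree):

    /*
     * SQ y tail doorbell lives at 0x1000 + (2y) * (4 << CAP.DSTRD),
     * CQ y head doorbell at 0x1000 + (2y + 1) * (4 << CAP.DSTRD).
     * With the default dstrd of 0, each doorbell is 4 bytes, so the
     * two doorbells of a queue pair occupy 8 bytes together.
     */
    static inline hwaddr nvme_sq_tail_doorbell(uint32_t qid, uint32_t dstrd)
    {
        return 0x1000 + (2 * qid) * (4 << dstrd);
    }

    static inline hwaddr nvme_cq_head_doorbell(uint32_t qid, uint32_t dstrd)
    {
        return 0x1000 + (2 * qid + 1) * (4 << dstrd);
    }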
Re: [PATCH v5 08/26] nvme: refactor device realization
On Feb 12 11:27, Maxim Levitsky wrote:
> On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > This patch splits up nvme_realize into multiple individual functions,
> > each initializing a different subset of the device.
> >
> > Signed-off-by: Klaus Jensen
> > ---
> >  hw/block/nvme.c | 175 +++-
> >  hw/block/nvme.h |  21 ++
> >  2 files changed, 133 insertions(+), 63 deletions(-)
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index e1810260d40b..81514eaef63a 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -44,6 +44,7 @@
> >  #include "nvme.h"
> >
> >  #define NVME_SPEC_VER 0x00010201
> > +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> >
> >  #define NVME_GUEST_ERR(trace, fmt, ...) \
> >      do { \
> > @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
> >      },
> >  };
> >
> > -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> > +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
> >  {
> > -    NvmeCtrl *n = NVME(pci_dev);
> > -    NvmeIdCtrl *id = &n->id_ctrl;
> > -
> > -    int i;
> > -    int64_t bs_size;
> > -    uint8_t *pci_conf;
> > -
> > -    if (!n->params.num_queues) {
> > -        error_setg(errp, "num_queues can't be zero");
> > -        return;
> > -    }
> > +    NvmeParams *params = &n->params;
> >
> >      if (!n->conf.blk) {
> > -        error_setg(errp, "drive property not set");
> > -        return;
> > +        error_setg(errp, "nvme: block backend not configured");
> > +        return 1;
>
> As a matter of taste, negative values indicate error, and 0 is the
> success value. In the Linux kernel this is even an official rule.
>
> >      }

Fixed.

> >
> > -    bs_size = blk_getlength(n->conf.blk);
> > -    if (bs_size < 0) {
> > -        error_setg(errp, "could not get backing file size");
> > -        return;
> > +    if (!params->serial) {
> > +        error_setg(errp, "nvme: serial not configured");
> > +        return 1;
> >      }
> >
> > -    if (!n->params.serial) {
> > -        error_setg(errp, "serial property not set");
> > -        return;
> > +    if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> > +        error_setg(errp, "nvme: invalid queue configuration");
>
> Maybe something like "nvme: invalid queue count specified, should be
> between 1 and ..."?
>
> > +        return 1;
> >      }

Fixed.

> > +
> > +    return 0;
> > +}
> > +
> > +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> > +{
> >      blkconf_blocksizes(&n->conf);
> >      if (!blkconf_apply_backend_options(&n->conf,
> >                                         blk_is_read_only(n->conf.blk),
> > -                                       false, errp)) {
> > -        return;
> > +        false, errp)) {
> > +        return 1;
> >      }
> >
> > -    pci_conf = pci_dev->config;
> > -    pci_conf[PCI_INTERRUPT_PIN] = 1;
> > -    pci_config_set_prog_interface(pci_dev->config, 0x2);
> > -    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> > -    pcie_endpoint_cap_init(pci_dev, 0x80);
> > +    return 0;
> > +}
> >
> > +static void nvme_init_state(NvmeCtrl *n)
> > +{
> >      n->num_namespaces = 1;
> >      n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
>
> Isn't that wrong?
> First 4K of mmio (0x1000) is the registers, and that is followed by
> the doorbells, and each doorbell takes 8 bytes (assuming regular
> doorbell stride). So n->params.num_queues + 1 should be the total
> number of queues, thus the 0x1004 should be 0x1000 IMHO.
> I might be missing some rounding magic here though.

Yeah. I think you are right.

It all becomes slightly more fishy due to the num_queues device
parameter being 1's based and accounting for the admin queue pair.

But in get/set features, the value has to be 0's based and only account
for the I/O queues, so we need to subtract 2 from the value (see the
sketch below). It's confusing all around.
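For reference, this is roughly how the Number of Queues feature handles
it today (a sketch of the existing get features code; the completion
dword carries the 0's based counts of I/O submission and completion
queues):

    case NVME_NUMBER_OF_QUEUES:
        /*
         * num_queues is a count that includes the admin queue pair,
         * so subtract 2: one to exclude the admin queue pair, and one
         * more because the reported value is 0's based.
         */
        result = cpu_to_le32((n->params.num_queues - 2) |
                             ((n->params.num_queues - 2) << 16));
        break;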
Since the admin queue pair isn't really optional, I think it would be
better to introduce a new max_ioqpairs parameter that is 1's based,
counts the number of pairs, and obviously only accounts for the I/O
queues.

I guess we need to keep the num_queues parameter around for
compatibility.

The doorbells are only 4 bytes btw, but the calculation still looks
wrong. With a max_ioqpairs parameter in place, the reg_size should be

    pow2ceil(0x1008 + 2 * (n->params.max_ioqpairs) * 4)

Right? That's 0x1000 for the core registers, 8 bytes for the sq/cq
doorbells of the admin queue pair, and then room for the I/O queue
pairs (a worked sketch follows at the end of this mail). I added a
patch for this in v6.

> > -    n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> > -
> >      n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> >      n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
> >      n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> > +}
> >
> > -    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
> > -                          "nvme", n->reg_size);
> > +static void nvme_init_cmb(NvmeCtrl *n,
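And the promised sketch of the layout math (illustrative only;
max_ioqpairs is the parameter proposed above and does not exist in the
tree yet):

    /*
     *   0x0000 .. 0x0fff  core controller registers (4 KiB)
     *   0x1000 .. 0x1007  admin SQ tail + admin CQ head doorbells
     *   0x1008 ..         one 8-byte doorbell pair per I/O queue pair
     *
     * pow2ceil() because a PCI BAR size must be a power of two.
     * E.g. max_ioqpairs = 64: 0x1008 + 2 * 64 * 4 = 0x1208, which
     * pow2ceil() rounds up to 0x2000 (an 8 KiB BAR).
     */
    n->reg_size = pow2ceil(0x1008 + 2 * n->params.max_ioqpairs * 4);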
Re: [PATCH v5 08/26] nvme: refactor device realization
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> This patch splits up nvme_realize into multiple individual functions,
> each initializing a different subset of the device.
>
> Signed-off-by: Klaus Jensen
> ---
>  hw/block/nvme.c | 175 +++-
>  hw/block/nvme.h |  21 ++
>  2 files changed, 133 insertions(+), 63 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e1810260d40b..81514eaef63a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -44,6 +44,7 @@
>  #include "nvme.h"
>
>  #define NVME_SPEC_VER 0x00010201
> +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
>
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>      do { \
> @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
>      },
>  };
>
> -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
>  {
> -    NvmeCtrl *n = NVME(pci_dev);
> -    NvmeIdCtrl *id = &n->id_ctrl;
> -
> -    int i;
> -    int64_t bs_size;
> -    uint8_t *pci_conf;
> -
> -    if (!n->params.num_queues) {
> -        error_setg(errp, "num_queues can't be zero");
> -        return;
> -    }
> +    NvmeParams *params = &n->params;
>
>      if (!n->conf.blk) {
> -        error_setg(errp, "drive property not set");
> -        return;
> +        error_setg(errp, "nvme: block backend not configured");
> +        return 1;

As a matter of taste, negative values indicate error, and 0 is the
success value. In the Linux kernel this is even an official rule.

>      }
>
> -    bs_size = blk_getlength(n->conf.blk);
> -    if (bs_size < 0) {
> -        error_setg(errp, "could not get backing file size");
> -        return;
> +    if (!params->serial) {
> +        error_setg(errp, "nvme: serial not configured");
> +        return 1;
>      }
>
> -    if (!n->params.serial) {
> -        error_setg(errp, "serial property not set");
> -        return;
> +    if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> +        error_setg(errp, "nvme: invalid queue configuration");

Maybe something like "nvme: invalid queue count specified, should be
between 1 and ..."?

> +        return 1;
>      }
> +
> +    return 0;
> +}
> +
> +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> +{
>      blkconf_blocksizes(&n->conf);
>      if (!blkconf_apply_backend_options(&n->conf,
>                                         blk_is_read_only(n->conf.blk),
> -                                       false, errp)) {
> -        return;
> +        false, errp)) {
> +        return 1;
>      }
>
> -    pci_conf = pci_dev->config;
> -    pci_conf[PCI_INTERRUPT_PIN] = 1;
> -    pci_config_set_prog_interface(pci_dev->config, 0x2);
> -    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> -    pcie_endpoint_cap_init(pci_dev, 0x80);
> +    return 0;
> +}
>
> +static void nvme_init_state(NvmeCtrl *n)
> +{
>      n->num_namespaces = 1;
>      n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);

Isn't that wrong?
First 4K of mmio (0x1000) is the registers, and that is followed by the
doorbells, and each doorbell takes 8 bytes (assuming regular doorbell
stride). So n->params.num_queues + 1 should be the total number of
queues, thus the 0x1004 should be 0x1000 IMHO.
I might be missing some rounding magic here though.
> -    n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> -
>      n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>      n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
>      n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> +}
>
> -    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
> -                          "nvme", n->reg_size);
> +static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);

It would be nice to have a #define for the CMB BAR number (see the
sketch at the end of this mail).

> +    NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
> +
> +    NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
> +    NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> +    NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> +    NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
> +    NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
> +    NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);
> +    NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
> +
> +    n->cmbloc = n->bar.cmbloc;
> +    n->cmbsz = n->bar.cmbsz;
> +
> +    n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> +    memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
> +                          "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> +    pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),

Same here, although since you read it back from the controller register
here, maybe leave this one as is. For this kind of thing I prefer to
have a #define and use it everywhere though.

> +        PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
> +        PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
> +}
> +
> +static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +    uint8_t *pci_conf =
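Something along these lines is what I mean (NVME_CMB_BIR is a made-up
name, just to illustrate; it does not exist in the tree):

    #define NVME_CMB_BIR 2

    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);

    /* ... and later, when the BAR is registered: */
    pci_register_bar(pci_dev, NVME_CMB_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);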
[PATCH v5 08/26] nvme: refactor device realization
This patch splits up nvme_realize into multiple individual functions,
each initializing a different subset of the device.

Signed-off-by: Klaus Jensen
---
 hw/block/nvme.c | 175 +++-
 hw/block/nvme.h |  21 ++
 2 files changed, 133 insertions(+), 63 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index e1810260d40b..81514eaef63a 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -44,6 +44,7 @@
 #include "nvme.h"
 
 #define NVME_SPEC_VER 0x00010201
+#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
 
 #define NVME_GUEST_ERR(trace, fmt, ...) \
     do { \
@@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
     },
 };
 
-static void nvme_realize(PCIDevice *pci_dev, Error **errp)
+static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
 {
-    NvmeCtrl *n = NVME(pci_dev);
-    NvmeIdCtrl *id = &n->id_ctrl;
-
-    int i;
-    int64_t bs_size;
-    uint8_t *pci_conf;
-
-    if (!n->params.num_queues) {
-        error_setg(errp, "num_queues can't be zero");
-        return;
-    }
+    NvmeParams *params = &n->params;
 
     if (!n->conf.blk) {
-        error_setg(errp, "drive property not set");
-        return;
+        error_setg(errp, "nvme: block backend not configured");
+        return 1;
     }
 
-    bs_size = blk_getlength(n->conf.blk);
-    if (bs_size < 0) {
-        error_setg(errp, "could not get backing file size");
-        return;
+    if (!params->serial) {
+        error_setg(errp, "nvme: serial not configured");
+        return 1;
     }
 
-    if (!n->params.serial) {
-        error_setg(errp, "serial property not set");
-        return;
+    if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
+        error_setg(errp, "nvme: invalid queue configuration");
+        return 1;
     }
+
+    return 0;
+}
+
+static int nvme_init_blk(NvmeCtrl *n, Error **errp)
+{
     blkconf_blocksizes(&n->conf);
     if (!blkconf_apply_backend_options(&n->conf,
                                        blk_is_read_only(n->conf.blk),
-                                       false, errp)) {
-        return;
+        false, errp)) {
+        return 1;
     }
 
-    pci_conf = pci_dev->config;
-    pci_conf[PCI_INTERRUPT_PIN] = 1;
-    pci_config_set_prog_interface(pci_dev->config, 0x2);
-    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
-    pcie_endpoint_cap_init(pci_dev, 0x80);
+    return 0;
+}
 
+static void nvme_init_state(NvmeCtrl *n)
+{
     n->num_namespaces = 1;
     n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
-    n->ns_size = bs_size / (uint64_t)n->num_namespaces;
-
     n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
     n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
     n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
+}
 
-    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
-                          "nvme", n->reg_size);
+static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
+{
+    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
+    NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
+
+    NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
+    NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
+    NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
+    NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);
+    NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
+
+    n->cmbloc = n->bar.cmbloc;
+    n->cmbsz = n->bar.cmbsz;
+
+    n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+    memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
+                          "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
+    pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
+        PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
+        PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
+}
+
+static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev)
+{
+    uint8_t *pci_conf = pci_dev->config;
+
+    pci_conf[PCI_INTERRUPT_PIN] = 1;
+    pci_config_set_prog_interface(pci_conf, 0x2);
+    pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
+    pci_config_set_device_id(pci_conf, 0x5845);
+    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
+    pcie_endpoint_cap_init(pci_dev, 0x80);
+
+    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
+                          n->reg_size);
     pci_register_bar(pci_dev, 0,
         PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
         &n->iomem);
     msix_init_exclusive_bar(pci_dev, n->params.num_queues, 4, NULL);
 
+    if (n->params.cmb_size_mb) {
+        nvme_init_cmb(n, pci_dev);
+    }
+}
+
+static void nvme_init_ctrl(NvmeCtrl *n)
+{
+    NvmeIdCtrl *id = &n->id_ctrl;
+    NvmeParams *params = &n->params;
+    uint8_t *pci_conf = n->parent_obj.config;
+
     id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
     id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
     strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
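For orientation, the new helpers end up being wired into nvme_realize
roughly like this (a sketch only, not the literal v5 hunk; the error
handling details may differ in the actual patch):

    static void nvme_realize(PCIDevice *pci_dev, Error **errp)
    {
        NvmeCtrl *n = NVME(pci_dev);

        if (nvme_check_constraints(n, errp)) {
            return;
        }

        nvme_init_state(n);

        if (nvme_init_blk(n, errp)) {
            return;
        }

        nvme_init_pci(n, pci_dev);
        nvme_init_ctrl(n);

        /* namespace setup follows in the truncated part of the patch */
    }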