Jonathan Cameron wrote: > On Thu, 24 Nov 2022 10:34:52 -0800 > Dan Williams <dan.j.willi...@intel.com> wrote: > > > The three objects 'struct cxl_nvdimm_bridge', 'struct cxl_nvdimm', and > > 'struct cxl_pmem_region' manage CXL persistent memory resources. The > > bridge represents base platform resources, the nvdimm represents one or > > more endpoints, and the region is a collection of nvdimms that > > contribute to an assembled address range. > > > > Their relationship is such that a region is torn down if any component > > endpoints are removed. All regions and endpoints are torn down if the > > foundational bridge device goes down. > > > > A workqueue was deployed to manage these interdependencies, but it is > > difficult to reason about, and fragile. A recent attempt to take the CXL > > root device lock in the cxl_mem driver was reported by lockdep as > > colliding with the flush_work() in the cxl_pmem flows. > > > > Instead of the workqueue, arrange for all pmem/nvdimm devices to be torn > > down immediately and hierarchically. A similar change is made to both > > the 'cxl_nvdimm' and 'cxl_pmem_region' objects. For bisect-ability both > > changes are made in the same patch which unfortunately makes the patch > > bigger than desired. > > > > Arrange for cxl_memdev and cxl_region to register a cxl_nvdimm and > > cxl_pmem_region as a devres release action of the bridge device. > > Additionally, include a devres release action of the cxl_memdev or > > cxl_region device that triggers the bridge's release action if an endpoint > > exits before the bridge. I.e. this allows either unplugging the bridge, > > or unplugging and endpoint to result in the same cleanup actions. > > > > To keep the patch smaller the cleanup of the now defunct workqueue > > infrastructure is saved for a follow-on patch. > > > > Signed-off-by: Dan Williams <dan.j.willi...@intel.com> > > Hi Dan, > > This is fiddly to follow, but then so was the original.
Fiddly is a charitable comment for the original. This patch is larger than I would have liked, but I did not see a way to unwind the complexity without causing a bisect break. > A few minor comments inline. Thanks for taking a look. > > Jonathan > > > --- > > drivers/cxl/core/pmem.c | 70 ++++++++++++++++++++---- > > drivers/cxl/core/region.c | 54 ++++++++++++++++++- > > drivers/cxl/cxl.h | 7 ++ > > drivers/cxl/cxlmem.h | 4 + > > drivers/cxl/mem.c | 9 +++ > > drivers/cxl/pci.c | 3 - > > drivers/cxl/pmem.c | 122 > > ++++++++++++------------------------------ > > tools/testing/cxl/test/mem.c | 3 - > > 8 files changed, 164 insertions(+), 108 deletions(-) > > > > diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c > > index 1d12a8206444..647b3a30638e 100644 > > --- a/drivers/cxl/core/pmem.c > > +++ b/drivers/cxl/core/pmem.c > > @@ -219,7 +219,8 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL); > > > > static struct lock_class_key cxl_nvdimm_key; > > > > -static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd) > > +static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge > > *cxl_nvb, > > + struct cxl_memdev *cxlmd) > > { > > struct cxl_nvdimm *cxl_nvd; > > struct device *dev; > > @@ -230,6 +231,7 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct > > cxl_memdev *cxlmd) > > > > dev = &cxl_nvd->dev; > > cxl_nvd->cxlmd = cxlmd; > > + cxlmd->cxl_nvd = cxl_nvd; > > device_initialize(dev); > > lockdep_set_class(&dev->mutex, &cxl_nvdimm_key); > > device_set_pm_not_required(dev); > > @@ -240,27 +242,52 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct > > cxl_memdev *cxlmd) > > return cxl_nvd; > > } > > > > -static void cxl_nvd_unregister(void *dev) > > +static void cxl_nvd_unregister(void *_cxl_nvd) > > { > > - device_unregister(dev); > > + struct cxl_nvdimm *cxl_nvd = _cxl_nvd; > > + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; > > + > > + device_lock_assert(&cxlmd->cxl_nvb->dev); > > Locally it's not immediately obvious if that is always the same > as > device_lock_assert(&cxl_nvb->dev); > If not, a comment, if it is maybe just change to that. Sure, yes, it is always the latter. > > > + cxl_nvd->cxlmd = NULL; > > + cxlmd->cxl_nvd = NULL; > > + device_unregister(&cxl_nvd->dev); > > +} > > + > > +static void cxlmd_release_nvdimm(void *_cxlmd) > > +{ > > + struct cxl_memdev *cxlmd = _cxlmd; > > + struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb; > > + > > + device_lock(&cxl_nvb->dev); > > + if (cxlmd->cxl_nvd) > > + devm_release_action(&cxl_nvb->dev, cxl_nvd_unregister, > > + cxlmd->cxl_nvd); > > + device_unlock(&cxl_nvb->dev); > > + put_device(&cxl_nvb->dev); > > } > > > > /** > > * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm > > - * @host: same host as @cxlmd > > * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations > > * > > * Return: 0 on success negative error code on failure. > > */ > > -int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd) > > +int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd) > > { > > + struct cxl_nvdimm_bridge *cxl_nvb = cxl_find_nvdimm_bridge(&cxlmd->dev); > > Another cosmetic change, but I'd prefer the actual > cxl_nvb = cxl_find_nvdimm_bridge(); > > to be just above the error check rather than up here. ok. > > > struct cxl_nvdimm *cxl_nvd; > > struct device *dev; > > int rc; > > > > - cxl_nvd = cxl_nvdimm_alloc(cxlmd); > > - if (IS_ERR(cxl_nvd)) > > - return PTR_ERR(cxl_nvd); > > + if (!cxl_nvb) > > + return -ENODEV; > > + > > + cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd); > > + if (IS_ERR(cxl_nvd)) { > > + rc = PTR_ERR(cxl_nvd); > > + goto err_alloc; > > + } > > + cxlmd->cxl_nvb = cxl_nvb; > > > > dev = &cxl_nvd->dev; > > rc = dev_set_name(dev, "pmem%d", cxlmd->id); > > @@ -271,13 +298,34 @@ int devm_cxl_add_nvdimm(struct device *host, struct > > cxl_memdev *cxlmd) > > if (rc) > > goto err; > > > > - dev_dbg(host, "%s: register %s\n", dev_name(dev->parent), > > - dev_name(dev)); > > + dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev)); > > > > - return devm_add_action_or_reset(host, cxl_nvd_unregister, dev); > > + /* > > + * Remove this nvdimm connection if either the top-level PMEM > > + * bridge goes down, or the endpoint device goes through > > + * ->remove(). > > + */ > > Perhaps move this comment down to inside the if (cxl_nvb->dev.driver) > block as it only refers (I think) to the devm_add_action_or_reset(), > not the surrounding driver binding checks. It refers to the whole rest of this function because this action: devm_add_action_or_reset(&cxl_nvb->dev, cxl_nvd_unregister, cxl_nvd) ...deletes the cxl_nvdimm when the bridge is removed, and this action: devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, cxlmd) ...removes the nvdimm, and the above release action, if the endpoint is removed *before* the bridge. At a minimum the comment needs strengthening if that detail was not conveyed. > > > + device_lock(&cxl_nvb->dev); > > + if (cxl_nvb->dev.driver) > > + rc = devm_add_action_or_reset(&cxl_nvb->dev, cxl_nvd_unregister, > > + cxl_nvd); > > + else > > + rc = -ENXIO; > > + device_unlock(&cxl_nvb->dev); > > + > > + if (rc) > > + goto err_alloc; > > + > > + /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */ > > + return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, > > cxlmd); > > > > err: > > put_device(dev); > > +err_alloc: > > + put_device(&cxl_nvb->dev); > > Is this ordering necessary? It's not reverse of the setup above, so if we can > reordering > to be so, that is probably a good thing. (move these NULL setting above the > put_device(&cxl_nvb->dev)). Sure, they can move above. > > > + cxlmd->cxl_nvb = NULL; > > + cxlmd->cxl_nvd = NULL; > > + > > return rc; > > } > > EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, CXL); > > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > > index f9ae5ad284ff..e73bec828032 100644 > > --- a/drivers/cxl/core/region.c > > +++ b/drivers/cxl/core/region.c > > @@ -1812,6 +1812,7 @@ static struct lock_class_key cxl_pmem_region_key; > > static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region > > *cxlr) > > { > > struct cxl_region_params *p = &cxlr->params; > > + struct cxl_nvdimm_bridge *cxl_nvb; > > struct cxl_pmem_region *cxlr_pmem; > > struct device *dev; > > int i; > > @@ -1839,6 +1840,14 @@ static struct cxl_pmem_region > > *cxl_pmem_region_alloc(struct cxl_region *cxlr) > > struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > > struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; > > > > + if (i == 0) { > > Whilst kind of obvious, maybe a comment in here that for end points in the > region the > cxl_nvb will be the same hence we just look it up for the first one? ok. > > > + cxl_nvb = cxl_find_nvdimm_bridge(&cxlmd->dev); > > + if (!cxl_nvb) { > > + cxlr_pmem = ERR_PTR(-ENODEV); > > + goto out; > > + } > > + cxlr->cxl_nvb = cxl_nvb; > > + } > > m->cxlmd = cxlmd; > > get_device(&cxlmd->dev); > > m->start = cxled->dpa_res->start; > > @@ -1848,6 +1857,7 @@ static struct cxl_pmem_region > > *cxl_pmem_region_alloc(struct cxl_region *cxlr) > > > > dev = &cxlr_pmem->dev; > > cxlr_pmem->cxlr = cxlr; > > + cxlr->cxlr_pmem = cxlr_pmem; > > device_initialize(dev); > > lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); > > device_set_pm_not_required(dev); > > @@ -1860,9 +1870,30 @@ static struct cxl_pmem_region > > *cxl_pmem_region_alloc(struct cxl_region *cxlr) > > return cxlr_pmem; > > } > > > > -static void cxlr_pmem_unregister(void *dev) > > +static void cxlr_pmem_unregister(void *_cxlr_pmem) > > +{ > > + struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem; > > + struct cxl_region *cxlr = cxlr_pmem->cxlr; > > + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; > > + > > + device_lock_assert(&cxl_nvb->dev); > > This scheme is obvious in this patch, but probably less so when just > looking at the resulting code. Perhaps worth a comment > here on why we care about that particular lock? ok, and I'll make a similar comment in cxl_nvd_unregister. > > > + cxlr->cxlr_pmem = NULL; > > + cxlr_pmem->cxlr = NULL; > > + device_unregister(&cxlr_pmem->dev); > > +} > > + > > +static void cxlr_release_nvdimm(void *_cxlr) > > { > > - device_unregister(dev); > > + struct cxl_region *cxlr = _cxlr; > > + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; > > + > > + device_lock(&cxl_nvb->dev); > > + if (cxlr->cxlr_pmem) > > + devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, > > + cxlr->cxlr_pmem); > > + device_unlock(&cxl_nvb->dev); > > + cxlr->cxl_nvb = NULL; > > + put_device(&cxl_nvb->dev); > > } > > > > /** > > @@ -1874,12 +1905,14 @@ static void cxlr_pmem_unregister(void *dev) > > static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > > { > > struct cxl_pmem_region *cxlr_pmem; > > + struct cxl_nvdimm_bridge *cxl_nvb; > > struct device *dev; > > int rc; > > > > cxlr_pmem = cxl_pmem_region_alloc(cxlr); > > if (IS_ERR(cxlr_pmem)) > > return PTR_ERR(cxlr_pmem); > > + cxl_nvb = cxlr->cxl_nvb; > > > > dev = &cxlr_pmem->dev; > > rc = dev_set_name(dev, "pmem_region%d", cxlr->id); > > @@ -1893,10 +1926,25 @@ static int devm_cxl_add_pmem_region(struct > > cxl_region *cxlr) > > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > > dev_name(dev)); > > > > - return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev); > > + device_lock(&cxl_nvb->dev); > > + if (cxl_nvb->dev.driver) > > + rc = devm_add_action_or_reset(&cxl_nvb->dev, > > + cxlr_pmem_unregister, cxlr_pmem); > > + else > > + rc = -ENXIO; > > + device_unlock(&cxl_nvb->dev); > > + > > + if (rc) > > + goto err_bridge; > > + > > + /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */ > > + return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr); > > > > err: > > put_device(dev); > > +err_bridge: > > + put_device(&cxl_nvb->dev); > > + cxlr->cxl_nvb = NULL; > > return rc; > > } > > > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > > index 4ac7938eaf6c..9b5ba9626636 100644 > > --- a/drivers/cxl/cxl.h > > +++ b/drivers/cxl/cxl.h > > @@ -386,6 +386,8 @@ struct cxl_region_params { > > * @id: This region's id. Id is globally unique across all regions > > * @mode: Endpoint decoder allocation / access mode > > * @type: Endpoint decoder target type > > + * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem shutdown > > I was going to suggest just carrying the struct device around, but this > comment isn't quite true. I think cxl_region->cxl_nvb is also used in > cxl_pmem_region_probe() to get the nvdimm_buss for nvdimm_pmem_region_create() Perhaps just make the comment less specific and say "nvdimm bridge for coordinating @cxlr_pmem setup and shutdown". > > > > + * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge > > * @params: active + config params for the region > > */ > > struct cxl_region { > > @@ -393,6 +395,8 @@ struct cxl_region { > > int id; > > enum cxl_decoder_mode mode; > > enum cxl_decoder_type type; > > + struct cxl_nvdimm_bridge *cxl_nvb; > > + struct cxl_pmem_region *cxlr_pmem; > > struct cxl_region_params params; > > }; > > > > @@ -438,7 +442,6 @@ struct cxl_pmem_region { > > struct device dev; > > struct cxl_region *cxlr; > > struct nd_region *nd_region; > > - struct cxl_nvdimm_bridge *bridge; > > struct range hpa_range; > > int nr_mappings; > > struct cxl_pmem_region_mapping mapping[]; > > @@ -637,7 +640,7 @@ struct cxl_nvdimm_bridge > > *devm_cxl_add_nvdimm_bridge(struct device *host, > > struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); > > bool is_cxl_nvdimm(struct device *dev); > > bool is_cxl_nvdimm_bridge(struct device *dev); > > -int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd); > > +int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd); > > struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev); > > > > #ifdef CONFIG_CXL_REGION > > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > > index 88e3a8e54b6a..c1c9960ab05f 100644 > > --- a/drivers/cxl/cxlmem.h > > +++ b/drivers/cxl/cxlmem.h > > @@ -35,6 +35,8 @@ > > * @cdev: char dev core object for ioctl operations > > * @cxlds: The device state backing this device > > * @detach_work: active memdev lost a port in its ancestry > > + * @cxl_nvb: coordinate removal of @cxl_nvd if present > > + * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem > > * @id: id number of this memdev instance. > > */ > > struct cxl_memdev { > > @@ -42,6 +44,8 @@ struct cxl_memdev { > > struct cdev cdev; > > struct cxl_dev_state *cxlds; > > struct work_struct detach_work; > > + struct cxl_nvdimm_bridge *cxl_nvb; > > + struct cxl_nvdimm *cxl_nvd; > > int id; > > }; > > > > > diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c > > index 652f00fc68ca..73357d0c3f25 100644 > > --- a/drivers/cxl/pmem.c > > +++ b/drivers/cxl/pmem.c > > > > static struct cxl_driver cxl_nvdimm_driver = { > > @@ -200,6 +182,16 @@ static int cxl_pmem_ctl(struct nvdimm_bus_descriptor > > *nd_desc, > > return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len); > > } > > > > +static void unregister_nvdimm_bus(void *_cxl_nvb) > > +{ > > + struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb; > > + struct nvdimm_bus *nvdimm_bus = cxl_nvb->nvdimm_bus; > > + > > + cxl_nvb->nvdimm_bus = NULL; > > + nvdimm_bus_unregister(nvdimm_bus); > > +} > > + > > Single blank line. > > > + > > static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb) > > { > > if (cxl_nvb->nvdimm_bus) > > @@ -303,23 +295,21 @@ static int cxl_nvdimm_bridge_probe(struct device *dev) > > { > > struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev); > > > > - if (cxl_nvb->state == CXL_NVB_DEAD) > > - return -ENXIO; > > + cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor){ > ) { > matches existing style in this file. clang-format removes that space, will add it back. > > > + .provider_name = "CXL", > > + .module = THIS_MODULE, > > + .ndctl = cxl_pmem_ctl, > > + }; > > > > - if (cxl_nvb->state == CXL_NVB_NEW) { > > - cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) { > > - .provider_name = "CXL", > > - .module = THIS_MODULE, > > - .ndctl = cxl_pmem_ctl, > > - }; > > + cxl_nvb->nvdimm_bus = > > + nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc); > > > > - INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state); > > - } > > + if (!cxl_nvb->nvdimm_bus) > > + return -ENOMEM; > > > > - cxl_nvb->state = CXL_NVB_ONLINE; > > - cxl_nvdimm_bridge_state_work(cxl_nvb); > > + INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state); > > > > - return 0; > > + return devm_add_action_or_reset(dev, unregister_nvdimm_bus, cxl_nvb);