On Thu, 24 Nov 2022 10:34:52 -0800 Dan Williams <dan.j.willi...@intel.com> wrote:
> The three objects 'struct cxl_nvdimm_bridge', 'struct cxl_nvdimm', and > 'struct cxl_pmem_region' manage CXL persistent memory resources. The > bridge represents base platform resources, the nvdimm represents one or > more endpoints, and the region is a collection of nvdimms that > contribute to an assembled address range. > > Their relationship is such that a region is torn down if any component > endpoints are removed. All regions and endpoints are torn down if the > foundational bridge device goes down. > > A workqueue was deployed to manage these interdependencies, but it is > difficult to reason about, and fragile. A recent attempt to take the CXL > root device lock in the cxl_mem driver was reported by lockdep as > colliding with the flush_work() in the cxl_pmem flows. > > Instead of the workqueue, arrange for all pmem/nvdimm devices to be torn > down immediately and hierarchically. A similar change is made to both > the 'cxl_nvdimm' and 'cxl_pmem_region' objects. For bisect-ability both > changes are made in the same patch which unfortunately makes the patch > bigger than desired. > > Arrange for cxl_memdev and cxl_region to register a cxl_nvdimm and > cxl_pmem_region as a devres release action of the bridge device. > Additionally, include a devres release action of the cxl_memdev or > cxl_region device that triggers the bridge's release action if an endpoint > exits before the bridge. I.e. this allows either unplugging the bridge, > or unplugging and endpoint to result in the same cleanup actions. > > To keep the patch smaller the cleanup of the now defunct workqueue > infrastructure is saved for a follow-on patch. > > Signed-off-by: Dan Williams <dan.j.willi...@intel.com> Hi Dan, This is fiddly to follow, but then so was the original. A few minor comments inline. Jonathan > --- > drivers/cxl/core/pmem.c | 70 ++++++++++++++++++++---- > drivers/cxl/core/region.c | 54 ++++++++++++++++++- > drivers/cxl/cxl.h | 7 ++ > drivers/cxl/cxlmem.h | 4 + > drivers/cxl/mem.c | 9 +++ > drivers/cxl/pci.c | 3 - > drivers/cxl/pmem.c | 122 > ++++++++++++------------------------------ > tools/testing/cxl/test/mem.c | 3 - > 8 files changed, 164 insertions(+), 108 deletions(-) > > diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c > index 1d12a8206444..647b3a30638e 100644 > --- a/drivers/cxl/core/pmem.c > +++ b/drivers/cxl/core/pmem.c > @@ -219,7 +219,8 @@ EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL); > > static struct lock_class_key cxl_nvdimm_key; > > -static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd) > +static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_nvdimm_bridge *cxl_nvb, > + struct cxl_memdev *cxlmd) > { > struct cxl_nvdimm *cxl_nvd; > struct device *dev; > @@ -230,6 +231,7 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct > cxl_memdev *cxlmd) > > dev = &cxl_nvd->dev; > cxl_nvd->cxlmd = cxlmd; > + cxlmd->cxl_nvd = cxl_nvd; > device_initialize(dev); > lockdep_set_class(&dev->mutex, &cxl_nvdimm_key); > device_set_pm_not_required(dev); > @@ -240,27 +242,52 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct > cxl_memdev *cxlmd) > return cxl_nvd; > } > > -static void cxl_nvd_unregister(void *dev) > +static void cxl_nvd_unregister(void *_cxl_nvd) > { > - device_unregister(dev); > + struct cxl_nvdimm *cxl_nvd = _cxl_nvd; > + struct cxl_memdev *cxlmd = cxl_nvd->cxlmd; > + > + device_lock_assert(&cxlmd->cxl_nvb->dev); Locally it's not immediately obvious if that is always the same as device_lock_assert(&cxl_nvb->dev); If not, a comment, if it is maybe just change to that. > + cxl_nvd->cxlmd = NULL; > + cxlmd->cxl_nvd = NULL; > + device_unregister(&cxl_nvd->dev); > +} > + > +static void cxlmd_release_nvdimm(void *_cxlmd) > +{ > + struct cxl_memdev *cxlmd = _cxlmd; > + struct cxl_nvdimm_bridge *cxl_nvb = cxlmd->cxl_nvb; > + > + device_lock(&cxl_nvb->dev); > + if (cxlmd->cxl_nvd) > + devm_release_action(&cxl_nvb->dev, cxl_nvd_unregister, > + cxlmd->cxl_nvd); > + device_unlock(&cxl_nvb->dev); > + put_device(&cxl_nvb->dev); > } > > /** > * devm_cxl_add_nvdimm() - add a bridge between a cxl_memdev and an nvdimm > - * @host: same host as @cxlmd > * @cxlmd: cxl_memdev instance that will perform LIBNVDIMM operations > * > * Return: 0 on success negative error code on failure. > */ > -int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd) > +int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd) > { > + struct cxl_nvdimm_bridge *cxl_nvb = cxl_find_nvdimm_bridge(&cxlmd->dev); Another cosmetic change, but I'd prefer the actual cxl_nvb = cxl_find_nvdimm_bridge(); to be just above the error check rather than up here. > struct cxl_nvdimm *cxl_nvd; > struct device *dev; > int rc; > > - cxl_nvd = cxl_nvdimm_alloc(cxlmd); > - if (IS_ERR(cxl_nvd)) > - return PTR_ERR(cxl_nvd); > + if (!cxl_nvb) > + return -ENODEV; > + > + cxl_nvd = cxl_nvdimm_alloc(cxl_nvb, cxlmd); > + if (IS_ERR(cxl_nvd)) { > + rc = PTR_ERR(cxl_nvd); > + goto err_alloc; > + } > + cxlmd->cxl_nvb = cxl_nvb; > > dev = &cxl_nvd->dev; > rc = dev_set_name(dev, "pmem%d", cxlmd->id); > @@ -271,13 +298,34 @@ int devm_cxl_add_nvdimm(struct device *host, struct > cxl_memdev *cxlmd) > if (rc) > goto err; > > - dev_dbg(host, "%s: register %s\n", dev_name(dev->parent), > - dev_name(dev)); > + dev_dbg(&cxlmd->dev, "register %s\n", dev_name(dev)); > > - return devm_add_action_or_reset(host, cxl_nvd_unregister, dev); > + /* > + * Remove this nvdimm connection if either the top-level PMEM > + * bridge goes down, or the endpoint device goes through > + * ->remove(). > + */ Perhaps move this comment down to inside the if (cxl_nvb->dev.driver) block as it only refers (I think) to the devm_add_action_or_reset(), not the surrounding driver binding checks. > + device_lock(&cxl_nvb->dev); > + if (cxl_nvb->dev.driver) > + rc = devm_add_action_or_reset(&cxl_nvb->dev, cxl_nvd_unregister, > + cxl_nvd); > + else > + rc = -ENXIO; > + device_unlock(&cxl_nvb->dev); > + > + if (rc) > + goto err_alloc; > + > + /* @cxlmd carries a reference on @cxl_nvb until cxlmd_release_nvdimm */ > + return devm_add_action_or_reset(&cxlmd->dev, cxlmd_release_nvdimm, > cxlmd); > > err: > put_device(dev); > +err_alloc: > + put_device(&cxl_nvb->dev); Is this ordering necessary? It's not reverse of the setup above, so if we can reordering to be so, that is probably a good thing. (move these NULL setting above the put_device(&cxl_nvb->dev)). > + cxlmd->cxl_nvb = NULL; > + cxlmd->cxl_nvd = NULL; > + > return rc; > } > EXPORT_SYMBOL_NS_GPL(devm_cxl_add_nvdimm, CXL); > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c > index f9ae5ad284ff..e73bec828032 100644 > --- a/drivers/cxl/core/region.c > +++ b/drivers/cxl/core/region.c > @@ -1812,6 +1812,7 @@ static struct lock_class_key cxl_pmem_region_key; > static struct cxl_pmem_region *cxl_pmem_region_alloc(struct cxl_region *cxlr) > { > struct cxl_region_params *p = &cxlr->params; > + struct cxl_nvdimm_bridge *cxl_nvb; > struct cxl_pmem_region *cxlr_pmem; > struct device *dev; > int i; > @@ -1839,6 +1840,14 @@ static struct cxl_pmem_region > *cxl_pmem_region_alloc(struct cxl_region *cxlr) > struct cxl_memdev *cxlmd = cxled_to_memdev(cxled); > struct cxl_pmem_region_mapping *m = &cxlr_pmem->mapping[i]; > > + if (i == 0) { Whilst kind of obvious, maybe a comment in here that for end points in the region the cxl_nvb will be the same hence we just look it up for the first one? > + cxl_nvb = cxl_find_nvdimm_bridge(&cxlmd->dev); > + if (!cxl_nvb) { > + cxlr_pmem = ERR_PTR(-ENODEV); > + goto out; > + } > + cxlr->cxl_nvb = cxl_nvb; > + } > m->cxlmd = cxlmd; > get_device(&cxlmd->dev); > m->start = cxled->dpa_res->start; > @@ -1848,6 +1857,7 @@ static struct cxl_pmem_region > *cxl_pmem_region_alloc(struct cxl_region *cxlr) > > dev = &cxlr_pmem->dev; > cxlr_pmem->cxlr = cxlr; > + cxlr->cxlr_pmem = cxlr_pmem; > device_initialize(dev); > lockdep_set_class(&dev->mutex, &cxl_pmem_region_key); > device_set_pm_not_required(dev); > @@ -1860,9 +1870,30 @@ static struct cxl_pmem_region > *cxl_pmem_region_alloc(struct cxl_region *cxlr) > return cxlr_pmem; > } > > -static void cxlr_pmem_unregister(void *dev) > +static void cxlr_pmem_unregister(void *_cxlr_pmem) > +{ > + struct cxl_pmem_region *cxlr_pmem = _cxlr_pmem; > + struct cxl_region *cxlr = cxlr_pmem->cxlr; > + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; > + > + device_lock_assert(&cxl_nvb->dev); This scheme is obvious in this patch, but probably less so when just looking at the resulting code. Perhaps worth a comment here on why we care about that particular lock? > + cxlr->cxlr_pmem = NULL; > + cxlr_pmem->cxlr = NULL; > + device_unregister(&cxlr_pmem->dev); > +} > + > +static void cxlr_release_nvdimm(void *_cxlr) > { > - device_unregister(dev); > + struct cxl_region *cxlr = _cxlr; > + struct cxl_nvdimm_bridge *cxl_nvb = cxlr->cxl_nvb; > + > + device_lock(&cxl_nvb->dev); > + if (cxlr->cxlr_pmem) > + devm_release_action(&cxl_nvb->dev, cxlr_pmem_unregister, > + cxlr->cxlr_pmem); > + device_unlock(&cxl_nvb->dev); > + cxlr->cxl_nvb = NULL; > + put_device(&cxl_nvb->dev); > } > > /** > @@ -1874,12 +1905,14 @@ static void cxlr_pmem_unregister(void *dev) > static int devm_cxl_add_pmem_region(struct cxl_region *cxlr) > { > struct cxl_pmem_region *cxlr_pmem; > + struct cxl_nvdimm_bridge *cxl_nvb; > struct device *dev; > int rc; > > cxlr_pmem = cxl_pmem_region_alloc(cxlr); > if (IS_ERR(cxlr_pmem)) > return PTR_ERR(cxlr_pmem); > + cxl_nvb = cxlr->cxl_nvb; > > dev = &cxlr_pmem->dev; > rc = dev_set_name(dev, "pmem_region%d", cxlr->id); > @@ -1893,10 +1926,25 @@ static int devm_cxl_add_pmem_region(struct cxl_region > *cxlr) > dev_dbg(&cxlr->dev, "%s: register %s\n", dev_name(dev->parent), > dev_name(dev)); > > - return devm_add_action_or_reset(&cxlr->dev, cxlr_pmem_unregister, dev); > + device_lock(&cxl_nvb->dev); > + if (cxl_nvb->dev.driver) > + rc = devm_add_action_or_reset(&cxl_nvb->dev, > + cxlr_pmem_unregister, cxlr_pmem); > + else > + rc = -ENXIO; > + device_unlock(&cxl_nvb->dev); > + > + if (rc) > + goto err_bridge; > + > + /* @cxlr carries a reference on @cxl_nvb until cxlr_release_nvdimm */ > + return devm_add_action_or_reset(&cxlr->dev, cxlr_release_nvdimm, cxlr); > > err: > put_device(dev); > +err_bridge: > + put_device(&cxl_nvb->dev); > + cxlr->cxl_nvb = NULL; > return rc; > } > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 4ac7938eaf6c..9b5ba9626636 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h > @@ -386,6 +386,8 @@ struct cxl_region_params { > * @id: This region's id. Id is globally unique across all regions > * @mode: Endpoint decoder allocation / access mode > * @type: Endpoint decoder target type > + * @cxl_nvb: nvdimm bridge for coordinating @cxlr_pmem shutdown I was going to suggest just carrying the struct device around, but this comment isn't quite true. I think cxl_region->cxl_nvb is also used in cxl_pmem_region_probe() to get the nvdimm_buss for nvdimm_pmem_region_create() > + * @cxlr_pmem: (for pmem regions) cached copy of the nvdimm bridge > * @params: active + config params for the region > */ > struct cxl_region { > @@ -393,6 +395,8 @@ struct cxl_region { > int id; > enum cxl_decoder_mode mode; > enum cxl_decoder_type type; > + struct cxl_nvdimm_bridge *cxl_nvb; > + struct cxl_pmem_region *cxlr_pmem; > struct cxl_region_params params; > }; > > @@ -438,7 +442,6 @@ struct cxl_pmem_region { > struct device dev; > struct cxl_region *cxlr; > struct nd_region *nd_region; > - struct cxl_nvdimm_bridge *bridge; > struct range hpa_range; > int nr_mappings; > struct cxl_pmem_region_mapping mapping[]; > @@ -637,7 +640,7 @@ struct cxl_nvdimm_bridge > *devm_cxl_add_nvdimm_bridge(struct device *host, > struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev); > bool is_cxl_nvdimm(struct device *dev); > bool is_cxl_nvdimm_bridge(struct device *dev); > -int devm_cxl_add_nvdimm(struct device *host, struct cxl_memdev *cxlmd); > +int devm_cxl_add_nvdimm(struct cxl_memdev *cxlmd); > struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct device *dev); > > #ifdef CONFIG_CXL_REGION > diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h > index 88e3a8e54b6a..c1c9960ab05f 100644 > --- a/drivers/cxl/cxlmem.h > +++ b/drivers/cxl/cxlmem.h > @@ -35,6 +35,8 @@ > * @cdev: char dev core object for ioctl operations > * @cxlds: The device state backing this device > * @detach_work: active memdev lost a port in its ancestry > + * @cxl_nvb: coordinate removal of @cxl_nvd if present > + * @cxl_nvd: optional bridge to an nvdimm if the device supports pmem > * @id: id number of this memdev instance. > */ > struct cxl_memdev { > @@ -42,6 +44,8 @@ struct cxl_memdev { > struct cdev cdev; > struct cxl_dev_state *cxlds; > struct work_struct detach_work; > + struct cxl_nvdimm_bridge *cxl_nvb; > + struct cxl_nvdimm *cxl_nvd; > int id; > }; > > diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c > index 652f00fc68ca..73357d0c3f25 100644 > --- a/drivers/cxl/pmem.c > +++ b/drivers/cxl/pmem.c > static struct cxl_driver cxl_nvdimm_driver = { > @@ -200,6 +182,16 @@ static int cxl_pmem_ctl(struct nvdimm_bus_descriptor > *nd_desc, > return cxl_pmem_nvdimm_ctl(nvdimm, cmd, buf, buf_len); > } > > +static void unregister_nvdimm_bus(void *_cxl_nvb) > +{ > + struct cxl_nvdimm_bridge *cxl_nvb = _cxl_nvb; > + struct nvdimm_bus *nvdimm_bus = cxl_nvb->nvdimm_bus; > + > + cxl_nvb->nvdimm_bus = NULL; > + nvdimm_bus_unregister(nvdimm_bus); > +} > + Single blank line. > + > static bool online_nvdimm_bus(struct cxl_nvdimm_bridge *cxl_nvb) > { > if (cxl_nvb->nvdimm_bus) > @@ -303,23 +295,21 @@ static int cxl_nvdimm_bridge_probe(struct device *dev) > { > struct cxl_nvdimm_bridge *cxl_nvb = to_cxl_nvdimm_bridge(dev); > > - if (cxl_nvb->state == CXL_NVB_DEAD) > - return -ENXIO; > + cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor){ ) { matches existing style in this file. > + .provider_name = "CXL", > + .module = THIS_MODULE, > + .ndctl = cxl_pmem_ctl, > + }; > > - if (cxl_nvb->state == CXL_NVB_NEW) { > - cxl_nvb->nd_desc = (struct nvdimm_bus_descriptor) { > - .provider_name = "CXL", > - .module = THIS_MODULE, > - .ndctl = cxl_pmem_ctl, > - }; > + cxl_nvb->nvdimm_bus = > + nvdimm_bus_register(&cxl_nvb->dev, &cxl_nvb->nd_desc); > > - INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state); > - } > + if (!cxl_nvb->nvdimm_bus) > + return -ENOMEM; > > - cxl_nvb->state = CXL_NVB_ONLINE; > - cxl_nvdimm_bridge_state_work(cxl_nvb); > + INIT_WORK(&cxl_nvb->state_work, cxl_nvb_update_state); > > - return 0; > + return devm_add_action_or_reset(dev, unregister_nvdimm_bus, cxl_nvb);