From: Ira Weiny <[email protected]>

Replace the no-op ack stub for cxl_rm_extent() with the real teardown:
resolve the released DPA range to its region and endpoint decoder,
locate the matching dc_extent in cxlr_dax->dc_extents (filtering by
cxled, range containment, and tag), and tear down the entire containing
tag group atomically through rm_tag_group().  Partial release is not
supported.

Invalidate CPU caches once before calling rm_tag_group(), which then
walks the group's dc_extents and releases each one via the devm action
installed at online_tag_group() time.

When the released range maps to no region (host crashed before
persisting acceptance, region destruction raced device release, or the
device is confused) the host has nothing to drop, so reply via
memdev_release_extent() to keep the device's view consistent.

The same behavior applies when the region exists but its cxlr_dax does
not.  If the dax region is not set up, the host is not tracking the
extent, so the extent can be released.

Based on an original patch by Navneet Singh.

Signed-off-by: Ira Weiny <[email protected]>
Signed-off-by: Anisa Su <[email protected]>

---
Changes:
1. Check rc of cxl_region_invalidate_memregion()

If flushing fails, don't release and wait for the device to
retry.

2. Check if cxlr->cxlr_dax is null before rm_tag_group()

As in the !cxlr case, if cxlr->cxlr_dax is not set up, the host
is not tracking extents, so it's safe to reply to the device with
a release.

3. core.h: move declaration for cxl_region_invalidate_memregion()
inside #ifdef CONFIG_CXL_REGION
---
 drivers/cxl/core/core.h   |   8 +++
 drivers/cxl/core/extent.c | 115 ++++++++++++++++++++++++++++++++++++++
 drivers/cxl/core/mbox.c   |  19 -------
 drivers/cxl/core/region.c |   2 +-
 4 files changed, 124 insertions(+), 20 deletions(-)

diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
index 6ac68f46a18e..bbbb86ababad 100644
--- a/drivers/cxl/core/core.h
+++ b/drivers/cxl/core/core.h
@@ -28,6 +28,8 @@ cxled_to_mds(struct cxl_endpoint_decoder *cxled)
 
 #ifdef CONFIG_CXL_REGION
 
+int cxl_region_invalidate_memregion(struct cxl_region *cxlr);
+
 struct cxl_region_context {
        struct cxl_endpoint_decoder *cxled;
        struct range hpa_range;
@@ -65,6 +67,7 @@ int devm_cxl_add_pmem_region(struct cxl_region *cxlr);
 
 int cxl_add_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent,
                   u16 seq_num);
+int cxl_rm_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent);
 int online_tag_group(struct cxl_dc_tag_group *group, bool skip_release);
 #else
 static inline u64 cxl_dpa_to_hpa(struct cxl_region *cxlr,
@@ -77,6 +80,11 @@ static inline int cxl_add_extent(struct cxl_memdev_state *mds,
 {
        return 0;
 }
+static inline int cxl_rm_extent(struct cxl_memdev_state *mds,
+                               struct cxl_extent *extent)
+{
+       return 0;       /* stub for !CONFIG_CXL_REGION builds */
+}
 static inline int online_tag_group(struct cxl_dc_tag_group *group,
                                   bool skip_release)
 {
diff --git a/drivers/cxl/core/extent.c b/drivers/cxl/core/extent.c
index 0ebb581ca833..a590a89f3580 100644
--- a/drivers/cxl/core/extent.c
+++ b/drivers/cxl/core/extent.c
@@ -371,6 +371,121 @@ static void dc_extent_unregister(void *ext)
        device_unregister(&dc_extent->dev);
 }
 
+static void rm_tag_group(struct cxl_dc_tag_group *group)
+{
+       struct device *region_dev = &group->cxlr_dax->dev;
+       struct dc_extent *dc_extent;
+       unsigned long index;
+
+       /*
+        * Tear down every dc_extent in @group through its devm action.
+        * Pin @group for the walk: dc_extent_unregister runs synchronously
+        * and drops the last device reference, so dc_extent_release
+        * decrements group->nr_extents and frees @group on the final
+        * decrement.  Without the pin the next xa_find_after() would walk
+        * a freed xarray.  Drop the pin after the walk; free if it was last.
+        */
+       group->nr_extents++;
+       xa_for_each(&group->dc_extents, index, dc_extent)
+               devm_release_action(region_dev, dc_extent_unregister, dc_extent);
+       group->nr_extents--;
+       if (!group->nr_extents)
+               free_tag_group(group);
+}
+
+int cxl_rm_extent(struct cxl_memdev_state *mds, struct cxl_extent *extent)
+{
+       u64 start_dpa = le64_to_cpu(extent->start_dpa);
+       struct cxl_memdev *cxlmd = mds->cxlds.cxlmd;
+       struct cxl_endpoint_decoder *cxled;
+       struct cxl_dax_region *cxlr_dax;
+       struct cxl_dc_tag_group *group;
+       struct dc_extent *dc_extent;
+       struct cxl_region *cxlr;
+       struct range dpa_range;
+       unsigned long idx;
+       uuid_t tag;
+       int rc;
+
+       dpa_range = (struct range) {
+               .start = start_dpa,
+               .end = start_dpa + le64_to_cpu(extent->length) - 1,
+       };
+
+       guard(rwsem_read)(&cxl_rwsem.region);
+       cxlr = cxl_dpa_to_region(cxlmd, start_dpa, &cxled);
+       if (!cxlr) {
+               /*
+                * No region can happen here for a few reasons:
+                *
+                * 1) Extents were accepted and the host crashed/rebooted
+                *    leaving them in an accepted state.  On reboot the host
+                *    has not yet created a region to own them.
+                *
+                * 2) Region destruction won the race with the device releasing
+                *    all the extents.  Here the release will be a duplicate of
+                *    the one sent via region destruction.
+                *
+                * 3) The device is confused and releasing extents for which no
+                *    region ever existed.
+                *
+                * In all these cases tell the device we are not using this
+                * extent, then report -ENXIO to the caller.
+                */
+               memdev_release_extent(mds, &dpa_range);
+               return -ENXIO;
+       }
+
+       cxlr_dax = cxlr->cxlr_dax;
+       if (!cxlr_dax) {
+               /*
+                * The region exists but its dax region is not set up, so the
+                * host is not tracking this extent.  Tell the device it is not
+                * in use and return -ENXIO, as in the no-region case above.
+                */
+               memdev_release_extent(mds, &dpa_range);
+               return -ENXIO;
+       }
+
+       import_uuid(&tag, extent->uuid);
+
+       /*
+        * Find the dc_extent on this decoder whose DPA range contains the
+        * released range and whose group tag matches.  The whole containing
+        * tag group is torn down; partial release is not supported.
+        */
+       group = NULL;
+       xa_for_each(&cxlr_dax->dc_extents, idx, dc_extent) {
+               if (dc_extent->cxled != cxled)
+                       continue;
+               if (!range_contains(&dc_extent->dpa_range, &dpa_range))
+                       continue;
+               if (!uuid_equal(&dc_extent->group->uuid, &tag))
+                       continue;
+               group = dc_extent->group;
+               break;
+       }
+       if (!group) {
+               dev_err(&cxlr_dax->dev,
+                       "release DPA %pra (%pU) matches no dc_extent\n",
+                       &dpa_range, &tag);
+               return -EINVAL;
+       }
+
+       /*
+        * Invalidate CPU caches for the region before releasing the capacity
+        * back to the device so it cannot reassign the range while stale
+        * cached data lingers.  On failure return the error without calling
+        * rm_tag_group(): the tag group stays intact so the device can retry.
+        */
+       rc = cxl_region_invalidate_memregion(cxlr);
+       if (rc)
+               return rc;
+
+       rm_tag_group(group);
+       return 0;
+}
+
 static void cleanup_pending_dc_extent(struct dc_extent *dc_extent)
 {
        struct cxl_dc_tag_group *group = dc_extent->group;
diff --git a/drivers/cxl/core/mbox.c b/drivers/cxl/core/mbox.c
index 7967b0db2c51..a072355f2f7c 100644
--- a/drivers/cxl/core/mbox.c
+++ b/drivers/cxl/core/mbox.c
@@ -1669,25 +1669,6 @@ static int handle_add_event(struct cxl_memdev_state *mds,
        return rc;
 }
 
-/*
- * Stub: ack the release back to the device so it knows we are not
- * using the range.  A later commit replaces this with the real
- * teardown that walks the region's tag group and tears down the
- * member dc_extent devices.
- */
-static int cxl_rm_extent(struct cxl_memdev_state *mds,
-                        struct cxl_extent *extent)
-{
-       u64 start_dpa = le64_to_cpu(extent->start_dpa);
-       struct range dpa_range = {
-               .start = start_dpa,
-               .end = start_dpa + le64_to_cpu(extent->length) - 1,
-       };
-
-       memdev_release_extent(mds, &dpa_range);
-       return 0;
-}
-
 static char *cxl_dcd_evt_type_str(u8 type)
 {
        switch (type) {
diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index f6e93bc59ae7..528f0b980f58 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -222,7 +222,7 @@ static struct cxl_region_ref *cxl_rr_load(struct cxl_port *port,
        return xa_load(&port->regions, (unsigned long)cxlr);
 }
 
-static int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
+int cxl_region_invalidate_memregion(struct cxl_region *cxlr)
 {
        if (!cpu_cache_has_invalidate_memregion()) {
                if (IS_ENABLED(CONFIG_CXL_REGION_INVALIDATION_TEST)) {
-- 
2.43.0


Reply via email to