On 5/23/26 2:43 AM, Anisa Su wrote:
> DC DAX regions are populated with dax_resource children that each carry a
> backing tag uuid and a per-allocation sequence number (seq_num). Add the
> userspace claim semantics that resolve those tagged groups into DAX
> devices.
>
> A DC region's seed dax device is created at 0-size on probe; userspace
> populates it by writing to its 'uuid' attribute:
>
> * A non-null UUID claims every dax_resource on this region whose tag
> matches, in seq_num order via uuid_claim_tagged(). The match set
> must form a dense 1..n sequence (no gap, no duplicate); the CXL
> side maintains this invariant for both sharable allocations (where
> the device stamps shared_extn_seq) and non-sharable allocations
> (where cxl_add_pending assigns arrival-order seq). The resulting
> DAX device's size equals the sum of every member extent's size.
>
> * "0" claims a single untagged dax_resource via
> uuid_claim_untagged(). Untagged extents are independent
> allocations; collapsing several would aggregate unrelated capacity,
> so each uuid="0" write consumes exactly one untagged resource.
>
> * A write that matches no dax_resource returns -ENOENT; the device
> stays at size 0.
>
> uuid_show() reads back the backing tag uuid (or the null UUID for an
> untagged claim). The attribute is read-only (0444) on non-DC dax
> devices; writes to it on non-DC regions return -EOPNOTSUPP.
>
> dev_dax_visible() exposes the uuid attribute only on DC dax devices.
>
> Based on an original patch by Navneet Singh.
>
> Signed-off-by: Ira Weiny <[email protected]>
> Signed-off-by: Anisa Su <[email protected]>
>
> ---
> Changes:
> [anisa: split out from the original "Surface dc_extents" commit;
> userspace tag-claim semantics only.]
> ---
> drivers/dax/bus.c | 260 +++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 256 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
> index c030eb103ad0..1dccb3e5cd0f 100644
> --- a/drivers/dax/bus.c
> +++ b/drivers/dax/bus.c
> @@ -5,6 +5,7 @@
> #include <linux/mutex.h>
> #include <linux/list.h>
> #include <linux/slab.h>
> +#include <linux/sort.h>
> #include <linux/dax.h>
> #include <linux/io.h>
> #include "dax-private.h"
> @@ -1316,6 +1317,89 @@ static ssize_t dev_dax_resize(struct dax_region
> *dax_region,
> return 0;
> }
>
> +/* DC extents are all-or-nothing: an extent is either free or fully claimed.
> */
> +static bool dax_resource_in_use(const struct dax_resource *dax_resource)
> +{
> + return dax_resource->use_cnt > 0;
> +}
> +
> +struct dax_uuid_match {
> + const struct dax_region *dax_region;
> + const uuid_t *uuid;
> +};
> +
> +static int find_uuid_extent(struct device *dev, const void *data)
> +{
> + const struct dax_uuid_match *match = data;
> + struct dax_resource *dax_resource;
> +
> + if (!match->dax_region->dc_ops->is_extent(dev))
> + return 0;
> +
> + dax_resource = dev_get_drvdata(dev);
> + if (!dax_resource || dax_resource_in_use(dax_resource))
> + return 0;
> + return uuid_equal(&dax_resource->uuid, match->uuid);
> +}
> +
> +struct dax_tag_collect {
> + const struct dax_region *dax_region;
> + const uuid_t *uuid;
> + struct dax_resource **arr;
> + unsigned int count;
> + unsigned int cap;
> +};
> +
> +static int collect_uuid_extent(struct device *dev, void *data)
> +{
> + struct dax_tag_collect *c = data;
> + struct dax_resource *dax_resource;
> +
> + if (!c->dax_region->dc_ops->is_extent(dev))
> + return 0;
> +
> + dax_resource = dev_get_drvdata(dev);
> + if (!dax_resource || dax_resource_in_use(dax_resource))
> + return 0;
> + if (!uuid_equal(&dax_resource->uuid, c->uuid))
> + return 0;
> +
> + if (c->count == c->cap)
> + return -ENOSPC;
> + c->arr[c->count++] = dax_resource;
> + return 0;
> +}
> +
> +static int count_uuid_extent(struct device *dev, void *data)
> +{
> + struct dax_tag_collect *c = data;
> + struct dax_resource *dax_resource;
> +
> + if (!c->dax_region->dc_ops->is_extent(dev))
> + return 0;
> +
> + dax_resource = dev_get_drvdata(dev);
> + if (!dax_resource || dax_resource_in_use(dax_resource))
> + return 0;
> + if (!uuid_equal(&dax_resource->uuid, c->uuid))
> + return 0;
> +
> + c->count++;
> + return 0;
> +}
> +
> +static int dax_resource_seq_cmp(const void *a, const void *b)
> +{
> + const struct dax_resource * const *pa = a;
> + const struct dax_resource * const *pb = b;
> +
> + if ((*pa)->seq_num < (*pb)->seq_num)
> + return -1;
> + if ((*pa)->seq_num > (*pb)->seq_num)
> + return 1;
> + return 0;
> +}
> +
> static ssize_t size_store(struct device *dev, struct device_attribute *attr,
> const char *buf, size_t len)
> {
> @@ -1548,13 +1632,177 @@ static DEVICE_ATTR_RO(numa_node);
> static ssize_t uuid_show(struct device *dev,
> struct device_attribute *attr, char *buf)
> {
> - return sysfs_emit(buf, "%d\n", 0);
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + int rc;
> +
> + rc = down_read_interruptible(&dax_dev_rwsem);
Since we are touching this anyway, we may as well convert these to ACQUIRE()
and get rid of the gotos:
ACQUIRE(rwsem_read_intr, rwsem)(&dax_dev_rwsem);
if ((rc = ACQUIRE_ERR(rwsem_read_intr, &rwsem)))
...
> + if (rc)
> + return rc;
> +
> + for (int i = 0; i < dev_dax->nr_range; i++) {
> + struct dax_resource *r = dev_dax->ranges[i].dax_resource;
> +
> + if (r && !uuid_is_null(&r->uuid)) {
> + rc = sysfs_emit(buf, "%pUb\n", &r->uuid);
> + goto out;
> + }
> + }
> + rc = sysfs_emit(buf, "0\n");
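As pointed out earlier, this should display the null UUID to be consistent
(the commit message says uuid_show() reads back the null UUID for an
untagged claim, but this emits "0").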
> +out:
> + up_read(&dax_dev_rwsem);
> + return rc;
> +}
> +
> +static ssize_t uuid_claim_untagged(struct dax_region *dax_region,
> + struct dev_dax *dev_dax)
> +{
> + struct dax_uuid_match match = {
> + .dax_region = dax_region,
> + .uuid = &uuid_null,
> + };
> + struct dax_resource *dax_resource;
> + resource_size_t to_alloc;
> + struct device *extent_dev;
> + ssize_t alloc;
> +
> + extent_dev = device_find_child(dax_region->dev, &match,
> + find_uuid_extent);
> + if (!extent_dev)
> + return -ENOENT;
> +
> + dax_resource = dev_get_drvdata(extent_dev);
> + to_alloc = resource_size(dax_resource->res);
> + alloc = __dev_dax_resize(dax_resource->res, dev_dax, to_alloc,
> + dax_resource);
> + put_device(extent_dev);
> + if (alloc < 0)
> + return alloc;
> + if (alloc == 0)
> + return -ENOENT;
> + dax_resource->use_cnt++;
> + return 0;
> +}
> +
> +static ssize_t uuid_claim_tagged(struct dax_region *dax_region,
> + struct dev_dax *dev_dax, const uuid_t *uuid)
> +{
> + struct dax_tag_collect c = {
> + .dax_region = dax_region,
> + .uuid = uuid,
> + };
> + unsigned int i;
> + ssize_t rc;
> +
> + /* Two-pass: count, then collect into a sized array. */
> + device_for_each_child(dax_region->dev, &c, count_uuid_extent);
> + if (!c.count)
> + return -ENOENT;
> +
> + c.arr = kmalloc_array(c.count, sizeof(*c.arr), GFP_KERNEL);
> + if (!c.arr)
> + return -ENOMEM;
> + c.cap = c.count;
> + c.count = 0;
> +
> + rc = device_for_each_child(dax_region->dev, &c, collect_uuid_extent);
> + if (rc)
> + goto out;
> +
> + sort(c.arr, c.count, sizeof(*c.arr), dax_resource_seq_cmp, NULL);
> +
> + /*
> + * Tagged groups carry a dense 1..n @seq_num regardless of source
> + * (sharable: device-stamped; non-sharable: host-assigned in
> + * arrival order — see &struct dax_resource). A gap or
> + * out-of-range value here means an extent went missing on the
> + * cxl side (e.g. a per-extent failure in cxl_add_pending) or a
> + * cxl-side validation gap; in either case refuse the whole
> + * group rather than carve a partial allocation.
> + */
> + for (i = 0; i < c.count; i++) {
> + if (c.arr[i]->seq_num != i + 1) {
> + dev_WARN_ONCE(dax_region->dev, 1,
> + "tag %pUb seq invariant violated at slot %u
> (got %u)\n",
> + uuid, i, c.arr[i]->seq_num);
> + rc = -EINVAL;
> + goto out;
> + }
> + }
> +
> + for (i = 0; i < c.count; i++) {
> + resource_size_t to_alloc = resource_size(c.arr[i]->res);
> + ssize_t alloc;
> +
> + alloc = __dev_dax_resize(c.arr[i]->res, dev_dax, to_alloc,
> + c.arr[i]);
> + if (alloc < 0) {
> + rc = alloc;
> + goto rollback;
> + }
> + if (alloc == 0) {
> + rc = -ENOSPC;
> + goto rollback;
> + }
> + c.arr[i]->use_cnt++;
> + }
> + rc = 0;
> + goto out;
> +
> +rollback:
> + /*
> + * Partial failure: trim every range we added in this attempt.
> + * trim_dev_dax_range pops the most-recently-appended range from
> + * dev_dax->ranges[] and decrements its dax_resource->use_cnt, so
> + * looping until we have undone @i additions restores both
> + * dev_dax->ranges[] and the matched dax_resources' use_cnt.
> + */
> + while (i-- > 0)
> + trim_dev_dax_range(dev_dax);
> +out:
> + kfree(c.arr);
> + return rc;
> }
>
> static ssize_t uuid_store(struct device *dev, struct device_attribute *attr,
> const char *buf, size_t len)
> {
> - return -EOPNOTSUPP;
> + struct dev_dax *dev_dax = to_dev_dax(dev);
> + struct dax_region *dax_region = dev_dax->region;
> + uuid_t uuid;
> + ssize_t rc;
> +
> + if (!is_dynamic(dax_region))
> + return -EOPNOTSUPP;
> +
> + if (sysfs_streq(buf, "0"))
> + uuid_copy(&uuid, &uuid_null);
> + else {
> + rc = uuid_parse(buf, &uuid);
> + if (rc)
> + return rc;
> + }
> +
> + rc = down_write_killable(&dax_region_rwsem);
> + if (rc)
> + return rc;
> + if (!dax_region->dev->driver) {
> + rc = -ENXIO;
> + goto err_region;
> + }
> + rc = down_write_killable(&dax_dev_rwsem);
Same comment as above about converting to ACQUIRE().
> + if (rc)
> + goto err_region;
> +
Does this need to check whether the device is already claimed before
proceeding with the claim? What happens if a uuid is written to this sysfs
file twice?
DJ
> + if (uuid_is_null(&uuid))
> + rc = uuid_claim_untagged(dax_region, dev_dax);
> + else
> + rc = uuid_claim_tagged(dax_region, dev_dax, &uuid);
> +
> + up_write(&dax_dev_rwsem);
> +err_region:
> + up_write(&dax_region_rwsem);
> +
> + return rc < 0 ? rc : len;
> }
> static DEVICE_ATTR_RW(uuid);
>
> @@ -1614,8 +1862,12 @@ static umode_t dev_dax_visible(struct kobject *kobj,
> struct attribute *a, int n)
> return 0;
> if (a == &dev_attr_mapping.attr && is_dynamic(dax_region))
> return 0;
> - if ((a == &dev_attr_align.attr ||
> - a == &dev_attr_size.attr) && is_static(dax_region))
> + if (a == &dev_attr_uuid.attr && !is_dynamic(dax_region))
> + return 0444;
> + if (a == &dev_attr_align.attr &&
> + (is_static(dax_region) || is_dynamic(dax_region)))
> + return 0444;
> + if (a == &dev_attr_size.attr && is_static(dax_region))
> return 0444;
> return a->mode;
> }