The new fsdev driver provides pages/folios initialized compatibly with fsdax: they use normal (rather than devdax-style) refcounting and start out as order-0 folios.
When fsdev binds to a daxdev, it is usually (always?) switching from the devdax mode (device.c), which pre-initializes compound folios according to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the folios into an fsdax-compatible state. A side effect of this is that raw mmap doesn't (can't?) work on an fsdev dax instance. Accordingly, the fsdev driver does not provide raw mmap - devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw mmap capability. This commit contains just the framework, which remaps pages/folios compatibly with fsdax. Enabling dax changes: * bus.h: add DAXDRV_FSDEV_TYPE driver type * bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs * dax.h: prototype inode_dax(), which fsdev needs Suggested-by: Dan Williams <[email protected]> Suggested-by: Gregory Price <[email protected]> Signed-off-by: John Groves <[email protected]> --- MAINTAINERS | 8 ++ drivers/dax/Kconfig | 17 +++ drivers/dax/Makefile | 2 + drivers/dax/bus.c | 4 + drivers/dax/bus.h | 1 + drivers/dax/fsdev.c | 276 +++++++++++++++++++++++++++++++++++++++++++ include/linux/dax.h | 4 + 7 files changed, 312 insertions(+) create mode 100644 drivers/dax/fsdev.c diff --git a/MAINTAINERS b/MAINTAINERS index 765ad2daa218..90429cb06090 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7184,6 +7184,14 @@ L: [email protected] S: Supported F: drivers/dax/ +DEVICE DIRECT ACCESS (DAX) [fsdev_dax] +M: John Groves <[email protected]> +M: John Groves <[email protected]> +L: [email protected] +L: [email protected] +S: Supported +F: drivers/dax/fsdev.c + DEVICE FREQUENCY (DEVFREQ) M: MyungJoo Ham <[email protected]> M: Kyungmin Park <[email protected]> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index d656e4c0eb84..491325d914a8 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -78,4 +78,21 @@ config DEV_DAX_KMEM Say N if unsure. 
+config DEV_DAX_FS + tristate "FSDEV DAX: fs-dax compatible device driver" + depends on DEV_DAX + default DEV_DAX + help + Support a device-dax driver mode that is compatible with fs-dax + filesystems. Unlike the standard device-dax driver which + pre-initializes compound folios based on device alignment, this + driver leaves folios uninitialized (similar to pmem) allowing + fs-dax to manage folio lifecycles dynamically. + + This driver uses MEMORY_DEVICE_FS_DAX type and does not set + vmemmap_shift, making it compatible with filesystems like famfs + that use the iomap-based fs-dax infrastructure. + + Say M if you plan to use fs-dax filesystems on /dev/dax devices. + Say N if you only need raw character device access to DAX memory. endif diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile index 5ed5c39857c8..77aa3df3285c 100644 --- a/drivers/dax/Makefile +++ b/drivers/dax/Makefile @@ -4,11 +4,13 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o +obj-$(CONFIG_DEV_DAX_FS) += fsdev_dax.o dax-y := super.o dax-y += bus.o device_dax-y := device.o dax_pmem-y := pmem.o dax_cxl-y := cxl.o +fsdev_dax-y := fsdev.o obj-y += hmem/ diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index a2f9a3cc30a5..0d7228acb913 100644 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -84,6 +84,10 @@ static int dax_match_type(const struct dax_device_driver *dax_drv, struct device !IS_ENABLED(CONFIG_DEV_DAX_KMEM)) return 1; + /* fsdev driver can also bind to device-type dax devices */ + if (dax_drv->type == DAXDRV_FSDEV_TYPE && type == DAXDRV_DEVICE_TYPE) + return 1; + return 0; } diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h index cbbf64443098..880bdf7e72d7 100644 --- a/drivers/dax/bus.h +++ b/drivers/dax/bus.h @@ -31,6 +31,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data); enum dax_driver_type { DAXDRV_KMEM_TYPE, DAXDRV_DEVICE_TYPE, + DAXDRV_FSDEV_TYPE, }; 
struct dax_device_driver { diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c new file mode 100644 index 000000000000..2a3249d1529c --- /dev/null +++ b/drivers/dax/fsdev.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2026 Micron Technology, Inc. */ +#include <linux/memremap.h> +#include <linux/pagemap.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/cdev.h> +#include <linux/slab.h> +#include <linux/dax.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include "dax-private.h" +#include "bus.h" + +/* + * FS-DAX compatible devdax driver + * + * Unlike drivers/dax/device.c which pre-initializes compound folios based + * on device alignment (via vmemmap_shift), this driver leaves folios + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs + * to work without needing special handling for pre-initialized folios. + * + * Key differences from device.c: + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) + * - vmemmap_shift is NOT set (folios remain order-0) + * - fs-dax can dynamically create compound folios as needed + * - No mmap support - all access is through fs-dax/iomap + */ + + +static void fsdev_cdev_del(void *cdev) +{ + cdev_del(cdev); +} + +static void fsdev_kill(void *dev_dax) +{ + kill_dev_dax(dev_dax); +} + +/* + * Page map operations for FS-DAX mode + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c + * + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. + * The core mm code in free_zone_device_folio() handles the wake_up_var() + * directly for this memory type. 
+ */ +static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, + unsigned long pfn, unsigned long nr_pages, int mf_flags) +{ + struct dev_dax *dev_dax = pgmap->owner; + u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; + u64 len = nr_pages << PAGE_SHIFT; + + return dax_holder_notify_failure(dev_dax->dax_dev, offset, + len, mf_flags); +} + +static const struct dev_pagemap_ops fsdev_pagemap_ops = { + .memory_failure = fsdev_pagemap_memory_failure, +}; + +/* + * Clear any stale folio state from pages in the given range. + * This is necessary because device_dax pre-initializes compound folios + * based on vmemmap_shift, and that state may persist after driver unbind. + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax + * expects to find clean order-0 folios that it can build into compound + * folios on demand. + * + * At probe time, no filesystem should be mounted yet, so all mappings + * are stale and must be cleared along with compound state. + */ +static void fsdev_clear_folio_state(struct dev_dax *dev_dax) +{ + int i; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + unsigned long pfn, end_pfn; + + pfn = PHYS_PFN(range->start); + end_pfn = PHYS_PFN(range->end) + 1; + + while (pfn < end_pfn) { + struct page *page = pfn_to_page(pfn); + struct folio *folio = (struct folio *)page; + struct dev_pagemap *pgmap = page_pgmap(page); + int order = folio_order(folio); + + /* + * Clear any stale mapping pointer. At probe time, + * no filesystem is mounted, so any mapping is stale. 
+ */ + folio->mapping = NULL; + folio->share = 0; + + if (order > 0) { + int j; + + folio_reset_order(folio); + for (j = 0; j < (1UL << order); j++) { + struct page *p = page + j; + + ClearPageHead(p); + clear_compound_head(p); + ((struct folio *)p)->mapping = NULL; + ((struct folio *)p)->share = 0; + ((struct folio *)p)->pgmap = pgmap; + } + pfn += (1UL << order); + } else { + folio->pgmap = pgmap; + pfn++; + } + } + } +} + +static int fsdev_open(struct inode *inode, struct file *filp) +{ + struct dax_device *dax_dev = inode_dax(inode); + struct dev_dax *dev_dax = dax_get_private(dax_dev); + + dev_dbg(&dev_dax->dev, "trace\n"); + filp->private_data = dev_dax; + + return 0; +} + +static int fsdev_release(struct inode *inode, struct file *filp) +{ + struct dev_dax *dev_dax = filp->private_data; + + dev_dbg(&dev_dax->dev, "trace\n"); + return 0; +} + +static const struct file_operations fsdev_fops = { + .llseek = noop_llseek, + .owner = THIS_MODULE, + .open = fsdev_open, + .release = fsdev_release, +}; + +static int fsdev_dax_probe(struct dev_dax *dev_dax) +{ + struct dax_device *dax_dev = dev_dax->dax_dev; + struct device *dev = &dev_dax->dev; + struct dev_pagemap *pgmap; + u64 data_offset = 0; + struct inode *inode; + struct cdev *cdev; + void *addr; + int rc, i; + + if (static_dev_dax(dev_dax)) { + if (dev_dax->nr_range > 1) { + dev_warn(dev, + "static pgmap / multi-range device conflict\n"); + return -EINVAL; + } + + pgmap = dev_dax->pgmap; + } else { + if (dev_dax->pgmap) { + dev_warn(dev, + "dynamic-dax with pre-populated page map\n"); + return -EINVAL; + } + + pgmap = devm_kzalloc(dev, + struct_size(pgmap, ranges, dev_dax->nr_range - 1), + GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->nr_range = dev_dax->nr_range; + dev_dax->pgmap = pgmap; + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = &dev_dax->ranges[i].range; + + pgmap->ranges[i] = *range; + } + } + + for (i = 0; i < dev_dax->nr_range; i++) { + struct range *range = 
&dev_dax->ranges[i].range; + + if (!devm_request_mem_region(dev, range->start, + range_len(range), dev_name(dev))) { + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", + i, range->start, range->end); + return -EBUSY; + } + } + + /* + * FS-DAX compatible mode: Use MEMORY_DEVICE_FS_DAX type and + * do NOT set vmemmap_shift. This leaves folios at order-0, + * allowing fs-dax to dynamically create compound folios as needed + * (similar to pmem behavior). + */ + pgmap->type = MEMORY_DEVICE_FS_DAX; + pgmap->ops = &fsdev_pagemap_ops; + pgmap->owner = dev_dax; + + /* + * CRITICAL DIFFERENCE from device.c: + * We do NOT set vmemmap_shift here, even if align > PAGE_SIZE. + * This ensures folios remain order-0 and are compatible with + * fs-dax's folio management. + */ + + addr = devm_memremap_pages(dev, pgmap); + if (IS_ERR(addr)) + return PTR_ERR(addr); + + /* + * Clear any stale compound folio state left over from a previous + * driver (e.g., device_dax with vmemmap_shift). + */ + fsdev_clear_folio_state(dev_dax); + + /* Detect whether the data is at a non-zero offset into the memory */ + if (pgmap->range.start != dev_dax->ranges[0].range.start) { + u64 phys = dev_dax->ranges[0].range.start; + u64 pgmap_phys = dev_dax->pgmap[0].range.start; + + if (!WARN_ON(pgmap_phys > phys)) + data_offset = phys - pgmap_phys; + + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", + __func__, phys, pgmap_phys, data_offset); + } + + inode = dax_inode(dax_dev); + cdev = inode->i_cdev; + cdev_init(cdev, &fsdev_fops); + cdev->owner = dev->driver->owner; + cdev_set_parent(cdev, &dev->kobj); + rc = cdev_add(cdev, dev->devt, 1); + if (rc) + return rc; + + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); + if (rc) + return rc; + + run_dax(dax_dev); + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); +} + +static struct dax_device_driver fsdev_dax_driver = { + .probe = fsdev_dax_probe, + .type = DAXDRV_FSDEV_TYPE, +}; + +static int __init 
dax_init(void) +{ + return dax_driver_register(&fsdev_dax_driver); +} + +static void __exit dax_exit(void) +{ + dax_driver_unregister(&fsdev_dax_driver); +} + +MODULE_AUTHOR("John Groves"); +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); +MODULE_LICENSE("GPL"); +module_init(dax_init); +module_exit(dax_exit); +MODULE_ALIAS_DAX_DEVICE(0); diff --git a/include/linux/dax.h b/include/linux/dax.h index 9d624f4d9df6..74e098010016 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -51,6 +51,10 @@ struct dax_holder_operations { #if IS_ENABLED(CONFIG_DAX) struct dax_device *alloc_dax(void *private, const struct dax_operations *ops); + +#if IS_ENABLED(CONFIG_DEV_DAX_FS) +struct dax_device *inode_dax(struct inode *inode); +#endif void *dax_holder(struct dax_device *dax_dev); void put_dax(struct dax_device *dax_dev); void kill_dax(struct dax_device *dax_dev); -- 2.49.0
