The new fsdev driver provides pages/folios initialized compatibly with
fsdax - normal rather than devdax-style refcounting, and starting out
with order-0 folios.

When fsdev binds to a daxdev, it is usually (always?) switching from the
devdax mode (device.c), which pre-initializes compound folios according
to its alignment. Fsdev uses fsdev_clear_folio_state() to switch the
folios into a fsdax-compatible state.

A side effect of this is that raw mmap doesn't (can't?) work on an fsdev
dax instance. Accordingly, the fsdev driver does not provide raw mmap -
devices must be put in 'devdax' mode (drivers/dax/device.c) to get raw
mmap capability.

This commit adds just the framework, which remaps pages/folios compatibly
with fsdax.

Enabling dax changes:

* bus.h: add DAXDRV_FSDEV_TYPE driver type
* bus.c: allow DAXDRV_FSDEV_TYPE drivers to bind to daxdevs
* dax.h: prototype inode_dax(), which fsdev needs

Suggested-by: Dan Williams <[email protected]>
Suggested-by: Gregory Price <[email protected]>
Signed-off-by: John Groves <[email protected]>
---
 MAINTAINERS          |   8 ++
 drivers/dax/Kconfig  |  17 +++
 drivers/dax/Makefile |   2 +
 drivers/dax/bus.c    |   4 +
 drivers/dax/bus.h    |   1 +
 drivers/dax/fsdev.c  | 276 +++++++++++++++++++++++++++++++++++++++++++
 include/linux/dax.h  |   4 +
 7 files changed, 312 insertions(+)
 create mode 100644 drivers/dax/fsdev.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 765ad2daa218..90429cb06090 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7184,6 +7184,14 @@ L:       [email protected]
 S:     Supported
 F:     drivers/dax/
 
+DEVICE DIRECT ACCESS (DAX) [fsdev_dax]
+M:     John Groves <[email protected]>
+M:     John Groves <[email protected]>
+L:     [email protected]
+L:     [email protected]
+S:     Supported
+F:     drivers/dax/fsdev.c
+
 DEVICE FREQUENCY (DEVFREQ)
 M:     MyungJoo Ham <[email protected]>
 M:     Kyungmin Park <[email protected]>
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index d656e4c0eb84..491325d914a8 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -78,4 +78,21 @@ config DEV_DAX_KMEM
 
          Say N if unsure.
 
+config DEV_DAX_FS
+       tristate "FSDEV DAX: fs-dax compatible device driver"
+       depends on DEV_DAX
+       default DEV_DAX
+       help
+         Support a device-dax driver mode that is compatible with fs-dax
+         filesystems. Unlike the standard device-dax driver which
+         pre-initializes compound folios based on device alignment, this
+         driver leaves folios uninitialized (similar to pmem) allowing
+         fs-dax to manage folio lifecycles dynamically.
+
+         This driver uses MEMORY_DEVICE_FS_DAX type and does not set
+         vmemmap_shift, making it compatible with filesystems like famfs
+         that use the iomap-based fs-dax infrastructure.
+
+         Say M if you plan to use fs-dax filesystems on /dev/dax devices.
+         Say N if you only need raw character device access to DAX memory.
 endif
diff --git a/drivers/dax/Makefile b/drivers/dax/Makefile
index 5ed5c39857c8..77aa3df3285c 100644
--- a/drivers/dax/Makefile
+++ b/drivers/dax/Makefile
@@ -4,11 +4,13 @@ obj-$(CONFIG_DEV_DAX) += device_dax.o
 obj-$(CONFIG_DEV_DAX_KMEM) += kmem.o
 obj-$(CONFIG_DEV_DAX_PMEM) += dax_pmem.o
 obj-$(CONFIG_DEV_DAX_CXL) += dax_cxl.o
+obj-$(CONFIG_DEV_DAX_FS) += fsdev_dax.o
 
 dax-y := super.o
 dax-y += bus.o
 device_dax-y := device.o
 dax_pmem-y := pmem.o
 dax_cxl-y := cxl.o
+fsdev_dax-y := fsdev.o
 
 obj-y += hmem/
diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index a2f9a3cc30a5..0d7228acb913 100644
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -84,6 +84,10 @@ static int dax_match_type(const struct dax_device_driver *dax_drv, struct device
            !IS_ENABLED(CONFIG_DEV_DAX_KMEM))
                return 1;
 
+       /* fsdev driver can also bind to device-type dax devices */
+       if (dax_drv->type == DAXDRV_FSDEV_TYPE && type == DAXDRV_DEVICE_TYPE)
+               return 1;
+
        return 0;
 }
 
diff --git a/drivers/dax/bus.h b/drivers/dax/bus.h
index cbbf64443098..880bdf7e72d7 100644
--- a/drivers/dax/bus.h
+++ b/drivers/dax/bus.h
@@ -31,6 +31,7 @@ struct dev_dax *devm_create_dev_dax(struct dev_dax_data *data);
 enum dax_driver_type {
        DAXDRV_KMEM_TYPE,
        DAXDRV_DEVICE_TYPE,
+       DAXDRV_FSDEV_TYPE,
 };
 
 struct dax_device_driver {
diff --git a/drivers/dax/fsdev.c b/drivers/dax/fsdev.c
new file mode 100644
index 000000000000..2a3249d1529c
--- /dev/null
+++ b/drivers/dax/fsdev.c
@@ -0,0 +1,276 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2026 Micron Technology, Inc. */
+#include <linux/memremap.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/slab.h>
+#include <linux/dax.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include "dax-private.h"
+#include "bus.h"
+
+/*
+ * FS-DAX compatible devdax driver
+ *
+ * Unlike drivers/dax/device.c which pre-initializes compound folios based
+ * on device alignment (via vmemmap_shift), this driver leaves folios
+ * uninitialized similar to pmem. This allows fs-dax filesystems like famfs
+ * to work without needing special handling for pre-initialized folios.
+ *
+ * Key differences from device.c:
+ * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC)
+ * - vmemmap_shift is NOT set (folios remain order-0)
+ * - fs-dax can dynamically create compound folios as needed
+ * - No mmap support - all access is through fs-dax/iomap
+ */
+
+
+/*
+ * Action callback registered by fsdev_dax_probe(): removes the character
+ * device passed in as the opaque @cdev argument.
+ */
+static void fsdev_cdev_del(void *cdev)
+{
+       struct cdev *chardev = cdev;
+
+       cdev_del(chardev);
+}
+
+/*
+ * Action callback registered by fsdev_dax_probe(): passes the dev_dax given
+ * as the opaque @dev_dax argument to kill_dev_dax().
+ */
+static void fsdev_kill(void *dev_dax)
+{
+       struct dev_dax *target = dev_dax;
+
+       kill_dev_dax(target);
+}
+
+/*
+ * Page map operations for FS-DAX mode
+ * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c
+ *
+ * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX.
+ * The core mm code in free_zone_device_folio() handles the wake_up_var()
+ * directly for this memory type.
+ *
+ * fsdev_pagemap_memory_failure() - report a failed pfn range to the holder
+ * @pgmap: page map whose ->owner is the struct dev_dax set up at probe time
+ * @pfn: first page frame number of the failed range
+ * @nr_pages: number of pages in the failed range
+ * @mf_flags: flags handed through unchanged
+ *
+ * The pfn range is converted to a byte offset, measured from the start of
+ * dev_dax->ranges[0], and a byte length before being passed on.
+ *
+ * Return: whatever dax_holder_notify_failure() returns.
+ */
+static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap,
+               unsigned long pfn, unsigned long nr_pages, int mf_flags)
+{
+       struct dev_dax *dev_dax = pgmap->owner;
+       u64 base = dev_dax->ranges[0].range.start;
+       u64 nbytes = nr_pages << PAGE_SHIFT;
+
+       return dax_holder_notify_failure(dev_dax->dax_dev,
+                                        PFN_PHYS(pfn) - base, nbytes,
+                                        mf_flags);
+}
+
+/* Only the memory-failure hook is provided; all other ops are left unset. */
+static const struct dev_pagemap_ops fsdev_pagemap_ops = {
+       .memory_failure = fsdev_pagemap_memory_failure,
+};
+
+/*
+ * Clear any stale folio state from pages in the given range.
+ * This is necessary because device_dax pre-initializes compound folios
+ * based on vmemmap_shift, and that state may persist after driver unbind.
+ * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax
+ * expects to find clean order-0 folios that it can build into compound
+ * folios on demand.
+ *
+ * At probe time, no filesystem should be mounted yet, so all mappings
+ * are stale and must be cleared along with compound state.
+ *
+ * Walks every range in @dev_dax->ranges, treating range->end as inclusive.
+ * Each visited page is interpreted as the head of a folio; the walk then
+ * skips ahead by the folio's recorded size, so pages inside a stale compound
+ * folio are only touched by the inner loop.
+ */
+static void fsdev_clear_folio_state(struct dev_dax *dev_dax)
+{
+       int i;
+
+       for (i = 0; i < dev_dax->nr_range; i++) {
+               struct range *range = &dev_dax->ranges[i].range;
+               unsigned long pfn, end_pfn;
+
+               pfn = PHYS_PFN(range->start);
+               /* end_pfn is exclusive: one past the pfn holding range->end */
+               end_pfn = PHYS_PFN(range->end) + 1;
+
+               while (pfn < end_pfn) {
+                       struct page *page = pfn_to_page(pfn);
+                       struct folio *folio = (struct folio *)page;
+                       /*
+                        * Sample pgmap and order before any field is
+                        * cleared; both are read from state that the code
+                        * below overwrites or resets.
+                        * NOTE(review): pgmap is written back after the
+                        * clears, presumably because it shares storage with
+                        * a cleared field - confirm against struct folio.
+                        */
+                       struct dev_pagemap *pgmap = page_pgmap(page);
+                       int order = folio_order(folio);
+
+                       /*
+                        * Clear any stale mapping pointer. At probe time,
+                        * no filesystem is mounted, so any mapping is stale.
+                        */
+                       folio->mapping = NULL;
+                       folio->share = 0;
+
+                       if (order > 0) {
+                               int j;
+
+                               /* Drop the recorded order on the head first */
+                               folio_reset_order(folio);
+                               /*
+                                * Visit all 1 << order pages, head included,
+                                * and return each to a standalone state with
+                                * no mapping, no share count and the saved
+                                * pgmap.
+                                */
+                               for (j = 0; j < (1UL << order); j++) {
+                                       struct page *p = page + j;
+
+                                       ClearPageHead(p);
+                                       clear_compound_head(p);
+                                       ((struct folio *)p)->mapping = NULL;
+                                       ((struct folio *)p)->share = 0;
+                                       ((struct folio *)p)->pgmap = pgmap;
+                               }
+                               /* Skip past every page of the former folio */
+                               pfn += (1UL << order);
+                       } else {
+                               /* Already order-0: just rewrite the pgmap */
+                               folio->pgmap = pgmap;
+                               pfn++;
+                       }
+               }
+       }
+}
+
+/*
+ * open() handler: look up the dev_dax behind @inode and stash it in
+ * @filp->private_data for later file operations. Always returns 0.
+ */
+static int fsdev_open(struct inode *inode, struct file *filp)
+{
+       struct dev_dax *dev_dax = dax_get_private(inode_dax(inode));
+
+       dev_dbg(&dev_dax->dev, "trace\n");
+       filp->private_data = dev_dax;
+
+       return 0;
+}
+
+/*
+ * release() handler: emits a debug trace for the dev_dax stored by
+ * fsdev_open() and does nothing else. Always returns 0.
+ */
+static int fsdev_release(struct inode *inode, struct file *filp)
+{
+       struct dev_dax *opened = filp->private_data;
+
+       dev_dbg(&opened->dev, "trace\n");
+
+       return 0;
+}
+
+/*
+ * File operations for the fsdev character device. Only open, release and a
+ * no-op llseek are set; no read, write or mmap handler is installed.
+ */
+static const struct file_operations fsdev_fops = {
+       .owner = THIS_MODULE,
+       .open = fsdev_open,
+       .release = fsdev_release,
+       .llseek = noop_llseek,
+};
+
+/*
+ * fsdev_dax_probe() - bind a dax device in fs-dax compatible mode
+ * @dev_dax: device being bound
+ *
+ * Picks up the static pgmap or allocates a dynamic one, reserves every
+ * physical range, maps the pages as MEMORY_DEVICE_FS_DAX without setting
+ * vmemmap_shift, resets stale folio state, and registers the character
+ * device. Cleanup relies on devm_* registrations against &dev_dax->dev, so
+ * the error paths return directly.
+ *
+ * Return: 0 on success; -EINVAL for an inconsistent pgmap/range setup,
+ * -ENOMEM if the pgmap cannot be allocated, -EBUSY if a range cannot be
+ * reserved, or the error from devm_memremap_pages(), cdev_add() or
+ * devm_add_action_or_reset().
+ */
+static int fsdev_dax_probe(struct dev_dax *dev_dax)
+{
+       struct dax_device *dax_dev = dev_dax->dax_dev;
+       struct device *dev = &dev_dax->dev;
+       struct dev_pagemap *pgmap;
+       u64 data_offset = 0;
+       struct inode *inode;
+       struct cdev *cdev;
+       void *addr;
+       int rc, i;
+
+       if (static_dev_dax(dev_dax)) {
+               /* A static pgmap describes a single range only */
+               if (dev_dax->nr_range > 1) {
+                       dev_warn(dev,
+                               "static pgmap / multi-range device conflict\n");
+                       return -EINVAL;
+               }
+
+               pgmap = dev_dax->pgmap;
+       } else {
+               if (dev_dax->pgmap) {
+                       dev_warn(dev,
+                                "dynamic-dax with pre-populated page map\n");
+                       return -EINVAL;
+               }
+
+               /*
+                * NOTE(review): the "- 1" presumably accounts for one range
+                * slot already contained in struct dev_pagemap - confirm.
+                */
+               pgmap = devm_kzalloc(dev,
+                       struct_size(pgmap, ranges, dev_dax->nr_range - 1),
+                                    GFP_KERNEL);
+               if (!pgmap)
+                       return -ENOMEM;
+
+               /*
+                * NOTE(review): dev_dax->pgmap now points at devm-managed
+                * memory. Confirm it is reset when the device is unbound or
+                * a later probe step fails, otherwise a re-probe would hit
+                * the "pre-populated page map" check above.
+                */
+               pgmap->nr_range = dev_dax->nr_range;
+               dev_dax->pgmap = pgmap;
+
+               for (i = 0; i < dev_dax->nr_range; i++) {
+                       struct range *range = &dev_dax->ranges[i].range;
+
+                       pgmap->ranges[i] = *range;
+               }
+       }
+
+       for (i = 0; i < dev_dax->nr_range; i++) {
+               struct range *range = &dev_dax->ranges[i].range;
+
+               if (!devm_request_mem_region(dev, range->start,
+                                       range_len(range), dev_name(dev))) {
+                       dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n",
+                                       i, range->start, range->end);
+                       return -EBUSY;
+               }
+       }
+
+       /*
+        * FS-DAX compatible mode: Use MEMORY_DEVICE_FS_DAX type and
+        * do NOT set vmemmap_shift. This leaves folios at order-0,
+        * allowing fs-dax to dynamically create compound folios as needed
+        * (similar to pmem behavior).
+        */
+       pgmap->type = MEMORY_DEVICE_FS_DAX;
+       pgmap->ops = &fsdev_pagemap_ops;
+       pgmap->owner = dev_dax;
+
+       /*
+        * CRITICAL DIFFERENCE from device.c:
+        * We do NOT set vmemmap_shift here, even if align > PAGE_SIZE.
+        * This ensures folios remain order-0 and are compatible with
+        * fs-dax's folio management.
+        */
+
+       addr = devm_memremap_pages(dev, pgmap);
+       if (IS_ERR(addr))
+               return PTR_ERR(addr);
+
+       /*
+        * Clear any stale compound folio state left over from a previous
+        * driver (e.g., device_dax with vmemmap_shift).
+        */
+       fsdev_clear_folio_state(dev_dax);
+
+       /*
+        * Detect whether the data is at a non-zero offset into the memory.
+        * In this function the offset is only reported via debug output.
+        */
+       if (pgmap->range.start != dev_dax->ranges[0].range.start) {
+               u64 phys = dev_dax->ranges[0].range.start;
+               u64 pgmap_phys = pgmap->range.start;
+
+               if (!WARN_ON(pgmap_phys > phys))
+                       data_offset = phys - pgmap_phys;
+
+               pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n",
+                      __func__, phys, pgmap_phys, data_offset);
+       }
+
+       /* Attach fsdev_fops to the cdev embedded in the dax inode */
+       inode = dax_inode(dax_dev);
+       cdev = inode->i_cdev;
+       cdev_init(cdev, &fsdev_fops);
+       cdev->owner = dev->driver->owner;
+       cdev_set_parent(cdev, &dev->kobj);
+       rc = cdev_add(cdev, dev->devt, 1);
+       if (rc)
+               return rc;
+
+       rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev);
+       if (rc)
+               return rc;
+
+       run_dax(dax_dev);
+       return devm_add_action_or_reset(dev, fsdev_kill, dev_dax);
+}
+
+/* Driver descriptor: an FSDEV-type dax driver probed by fsdev_dax_probe(). */
+static struct dax_device_driver fsdev_dax_driver = {
+       .type = DAXDRV_FSDEV_TYPE,
+       .probe = fsdev_dax_probe,
+};
+
+/* Module entry point: register the fsdev dax driver and report the result. */
+static int __init dax_init(void)
+{
+       int rc = dax_driver_register(&fsdev_dax_driver);
+
+       return rc;
+}
+
+/* Module exit point: unregister the driver registered by dax_init(). */
+static void __exit dax_exit(void)
+{
+       dax_driver_unregister(&fsdev_dax_driver);
+}
+
+MODULE_AUTHOR("John Groves");
+MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver");
+MODULE_LICENSE("GPL");
+module_init(dax_init);
+module_exit(dax_exit);
+MODULE_ALIAS_DAX_DEVICE(0);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9d624f4d9df6..74e098010016 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -51,6 +51,10 @@ struct dax_holder_operations {
 
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
+
+#if IS_ENABLED(CONFIG_DEV_DAX_FS)
+struct dax_device *inode_dax(struct inode *inode);
+#endif
 void *dax_holder(struct dax_device *dax_dev);
 void put_dax(struct dax_device *dax_dev);
 void kill_dax(struct dax_device *dax_dev);
-- 
2.49.0


Reply via email to