The pin_user_pages() + page_maybe_dma_pinned() infrastructure is a
framework for tackling the kernel's struggles with gup+DMA.

DAX presents a unique flavor of the gup+DMA problem since pinned pages
are identical to physical filesystem blocks. Unlike the page-cache case,
a DAX mapping of a file cannot be truncated while DMA is in-flight
because the DMA must complete before the filesystem block is reclaimed.

DAX has a homegrown solution to this problem based on watching
page->_refcount go idle. Catching that idle transition in put_page() is
awkward, and it is also overkill: only the transition to
!page_maybe_dma_pinned() needs to be captured.

Move the wakeup of filesystem-DAX truncate paths
({ext4,xfs,fuse_dax}_break_layouts()) to the unpin_user_pages() path
(gup_put_folio() when FOLL_PIN is set) with a new
wakeup_fsdax_pin_waiters() helper, and use !page_maybe_dma_pinned() as
the wake condition.

Cc: Jan Kara <j...@suse.cz>
Cc: "Darrick J. Wong" <djw...@kernel.org>
Cc: Christoph Hellwig <h...@lst.de>
Cc: John Hubbard <jhubb...@nvidia.com>
Reported-by: Jason Gunthorpe <j...@nvidia.com>
Reported-by: Matthew Wilcox <wi...@infradead.org>
Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
---
 fs/dax.c           |    4 ++--
 fs/ext4/inode.c    |    7 +++----
 fs/fuse/dax.c      |    6 +++---
 fs/xfs/xfs_file.c  |    6 +++---
 include/linux/mm.h |   28 ++++++++++++++++++++++++++++
 mm/gup.c           |    6 ++++--
 6 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 0f22f7b46de0..aceb587bc27e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -395,7 +395,7 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);
 
-               WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+               WARN_ON_ONCE(trunc && page_maybe_dma_pinned(page));
                if (dax_mapping_is_cow(page->mapping)) {
                        /* keep the CoW flag if this page is still shared */
                        if (page->index-- > 0)
@@ -414,7 +414,7 @@ static struct page *dax_pinned_page(void *entry)
        for_each_mapped_pfn(entry, pfn) {
                struct page *page = pfn_to_page(pfn);
 
-               if (page_ref_count(page) > 1)
+               if (page_maybe_dma_pinned(page))
                        return page;
        }
        return NULL;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bf49bf506965..5e68e64f155a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3961,10 +3961,9 @@ int ext4_break_layouts(struct inode *inode)
                if (!page)
                        return 0;
 
-               error = ___wait_var_event(&page->_refcount,
-                               atomic_read(&page->_refcount) == 1,
-                               TASK_INTERRUPTIBLE, 0, 0,
-                               ext4_wait_dax_page(inode));
+               error = ___wait_var_event(page, !page_maybe_dma_pinned(page),
+                                         TASK_INTERRUPTIBLE, 0, 0,
+                                         ext4_wait_dax_page(inode));
        } while (error == 0);
 
        return error;
diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index e0b846f16bc5..6419ca420c42 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -676,9 +676,9 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
                return 0;
 
        *retry = true;
-       return ___wait_var_event(&page->_refcount,
-                       atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-                       0, 0, fuse_wait_dax_page(inode));
+       return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+                                TASK_INTERRUPTIBLE, 0, 0,
+                                fuse_wait_dax_page(inode));
 }
 
 /* dmap_end == 0 leads to unmapping of whole file */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 954bb6e83796..dbffb9481b71 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -827,9 +827,9 @@ xfs_break_dax_layouts(
                return 0;
 
        *retry = true;
-       return ___wait_var_event(&page->_refcount,
-                       atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
-                       0, 0, xfs_wait_dax_page(inode));
+       return ___wait_var_event(page, !page_maybe_dma_pinned(page),
+                                TASK_INTERRUPTIBLE, 0, 0,
+                                xfs_wait_dax_page(inode));
 }
 
 int
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3bedc449c14d..557d5447ebec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1517,6 +1517,34 @@ static inline bool page_maybe_dma_pinned(struct page *page)
        return folio_maybe_dma_pinned(page_folio(page));
 }
 
+#if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
+/*
+ * Unlike typical file backed pages that support truncating a page from
+ * a file while it is under active DMA, DAX pages need to hold off
+ * truncate operations until transient page pins are released.
+ *
+ * The filesystem (via dax_layout_pinned_page()) takes steps to make
+ * sure that any observation of the !page_maybe_dma_pinned() state is
+ * stable until the truncation completes.
+ */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+       struct page *page = &folio->page;
+
+       if (!folio_is_zone_device(folio))
+               return;
+       if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
+               return;
+       if (folio_maybe_dma_pinned(folio))
+               return;
+       wake_up_var(page);
+}
+#else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+static inline void wakeup_fsdax_pin_waiters(struct folio *folio)
+{
+}
+#endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
+
 /*
  * This should most likely only be called during fork() to see whether we
  * should break the cow immediately for an anon page on the src mm.
diff --git a/mm/gup.c b/mm/gup.c
index 732825157430..499c46296fda 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -177,8 +177,10 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
                        refs *= GUP_PIN_COUNTING_BIAS;
        }
 
-       if (!put_devmap_managed_page_refs(&folio->page, refs))
-               folio_put_refs(folio, refs);
+       folio_put_refs(folio, refs);
+
+       if (flags & FOLL_PIN)
+               wakeup_fsdax_pin_waiters(folio);
 }
 
 /**


Reply via email to