+/* Can we retract page tables for this file-backed VMA? */
+static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
It's not really the VMA that is retractable :)
Given that the function we are called this from is called
"retract_page_tables" (and not file_backed_...) I guess I would just
have called this
"page_tables_are_retractable"
"page_tables_support_retract"
Or sth. along those lines.
+{
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth removing
+ * page tables from, as PMD-mapping is likely to be split later.
+ */
+ if (READ_ONCE(vma->anon_vma))
+ return false;
+
+ /*
+ * When a vma is registered with uffd-wp, we cannot recycle
+ * the page table because there may be pte markers installed.
+ * Other vmas can still have the same file mapped hugely, but
+ * skip this one: it will always be mapped in small page size
+ * for uffd-wp registered ranges.
+ */
+ if (userfaultfd_wp(vma))
+ return false;
+
+ /*
+ * If the VMA contains guard regions then we can't collapse it.
+ *
+ * This is set atomically on guard marker installation under mmap/VMA
+ * read lock, and here we may not hold any VMA or mmap lock at all.
+ *
+ * This is therefore serialised on the PTE page table lock, which is
+ * obtained on guard region installation after the flag is set, so this
+ * check being performed under this lock excludes races.
+ */
+ if (vma_flag_test_atomic(vma, VM_MAYBE_GUARD_BIT))
+ return false;
+
+ return true;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1724,14 +1761,6 @@ static void retract_page_tables(struct address_space
*mapping, pgoff_t pgoff)
spinlock_t *ptl;
bool success = false;
- /*
- * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
- * got written to. These VMAs are likely not worth removing
- * page tables from, as PMD-mapping is likely to be split later.
- */
- if (READ_ONCE(vma->anon_vma))
- continue;
-
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (addr & ~HPAGE_PMD_MASK ||
vma->vm_end < addr + HPAGE_PMD_SIZE)
@@ -1743,14 +1772,8 @@ static void retract_page_tables(struct address_space
*mapping, pgoff_t pgoff)
if (hpage_collapse_test_exit(mm))
continue;
- /*
- * When a vma is registered with uffd-wp, we cannot recycle
- * the page table because there may be pte markers installed.
- * Other vmas can still have the same file mapped hugely, but
- * skip this one: it will always be mapped in small page size
- * for uffd-wp registered ranges.
- */
- if (userfaultfd_wp(vma))
+
+ if (!file_backed_vma_is_retractable(vma))
continue;
/* PTEs were notified when unmapped; but now for the PMD? */
@@ -1777,15 +1800,15 @@ static void retract_page_tables(struct address_space
*mapping, pgoff_t pgoff)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
/*
- * Huge page lock is still held, so normally the page table
- * must remain empty; and we have already skipped anon_vma
- * and userfaultfd_wp() vmas. But since the mmap_lock is not
- * held, it is still possible for a racing userfaultfd_ioctl()
- * to have inserted ptes or markers. Now that we hold ptlock,
- * repeating the anon_vma check protects from one category,
- * and repeating the userfaultfd_wp() check from another.
+ * Huge page lock is still held, so normally the page table must
+ * remain empty; and we have already skipped anon_vma and
+ * userfaultfd_wp() vmas. But since the mmap_lock is not held,
+ * it is still possible for a racing userfaultfd_ioctl() or
+ * madvise() to have inserted ptes or markers. Now that we hold
+ * ptlock, repeating the retractable checks protects us from
+ * races against the prior checks.
*/
- if (likely(!vma->anon_vma && !userfaultfd_wp(vma))) {
+ if (likely(file_backed_vma_is_retractable(vma))) {
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
success = true;
diff --git a/mm/madvise.c b/mm/madvise.c
index 0b3280752bfb..5dbe40be7c65 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1141,15 +1141,21 @@ static long madvise_guard_install(struct
madvise_behavior *madv_behavior)
return -EINVAL;
/*
- * If we install guard markers, then the range is no longer
- * empty from a page table perspective and therefore it's
- * appropriate to have an anon_vma.
- *
- * This ensures that on fork, we copy page tables correctly.
+ * Set atomically under read lock. All pertinent readers will need to
+ * acquire an mmap/VMA write lock to read it. All remaining readers may
+ * or may not see the flag set, but we don't care.
+ */
+ vma_flag_set_atomic(vma, VM_MAYBE_GUARD_BIT);
+
In general LGTM.
+ /*
+ * If anonymous and we are establishing page tables the VMA ought to
+ * have an anon_vma associated with it.
Do you know why? I know that as soon as we have anon folios in there we
need it, but is it still required for guard regions? Patch #5 should
handle the for case I guess.
Which other code depends on that?
--
Cheers
David