Preparatory patch. Add the change_protection() primitive that
userfaultfd RWP will use.

An RWP-protected PTE is PAGE_NONE with the uffd PTE bit set. The
PROT_NONE half makes the CPU fault on any access; the uffd bit
distinguishes an RWP fault from a plain mprotect(PROT_NONE) or NUMA
hinting fault. MM_CP_UFFD_WP and MM_CP_UFFD_RWP share the same PTE
bit, so the two cannot be used together on the same range.

Two new change_protection() flags:

  MM_CP_UFFD_RWP            install PAGE_NONE and set the uffd bit
  MM_CP_UFFD_RWP_RESOLVE    restore vma->vm_page_prot, clear the uffd bit

Both are wired through change_pte_range(), change_huge_pmd(), and
hugetlb_change_protection() so anon, shmem, THP, and hugetlb all
share the same semantics.

Signed-off-by: Kiryl Shutsemau <[email protected]>
Assisted-by: Claude:claude-opus-4-6
---
 include/linux/mm.h            |  5 +++++
 include/linux/userfaultfd_k.h |  1 -
 mm/huge_memory.c              | 20 ++++++++++++------
 mm/hugetlb.c                  | 25 ++++++++++++++++------
 mm/mprotect.c                 | 40 +++++++++++++++++++++++++++++------
 5 files changed, 71 insertions(+), 20 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f53d1e978c0..2b65416bb760 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3291,6 +3291,11 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd RWP */
+#define  MM_CP_UFFD_RWP                    (1UL << 4) /* do rwp */
+#define  MM_CP_UFFD_RWP_RESOLVE            (1UL << 5) /* Resolve rwp */
+#define  MM_CP_UFFD_RWP_ALL                (MM_CP_UFFD_RWP | \
+                                           MM_CP_UFFD_RWP_RESOLVE)
 
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index fcf308dba311..3725e61a7041 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -397,7 +397,6 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
        return false;
 }
 
-
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
        return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d88fcccd386d..2537dca63c6c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2665,6 +2665,8 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
+       bool uffd_rwp_resolve = cp_flags & MM_CP_UFFD_RWP_RESOLVE;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
        int ret = 1;
@@ -2679,11 +2681,18 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                return 0;
 
        if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
-               change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
-                                           uffd_wp_resolve);
+               change_non_present_huge_pmd(mm, addr, pmd,
+                                           uffd_wp || uffd_rwp,
+                                           uffd_wp_resolve || uffd_rwp_resolve);
                goto unlock;
        }
 
+       /* Already in the desired state */
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
+       if (uffd_rwp && pmd_protnone(*pmd) && pmd_uffd(*pmd))
+               goto unlock;
+
        if (prot_numa) {
 
                /*
@@ -2694,9 +2703,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                if (is_huge_zero_pmd(*pmd))
                        goto unlock;
 
-               if (pmd_protnone(*pmd))
-                       goto unlock;
-
                if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
                                             vma_is_single_threaded_private(vma)))
                        goto unlock;
@@ -2725,9 +2731,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
 
        entry = pmd_modify(oldpmd, newprot);
-       if (uffd_wp)
+       if (uffd_wp || uffd_rwp)
                entry = pmd_mkuffd(entry);
-       else if (uffd_wp_resolve)
+       else if (uffd_wp_resolve || uffd_rwp_resolve)
                /*
                 * Leave the write bit to be handled by PF interrupt
                 * handler, then things like COW could be properly
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61cda9992043..63f6b19418b9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6409,6 +6409,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
        unsigned long last_addr_mask;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+       bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
+       bool uffd_rwp_resolve = cp_flags & MM_CP_UFFD_RWP_RESOLVE;
        struct mmu_gather tlb;
 
        /*
@@ -6434,6 +6436,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 
                ptep = hugetlb_walk(vma, address, psize);
                if (!ptep) {
+                       /*
+                        * uffd_wp installs a pte marker on the unpopulated
+                        * entry; RWP does not install markers so the
+                        * allocation is unnecessary for it.
+                        */
                        if (!uffd_wp) {
                                address |= last_addr_mask;
                                continue;
@@ -6455,7 +6462,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                         * shouldn't happen at all.  Warn about it if it
                         * happened due to some reason.
                         */
-                       WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
+                       WARN_ON_ONCE(uffd_wp || uffd_wp_resolve ||
+                                    uffd_rwp || uffd_rwp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        address |= last_addr_mask;
@@ -6489,9 +6497,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                                pages++;
                        }
 
-                       if (uffd_wp)
+                       if (uffd_wp || uffd_rwp)
                                newpte = pte_swp_mkuffd(newpte);
-                       else if (uffd_wp_resolve)
+                       else if (uffd_wp_resolve || uffd_rwp_resolve)
                                newpte = pte_swp_clear_uffd(newpte);
                        if (!pte_same(pte, newpte))
                                set_huge_pte_at(mm, address, ptep, newpte, psize);
@@ -6502,19 +6510,24 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                         * pte_marker_uffd_wp()==true implies !poison
                         * because they're mutual exclusive.
                         */
-                       if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve)
+                       if (pte_is_uffd_wp_marker(pte) &&
+                           (uffd_wp_resolve || uffd_rwp_resolve))
                                /* Safe to modify directly (non-present->none). 
*/
                                huge_pte_clear(mm, address, ptep, psize);
                } else {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
 
+                       /* Already protnone with uffd bit set? Nothing to do. */
+                       if (uffd_rwp && pte_protnone(pte) && huge_pte_uffd(pte))
+                               goto next;
+
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
-                       if (uffd_wp)
+                       if (uffd_wp || uffd_rwp)
                                pte = huge_pte_mkuffd(pte);
-                       else if (uffd_wp_resolve)
+                       else if (uffd_wp_resolve || uffd_rwp_resolve)
                                pte = huge_pte_clear_uffd(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8340c8b228c6..23e71f68cf7a 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -216,6 +216,8 @@ static long change_softleaf_pte(struct vm_area_struct *vma,
 {
        const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+       const bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
+       const bool uffd_rwp_resolve = cp_flags & MM_CP_UFFD_RWP_RESOLVE;
        softleaf_t entry = softleaf_from_pte(oldpte);
        pte_t newpte;
 
@@ -256,7 +258,7 @@ static long change_softleaf_pte(struct vm_area_struct *vma,
                 * to unprotect it, drop it; the next page
                 * fault will trigger without uffd trapping.
                 */
-               if (uffd_wp_resolve) {
+               if (uffd_wp_resolve || uffd_rwp_resolve) {
                        pte_clear(vma->vm_mm, addr, pte);
                        return 1;
                }
@@ -265,9 +267,9 @@ static long change_softleaf_pte(struct vm_area_struct *vma,
                newpte = oldpte;
        }
 
-       if (uffd_wp)
+       if (uffd_wp || uffd_rwp)
                newpte = pte_swp_mkuffd(newpte);
-       else if (uffd_wp_resolve)
+       else if (uffd_wp_resolve || uffd_rwp_resolve)
                newpte = pte_swp_clear_uffd(newpte);
 
        if (!pte_same(oldpte, newpte)) {
@@ -284,14 +286,16 @@ static __always_inline void change_present_ptes(struct mmu_gather *tlb,
 {
        const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
        const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+       const bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
+       const bool uffd_rwp_resolve = cp_flags & MM_CP_UFFD_RWP_RESOLVE;
        pte_t ptent, oldpte;
 
        oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
        ptent = pte_modify(oldpte, newprot);
 
-       if (uffd_wp)
+       if (uffd_wp || uffd_rwp)
                ptent = pte_mkuffd(ptent);
-       else if (uffd_wp_resolve)
+       else if (uffd_wp_resolve || uffd_rwp_resolve)
                ptent = pte_clear_uffd(ptent);
 
        /*
@@ -325,6 +329,7 @@ static long change_pte_range(struct mmu_gather *tlb,
        long pages = 0;
        bool is_private_single_threaded;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        int nr_ptes;
 
@@ -350,6 +355,14 @@ static long change_pte_range(struct mmu_gather *tlb,
                        /* Already in the desired state. */
                        if (prot_numa && pte_protnone(oldpte))
                                continue;
+                       /*
+                        * RWP-protected PTEs carry _PAGE_UFFD as a marker on
+                        * top of PROT_NONE. Skip only entries already in that
+                        * exact state; plain PROT_NONE from mprotect() still needs
+                        * to be promoted so future faults can be distinguished.
+                        */
+                       if (uffd_rwp && pte_protnone(oldpte) && pte_uffd(oldpte))
+                               continue;
 
                        page = vm_normal_page(vma, addr, oldpte);
                        if (page)
@@ -358,6 +371,8 @@ static long change_pte_range(struct mmu_gather *tlb,
                        /*
                         * Avoid trapping faults against the zero or KSM
                         * pages. See similar comment in change_huge_pmd.
+                        * Skip this filter for uffd RWP which
+                        * must set protnone regardless of NUMA placement.
                         */
                        if (prot_numa &&
                            !folio_can_map_prot_numa(folio, vma,
@@ -667,7 +682,16 @@ long change_protection(struct mmu_gather *tlb,
        pgprot_t newprot = vma->vm_page_prot;
        long pages;
 
-       BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+       /*
+        * MM_CP_UFFD_{WP,RWP} and _RESOLVE are mutually exclusive within one
+        * change, and WP and RWP cannot mix. Miswired callers get a warn and
+        * a no-op; userspace cannot reach this state.
+        */
+       if (WARN_ON_ONCE((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL ||
+                        (cp_flags & MM_CP_UFFD_RWP_ALL) == MM_CP_UFFD_RWP_ALL ||
+                        ((cp_flags & MM_CP_UFFD_WP_ALL) &&
+                         (cp_flags & MM_CP_UFFD_RWP_ALL))))
+               return 0;
 
 #ifdef CONFIG_NUMA_BALANCING
        /*
@@ -681,6 +705,10 @@ long change_protection(struct mmu_gather *tlb,
        WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
 #endif
 
+       if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_PROTNONE) &&
+           (cp_flags & MM_CP_UFFD_RWP))
+               newprot = PAGE_NONE;
+
        if (is_vm_hugetlb_page(vma))
                pages = hugetlb_change_protection(vma, start, end, newprot,
                                                  cp_flags);
-- 
2.51.2


Reply via email to