From: "Kiryl Shutsemau (Meta)" <[email protected]>

Preparatory patch. Add the change_protection() primitive that
userfaultfd RWP will use.

An RWP-protected PTE is PAGE_NONE with the uffd PTE bit set. The
PROT_NONE half makes the CPU fault on any access; the uffd bit
distinguishes an RWP fault from a plain mprotect(PROT_NONE) or NUMA
hinting fault. MM_CP_UFFD_WP and MM_CP_UFFD_RWP share the same PTE
bit, so the two cannot be used together on the same range.

Two new change_protection() flags:

  MM_CP_UFFD_RWP            install PAGE_NONE and set the uffd bit
  MM_CP_UFFD_RWP_RESOLVE    restore vma->vm_page_prot, clear the uffd bit

Both are wired through change_pte_range(), change_huge_pmd(), and
hugetlb_change_protection() so anon, shmem, THP, and hugetlb all
share the same semantics.

Signed-off-by: Kiryl Shutsemau <[email protected]>
Assisted-by: Claude:claude-opus-4-6
---
 include/linux/mm.h            |  5 ++++
 include/linux/userfaultfd_k.h |  1 -
 mm/huge_memory.c              | 30 +++++++++++++----------
 mm/hugetlb.c                  | 25 ++++++++++++++-----
 mm/mprotect.c                 | 46 +++++++++++++++++++++++++++--------
 5 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3f53d1e978c0..9054468774b5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3291,6 +3291,11 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 #define  MM_CP_UFFD_WP_RESOLVE             (1UL << 3) /* Resolve wp */
 #define  MM_CP_UFFD_WP_ALL                 (MM_CP_UFFD_WP | \
                                            MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd RWP */
+#define  MM_CP_UFFD_RWP                    (1UL << 4) /* do rwp */
+#define  MM_CP_UFFD_RWP_RESOLVE            (1UL << 5) /* resolve rwp */
+#define  MM_CP_UFFD_RWP_ALL                (MM_CP_UFFD_RWP | \
+                                           MM_CP_UFFD_RWP_RESOLVE)
 
 bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
                             pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 889c7b45fec8..07766398d592 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -397,7 +397,6 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
        return false;
 }
 
-
 static inline bool userfaultfd_armed(struct vm_area_struct *vma)
 {
        return false;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d88fcccd386d..befc919de69e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2615,8 +2615,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 }
 
 static void change_non_present_huge_pmd(struct mm_struct *mm,
-               unsigned long addr, pmd_t *pmd, bool uffd_wp,
-               bool uffd_wp_resolve)
+               unsigned long addr, pmd_t *pmd, bool uffd_prot,
+               bool uffd_prot_resolve)
 {
        softleaf_t entry = softleaf_from_pmd(*pmd);
        const struct folio *folio = softleaf_to_folio(entry);
@@ -2642,9 +2642,9 @@ static void change_non_present_huge_pmd(struct mm_struct *mm,
                newpmd = *pmd;
        }
 
-       if (uffd_wp)
+       if (uffd_prot)
                newpmd = pmd_swp_mkuffd(newpmd);
-       else if (uffd_wp_resolve)
+       else if (uffd_prot_resolve)
                newpmd = pmd_swp_clear_uffd(newpmd);
        if (!pmd_same(*pmd, newpmd))
                set_pmd_at(mm, addr, pmd, newpmd);
@@ -2665,8 +2665,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        spinlock_t *ptl;
        pmd_t oldpmd, entry;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
-       bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
-       bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+       bool uffd_prot = cp_flags & (MM_CP_UFFD_WP | MM_CP_UFFD_RWP);
+       bool uffd_prot_resolve = cp_flags &
+               (MM_CP_UFFD_WP_RESOLVE | MM_CP_UFFD_RWP_RESOLVE);
        int ret = 1;
 
        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
@@ -2679,11 +2680,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                return 0;
 
        if (thp_migration_supported() && pmd_is_valid_softleaf(*pmd)) {
-               change_non_present_huge_pmd(mm, addr, pmd, uffd_wp,
-                                           uffd_wp_resolve);
+               change_non_present_huge_pmd(mm, addr, pmd, uffd_prot,
+                                           uffd_prot_resolve);
                goto unlock;
        }
 
+       /* Already in the desired state */
+       if (prot_numa && pmd_protnone(*pmd))
+               goto unlock;
+       if ((cp_flags & MM_CP_UFFD_RWP) && pmd_protnone(*pmd) && pmd_uffd(*pmd))
+               goto unlock;
+
        if (prot_numa) {
 
                /*
@@ -2694,9 +2701,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                if (is_huge_zero_pmd(*pmd))
                        goto unlock;
 
-               if (pmd_protnone(*pmd))
-                       goto unlock;
-
                if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
                                             vma_is_single_threaded_private(vma)))
                        goto unlock;
@@ -2725,9 +2729,9 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
 
        entry = pmd_modify(oldpmd, newprot);
-       if (uffd_wp)
+       if (uffd_prot)
                entry = pmd_mkuffd(entry);
-       else if (uffd_wp_resolve)
+       else if (uffd_prot_resolve)
                /*
                 * Leave the write bit to be handled by PF interrupt
                 * handler, then things like COW could be properly
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f770c6504e26..3cdbf0057dce 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6409,6 +6409,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
        unsigned long last_addr_mask;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+       bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
+       bool uffd_rwp_resolve = cp_flags & MM_CP_UFFD_RWP_RESOLVE;
        struct mmu_gather tlb;
 
        /*
@@ -6434,6 +6436,11 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 
                ptep = hugetlb_walk(vma, address, psize);
                if (!ptep) {
+                       /*
+                        * uffd_wp installs a pte marker on the unpopulated
+                        * entry; uffd_rwp does not install markers so the
+                        * allocation is unnecessary for it.
+                        */
                        if (!uffd_wp) {
                                address |= last_addr_mask;
                                continue;
@@ -6455,7 +6462,8 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                         * shouldn't happen at all.  Warn about it if it
                         * happened due to some reason.
                         */
-                       WARN_ON_ONCE(uffd_wp || uffd_wp_resolve);
+                       WARN_ON_ONCE(uffd_wp || uffd_wp_resolve ||
+                                    uffd_rwp || uffd_rwp_resolve);
                        pages++;
                        spin_unlock(ptl);
                        address |= last_addr_mask;
@@ -6489,9 +6497,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                                pages++;
                        }
 
-                       if (uffd_wp)
+                       if (uffd_wp || uffd_rwp)
                                newpte = pte_swp_mkuffd(newpte);
-                       else if (uffd_wp_resolve)
+                       else if (uffd_wp_resolve || uffd_rwp_resolve)
                                newpte = pte_swp_clear_uffd(newpte);
                        if (!pte_same(pte, newpte))
                                set_huge_pte_at(mm, address, ptep, newpte, psize);
@@ -6502,19 +6510,24 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
                         * pte_marker_uffd_wp()==true implies !poison
                         * because they're mutual exclusive.
                         */
-                       if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve)
+                       if (pte_is_uffd_wp_marker(pte) &&
+                           (uffd_wp_resolve || uffd_rwp_resolve))
                                /* Safe to modify directly (non-present->none). */
                                huge_pte_clear(mm, address, ptep, psize);
                } else {
                        pte_t old_pte;
                        unsigned int shift = huge_page_shift(hstate_vma(vma));
 
+                       /* Already protnone with uffd bit set? Nothing to do. */
+                       if (uffd_rwp && pte_protnone(pte) && huge_pte_uffd(pte))
+                               goto next;
+
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = huge_pte_modify(old_pte, newprot);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
-                       if (uffd_wp)
+                       if (uffd_wp || uffd_rwp)
                                pte = huge_pte_mkuffd(pte);
-                       else if (uffd_wp_resolve)
+                       else if (uffd_wp_resolve || uffd_rwp_resolve)
                                pte = huge_pte_clear_uffd(pte);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8340c8b228c6..4a6b35482aee 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -214,8 +214,9 @@ static __always_inline void set_write_prot_commit_flush_ptes(struct vm_area_stru
 static long change_softleaf_pte(struct vm_area_struct *vma,
        unsigned long addr, pte_t *pte, pte_t oldpte, unsigned long cp_flags)
 {
-       const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
-       const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+       const bool uffd_prot = cp_flags & (MM_CP_UFFD_WP | MM_CP_UFFD_RWP);
+       const bool uffd_prot_resolve = cp_flags &
+               (MM_CP_UFFD_WP_RESOLVE | MM_CP_UFFD_RWP_RESOLVE);
        softleaf_t entry = softleaf_from_pte(oldpte);
        pte_t newpte;
 
@@ -256,7 +257,7 @@ static long change_softleaf_pte(struct vm_area_struct *vma,
                 * to unprotect it, drop it; the next page
                 * fault will trigger without uffd trapping.
                 */
-               if (uffd_wp_resolve) {
+               if (uffd_prot_resolve) {
                        pte_clear(vma->vm_mm, addr, pte);
                        return 1;
                }
@@ -265,9 +266,9 @@ static long change_softleaf_pte(struct vm_area_struct *vma,
                newpte = oldpte;
        }
 
-       if (uffd_wp)
+       if (uffd_prot)
                newpte = pte_swp_mkuffd(newpte);
-       else if (uffd_wp_resolve)
+       else if (uffd_prot_resolve)
                newpte = pte_swp_clear_uffd(newpte);
 
        if (!pte_same(oldpte, newpte)) {
@@ -282,16 +283,17 @@ static __always_inline void change_present_ptes(struct mmu_gather *tlb,
                int nr_ptes, unsigned long end, pgprot_t newprot,
                struct folio *folio, struct page *page, unsigned long cp_flags)
 {
-       const bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
-       const bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+       const bool uffd_prot = cp_flags & (MM_CP_UFFD_WP | MM_CP_UFFD_RWP);
+       const bool uffd_prot_resolve = cp_flags &
+               (MM_CP_UFFD_WP_RESOLVE | MM_CP_UFFD_RWP_RESOLVE);
        pte_t ptent, oldpte;
 
        oldpte = modify_prot_start_ptes(vma, addr, ptep, nr_ptes);
        ptent = pte_modify(oldpte, newprot);
 
-       if (uffd_wp)
+       if (uffd_prot)
                ptent = pte_mkuffd(ptent);
-       else if (uffd_wp_resolve)
+       else if (uffd_prot_resolve)
                ptent = pte_clear_uffd(ptent);
 
        /*
@@ -325,6 +327,7 @@ static long change_pte_range(struct mmu_gather *tlb,
        long pages = 0;
        bool is_private_single_threaded;
        bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+       bool uffd_rwp = cp_flags & MM_CP_UFFD_RWP;
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        int nr_ptes;
 
@@ -350,6 +353,14 @@ static long change_pte_range(struct mmu_gather *tlb,
                        /* Already in the desired state. */
                        if (prot_numa && pte_protnone(oldpte))
                                continue;
+                       /*
+                        * RWP-protected PTEs carry _PAGE_UFFD as a marker on
+                        * top of PROT_NONE. Skip only entries already in that
+                        * exact state; plain PROT_NONE from mprotect() still needs
+                        * to be promoted so future faults can be distinguished.
+                        */
+                       if (uffd_rwp && pte_protnone(oldpte) && pte_uffd(oldpte))
+                               continue;
 
                        page = vm_normal_page(vma, addr, oldpte);
                        if (page)
@@ -358,6 +369,8 @@ static long change_pte_range(struct mmu_gather *tlb,
                        /*
                         * Avoid trapping faults against the zero or KSM
                         * pages. See similar comment in change_huge_pmd.
+                        * Skip this filter for uffd RWP which
+                        * must set protnone regardless of NUMA placement.
                         */
                        if (prot_numa &&
                            !folio_can_map_prot_numa(folio, vma,
@@ -667,7 +680,16 @@ long change_protection(struct mmu_gather *tlb,
        pgprot_t newprot = vma->vm_page_prot;
        long pages;
 
-       BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL);
+       /*
+        * MM_CP_UFFD_{WP,RWP} and _RESOLVE are mutually exclusive within one
+        * change, and WP and RWP cannot mix. Miswired callers get a warn and
+        * a no-op; userspace cannot reach this state.
+        */
+       if (WARN_ON_ONCE((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL ||
+                        (cp_flags & MM_CP_UFFD_RWP_ALL) == MM_CP_UFFD_RWP_ALL ||
+                        ((cp_flags & MM_CP_UFFD_WP_ALL) &&
+                         (cp_flags & MM_CP_UFFD_RWP_ALL))))
+               return 0;
 
 #ifdef CONFIG_NUMA_BALANCING
        /*
@@ -681,6 +703,10 @@ long change_protection(struct mmu_gather *tlb,
        WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
 #endif
 
+       if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_PROTNONE) &&
+           (cp_flags & MM_CP_UFFD_RWP))
+               newprot = PAGE_NONE;
+
        if (is_vm_hugetlb_page(vma))
                pages = hugetlb_change_protection(vma, start, end, newprot,
                                                  cp_flags);
-- 
2.51.2


Reply via email to