This feature allows userspace to intercept "minor" faults. By "minor"
faults, I mean the following situation:

Let there exist two mappings (i.e., VMAs) to the same page(s). One of
the mappings is registered with userfaultfd (in minor mode), and the
other is not. Via the non-UFFD mapping, the underlying pages have
already been allocated & filled with some contents. The UFFD mapping
has not yet been faulted in; when it is touched for the first time,
this results in what I'm calling a "minor" fault. As a concrete
example, when working with hugetlbfs, we have huge_pte_none(), but
find_lock_page() finds an existing page.

This commit adds the new feature flag used to enable this behavior. In
the hugetlb fault path, if we find that we have huge_pte_none(), but
find_lock_page() does indeed find an existing page, then we have a
"minor" fault, and if the VMA is UFFD-registered (with VM_UFFD_MISSING),
*and* this feature is enabled, we call into userfaultfd to handle it.

Why not add a new registration mode instead? After all, this being a
feature flag instead has drawbacks:

- You can't handle *only* minor faults, but *not* missing faults.
- This is a per-FD option, not a per-registration option, so if you want
  minor faults for some VMAs but not others, you need to open a separate
  FD for those two configurations.
- The userfaultfd_minor() check is more expensive, as we have to examine
  the userfaultfd_ctx.
- handle_userfault()'s "reason" argument is no longer 1:1 with VM_*
  flags, which has to be dealt with (complexity).

Basically, it comes down to the fact that we can't really add a new VM_*
flag. There are no unused bits left. :) With the current design of UFFD,
we don't write down the requested registration mode anywhere except this
flag either - there isn't any extended context we can check. So, I think
this is the only way.

Signed-off-by: Axel Rasmussen <[email protected]>
---
 fs/userfaultfd.c                 | 37 ++++++++++++++++++++------------
 include/linux/mm.h               |  2 +-
 include/linux/userfaultfd_k.h    |  9 ++++++++
 include/uapi/linux/userfaultfd.h | 15 +++++++++----
 mm/hugetlb.c                     | 32 +++++++++++++++++++++++++++
 5 files changed, 76 insertions(+), 19 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d663eae0266..edfdb8f1c740 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -178,6 +178,18 @@ static void userfaultfd_ctx_put(struct userfaultfd_ctx 
*ctx)
        }
 }
 
+bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+       struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
+       unsigned int features = ctx ? ctx->features : 0;
+       bool minor_hugetlbfs = (features & UFFD_FEATURE_MINOR_HUGETLBFS);
+
+       if (!userfaultfd_missing(vma))
+               return false;
+
+       return is_vm_hugetlb_page(vma) && minor_hugetlbfs;
+}
+
 static inline void msg_init(struct uffd_msg *msg)
 {
        BUILD_BUG_ON(sizeof(struct uffd_msg) != 32);
@@ -197,24 +209,21 @@ static inline struct uffd_msg userfault_msg(unsigned long 
address,
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;
        msg.arg.pagefault.address = address;
+       /*
+        * These flags indicate why the userfault occurred:
+        * - UFFD_PAGEFAULT_FLAG_WP indicates a write protect fault.
+        * - UFFD_PAGEFAULT_FLAG_MINOR indicates a minor fault.
+        * - Neither of these flags being set indicates a MISSING fault.
+        *
+        * Separately, UFFD_PAGEFAULT_FLAG_WRITE indicates it was a write
+        * fault. Otherwise, it was a read fault.
+        */
        if (flags & FAULT_FLAG_WRITE)
-               /*
-                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WRITE
-                * was not set in a UFFD_EVENT_PAGEFAULT, it means it
-                * was a read fault, otherwise if set it means it's
-                * a write fault.
-                */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
        if (reason == UFFD_REASON_WP)
-               /*
-                * If UFFD_FEATURE_PAGEFAULT_FLAG_WP was set in the
-                * uffdio_api.features and UFFD_PAGEFAULT_FLAG_WP was
-                * not set in a UFFD_EVENT_PAGEFAULT, it means it was
-                * a missing fault, otherwise if set it means it's a
-                * write protect fault.
-                */
                msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+       if (reason == UFFD_REASON_MINOR)
+               msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
        if (features & UFFD_FEATURE_THREAD_ID)
                msg.arg.pagefault.feat.ptid = task_pid_vnr(current);
        return msg;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 89fca443e6f1..3ddc465e31b0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -272,7 +272,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_MAYSHARE    0x00000080
 
 #define VM_GROWSDOWN   0x00000100      /* general info on the segment */
-#define VM_UFFD_MISSING        0x00000200      /* missing pages tracking */
+#define VM_UFFD_MISSING        0x00000200      /* missing or minor fault 
tracking */
 #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct 
page", just pure PFN */
 #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
 #define VM_UFFD_WP     0x00001000      /* wrprotect pages tracking */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index cc1554e7162f..4e03268c65ec 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -15,6 +15,8 @@ enum uffd_trigger_reason {
        UFFD_REASON_MISSING,
        /* A write protect fault occurred. */
        UFFD_REASON_WP,
+       /* A minor fault occurred. */
+       UFFD_REASON_MINOR,
 };
 
 #ifdef CONFIG_USERFAULTFD
@@ -79,6 +81,8 @@ static inline bool userfaultfd_wp(struct vm_area_struct *vma)
        return vma->vm_flags & VM_UFFD_WP;
 }
 
+bool userfaultfd_minor(struct vm_area_struct *vma);
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
                                      pte_t pte)
 {
@@ -140,6 +144,11 @@ static inline bool userfaultfd_wp(struct vm_area_struct 
*vma)
        return false;
 }
 
+static inline bool userfaultfd_minor(struct vm_area_struct *vma)
+{
+       return false;
+}
+
 static inline bool userfaultfd_pte_wp(struct vm_area_struct *vma,
                                      pte_t pte)
 {
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 5f2d88212f7c..6b038d56bca7 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -22,12 +22,13 @@
 #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP |    \
                           UFFD_FEATURE_EVENT_FORK |            \
                           UFFD_FEATURE_EVENT_REMAP |           \
-                          UFFD_FEATURE_EVENT_REMOVE |  \
+                          UFFD_FEATURE_EVENT_REMOVE |          \
                           UFFD_FEATURE_EVENT_UNMAP |           \
                           UFFD_FEATURE_MISSING_HUGETLBFS |     \
                           UFFD_FEATURE_MISSING_SHMEM |         \
                           UFFD_FEATURE_SIGBUS |                \
-                          UFFD_FEATURE_THREAD_ID)
+                          UFFD_FEATURE_THREAD_ID |             \
+                          UFFD_FEATURE_MINOR_HUGETLBFS)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
@@ -125,8 +126,9 @@ struct uffd_msg {
 #define UFFD_EVENT_UNMAP       0x16
 
 /* flags for UFFD_EVENT_PAGEFAULT */
-#define UFFD_PAGEFAULT_FLAG_WRITE      (1<<0)  /* If this was a write fault */
-#define UFFD_PAGEFAULT_FLAG_WP         (1<<1)  /* If reason is VM_UFFD_WP */
+#define UFFD_PAGEFAULT_FLAG_WRITE      (1<<0)  /* write fault */
+#define UFFD_PAGEFAULT_FLAG_WP         (1<<1)  /* write-protect fault */
+#define UFFD_PAGEFAULT_FLAG_MINOR      (1<<2)  /* minor fault */
 
 struct uffdio_api {
        /* userland asks for an API number and the features to enable */
@@ -171,6 +173,10 @@ struct uffdio_api {
         *
         * UFFD_FEATURE_THREAD_ID pid of the page faulted task_struct will
         * be returned, if feature is not requested 0 will be returned.
+        *
+        * If requested, UFFD_FEATURE_MINOR_HUGETLBFS indicates that hugetlbfs
+        * memory registered with REGISTER_MODE_MISSING will *also* receive
+        * events for minor faults, not just missing faults.
         */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
@@ -181,6 +187,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_EVENT_UNMAP               (1<<6)
 #define UFFD_FEATURE_SIGBUS                    (1<<7)
 #define UFFD_FEATURE_THREAD_ID                 (1<<8)
+#define UFFD_FEATURE_MINOR_HUGETLBFS           (1<<9)
        __u64 features;
 
        __u64 ioctls;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2a90e0b4bf47..93307fb058b7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4366,6 +4366,38 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                        goto backout_unlocked;
                }
+
+               /* Check for page in userfault range. */
+               if (userfaultfd_minor(vma)) {
+                       u32 hash;
+                       struct vm_fault vmf = {
+                               .vma = vma,
+                               .address = haddr,
+                               .flags = flags,
+                               /*
+                                * Hard to debug if it ends up being used by a
+                                * callee that assumes something about the
+                                * other uninitialized fields... same as in
+                                * memory.c
+                                */
+                       };
+
+                       unlock_page(page);
+
+                       /*
+                        * hugetlb_fault_mutex and i_mmap_rwsem must be dropped
+                        * before handling userfault.  Reacquire after handling
+                        * fault to make calling code simpler.
+                        */
+
+                       hash = hugetlb_fault_mutex_hash(mapping, idx);
+                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       i_mmap_unlock_read(mapping);
+                       ret = handle_userfault(&vmf, UFFD_REASON_MINOR);
+                       i_mmap_lock_read(mapping);
+                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       goto out;
+               }
        }
 
        /*
-- 
2.30.0.478.g8a0d178c01-goog

Reply via email to