Now that mmap_prepare allows us to perform actions once the VMA is
established, use desc->action.success_hook to allocate the hugetlb VMA lock
after the VMA is in place.

We also make the changes throughout hugetlbfs and hugetlb required to make
this possible.

Signed-off-by: Lorenzo Stoakes <lorenzo.stoa...@oracle.com>
---
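
Note (not part of the commit message): an illustrative sketch of the
mmap_prepare + success hook pattern adopted below. The example_* names are
hypothetical; the hook signature (taking the established VMA and returning
int) matches hugetlb_vma_lock_alloc() in this patch.

static int example_success_hook(struct vm_area_struct *vma)
{
	/*
	 * Runs only after the VMA has been fully established, so per-VMA
	 * state (as hugetlb_vma_lock_alloc() does here) can be set up safely.
	 */
	return 0;
}

static int example_mmap_prepare(struct vm_area_desc *desc)
{
	/* Flags and vm_ops are set on the descriptor, not on a VMA. */
	desc->vm_flags |= VM_DONTEXPAND;
	desc->vm_ops = &example_vm_ops;	/* hypothetical ops */

	/* Defer work that needs the VMA until it exists. */
	desc->action.success_hook = example_success_hook;
	return 0;
}
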
 fs/hugetlbfs/inode.c           | 30 +++++++------
 include/linux/hugetlb.h        |  9 +++-
 include/linux/hugetlb_inline.h | 15 ++++---
 mm/hugetlb.c                   | 77 ++++++++++++++++++++--------------
 4 files changed, 79 insertions(+), 52 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 3cfdf4091001..026bcc65bb79 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -96,8 +96,9 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
 #define PGOFF_LOFFT_MAX \
        (((1UL << (PAGE_SHIFT + 1)) - 1) <<  (BITS_PER_LONG - (PAGE_SHIFT + 1)))
 
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
 {
+       struct file *file = desc->file;
        struct inode *inode = file_inode(file);
        loff_t len, vma_len;
        int ret;
@@ -112,8 +113,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         * way when do_mmap unwinds (may be important on powerpc
         * and ia64).
         */
-       vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
-       vma->vm_ops = &hugetlb_vm_ops;
+       desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+       desc->vm_ops = &hugetlb_vm_ops;
 
        /*
         * page based offset in vm_pgoff could be sufficiently large to
@@ -122,16 +123,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         * sizeof(unsigned long).  So, only check in those instances.
         */
        if (sizeof(unsigned long) == sizeof(loff_t)) {
-               if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+               if (desc->pgoff & PGOFF_LOFFT_MAX)
                        return -EINVAL;
        }
 
        /* must be huge page aligned */
-       if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+       if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
                return -EINVAL;
 
-       vma_len = (loff_t)(vma->vm_end - vma->vm_start);
-       len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+       vma_len = (loff_t)vma_desc_size(desc);
+       len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
        /* check for overflow */
        if (len < vma_len)
                return -EINVAL;
@@ -141,7 +142,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
        ret = -ENOMEM;
 
-       vm_flags = vma->vm_flags;
+       vm_flags = desc->vm_flags;
        /*
         * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
         * reserving here. Note: only for SHM hugetlbfs file, the inode
@@ -151,17 +152,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
                vm_flags |= VM_NORESERVE;
 
        if (hugetlb_reserve_pages(inode,
-                               vma->vm_pgoff >> huge_page_order(h),
-                               len >> huge_page_shift(h), vma,
-                               vm_flags) < 0)
+                       desc->pgoff >> huge_page_order(h),
+                       len >> huge_page_shift(h), desc,
+                       vm_flags) < 0)
                goto out;
 
        ret = 0;
-       if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+       if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
                i_size_write(inode, len);
 out:
        inode_unlock(inode);
 
+       /* Allocate the VMA lock after we set it up. */
+       if (!ret)
+               desc->action.success_hook = hugetlb_vma_lock_alloc;
        return ret;
 }
 
@@ -1219,7 +1223,7 @@ static void init_once(void *foo)
 
 static const struct file_operations hugetlbfs_file_operations = {
        .read_iter              = hugetlbfs_read_iter,
-       .mmap                   = hugetlbfs_file_mmap,
+       .mmap_prepare           = hugetlbfs_file_mmap_prepare,
        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
        .llseek                 = default_llseek,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 526d27e88b3b..b39f2b70ccab 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -150,8 +150,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
                             struct folio **foliop);
 #endif /* CONFIG_USERFAULTFD */
 long hugetlb_reserve_pages(struct inode *inode, long from, long to,
-                                               struct vm_area_struct *vma,
-                                               vm_flags_t vm_flags);
+                          struct vm_area_desc *desc, vm_flags_t vm_flags);
 long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
                                                long freed);
 bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list);
@@ -280,6 +279,7 @@ bool is_hugetlb_entry_hwpoisoned(pte_t pte);
 void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
 void fixup_hugetlb_reservations(struct vm_area_struct *vma);
 void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -466,6 +466,11 @@ static inline void fixup_hugetlb_reservations(struct vm_area_struct *vma)
 
 static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}
 
+static inline int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+       return 0;
+}
+
 #endif /* !CONFIG_HUGETLB_PAGE */
 
 #ifndef pgd_write
diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 0660a03d37d9..a27aa0162918 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -2,22 +2,27 @@
 #ifndef _LINUX_HUGETLB_INLINE_H
 #define _LINUX_HUGETLB_INLINE_H
 
-#ifdef CONFIG_HUGETLB_PAGE
-
 #include <linux/mm.h>
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+#ifdef CONFIG_HUGETLB_PAGE
+
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
-       return !!(vma->vm_flags & VM_HUGETLB);
+       return !!(vm_flags & VM_HUGETLB);
 }
 
 #else
 
-static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_flags(vm_flags_t vm_flags)
 {
        return false;
 }
 
 #endif
 
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+{
+       return is_vm_hugetlb_flags(vma->vm_flags);
+}
+
 #endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d812ad8f0b9f..cb6eda43cb7f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -119,7 +119,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
                unsigned long start, unsigned long end, bool take_locks);
@@ -417,17 +416,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
        }
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
        struct hugetlb_vma_lock *vma_lock;
 
        /* Only establish in (flags) sharable vmas */
        if (!vma || !(vma->vm_flags & VM_MAYSHARE))
-               return;
+               return 0;
 
        /* Should never get here with non-NULL vm_private_data */
        if (vma->vm_private_data)
-               return;
+               return -EINVAL;
 
        vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
        if (!vma_lock) {
@@ -442,13 +445,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
                 * allocation failure.
                 */
                pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
-               return;
+               return -EINVAL;
        }
 
        kref_init(&vma_lock->refs);
        init_rwsem(&vma_lock->rw_sema);
        vma_lock->vma = vma;
        vma->vm_private_data = vma_lock;
+
+       return 0;
 }
 
 /* Helper that removes a struct file_region from the resv_map cache and returns
@@ -1180,20 +1185,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
        }
 }
 
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 {
-       VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-       VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+       VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+       VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 
-       set_vma_private_data(vma, (unsigned long)map);
+       set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 }
 
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
 {
-       VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-       VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+       VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+       VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
 
-       set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+       desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+       VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+       VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+       desc->private_data = (void *)((unsigned long)desc->private_data | flags);
 }
 
 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
@@ -1203,6 +1216,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
        return (get_vma_private_data(vma) & flag) != 0;
 }
 
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+       VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+       return ((unsigned long)desc->private_data) & flag;
+}
+
 bool __vma_private_lock(struct vm_area_struct *vma)
 {
        return !(vma->vm_flags & VM_MAYSHARE) &&
@@ -7225,9 +7245,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
  */
 
 long hugetlb_reserve_pages(struct inode *inode,
-                                       long from, long to,
-                                       struct vm_area_struct *vma,
-                                       vm_flags_t vm_flags)
+               long from, long to,
+               struct vm_area_desc *desc,
+               vm_flags_t vm_flags)
 {
        long chg = -1, add = -1, spool_resv, gbl_resv;
        struct hstate *h = hstate_inode(inode);
@@ -7242,12 +7262,6 @@ long hugetlb_reserve_pages(struct inode *inode,
                return -EINVAL;
        }
 
-       /*
-        * vma specific semaphore used for pmd sharing and fault/truncation
-        * synchronization
-        */
-       hugetlb_vma_lock_alloc(vma);
-
        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
@@ -7260,9 +7274,9 @@ long hugetlb_reserve_pages(struct inode *inode,
         * Shared mappings base their reservation on the number of pages that
         * are already allocated on behalf of the file. Private mappings need
         * to reserve the full area even if read-only as mprotect() may be
-        * called to make the mapping read-write. Assume !vma is a shm mapping
+        * called to make the mapping read-write. Assume !desc is a shm mapping
         */
-       if (!vma || vma->vm_flags & VM_MAYSHARE) {
+       if (!desc || desc->vm_flags & VM_MAYSHARE) {
                /*
                 * resv_map can not be NULL as hugetlb_reserve_pages is only
                 * called for inodes for which resv_maps were created (see
@@ -7279,8 +7293,8 @@ long hugetlb_reserve_pages(struct inode *inode,
 
                chg = to - from;
 
-               set_vma_resv_map(vma, resv_map);
-               set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+               set_vma_desc_resv_map(desc, resv_map);
+               set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
        }
 
        if (chg < 0)
@@ -7290,7 +7304,7 @@ long hugetlb_reserve_pages(struct inode *inode,
                                chg * pages_per_huge_page(h), &h_cg) < 0)
                goto out_err;
 
-       if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+       if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
                /* For private mappings, the hugetlb_cgroup uncharge info hangs
                 * of the resv_map.
                 */
@@ -7324,7 +7338,7 @@ long hugetlb_reserve_pages(struct inode *inode,
         * consumed reservations are stored in the map. Hence, nothing
         * else has to be done for private mappings here
         */
-       if (!vma || vma->vm_flags & VM_MAYSHARE) {
+       if (!desc || desc->vm_flags & VM_MAYSHARE) {
                add = region_add(resv_map, from, to, regions_needed, h, h_cg);
 
                if (unlikely(add < 0)) {
@@ -7378,16 +7392,15 @@ long hugetlb_reserve_pages(struct inode *inode,
        hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                            chg * pages_per_huge_page(h), h_cg);
 out_err:
-       hugetlb_vma_lock_free(vma);
-       if (!vma || vma->vm_flags & VM_MAYSHARE)
+       if (!desc || desc->vm_flags & VM_MAYSHARE)
                /* Only call region_abort if the region_chg succeeded but the
                 * region_add failed or didn't run.
                 */
                if (chg >= 0 && add < 0)
                        region_abort(resv_map, from, to, regions_needed);
-       if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+       if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
                kref_put(&resv_map->refs, resv_map_release);
-               set_vma_resv_map(vma, NULL);
+               set_vma_desc_resv_map(desc, NULL);
        }
        return chg < 0 ? chg : add < 0 ? add : -EINVAL;
 }
-- 
2.51.0

