For shared mappings, the pointer to the hugetlb_cgroup to uncharge lives
in the resv_map entries, in file_region->reservation_counter.

When a file_region entry is added to the resv_map via region_add, we
also charge the appropriate hugetlb_cgroup and put the pointer to that
in file_region->reservation_counter. This is slightly delicate since we
need to not modify the resv_map until we know that charging the
reservation has succeeded. If charging doesn't succeed, we report the
error to the caller, so that the kernel fails the reservation.

On region_del, which is when the hugetlb memory is unreserved, we delete
the file_region entry in the resv_map, but also uncharge the
file_region->reservation_counter.

region_add() and region_chg() are heavily refactored in this commit
to make the code easier to understand and remove duplication.

---
 mm/hugetlb.c | 443 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 280 insertions(+), 163 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 7c2df7574cf50..953e93359f021 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -242,208 +242,276 @@ struct file_region {
        struct list_head link;
        long from;
        long to;
+#ifdef CONFIG_CGROUP_HUGETLB
+       /*
+        * On shared mappings, each reserved region appears as a struct
+        * file_region in resv_map. These fields hold the info needed to
+        * uncharge each reservation.
+        */
+       struct page_counter *reservation_counter;
+       unsigned long pages_per_hpage;
+#endif
 };

-/*
- * Add the huge page range represented by [f, t) to the reserve
- * map.  In the normal case, existing regions will be expanded
- * to accommodate the specified range.  Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range.  However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded.  In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
- *
- * Return the number of new huge pages added to the map.  This
- * number is greater than or equal to zero.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
  */
-static long region_add(struct resv_map *resv, long f, long t)
+static struct file_region *get_file_region_entry_from_cache(
+               struct resv_map *resv, long from, long to)
 {
-       struct list_head *head = &resv->regions;
-       struct file_region *rg, *nrg, *trg;
-       long add = 0;
+       struct file_region *nrg = NULL;

-       spin_lock(&resv->lock);
-       /* Locate the region we are either in or before. */
-       list_for_each_entry(rg, head, link)
-               if (f <= rg->to)
-                       break;
+       VM_BUG_ON(resv->region_cache_count <= 0);

-       /*
-        * If no region exists which can be expanded to include the
-        * specified range, the list must have been modified by an
-        * interleving call to region_del().  Pull a region descriptor
-        * from the cache and use it for this range.
-        */
-       if (&rg->link == head || t < rg->from) {
-               VM_BUG_ON(resv->region_cache_count <= 0);
+       resv->region_cache_count--;
+       nrg = list_first_entry(&resv->region_cache, struct file_region,
+                       link);
+       VM_BUG_ON(!nrg);
+       list_del(&nrg->link);

-               resv->region_cache_count--;
-               nrg = list_first_entry(&resv->region_cache, struct file_region,
-                                       link);
-               list_del(&nrg->link);
+       nrg->from = from;
+       nrg->to = to;

-               nrg->from = f;
-               nrg->to = t;
-               list_add(&nrg->link, rg->link.prev);
+       return nrg;
+}

-               add += t - f;
-               goto out_locked;
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+               struct file_region *nrg, struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       if (h_cg) {
+               nrg->reservation_counter =
+                       &h_cg->reserved_hugepage[hstate_index(h)];
+               nrg->pages_per_hpage = pages_per_huge_page(h);
        }
+#endif
+}

-       /* Round our left edge to the current segment if it encloses us. */
-       if (f > rg->from)
-               f = rg->from;
+/* Must be called with resv->lock held. Calling this with dry_run == true will
+ * count the number of pages to be added but will not modify the linked list.
+ */
+static long add_reservations_in_range(struct resv_map *resv,
+               struct list_head *head, long f, long t,
+               struct hugetlb_cgroup *h_cg,
+               struct hstate *h,
+               bool dry_run)
+{
+       long add = 0;
+       long last_accounted_offset = f;
+       struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;

-       /* Check for and consume any regions we now overlap with. */
-       nrg = rg;
-       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-               if (&rg->link == head)
-                       break;
+       /* In this loop, we essentially handle an entry for the range
+        * last_accounted_offset -> rg->from, at every iteration, with some
+        * bounds checking.
+        */
+       list_for_each_entry_safe(rg, trg, head, link) {
+               /* Skip irrelevant regions that start before our range. */
+               if (rg->from < f) {
+                       /* If this region ends after the last accounted offset,
+                        * then we need to update last_accounted_offset.
+                        */
+                       if (rg->to > last_accounted_offset)
+                               last_accounted_offset = rg->to;
+                       continue;
+               }
+
+               /* When we find a region that starts beyond our range, we've
+                * finished.
+                */
                if (rg->from > t)
                        break;

-               /* If this area reaches higher then extend our area to
-                * include it completely.  If this is not the first area
-                * which we intend to reuse, free it. */
-               if (rg->to > t)
-                       t = rg->to;
-               if (rg != nrg) {
-                       /* Decrement return value by the deleted range.
-                        * Another range will span this area so that by
-                        * end of routine add will be >= zero
-                        */
-                       add -= (rg->to - rg->from);
-                       list_del(&rg->link);
-                       kfree(rg);
+               /* Add an entry for last_accounted_offset -> rg->from, and
+                * update last_accounted_offset.
+                */
+               if (rg->from > last_accounted_offset) {
+                       add += rg->from - last_accounted_offset;
+                       if (!dry_run) {
+                               nrg = get_file_region_entry_from_cache(resv,
+                                               last_accounted_offset,
+                                               rg->from);
+                               record_hugetlb_cgroup_uncharge_info(h_cg, nrg,
+                                               h);
+                               list_add(&nrg->link, rg->link.prev);
+                       }
+               }
+
+               last_accounted_offset = rg->to;
+
+               if (!dry_run) {
+                       if (rg->from == rg->to) {
+                               list_del(&rg->link);
+                               kfree(rg);
+                       }
                }
        }

-       add += (nrg->from - f);         /* Added to beginning of region */
-       nrg->from = f;
-       add += t - nrg->to;             /* Added to end of region */
-       nrg->to = t;
+       /* Handle the case where our range extends beyond
+        * last_accounted_offset.
+        */
+       if (last_accounted_offset < t) {
+               add += t - last_accounted_offset;
+               if (!dry_run) {
+                       nrg = get_file_region_entry_from_cache(resv,
+                                       last_accounted_offset, t);
+                       record_hugetlb_cgroup_uncharge_info(h_cg, nrg, h);
+                       list_add(&nrg->link, rg->link.prev);
+               }
+               last_accounted_offset = t;
+       }

-out_locked:
-       resv->adds_in_progress--;
-       spin_unlock(&resv->lock);
-       VM_BUG_ON(add < 0);
        return add;
 }

-/*
- * Examine the existing reserve map and determine how many
- * huge pages in the specified range [f, t) are NOT currently
- * represented.  This routine is called before a subsequent
- * call to region_add that will actually modify the reserve
- * map to add the specified range [f, t).  region_chg does
- * not change the number of huge pages represented by the
- * map.  However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder.  This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
- *
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map.  If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+static int charge_cgroup_if_shared_mapping(struct resv_map *resv,
+               struct hstate *h, long nr_pages, struct hugetlb_cgroup **h_cg)
+{
+       int ret = 0;
+#ifdef CONFIG_CGROUP_HUGETLB
+       /*
+        * If resv->reservation_counter is NULL, then it means this is
+        * a shared mapping, and hugetlb cgroup accounting should be
+        * done on the file_region entries inside resv_map.
+        */
+       if (!resv->reservation_counter) {
+               ret = hugetlb_cgroup_charge_cgroup(
+                               hstate_index(h),
+                               nr_pages * pages_per_huge_page(h),
+                               h_cg, true);
+       }
+#endif
+       return ret;
+}
+
+/* This function will examine resv_map and determine how many huge pages are
+ * NOT currently represented. Then it will make sure resv->region_cache_count
+ * has enough entries in it to satisfy a following add_reservations_in_range
+ * call.
  *
- * Returns the number of huge pages that need to be added to the existing
- * reservation map for the range [f, t).  This number is greater or equal to
- * zero.  -ENOMEM is returned if a new file_region structure or cache entry
- * is needed and can not be allocated.
+ * Returns the number of huge pages NOT represented on success, with
+ * resv->lock held. Returns -ENOMEM if it needs to allocate a region_cache item
+ * and fails to do so, with the lock NOT held.
  */
-static long region_chg(struct resv_map *resv, long f, long t)
+static long allocate_enough_cache_for_range_and_lock(struct resv_map *resv,
+               long f, long t)
 {
        struct list_head *head = &resv->regions;
-       struct file_region *rg, *nrg = NULL;
+       struct file_region *trg = NULL;
        long chg = 0;

 retry:
        spin_lock(&resv->lock);
-retry_locked:
-       resv->adds_in_progress++;
+
+       /* Count how many hugepages in this range are NOT represented. */
+       chg = add_reservations_in_range(resv, head, f, t, NULL, NULL, true);

        /*
         * Check for sufficient descriptors in the cache to accommodate
-        * the number of in progress add operations.
+        * the number of in progress add operations. There must be at least
+        * 1 extra in the cache.
         */
-       if (resv->adds_in_progress > resv->region_cache_count) {
-               struct file_region *trg;
-
-               VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+       if (resv->region_cache_count < chg + 1) {
                /* Must drop lock to allocate a new descriptor. */
-               resv->adds_in_progress--;
                spin_unlock(&resv->lock);

-               trg = kmalloc(sizeof(*trg), GFP_KERNEL);
-               if (!trg) {
-                       kfree(nrg);
-                       return -ENOMEM;
-               }
+               while (resv->region_cache_count < chg + 1) {
+                       trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+                       if (!trg)
+                               return -ENOMEM;

-               spin_lock(&resv->lock);
-               list_add(&trg->link, &resv->region_cache);
-               resv->region_cache_count++;
-               goto retry_locked;
+                       spin_lock(&resv->lock);
+                       list_add(&trg->link, &resv->region_cache);
+                       resv->region_cache_count++;
+                       spin_unlock(&resv->lock);
+               }
+               goto retry;
        }

-       /* Locate the region we are before or in. */
-       list_for_each_entry(rg, head, link)
-               if (f <= rg->to)
-                       break;
+       return chg;
+}

-       /* If we are below the current region then a new region is required.
-        * Subtle, allocate a new region at the position but make it zero
-        * size such that we can guarantee to record the reservation. */
-       if (&rg->link == head || t < rg->from) {
-               if (!nrg) {
-                       resv->adds_in_progress--;
-                       spin_unlock(&resv->lock);
-                       nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
-                       if (!nrg)
-                               return -ENOMEM;
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map.  In the normal case, existing regions will be taken off
+ * the cache to accommodate the specified range.  Sufficient
+ * regions should exist in the cache due to the previous call
+ * to region_chg with the same range, but we still check we have
+ * enough regions in the cache anyway, since something else could
+ * have consumed our regions.
+ *
+ * Return the number of new huge pages added to the map.  This
+ * number is greater than or equal to zero.  If for some reason
+ * we don't have enough entries in the cache, try to allocate
+ * more regions, and fail, we return -ENOMEM.
+ */
+static long region_add(struct hstate *h, struct resv_map *resv, long f, long t)
+{
+       struct list_head *head = &resv->regions;
+       long chg = 0, add = 0;
+       struct hugetlb_cgroup *h_cg = NULL;
+       int ret = 0;

-                       nrg->from = f;
-                       nrg->to   = f;
-                       INIT_LIST_HEAD(&nrg->link);
-                       goto retry;
-               }
+       /* Count how many charges we will need to do. Locks resv->lock on
+        * success.
+        */
+       chg = allocate_enough_cache_for_range_and_lock(resv, f, t);

-               list_add(&nrg->link, rg->link.prev);
-               chg = t - f;
-               goto out_nrg;
+       if (chg < 0) {
+               ret = chg;
+               spin_lock(&resv->lock);
+               goto out_locked;
        }

-       /* Round our left edge to the current segment if it encloses us. */
-       if (f > rg->from)
-               f = rg->from;
-       chg = t - f;
+       ret = charge_cgroup_if_shared_mapping(resv, h, chg, &h_cg);

-       /* Check for and consume any regions we now overlap with. */
-       list_for_each_entry(rg, rg->link.prev, link) {
-               if (&rg->link == head)
-                       break;
-               if (rg->from > t)
-                       goto out;
+       if (ret)
+               goto out_locked;

-               /* We overlap with this area, if it extends further than
-                * us then we must extend ourselves.  Account for its
-                * existing reservation. */
-               if (rg->to > t) {
-                       chg += rg->to - t;
-                       t = rg->to;
-               }
-               chg -= rg->to - rg->from;
-       }
+       add = add_reservations_in_range(resv, head, f, t, h_cg, h,
+                       false);

-out:
+       /*
+        * If these aren't equal, then there is a bug with
+        * add_reservations_in_range, and we're charging the wrong amount
+        * of memory. This should never happen as we are holding the lock
+        * between the 2 add_reservations_in_range calls.
+        */
+       WARN_ON(add != chg);
+
+out_locked:
+       resv->adds_in_progress = 0;
        spin_unlock(&resv->lock);
-       /*  We already know we raced and no longer need the new region */
-       kfree(nrg);
-       return chg;
-out_nrg:
+       if (ret)
+               return ret;
+       VM_BUG_ON(add < 0);
+       return add;
+}
+
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented.  This routine is called before a subsequent
+ * call to region_add, and fills region_cache with enough
+ * entries to add the specified range [f, t).  region_chg does
+ * not change the number of huge pages represented by the
+ * map.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t).  This number is greater or equal to
+ * zero.  -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
+ */
+static long region_chg(struct resv_map *resv, long f, long t)
+{
+       long chg = allocate_enough_cache_for_range_and_lock(resv,
+                       f, t);
+
+       if (chg < 0)
+               return chg;
+
+       resv->adds_in_progress = chg;
+
        spin_unlock(&resv->lock);
        return chg;
 }
@@ -463,10 +531,43 @@ static void region_abort(struct resv_map *resv, long f, long t)
 {
        spin_lock(&resv->lock);
        VM_BUG_ON(!resv->region_cache_count);
-       resv->adds_in_progress--;
+       resv->adds_in_progress = 0;
        spin_unlock(&resv->lock);
 }

+static void get_hugetlb_cgroup_info(struct page_counter **reservation_counter,
+               unsigned long *pages_per_hpage, struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       /*
+        * Save counter information from the deleted
+        * node, in case we need to do an uncharge.
+        */
+       *reservation_counter = nrg->reservation_counter;
+       *pages_per_hpage = nrg->pages_per_hpage;
+#endif
+}
+
+static void uncharge_cgroup_if_shared_mapping(struct resv_map *resv,
+               struct page_counter *reservation_counter,
+               unsigned long pages_per_hpage,
+               unsigned long nr_pages)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       /*
+        * If resv->reservation_counter is NULL, then this is a shared
+        * reservation, and the reserved memory is tracked in the file_region
+        * entries inside of resv_map. So we need to uncharge the memory here.
+        */
+       if (reservation_counter && pages_per_hpage && nr_pages > 0 &&
+           !resv->reservation_counter) {
+               hugetlb_cgroup_uncharge_counter(
+                               reservation_counter,
+                               nr_pages * pages_per_hpage);
+       }
+#endif
+}
+
 /*
  * Delete the specified range [f, t) from the reserve map.  If the
  * t parameter is LONG_MAX, this indicates that ALL regions after f
@@ -487,6 +588,8 @@ static long region_del(struct resv_map *resv, long f, long t)
        struct file_region *rg, *trg;
        struct file_region *nrg = NULL;
        long del = 0;
+       struct page_counter *reservation_counter = NULL;
+       unsigned long pages_per_hpage = 0;

 retry:
        spin_lock(&resv->lock);
@@ -543,6 +646,9 @@ static long region_del(struct resv_map *resv, long f, long t)

                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
+                       get_hugetlb_cgroup_info(&reservation_counter,
+                                       &pages_per_hpage,
+                                       rg);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
@@ -559,6 +665,9 @@ static long region_del(struct resv_map *resv, long f, long t)

        spin_unlock(&resv->lock);
        kfree(nrg);
+
+       uncharge_cgroup_if_shared_mapping(resv, reservation_counter,
+                       pages_per_hpage, del);
        return del;
 }

@@ -1930,7 +2039,7 @@ static long __vma_reservation_common(struct hstate *h,
                ret = region_chg(resv, idx, idx + 1);
                break;
        case VMA_COMMIT_RESV:
-               ret = region_add(resv, idx, idx + 1);
+               ret = region_add(h, resv, idx, idx + 1);
                break;
        case VMA_END_RESV:
                region_abort(resv, idx, idx + 1);
@@ -1938,7 +2047,7 @@ static long __vma_reservation_common(struct hstate *h,
                break;
        case VMA_ADD_RESV:
                if (vma->vm_flags & VM_MAYSHARE)
-                       ret = region_add(resv, idx, idx + 1);
+                       ret = region_add(h, resv, idx, idx + 1);
                else {
                        region_abort(resv, idx, idx + 1);
                        ret = region_del(resv, idx, idx + 1);
@@ -4555,7 +4664,7 @@ int hugetlb_reserve_pages(struct inode *inode,
                                        struct vm_area_struct *vma,
                                        vm_flags_t vm_flags)
 {
-       long ret, chg;
+       long ret, chg, add;
        struct hstate *h = hstate_inode(inode);
        struct hugepage_subpool *spool = subpool_inode(inode);
        struct resv_map *resv_map;
@@ -4643,9 +4752,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         */
        ret = hugetlb_acct_memory(h, gbl_reserve);
        if (ret < 0) {
-               /* put back original number of pages, chg */
-               (void)hugepage_subpool_put_pages(spool, chg);
-               goto out_err;
+               goto out_put_pages;
        }

        /*
@@ -4660,7 +4767,12 @@ int hugetlb_reserve_pages(struct inode *inode,
         * else has to be done for private mappings here
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
-               long add = region_add(resv_map, from, to);
+               add = region_add(h, resv_map, from, to);
+               if (add < 0) {
+                       ret = -ENOMEM;
+                       goto out_acct_memory;
+               }
+

                if (unlikely(chg > add)) {
                        /*
@@ -4678,10 +4790,15 @@ int hugetlb_reserve_pages(struct inode *inode,
                }
        }
        return 0;
+out_acct_memory:
+       hugetlb_acct_memory(h, -gbl_reserve);
+out_put_pages:
+       /* put back original number of pages, chg */
+       (void)hugepage_subpool_put_pages(spool, chg);
 out_err:
        if (!vma || vma->vm_flags & VM_MAYSHARE)
-               /* Don't call region_abort if region_chg failed */
-               if (chg >= 0)
+               /* Don't call region_abort if region_chg or region_add failed */
+               if (chg >= 0 && add >= 0)
                        region_abort(resv_map, from, to);
        if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                kref_put(&resv_map->refs, resv_map_release);
--
2.23.0.187.g17f5b7556c-goog

Reply via email to