On Tue, Jan 28, 2014 at 08:57:44AM -0700, Khalid Aziz wrote:
> Backport from upstream commit 27c73ae759774e63313c1fbfeb17ba076cea64c5

This seems to be a clean cherry-pick for the 3.5 kernel.  I am queuing it
both for the 3.5 and the 3.11 kernels.  Thanks!

Cheers,
--
Luis

> Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database
> caused by THP") can cause dereference of a dangling pointer if
> split_huge_page runs during PageHuge() if there are updates to the
> tail_page->private field.
> 
> Also it is repeating compound_head twice for hugetlbfs and it is running
> compound_head+compound_trans_head for THP when a single one is needed in
> both cases.
> 
> The new code within the PageSlab() check doesn't need to verify that the
> THP page size is never bigger than the smallest hugetlbfs page size, to
> avoid memory corruption.
> 
> A longstanding theoretical race condition was found while fixing the
> above (see the change right after the skip_unlock label, that is
> relevant for the compound_lock path too).
> 
> By re-establishing the _mapcount tail refcounting for all compound
> pages, this also fixes the below problem:
> 
>   echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
> 
>   BUG: Bad page state in process bash  pfn:59a01
>   page:ffffea000139b038 count:0 mapcount:10 mapping:          (null) index:0x0
>   page flags: 0x1c00000000008000(tail)
>   Modules linked in:
>   CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25
>   Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
>   Call Trace:
>     dump_stack+0x55/0x76
>     bad_page+0xd5/0x130
>     free_pages_prepare+0x213/0x280
>     __free_pages+0x36/0x80
>     update_and_free_page+0xc1/0xd0
>     free_pool_huge_page+0xc2/0xe0
>     set_max_huge_pages.part.58+0x14c/0x220
>     nr_hugepages_store_common.isra.60+0xd0/0xf0
>     nr_hugepages_store+0x13/0x20
>     kobj_attr_store+0xf/0x20
>     sysfs_write_file+0x189/0x1e0
>     vfs_write+0xc5/0x1f0
>     SyS_write+0x55/0xb0
>     system_call_fastpath+0x16/0x1b
> 
> Signed-off-by: Khalid Aziz <[email protected]>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> Tested-by: Khalid Aziz <[email protected]>
> Cc: Pravin Shelar <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Ben Hutchings <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Mel Gorman <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> Signed-off-by: Linus Torvalds <[email protected]>
> ---
>  include/linux/hugetlb.h |    6 +++
>  mm/hugetlb.c            |   17 ++++++++
>  mm/swap.c               |  107 
> ++++++++++++++++++++++++++++++++++-------------
>  3 files changed, 101 insertions(+), 29 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 6baa73d..4f42a9c 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -24,6 +24,7 @@ struct hugepage_subpool *hugepage_new_subpool(long 
> nr_blocks);
>  void hugepage_put_subpool(struct hugepage_subpool *spool);
>  
>  int PageHuge(struct page *page);
> +int PageHeadHuge(struct page *page_head);
>  
>  void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
>  int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, 
> loff_t *);
> @@ -85,6 +86,11 @@ static inline int PageHuge(struct page *page)
>       return 0;
>  }
>  
> +static inline int PageHeadHuge(struct page *page_head)
> +{
> +     return 0;
> +}
> +
>  static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
>  {
>  }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index af20b77..7111f2f 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -679,6 +679,23 @@ int PageHuge(struct page *page)
>  }
>  EXPORT_SYMBOL_GPL(PageHuge);
>  
> +/*
> + * PageHeadHuge() only returns true for hugetlbfs head page, but not for
> + * normal or transparent huge pages.
> + */
> +int PageHeadHuge(struct page *page_head)
> +{
> +     compound_page_dtor *dtor;
> +
> +     if (!PageHead(page_head))
> +             return 0;
> +
> +     dtor = get_compound_page_dtor(page_head);
> +
> +     return dtor == free_huge_page;
> +}
> +EXPORT_SYMBOL_GPL(PageHeadHuge);
> +
>  pgoff_t __basepage_index(struct page *page)
>  {
>       struct page *page_head = compound_head(page);
> diff --git a/mm/swap.c b/mm/swap.c
> index f689e9a..a8feea6 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -77,18 +77,6 @@ static void __put_compound_page(struct page *page)
>  
>  static void put_compound_page(struct page *page)
>  {
> -     /*
> -      * hugetlbfs pages cannot be split from under us.  If this is a
> -      * hugetlbfs page, check refcount on head page and release the page if
> -      * the refcount becomes zero.
> -      */
> -     if (PageHuge(page)) {
> -             page = compound_head(page);
> -             if (put_page_testzero(page))
> -                     __put_compound_page(page);
> -             return;
> -     }
> -
>       if (unlikely(PageTail(page))) {
>               /* __split_huge_page_refcount can run under us */
>               struct page *page_head = compound_trans_head(page);
> @@ -96,6 +84,35 @@ static void put_compound_page(struct page *page)
>               if (likely(page != page_head &&
>                          get_page_unless_zero(page_head))) {
>                       unsigned long flags;
> +
> +                      if (PageHeadHuge(page_head)) {
> +                             if (likely(PageTail(page))) {
> +                                     /*
> +                                      * __split_huge_page_refcount
> +                                      * cannot race here.
> +                                      */
> +                                     VM_BUG_ON(!PageHead(page_head));
> +                                     atomic_dec(&page->_mapcount);
> +                                     if (put_page_testzero(page_head))
> +                                             VM_BUG_ON(1);
> +                                     if (put_page_testzero(page_head))
> +                                             __put_compound_page(page_head);
> +                                     return;
> +                             } else {
> +                                     /*
> +                                      * __split_huge_page_refcount
> +                                      * run before us, "page" was a
> +                                      * THP tail. The split
> +                                      * page_head has been freed
> +                                      * and reallocated as slab or
> +                                      * hugetlbfs page of smaller
> +                                      * order (only possible if
> +                                      * reallocated as slab on
> +                                      * x86).
> +                                      */
> +                                     goto skip_lock;
> +                             }
> +                     }
>                       /*
>                        * page_head wasn't a dangling pointer but it
>                        * may not be a head page anymore by the time
> @@ -107,9 +124,29 @@ static void put_compound_page(struct page *page)
>                               /* __split_huge_page_refcount run before us */
>                               compound_unlock_irqrestore(page_head, flags);
>                               VM_BUG_ON(PageHead(page_head));
> -                             if (put_page_testzero(page_head))
> -                                     __put_single_page(page_head);
> -                     out_put_single:
> +skip_lock:
> +                             if (put_page_testzero(page_head)) {
> +                                     /*
> +                                      * The head page may have been
> +                                      * freed and reallocated as a
> +                                      * compound page of smaller
> +                                      * order and then freed again.
> +                                      * All we know is that it
> +                                      * cannot have become: a THP
> +                                      * page, a compound page of
> +                                      * higher order, a tail page.
> +                                      * That is because we still
> +                                      * hold the refcount of the
> +                                      * split THP tail and
> +                                      * page_head was the THP head
> +                                      * before the split.
> +                                      */
> +                                     if (PageHead(page_head))
> +                                             __put_compound_page(page_head);
> +                                     else
> +                                             __put_single_page(page_head);
> +                             }
> +out_put_single:
>                               if (put_page_testzero(page))
>                                       __put_single_page(page);
>                               return;
> @@ -173,21 +210,34 @@ bool __get_page_tail(struct page *page)
>        */
>       unsigned long flags;
>       bool got = false;
> -     struct page *page_head;
> +     struct page *page_head = compound_trans_head(page);
>  
> -     /*
> -      * If this is a hugetlbfs page it cannot be split under us.  Simply
> -      * increment refcount for the head page.
> -      */
> -     if (PageHuge(page)) {
> -             page_head = compound_head(page);
> -             atomic_inc(&page_head->_count);
> -             got = true;
> -             goto out;
> -     }
> -
> -     page_head = compound_trans_head(page);
>       if (likely(page != page_head && get_page_unless_zero(page_head))) {
> +             /* Ref to put_compound_page() comment. */
> +             if (PageHeadHuge(page_head)) {
> +                     if (likely(PageTail(page))) {
> +                             /*
> +                              * This is a hugetlbfs
> +                              * page. __split_huge_page_refcount
> +                              * cannot race here.
> +                              */
> +                             VM_BUG_ON(!PageHead(page_head));
> +                             __get_page_tail_foll(page, false);
> +                             return true;
> +                     } else {
> +                             /*
> +                              * __split_huge_page_refcount run
> +                              * before us, "page" was a THP
> +                              * tail. The split page_head has been
> +                              * freed and reallocated as slab or
> +                              * hugetlbfs page of smaller order
> +                              * (only possible if reallocated as
> +                              * slab on x86).
> +                              */
> +                             put_page(page_head);
> +                             return false;
> +                     }
> +             }
>               /*
>                * page_head wasn't a dangling pointer but it
>                * may not be a head page anymore by the time
> @@ -204,7 +254,6 @@ bool __get_page_tail(struct page *page)
>               if (unlikely(!got))
>                       put_page(page_head);
>       }
> -out:
>       return got;
>  }
>  EXPORT_SYMBOL(__get_page_tail);
> -- 
> 1.7.10.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe stable" in
> the body of a message to [email protected]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to