On Tue, Jan 28, 2014 at 08:57:44AM -0700, Khalid Aziz wrote:
> Backport from upstream commit 27c73ae759774e63313c1fbfeb17ba076cea64c5
This seems to be a clean cherry-pick for the 3.5 kernel. I am queuing it
both for the 3.5 and the 3.11 kernels. Thanks!
Cheers,
--
Luis
> Commit 7cb2ef56e6a8 ("mm: fix aio performance regression for database
> caused by THP") can cause dereference of a dangling pointer if
> split_huge_page runs during PageHuge() if there are updates to the
> tail_page->private field.
>
> Also it is repeating compound_head twice for hugetlbfs and it is running
> compound_head+compound_trans_head for THP when a single one is needed in
> both cases.
>
> The new code within the PageSlab() check doesn't need to verify that the
> THP page size is never bigger than the smallest hugetlbfs page size, to
> avoid memory corruption.
>
> A longstanding theoretical race condition was found while fixing the
> above (see the change right after the skip_unlock label, that is
> relevant for the compound_lock path too).
>
> By re-establishing the _mapcount tail refcounting for all compound
> pages, this also fixes the below problem:
>
> echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
>
> BUG: Bad page state in process bash pfn:59a01
> page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0
> page flags: 0x1c00000000008000(tail)
> Modules linked in:
> CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ #25
> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
> Call Trace:
> dump_stack+0x55/0x76
> bad_page+0xd5/0x130
> free_pages_prepare+0x213/0x280
> __free_pages+0x36/0x80
> update_and_free_page+0xc1/0xd0
> free_pool_huge_page+0xc2/0xe0
> set_max_huge_pages.part.58+0x14c/0x220
> nr_hugepages_store_common.isra.60+0xd0/0xf0
> nr_hugepages_store+0x13/0x20
> kobj_attr_store+0xf/0x20
> sysfs_write_file+0x189/0x1e0
> vfs_write+0xc5/0x1f0
> SyS_write+0x55/0xb0
> system_call_fastpath+0x16/0x1b
>
> Signed-off-by: Khalid Aziz <[email protected]>
> Signed-off-by: Andrea Arcangeli <[email protected]>
> Tested-by: Khalid Aziz <[email protected]>
> Cc: Pravin Shelar <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Ben Hutchings <[email protected]>
> Cc: Christoph Lameter <[email protected]>
> Cc: Johannes Weiner <[email protected]>
> Cc: Mel Gorman <[email protected]>
> Cc: Rik van Riel <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Minchan Kim <[email protected]>
> Signed-off-by: Andrew Morton <[email protected]>
> Signed-off-by: Linus Torvalds <[email protected]>
> ---
> include/linux/hugetlb.h | 6 +++
> mm/hugetlb.c | 17 ++++++++
> mm/swap.c | 107
> ++++++++++++++++++++++++++++++++++-------------
> 3 files changed, 101 insertions(+), 29 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 6baa73d..4f42a9c 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -24,6 +24,7 @@ struct hugepage_subpool *hugepage_new_subpool(long
> nr_blocks);
> void hugepage_put_subpool(struct hugepage_subpool *spool);
>
> int PageHuge(struct page *page);
> +int PageHeadHuge(struct page *page_head);
>
> void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
> int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *,
> loff_t *);
> @@ -85,6 +86,11 @@ static inline int PageHuge(struct page *page)
> return 0;
> }
>
> +static inline int PageHeadHuge(struct page *page_head)
> +{
> + return 0;
> +}
> +
> static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
> {
> }
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index af20b77..7111f2f 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -679,6 +679,23 @@ int PageHuge(struct page *page)
> }
> EXPORT_SYMBOL_GPL(PageHuge);
>
> +/*
> + * PageHeadHuge() only returns true for hugetlbfs head page, but not for
> + * normal or transparent huge pages.
> + */
> +int PageHeadHuge(struct page *page_head)
> +{
> + compound_page_dtor *dtor;
> +
> + if (!PageHead(page_head))
> + return 0;
> +
> + dtor = get_compound_page_dtor(page_head);
> +
> + return dtor == free_huge_page;
> +}
> +EXPORT_SYMBOL_GPL(PageHeadHuge);
> +
> pgoff_t __basepage_index(struct page *page)
> {
> struct page *page_head = compound_head(page);
> diff --git a/mm/swap.c b/mm/swap.c
> index f689e9a..a8feea6 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -77,18 +77,6 @@ static void __put_compound_page(struct page *page)
>
> static void put_compound_page(struct page *page)
> {
> - /*
> - * hugetlbfs pages cannot be split from under us. If this is a
> - * hugetlbfs page, check refcount on head page and release the page if
> - * the refcount becomes zero.
> - */
> - if (PageHuge(page)) {
> - page = compound_head(page);
> - if (put_page_testzero(page))
> - __put_compound_page(page);
> - return;
> - }
> -
> if (unlikely(PageTail(page))) {
> /* __split_huge_page_refcount can run under us */
> struct page *page_head = compound_trans_head(page);
> @@ -96,6 +84,35 @@ static void put_compound_page(struct page *page)
> if (likely(page != page_head &&
> get_page_unless_zero(page_head))) {
> unsigned long flags;
> +
> + if (PageHeadHuge(page_head)) {
> + if (likely(PageTail(page))) {
> + /*
> + * __split_huge_page_refcount
> + * cannot race here.
> + */
> + VM_BUG_ON(!PageHead(page_head));
> + atomic_dec(&page->_mapcount);
> + if (put_page_testzero(page_head))
> + VM_BUG_ON(1);
> + if (put_page_testzero(page_head))
> + __put_compound_page(page_head);
> + return;
> + } else {
> + /*
> + * __split_huge_page_refcount
> + * run before us, "page" was a
> + * THP tail. The split
> + * page_head has been freed
> + * and reallocated as slab or
> + * hugetlbfs page of smaller
> + * order (only possible if
> + * reallocated as slab on
> + * x86).
> + */
> + goto skip_lock;
> + }
> + }
> /*
> * page_head wasn't a dangling pointer but it
> * may not be a head page anymore by the time
> @@ -107,9 +124,29 @@ static void put_compound_page(struct page *page)
> /* __split_huge_page_refcount run before us */
> compound_unlock_irqrestore(page_head, flags);
> VM_BUG_ON(PageHead(page_head));
> - if (put_page_testzero(page_head))
> - __put_single_page(page_head);
> - out_put_single:
> +skip_lock:
> + if (put_page_testzero(page_head)) {
> + /*
> + * The head page may have been
> + * freed and reallocated as a
> + * compound page of smaller
> + * order and then freed again.
> + * All we know is that it
> + * cannot have become: a THP
> + * page, a compound page of
> + * higher order, a tail page.
> + * That is because we still
> + * hold the refcount of the
> + * split THP tail and
> + * page_head was the THP head
> + * before the split.
> + */
> + if (PageHead(page_head))
> + __put_compound_page(page_head);
> + else
> + __put_single_page(page_head);
> + }
> +out_put_single:
> if (put_page_testzero(page))
> __put_single_page(page);
> return;
> @@ -173,21 +210,34 @@ bool __get_page_tail(struct page *page)
> */
> unsigned long flags;
> bool got = false;
> - struct page *page_head;
> + struct page *page_head = compound_trans_head(page);
>
> - /*
> - * If this is a hugetlbfs page it cannot be split under us. Simply
> - * increment refcount for the head page.
> - */
> - if (PageHuge(page)) {
> - page_head = compound_head(page);
> - atomic_inc(&page_head->_count);
> - got = true;
> - goto out;
> - }
> -
> - page_head = compound_trans_head(page);
> if (likely(page != page_head && get_page_unless_zero(page_head))) {
> + /* Ref to put_compound_page() comment. */
> + if (PageHeadHuge(page_head)) {
> + if (likely(PageTail(page))) {
> + /*
> + * This is a hugetlbfs
> + * page. __split_huge_page_refcount
> + * cannot race here.
> + */
> + VM_BUG_ON(!PageHead(page_head));
> + __get_page_tail_foll(page, false);
> + return true;
> + } else {
> + /*
> + * __split_huge_page_refcount run
> + * before us, "page" was a THP
> + * tail. The split page_head has been
> + * freed and reallocated as slab or
> + * hugetlbfs page of smaller order
> + * (only possible if reallocated as
> + * slab on x86).
> + */
> + put_page(page_head);
> + return false;
> + }
> + }
> /*
> * page_head wasn't a dangling pointer but it
> * may not be a head page anymore by the time
> @@ -204,7 +254,6 @@ bool __get_page_tail(struct page *page)
> if (unlikely(!got))
> put_page(page_head);
> }
> -out:
> return got;
> }
> EXPORT_SYMBOL(__get_page_tail);
> --
> 1.7.10.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe stable" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html