On Tue, Jun 20, 2017 at 07:07:10PM -0400, Zi Yan wrote:
> From: Zi Yan <zi....@cs.rutgers.edu>
> 
> This patch adds thp migration's core code, including conversions
> between a PMD entry and a swap entry, setting PMD migration entry,
> removing PMD migration entry, and waiting on PMD migration entries.
> 
> This patch makes it possible to support thp migration.
> If you fail to allocate a destination page as a thp, you just split
> the source thp as we do now, and then enter the normal page migration.
> If you succeed in allocating a destination thp, you enter thp migration.
> Subsequent patches actually enable thp migration for each caller of
> page migration by allowing its get_new_page() callback to
> allocate thps.
> 
> ChangeLog v1 -> v2:
> - support pte-mapped thp, doubly-mapped thp
> 
> Signed-off-by: Naoya Horiguchi <n-horigu...@ah.jp.nec.com>
> 
> ChangeLog v2 -> v3:
> - use page_vma_mapped_walk()
> - use pmdp_huge_clear_flush() instead of pmdp_huge_get_and_clear() in
>   set_pmd_migration_entry()
> 
> ChangeLog v3 -> v4:
> - factor out the code of removing pte pgtable page in zap_huge_pmd()
> 
> ChangeLog v4 -> v5:
> - remove unnecessary PTE-mapped THP code in remove_migration_pmd()
>   and set_pmd_migration_entry()
> - restructure the code in zap_huge_pmd() to avoid factoring out
>   the pte pgtable page code
> - in zap_huge_pmd(), check that PMD swap entries are migration entries
> - change author information
> 
> ChangeLog v5 -> v7
> - use macro to disable the code when thp migration is not enabled
> 
> Signed-off-by: Zi Yan <zi....@cs.rutgers.edu>
> Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
> ---
>  arch/x86/include/asm/pgtable_64.h |  2 +
>  include/linux/swapops.h           | 69 +++++++++++++++++++++++++++++-
>  mm/huge_memory.c                  | 88 ++++++++++++++++++++++++++++++++++++---
>  mm/migrate.c                      | 32 +++++++++++++-
>  mm/page_vma_mapped.c              | 17 ++++++--
>  mm/pgtable-generic.c              |  3 +-
>  mm/rmap.c                         | 13 ++++++
>  7 files changed, 212 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pgtable_64.h 
> b/arch/x86/include/asm/pgtable_64.h
> index 45b7a4094de0..eac7f8cf4ae0 100644
> --- a/arch/x86/include/asm/pgtable_64.h
> +++ b/arch/x86/include/asm/pgtable_64.h
> @@ -208,7 +208,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
>                                        ((type) << (SWP_TYPE_FIRST_BIT)) \
>                                        | ((offset) << SWP_OFFSET_FIRST_BIT) })
>  #define __pte_to_swp_entry(pte)              ((swp_entry_t) { pte_val((pte)) 
> })
> +#define __pmd_to_swp_entry(pmd)              ((swp_entry_t) { pmd_val((pmd)) 
> })
>  #define __swp_entry_to_pte(x)                ((pte_t) { .pte = (x).val })
> +#define __swp_entry_to_pmd(x)                ((pmd_t) { .pmd = (x).val })
>  
>  extern int kern_addr_valid(unsigned long addr);
>  extern void cleanup_highmap(void);
> diff --git a/include/linux/swapops.h b/include/linux/swapops.h
> index c5ff7b217ee6..ae0c5fc18788 100644
> --- a/include/linux/swapops.h
> +++ b/include/linux/swapops.h
> @@ -103,7 +103,8 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
>  #ifdef CONFIG_MIGRATION
>  static inline swp_entry_t make_migration_entry(struct page *page, int write)
>  {
> -     BUG_ON(!PageLocked(page));
> +     BUG_ON(!PageLocked(compound_head(page)));
> +
>       return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
>                       page_to_pfn(page));
>  }
> @@ -126,7 +127,7 @@ static inline struct page 
> *migration_entry_to_page(swp_entry_t entry)
>        * Any use of migration entries may only occur while the
>        * corresponding page is locked
>        */
> -     BUG_ON(!PageLocked(p));
> +     BUG_ON(!PageLocked(compound_head(p)));
>       return p;
>  }
>  
> @@ -163,6 +164,70 @@ static inline int is_write_migration_entry(swp_entry_t 
> entry)
>  
>  #endif
>  
> +struct page_vma_mapped_walk;
> +
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +extern void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
> +             struct page *page);
> +
> +extern void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
> +             struct page *new);
> +
> +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
> +
> +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> +{
> +     swp_entry_t arch_entry;
> +
> +     arch_entry = __pmd_to_swp_entry(pmd);
> +     return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
> +}
> +
> +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> +{
> +     swp_entry_t arch_entry;
> +
> +     arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
> +     return __swp_entry_to_pmd(arch_entry);
> +}
> +
> +static inline int is_pmd_migration_entry(pmd_t pmd)
> +{
> +     return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
> +}
> +#else
> +static inline void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
> +             struct page *page)
> +{
> +     BUILD_BUG();
> +}
> +
> +static inline void remove_migration_pmd(struct page_vma_mapped_walk *pvmw,
> +             struct page *new)
> +{
> +     BUILD_BUG();
> +}
> +
> +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { 
> }
> +
> +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> +{
> +     BUILD_BUG();
> +     return swp_entry(0, 0);
> +}
> +
> +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> +{
> +     BUILD_BUG();
> +     return (pmd_t){ 0 };
> +}
> +
> +static inline int is_pmd_migration_entry(pmd_t pmd)
> +{
> +     return 0;
> +}
> +#endif
> +
>  #ifdef CONFIG_MEMORY_FAILURE
>  
>  extern atomic_long_t num_poisoned_pages __read_mostly;
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 421631ff3aeb..d9405ba628f6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1641,10 +1641,27 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
> vm_area_struct *vma,
>               spin_unlock(ptl);
>               tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
>       } else {
> -             struct page *page = pmd_page(orig_pmd);
> -             page_remove_rmap(page, true);
> -             VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
> -             VM_BUG_ON_PAGE(!PageHead(page), page);
> +             struct page *page = NULL;
> +             int migration = 0;
> +
> +             if (pmd_present(orig_pmd)) {
> +                     page = pmd_page(orig_pmd);
> +                     page_remove_rmap(page, true);
> +                     VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
> +                     VM_BUG_ON_PAGE(!PageHead(page), page);
> +             } else {
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION

Can we use IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) instead of #ifdef here and below?

> +                     swp_entry_t entry;
> +
> +                     VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
> +                     entry = pmd_to_swp_entry(orig_pmd);
> +                     page = pfn_to_page(swp_offset(entry));
> +                     migration = 1;

I guess a name like 'flush_needed' would be more descriptive.

> +#else
> +                     WARN_ONCE(1, "Non present huge pmd without pmd 
> migration enabled!");
> +#endif
> +             }
> +
>               if (PageAnon(page)) {
>                       zap_deposited_table(tlb->mm, pmd);
>                       add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> @@ -1653,8 +1670,10 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct 
> vm_area_struct *vma,
>                               zap_deposited_table(tlb->mm, pmd);
>                       add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
>               }
> +
>               spin_unlock(ptl);
> -             tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
> +             if (!migration)
> +                     tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
>       }
>       return 1;
>  }
> @@ -2694,3 +2713,62 @@ static int __init split_huge_pages_debugfs(void)
>  }
>  late_initcall(split_huge_pages_debugfs);
>  #endif
> +
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
> +             struct page *page)
> +{
> +     struct vm_area_struct *vma = pvmw->vma;
> +     struct mm_struct *mm = vma->vm_mm;
> +     unsigned long address = pvmw->address;
> +     pmd_t pmdval;
> +     swp_entry_t entry;
> +
> +     if (!(pvmw->pmd && !pvmw->pte))
> +             return;
> +
> +     mmu_notifier_invalidate_range_start(mm, address,
> +                     address + HPAGE_PMD_SIZE);
> +
> +     flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
> +     pmdval = pmdp_huge_clear_flush(vma, address, pvmw->pmd);

We don't hold mmap_sem for write here, right?

I *think* it means we can race with MADV_DONTNEED the same way as
described in ced108037c2a.

I guess the pmdp_invalidate() approach is required.

> +     if (pmd_dirty(pmdval))
> +             set_page_dirty(page);
> +     entry = make_migration_entry(page, pmd_write(pmdval));
> +     pmdval = swp_entry_to_pmd(entry);
> +     set_pmd_at(mm, address, pvmw->pmd, pmdval);
> +     page_remove_rmap(page, true);
> +     put_page(page);
> +
> +     mmu_notifier_invalidate_range_end(mm, address,
> +                     address + HPAGE_PMD_SIZE);
> +}
> +
> +void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page 
> *new)
> +{
> +     struct vm_area_struct *vma = pvmw->vma;
> +     struct mm_struct *mm = vma->vm_mm;
> +     unsigned long address = pvmw->address;
> +     unsigned long mmun_start = address & HPAGE_PMD_MASK;
> +     unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
> +     pmd_t pmde;
> +     swp_entry_t entry;
> +
> +     if (!(pvmw->pmd && !pvmw->pte))
> +             return;
> +
> +     entry = pmd_to_swp_entry(*pvmw->pmd);
> +     get_page(new);
> +     pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
> +     if (is_write_migration_entry(entry))
> +             pmde = maybe_pmd_mkwrite(pmde, vma);
> +
> +     flush_cache_range(vma, mmun_start, mmun_end);
> +     page_add_anon_rmap(new, vma, mmun_start, true);
> +     set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
> +     flush_tlb_range(vma, mmun_start, mmun_end);

Why do we need a flush here? We replace a non-present pmd with a present one.

Also, we are under the ptl here, and IIRC the flush can sleep.

> +     if (vma->vm_flags & VM_LOCKED)
> +             mlock_vma_page(new);
> +     update_mmu_cache_pmd(vma, address, pvmw->pmd);
> +}
> +#endif
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 627671551873..cae5c3b3b491 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -215,6 +215,15 @@ static bool remove_migration_pte(struct page *page, 
> struct vm_area_struct *vma,
>                       new = page - pvmw.page->index +
>                               linear_page_index(vma, pvmw.address);
>  
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +             /* PMD-mapped THP migration entry */
> +             if (!pvmw.pte && pvmw.page) {
> +                     VM_BUG_ON_PAGE(PageHuge(page) || 
> !PageTransCompound(page), page);
> +                     remove_migration_pmd(&pvmw, new);
> +                     continue;
> +             }
> +#endif
> +
>               get_page(new);
>               pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
>               if (pte_swp_soft_dirty(*pvmw.pte))
> @@ -329,6 +338,27 @@ void migration_entry_wait_huge(struct vm_area_struct 
> *vma,
>       __migration_entry_wait(mm, pte, ptl);
>  }
>  
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
> +{
> +     spinlock_t *ptl;
> +     struct page *page;
> +
> +     ptl = pmd_lock(mm, pmd);
> +     if (!is_pmd_migration_entry(*pmd))
> +             goto unlock;
> +     page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
> +     if (!get_page_unless_zero(page))
> +             goto unlock;
> +     spin_unlock(ptl);
> +     wait_on_page_locked(page);
> +     put_page(page);
> +     return;
> +unlock:
> +     spin_unlock(ptl);
> +}
> +#endif
> +
>  #ifdef CONFIG_BLOCK
>  /* Returns true if all buffers are successfully locked */
>  static bool buffer_migrate_lock_buffers(struct buffer_head *head,
> @@ -1087,7 +1117,7 @@ static ICE_noinline int unmap_and_move(new_page_t 
> get_new_page,
>               goto out;
>       }
>  
> -     if (unlikely(PageTransHuge(page))) {
> +     if (unlikely(PageTransHuge(page) && !PageTransHuge(newpage))) {
>               lock_page(page);
>               rc = split_huge_page(page);
>               unlock_page(page);
> diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
> index 8ec6ba230bb9..ff5517e67788 100644
> --- a/mm/page_vma_mapped.c
> +++ b/mm/page_vma_mapped.c
> @@ -138,16 +138,27 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
> *pvmw)
>       if (!pud_present(*pud))
>               return false;
>       pvmw->pmd = pmd_offset(pud, pvmw->address);
> -     if (pmd_trans_huge(*pvmw->pmd)) {
> +     if (pmd_trans_huge(*pvmw->pmd) || is_pmd_migration_entry(*pvmw->pmd)) {
>               pvmw->ptl = pmd_lock(mm, pvmw->pmd);
> -             if (!pmd_present(*pvmw->pmd))
> -                     return not_found(pvmw);
>               if (likely(pmd_trans_huge(*pvmw->pmd))) {
>                       if (pvmw->flags & PVMW_MIGRATION)
>                               return not_found(pvmw);
>                       if (pmd_page(*pvmw->pmd) != page)
>                               return not_found(pvmw);
>                       return true;
> +             } else if (!pmd_present(*pvmw->pmd)) {

Shouldn't we check PVMW_MIGRATION here?

> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +                     if 
> (unlikely(is_migration_entry(pmd_to_swp_entry(*pvmw->pmd)))) {
> +                             swp_entry_t entry = 
> pmd_to_swp_entry(*pvmw->pmd);
> +
> +                             if (migration_entry_to_page(entry) != page)
> +                                     return not_found(pvmw);
> +                             return true;
> +                     }
> +#else
> +                     WARN_ONCE(1, "Non present huge pmd without pmd 
> migration enabled!");
> +#endif
> +                     return not_found(pvmw);
>               } else {
>                       /* THP pmd was split under us: handle on pte level */
>                       spin_unlock(pvmw->ptl);
> diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
> index c99d9512a45b..1175f6a24fdb 100644
> --- a/mm/pgtable-generic.c
> +++ b/mm/pgtable-generic.c
> @@ -124,7 +124,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, 
> unsigned long address,
>  {
>       pmd_t pmd;
>       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -     VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> +     VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
> +                        !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
>       pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
>       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>       return pmd;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 91948fbbb0bb..b28f633cd569 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1302,6 +1302,7 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>       bool ret = true;
>       enum ttu_flags flags = (enum ttu_flags)arg;
>  
> +
>       /* munlock has nothing to gain from examining un-locked vmas */
>       if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
>               return true;
> @@ -1312,6 +1313,18 @@ static bool try_to_unmap_one(struct page *page, struct 
> vm_area_struct *vma,
>       }
>  
>       while (page_vma_mapped_walk(&pvmw)) {
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +             /* PMD-mapped THP migration entry */
> +             if (flags & TTU_MIGRATION) {
> +                     if (!pvmw.pte && page) {
> +                             VM_BUG_ON_PAGE(PageHuge(page) || 
> !PageTransCompound(page),
> +                                             page);
> +                             set_pmd_migration_entry(&pvmw, page);
> +                             continue;
> +                     }
> +             }
> +#endif
> +
>               /*
>                * If the page is mlock()d, we cannot swap it out.
>                * If it's recently referenced (perhaps page_referenced
> -- 
> 2.11.0
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"d...@kvack.org";> em...@kvack.org </a>

-- 
 Kirill A. Shutemov

Reply via email to