Bharata B Rao <bhar...@linux.ibm.com> writes:

> We can hit the following BUG_ON during memory unplug:
>
> kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:342!
> Oops: Exception in kernel mode, sig: 5 [#1]
> LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
> NIP [c000000000093308] pmd_fragment_free+0x48/0xc0
> LR [c00000000147bfec] remove_pagetable+0x578/0x60c
> Call Trace:
> 0xc000008050000000 (unreliable)
> remove_pagetable+0x384/0x60c
> radix__remove_section_mapping+0x18/0x2c
> remove_section_mapping+0x1c/0x3c
> arch_remove_memory+0x11c/0x180
> try_remove_memory+0x120/0x1b0
> __remove_memory+0x20/0x40
> dlpar_remove_lmb+0xc0/0x114
> dlpar_memory+0x8b0/0xb20
> handle_dlpar_errorlog+0xc0/0x190
> pseries_hp_work_fn+0x2c/0x60
> process_one_work+0x30c/0x810
> worker_thread+0x98/0x540
> kthread+0x1c4/0x1d0
> ret_from_kernel_thread+0x5c/0x74
>
> This occurs when unplug is attempted for memory that was mapped
> using memblock pages as part of the early kernel page table setup,
> since the PMD and PTE fragment counts were never initialized for
> those PMD and PTE pages.
>
> The fix has 3 parts:
>
> - Re-walk the init_mm page tables from mem_init() and initialize
>   the PMD and PTE fragment count to 1.
> - When freeing PUD, PMD and PTE page table pages, check explicitly
>   if they come from memblock and if so, free them appropriately.
> - When we do early memblock-based allocation of PMD and PUD pages,
>   allocate in PAGE_SIZE granularity so that we are sure the
>   complete page is used as a page table page.
>
> Since we now do PAGE_SIZE allocations for both PUD and PMD tables
> (note that PTE table allocation is already of PAGE_SIZE), we end
> up allocating more memory for the same amount of system RAM. Here
> is a comparison of how much more we need for a 64T and a 2G system
> after this patch:
>
> 1. 64T system
> -------------
> 64T RAM would need 64G for vmemmap with struct page size being 64B.
>
> 128 PUD tables for 64T memory (1G mappings)
> 1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
> With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K
>
> 2. 2G system
> ------------
> 2G RAM would need 2M for vmemmap with struct page size being 64B.
>
> 1 PUD table for 2G memory (1G mapping)
> 1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)
>
> With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
> With new PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K
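Those overhead numbers check out, assuming the radix 64K-page
geometry where a PMD table maps 512 * 2M = 1G and a PUD table maps
512 * 1G = 512G. A quick userspace back-of-envelope (not kernel
code) for the 64T case:

/* Sanity check of the 64T figures quoted above; assumes radix
 * with 64K pages and a 64B struct page. Plain userspace C.
 */
#include <stdio.h>

int main(void)
{
        unsigned long ram = 64UL << 40;                 /* 64T of RAM */
        unsigned long vmemmap = (ram / 65536) * 64;     /* 64B per 64K page = 64G */
        unsigned long ram_puds = ram / (512UL << 30);   /* 1G mappings: 128 PUD tables */
        unsigned long vmemmap_puds = 1;                 /* 64G fits in one PUD table */
        unsigned long vmemmap_pmds = vmemmap >> 30;     /* 2M mappings: 64 PMD tables */
        unsigned long tables = ram_puds + vmemmap_puds + vmemmap_pmds;

        /* Prints: 193 tables: 772K at 4K each, 12352K at 64K each */
        printf("%lu tables: %luK at 4K each, %luK at 64K each\n",
               tables, tables * 4, tables * 64);
        return 0;
}

The 2G case works out the same way: one PUD table for the linear
map, plus one PUD table and one PMD table for the vmemmap, i.e.
(1+1+1) pages.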
How about we just do

void pmd_fragment_free(unsigned long *pmd)
{
        struct page *page = virt_to_page(pmd);

        /*
         * Early pmd pages allocated via memblock
         * allocator need to be freed differently
         */
        if (PageReserved(page))
                return free_reserved_page(page);

        BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
        if (atomic_dec_and_test(&page->pt_frag_refcount)) {
                pgtable_pmd_page_dtor(page);
                __free_page(page);
        }
}

That way we could avoid the fixup_pgtable_fragments completely?
(A matching pte_fragment_free() sketch follows after the quoted
patch below.)

> Signed-off-by: Bharata B Rao <bhar...@linux.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/pgalloc.h | 11 ++-
>  arch/powerpc/include/asm/book3s/64/radix.h   |  1 +
>  arch/powerpc/include/asm/sparsemem.h         |  1 +
>  arch/powerpc/mm/book3s64/pgtable.c           | 31 +++++++-
>  arch/powerpc/mm/book3s64/radix_pgtable.c     | 80 +++++++++++++++++++-
>  arch/powerpc/mm/mem.c                        |  5 ++
>  arch/powerpc/mm/pgtable-frag.c               |  9 ++-
>  7 files changed, 129 insertions(+), 9 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> index 69c5b051734f..56d695f0095c 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
> @@ -109,7 +109,16 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
>
>  static inline void pud_free(struct mm_struct *mm, pud_t *pud)
>  {
> -        kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
> +        struct page *page = virt_to_page(pud);
> +
> +        /*
> +         * Early pud pages allocated via memblock allocator
> +         * can't be directly freed to slab
> +         */
> +        if (PageReserved(page))
> +                free_reserved_page(page);
> +        else
> +                kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
>  }
>
>  static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 0cba794c4fb8..90f05d52f46d 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -297,6 +297,7 @@ static inline unsigned long radix__get_tree_size(void)
>  int radix__create_section_mapping(unsigned long start, unsigned long end,
>                                    int nid, pgprot_t prot);
>  int radix__remove_section_mapping(unsigned long start, unsigned long end);
> +void radix__fixup_pgtable_fragments(void);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h
> index c89b32443cff..d0b22a937a7a 100644
> --- a/arch/powerpc/include/asm/sparsemem.h
> +++ b/arch/powerpc/include/asm/sparsemem.h
> @@ -16,6 +16,7 @@
>  extern int create_section_mapping(unsigned long start, unsigned long end,
>                                    int nid, pgprot_t prot);
>  extern int remove_section_mapping(unsigned long start, unsigned long end);
> +void fixup_pgtable_fragments(void);
>
>  #ifdef CONFIG_PPC_BOOK3S_64
>  extern int resize_hpt_for_hotplug(unsigned long new_mem_size);
> diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
> index c58ad1049909..ee94a28dc6f9 100644
> --- a/arch/powerpc/mm/book3s64/pgtable.c
> +++ b/arch/powerpc/mm/book3s64/pgtable.c
> @@ -184,6 +184,13 @@ int __meminit remove_section_mapping(unsigned long start, unsigned long end)
>
>          return hash__remove_section_mapping(start, end);
>  }
> +
> +void fixup_pgtable_fragments(void)
> +{
> +        if (radix_enabled())
> +                radix__fixup_pgtable_fragments();
> +}
> +
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>
>  void __init mmu_partition_table_init(void)
> @@ -341,13 +348,23 @@ void pmd_fragment_free(unsigned long *pmd)
>
>          BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
>          if (atomic_dec_and_test(&page->pt_frag_refcount)) {
> -                pgtable_pmd_page_dtor(page);
> -                __free_page(page);
> +                /*
> +                 * Early pmd pages allocated via memblock
> +                 * allocator wouldn't have called _ctor
> +                 */
> +                if (PageReserved(page))
> +                        free_reserved_page(page);
> +                else {
> +                        pgtable_pmd_page_dtor(page);
> +                        __free_page(page);
> +                }
>          }
>  }
>
>  static inline void pgtable_free(void *table, int index)
>  {
> +        struct page *page;
> +
>          switch (index) {
>          case PTE_INDEX:
>                  pte_fragment_free(table, 0);
> @@ -356,7 +373,15 @@ static inline void pgtable_free(void *table, int index)
>                  pmd_fragment_free(table);
>                  break;
>          case PUD_INDEX:
> -                kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
> +                page = virt_to_page(table);
> +                /*
> +                 * Early pud pages allocated via memblock
> +                 * allocator need to be freed differently
> +                 */
> +                if (PageReserved(page))
> +                        free_reserved_page(page);
> +                else
> +                        kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
>                  break;
>  #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
>          /* 16M hugepd directory at pud level */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index ffccfe00ca2a..58e42393d5e8 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -37,6 +37,71 @@
>  unsigned int mmu_pid_bits;
>  unsigned int mmu_base_pid;
>
> +static void fixup_pte_fragments(pmd_t *pmd)
> +{
> +        int i;
> +
> +        for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
> +                pte_t *pte;
> +                struct page *page;
> +
> +                if (pmd_none(*pmd))
> +                        continue;
> +                if (pmd_is_leaf(*pmd))
> +                        continue;
> +
> +                pte = pte_offset_kernel(pmd, 0);
> +                page = virt_to_page(pte);
> +                atomic_inc(&page->pt_frag_refcount);
> +        }
> +}
> +
> +static void fixup_pmd_fragments(pud_t *pud)
> +{
> +        int i;
> +
> +        for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
> +                pmd_t *pmd;
> +                struct page *page;
> +
> +                if (pud_none(*pud))
> +                        continue;
> +                if (pud_is_leaf(*pud))
> +                        continue;
> +
> +                pmd = pmd_offset(pud, 0);
> +                page = virt_to_page(pmd);
> +                atomic_inc(&page->pt_frag_refcount);
> +                fixup_pte_fragments(pmd);
> +        }
> +}
> +
> +/*
> + * Walk the init_mm page tables and fixup the PMD and PTE fragment
> + * counts. This allows the PUD, PMD and PTE pages to be freed
> + * back to buddy allocator properly during memory unplug.
> + */
> +void radix__fixup_pgtable_fragments(void)
> +{
> +        int i;
> +        pgd_t *pgd = pgd_offset_k(0UL);
> +
> +        spin_lock(&init_mm.page_table_lock);
> +        for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
> +                p4d_t *p4d = p4d_offset(pgd, 0UL);
> +                pud_t *pud;
> +
> +                if (p4d_none(*p4d))
> +                        continue;
> +                if (p4d_is_leaf(*p4d))
> +                        continue;
> +
> +                pud = pud_offset(p4d, 0);
> +                fixup_pmd_fragments(pud);
> +        }
> +        spin_unlock(&init_mm.page_table_lock);
> +}
> +
>  static __ref void *early_alloc_pgtable(unsigned long size, int nid,
>                                         unsigned long region_start, unsigned long region_end)
>  {
> @@ -58,6 +123,13 @@ static __ref void *early_alloc_pgtable(unsigned long size, int nid,
>          return ptr;
>  }
>
> +/*
> + * When allocating pud or pmd pointers, we allocate a complete page
> + * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
> + * is to ensure that the page obtained from the memblock allocator
> + * can be completely used as page table page and can be freed
> + * correctly when the page table entries are removed.
> + */
>  static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>                            pgprot_t flags,
>                            unsigned int map_page_size,
> @@ -74,8 +146,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>          pgdp = pgd_offset_k(ea);
>          p4dp = p4d_offset(pgdp, ea);
>          if (p4d_none(*p4dp)) {
> -                pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
> -                                           region_start, region_end);
> +                pudp = early_alloc_pgtable(PAGE_SIZE, nid,
> +                                           region_start, region_end);
>                  p4d_populate(&init_mm, p4dp, pudp);
>          }
>          pudp = pud_offset(p4dp, ea);
> @@ -84,8 +156,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
>                  goto set_the_pte;
>          }
>          if (pud_none(*pudp)) {
> -                pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid,
> -                                           region_start, region_end);
> +                pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
> +                                           region_end);
>                  pud_populate(&init_mm, pudp, pmdp);
>          }
>          pmdp = pmd_offset(pudp, ea);
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 5f7fe13211e9..b8ea004c3ebf 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -54,6 +54,10 @@
>
>  #include <mm/mmu_decl.h>
>
> +void __weak fixup_pgtable_fragments(void)
> +{
> +}
> +
>  #ifndef CPU_FTR_COHERENT_ICACHE
>  #define CPU_FTR_COHERENT_ICACHE        0        /* XXX for now */
>  #define CPU_FTR_NOEXECUTE        0
> @@ -301,6 +305,7 @@ void __init mem_init(void)
>
>          memblock_free_all();
>
> +        fixup_pgtable_fragments();
>  #ifdef CONFIG_HIGHMEM
>          {
>                  unsigned long pfn, highmem_mapnr;
> diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
> index ee4bd6d38602..16213c09896a 100644
> --- a/arch/powerpc/mm/pgtable-frag.c
> +++ b/arch/powerpc/mm/pgtable-frag.c
> @@ -114,6 +114,13 @@ void pte_fragment_free(unsigned long *table, int kernel)
>          if (atomic_dec_and_test(&page->pt_frag_refcount)) {
>                  if (!kernel)
>                          pgtable_pte_page_dtor(page);
> -                __free_page(page);
> +                /*
> +                 * Early pte pages allocated via memblock
> +                 * allocator need to be freed differently
> +                 */
> +                if (PageReserved(page))
> +                        free_reserved_page(page);
> +                else
> +                        __free_page(page);
>          }
>  }
> --
> 2.21.3
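If we go that route, pte_fragment_free() could presumably take the
same shape. An untested sketch, with the PageReserved() check
hoisted ahead of the refcount handling like in the pmd version
above:

void pte_fragment_free(unsigned long *table, int kernel)
{
        struct page *page = virt_to_page(table);

        /*
         * Early pte pages allocated via the memblock allocator
         * never had their fragment count initialized; hand them
         * back as reserved pages instead of touching the refcount.
         */
        if (PageReserved(page))
                return free_reserved_page(page);

        BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
        if (atomic_dec_and_test(&page->pt_frag_refcount)) {
                if (!kernel)
                        pgtable_pte_page_dtor(page);
                __free_page(page);
        }
}

That would keep the pte, pmd and pud free paths symmetric without
needing the init_mm re-walk.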