[PATCH 3/3] powerpc/mm: Update the memory limit based on direct mapping restrictions

2024-04-03 Thread Aneesh Kumar K.V (IBM)
The memory limit value specified by the user is further updated so that it
is 16MB aligned, because hash translation mode uses 16MB as the direct
mapping page size. Make sure we update the global variable 'memory_limit'
with the 16MB-aligned value so that all kernel components see the new
aligned value of the memory limit.

Signed-off-by: Aneesh Kumar K.V (IBM) 
---
 arch/powerpc/kernel/prom.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 7451bedad1f4..b8f764453eaa 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -779,7 +779,6 @@ static inline void save_fscr_to_task(void) {}
 
 void __init early_init_devtree(void *params)
 {
-   phys_addr_t limit;
 
DBG(" -> early_init_devtree(%px)\n", params);
 
@@ -850,8 +849,8 @@ void __init early_init_devtree(void *params)
memory_limit = 0;
 
	/* Align down to 16 MB which is large page size with hash page translation */
-   limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M);
-   memblock_enforce_memory_limit(limit);
+   memory_limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M);
+   memblock_enforce_memory_limit(memory_limit);
 
 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES)
if (!early_radix_enabled())
-- 
2.44.0



[PATCH 2/3] powerpc/fadump: Don't update the user-specified memory limit

2024-04-03 Thread Aneesh Kumar K.V (IBM)
If the user specifies a memory limit, the kernel should honor it so that
all allocations and reservations are made within the specified limit.
fadump was breaking that rule. Remove the code that updates the memory
limit, so that fadump reservations are done within the specified limit.

Cc: Mahesh Salgaonkar  
Signed-off-by: Aneesh Kumar K.V (IBM) 
---
 arch/powerpc/kernel/fadump.c | 16 
 1 file changed, 16 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d14eda1e8589..4e768d93c6d4 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -573,22 +573,6 @@ int __init fadump_reserve_mem(void)
}
}
 
-   /*
-* Calculate the memory boundary.
-* If memory_limit is less than actual memory boundary then reserve
-* the memory for fadump beyond the memory_limit and adjust the
-* memory_limit accordingly, so that the running kernel can run with
-* specified memory_limit.
-*/
-   if (memory_limit && memory_limit < memblock_end_of_DRAM()) {
-   size = get_fadump_area_size();
-   if ((memory_limit + size) < memblock_end_of_DRAM())
-   memory_limit += size;
-   else
-   memory_limit = memblock_end_of_DRAM();
-   printk(KERN_INFO "Adjusted memory_limit for firmware-assisted"
-   " dump, now %#016llx\n", memory_limit);
-   }
if (memory_limit)
mem_boundary = memory_limit;
else
-- 
2.44.0



[PATCH 1/3] powerpc/mm: Align memory_limit value specified using mem= kernel parameter

2024-04-03 Thread Aneesh Kumar K.V (IBM)
The value specified for the memory limit is used to set a restriction on
memory usage. It is important to ensure that this restriction is within
the kernel linear map address space range. The hash page table
translation uses a 16MB page size to map the kernel linear map address
space. The htab_bolt_mapping() function aligns down the size of the range
while mapping the kernel linear address space. Since the memblock limit is
enforced very early during boot, before we can detect the type of memory
translation (radix vs. hash), we align the memory limit value specified
as a kernel parameter to 16MB. This alignment value works for both
hash and radix translations.

Signed-off-by: Aneesh Kumar K.V (IBM) 
---
 arch/powerpc/kernel/prom.c  | 7 +--
 arch/powerpc/kernel/prom_init.c | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index cd8d8883de90..7451bedad1f4 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -846,8 +846,11 @@ void __init early_init_devtree(void *params)
reserve_crashkernel();
early_reserve_mem();
 
-   /* Ensure that total memory size is page-aligned. */
-   limit = ALIGN(memory_limit ?: memblock_phys_mem_size(), PAGE_SIZE);
+   if (memory_limit > memblock_phys_mem_size())
+   memory_limit = 0;
+
+   /* Align down to 16 MB which is large page size with hash page translation */
+   limit = ALIGN_DOWN(memory_limit ?: memblock_phys_mem_size(), SZ_16M);
memblock_enforce_memory_limit(limit);
 
 #if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_PPC_4K_PAGES)
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 0ef358285337..fbb68fc28ed3 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -817,8 +817,8 @@ static void __init early_cmdline_parse(void)
opt += 4;
prom_memory_limit = prom_memparse(opt, (const char **));
 #ifdef CONFIG_PPC64
-   /* Align to 16 MB == size of ppc64 large page */
-   prom_memory_limit = ALIGN(prom_memory_limit, 0x100);
+   /* Align down to 16 MB which is large page size with hash page translation */
+   prom_memory_limit = ALIGN_DOWN(prom_memory_limit, SZ_16M);
 #endif
}
 

base-commit: 3e92c1e6cd876754b64d1998ec0a01800ed954a6
-- 
2.44.0



Re: [PATCH] powerpc: align memory_limit to 16MB in early_parse_mem

2024-03-03 Thread Aneesh Kumar K.V
On 3/2/24 4:53 AM, Michael Ellerman wrote:
> Hi Joel,
> 
> Joel Savitz  writes:
>> On 64-bit powerpc, usage of a non-16MB-aligned value for the mem= kernel
>> cmdline parameter results in a system hang at boot.
> 
> Can you give us any more details on that? It might be a bug we can fix.
> 
>> For example, using 'mem=4198400K' will always reproduce this issue.
>>
>> This patch fixes the problem by aligning any argument to mem= to 16MB
>> corresponding with the large page size on powerpc.
> 
> The large page size depends on the MMU, with Radix it's 2MB or 1GB. So
> depending on what's happening 16MB may not be enough.
> 
> What system are you testing on?
> 

htab_bolt_mapping() should have aligned things down to a lower value that is
16MB aligned.

/* Carefully map only the possible range */
vaddr = ALIGN(vstart, step);
paddr = ALIGN(pstart, step);
vend  = ALIGN_DOWN(vend, step);



-aneesh



Re: [PATCH] mm/debug_vm_pgtable: Fix BUG_ON with pud advanced test

2024-02-19 Thread Aneesh Kumar K.V
On 2/20/24 8:16 AM, Andrew Morton wrote:
> On Mon, 29 Jan 2024 13:43:39 +0530 "Aneesh Kumar K.V" 
>  wrote:
> 
>>> return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
>>> }
>>> #endif
>>>
>>> #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
>>> static inline int pud_devmap(pud_t pud)
>>> {
>>> return !!(pud_val(pud) & _PAGE_DEVMAP);
>>> }
>>> #else
>>> static inline int pud_devmap(pud_t pud)
>>> {
>>> return 0;
>>> }
>>> #endif
>>>
>>> We might need some more clarity on this regarding x86 platform's pud huge
>>> page implementation.
>>>
>>
>> static vm_fault_t create_huge_pud(struct vm_fault *vmf)
>> {
>> #if defined(CONFIG_TRANSPARENT_HUGEPAGE) &&  \
>>  defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
>>  struct vm_area_struct *vma = vmf->vma;
>>  /* No support for anonymous transparent PUD pages yet */
>>  if (vma_is_anonymous(vma))
>>  return VM_FAULT_FALLBACK;
>>  if (vma->vm_ops->huge_fault)
>>  return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
>> #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
>>  return VM_FAULT_FALLBACK;
>> }
> 
> cryptic reply, unreplied to.
> 
> What's the thinking here?  Should we proceed with the patch as-is, or
> are changes needed?
> 

Sorry for the confusion. What I wanted to convey with the code was to reiterate
that no architecture currently does anonymous PUD hugepages. So restricting
the debug_vm_pgtable PUD hugepage tests to devmap pte entries should be OK w.r.t.
these tests.

-aneesh


Re: [PATCH] mm/debug_vm_pgtable: Fix BUG_ON with pud advanced test

2024-01-29 Thread Aneesh Kumar K.V
On 1/29/24 12:23 PM, Anshuman Khandual wrote:
> 
> 
> On 1/29/24 11:56, Aneesh Kumar K.V wrote:
>> On 1/29/24 11:52 AM, Anshuman Khandual wrote:
>>>
>>>
>>> On 1/29/24 11:30, Aneesh Kumar K.V (IBM) wrote:
>>>> Architectures like powerpc add debug checks to ensure we find only devmap
>>>> PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM.
>>>> This patch marks the ptes used for PUD advanced test devmap pte entries
>>>> so that we don't hit on debug checks on architecture like ppc64 as
>>>> below.
>>>>
>>>> WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 
>>>> radix__pud_hugepage_update+0x38/0x138
>>>> 
>>>> NIP [c00a7004] radix__pud_hugepage_update+0x38/0x138
>>>> LR [c00a77a8] radix__pudp_huge_get_and_clear+0x28/0x60
>>>> Call Trace:
>>>> [c4a2f950] [c4a2f9a0] 0xc4a2f9a0 (unreliable)
>>>> [c4a2f980] [000d34c1] 0xd34c1
>>>> [c4a2f9a0] [c206ba98] pud_advanced_tests+0x118/0x334
>>>> [c4a2fa40] [c206db34] debug_vm_pgtable+0xcbc/0x1c48
>>>> [c4a2fc10] [c000fd28] do_one_initcall+0x60/0x388
>>>>
>>>> Also
>>>>
>>>>  kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202!
>>>>  
>>>>
>>>>  NIP [c0096510] pudp_huge_get_and_clear_full+0x98/0x174
>>>>  LR [c206bb34] pud_advanced_tests+0x1b4/0x334
>>>>  Call Trace:
>>>>  [c4a2f950] [000d34c10000] 0xd34c100000000 (unreliable)
>>>>  [c4a2f9a0] [c206bb34] pud_advanced_tests+0x1b4/0x334
>>>>  [c4a2fa40] [c206db34] debug_vm_pgtable+0xcbc/0x1c48
>>>>  [c4a2fc10] [c000fd28] do_one_initcall+0x60/0x388
>>>>
>>>> Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud 
>>>> hugepage")
>>>> Signed-off-by: Aneesh Kumar K.V (IBM) 
>>>> ---
>>>>  mm/debug_vm_pgtable.c | 8 
>>>>  1 file changed, 8 insertions(+)
>>>>
>>>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>>>> index 5662e29fe253..65c19025da3d 100644
>>>> --- a/mm/debug_vm_pgtable.c
>>>> +++ b/mm/debug_vm_pgtable.c
>>>> @@ -362,6 +362,12 @@ static void __init pud_advanced_tests(struct 
>>>> pgtable_debug_args *args)
>>>>vaddr &= HPAGE_PUD_MASK;
>>>>  
>>>>pud = pfn_pud(args->pud_pfn, args->page_prot);
>>>> +  /*
>>>> +   * Some architectures have debug checks to make sure
>>>> +   * huge pud mapping are only found with devmap entries
>>>> +   * For now test with only devmap entries.
>>>> +   */
>>> Do you see this behaviour to be changed in powerpc anytime soon ? Otherwise
>>> these pud_mkdevmap() based work arounds, might be required to stick around
>>> for longer just to prevent powerpc specific triggers. Given PUD transparent
>>> huge pages i.e HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are just supported on x86
>>> and powerpc platforms, could not this problem be solved in a more uniform
>>> manner.
>>>
>>
>>
>> IIUC pud level transparent hugepages are only supported with devmap entries 
>> even
>> on x86. We don't do anonymous pud hugepage.
> 
> There are some 'pud_trans_huge(orig_pud) || pud_devmap(orig_pud)' checks in
> core paths i.e in mm/memory.c which might suggest pud_trans_huge() to exist
> without also being a devmap. I might be missing something here, but on x86
> platform following helpers suggest pud_trans_huge() to exist without being
> a devmap as well.
> 
> #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> static inline int pud_trans_huge(pud_t pud)
> {
> return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
> }
> #endif
> 
> #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
> static inline int pud_devmap(pud_t pud)
> {
> return !!(pud_val(pud) & _PAGE_DEVMAP);
> }
> #else
> static inline int pud_devmap(pud_t pud)
> {
> return 0;
> }
> #endif
> 
> We might need some more clarity on this regarding x86 platform's pud huge
> page implementation.
> 

static vm_fault_t create_huge_pud(struct vm_fault *vmf)
{
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
	defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
	struct vm_area_struct *vma = vmf->vma;
	/* No support for anonymous transparent PUD pages yet */
	if (vma_is_anonymous(vma))
		return VM_FAULT_FALLBACK;
	if (vma->vm_ops->huge_fault)
		return vma->vm_ops->huge_fault(vmf, PUD_ORDER);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
	return VM_FAULT_FALLBACK;
}



-aneesh


Re: [PATCH] mm/debug_vm_pgtable: Fix BUG_ON with pud advanced test

2024-01-28 Thread Aneesh Kumar K.V
On 1/29/24 11:52 AM, Anshuman Khandual wrote:
> 
> 
> On 1/29/24 11:30, Aneesh Kumar K.V (IBM) wrote:
>> Architectures like powerpc add debug checks to ensure we find only devmap
>> PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM.
>> This patch marks the ptes used for PUD advanced test devmap pte entries
>> so that we don't hit on debug checks on architecture like ppc64 as
>> below.
>>
>> WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 
>> radix__pud_hugepage_update+0x38/0x138
>> 
>> NIP [c00a7004] radix__pud_hugepage_update+0x38/0x138
>> LR [c00a77a8] radix__pudp_huge_get_and_clear+0x28/0x60
>> Call Trace:
>> [c4a2f950] [c4a2f9a0] 0xc4a2f9a0 (unreliable)
>> [c4a2f980] [000d34c1] 0xd34c1
>> [c4a2f9a0] [c206ba98] pud_advanced_tests+0x118/0x334
>> [c4a2fa40] [c206db34] debug_vm_pgtable+0xcbc/0x1c48
>> [c4a2fc10] [c000fd28] do_one_initcall+0x60/0x388
>>
>> Also
>>
>>  kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202!
>>  
>>
>>  NIP [c0096510] pudp_huge_get_and_clear_full+0x98/0x174
>>  LR [c206bb34] pud_advanced_tests+0x1b4/0x334
>>  Call Trace:
>>  [c4a2f950] [000d34c1] 0xd34c1 (unreliable)
>>  [c4a2f9a0] [c206bb34] pud_advanced_tests+0x1b4/0x334
>>  [c4a2fa40] [c206db34] debug_vm_pgtable+0xcbc/0x1c48
>>  [c4a2fc10] [c000fd28] do_one_initcall+0x60/0x388
>>
>> Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage")
>> Signed-off-by: Aneesh Kumar K.V (IBM) 
>> ---
>>  mm/debug_vm_pgtable.c | 8 
>>  1 file changed, 8 insertions(+)
>>
>> diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
>> index 5662e29fe253..65c19025da3d 100644
>> --- a/mm/debug_vm_pgtable.c
>> +++ b/mm/debug_vm_pgtable.c
>> @@ -362,6 +362,12 @@ static void __init pud_advanced_tests(struct 
>> pgtable_debug_args *args)
>>  vaddr &= HPAGE_PUD_MASK;
>>  
>>  pud = pfn_pud(args->pud_pfn, args->page_prot);
>> +/*
>> + * Some architectures have debug checks to make sure
>> + * huge pud mapping are only found with devmap entries
>> + * For now test with only devmap entries.
>> + */
> Do you see this behaviour to be changed in powerpc anytime soon ? Otherwise
> these pud_mkdevmap() based work arounds, might be required to stick around
> for longer just to prevent powerpc specific triggers. Given PUD transparent
> huge pages i.e HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD are just supported on x86
> and powerpc platforms, could not this problem be solved in a more uniform
> manner.
> 


IIUC, PUD-level transparent hugepages are only supported with devmap entries,
even on x86. We don't do anonymous PUD hugepages.

>> +pud = pud_mkdevmap(pud);
>>  set_pud_at(args->mm, vaddr, args->pudp, pud);
>>  flush_dcache_page(page);
>>  pudp_set_wrprotect(args->mm, vaddr, args->pudp);
>> @@ -374,6 +380,7 @@ static void __init pud_advanced_tests(struct 
>> pgtable_debug_args *args)
>>  WARN_ON(!pud_none(pud));
>>  #endif /* __PAGETABLE_PMD_FOLDED */
>>  pud = pfn_pud(args->pud_pfn, args->page_prot);
>> +pud = pud_mkdevmap(pud);
>>  pud = pud_wrprotect(pud);
>>  pud = pud_mkclean(pud);
>>  set_pud_at(args->mm, vaddr, args->pudp, pud);
>> @@ -391,6 +398,7 @@ static void __init pud_advanced_tests(struct 
>> pgtable_debug_args *args)
>>  #endif /* __PAGETABLE_PMD_FOLDED */
>>  
>>  pud = pfn_pud(args->pud_pfn, args->page_prot);
>> +pud = pud_mkdevmap(pud);
>>  pud = pud_mkyoung(pud);
>>  set_pud_at(args->mm, vaddr, args->pudp, pud);
>>  flush_dcache_page(page);


-aneesh


[PATCH] mm/debug_vm_pgtable: Fix BUG_ON with pud advanced test

2024-01-28 Thread Aneesh Kumar K.V (IBM)
Architectures like powerpc add debug checks to ensure we find only devmap
PUD pte entries. These debug checks are only done with CONFIG_DEBUG_VM.
This patch marks the ptes used for the PUD advanced tests as devmap pte
entries so that we don't hit the debug checks on architectures like ppc64,
as shown below.

WARNING: CPU: 2 PID: 1 at arch/powerpc/mm/book3s64/radix_pgtable.c:1382 
radix__pud_hugepage_update+0x38/0x138

NIP [c00a7004] radix__pud_hugepage_update+0x38/0x138
LR [c00a77a8] radix__pudp_huge_get_and_clear+0x28/0x60
Call Trace:
[c4a2f950] [c4a2f9a0] 0xc4a2f9a0 (unreliable)
[c4a2f980] [000d34c1] 0xd34c1
[c4a2f9a0] [c206ba98] pud_advanced_tests+0x118/0x334
[c4a2fa40] [c206db34] debug_vm_pgtable+0xcbc/0x1c48
[c4a2fc10] [c000fd28] do_one_initcall+0x60/0x388

Also

 kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:202!
 

 NIP [c0096510] pudp_huge_get_and_clear_full+0x98/0x174
 LR [c206bb34] pud_advanced_tests+0x1b4/0x334
 Call Trace:
 [c4a2f950] [000d34c1] 0xd34c1 (unreliable)
 [c4a2f9a0] [c206bb34] pud_advanced_tests+0x1b4/0x334
 [c4a2fa40] [c206db34] debug_vm_pgtable+0xcbc/0x1c48
 [c4a2fc10] [c000fd28] do_one_initcall+0x60/0x388

Fixes: 27af67f35631 ("powerpc/book3s64/mm: enable transparent pud hugepage")
Signed-off-by: Aneesh Kumar K.V (IBM) 
---
 mm/debug_vm_pgtable.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 5662e29fe253..65c19025da3d 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -362,6 +362,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
vaddr &= HPAGE_PUD_MASK;
 
pud = pfn_pud(args->pud_pfn, args->page_prot);
+   /*
+* Some architectures have debug checks to make sure
+* huge pud mappings are only found with devmap entries.
+* For now test with only devmap entries.
+*/
+   pud = pud_mkdevmap(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
@@ -374,6 +380,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
+   pud = pud_mkdevmap(pud);
pud = pud_wrprotect(pud);
pud = pud_mkclean(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
@@ -391,6 +398,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
pud = pfn_pud(args->pud_pfn, args->page_prot);
+   pud = pud_mkdevmap(pud);
pud = pud_mkyoung(pud);
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
-- 
2.43.0



Re: [PATCH] powerpc/mm/hash: Code cleanup

2024-01-25 Thread Aneesh Kumar K.V
On 1/25/24 3:16 PM, Kunwu Chan wrote:
> This part was commented in about 17 years before.
> If there are no plans to enable this part code in the future,
> we can remove this dead code.
> 
> Signed-off-by: Kunwu Chan 
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 22 ---
>  1 file changed, 22 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index 1c4eebbc69c9..d39ec7134a78 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -731,26 +731,6 @@ struct hash_mm_context {
>  #endif /* CONFIG_PPC_SUBPAGE_PROT */
>  };
>  
> -#if 0
> -/*
> - * The code below is equivalent to this function for arguments
> - * < 2^VSID_BITS, which is all this should ever be called
> - * with.  However gcc is not clever enough to compute the
> - * modulus (2^n-1) without a second multiply.
> - */
> -#define vsid_scramble(protovsid, size) \
> - protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size))
> -
> -/* simplified form avoiding mod operation */
> -#define vsid_scramble(protovsid, size) \
> - ({   \
> - unsigned long x; \
> - x = (protovsid) * VSID_MULTIPLIER_##size;\
> - x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \
> - (x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \
> - })
> -
> -#else /* 1 */
>  static inline unsigned long vsid_scramble(unsigned long protovsid,
> unsigned long vsid_multiplier, int vsid_bits)
>  {
> @@ -764,8 +744,6 @@ static inline unsigned long vsid_scramble(unsigned long 
> protovsid,
>   return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus;
>  }
>  
> -#endif /* 1 */
> -
>  /* Returns the segment size indicator for a user address */
>  static inline int user_segment_size(unsigned long addr)
>  {

That #if 0 block was kept intentionally, to make it easier to follow the actual compiled code.

-aneesh



[GIT PULL] Please pull powerpc/linux.git powerpc-6.8-2 tag

2024-01-20 Thread Aneesh Kumar K.V


-BEGIN PGP SIGNED MESSAGE-
Hash: SHA512

Hi Linus,

Please pull powerpc fixes for 6.8:

The following changes since commit d2441d3e8c0c076d0a2e705fa235c76869a85140:

  MAINTAINERS: powerpc: Add Aneesh & Naveen (2023-12-13 22:35:57 +1100)

are available in the git repository at:

  https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
tags/powerpc-6.8-2

for you to fetch changes up to 18f14afe281648e31ed35c9ad2fcb724c4838ad9:

  powerpc/64s: Increase default stack size to 32KB (2024-01-19 00:10:14 +0530)

- --
powerpc fixes for 6.8 #2

 - 18f14afe2816 powerpc/64s: Increase default stack size to 32KB BY: Michael Ellerman

Thanks to:
Michael Ellerman

- --
Michael Ellerman (1):
  powerpc/64s: Increase default stack size to 32KB


 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)
-BEGIN PGP SIGNATURE-

iHUEARYKAB0WIQTYs9CDOrDQRwKRmtrJvCLnGrjHVgUCZayyTgAKCRDJvCLnGrjH
VlgzAQDkYdg1/DjYNR4Ie/i4MdlYj2cA4s4kTR1x25QHHb6t7AD/URA8U20cMSla
etAHTJ3/Y/B/C/nL4GsC2uC5nYZ+bgo=
=WIHK
-END PGP SIGNATURE-


Re: [PATCH 09/12] KVM: PPC: Book3S HV nestedv2: Do not call H_COPY_TOFROM_GUEST

2023-12-10 Thread Aneesh Kumar K.V
On 12/11/23 9:26 AM, Vaibhav Jain wrote:
> Hi Aneesh,
> 
> Thanks for looking into this patch. My responses inline:
> 
> "Aneesh Kumar K.V (IBM)"  writes:
> 
> 
>> May be we should use 
>> firmware_has_feature(FW_FEATURE_H_COPY_TOFROM_GUEST))?
>> 
>> the nestedv2 can end up using the above hcall if it is supported by
>> the hypervisor right? In its absence we will have to translate the
>> guest ea using xlate and then use kvm_guest_read to read location
>> using the guest real address right? That xlate will also involves
>> multiple kvm_guest_read.
>> 
>> 
> Yes, Agreed and thats a nice suggestion. However ATM the hypervisor 
> supporting Nestedv2 doesnt have support for this hcall. In future
> once we have support for this hcall for nestedv2 from the hypervisor
> we can replace this branch with a firmware_has_feature() test.
> 

What I am suggesting is that we convert that conditional to a
firmware_has_feature() check, so that later, when the hypervisor supports
this hcall, all older kernels can make use of the copy_tofrom_guest path
without any code change.
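
A rough sketch of that shape (untested; it assumes FW_FEATURE_H_COPY_TOFROM_GUEST
is the feature bit the hypervisor would advertise for this hcall):

	/*
	 * Sketch only: gate on the hcall feature rather than on
	 * kvmhv_is_nestedv2(), so nestedv2 guests pick up
	 * H_COPY_TOFROM_GUEST automatically once the hypervisor
	 * starts advertising it.
	 */
	if (kvmhv_on_pseries() &&
	    !firmware_has_feature(FW_FEATURE_H_COPY_TOFROM_GUEST))
		return H_UNSUPPORTED;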

>>> Signed-off-by: Jordan Niethe 
>>> ---
>>>  arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++
>>>  1 file changed, 3 insertions(+)
>>> 
>>> diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
>>> index 916af6c153a5..4a1abb9f7c05 100644
>>> --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
>>> +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
>>> @@ -40,6 +40,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
>>>  unsigned long quadrant, ret = n;
>>>  bool is_load = !!to;
>>> 
>>> +   if (kvmhv_is_nestedv2())
>>> +  return H_UNSUPPORTED;
>>> +
>>>  /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
>>>  if (kvmhv_on_pseries())
>>>  return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
>>> -- 
>>> 2.42.0
> 



Re: [PATCH] powerpc: Restrict ARCH_HIBERNATION_POSSIBLE to supported configurations

2023-11-15 Thread Aneesh Kumar K.V
Vishal Chourasia  writes:

> This patch modifies the ARCH_HIBERNATION_POSSIBLE option to ensure that it
> correctly depends on these PowerPC configurations being enabled. As a result,
> it prevents the HOTPLUG_CPU from being selected when the required dependencies
> are not satisfied.
>
> This change aligns the dependency tree with the expected hardware support for
> CPU hot-plugging under PowerPC architectures, ensuring that the kernel
> configuration steps do not lead to inconsistent states.
>
> Signed-off-by: Vishal Chourasia 
> ---
> During the configuration process with 'make randconfig' followed by
> 'make olddefconfig', we observed a warning indicating an unmet direct
> dependency for the HOTPLUG_CPU option. The dependency in question relates to
> various PowerPC configurations (PPC_PSERIES, PPC_PMAC, PPC_POWERNV,
> FSL_SOC_BOOKE) which were not enabled, yet the HOTPLUG_CPU was being
> erroneously selected due to an implicit assumption by the PM_SLEEP_SMP option.
> This misalignment in dependencies could potentially lead to inconsistent 
> kernel
> configuration states, especially when considering the necessary hardware
> support for CPU hot-plugging on PowerPC platforms. The patch aims to correct
> this by ensuring that ARCH_HIBERNATION_POSSIBLE is contingent upon the
> appropriate PowerPC configurations being active.
>
> steps to reproduce (before applying the patch):
>
> Run 'make pseries_le_defconfig'
> Run 'make menuconfig'
> Enable hibernation [ Kernel options -> Hibernation (aka 'suspend to disk') ] 
> Disable [ Platform support -> IBM PowerNV (Non-Virtualized) platform support ]
> Disable [ Platform support -> IBM pSeries & new (POWER5-based) iSeries ]
> Enable SMP [ Processor support -> Symmetric multi-processing support ]
> Save the config
> Run 'make olddefconfig'
>
>  arch/powerpc/Kconfig | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 6f105ee4f3cf..bf99ff9869f6 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -380,8 +380,9 @@ config DEFAULT_UIMAGE
> Used to allow a board to specify it wants a uImage built by default
>  
>  config ARCH_HIBERNATION_POSSIBLE
> - bool
> - default y
> + def_bool y
> + depends on PPC_PSERIES || \
> + PPC_PMAC || PPC_POWERNV || FSL_SOC_BOOKE
>  
>  config ARCH_SUSPEND_POSSIBLE
>   def_bool y
>

I am wondering whether it should be switched to using select from
config PPC? 

-aneesh


Re: [PATCH v4 1/5] powerpc/smp: Enable Asym packing for cores on shared processor

2023-11-14 Thread Aneesh Kumar K.V
Srikar Dronamraju  writes:

> If there are shared processor LPARs, underlying Hypervisor can have more
> virtual cores to handle than actual physical cores.
>
> Starting with Power 9, a big core (aka SMT8 core) has 2 nearly
> independent thread groups. On a shared processors LPARs, it helps to
> pack threads to lesser number of cores so that the overall system
> performance and utilization improves. PowerVM schedules at a big core
> level. Hence packing to fewer cores helps.
>
> For example: Lets says there are two 8-core Shared LPARs that are
> actually sharing a 8 Core shared physical pool, each running 8 threads
> each. Then Consolidating 8 threads to 4 cores on each LPAR would help
> them to perform better. This is because each of the LPAR will get
> 100% time to run applications and there will no switching required by
> the Hypervisor.
>
> To achieve this, enable SD_ASYM_PACKING flag at CACHE, MC and DIE level
> when the system is running in shared processor mode and has big cores.
>
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog:
> v3 -> v4:
> - Dont use splpar_asym_pack with SMT
> - Conflict resolution due to rebase
>   (DIE changed to PKG)
> v2 -> v3:
> - Handle comments from Michael Ellerman.
> - Rework using existing cpu_has_features static key
> v1->v2: Using Jump label instead of a variable.
>
>  arch/powerpc/kernel/smp.c | 37 +
>  1 file changed, 29 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index ab691c89d787..69a3262024f1 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -993,16 +993,20 @@ static bool shared_caches;
>  /* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
> - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> + if (!cpu_has_feature(CPU_FTR_ASYM_SMT))
> + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
>  
> - if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
> - printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
> - flags |= SD_ASYM_PACKING;
> - }
> - return flags;
> + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING;
>  }
>  #endif
>

The only relevant change there is dropping the printk_once(). The rest of the
changes are not needed?

-aneesh


Re: [PATCH v4 0/5] powerpc/smp: Topology and shared processor optimizations

2023-11-14 Thread Aneesh Kumar K.V
Srikar Dronamraju  writes:

> PowerVM systems configured in shared processors mode have some unique
> challenges. Some device-tree properties will be missing on a shared
> processor. Hence some sched domains may not make sense for shared processor
> systems.
>
> Most shared processor systems are over-provisioned. Underlying PowerVM
> Hypervisor would schedule at a Big Core granularity. The most recent power
> processors support two almost independent cores. In a lightly loaded
> condition, it helps the overall system performance if we pack to lesser
> number of Big Cores.
>

Is this good to do if the systems are not over-provisioned? What will be
the performance impact in that case with and without the change?

-aneesh


Re: [PATCH v4 1/5] powerpc/smp: Enable Asym packing for cores on shared processor

2023-11-14 Thread Aneesh Kumar K.V
Srikar Dronamraju  writes:

> If there are shared processor LPARs, underlying Hypervisor can have more
> virtual cores to handle than actual physical cores.
>
> Starting with Power 9, a big core (aka SMT8 core) has 2 nearly
> independent thread groups. On a shared processors LPARs, it helps to
> pack threads to lesser number of cores so that the overall system
> performance and utilization improves. PowerVM schedules at a big core
> level. Hence packing to fewer cores helps.
>
> For example: Lets says there are two 8-core Shared LPARs that are
> actually sharing a 8 Core shared physical pool, each running 8 threads
> each. Then Consolidating 8 threads to 4 cores on each LPAR would help
> them to perform better. This is because each of the LPAR will get
> 100% time to run applications and there will no switching required by
> the Hypervisor.
>

Will this patch consolidate things to the first 8 threads or just the one
big core? /me continues to look at the other patches and wonders whether
4/5 should come before this one.


>
> To achieve this, enable SD_ASYM_PACKING flag at CACHE, MC and DIE level
> when the system is running in shared processor mode and has big cores.
>
> Signed-off-by: Srikar Dronamraju 


-aneesh


Re: [PATCH v5 1/3] powerpc: make fadump resilient with memory add/remove events

2023-11-14 Thread Aneesh Kumar K.V
Sourabh Jain  writes:



> diff --git a/arch/powerpc/include/asm/fadump-internal.h 
> b/arch/powerpc/include/asm/fadump-internal.h
> index 27f9e11eda28..7be3d8894520 100644
> --- a/arch/powerpc/include/asm/fadump-internal.h
> +++ b/arch/powerpc/include/asm/fadump-internal.h
> @@ -42,7 +42,25 @@ static inline u64 fadump_str_to_u64(const char *str)
>  
>  #define FADUMP_CPU_UNKNOWN   (~((u32)0))
>  
> -#define FADUMP_CRASH_INFO_MAGIC  fadump_str_to_u64("FADMPINF")
> +/*
> + * The introduction of new fields in the fadump crash info header has
> + * led to a change in the magic key, from `FADMPINF` to `FADMPSIG`.
> + * This alteration ensures backward compatibility, enabling the kernel
> + * with the updated fadump crash info to handle kernel dumps from older
> + * kernels.
> + *
> + * To prevent the need for further changes to the magic number in the
> + * event of future modifications to the fadump header, a version field
> + * has been introduced to track the fadump crash info header version.
> + *
> + * Historically, there was no connection between the magic number and
> + * the fadump crash info header version. However, moving forward, the
> + * `FADMPINF` magic number in header will be treated as version 0, while
> + * the `FADMPSIG` magic number in header will include a version field to
> + * determine its version.
> + */
> +#define FADUMP_CRASH_INFO_MAGIC  fadump_str_to_u64("FADMPSIG")
> +#define FADUMP_VERSION   1
>

Can we keep the old magic details as

#define FADUMP_CRASH_INFO_MAGIC_OLD fadump_str_to_u64("FADMPINF")
#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPSIG")

Also, considering the struct need not be backward compatible, can we just do:

struct fadump_crash_info_header {
	u64		magic_number;
	u32		crashing_cpu;
	u64		elfcorehdr_addr;
	u64		elfcorehdr_size;
	u64		vmcoreinfo_raddr;
	u64		vmcoreinfo_size;
	struct pt_regs	regs;
	struct cpumask	cpu_mask;
};

static inline bool fadump_compatible(struct fadump_crash_info_header *fdh)
{
	return (fdh->magic_number == FADUMP_CRASH_INFO_MAGIC);
}

and fail fadump if we find it not compatible?

-aneesh


[PATCH] powerpc/sched: Cleanup vcpu_is_preempted()

2023-11-13 Thread Aneesh Kumar K.V
No functional change in this patch. A helper is added to find out whether
a vcpu is dispatched by the hypervisor; use that instead of open-coding it.
Also clarify some of the comments.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/paravirt.h | 33 ++---
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/paravirt.h 
b/arch/powerpc/include/asm/paravirt.h
index ac4279208d63..b78b82d66057 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -76,6 +76,17 @@ static inline bool is_vcpu_idle(int vcpu)
 {
return lppaca_of(vcpu).idle;
 }
+
+static inline bool vcpu_is_dispatched(int vcpu)
+{
+   /*
+* This is the yield_count.  An "odd" value (low bit on) means that
+* the processor is yielded (either because of an OS yield or a
+* hypervisor preempt).  An even value implies that the processor is
+* currently executing.
+*/
+   return (!(yield_count_of(vcpu) & 1));
+}
 #else
 static inline bool is_shared_processor(void)
 {
@@ -109,6 +120,10 @@ static inline bool is_vcpu_idle(int vcpu)
 {
return false;
 }
+static inline bool vcpu_is_dispatched(int vcpu)
+{
+   return true;
+}
 #endif
 
 #define vcpu_is_preempted vcpu_is_preempted
@@ -134,12 +149,12 @@ static inline bool vcpu_is_preempted(int cpu)
 * If the hypervisor has dispatched the target CPU on a physical
 * processor, then the target CPU is definitely not preempted.
 */
-   if (!(yield_count_of(cpu) & 1))
+   if (vcpu_is_dispatched(cpu))
return false;
 
/*
-* If the target CPU has yielded to Hypervisor but OS has not
-* requested idle then the target CPU is definitely preempted.
+* if the target CPU is not dispatched and the guest OS
+* has not marked the CPU idle, then it is hypervisor preempted.
 */
if (!is_vcpu_idle(cpu))
return true;
@@ -166,7 +181,7 @@ static inline bool vcpu_is_preempted(int cpu)
 
/*
 * The PowerVM hypervisor dispatches VMs on a whole core
-* basis. So we know that a thread sibling of the local CPU
+* basis. So we know that a thread sibling of the executing CPU
 * cannot have been preempted by the hypervisor, even if it
 * has called H_CONFER, which will set the yield bit.
 */
@@ -174,15 +189,17 @@ static inline bool vcpu_is_preempted(int cpu)
return false;
 
/*
-* If any of the threads of the target CPU's core are not
-* preempted or ceded, then consider target CPU to be
-* non-preempted.
+* The specific target CPU was marked idle by the guest OS, but
+* for PowerVM we also check all the other CPUs in the core,
+* because it does core scheduling and one vcpu of the core
+* getting preempted by the hypervisor implies the other vcpus
+* can also be considered preempted.
 */
first_cpu = cpu_first_thread_sibling(cpu);
for (i = first_cpu; i < first_cpu + threads_per_core; i++) {
if (i == cpu)
continue;
-   if (!(yield_count_of(i) & 1))
+   if (vcpu_is_dispatched(i))
return false;
if (!is_vcpu_idle(i))
return true;
-- 
2.41.0



[PATCH v2] powerpc/book3s/hash: Drop _PAGE_PRIVILEGED from PAGE_NONE

2023-11-13 Thread Aneesh Kumar K.V
There used to be a dependency on _PAGE_PRIVILEGED with pte_savedwrite.
But that got dropped by commit 6a56ccbcf6c6 ("mm/autonuma: use
can_change_(pte|pmd)_writable() to replace savedwrite").

With the change in this patch, a numa fault pte (pte_protnone()) gets mapped
as a regular user pte with RWX cleared (no access), whereas earlier it used
to be mapped _PAGE_PRIVILEGED.

The hash fault handling code gets some WARN_ONs added because those
functions are not expected to get called with _PAGE_READ cleared.
Commit 18061c17c8ec ("powerpc/mm: Update PROTFAULT handling in the page
fault path") explains the details.

Also revert commit 1abce0580b89 ("powerpc/64s: Fix __pte_needs_flush()
false positive warning").

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 9 +++--
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 9 ++---
 arch/powerpc/mm/book3s64/hash_utils.c | 7 +++
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb77eddca54b..2cc58ac74080 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -17,12 +17,6 @@
 #define _PAGE_EXEC 0x1 /* execute permission */
 #define _PAGE_WRITE0x2 /* write access allowed */
 #define _PAGE_READ 0x4 /* read access allowed */
-#define _PAGE_NA   _PAGE_PRIVILEGED
-#define _PAGE_NAX  _PAGE_EXEC
-#define _PAGE_RO   _PAGE_READ
-#define _PAGE_ROX  (_PAGE_READ | _PAGE_EXEC)
-#define _PAGE_RW   (_PAGE_READ | _PAGE_WRITE)
-#define _PAGE_RWX  (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
 #define _PAGE_PRIVILEGED   0x8 /* kernel access only */
 #define _PAGE_SAO  0x00010 /* Strong access order */
 #define _PAGE_NON_IDEMPOTENT   0x00020 /* non idempotent memory */
@@ -529,6 +523,9 @@ static inline bool pte_user(pte_t pte)
 }
 
 #define pte_access_permitted pte_access_permitted
+/*
+ * execute-only mappings return false
+ */
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
/*
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 1950c1b825b4..fd642b729775 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -158,11 +158,6 @@ static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
 */
 }
 
-static inline bool __pte_protnone(unsigned long pte)
-{
-   return (pte & (pgprot_val(PAGE_NONE) | _PAGE_RWX)) == pgprot_val(PAGE_NONE);
-}
-
 static inline bool __pte_flags_need_flush(unsigned long oldval,
  unsigned long newval)
 {
@@ -179,8 +174,8 @@ static inline bool __pte_flags_need_flush(unsigned long oldval,
/*
 * We do not expect kernel mappings or non-PTEs or not-present PTEs.
 */
-   VM_WARN_ON_ONCE(!__pte_protnone(oldval) && oldval & _PAGE_PRIVILEGED);
-   VM_WARN_ON_ONCE(!__pte_protnone(newval) && newval & _PAGE_PRIVILEGED);
+   VM_WARN_ON_ONCE(oldval & _PAGE_PRIVILEGED);
+   VM_WARN_ON_ONCE(newval & _PAGE_PRIVILEGED);
VM_WARN_ON_ONCE(!(oldval & _PAGE_PTE));
VM_WARN_ON_ONCE(!(newval & _PAGE_PTE));
VM_WARN_ON_ONCE(!(oldval & _PAGE_PRESENT));
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index ad2afa08e62e..0626a25b0d72 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -310,9 +310,16 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags
else
rflags |= 0x3;
}
+   VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
} else {
if (pteflags & _PAGE_RWX)
rflags |= 0x2;
+   /*
+* We should never hit this in normal fault handling because
+* a permission check (check_pte_access()) will bubble this
+* to higher level linux handler even for PAGE_NONE.
+*/
+   VM_WARN_ONCE(!(pteflags & _PAGE_RWX), "no-access mapping request");
if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
-- 
2.41.0



Re: [PATCH v2 29/37] powerpc/nohash: Replace pte_user() by pte_read()

2023-11-13 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> Le 07/11/2023 à 14:34, Aneesh Kumar K.V a écrit :
>> Christophe Leroy  writes:
>> 
>>> Le 31/10/2023 à 11:15, Aneesh Kumar K.V a écrit :
>>>> Christophe Leroy  writes:



>>
>> 
>> We are adding the pte flags check not the map addr check there. Something 
>> like this?
>
> Well, ok, but then why do we want to do that check for ioremap() and not 
> for everything else ? vmap() for instance will not perform any such 
> check. All it does is to clear the EXEC bit.
>
> As far as I can see, no other architecture does such a check, so why is 
> it needed on powerpc at all ?
>
> Regardless, comments below.
>

Looking at ioremap_prot() I am not clear whether we can really use the
flag value argument as is. For example, x86 does

pgprot2cachemode(__pgprot(prot_val))

I see that we use ioremap_prot() for generic_access_phys(), and with
/dev/mem and __access_remote_vm() we can get called with a user pte
mapping's prot flags?

If such a prot value can be observed, then the original change to clear
EXEC and mark it privileged is required?

/* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */
pte = pte_exprotect(pte);
pte = pte_mkprivileged(pte);


We already handle exec in pgprot_nx(); do we need to add back
pte_mkprivileged()?


-aneesh


get_user_pages() and EXEC_ONLY mapping.

2023-11-10 Thread Aneesh Kumar K.V


Hello,

Some architectures can now support EXEC_ONLY mappings and I am wondering
what get_user_pages() on those addresses should return. Earlier
PROT_EXEC implied PROT_READ and pte_access_permitted() returned true for
that. But arm64 does have this explicit comment that says

 /*
 * p??_access_permitted() is true for valid user mappings (PTE_USER
 * bit set, subject to the write permission check). For execute-only
 * mappings, like PROT_EXEC with EPAN (both PTE_USER and PTE_UXN bits
 * not set) must return false. PROT_NONE mappings do not have the
 * PTE_VALID bit set.
 */

Is that correct? We should be able to get struct page for PROT_EXEC
mappings?

-aneesh


Re: [PATCH v2 29/37] powerpc/nohash: Replace pte_user() by pte_read()

2023-11-07 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> Le 31/10/2023 à 11:15, Aneesh Kumar K.V a écrit :
>> Christophe Leroy  writes:
>> 
>>> pte_user() is now only used in pte_access_permitted() to check
>>> access on vmas. User flag is cleared to make a page unreadable.
>>>
>>> So rename it pte_read() and remove pte_user() which isn't used
>>> anymore.
>>>
>>> For the time being it checks _PAGE_USER but in the near futur
>>> all plateforms will be converted to _PAGE_READ so lets support
>>> both for now.
>>>
>>> Signed-off-by: Christophe Leroy 
>>> ---
>>>   arch/powerpc/include/asm/nohash/32/pte-8xx.h |  7 ---
>>>   arch/powerpc/include/asm/nohash/pgtable.h| 13 +++--
>>>   arch/powerpc/mm/ioremap.c|  4 
>>>   3 files changed, 7 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
>>> b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
>>> index 62c965a4511a..1ee38befd29a 100644
>>> --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
>>> +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
>>> @@ -112,13 +112,6 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
>>>   
>>>   #define pte_mkwrite_novma pte_mkwrite_novma
>>>   
>>> -static inline bool pte_user(pte_t pte)
>>> -{
>>> -   return !(pte_val(pte) & _PAGE_SH);
>>> -}
>>> -
>>> -#define pte_user pte_user
>>> -
>>>   static inline pte_t pte_mkhuge(pte_t pte)
>>>   {
>>> return __pte(pte_val(pte) | _PAGE_SPS | _PAGE_HUGE);
>>> diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
>>> b/arch/powerpc/include/asm/nohash/pgtable.h
>>> index ee677162f9e6..aba56fe3b1c6 100644
>>> --- a/arch/powerpc/include/asm/nohash/pgtable.h
>>> +++ b/arch/powerpc/include/asm/nohash/pgtable.h
>>> @@ -160,9 +160,6 @@ static inline int pte_write(pte_t pte)
>>> return pte_val(pte) & _PAGE_WRITE;
>>>   }
>>>   #endif
>>> -#ifndef pte_read
>>> -static inline int pte_read(pte_t pte)  { return 1; }
>>> -#endif
>>>   static inline int pte_dirty(pte_t pte){ return pte_val(pte) & 
>>> _PAGE_DIRTY; }
>>>   static inline int pte_special(pte_t pte)  { return pte_val(pte) & 
>>> _PAGE_SPECIAL; }
>>>   static inline int pte_none(pte_t pte) { return (pte_val(pte) 
>>> & ~_PTE_NONE_MASK) == 0; }
>>> @@ -190,10 +187,14 @@ static inline int pte_young(pte_t pte)
>>>* and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
>>>* _PAGE_USER.  Need to explicitly match _PAGE_BAP_UR bit in that case 
>>> too.
>>>*/
>>> -#ifndef pte_user
>>> -static inline bool pte_user(pte_t pte)
>>> +#ifndef pte_read
>>> +static inline bool pte_read(pte_t pte)
>>>   {
>>> +#ifdef _PAGE_READ
>>> +   return (pte_val(pte) & _PAGE_READ) == _PAGE_READ;
>>> +#else
>>> return (pte_val(pte) & _PAGE_USER) == _PAGE_USER;
>>> +#endif
>>>   }
>>>   #endif
>>>   
>>> @@ -208,7 +209,7 @@ static inline bool pte_access_permitted(pte_t pte, bool 
>>> write)
>>>  * A read-only access is controlled by _PAGE_USER bit.
>>>  * We have _PAGE_READ set for WRITE and EXECUTE
>>>  */
>>> -   if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
>>> +   if (!pte_present(pte) || !pte_read(pte))
>>> return false;
>>>   
>>> if (write && !pte_write(pte))
>>> diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
>>> index 7823c38f09de..7b0afcabd89f 100644
>>> --- a/arch/powerpc/mm/ioremap.c
>>> +++ b/arch/powerpc/mm/ioremap.c
>>> @@ -50,10 +50,6 @@ void __iomem *ioremap_prot(phys_addr_t addr, size_t 
>>> size, unsigned long flags)
>>> if (pte_write(pte))
>>> pte = pte_mkdirty(pte);
>>>   
>>> -   /* we don't want to let _PAGE_USER leak out */
>>> -   if (WARN_ON(pte_user(pte)))
>>> -   return NULL;
>>>
>> 
>> This check is still valid right? I understand that we want to remove
>> _PAGE_USER. But then loosing this check is ok?
>
> Well, we may have to think about it for book3s/64. For all others 
> _PAGE_USER is gone and replaced by a check of addresses versus TASK_SIZE.
>
> As ioremap() will map into vmalloc space that address i

[RFC PATCH] powerpc/book3s/hash: Drop _PAGE_PRIVILEGED from PAGE_NONE

2023-11-02 Thread Aneesh Kumar K.V
There used to be a dependency on _PAGE_PRIVILEGED with pte_savedwrite.
But that got dropped by commit 6a56ccbcf6c6 ("mm/autonuma: use
can_change_(pte|pmd)_writable() to replace savedwrite").

With this change, a numa fault pte (pte_protnone()) gets mapped as a regular
user pte with RWX cleared (no access). This also removes pte_user() from
book3s/64.

pte_access_permitted() now checks for _PAGE_EXEC because we now support
EXECONLY mappings.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 23 +---
 arch/powerpc/mm/book3s64/hash_utils.c| 17 +++
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index cb77eddca54b..7c7de7b56df0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -17,12 +17,6 @@
 #define _PAGE_EXEC 0x1 /* execute permission */
 #define _PAGE_WRITE0x2 /* write access allowed */
 #define _PAGE_READ 0x4 /* read access allowed */
-#define _PAGE_NA   _PAGE_PRIVILEGED
-#define _PAGE_NAX  _PAGE_EXEC
-#define _PAGE_RO   _PAGE_READ
-#define _PAGE_ROX  (_PAGE_READ | _PAGE_EXEC)
-#define _PAGE_RW   (_PAGE_READ | _PAGE_WRITE)
-#define _PAGE_RWX  (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
 #define _PAGE_PRIVILEGED   0x8 /* kernel access only */
 #define _PAGE_SAO  0x00010 /* Strong access order */
 #define _PAGE_NON_IDEMPOTENT   0x00020 /* non idempotent memory */
@@ -119,9 +113,9 @@
 /*
  * user access blocked by key
  */
-#define _PAGE_KERNEL_RW(_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY)
 #define _PAGE_KERNEL_RO (_PAGE_PRIVILEGED | _PAGE_READ)
 #define _PAGE_KERNEL_ROX(_PAGE_PRIVILEGED | _PAGE_READ | _PAGE_EXEC)
+#define _PAGE_KERNEL_RW(_PAGE_PRIVILEGED | _PAGE_RW | _PAGE_DIRTY)
 #define _PAGE_KERNEL_RWX   (_PAGE_PRIVILEGED | _PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC)
 /*
  * _PAGE_CHG_MASK masks of bits that are to be preserved across
@@ -523,19 +517,14 @@ static inline bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
 }
 #endif /* CONFIG_PPC_MEM_KEYS */
 
-static inline bool pte_user(pte_t pte)
-{
-   return !(pte_raw(pte) & cpu_to_be64(_PAGE_PRIVILEGED));
-}
-
 #define pte_access_permitted pte_access_permitted
 static inline bool pte_access_permitted(pte_t pte, bool write)
 {
-   /*
-* _PAGE_READ is needed for any access and will be
-* cleared for PROT_NONE
-*/
-   if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
+
+   if (!pte_present(pte))
+   return false;
+
+   if (!(pte_read(pte) || pte_exec(pte)))
return false;
 
if (write && !pte_write(pte))
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index ad2afa08e62e..b2eda22195f0 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -310,9 +310,26 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags, unsigned long flags
else
rflags |= 0x3;
}
+   WARN_ON(!(pteflags & _PAGE_RWX));
} else {
if (pteflags & _PAGE_RWX)
rflags |= 0x2;
+   else {
+   /*
+* PAGE_NONE will get mapped to 0b110 (slb key 1 no access)
+* We picked 0b110 instead of 0b000 so that slb key 0 will
+* get only read only access for the same rflags.
+*/
+   if (mmu_has_feature(MMU_FTR_KERNEL_RO))
+   rflags |= (HPTE_R_PP0 | 0x2);
+   /*
+* rflags = HPTE_R_N
+* Without KERNEL_RO feature this will result in slb
+* key 0 with read/write. But ISA only supports that.
+* There is no key 1 no-access and key 0 read-only
+* pp bit support.
+*/
+   }
if (!((pteflags & _PAGE_WRITE) && (pteflags & _PAGE_DIRTY)))
rflags |= 0x1;
}
-- 
2.41.0



Re: [PATCH v2 37/37] powerpc: Support execute-only on all powerpc

2023-11-01 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> Introduce PAGE_EXECONLY_X macro which provides exec-only rights.
> The _X may be seen as redundant with the EXECONLY but it helps
> keep consistancy, all macros having the EXEC right have _X.
>
> And put it next to PAGE_NONE as PAGE_EXECONLY_X is
> somehow PAGE_NONE + EXEC just like all other SOMETHING_X are
> just SOMETHING + EXEC.
>
> On book3s/64 PAGE_EXECONLY becomes PAGE_READONLY_X.
>
> On book3s/64, as PAGE_EXECONLY is only valid for Radix add
> VM_READ flag in vm_get_page_prot() for non-Radix.
>
> And update access_error() so that a non exec fault on a VM_EXEC only
> mapping is always invalid, even when the underlying layer don't
> always generate a fault for that.
>
> For 8xx, set PAGE_EXECONLY_X as _PAGE_NA | _PAGE_EXEC.
> For others, only set it as just _PAGE_EXEC
>
> With that change, 8xx, e500 and 44x fully honor execute-only
> protection.
>
> On 40x that is a partial implementation of execute-only. The
> implementation won't be complete because once a TLB has been loaded
> via the Instruction TLB miss handler, it will be possible to read
> the page. But at least it can't be read unless it is executed first.
>
> On 603 MMU, TLB missed are handled by SW and there are separate
> DTLB and ITLB. Execute-only is therefore now supported by not loading
> DTLB when read access is not permitted.
>
> On hash (604) MMU it is more tricky because hash table is common to
> load/store and execute. Nevertheless it is still possible to check
> whether _PAGE_READ is set before loading hash table for a load/store
> access. At least it can't be read unless it is executed first.
>
> Signed-off-by: Christophe Leroy 
> Cc: Russell Currey 
> Cc: Kees Cook 
> ---
>  arch/powerpc/include/asm/book3s/32/pgtable.h |  2 +-
>  arch/powerpc/include/asm/book3s/64/pgtable.h |  4 +---
>  arch/powerpc/include/asm/nohash/32/pte-8xx.h |  1 +
>  arch/powerpc/include/asm/nohash/pgtable.h|  2 +-
>  arch/powerpc/include/asm/nohash/pte-e500.h   |  1 +
>  arch/powerpc/include/asm/pgtable-masks.h |  2 ++
>  arch/powerpc/mm/book3s64/pgtable.c   | 10 --
>  arch/powerpc/mm/fault.c  |  9 +
>  arch/powerpc/mm/pgtable.c|  4 ++--
>  9 files changed, 18 insertions(+), 17 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
> b/arch/powerpc/include/asm/book3s/32/pgtable.h
> index 244621c88510..52971ee30717 100644
> --- a/arch/powerpc/include/asm/book3s/32/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
> @@ -425,7 +425,7 @@ static inline bool pte_access_permitted(pte_t pte, bool 
> write)
>  {
>   /*
>* A read-only access is controlled by _PAGE_READ bit.
> -  * We have _PAGE_READ set for WRITE and EXECUTE
> +  * We have _PAGE_READ set for WRITE
>*/
>   if (!pte_present(pte) || !pte_read(pte))
>   return false; 
>

Should this now be updated to check for the EXEC bit?

> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 0fd12bdc7b5e..751b01227e36 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -18,6 +18,7 @@
>  #define _PAGE_WRITE  0x2 /* write access allowed */
>  #define _PAGE_READ   0x4 /* read access allowed */
>  #define _PAGE_NA _PAGE_PRIVILEGED
> +#define _PAGE_NAX_PAGE_EXEC
>  #define _PAGE_RO _PAGE_READ
>  #define _PAGE_ROX(_PAGE_READ | _PAGE_EXEC)
>  #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE)
> @@ -141,9 +142,6 @@
>  
>  #include 
>  
> -/* Radix only, Hash uses PAGE_READONLY_X + execute-only pkey instead */
> -#define PAGE_EXECONLY__pgprot(_PAGE_BASE | _PAGE_EXEC)
> -
>  /* Permission masks used for kernel mappings */
>  #define PAGE_KERNEL  __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW)
>  #define PAGE_KERNEL_NC   __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | 
> _PAGE_TOLERANT)
> diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
> b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> index 1ee38befd29a..137dc3c84e45 100644
> --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> @@ -48,6 +48,7 @@
>  
>  #define _PAGE_HUGE   0x0800  /* Copied to L1 PS bit 29 */
>  
> +#define _PAGE_NAX(_PAGE_NA | _PAGE_EXEC)
>  #define _PAGE_ROX(_PAGE_RO | _PAGE_EXEC)
>  #define _PAGE_RW 0
>  #define _PAGE_RWX_PAGE_EXEC
> diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
> b/arch/powerpc/include/asm/nohash/pgtable.h
> index f922c84b23eb..a50be1de9f83 100644
> --- a/arch/powerpc/include/asm/nohash/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/pgtable.h
> @@ -203,7 +203,7 @@ static inline bool pte_access_permitted(pte_t pte, bool 
> write)
>  {
>   /*
>* A read-only access is controlled by _PAGE_READ bit.
> -  * We have _PAGE_READ set for WRITE and EXECUTE
> +  

Re: [PATCH v2 29/37] powerpc/nohash: Replace pte_user() by pte_read()

2023-10-31 Thread Aneesh Kumar K.V
Christophe Leroy  writes:

> pte_user() is now only used in pte_access_permitted() to check
> access on vmas. User flag is cleared to make a page unreadable.
>
> So rename it pte_read() and remove pte_user() which isn't used
> anymore.
>
> For the time being it checks _PAGE_USER but in the near futur
> all plateforms will be converted to _PAGE_READ so lets support
> both for now.
>
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/nohash/32/pte-8xx.h |  7 ---
>  arch/powerpc/include/asm/nohash/pgtable.h| 13 +++--
>  arch/powerpc/mm/ioremap.c|  4 
>  3 files changed, 7 insertions(+), 17 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
> b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> index 62c965a4511a..1ee38befd29a 100644
> --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
> @@ -112,13 +112,6 @@ static inline pte_t pte_mkwrite_novma(pte_t pte)
>  
>  #define pte_mkwrite_novma pte_mkwrite_novma
>  
> -static inline bool pte_user(pte_t pte)
> -{
> - return !(pte_val(pte) & _PAGE_SH);
> -}
> -
> -#define pte_user pte_user
> -
>  static inline pte_t pte_mkhuge(pte_t pte)
>  {
>   return __pte(pte_val(pte) | _PAGE_SPS | _PAGE_HUGE);
> diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
> b/arch/powerpc/include/asm/nohash/pgtable.h
> index ee677162f9e6..aba56fe3b1c6 100644
> --- a/arch/powerpc/include/asm/nohash/pgtable.h
> +++ b/arch/powerpc/include/asm/nohash/pgtable.h
> @@ -160,9 +160,6 @@ static inline int pte_write(pte_t pte)
>   return pte_val(pte) & _PAGE_WRITE;
>  }
>  #endif
> -#ifndef pte_read
> -static inline int pte_read(pte_t pte){ return 1; }
> -#endif
>  static inline int pte_dirty(pte_t pte)   { return pte_val(pte) & 
> _PAGE_DIRTY; }
>  static inline int pte_special(pte_t pte) { return pte_val(pte) & 
> _PAGE_SPECIAL; }
>  static inline int pte_none(pte_t pte){ return (pte_val(pte) 
> & ~_PTE_NONE_MASK) == 0; }
> @@ -190,10 +187,14 @@ static inline int pte_young(pte_t pte)
>   * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in
>   * _PAGE_USER.  Need to explicitly match _PAGE_BAP_UR bit in that case too.
>   */
> -#ifndef pte_user
> -static inline bool pte_user(pte_t pte)
> +#ifndef pte_read
> +static inline bool pte_read(pte_t pte)
>  {
> +#ifdef _PAGE_READ
> + return (pte_val(pte) & _PAGE_READ) == _PAGE_READ;
> +#else
>   return (pte_val(pte) & _PAGE_USER) == _PAGE_USER;
> +#endif
>  }
>  #endif
>  
> @@ -208,7 +209,7 @@ static inline bool pte_access_permitted(pte_t pte, bool 
> write)
>* A read-only access is controlled by _PAGE_USER bit.
>* We have _PAGE_READ set for WRITE and EXECUTE
>*/
> - if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte))
> + if (!pte_present(pte) || !pte_read(pte))
>   return false;
>  
>   if (write && !pte_write(pte))
> diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c
> index 7823c38f09de..7b0afcabd89f 100644
> --- a/arch/powerpc/mm/ioremap.c
> +++ b/arch/powerpc/mm/ioremap.c
> @@ -50,10 +50,6 @@ void __iomem *ioremap_prot(phys_addr_t addr, size_t size, 
> unsigned long flags)
>   if (pte_write(pte))
>   pte = pte_mkdirty(pte);
>  
> - /* we don't want to let _PAGE_USER leak out */
> - if (WARN_ON(pte_user(pte)))
> - return NULL;
>

This check is still valid, right? I understand that we want to remove
_PAGE_USER, but is losing this check OK?

> -
>   if (iowa_is_active())
>   return iowa_ioremap(addr, size, pte_pgprot(pte), caller);
>   return __ioremap_caller(addr, size, pte_pgprot(pte), caller);
> -- 
> 2.41.0


Re: [PATCH 02/12] powerpc/pseries: Restructure hvc_get_chars() endianness

2023-10-30 Thread Aneesh Kumar K.V
Benjamin Gray  writes:

> Sparse reports an endian mismatch in hvc_get_chars().
>
> At first it seemed like the retbuf should be __be64[], but actually
> retbuf holds serialized registers returned by the hypervisor call, so
> it's correctly CPU endian typed.
>
> Instead, it is the be64_to_cpu() that's misleading. The intent is to do
> a byte swap on a little endian CPU. The swap is required because we
> wanted to store the register values to memory without 'swapping' bytes,
> so that the high order byte of the first register is the first byte
> in the result buffer.
>
> In diagram form, on a LE CPU with the letters representing the return
> string bytes:
>
> (register bytes) A B C D E F G H   I J K L M N O P
>   (retbuf mem bytes) H G F E D C B A   P O N M L K J I
> (buf/lbuf mem bytes) A B C D E F G H   I J K L M N O P
>
> So retbuf stores the registers in CPU endian, and buf always stores in
> big endian.
>
> So replace the byte swap function with cpu_to_be64() and cast lbuf as an
> array of __be64 to match the semantics closer.
>
> Signed-off-by: Benjamin Gray 
> ---
>  arch/powerpc/platforms/pseries/hvconsole.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/hvconsole.c 
> b/arch/powerpc/platforms/pseries/hvconsole.c
> index 1ac52963e08b..647718a15e78 100644
> --- a/arch/powerpc/platforms/pseries/hvconsole.c
> +++ b/arch/powerpc/platforms/pseries/hvconsole.c
> @@ -29,11 +29,11 @@ int hvc_get_chars(uint32_t vtermno, char *buf, int count)
>  {
>   long ret;
>   unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> - unsigned long *lbuf = (unsigned long *)buf;
> + __be64 *lbuf = (__be64 __force *)buf;
>  
>   ret = plpar_hcall(H_GET_TERM_CHAR, retbuf, vtermno);
> - lbuf[0] = be64_to_cpu(retbuf[1]);
> - lbuf[1] = be64_to_cpu(retbuf[2]);
> + lbuf[0] = cpu_to_be64(retbuf[1]);
> + lbuf[1] = cpu_to_be64(retbuf[2]);
>  
>   if (ret == H_SUCCESS)
>   return retbuf[0];
>

There is no functional change in this patch. It clarifies that buf is
expected to be in big-endian byte order while retbuf holds the
native-endian register values.
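
For reference, a quick stand-alone illustration of the byte ordering
described above (plain user-space C, with glibc's htobe64() standing in
for the kernel's cpu_to_be64(); purely illustrative, not part of the
patch):

	#include <endian.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* The hypervisor returns "ABCDEFGH" packed into one register,
		 * with 'A' (0x41) in the most significant byte. */
		uint64_t reg = 0x4142434445464748ULL;
		uint64_t be;
		char raw[9] = { 0 }, buf[9] = { 0 };

		memcpy(raw, &reg, sizeof(reg));	/* plain store: "HGFEDCBA" on LE */

		be = htobe64(reg);		/* stand-in for cpu_to_be64() */
		memcpy(buf, &be, sizeof(be));	/* "ABCDEFGH" on LE and BE alike */

		printf("plain store:        %s\n", raw);
		printf("byte-swapped store: %s\n", buf);
		return 0;
	}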

Not sure why this was not picked up.

Reviewed-by: Aneesh Kumar K.V 


Re: [PATCH v7 1/5] powerpc/code-patching: introduce patch_instructions()

2023-10-30 Thread Aneesh Kumar K.V
Hari Bathini  writes:

> patch_instruction() entails setting up pte, patching the instruction,
> clearing the pte and flushing the tlb. If multiple instructions need
> to be patched, every instruction would have to go through the above
> drill unnecessarily. Instead, introduce patch_instructions() function
> that sets up the pte, clears the pte and flushes the tlb only once
> per page range of instructions to be patched. Duplicate most of the
> patch_instruction() code instead of merging with it, to avoid the
> performance degradation observed on ppc32, for patch_instruction(),
> with the code path merged. Also, setup poking_init() always as BPF
> expects poking_init() to be setup even when STRICT_KERNEL_RWX is off.
>
> Signed-off-by: Hari Bathini 
> Acked-by: Song Liu 
>

A lot of this duplicates patch_instruction(). Can we consolidate things
between them?
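
For illustration, a rough, untested sketch of one possible consolidation
(it reuses the helpers added by this patch and sets aside the ppc32
performance concern mentioned in the commit message):

	/*
	 * Sketch only: express single-instruction patching on top of
	 * patch_instructions(), so the pte setup/teardown lives in one
	 * place.
	 */
	int patch_instruction(u32 *addr, ppc_inst_t instr)
	{
		if (ppc_inst_prefixed(instr)) {
			u64 val = ppc_inst_as_ulong(instr);

			return patch_instructions(addr, (u32 *)&val, sizeof(val), false);
		} else {
			u32 val = ppc_inst_val(instr);

			return patch_instructions(addr, &val, sizeof(val), false);
		}
	}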

> ---
>
> Changes in v7:
> * Fixed crash observed with !STRICT_RWX.
>
>
>  arch/powerpc/include/asm/code-patching.h |   1 +
>  arch/powerpc/lib/code-patching.c | 141 ++-
>  2 files changed, 139 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/code-patching.h 
> b/arch/powerpc/include/asm/code-patching.h
> index 3f881548fb61..0e29ccf903d0 100644
> --- a/arch/powerpc/include/asm/code-patching.h
> +++ b/arch/powerpc/include/asm/code-patching.h
> @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr,
>  int patch_branch(u32 *addr, unsigned long target, int flags);
>  int patch_instruction(u32 *addr, ppc_inst_t instr);
>  int raw_patch_instruction(u32 *addr, ppc_inst_t instr);
> +int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr);
>  
>  static inline unsigned long patch_site_addr(s32 *site)
>  {
> diff --git a/arch/powerpc/lib/code-patching.c 
> b/arch/powerpc/lib/code-patching.c
> index b00112d7ad46..e1c1fd9246d8 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -204,9 +204,6 @@ void __init poking_init(void)
>  {
>   int ret;
>  
> - if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
> - return;
> -
>   if (mm_patch_enabled())
>   ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
>   "powerpc/text_poke_mm:online",
> @@ -378,6 +375,144 @@ int patch_instruction(u32 *addr, ppc_inst_t instr)
>  }
>  NOKPROBE_SYMBOL(patch_instruction);
>  
> +static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool 
> repeat_instr)
> +{
> + unsigned long start = (unsigned long)patch_addr;
> +
> + /* Repeat instruction */
> + if (repeat_instr) {
> + ppc_inst_t instr = ppc_inst_read(code);
> +
> + if (ppc_inst_prefixed(instr)) {
> + u64 val = ppc_inst_as_ulong(instr);
> +
> + memset64((u64 *)patch_addr, val, len / 8);
> + } else {
> + u32 val = ppc_inst_val(instr);
> +
> + memset32(patch_addr, val, len / 4);
> + }
> + } else {
> + memcpy(patch_addr, code, len);
> + }
> +
> + smp_wmb();  /* smp write barrier */
> + flush_icache_range(start, start + len);
> + return 0;
> +}
> +
> +/*
> + * A page is mapped and instructions that fit the page are patched.
> + * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below.
> + */
> +static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool 
> repeat_instr)
> +{
> + struct mm_struct *patching_mm, *orig_mm;
> + unsigned long pfn = get_patch_pfn(addr);
> + unsigned long text_poke_addr;
> + spinlock_t *ptl;
> + u32 *patch_addr;
> + pte_t *pte;
> + int err;
> +
> + patching_mm = __this_cpu_read(cpu_patching_context.mm);
> + text_poke_addr = __this_cpu_read(cpu_patching_context.addr);
> + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr));
> +
> + pte = get_locked_pte(patching_mm, text_poke_addr, &ptl);
> + if (!pte)
> + return -ENOMEM;
> +
> + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, 
> PAGE_KERNEL), 0);
> +
> + /* order PTE update before use, also serves as the hwsync */
> + asm volatile("ptesync" ::: "memory");
> +
> + /* order context switch after arbitrary prior code */
> + isync();
> +
> + orig_mm = start_using_temp_mm(patching_mm);
> +
> + err = __patch_instructions(patch_addr, code, len, repeat_instr);
> +
> + /* context synchronisation performed by __patch_instructions */
> + stop_using_temp_mm(patching_mm, orig_mm);
> +
> + pte_clear(patching_mm, text_poke_addr, pte);
> + /*
> +  * ptesync to order PTE update before TLB invalidation done
> +  * by radix__local_flush_tlb_page_psize (in _tlbiel_va)
> +  */
> + local_flush_tlb_page_psize(patching_mm, text_poke_addr, 
> mmu_virtual_psize);
> +
> + pte_unmap_unlock(pte, ptl);
> +
> + return err;
> +}
> +
> +/*
> + * A 

[PATCH v2] powerpc/mm: Avoid calling arch_enter/leave_lazy_mmu() in set_ptes

2023-10-24 Thread Aneesh Kumar K.V
With commit 9fee28baa601 ("powerpc: implement the new page table range
API") we added set_ptes to the powerpc architecture. The implementation
included arch_enter/leave_lazy_mmu() calls.

The patch removes the usage of arch_enter/leave_lazy_mmu() because
set_pte is not supposed to be used when updating a pte entry. The powerpc
architecture uses this rule to skip the expensive tlb invalidate which
is not needed when you are setting up the pte for the first time. See
commit 56eecdb912b5 ("mm: Use ptep/pmdp_set_numa() for updating
_PAGE_NUMA bit") for more details.

The patch also makes sure we are not using the interface to update a
valid/present pte entry, by adding a VM_WARN_ON check on all the ptes we
are setting up. Furthermore, we add a comment to set_pte_filter to
clarify that it can only update folio-related flags and cannot filter
out pfn-specific details.

Removing arch_enter/leave_lazy_mmu() also avoids nesting of these
functions, which is not supported. For example:

remap_pte_range()
  -> arch_enter_lazy_mmu()
  -> set_ptes()
  -> arch_enter_lazy_mmu()
  -> arch_leave_lazy_mmu()
  -> arch_leave_lazy_mmu()

Fixes: 9fee28baa601 ("powerpc: implement the new page table range API")
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/pgtable.c | 32 ++--
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3ba9fe411604..4d69bfb9bc11 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -104,6 +104,8 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; }
 /* Embedded type MMU with HW exec support. This is a bit more complicated
  * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so
  * instead we "filter out" the exec permission for non clean pages.
+ *
+ * This is also called once for the folio. So only work with folio->flags here.
  */
 static inline pte_t set_pte_filter(pte_t pte)
 {
@@ -190,29 +192,39 @@ static pte_t set_access_flags_filter(pte_t pte, struct 
vm_area_struct *vma,
 void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte, unsigned int nr)
 {
-   /*
-* Make sure hardware valid bit is not set. We don't do
-* tlb flush for this update.
-*/
-   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
 
/* Note: mm->context.id might not yet have been assigned as
 * this context might not have been activated yet when this
-* is called.
+* is called. Filter the pte value and use the filtered value
+* to setup all the ptes in the range.
 */
pte = set_pte_filter(pte);
 
-   /* Perform the setting of the PTE */
-   arch_enter_lazy_mmu_mode();
+   /*
+* We don't need to call arch_enter/leave_lazy_mmu_mode()
+* because we expect set_ptes to be only be used on not present
+* and not hw_valid ptes. Hence there is no translation cache flush
+* involved that need to be batched.
+*/
for (;;) {
+
+   /*
+* Make sure hardware valid bit is not set. We don't do
+* tlb flush for this update.
+*/
+   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+   /* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
if (--nr == 0)
break;
ptep++;
-   pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
addr += PAGE_SIZE;
+   /*
+* increment the pfn.
+*/
+   pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte)));
}
-   arch_leave_lazy_mmu_mode();
 }
 
 void unmap_kernel_page(unsigned long va)
-- 
2.41.0



Re: [PATCH] powerpc/mm: Update set_ptes to call pte_filter for all the ptes

2023-10-18 Thread Aneesh Kumar K.V
Aneesh Kumar K V  writes:

> On 10/18/23 11:25 AM, Christophe Leroy wrote:
>> 
>> 
>> Le 18/10/2023 à 06:55, Aneesh Kumar K.V a écrit :
>>> With commit 9fee28baa601 ("powerpc: implement the new page table range
>>> API") we added set_ptes to powerpc architecture but the implementation
>>> missed calling the pte filter for all the ptes we are setting in the
>>> range. set_pte_filter can be used for filter pte values and on some
>>> platforms which don't support coherent icache it clears the exec bit so
>>> that we can flush the icache on exec fault
>>>
>>> The patch also removes the usage of arch_enter/leave_lazy_mmu() because
>>> set_pte is not supposed to be used when updating a pte entry. Powerpc
>>> architecture uses this rule to skip the expensive tlb invalidate which
>>> is not needed when you are setting up the pte for the first time. See
>>> commit 56eecdb912b5 ("mm: Use ptep/pmdp_set_numa() for updating
>>> _PAGE_NUMA bit") for more details
>>>
>>> Fixes: 9fee28baa601 ("powerpc: implement the new page table range API")
>>> Signed-off-by: Aneesh Kumar K.V 
>>> ---
>>>   arch/powerpc/mm/pgtable.c | 33 -
>>>   1 file changed, 20 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
>>> index 3ba9fe411604..95ab20cca2da 100644
>>> --- a/arch/powerpc/mm/pgtable.c
>>> +++ b/arch/powerpc/mm/pgtable.c
>>> @@ -191,28 +191,35 @@ void set_ptes(struct mm_struct *mm, unsigned long 
>>> addr, pte_t *ptep,
>>> pte_t pte, unsigned int nr)
>>>   {
>>> /*
>>> -* Make sure hardware valid bit is not set. We don't do
>>> -* tlb flush for this update.
>>> +* We don't need to call arch_enter/leave_lazy_mmu_mode()
>>> +* because we expect set_ptes to be only be used on not present
>>> +* and not hw_valid ptes. Hence there is not translation cache flush
>>> +* involved that need to be batched.
>>>  */
>>> -   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
>>> +   for (;;) {
>>>   
>>> -   /* Note: mm->context.id might not yet have been assigned as
>>> -* this context might not have been activated yet when this
>>> -* is called.
>>> -*/
>>> -   pte = set_pte_filter(pte);
>>> +   /*
>>> +* Make sure hardware valid bit is not set. We don't do
>>> +* tlb flush for this update.
>>> +*/
>>> +   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
>>>   
>>> -   /* Perform the setting of the PTE */
>>> -   arch_enter_lazy_mmu_mode();
>>> -   for (;;) {
>>> +   /* Note: mm->context.id might not yet have been assigned as
>>> +* this context might not have been activated yet when this
>>> +* is called.
>>> +*/
>>> +   pte = set_pte_filter(pte);
>> 
>> Why do you need to call set_pte_filter() inside the loop ?
>> The only difference between previous pte and next pte is the RPN, other 
>> flags remain untouched so I can't see why you need to call 
>> set_pte_filter() again.
>> 
>
> I missed the fact that we use the filtered pte in all the ptes in the range. 
> One other details
> that made me look at calling the filter in the loop was we clearing the 
> struct page->flags.
> The only flag right now we care about the PG_dcache_clean and that moved to 
> folio. So we might be
> good here. May be we add a comment in set_pte_filter saying can operate only 
> on folio->flags ? 
>
>>> +
>>> +   /* Perform the setting of the PTE */
>>> __set_pte_at(mm, addr, ptep, pte, 0);
>>> if (--nr == 0)
>>> break;
>>> ptep++;
>>> -   pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
>>>         addr += PAGE_SIZE;
>>> +   /* increment the pfn */
>>> +   pte = __pte(pte_val(pte) + PAGE_SIZE);
>> 
>> PAGE_SIZE doesn't work on all platforms, see for instance e500.
>> 
>> see comment at 
>> https://elixir.bootlin.com/linux/v6.3-rc2/source/arch/powerpc/include/asm/nohash/32/pgtable.h#L147
>> 
>> And then you see 
>> https://elixir.bootlin.com/linux/v6.3-rc2/source/arch/powerpc/include/asm/nohash/pte-e500.h#L63

[PATCH] powerpc/mm: Update set_ptes to call pte_filter for all the ptes

2023-10-17 Thread Aneesh Kumar K.V
With commit 9fee28baa601 ("powerpc: implement the new page table range
API") we added set_ptes to the powerpc architecture, but the implementation
missed calling the pte filter for all the ptes we are setting in the
range. set_pte_filter can be used to filter pte values, and on some
platforms which don't support coherent icache it clears the exec bit so
that we can flush the icache on an exec fault.

The patch also removes the usage of arch_enter/leave_lazy_mmu() because
set_pte is not supposed to be used when updating a pte entry. The powerpc
architecture uses this rule to skip the expensive tlb invalidate which
is not needed when you are setting up the pte for the first time. See
commit 56eecdb912b5 ("mm: Use ptep/pmdp_set_numa() for updating
_PAGE_NUMA bit") for more details.

Fixes: 9fee28baa601 ("powerpc: implement the new page table range API")
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/pgtable.c | 33 -
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3ba9fe411604..95ab20cca2da 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -191,28 +191,35 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
pte_t pte, unsigned int nr)
 {
/*
-* Make sure hardware valid bit is not set. We don't do
-* tlb flush for this update.
+* We don't need to call arch_enter/leave_lazy_mmu_mode()
+* because we expect set_ptes to be only be used on not present
+* and not hw_valid ptes. Hence there is not translation cache flush
+* involved that need to be batched.
 */
-   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+   for (;;) {
 
-   /* Note: mm->context.id might not yet have been assigned as
-* this context might not have been activated yet when this
-* is called.
-*/
-   pte = set_pte_filter(pte);
+   /*
+* Make sure hardware valid bit is not set. We don't do
+* tlb flush for this update.
+*/
+   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
 
-   /* Perform the setting of the PTE */
-   arch_enter_lazy_mmu_mode();
-   for (;;) {
+   /* Note: mm->context.id might not yet have been assigned as
+* this context might not have been activated yet when this
+* is called.
+*/
+   pte = set_pte_filter(pte);
+
+   /* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
if (--nr == 0)
break;
ptep++;
-   pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
addr += PAGE_SIZE;
+   /* increment the pfn */
+   pte = __pte(pte_val(pte) + PAGE_SIZE);
+
}
-   arch_leave_lazy_mmu_mode();
 }
 
 void unmap_kernel_page(unsigned long va)
-- 
2.41.0



Re: [PATCH 0/2] Allow nesting of lazy MMU mode

2023-10-17 Thread Aneesh Kumar K.V
Erhard Furtner  writes:

> On Thu, 12 Oct 2023 20:54:13 +0100
> "Matthew Wilcox (Oracle)"  wrote:
>
>> Dave Woodhouse reported that we now nest calls to
>> arch_enter_lazy_mmu_mode().  That was inadvertent, but in principle we
>> should allow it.  On further investigation, Juergen already fixed it
>> for Xen, but didn't tell anyone.  Fix it for Sparc & PowerPC too.
>> This may or may not help fix the problem that Erhard reported.
>> 
>> Matthew Wilcox (Oracle) (2):
>>   powerpc: Allow nesting of lazy MMU mode
>>   sparc: Allow nesting of lazy MMU mode
>> 
>>  arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 5 ++---
>>  arch/sparc/mm/tlb.c| 5 ++---
>>  2 files changed, 4 insertions(+), 6 deletions(-)
>> 
>> -- 
>> 2.40.1
>
> Applied the patch on top of v6.6-rc5 but unfortunately it did not fix my 
> reported issue.
>
> Regards,
> Erhard
> 

With the problem reported, I guess we are finding the page->compound_head
wrong and hence the folio->flags PG_dcache_clean check is crashing. I still
don't know why we find page->compound_head wrong. Michael noted we are
using FLATMEM. That implies we are supposed to initialize struct page
correctly via init_unavailable_range, because we are hitting this on an
ioremap address. We need to instrument the kernel to track the
initialization of the struct page backing these pfns, which we know is
crashing.

W.r.t. arch_enter_lazy_mmu_mode(), we can skip that completely on powerpc
because we don't allow the usage of set_pte on valid pte entries. pte
updates are not done via the set_pte interface and hence there is no TLB
invalidate required while using set_pte().

ie, we can do something like below. The change also makes sure we call
set_pte_filter on all the ptes we are setting via set_ptes(). I haven't
sent this as a proper patch because we are still not able to fix the
issue Erhard reported.

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3ba9fe411604..95ab20cca2da 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -191,28 +191,35 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
pte_t pte, unsigned int nr)
 {
/*
-* Make sure hardware valid bit is not set. We don't do
-* tlb flush for this update.
+* We don't need to call arch_enter/leave_lazy_mmu_mode()
+* because we expect set_ptes to be only be used on not present
+* and not hw_valid ptes. Hence there is not translation cache flush
+* involved that need to be batched.
 */
-   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+   for (;;) {
 
-   /* Note: mm->context.id might not yet have been assigned as
-* this context might not have been activated yet when this
-* is called.
-*/
-   pte = set_pte_filter(pte);
+   /*
+* Make sure hardware valid bit is not set. We don't do
+* tlb flush for this update.
+*/
+   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
 
-   /* Perform the setting of the PTE */
-   arch_enter_lazy_mmu_mode();
-   for (;;) {
+   /* Note: mm->context.id might not yet have been assigned as
+* this context might not have been activated yet when this
+* is called.
+*/
+   pte = set_pte_filter(pte);
+
+   /* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
if (--nr == 0)
break;
ptep++;
-   pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
addr += PAGE_SIZE;
+   /* increment the pfn */
+   pte = __pte(pte_val(pte) + PAGE_SIZE);
+
}
-   arch_leave_lazy_mmu_mode();
 }
 
 void unmap_kernel_page(unsigned long va)


Re: [Bisected] PowerMac G5 fails booting kernel 6.6-rc3 (BUG: Unable to handle kernel data access at 0xfeffbb62ffec65fe)

2023-10-06 Thread Aneesh Kumar K.V
Erhard Furtner  writes:

> On Fri, 06 Oct 2023 11:04:15 +0530
> "Aneesh Kumar K.V"  wrote:
>
>> Can you check this change?
>> 
>> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
>> index 3ba9fe411604..6d144fedd557 100644
>

...

>>  void unmap_kernel_page(unsigned long va)
>
> Thanks for having a look into the issue! Your patch applies but I got a build 
> failure:
>
>  # make
>   CALLscripts/checksyscalls.sh
>   CC  arch/powerpc/mm/pgtable.o
> In file included from ./include/linux/mm.h:29,
>  from arch/powerpc/mm/pgtable.c:22:
> ./include/linux/pgtable.h:247:71: error: expected declaration specifiers or 
> '...' before numeric constant
>   247 | #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 
> 1)
>   |   
> ^
> arch/powerpc/mm/pgtable.c:190:13: note: in expansion of macro 'set_pte_at'
>   190 | static void set_pte_at(struct mm_struct *mm, unsigned long addr, 
> pte_t *ptep,
>   | ^~

Sorry that I shared a change without build testing.  Here is the updated change

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3ba9fe411604..e563e13ffd88 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -190,29 +190,28 @@ static pte_t set_access_flags_filter(pte_t pte, struct 
vm_area_struct *vma,
 void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
pte_t pte, unsigned int nr)
 {
-   /*
-* Make sure hardware valid bit is not set. We don't do
-* tlb flush for this update.
-*/
-   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
-
-   /* Note: mm->context.id might not yet have been assigned as
-* this context might not have been activated yet when this
-* is called.
-*/
-   pte = set_pte_filter(pte);
-
/* Perform the setting of the PTE */
-   arch_enter_lazy_mmu_mode();
for (;;) {
+
+   /*
+* Make sure hardware valid bit is not set. We don't do
+* tlb flush for this update.
+*/
+   VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep));
+
+   /* Note: mm->context.id might not yet have been assigned as
+* this context might not have been activated yet when this
+* is called.
+*/
+   pte = set_pte_filter(pte);
+
+   /* Perform the setting of the PTE */
__set_pte_at(mm, addr, ptep, pte, 0);
if (--nr == 0)
break;
ptep++;
-   pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
addr += PAGE_SIZE;
}
-   arch_leave_lazy_mmu_mode();
 }
 
 void unmap_kernel_page(unsigned long va)


Re: [Bisected] PowerMac G5 fails booting kernel 6.6-rc3 (BUG: Unable to handle kernel data access at 0xfeffbb62ffec65fe)

2023-10-05 Thread Aneesh Kumar K.V


Hi,

Erhard Furtner  writes:

> Greetings!
>
> Kernel 6.5.5 boots fine on my PowerMac G5 11,2 but kernel 6.6-rc3 fails to 
> boot with following dmesg shown on the OpenFirmware console (transcribed 
> screenshot):

> I bisected the issue and got 9fee28baa601f4dbf869b1373183b312d2d5ef3d as 1st 
> bad commit:
>

Can you check this change?

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 3ba9fe411604..6d144fedd557 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -187,8 +187,8 @@ static pte_t set_access_flags_filter(pte_t pte, struct 
vm_area_struct *vma,
 /*
  * set_pte stores a linux PTE into the linux page table.
  */
-void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
-   pte_t pte, unsigned int nr)
+static void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+   pte_t pte)
 {
/*
 * Make sure hardware valid bit is not set. We don't do
@@ -203,16 +203,23 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
pte = set_pte_filter(pte);
 
/* Perform the setting of the PTE */
-   arch_enter_lazy_mmu_mode();
+   __set_pte_at(mm, addr, ptep, pte, 0);
+}
+
+/*
+ * set_pte stores a linux PTE into the linux page table.
+ */
+void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
+   pte_t pte, unsigned int nr)
+{
+   /* Perform the setting of the PTE */
for (;;) {
-   __set_pte_at(mm, addr, ptep, pte, 0);
+   set_pte_at(mm, addr, ptep, pte);
if (--nr == 0)
break;
ptep++;
-   pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT));
addr += PAGE_SIZE;
}
-   arch_leave_lazy_mmu_mode();
 }
 
 void unmap_kernel_page(unsigned long va)


Re: [PATCH v2 1/2] powerpc: add `cur_cpu_spec` symbol to vmcoreinfo

2023-09-21 Thread Aneesh Kumar K.V
Aditya Gupta  writes:

> On Wed, Sep 20, 2023 at 05:45:36PM +0530, Aneesh Kumar K.V wrote:
>> Aditya Gupta  writes:
>> 
>> > Since below commit, address mapping for vmemmap has changed for Radix
>> > MMU, where address mapping is stored in kernel page table itself,
>> > instead of earlier used 'vmemmap_list'.
>> >
>> > commit 368a0590d954 ("powerpc/book3s64/vmemmap: switch radix to use
>> > a different vmemmap handling function")
>> >
>> > Hence with upstream kernel, in case of Radix MMU, makedumpfile fails to do
>> > address translation for vmemmap addresses, as it depended on vmemmap_list,
>> > which can now be empty.
>> >
>> > While fixing the address translation in makedumpfile, it was identified
>> > that currently makedumpfile cannot distinguish between Hash MMU and
>> > Radix MMU, unless VMLINUX is passed with -x flag to makedumpfile.
>> > And hence fails to assign offsets and shifts correctly (such as in L4 to
>> > PGDIR offset calculation in makedumpfile).
>> >
>> > For getting the MMU, makedumpfile uses `cur_cpu_spec.mmu_features`.
>> >
>> > Add `cur_cpu_spec` symbol and offset of `mmu_features` in the
>> > `cpu_spec` struct, to VMCOREINFO, so that makedumpfile can assign the
>> > offsets correctly, without needing a VMLINUX.
>> >
>> > Fixes: 368a0590d954 ("powerpc/book3s64/vmemmap: switch radix to use a 
>> > different vmemmap handling function")
>> > Reported-by: Sachin Sant 
>> > Tested-by: Sachin Sant 
>> > Signed-off-by: Aditya Gupta 
>> >
>> > ---
>> > Corresponding makedumpfile patches to fix address translation, in Radix
>> > MMU case:
>> >
>> > Link: 
>> > https://lore.kernel.org/kexec/b5f0f00e-f2b1-47d7-a143-5683d10dc...@linux.ibm.com/T/#t
>> > ---
>> > ---
>> >  arch/powerpc/kexec/core.c | 2 ++
>> >  1 file changed, 2 insertions(+)
>> >
>> > diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
>> > index de64c7962991..369b8334a4f0 100644
>> > --- a/arch/powerpc/kexec/core.c
>> > +++ b/arch/powerpc/kexec/core.c
>> > @@ -63,6 +63,8 @@ void arch_crash_save_vmcoreinfo(void)
>> >  #ifndef CONFIG_NUMA
>> >VMCOREINFO_SYMBOL(contig_page_data);
>> >  #endif
>> > +  VMCOREINFO_SYMBOL(cur_cpu_spec);
>> > +  VMCOREINFO_OFFSET(cpu_spec, mmu_features);
>> >  #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
>> >VMCOREINFO_SYMBOL(vmemmap_list);
>> >VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
>> >
>> 
>> That implies we now have to be careful when updating the MMU_FTR_* #defines.
>> It is not bad considering the other hacks we do in crash to identify kernel
>> changes tied to version numbers. But I am wondering if there is another way
>> to identify radix vs hash?
>> 
>
> I could not find another way to get any other flag for RADIX vs HASH in
> makedumpfile. And currently I don't know of any other way.
>
> Both makedumpfile and crash look for '0x40' flag set in
> 'cur_cpu_spec.mmu_features', so only requirement for 'MMU_FTR_TYPE_RADIX' is 
> to
> be '0x40', or we will need to change the value accordingly in the tools.
>

Instead of exporting cur_cpu_spec.mmu_features, you could add a
coreinfo_mmu_features variable that does

	if (radix_enabled())
		coreinfo_mmu_features = VMCORE_INFO_RADIX_TRANSLATION;
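
A minimal sketch of that idea (the variable name is hypothetical; only
VMCOREINFO_NUMBER() and radix_enabled() are existing helpers):

	/* sketch: export the translation mode rather than raw MMU_FTR_* bits */
	static unsigned long radix_translation;

	void arch_crash_save_vmcoreinfo(void)
	{
		radix_translation = radix_enabled() ? 1 : 0;
		VMCOREINFO_NUMBER(radix_translation);
		/* ... keep the existing VMCOREINFO_SYMBOL()/OFFSET() exports ... */
	}

makedumpfile could then key off the NUMBER(radix_translation) entry in
the vmcoreinfo note instead of decoding mmu_features.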

-aneesh


Re: [PATCH v2 1/2] powerpc: add `cur_cpu_spec` symbol to vmcoreinfo

2023-09-20 Thread Aneesh Kumar K.V
Aditya Gupta  writes:

> Since below commit, address mapping for vmemmap has changed for Radix
> MMU, where address mapping is stored in kernel page table itself,
> instead of earlier used 'vmemmap_list'.
>
> commit 368a0590d954 ("powerpc/book3s64/vmemmap: switch radix to use
> a different vmemmap handling function")
>
> Hence with upstream kernel, in case of Radix MMU, makedumpfile fails to do
> address translation for vmemmap addresses, as it depended on vmemmap_list,
> which can now be empty.
>
> While fixing the address translation in makedumpfile, it was identified
> that currently makedumpfile cannot distinguish between Hash MMU and
> Radix MMU, unless VMLINUX is passed with -x flag to makedumpfile.
> And hence fails to assign offsets and shifts correctly (such as in L4 to
> PGDIR offset calculation in makedumpfile).
>
> For getting the MMU, makedumpfile uses `cur_cpu_spec.mmu_features`.
>
> Add `cur_cpu_spec` symbol and offset of `mmu_features` in the
> `cpu_spec` struct, to VMCOREINFO, so that makedumpfile can assign the
> offsets correctly, without needing a VMLINUX.
>
> Fixes: 368a0590d954 ("powerpc/book3s64/vmemmap: switch radix to use a 
> different vmemmap handling function")
> Reported-by: Sachin Sant 
> Tested-by: Sachin Sant 
> Signed-off-by: Aditya Gupta 
>
> ---
> Corresponding makedumpfile patches to fix address translation, in Radix
> MMU case:
>
> Link: 
> https://lore.kernel.org/kexec/b5f0f00e-f2b1-47d7-a143-5683d10dc...@linux.ibm.com/T/#t
> ---
> ---
>  arch/powerpc/kexec/core.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
> index de64c7962991..369b8334a4f0 100644
> --- a/arch/powerpc/kexec/core.c
> +++ b/arch/powerpc/kexec/core.c
> @@ -63,6 +63,8 @@ void arch_crash_save_vmcoreinfo(void)
>  #ifndef CONFIG_NUMA
>   VMCOREINFO_SYMBOL(contig_page_data);
>  #endif
> + VMCOREINFO_SYMBOL(cur_cpu_spec);
> + VMCOREINFO_OFFSET(cpu_spec, mmu_features);
>  #if defined(CONFIG_PPC64) && defined(CONFIG_SPARSEMEM_VMEMMAP)
>   VMCOREINFO_SYMBOL(vmemmap_list);
>   VMCOREINFO_SYMBOL(mmu_vmemmap_psize);
>

That implies we now have to be careful when updating the MMU_FTR_* #defines.
It is not bad considering the other hacks we do in crash to identify kernel
changes tied to version numbers. But I am wondering if there is another way
to identify radix vs hash?

-aneesh


[PATCH 2/2] powerpc/mm/book3s64: Use 256M as the upper limit with coherent device memory attached

2023-08-28 Thread Aneesh Kumar K.V
commit 4d15721177d5 ("powerpc/mm: Cleanup memory block size probing")
used 256MB as the memory block size when the
ibm,coherent-device-memory device tree node is present. Instead of
returning early with a 256MB memory block size, continue to check the rest
of the memory regions and make sure we can still map them using a 256MB
memory block size.

Fixes: 4d15721177d5 ("powerpc/mm: Cleanup memory block size probing")
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index e3d7379ef480..a8557867ece0 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -569,8 +569,12 @@ static int __init probe_memory_block_size(unsigned long 
node, const char *uname,
 */
compatible = of_get_flat_dt_prop(node, "compatible", NULL);
if (compatible && !strcmp(compatible, 
"ibm,coherent-device-memory")) {
-   *block_size = SZ_256M;
-   return 1;
+   if (*block_size > SZ_256M)
+   *block_size = SZ_256M;
+   /*
+* We keep 256M as the upper limit with GPU present.
+*/
+   return 0;
}
}
/* continue looking for other memory device types */
-- 
2.41.0



[PATCH 1/2] powerpc/mm/book3s64: Fix build error with SPARSEMEM disabled

2023-08-28 Thread Aneesh Kumar K.V
With CONFIG_SPARSEMEM disabled the below kernel build error is observed.

 arch/powerpc/mm/init_64.c:477:38: error: use of undeclared identifier 
'SECTION_SIZE_BITS'

CONFIG_MEMORY_HOTPLUG depends on CONFIG_SPARSEMEM, and it is clearer
to describe the code dependency in terms of MEMORY_HOTPLUG. Outside
memory hotplug the kernel uses memory_block_size for the kernel direct map.
Instead of depending on SECTION_SIZE_BITS to compute the direct map
page size, add a new #define which defaults to 16M (the same as the
existing SECTION_SIZE).

Fixes: 4d15721177d5 ("powerpc/mm: Cleanup memory block size probing")
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c | 19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index fcda46c2b8df..e3d7379ef480 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -472,12 +472,23 @@ static int __init dt_scan_mmu_pid_width(unsigned long 
node,
return 1;
 }
 
+/*
+ * Outside hotplug the kernel uses this value to map the kernel direct map
+ * with radix. To be compatible with older kernels, let's keep this value
+ * as 16M which is also SECTION_SIZE with SPARSEMEM. We can ideally map
+ * things with 1GB size in the case where we don't support hotplug.
+ */
+#ifndef CONFIG_MEMORY_HOTPLUG
+#define DEFAULT_MEMORY_BLOCK_SIZE  SZ_16M
+#else
+#define DEFAULT_MEMORY_BLOCK_SIZE  MIN_MEMORY_BLOCK_SIZE
+#endif
+
 static void update_memory_block_size(unsigned long *block_size, unsigned long 
mem_size)
 {
-   unsigned long section_size = 1UL << SECTION_SIZE_BITS;
-
-   for (; *block_size > section_size; *block_size >>= 2) {
+   unsigned long min_memory_block_size = DEFAULT_MEMORY_BLOCK_SIZE;
 
+   for (; *block_size > min_memory_block_size; *block_size >>= 2) {
if ((mem_size & *block_size) == 0)
break;
}
@@ -507,7 +518,7 @@ static int __init probe_memory_block_size(unsigned long 
node, const char *uname,
/*
 * Nothing in the device tree
 */
-   *block_size = MIN_MEMORY_BLOCK_SIZE;
+   *block_size = DEFAULT_MEMORY_BLOCK_SIZE;
else
*block_size = of_read_number(prop, dt_root_size_cells);
/*
-- 
2.41.0



[PATCH v8 5/6] powerpc/book3s64/memhotplug: Enable memmap on memory for radix

2023-08-08 Thread Aneesh Kumar K.V
Radix vmemmap mapping can map things correctly at the PMD level or PTE
level based on different device boundary checks. Hence we skip the
restriction that the vmemmap size be a multiple of PMD_SIZE. This also
makes the feature widely useful, because using a PMD_SIZE vmemmap area
requires a memory block size of 2GiB.

We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature
can work with a memory block size of 256MB, using the altmap.reserve
feature to align things correctly at pageblock granularity. We can end up
losing some pages in memory with this. For example, with a 256MiB memory
block size, we require 4 pages to map the vmemmap pages; in order to align
things correctly we end up adding a reserve of 28 pages, i.e. for every
4096 pages 28 pages get reserved.

Reviewed-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/pgtable.h| 21 +++
 .../platforms/pseries/hotplug-memory.c|  2 +-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d0497d13f5b4..938294c996dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index a4893b17705a..33464e6d6431 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -161,6 +161,27 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
 int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
 bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
   unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details
+ * some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fallback correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+   if (!radix_enabled())
+   return false;
+   /*
+* With 4K page size and 2M PMD_SIZE, we can align
+* things better with memory block size value
+* starting from 128MB. Hence align things with PMD_SIZE.
+*/
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+   return true;
+}
+
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..4f3d6a2f9065 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
-- 
2.41.0



[PATCH v8 6/6] mm/memory_hotplug: Embed vmem_altmap details in memory block

2023-08-08 Thread Aneesh Kumar K.V
With memmap on memory, some architectures need more details w.r.t. the
altmap, such as base_pfn, end_pfn, etc., to unmap vmemmap memory. Instead of
computing them again when we remove a memory block, embed the vmem_altmap
details in struct memory_block if we are using the memmap on memory
feature.

Acked-by: Michal Hocko 
Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/base/memory.c  | 27 +
 include/linux/memory.h |  8 ++-
 mm/memory_hotplug.c| 54 ++
 3 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..8191709c9ad2 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -105,7 +105,8 @@ EXPORT_SYMBOL(unregister_memory_notifier);
 static void memory_block_release(struct device *dev)
 {
struct memory_block *mem = to_memory_block(dev);
-
+   /* Verify that the altmap is freed */
+   WARN_ON(mem->altmap);
kfree(mem);
 }
 
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
 
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
 * stage helps to keep accounting easier to follow - e.g vmemmaps
 * belong to the same zone as the memory they backed.
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, 
zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
int ret;
 
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
 * Unaccount before offlining, such that unpopulated zone and kthreads
 * can properly be torn down in offline_pages().
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
struct memory_block *mem;
@@ -744,7 +751,7 @@ static int add_memory_block(unsigned long block_id, 
unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
-   mem->nr_vmemmap_pages = nr_vmemmap_pages;
+   mem->altmap = altmap;
+   INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +790,14 @@ static int __init add_boot_memory_block(unsigned long 
base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
-   MEM_ONLINE, 0,  NULL);
+   MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
-   return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+   return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +825,7 @@ static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-   unsigned long vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +839,7 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size,
return -EINVAL;
 
for (block_id = start_block_id; block_id != end_block_id; block_id++) 

[PATCH v8 4/6] mm/memory_hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

2023-08-08 Thread Aneesh Kumar K.V
Currently, the memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full page blocks. This is
because the memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch helps to lift that restriction by reserving more pages than
required for vmemmap space. This helps the start address to be
pageblock-aligned with different memory block sizes. Using this facility
implies the kernel will be reserving some pages for every memory block.
This allows the memmap on memory feature to be widely useful with
different memory block size values.

For example, with a 64K page size and a 256MiB memory block size, we require
4 pages to map the vmemmap pages; to align things correctly we end up adding
a reserve of 28 pages, i.e. for every 4096 pages 28 pages get reserved.
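
For reference, the arithmetic behind those numbers (assuming the usual
64-byte struct page and the 32-page pageblock these figures imply): a
256MiB block contains 256MiB / 64KiB = 4096 pages, whose memmap needs
4096 * 64 bytes = 256KiB, i.e. 4 vmemmap pages of 64KiB each; rounding 4
up to the 32-page pageblock boundary gives 32, so 32 - 4 = 28 pages end
up reserved.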

Acked-by: Michal Hocko 
Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 .../admin-guide/mm/memory-hotplug.rst |  12 ++
 mm/memory_hotplug.c   | 120 +++---
 2 files changed, 113 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst 
b/Documentation/admin-guide/mm/memory-hotplug.rst
index bd77841041af..2994958c7ce8 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -433,6 +433,18 @@ The following module parameters are currently defined:
 memory in a way that huge pages in bigger
 granularity cannot be formed on hotplugged
 memory.
+
+With value "force" it could result in memory
+wastage due to memmap size limitations. For
+example, if the memmap for a memory block
+requires 1 MiB, but the pageblock size is 2
+MiB, 1 MiB of hotplugged memory will be wasted.
+Note that there are still cases where the
+feature cannot be enforced: for example, if the
+memmap is smaller than a single page, or if the
+architecture does not support the forced mode
+in all configurations.
+
 ``online_policy``   read-write: Set the basic policy used for
 automatic zone selection when onlining memory
 blocks without specifying a target zone.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 746cb7c08c64..76b813991bdc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,83 @@
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+   MEMMAP_ON_MEMORY_DISABLE = 0,
+   MEMMAP_ON_MEMORY_ENABLE,
+   MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+   return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+   unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+   /*
+* In "forced" memmap_on_memory mode, we add extra pages to align the
+* vmemmap size to cover full pageblocks. That way, we can add memory
+* even if the vmemmap size is not properly aligned, however, we might 
waste
+* memory.
+*/
+   if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+   return pageblock_align(nr_pages);
+   return nr_pages;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+   int ret, mode;
+   bool enabled;
+
+   if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+   mode = MEMMAP_ON_MEMORY_FORCE;
+   } else {
+   ret = kstrtobool(val, &enabled);
+   if (ret < 0)
+   return ret;
+   if (enabled)
+   mode = MEMMAP_ON_MEMORY_ENABLE;
+   else
+   mode = MEMMAP_ON_MEMORY_DISABLE;
+   }
+   *((int *)kp->arg) = mode;
+   if (mode == MEMMAP_ON_MEMORY_FORCE) {
+   unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
+
+   pr_info_once("Memory hotplug will waste %ld pages in each 
memory block\n",
+memmap_pages - PFN_UP(memory_block_memmap_size

[PATCH v8 3/6] mm/memory_hotplug: Allow architecture to override memmap on memory support check

2023-08-08 Thread Aneesh Kumar K.V
Some architectures would want different restrictions. Hence add an
architecture-specific override.

The PMD_SIZE check is moved there.

Acked-by: Michal Hocko 
Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index eca32ccd45cc..746cb7c08c64 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,10 +1247,26 @@ static int online_memory_block(struct memory_block 
*mem, void *arg)
 return device_online(&mem->dev);
 }
 
+static inline unsigned long memory_block_memmap_size(void)
+{
+   return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+   /*
+* As default, we want the vmemmap to span a complete PMD such that we
+* can map the vmemmap using a single PMD if supported by the
+* architecture.
+*/
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
-   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+   unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long remaining_size = size - vmemmap_size;
 
/*
@@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 */
return mhp_memmap_on_memory() &&
   size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
+  arch_supports_memmap_on_memory(vmemmap_size);
 }
 
 /*
-- 
2.41.0



[PATCH v8 2/6] mm/memory_hotplug: Allow memmap on memory hotplug request to fallback

2023-08-08 Thread Aneesh Kumar K.V
If not supported, fall back to not using memmap on memory. This avoids
the need for callers to do the fallback.

Acked-by: Michal Hocko 
Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/acpi/acpi_memhotplug.c |  3 +--
 include/linux/memory_hotplug.h |  3 ++-
 mm/memory_hotplug.c| 13 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..d0c1a71007d0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (!info->length)
continue;
 
-   if (mhp_supports_memmap_on_memory(info->length))
-   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
  mhp_flags);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
  * To do so, we will use the beginning of the hot-added range to build
  * the page tables for the memmap array that describes the entire range.
  * Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on
+ * different alignment checks.
  */
 #define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
 /*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int 
nid,
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
  struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7cfd13c91568..eca32ccd45cc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
 return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
@@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-   if (!mhp_supports_memmap_on_memory(size)) {
-   ret = -EINVAL;
-   goto error;
+   if (mhp_supports_memmap_on_memory(size)) {
+   mhp_altmap.free = PHYS_PFN(size);
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   params.altmap = &mhp_altmap;
}
-   mhp_altmap.free = PHYS_PFN(size);
-   mhp_altmap.base_pfn = PHYS_PFN(start);
-   params.altmap = &mhp_altmap;
+   /* fallback to not using altmap  */
}
 
/* call arch's memory hotadd */
-- 
2.41.0



[PATCH v8 1/6] mm/memory_hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig

2023-08-08 Thread Aneesh Kumar K.V
Instead of adding a menu entry with all supported architectures, add an
mm/Kconfig variable and select it from the supported architectures.

No functional change in this patch.

Acked-by: Michal Hocko 
Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/arm64/Kconfig | 4 +---
 arch/x86/Kconfig   | 4 +---
 mm/Kconfig | 3 +++
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1573257a4d6..0f749cfab8e6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_GNU_PROPERTY
select ARCH_USE_MEMTEST
@@ -347,9 +348,6 @@ config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 78224aa76409..d0258e92a8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/mm/Kconfig b/mm/Kconfig
index 5fe49c030961..721dc88423c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY
 
 endif # MEMORY_HOTPLUG
 
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+   bool
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
-- 
2.41.0



[PATCH v8 0/6] Add support for memmap on memory feature on ppc64

2023-08-08 Thread Aneesh Kumar K.V
This patch series updates the memmap on memory feature to fall back to
memmap allocation outside the memory block if the alignment rules are
not met. This makes the feature more useful on architectures like
ppc64, where the alignment rules are different with a 64K page size.

This patch series is dependent on dax vmemmap optimization series
posted here
https://lore.kernel.org/linux-mm/20230718022934.90447-1-aneesh.ku...@linux.ibm.com/

Changes from v7:
* Drop patch 7 because we are still discussing a runtime update of this
feature is useful.

Changes from v6:
* Update comments in the code
* Update commit message for patch 7

Changes from v5:
* Update commit message
* Move memory alloc/free to the callers in patch 6
* Address review feedback w.r.t patch 4

Changes from v4:
* Use altmap.free instead of altmap.reserve
* Address review feedback

Changes from v3:
* Extend the module parameter memmap_on_memory to force allocation even
  though we can waste hotplug memory.

Changes from v2:
* Rebase to latest linus tree
* Redo the series based on review feedback. Multiple changes to the patchset.

Changes from v1:
* update the memblock to store vmemmap_altmap details. This is required
so that when we remove the memory we can find the altmap details which
is needed on some architectures.
* rebase to latest linus tree


Aneesh Kumar K.V (6):
  mm/memory_hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig
  mm/memory_hotplug: Allow memmap on memory hotplug request to fallback
  mm/memory_hotplug: Allow architecture to override memmap on memory
support check
  mm/memory_hotplug: Support memmap_on_memory when memmap is not aligned
to pageblocks
  powerpc/book3s64/memhotplug: Enable memmap on memory for radix
  mm/memory_hotplug: Embed vmem_altmap details in memory block

 .../admin-guide/mm/memory-hotplug.rst |  12 ++
 arch/arm64/Kconfig|   4 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/pgtable.h|  21 ++
 .../platforms/pseries/hotplug-memory.c|   2 +-
 arch/x86/Kconfig  |   4 +-
 drivers/acpi/acpi_memhotplug.c|   3 +-
 drivers/base/memory.c |  27 ++-
 include/linux/memory.h|   8 +-
 include/linux/memory_hotplug.h|   3 +-
 mm/Kconfig|   3 +
 mm/memory_hotplug.c   | 185 ++
 12 files changed, 209 insertions(+), 64 deletions(-)

-- 
2.41.0



[PATCH v4 2/2] powerpc/mm: Add memory_block_size as a kernel parameter

2023-07-31 Thread Aneesh Kumar K.V
Certain devices can have non-standard memory capacities that are not
constrained to multiples of 1GB. Provide a kernel parameter so that we can
map the device memory completely on memory hotplug.

Restrict the memory_block_size value to a power of 2, similar to the LMB
size. The memory block size should also be larger than the section size.
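
For example (illustrative values, derived from the parsing code added
below): booting with memory_block_size=256M selects a 256MiB block size; a
non-power-of-2 value such as 48M is rounded down to 32M; and a value smaller
than the section size is ignored, so the device-tree-probed default stays in
effect.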

Reviewed-by: Reza Arbab 
Signed-off-by: Aneesh Kumar K.V 
---
 .../admin-guide/kernel-parameters.txt |  3 +++
 arch/powerpc/kernel/setup_64.c| 23 +++
 arch/powerpc/mm/init_64.c | 17 ++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index a1457995fd41..4e49696e0976 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3201,6 +3201,9 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
 
+   memory_block_size=size [PPC]
+Use this parameter to configure the memory block size 
value.
+
memtest=[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
	Format: <integer>
	default : 0 <disable>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 246201d0d879..cbdb924462c7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -892,6 +892,29 @@ unsigned long memory_block_size_bytes(void)
 
return MIN_MEMORY_BLOCK_SIZE;
 }
+
+/*
+ * Restrict to a power of 2 value for memblock which is larger than
+ * section size
+ */
+static int __init parse_mem_block_size(char *ptr)
+{
+   unsigned int order;
+   unsigned long size = memparse(ptr, NULL);
+
+   order = fls64(size);
+   if (!order)
+   return 0;
+
+   order--;
+   if (order < SECTION_SIZE_BITS)
+   return 0;
+
+   memory_block_size = 1UL << order;
+
+   return 0;
+}
+early_param("memory_block_size", parse_mem_block_size);
 #endif
 
 #if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index d74d4a441616..fcda46c2b8df 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -566,13 +566,20 @@ static int __init probe_memory_block_size(unsigned long 
node, const char *uname,
return 0;
 }
 
-/*
- * start with 1G memory block size. Early init will
- * fix this with correct value.
- */
-unsigned long memory_block_size __ro_after_init = 1UL << 30;
+unsigned long memory_block_size __ro_after_init;
 static void __init early_init_memory_block_size(void)
 {
+   /*
+* if it is set via early param just return.
+*/
+   if (memory_block_size)
+   return;
+
+   /*
+* start with 1G memory block size. update_memory_block_size()
+* will derive the right value based on device tree details.
+*/
+   memory_block_size = SZ_1G;
/*
 * We need to do memory_block_size probe early so that
 * radix__early_init_mmu() can use this as limit for
-- 
2.41.0



[PATCH v4 1/2] powerpc/mm: Cleanup memory block size probing

2023-07-31 Thread Aneesh Kumar K.V
Parse the device tree in early init to find the memory block size to be
used by the kernel. Consolidate the memory block size device tree parsing
into one helper and use that on both powernv and pseries. We still want to
use a machine-specific callback because on all machine types other than
powernv and pseries we continue to return MIN_MEMORY_BLOCK_SIZE.

pseries_memory_block_size used to look for the second memory
block (memory@x) to determine the memory_block_size value. This patch
changes that to look at all memory blocks and makes sure we can map them all
correctly using the computed memory block size value.

Add a workaround to force a 256MB memory block size if device-driver-managed
memory such as GPU memory is present. This helps to add GPU memory
that is not aligned to 1G.
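
As a rough kernel-style sketch of the idea only (this is not the helper
added by the patch; the function name and the shrink-until-aligned loop are
illustrative, assuming <linux/memblock.h> and <linux/memory.h>): walk every
memblock region and reduce the candidate block size until each region can be
mapped with it.

static unsigned long __init compute_block_size_sketch(unsigned long lmb_size)
{
	/* start from the device-tree LMB size, e.g. "ibm,lmb-size" */
	unsigned long block_size = lmb_size;
	phys_addr_t start, end;
	u64 i;

	for_each_mem_range(i, &start, &end) {
		/* halving keeps the value a power of 2 */
		while (block_size > MIN_MEMORY_BLOCK_SIZE &&
		       (!IS_ALIGNED(start, block_size) ||
			!IS_ALIGNED(end, block_size)))
			block_size >>= 1;
	}
	return block_size;
}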

Co-developed-by: Reza Arbab 
Signed-off-by: Reza Arbab 
Signed-off-by: Aneesh Kumar K.V 
---
Changes from v3:
* Update workaround for GPU memory

 arch/powerpc/include/asm/book3s/64/mmu.h  |   5 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  65 +-
 arch/powerpc/mm/init_64.c | 112 ++
 arch/powerpc/platforms/powernv/setup.c|  10 +-
 .../platforms/pseries/hotplug-memory.c|  60 +-
 arch/powerpc/platforms/pseries/pseries.h  |   2 -
 arch/powerpc/platforms/pseries/setup.c|   7 ++
 7 files changed, 126 insertions(+), 135 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 570a4960cf17..28033fd5403c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -71,10 +71,7 @@ extern unsigned int mmu_pid_bits;
 /* Base PID to allocate from */
 extern unsigned int mmu_base_pid;
 
-/*
- * memory block size used with radix translation.
- */
-extern unsigned long __ro_after_init radix_mem_block_size;
+extern unsigned long __ro_after_init memory_block_size;
 
 #define PRTB_SIZE_SHIFT(mmu_pid_bits + 4)
 #define PRTB_ENTRIES   (1ul << mmu_pid_bits)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e7ea492ac510..b5102491b50f 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -37,7 +37,6 @@
 #include 
 
 unsigned int mmu_base_pid;
-unsigned long radix_mem_block_size __ro_after_init;
 
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
@@ -300,7 +299,7 @@ static int __meminit create_physical_mapping(unsigned long 
start,
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
-   unsigned long max_mapping_size = radix_mem_block_size;
+   unsigned long max_mapping_size = memory_block_size;
 
if (debug_pagealloc_enabled_or_kfence())
max_mapping_size = PAGE_SIZE;
@@ -502,58 +501,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long 
node,
return 1;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __init probe_memory_block_size(unsigned long node, const char 
*uname, int
- depth, void *data)
-{
-   unsigned long *mem_block_size = (unsigned long *)data;
-   const __be32 *prop;
-   int len;
-
-   if (depth != 1)
-   return 0;
-
-   if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
-   return 0;
-
-   prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-
-   if (!prop || len < dt_root_size_cells * sizeof(__be32))
-   /*
-* Nothing in the device tree
-*/
-   *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-   else
-   *mem_block_size = of_read_number(prop, dt_root_size_cells);
-   return 1;
-}
-
-static unsigned long __init radix_memory_block_size(void)
-{
-   unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-
-   /*
-* OPAL firmware feature is set by now. Hence we are ok
-* to test OPAL feature.
-*/
-   if (firmware_has_feature(FW_FEATURE_OPAL))
-   mem_block_size = 1UL * 1024 * 1024 * 1024;
-   else
-   of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-
-   return mem_block_size;
-}
-
-#else   /* CONFIG_MEMORY_HOTPLUG */
-
-static unsigned long __init radix_memory_block_size(void)
-{
-   return 1UL * 1024 * 1024 * 1024;
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-
 void __init radix__early_init_devtree(void)
 {
int rc;
@@ -577,16 +524,6 @@ void __init radix__early_init_devtree(void)
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_64K);
}
-
-   /*
-* Max mapping size used when mapping pages. We don't use
-* ppc_md.memory_block_size() here because this get called
-* early and we don't have m

[PATCH v7 6/7] mm/memory_hotplug: Embed vmem_altmap details in memory block

2023-07-31 Thread Aneesh Kumar K.V
With memmap on memory, some architectures need more details w.r.t. the
altmap, such as base_pfn, end_pfn, etc., to unmap the vmemmap memory.
Instead of computing them again when we remove a memory block, embed the
vmem_altmap details in struct memory_block if we are using the memmap on
memory feature.
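
A minimal sketch of the resulting shape (assumed from the diffstat's
include/linux/memory.h change and the mem->altmap uses in the hunks below;
the field layout here is illustrative, not the full struct):

struct vmem_altmap;			/* from linux/memremap.h */

struct memory_block_sketch {		/* stand-in for struct memory_block */
	unsigned long start_section_nr;
	/* was: unsigned long nr_vmemmap_pages; */
	struct vmem_altmap *altmap;	/* non-NULL only when the memmap lives on this block */
	/* other fields unchanged */
};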

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/base/memory.c  | 27 +
 include/linux/memory.h |  8 ++
 mm/memory_hotplug.c| 55 ++
 3 files changed, 53 insertions(+), 37 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..8191709c9ad2 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -105,7 +105,8 @@ EXPORT_SYMBOL(unregister_memory_notifier);
 static void memory_block_release(struct device *dev)
 {
struct memory_block *mem = to_memory_block(dev);
-
+   /* Verify that the altmap is freed */
+   WARN_ON(mem->altmap);
kfree(mem);
 }
 
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
 
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
 * stage helps to keep accounting easier to follow - e.g vmemmaps
 * belong to the same zone as the memory they backed.
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, 
zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
int ret;
 
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
 * Unaccount before offlining, such that unpopulated zone and kthreads
 * can properly be torn down in offline_pages().
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
struct memory_block *mem;
@@ -744,7 +751,7 @@ static int add_memory_block(unsigned long block_id, 
unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
-   mem->nr_vmemmap_pages = nr_vmemmap_pages;
+   mem->altmap = altmap;
+   INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +790,14 @@ static int __init add_boot_memory_block(unsigned long 
base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
-   MEM_ONLINE, 0,  NULL);
+   MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
-   return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+   return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +825,7 @@ static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-   unsigned long vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +839,7 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size,
return -EINVAL;
 
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-   ret 

[PATCH v7 7/7] mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

2023-07-31 Thread Aneesh Kumar K.V
Allow updating the memmap_on_memory mode after kernel boot. Memory
hotplug done after the mode update will use the new memmap_on_memory
value.
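
For example (sysfs path assumed from the built-in module_param_cb() below
becoming mode 0644), an administrator could run
echo force > /sys/module/memory_hotplug/parameters/memmap_on_memory
after boot; blocks hotplugged after the write use the new mode, while blocks
that were already added keep the mode they were added with.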

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 33 +
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1ce8ad04a980..d282664f558e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -89,7 +89,10 @@ static int set_memmap_mode(const char *val, const struct 
kernel_param *kp)
else
mode = MEMMAP_ON_MEMORY_DISABLE;
}
+   /* Avoid changing memmap mode during hotplug. */
+   get_online_mems();
*((int *)kp->arg) = mode;
+   put_online_mems();
if (mode == MEMMAP_ON_MEMORY_FORCE) {
unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
 
@@ -110,7 +113,7 @@ static const struct kernel_param_ops memmap_mode_ops = {
.set = set_memmap_mode,
.get = get_memmap_mode,
 };
-module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0644);
 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug\n"
 "With value \"force\" it could result in memory wastage due "
 "to memmap size limitations (Y/N/force)");
@@ -2172,22 +2175,20 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
 * the same granularity it was added - a single memory block.
 */
-   if (mhp_memmap_on_memory()) {
-   ret = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
-   if (ret) {
-   if (size != memory_block_size_bytes()) {
-   pr_warn("Refuse to remove %#llx - %#llx,"
-   "wrong granularity\n",
-   start, start + size);
-   return -EINVAL;
-   }
-   altmap = mem->altmap;
-   /*
-* Mark altmap NULL so that we can add a debug
-* check on memblock free.
-*/
-   mem->altmap = NULL;
+   ret = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
+   if (ret) {
+   if (size != memory_block_size_bytes()) {
+   pr_warn("Refuse to remove %#llx - %#llx,"
+   "wrong granularity\n",
+   start, start + size);
+   return -EINVAL;
}
+   altmap = mem->altmap;
+   /*
+* Mark altmap NULL so that we can add a debug
+* check on memblock free.
+*/
+   mem->altmap = NULL;
}
 
/* remove memmap entry */
-- 
2.41.0



[PATCH v7 5/7] powerpc/book3s64/memhotplug: Enable memmap on memory for radix

2023-07-31 Thread Aneesh Kumar K.V
Radix vmemmap mapping can map things correctly at the PMD level or PTE
level based on different device boundary checks. Hence we skip the
restriction that the vmemmap size be a multiple of PMD_SIZE. This also
makes the feature more widely useful, because using a PMD_SIZE vmemmap
area would require a memory block size of 2GiB.

We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature
can work with a memory block size of 256MB, using the altmap.reserve
feature to align things correctly at pageblock granularity. We can end up
losing some pages in memory with this. For example, with a 256MiB memory
block size, we require 4 pages to map the vmemmap pages; in order to align
things correctly we end up adding a reserve of 28 pages, i.e. for every
4096 pages, 28 pages get reserved.
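
(Worked out, assuming a 64-byte sizeof(struct page) and a 2MiB pageblock for
this 64K-page configuration: a 256MiB block is 4096 pages of 64K, whose
memmap needs 4096 * 64 bytes = 256KiB = 4 pages; aligning those 4 pages up to
a 32-page pageblock reserves the remaining 28 pages.)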

Reviewed-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/pgtable.h| 21 +++
 .../platforms/pseries/hotplug-memory.c|  2 +-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d0497d13f5b4..938294c996dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index a4893b17705a..33464e6d6431 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -161,6 +161,27 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
 int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
 bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
   unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details
+ * some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fallback correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+   if (!radix_enabled())
+   return false;
+   /*
+* With 4K page size and 2M PMD_SIZE, we can align
+* things better with memory block size value
+* starting from 128MB. Hence align things with PMD_SIZE.
+*/
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+   return true;
+}
+
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..4f3d6a2f9065 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
-- 
2.41.0



[PATCH v7 4/7] mm/memory_hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

2023-07-31 Thread Aneesh Kumar K.V
Currently, the memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full pageblocks. This is
because the memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch lifts that restriction by reserving more pages than are
required for the vmemmap space, so that the start address can be
pageblock-aligned with different memory block sizes. Using this facility
implies the kernel will reserve some pages for every memory block.
This makes the memmap on memory feature widely useful with
different memory block size values.

For example, with a 64K page size and a 256MiB memory block size, we require
4 pages to map the vmemmap pages; to align things correctly we end up adding
a reserve of 28 pages, i.e. for every 4096 pages, 28 pages get reserved.
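
As a standalone sanity check of that arithmetic (a userspace sketch; the
64-byte sizeof(struct page) and the 2MiB pageblock are assumptions about the
64K-page configuration, not values taken from this patch):

#include <stdio.h>

int main(void)
{
	unsigned long page_size    = 64 * 1024;                /* 64K base page */
	unsigned long block_size   = 256UL * 1024 * 1024;      /* 256MiB memory block */
	unsigned long page_struct  = 64;                       /* assumed sizeof(struct page) */
	unsigned long pageblock    = (2UL * 1024 * 1024) / page_size;  /* 32 pages */

	unsigned long nr_pages     = block_size / page_size;                      /* 4096 */
	unsigned long memmap_bytes = nr_pages * page_struct;                      /* 256KiB */
	unsigned long memmap_pages = (memmap_bytes + page_size - 1) / page_size;  /* 4 */
	unsigned long reserved     = ((memmap_pages + pageblock - 1) / pageblock) * pageblock
				     - memmap_pages;                              /* 28 */

	printf("vmemmap pages: %lu, extra reserved pages: %lu\n", memmap_pages, reserved);
	return 0;
}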

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 .../admin-guide/mm/memory-hotplug.rst |  12 ++
 mm/memory_hotplug.c   | 120 +++---
 2 files changed, 113 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst 
b/Documentation/admin-guide/mm/memory-hotplug.rst
index bd77841041af..2994958c7ce8 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -433,6 +433,18 @@ The following module parameters are currently defined:
 memory in a way that huge pages in bigger
 granularity cannot be formed on hotplugged
 memory.
+
+With value "force" it could result in memory
+wastage due to memmap size limitations. For
+example, if the memmap for a memory block
+requires 1 MiB, but the pageblock size is 2
+MiB, 1 MiB of hotplugged memory will be wasted.
+Note that there are still cases where the
+feature cannot be enforced: for example, if the
+memmap is smaller than a single page, or if the
+architecture does not support the forced mode
+in all configurations.
+
 ``online_policy``   read-write: Set the basic policy used for
 automatic zone selection when onlining memory
 blocks without specifying a target zone.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 746cb7c08c64..76b813991bdc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,83 @@
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+   MEMMAP_ON_MEMORY_DISABLE = 0,
+   MEMMAP_ON_MEMORY_ENABLE,
+   MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+   return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+   unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+   /*
+* In "forced" memmap_on_memory mode, we add extra pages to align the
+* vmemmap size to cover full pageblocks. That way, we can add memory
+* even if the vmemmap size is not properly aligned, however, we might 
waste
+* memory.
+*/
+   if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+   return pageblock_align(nr_pages);
+   return nr_pages;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+   int ret, mode;
+   bool enabled;
+
+   if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+   mode = MEMMAP_ON_MEMORY_FORCE;
+   } else {
+   ret = kstrtobool(val, &enabled);
+   if (ret < 0)
+   return ret;
+   if (enabled)
+   mode = MEMMAP_ON_MEMORY_ENABLE;
+   else
+   mode = MEMMAP_ON_MEMORY_DISABLE;
+   }
+   *((int *)kp->arg) = mode;
+   if (mode == MEMMAP_ON_MEMORY_FORCE) {
+   unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
+
+   pr_info_once("Memory hotplug will waste %ld pages in each 
memory block\n",
+memmap_pages - PFN_UP(memory_block_memmap_size()));
+   }
+   

[PATCH v7 3/7] mm/memory_hotplug: Allow architecture to override memmap on memory support check

2023-07-31 Thread Aneesh Kumar K.V
Some architectures would want different restrictions. Hence add an
architecture-specific override.

The PMD_SIZE check is moved there.
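
An architecture opts in by providing its own helper and defining the macro
of the same name in its headers, which makes the generic #ifndef fallback
below drop out (a minimal sketch of the pattern; the powerpc patch in this
series defines exactly this in asm/pgtable.h):

#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
{
	/* arch-specific policy instead of the generic PMD_SIZE alignment check */
	return true;
}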

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index eca32ccd45cc..746cb7c08c64 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,10 +1247,26 @@ static int online_memory_block(struct memory_block 
*mem, void *arg)
return device_online(&mem->dev);
 }
 
+static inline unsigned long memory_block_memmap_size(void)
+{
+   return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+   /*
+* As default, we want the vmemmap to span a complete PMD such that we
+* can map the vmemmap using a single PMD if supported by the
+* architecture.
+*/
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
-   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+   unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long remaining_size = size - vmemmap_size;
 
/*
@@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 */
return mhp_memmap_on_memory() &&
   size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
+  arch_supports_memmap_on_memory(vmemmap_size);
 }
 
 /*
-- 
2.41.0



[PATCH v7 2/7] mm/memory_hotplug: Allow memmap on memory hotplug request to fallback

2023-07-31 Thread Aneesh Kumar K.V
If not supported, fall back to not using memmap on memory. This avoids
the need for callers to do the fallback. For example, an ACPI hot-add
request that passes MHP_MEMMAP_ON_MEMORY on an unsupported configuration
now simply proceeds without an altmap instead of failing.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/acpi/acpi_memhotplug.c |  3 +--
 include/linux/memory_hotplug.h |  3 ++-
 mm/memory_hotplug.c| 13 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..d0c1a71007d0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (!info->length)
continue;
 
-   if (mhp_supports_memmap_on_memory(info->length))
-   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
  mhp_flags);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
  * To do so, we will use the beginning of the hot-added range to build
  * the page tables for the memmap array that describes the entire range.
  * Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on
+ * different alignment checks.
  */
 #define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
 /*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int 
nid,
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
  struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7cfd13c91568..eca32ccd45cc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
@@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-   if (!mhp_supports_memmap_on_memory(size)) {
-   ret = -EINVAL;
-   goto error;
+   if (mhp_supports_memmap_on_memory(size)) {
+   mhp_altmap.free = PHYS_PFN(size);
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   params.altmap = &mhp_altmap;
}
-   mhp_altmap.free = PHYS_PFN(size);
-   mhp_altmap.base_pfn = PHYS_PFN(start);
-   params.altmap = &mhp_altmap;
+   /* fallback to not using altmap  */
}
 
/* call arch's memory hotadd */
-- 
2.41.0



[PATCH v7 1/7] mm/memory_hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig

2023-07-31 Thread Aneesh Kumar K.V
Instead of adding a menu entry listing all supported architectures, add
an mm/Kconfig variable and select it from the supported architectures.

No functional change in this patch.
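
With this, a new architecture opts in with a single
select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
line in its arch Kconfig, as the arm64 and x86 hunks below show (and as the
powerpc patch in this series does, conditional on PPC_RADIX_MMU).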

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/arm64/Kconfig | 4 +---
 arch/x86/Kconfig   | 4 +---
 mm/Kconfig | 3 +++
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1573257a4d6..0f749cfab8e6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_GNU_PROPERTY
select ARCH_USE_MEMTEST
@@ -347,9 +348,6 @@ config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 78224aa76409..d0258e92a8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/mm/Kconfig b/mm/Kconfig
index 5fe49c030961..721dc88423c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY
 
 endif # MEMORY_HOTPLUG
 
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+   bool
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
-- 
2.41.0



[PATCH v7 0/7] Add support for memmap on memory feature on ppc64

2023-07-31 Thread Aneesh Kumar K.V
This patch series updates the memmap on memory feature to fall back to
memmap allocation outside the memory block if the alignment rules are
not met. This makes the feature more useful on architectures like
ppc64, where the alignment rules differ with a 64K page size.

This patch series is dependent on dax vmemmap optimization series
posted here
https://lore.kernel.org/linux-mm/20230718022934.90447-1-aneesh.ku...@linux.ibm.com/

Changes from v6:
* Update comments in the code
* Update commit message for patch 7

Changes from v5:
* Update commit message
* Move memory alloc/free to the callers in patch 6
* Address review feedback w.r.t patch 4

Changes from v4:
* Use altmap.free instead of altmap.reserve
* Address review feedback

Changes from v3:
* Extend the module parameter memmap_on_memory to force allocation even
  though we can waste hotplug memory.

Changes from v2:
* Rebase to latest linus tree
* Redo the series based on review feedback. Multiple changes to the patchset.

Changes from v1:
* Update the memory block to store vmem_altmap details. This is required
so that when we remove the memory we can find the altmap details, which
are needed on some architectures.
* Rebase to latest linus tree



Aneesh Kumar K.V (7):
  mm/memory_hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig
  mm/memory_hotplug: Allow memmap on memory hotplug request to fallback
  mm/memory_hotplug: Allow architecture to override memmap on memory
support check
  mm/memory_hotplug: Support memmap_on_memory when memmap is not aligned
to pageblocks
  powerpc/book3s64/memhotplug: Enable memmap on memory for radix
  mm/memory_hotplug: Embed vmem_altmap details in memory block
  mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

 .../admin-guide/mm/memory-hotplug.rst |  12 +
 arch/arm64/Kconfig|   4 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/pgtable.h|  21 ++
 .../platforms/pseries/hotplug-memory.c|   2 +-
 arch/x86/Kconfig  |   4 +-
 drivers/acpi/acpi_memhotplug.c|   3 +-
 drivers/base/memory.c |  27 ++-
 include/linux/memory.h|   8 +-
 include/linux/memory_hotplug.h|   3 +-
 mm/Kconfig|   3 +
 mm/memory_hotplug.c   | 205 ++
 12 files changed, 220 insertions(+), 73 deletions(-)

-- 
2.41.0



[PATCH v3 2/2] powerpc/mm: Add memory_block_size as a kernel parameter

2023-07-28 Thread Aneesh Kumar K.V
Certain devices can have non-standard memory capacities that are not
constrained to multiples of 1GB. Provide a kernel parameter so that we can
map the device memory completely on memory hotplug.

Restrict the memory_block_size value to a power of 2, similar to the LMB
size. The memory block size should also be larger than the section size.

Reviewed-by: Reza Arbab 
Signed-off-by: Aneesh Kumar K.V 
---
 .../admin-guide/kernel-parameters.txt |  3 +++
 arch/powerpc/kernel/setup_64.c| 23 +++
 arch/powerpc/mm/init_64.c | 17 ++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index a1457995fd41..4e49696e0976 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3201,6 +3201,9 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
 
+   memory_block_size=size [PPC]
+Use this parameter to configure the memory block size 
value.
+
memtest=[KNL,X86,ARM,M68K,PPC,RISCV] Enable memtest
	Format: <integer>
	default : 0 <disable>
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 246201d0d879..cbdb924462c7 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -892,6 +892,29 @@ unsigned long memory_block_size_bytes(void)
 
return MIN_MEMORY_BLOCK_SIZE;
 }
+
+/*
+ * Restrict to a power of 2 value for memblock which is larger than
+ * section size
+ */
+static int __init parse_mem_block_size(char *ptr)
+{
+   unsigned int order;
+   unsigned long size = memparse(ptr, NULL);
+
+   order = fls64(size);
+   if (!order)
+   return 0;
+
+   order--;
+   if (order < SECTION_SIZE_BITS)
+   return 0;
+
+   memory_block_size = 1UL << order;
+
+   return 0;
+}
+early_param("memory_block_size", parse_mem_block_size);
 #endif
 
 #if defined(CONFIG_PPC_INDIRECT_PIO) || defined(CONFIG_PPC_INDIRECT_MMIO)
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 29178b3aafe6..dbed37d6cffb 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -569,13 +569,20 @@ static int __init probe_memory_block_size(unsigned long 
node, const char *uname,
return 0;
 }
 
-/*
- * start with 1G memory block size. Early init will
- * fix this with correct value.
- */
-unsigned long memory_block_size __ro_after_init = 1UL << 30;
+unsigned long memory_block_size __ro_after_init;
 static void __init early_init_memory_block_size(void)
 {
+   /*
+* if it is set via early param just return.
+*/
+   if (memory_block_size)
+   return;
+
+   /*
+* start with 1G memory block size. update_memory_block_size()
+* will derive the right value based on device tree details.
+*/
+   memory_block_size = SZ_1G;
/*
 * We need to do memory_block_size probe early so that
 * radix__early_init_mmu() can use this as limit for
-- 
2.41.0



[PATCH v3 1/2] powerpc/mm: Cleanup memory block size probing

2023-07-28 Thread Aneesh Kumar K.V
Parse the device tree in early init to find the memory block size to be
used by the kernel. Consolidate the memory block size device tree parsing
into one helper and use that on both powernv and pseries. We still want to
use a machine-specific callback because on all machine types other than
powernv and pseries we continue to return MIN_MEMORY_BLOCK_SIZE.

pseries_memory_block_size used to look for the second memory
block (memory@x) to determine the memory_block_size value. This patch
changes that to look at all memory blocks and makes sure we can map them all
correctly using the computed memory block size value.

Add a workaround to force a 256MB memory block size if device-driver-managed
memory such as GPU memory is present. This helps to add GPU memory
that is not aligned to 1G.

Signed-off-by: Aneesh Kumar K.V 
---
Changes from v2:
* Add workaround for forcing 256MB memory blocksize with GPU

 arch/powerpc/include/asm/book3s/64/mmu.h  |   5 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  65 +-
 arch/powerpc/mm/init_64.c | 115 ++
 arch/powerpc/platforms/powernv/setup.c|  10 +-
 .../platforms/pseries/hotplug-memory.c|  60 +
 arch/powerpc/platforms/pseries/pseries.h  |   2 -
 arch/powerpc/platforms/pseries/setup.c|   7 ++
 7 files changed, 129 insertions(+), 135 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 570a4960cf17..28033fd5403c 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -71,10 +71,7 @@ extern unsigned int mmu_pid_bits;
 /* Base PID to allocate from */
 extern unsigned int mmu_base_pid;
 
-/*
- * memory block size used with radix translation.
- */
-extern unsigned long __ro_after_init radix_mem_block_size;
+extern unsigned long __ro_after_init memory_block_size;
 
 #define PRTB_SIZE_SHIFT(mmu_pid_bits + 4)
 #define PRTB_ENTRIES   (1ul << mmu_pid_bits)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e7ea492ac510..b5102491b50f 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -37,7 +37,6 @@
 #include 
 
 unsigned int mmu_base_pid;
-unsigned long radix_mem_block_size __ro_after_init;
 
 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
unsigned long region_start, unsigned long region_end)
@@ -300,7 +299,7 @@ static int __meminit create_physical_mapping(unsigned long 
start,
bool prev_exec, exec = false;
pgprot_t prot;
int psize;
-   unsigned long max_mapping_size = radix_mem_block_size;
+   unsigned long max_mapping_size = memory_block_size;
 
if (debug_pagealloc_enabled_or_kfence())
max_mapping_size = PAGE_SIZE;
@@ -502,58 +501,6 @@ static int __init radix_dt_scan_page_sizes(unsigned long 
node,
return 1;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __init probe_memory_block_size(unsigned long node, const char 
*uname, int
- depth, void *data)
-{
-   unsigned long *mem_block_size = (unsigned long *)data;
-   const __be32 *prop;
-   int len;
-
-   if (depth != 1)
-   return 0;
-
-   if (strcmp(uname, "ibm,dynamic-reconfiguration-memory"))
-   return 0;
-
-   prop = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
-
-   if (!prop || len < dt_root_size_cells * sizeof(__be32))
-   /*
-* Nothing in the device tree
-*/
-   *mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-   else
-   *mem_block_size = of_read_number(prop, dt_root_size_cells);
-   return 1;
-}
-
-static unsigned long __init radix_memory_block_size(void)
-{
-   unsigned long mem_block_size = MIN_MEMORY_BLOCK_SIZE;
-
-   /*
-* OPAL firmware feature is set by now. Hence we are ok
-* to test OPAL feature.
-*/
-   if (firmware_has_feature(FW_FEATURE_OPAL))
-   mem_block_size = 1UL * 1024 * 1024 * 1024;
-   else
-   of_scan_flat_dt(probe_memory_block_size, &mem_block_size);
-
-   return mem_block_size;
-}
-
-#else   /* CONFIG_MEMORY_HOTPLUG */
-
-static unsigned long __init radix_memory_block_size(void)
-{
-   return 1UL * 1024 * 1024 * 1024;
-}
-
-#endif /* CONFIG_MEMORY_HOTPLUG */
-
-
 void __init radix__early_init_devtree(void)
 {
int rc;
@@ -577,16 +524,6 @@ void __init radix__early_init_devtree(void)
mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
psize_to_rpti_pgsize(MMU_PAGE_64K);
}
-
-   /*
-* Max mapping size used when mapping pages. We don't use
-* ppc_md.memory_block_size() here because this get called
-* early and we don't have machine probe called yet. Also
-* the pser

[PATCH v6 12/13 -fix] powerpc/book3s64/radix: Remove mmu_vmemmap_psize

2023-07-28 Thread Aneesh Kumar K.V
From 2d37f0570983bfa710e73a6485e178658e8f4b38 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" 
Date: Fri, 28 Jul 2023 14:47:46 +0530
Subject: [PATCH] powerpc/mm: Fix kernel build error

 arch/powerpc/mm/init_64.c:201:15: error: no previous prototype for function 
'__vmemmap_populate' [-Werror,-Wmissing-prototypes]
   int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int 
node,
 ^
   arch/powerpc/mm/init_64.c:201:1: note: declare 'static' if the function is 
not intended to be used outside of this translation unit
   int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int 
node,

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202307281617.oxcxz84j-...@intel.com/
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 8d184e89e943..e0208cb12058 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -198,8 +198,8 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, 
unsigned long start,
return false;
 }
 
-int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int 
node,
-struct vmem_altmap *altmap)
+static int __meminit __vmemmap_populate(unsigned long start, unsigned long 
end, int node,
+   struct vmem_altmap *altmap)
 {
bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
-- 
2.41.0



[PATCH v6 10/13 -fix2] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

2023-07-28 Thread Aneesh Kumar K.V
From a3f49a79ffa78a7de736af77e13fdbb272c9f221 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" 
Date: Fri, 28 Jul 2023 15:36:53 +0530
Subject: [PATCH] powerpc/mm: Fix kernel build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  CC  arch/powerpc/mm/init_64.o
src/linux/arch/powerpc/mm/init_64.c:324:12: error: no previous prototype for 
‘__vmemmap_free’ [-Werror=missing-prototypes]
  324 | void __ref __vmemmap_free(unsigned long start, unsigned long end,

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 2271c0a7ea0d..141c20d02797 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -308,8 +308,8 @@ static unsigned long vmemmap_list_free(unsigned long start)
return vmem_back->phys;
 }
 
-void __ref __vmemmap_free(unsigned long start, unsigned long end,
- struct vmem_altmap *altmap)
+static void __ref __vmemmap_free(unsigned long start, unsigned long end,
+struct vmem_altmap *altmap)
 {
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
unsigned long page_order = get_order(page_size);
-- 
2.41.0



[PATCH v6 7/7] mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

2023-07-27 Thread Aneesh Kumar K.V
Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 35 +++
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aa8724bd1d53..7c877756b363 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -89,7 +89,12 @@ static int set_memmap_mode(const char *val, const struct 
kernel_param *kp)
else
mode = MEMMAP_ON_MEMORY_DISABLE;
}
+   /*
+* Avoid changing memmap mode during hotplug.
+*/
+   get_online_mems();
*((int *)kp->arg) = mode;
+   put_online_mems();
if (mode == MEMMAP_ON_MEMORY_FORCE) {
unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
 
@@ -110,7 +115,7 @@ static const struct kernel_param_ops memmap_mode_ops = {
.set = set_memmap_mode,
.get = get_memmap_mode,
 };
-module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0644);
 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug\n"
 "With value \"force\" it could result in memory wastage due "
 "to memmap size limitations (Y/N/force)");
@@ -2172,22 +2177,20 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
 * the same granularity it was added - a single memory block.
 */
-   if (mhp_memmap_on_memory()) {
-   ret = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
-   if (ret) {
-   if (size != memory_block_size_bytes()) {
-   pr_warn("Refuse to remove %#llx - %#llx,"
-   "wrong granularity\n",
-   start, start + size);
-   return -EINVAL;
-   }
-   altmap = mem->altmap;
-   /*
-* Mark altmap NULL so that we can add a debug
-* check on memblock free.
-*/
-   mem->altmap = NULL;
+   ret = walk_memory_blocks(start, size, &mem, test_has_altmap_cb);
+   if (ret) {
+   if (size != memory_block_size_bytes()) {
+   pr_warn("Refuse to remove %#llx - %#llx,"
+   "wrong granularity\n",
+   start, start + size);
+   return -EINVAL;
}
+   altmap = mem->altmap;
+   /*
+* Mark altmap NULL so that we can add a debug
+* check on memblock free.
+*/
+   mem->altmap = NULL;
}
 
/* remove memmap entry */
-- 
2.41.0



[PATCH v6 5/7] powerpc/book3s64/memhotplug: Enable memmap on memory for radix

2023-07-27 Thread Aneesh Kumar K.V
Radix vmemmap mapping can map things correctly at the PMD level or PTE
level based on different device boundary checks. Hence we skip the
restriction that the vmemmap size be a multiple of PMD_SIZE. This also
makes the feature more widely useful, because using a PMD_SIZE vmemmap
area would require a memory block size of 2GiB.

We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature
can work with a memory block size of 256MB, using the altmap.reserve
feature to align things correctly at pageblock granularity. We can end up
losing some pages in memory with this. For example, with a 256MiB memory
block size, we require 4 pages to map the vmemmap pages; in order to align
things correctly we end up adding a reserve of 28 pages, i.e. for every
4096 pages, 28 pages get reserved.

Reviewed-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/pgtable.h| 21 +++
 .../platforms/pseries/hotplug-memory.c|  2 +-
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d0497d13f5b4..938294c996dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index a4893b17705a..33464e6d6431 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -161,6 +161,27 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
 int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
 bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
   unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details
+ * some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fallback correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+   if (!radix_enabled())
+   return false;
+   /*
+* With 4K page size and 2M PMD_SIZE, we can align
+* things better with memory block size value
+* starting from 128MB. Hence align things with PMD_SIZE.
+*/
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+   return true;
+}
+
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..4f3d6a2f9065 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
-- 
2.41.0



[PATCH v6 6/7] mm/memory_hotplug: Embed vmem_altmap details in memory block

2023-07-27 Thread Aneesh Kumar K.V
With memmap on memory, some architectures need more details w.r.t. the
altmap, such as base_pfn, end_pfn, etc., to unmap the vmemmap memory.
Instead of computing them again when we remove a memory block, embed the
vmem_altmap details in struct memory_block if we are using the memmap on
memory feature.

No functional change in this patch

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/base/memory.c  | 25 +++---
 include/linux/memory.h |  8 ++
 mm/memory_hotplug.c| 58 +++---
 3 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..57ed61212277 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -106,6 +106,7 @@ static void memory_block_release(struct device *dev)
 {
struct memory_block *mem = to_memory_block(dev);
 
+   WARN_ON(mem->altmap);
kfree(mem);
 }
 
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
 
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
 * stage helps to keep accounting easier to follow - e.g vmemmaps
 * belong to the same zone as the memory they backed.
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, 
zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
int ret;
 
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
 * Unaccount before offlining, such that unpopulated zone and kthreads
 * can properly be torn down in offline_pages().
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
struct memory_block *mem;
@@ -744,7 +751,7 @@ static int add_memory_block(unsigned long block_id, 
unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
-   mem->nr_vmemmap_pages = nr_vmemmap_pages;
+   mem->altmap = altmap;
+   INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +790,14 @@ static int __init add_boot_memory_block(unsigned long 
base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
-   MEM_ONLINE, 0,  NULL);
+   MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
-   return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+   return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +825,7 @@ static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-   unsigned long vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +839,7 @@ int create_memory_block_devices(unsigned long start, 
unsigned long size,
return -EINVAL;
 
for (block_id = start_block_id; block_id != end_block_id; block_id++) {
-   ret = add_hotplug_memory_block(block_id, vmemmap_pages, group);
+   ret = add_hotplug_memor

[PATCH v6 2/7] mm/memory_hotplug: Allow memmap on memory hotplug request to fallback

2023-07-27 Thread Aneesh Kumar K.V
If not supported, fall back to not using memmap on memory. This avoids
the need for callers to do the fallback.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/acpi/acpi_memhotplug.c |  3 +--
 include/linux/memory_hotplug.h |  3 ++-
 mm/memory_hotplug.c| 13 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..d0c1a71007d0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (!info->length)
continue;
 
-   if (mhp_supports_memmap_on_memory(info->length))
-   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
  mhp_flags);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
  * To do so, we will use the beginning of the hot-added range to build
  * the page tables for the memmap array that describes the entire range.
  * Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on
+ * different alignment checks.
  */
 #define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
 /*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int 
nid,
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
  struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7cfd13c91568..eca32ccd45cc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
@@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-   if (!mhp_supports_memmap_on_memory(size)) {
-   ret = -EINVAL;
-   goto error;
+   if (mhp_supports_memmap_on_memory(size)) {
+   mhp_altmap.free = PHYS_PFN(size);
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   params.altmap = &mhp_altmap;
}
-   mhp_altmap.free = PHYS_PFN(size);
-   mhp_altmap.base_pfn = PHYS_PFN(start);
-   params.altmap = &mhp_altmap;
+   /* fallback to not using altmap  */
}
 
/* call arch's memory hotadd */
-- 
2.41.0



[PATCH v6 4/7] mm/memory_hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

2023-07-27 Thread Aneesh Kumar K.V
Currently, the memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full pageblocks. This is
because the memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch lifts that restriction by reserving more pages than are
required for the vmemmap space, so that the start address can be
pageblock-aligned with different memory block sizes. Using this facility
implies the kernel will reserve some pages for every memory block.
This makes the memmap on memory feature widely useful with
different memory block size values.

For example, with a 64K page size and a 256MiB memory block size, we require
4 pages to map the vmemmap pages; to align things correctly we end up adding
a reserve of 28 pages, i.e. for every 4096 pages, 28 pages get reserved.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 .../admin-guide/mm/memory-hotplug.rst |  12 ++
 mm/memory_hotplug.c   | 120 +++---
 2 files changed, 113 insertions(+), 19 deletions(-)

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst 
b/Documentation/admin-guide/mm/memory-hotplug.rst
index bd77841041af..2994958c7ce8 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -433,6 +433,18 @@ The following module parameters are currently defined:
 memory in a way that huge pages in bigger
 granularity cannot be formed on hotplugged
 memory.
+
+With value "force" it could result in memory
+wastage due to memmap size limitations. For
+example, if the memmap for a memory block
+requires 1 MiB, but the pageblock size is 2
+MiB, 1 MiB of hotplugged memory will be wasted.
+Note that there are still cases where the
+feature cannot be enforced: for example, if the
+memmap is smaller than a single page, or if the
+architecture does not support the forced mode
+in all configurations.
+
 ``online_policy``   read-write: Set the basic policy used for
 automatic zone selection when onlining memory
 blocks without specifying a target zone.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 746cb7c08c64..fe94feb32d71 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,83 @@
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+   MEMMAP_ON_MEMORY_DISABLE = 0,
+   MEMMAP_ON_MEMORY_ENABLE,
+   MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_size(void)
+{
+   return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+   unsigned long nr_pages = PFN_UP(memory_block_memmap_size());
+
+   /*
+* In "forced" memmap_on_memory mode, we add extra pages to align the
+* vmemmap size to cover full pageblocks. That way, we can add memory
+* even if the vmemmap size is not properly aligned, however, we might 
waste
+* memory.
+*/
+   if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+   return pageblock_align(nr_pages);
+   return nr_pages;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+   int ret, mode;
+   bool enabled;
+
+   if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+   mode = MEMMAP_ON_MEMORY_FORCE;
+   } else {
+   ret = kstrtobool(val, );
+   if (ret < 0)
+   return ret;
+   if (enabled)
+   mode = MEMMAP_ON_MEMORY_ENABLE;
+   else
+   mode = MEMMAP_ON_MEMORY_DISABLE;
+   }
+   *((int *)kp->arg) = mode;
+   if (mode == MEMMAP_ON_MEMORY_FORCE) {
+   unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
+
+   pr_info_once("Memory hotplug will reserve %ld pages in each 
memory block\n",
+memmap_pages - PFN_UP(memory_block_memmap_size()));
+   }
+   

[PATCH v6 3/7] mm/memory_hotplug: Allow architecture to override memmap on memory support check

2023-07-27 Thread Aneesh Kumar K.V
Some architectures would want different restrictions. Hence add an
architecture-specific override.

The PMD_SIZE check is moved there.
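
For illustration only, an architecture wanting a different policy provides
its own helper (plus the matching #define) in its asm/pgtable.h, roughly as
in the hypothetical sketch below. The relaxed PAGE_SIZE policy shown here is
an assumption, not part of this series; ppc64 adds its real override in a
later patch.

/* e.g. in arch/<arch>/include/asm/pgtable.h -- hypothetical example */
#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
{
        /* accept any page-aligned vmemmap instead of requiring a full PMD */
        return IS_ALIGNED(vmemmap_size, PAGE_SIZE);
}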

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index eca32ccd45cc..746cb7c08c64 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,10 +1247,26 @@ static int online_memory_block(struct memory_block 
*mem, void *arg)
return device_online(&mem->dev);
 }
 
+static inline unsigned long memory_block_memmap_size(void)
+{
+   return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+}
+
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size)
+{
+   /*
+* As default, we want the vmemmap to span a complete PMD such that we
+* can map the vmemmap using a single PMD if supported by the
+* architecture.
+*/
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
-   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+   unsigned long vmemmap_size = memory_block_memmap_size();
unsigned long remaining_size = size - vmemmap_size;
 
/*
@@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 */
return mhp_memmap_on_memory() &&
   size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
+  arch_supports_memmap_on_memory(vmemmap_size);
 }
 
 /*
-- 
2.41.0



[PATCH v6 1/7] mm/memory_hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig

2023-07-27 Thread Aneesh Kumar K.V
Instead of adding a menu entry with all supported architectures, add an
mm/Kconfig variable and select it from the supported architectures.

No functional change in this patch.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/arm64/Kconfig | 4 +---
 arch/x86/Kconfig   | 4 +---
 mm/Kconfig | 3 +++
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1573257a4d6..0f749cfab8e6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_GNU_PROPERTY
select ARCH_USE_MEMTEST
@@ -347,9 +348,6 @@ config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 78224aa76409..d0258e92a8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/mm/Kconfig b/mm/Kconfig
index 5fe49c030961..721dc88423c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY
 
 endif # MEMORY_HOTPLUG
 
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+   bool
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
-- 
2.41.0



[PATCH v6 0/7] Add support for memmap on memory feature on ppc64

2023-07-27 Thread Aneesh Kumar K.V
This patch series updates the memmap on memory feature to fall back to
memmap allocation outside the memory block if the alignment rules are
not met. This makes the feature more useful on architectures like
ppc64, where the alignment rules are different with a 64K page size.

This patch series is dependent on dax vmemmap optimization series
posted here
https://lore.kernel.org/linux-mm/20230718022934.90447-1-aneesh.ku...@linux.ibm.com/

Changes from v5:
* Update commit message
* Move memory alloc/free to the callers in patch 6
* Address review feedback w.r.t patch 4

Changes from v4:
* Use altmap.free instead of altmap.reserve
* Address review feedback

Changes from v3:
* Extend the module parameter memmap_on_memory to force allocation even
  though we can waste hotplug memory.

Changes from v2:
* Rebase to latest linus tree
* Redo the series based on review feedback. Multiple changes to the patchset.

Changes from v1:
* update the memblock to store vmemmap_altmap details. This is required
so that when we remove the memory we can find the altmap details which
is needed on some architectures.
* rebase to latest linus tree


Aneesh Kumar K.V (7):
  mm/memory_hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig
  mm/memory_hotplug: Allow memmap on memory hotplug request to fallback
  mm/memory_hotplug: Allow architecture to override memmap on memory
support check
  mm/memory_hotplug: Support memmap_on_memory when memmap is not aligned
to pageblocks
  powerpc/book3s64/memhotplug: Enable memmap on memory for radix
  mm/memory_hotplug: Embed vmem_altmap details in memory block
  mm/memory_hotplug: Enable runtime update of memmap_on_memory parameter

 .../admin-guide/mm/memory-hotplug.rst |  12 +
 arch/arm64/Kconfig|   4 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/pgtable.h|  21 ++
 .../platforms/pseries/hotplug-memory.c|   2 +-
 arch/x86/Kconfig  |   4 +-
 drivers/acpi/acpi_memhotplug.c|   3 +-
 drivers/base/memory.c |  25 ++-
 include/linux/memory.h|   8 +-
 include/linux/memory_hotplug.h|   3 +-
 mm/Kconfig|   3 +
 mm/memory_hotplug.c   | 210 ++
 12 files changed, 224 insertions(+), 72 deletions(-)

-- 
2.41.0



Re: [PATCH v6 00/13] Add support for DAX vmemmap optimization for ppc64

2023-07-26 Thread Aneesh Kumar K.V
Andrew Morton  writes:

> On Wed, 26 Jul 2023 10:59:32 +0530 Aneesh Kumar K V 
>  wrote:
>
>> On 7/26/23 12:59 AM, Andrew Morton wrote:
>> > On Tue, 25 Jul 2023 00:37:46 +0530 "Aneesh Kumar K.V" 
>> >  wrote:
>> > 
>> >> This patch series implements changes required to support DAX vmemmap
>> >> optimization for ppc64.
>> > 
>> > Do we have any measurements to help us understand the magnitude
>> > of this optimization?
>> > 
>> > And any documentation which helps users understand whether and
>> > why they should enable this feature?
>> 
>> That is memory space optimization due to kernel reusing the tail page struct 
>> pages. The details
>> of the optimization is documented in patch 11. We document there the impact 
>> with both 4k and
>> 64K page size.
>
> I suppose that with sufficient arithmetic one could use
> Documentation/powerpc/vmemmap_dedup.rst to figure out the bottom-line
> savings.
>
> I was more expecting a straightforward statement in the [0/N] overview
> to help people understand why they're reading this patchset at all.
> Like "saves 5% of total memory on my XXX machine".

This is specific to devdax usage and also depends on devdax alignment.
The actual saving details are also documented in mm/vmemmap_dedup.rst.
The savings depend on the devdax device memory size and alignment.

With 64K page size for 16384 pages added (1G) we save 14 pages
With 4K page size for 262144 pages added (1G) we save 4094 pages
With 4K page size for 512 pages added (2M) we save 6 pages
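
As a quick cross-check, a small userspace sketch (assuming
sizeof(struct page) == 64 and that the optimization keeps only the first
vmemmap page plus one reused tail page for the range) reproduces these
numbers:

#include <stdio.h>

static void savings(unsigned long dev_size, unsigned long page_size)
{
        unsigned long nr_pages = dev_size / page_size;
        unsigned long memmap_pages = nr_pages * 64 / page_size;

        /* everything beyond the head page and one reused tail page is saved */
        printf("%lu pages added: %lu vmemmap pages, %lu saved\n",
               nr_pages, memmap_pages, memmap_pages - 2);
}

int main(void)
{
        savings(1UL << 30, 64UL << 10); /* 1G, 64K pages -> save 14 */
        savings(1UL << 30, 4UL << 10);  /* 1G, 4K pages  -> save 4094 */
        savings(2UL << 20, 4UL << 10);  /* 2M, 4K pages  -> save 6 */
        return 0;
}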

-aneesh


[PATCH v6 10/13 -fix] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

2023-07-26 Thread Aneesh Kumar K.V


>From 9125b1815758ab3b83966aeead6f486c0708ea73 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" 
Date: Thu, 27 Jul 2023 10:02:37 +0530
Subject: [PATCH] powerpc/mm: Fix section mismatch warning

remove_pte_table is only called from remove_pmd_table which is marked
__meminit. These functions are used only during memory hotplug and they
also call functions marked __meminit. Mark remove_pte_table also with
__meminit so that we can avoid the section mismatch warning below.

WARNING: modpost: vmlinux: section mismatch in reference: 
remove_pte_table+0x230 (section: .text.remove_pte_table) -> free_vmemmap_pages 
(section: .meminit.text)
WARNING: modpost: vmlinux: section mismatch in reference: 
remove_pte_table+0x43c (section: .text.remove_pte_table) -> vmemmap_populated 
(section: .meminit.text)
WARNING: modpost: vmlinux: section mismatch in reference: 
remove_pte_table+0x4a4 (section: .text.remove_pte_table) -> free_vmemmap_pages 
(section: .meminit.text)

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 53f8340e390c..6d04dd579d03 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -793,9 +793,9 @@ static void __meminit free_vmemmap_pages(struct page *page,
free_pages((unsigned long)page_address(page), order);
 }
 
-static void remove_pte_table(pte_t *pte_start, unsigned long addr,
-unsigned long end, bool direct,
-struct vmem_altmap *altmap)
+static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
+  unsigned long end, bool direct,
+  struct vmem_altmap *altmap)
 {
unsigned long next, pages = 0;
pte_t *pte;
-- 
2.41.0



[PATCH v6 11/13 -fix] powerpc/book3s64/radix: Add support for vmemmap optimization for radix

2023-07-26 Thread Aneesh Kumar K.V


>From 9252360e483246e13e6bb28cd6773af2b99eeb55 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" 
Date: Wed, 26 Jul 2023 10:54:14 +0530
Subject: [PATCH] -next build fixup

Fix build error

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/radix.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 3195f268ed7f..357e23a403d3 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -364,8 +364,10 @@ int radix__remove_section_mapping(unsigned long start, 
unsigned long end);
 
 void radix__kernel_map_pages(struct page *page, int numpages, int enable);
 
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 #define vmemmap_can_optimize vmemmap_can_optimize
 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap 
*pgmap);
+#endif
 
 #define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
-- 
2.41.0



[PATCH v6 01/13 -fix] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support

2023-07-26 Thread Aneesh Kumar K.V
>From 81719b31a4e86d2f7352da653175b7c508a94303 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" 
Date: Wed, 26 Jul 2023 13:45:28 +0530
Subject: [PATCH] mm/debug_vm_pgtable: Use the new
 has_transparent_pud_hugepage()

Use the new helper to check pud hugepage support. Architectures like ppc64
will enable the config value CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
but can still have pud hugepage support disabled for hash translation.
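
For context, a sketch of the idea behind the helper (an illustration, not
necessarily the exact definition introduced by the series): the generic
fallback reflects only the Kconfig/THP state, while an architecture such as
ppc64 can override it to also reflect the active MMU mode, along these lines:

/* hypothetical ppc64-style override, shown only to illustrate the point */
#define has_transparent_pud_hugepage has_transparent_pud_hugepage
static inline bool has_transparent_pud_hugepage(void)
{
        /* PUD THP is only usable with radix translation, not hash */
        return radix_enabled() && has_transparent_hugepage();
}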

Signed-off-by: Aneesh Kumar K.V 
---
 mm/debug_vm_pgtable.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ee2c4c1dcfc8..d61eaa075c75 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -302,7 +302,7 @@ static void __init pud_basic_tests(struct 
pgtable_debug_args *args, int idx)
unsigned long val = idx, *ptr = &val;
pud_t pud;
 
-   if (!has_transparent_hugepage())
+   if (!has_transparent_pud_hugepage())
return;
 
pr_debug("Validating PUD basic (%pGv)\n", ptr);
@@ -343,7 +343,7 @@ static void __init pud_advanced_tests(struct 
pgtable_debug_args *args)
unsigned long vaddr = args->vaddr;
pud_t pud;
 
-   if (!has_transparent_hugepage())
+   if (!has_transparent_pud_hugepage())
return;
 
page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL;
@@ -405,7 +405,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args 
*args)
 {
pud_t pud;
 
-   if (!has_transparent_hugepage())
+   if (!has_transparent_pud_hugepage())
return;
 
pr_debug("Validating PUD leaf\n");
@@ -732,7 +732,7 @@ static void __init pud_devmap_tests(struct 
pgtable_debug_args *args)
 {
pud_t pud;
 
-   if (!has_transparent_hugepage())
+   if (!has_transparent_pud_hugepage())
return;
 
pr_debug("Validating PUD devmap\n");
@@ -981,7 +981,7 @@ static void __init pud_thp_tests(struct pgtable_debug_args 
*args)
 {
pud_t pud;
 
-   if (!has_transparent_hugepage())
+   if (!has_transparent_pud_hugepage())
return;
 
pr_debug("Validating PUD based THP\n");
@@ -1022,8 +1022,7 @@ static void __init destroy_args(struct pgtable_debug_args 
*args)
 
/* Free (huge) page */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
-   IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
-   has_transparent_hugepage() &&
+   has_transparent_pud_hugepage() &&
args->pud_pfn != ULONG_MAX) {
if (args->is_contiguous_page) {
free_contig_range(args->pud_pfn,
@@ -1274,8 +1273,7 @@ static int __init init_args(struct pgtable_debug_args 
*args)
 * if we fail to allocate (huge) pages.
 */
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
-   IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) &&
-   has_transparent_hugepage()) {
+   has_transparent_pud_hugepage()) {
page = debug_vm_pgtable_alloc_huge_page(args,
HPAGE_PUD_SHIFT - PAGE_SHIFT);
if (page) {
-- 
2.41.0



Re: [PATCH v5 6/7] mm/hotplug: Embed vmem_altmap details in memory block

2023-07-26 Thread Aneesh Kumar K.V
David Hildenbrand  writes:

> On 25.07.23 12:02, Aneesh Kumar K.V wrote:
>> With memmap on memory, some architecture needs more details w.r.t altmap
>> such as base_pfn, end_pfn, etc to unmap vmemmap memory. Instead of
>> computing them again when we remove a memory block, embed vmem_altmap
>> details in struct memory_block if we are using memmap on memory block
>> feature.
>> 
>> No functional change in this patch
>> 
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>
> [...]
>
>>   
>>   static int add_memory_block(unsigned long block_id, unsigned long state,
>> -unsigned long nr_vmemmap_pages,
>> +struct vmem_altmap *altmap,
>>  struct memory_group *group)
>>   {
>>  struct memory_block *mem;
>> @@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, 
>> unsigned long state,
>>  mem->start_section_nr = block_id * sections_per_block;
>>  mem->state = state;
>>  mem->nid = NUMA_NO_NODE;
>> -mem->nr_vmemmap_pages = nr_vmemmap_pages;
>> +if (altmap) {
>> +mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
>> +if (!mem->altmap) {
>> +kfree(mem);
>> +return -ENOMEM;
>> +}
>> +memcpy(mem->altmap, altmap, sizeof(*altmap));
>> +}
>
> I'm wondering if we should instead let the caller do the alloc/free. So we 
> would alloc
> int the caller and would only store the pointer.
>
> Before removing the memory block, we would clear the pointer and free it in 
> the caller.
>
> IOW, when removing a memory block and we still have an altmap set, something 
> would be wrong.
>
> See below on try_remove_memory() handling.
>
> [...]
>
>> -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
>> +static int get_vmemmap_altmap_cb(struct memory_block *mem, void *arg)
>>   {
>> +struct vmem_altmap *altmap = (struct vmem_altmap *)arg;
>>  /*
>> - * If not set, continue with the next block.
>> + * If we have any pages allocated from altmap
>> + * return the altmap details and break callback.
>>   */
>> -return mem->nr_vmemmap_pages;
>> +if (mem->altmap) {
>> +memcpy(altmap, mem->altmap, sizeof(struct vmem_altmap));
>> +return 1;
>> +}
>> +return 0;
>>   }
>>   
>>   static int check_cpu_on_node(int nid)
>> @@ -2146,9 +2152,8 @@ EXPORT_SYMBOL(try_offline_node);
>>   
>>   static int __ref try_remove_memory(u64 start, u64 size)
>>   {
>> -struct vmem_altmap mhp_altmap = {};
>> -struct vmem_altmap *altmap = NULL;
>> -unsigned long nr_vmemmap_pages;
>> +int ret;
>> +struct vmem_altmap mhp_altmap, *altmap = NULL;
>>  int rc = 0, nid = NUMA_NO_NODE;
>>   
>>  BUG_ON(check_hotplug_memory_range(start, size));
>> @@ -2171,24 +2176,15 @@ static int __ref try_remove_memory(u64 start, u64 
>> size)
>>   * the same granularity it was added - a single memory block.
>>   */
>>  if (mhp_memmap_on_memory()) {
>> -nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
>> -  get_nr_vmemmap_pages_cb);
>> -if (nr_vmemmap_pages) {
>> +ret = walk_memory_blocks(start, size, _altmap,
>> + get_vmemmap_altmap_cb);
>> +if (ret) {
>>  if (size != memory_block_size_bytes()) {
>>  pr_warn("Refuse to remove %#llx - %#llx,"
>>  "wrong granularity\n",
>>  start, start + size);
>>  return -EINVAL;
>>  }
>> -
>> -/*
>> - * Let remove_pmd_table->free_hugepage_table do the
>> - * right thing if we used vmem_altmap when hot-adding
>> - * the range.
>> - */
>> -mhp_altmap.base_pfn = PHYS_PFN(start);
>> -mhp_altmap.free = nr_vmemmap_pages;
>> -mhp_altmap.alloc = nr_vmemmap_pages;
>>  altmap = _altmap;
>>  }
>
>
> Instead of that, I suggest (whitespace damage expected):
>
> dif

Re: [PATCH v5 4/7] mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

2023-07-25 Thread Aneesh Kumar K.V
David Hildenbrand  writes:

> On 25.07.23 12:02, Aneesh Kumar K.V wrote:
>> Currently, memmap_on_memory feature is only supported with memory block
>> sizes that result in vmemmap pages covering full page blocks. This is
>> because memory onlining/offlining code requires applicable ranges to be
>> pageblock-aligned, for example, to set the migratetypes properly.
>> 
>> This patch helps to lift that restriction by reserving more pages than
>> required for vmemmap space. This helps the start address to be page
>> block aligned with different memory block sizes. Using this facility
>> implies the kernel will be reserving some pages for every memoryblock.
>> This allows the memmap on memory feature to be widely useful with
>> different memory block size values.
>> 
>> For ex: with 64K page size and 256MiB memory block size, we require 4
>> pages to map vmemmap pages, To align things correctly we end up adding a
>> reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.
>> 
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>   .../admin-guide/mm/memory-hotplug.rst |  12 ++
>>   mm/memory_hotplug.c   | 121 --
>>   2 files changed, 119 insertions(+), 14 deletions(-)
>> 
>> diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst 
>> b/Documentation/admin-guide/mm/memory-hotplug.rst
>> index bd77841041af..2994958c7ce8 100644
>> --- a/Documentation/admin-guide/mm/memory-hotplug.rst
>> +++ b/Documentation/admin-guide/mm/memory-hotplug.rst
>> @@ -433,6 +433,18 @@ The following module parameters are currently defined:
>>   memory in a way that huge pages in bigger
>>   granularity cannot be formed on hotplugged
>>   memory.
>> +
>> + With value "force" it could result in memory
>> + wastage due to memmap size limitations. For
>> + example, if the memmap for a memory block
>> + requires 1 MiB, but the pageblock size is 2
>> + MiB, 1 MiB of hotplugged memory will be wasted.
>> + Note that there are still cases where the
>> + feature cannot be enforced: for example, if the
>> + memmap is smaller than a single page, or if the
>> + architecture does not support the forced mode
>> + in all configurations.
>> +
>>   ``online_policy``   read-write: Set the basic policy used for
>>   automatic zone selection when onlining memory
>>   blocks without specifying a target zone.
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 457824a6ecb8..5b472e137898 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -41,17 +41,89 @@
>>   #include "internal.h"
>>   #include "shuffle.h"
>>   
>> +enum {
>> +MEMMAP_ON_MEMORY_DISABLE = 0,
>> +MEMMAP_ON_MEMORY_ENABLE,
>> +MEMMAP_ON_MEMORY_FORCE,
>> +};
>> +
>> +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
>> +
>> +static inline unsigned long memory_block_memmap_pages(void)
>> +{
>> +unsigned long memmap_size;
>> +
>> +memmap_size = PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
>> +return memmap_size >> PAGE_SHIFT;
>
> I'd really move a !page variant (memory_block_memmap_size()) to the 
> previous patch and use it in mhp_supports_memmap_on_memory() and 
> arch_supports_memmap_on_memory().
>
> Then, in this patch, reuse that function in 
> memory_block_memmap_on_memory_pages() and ...
>
>> +}
>> +
>> +static inline unsigned long memory_block_memmap_on_memory_pages(void)
>> +{
>> +unsigned long nr_pages = memory_block_memmap_pages();
>
> ... do here a
>
> nr_pages = PHYS_PFN(memory_block_memmap_size());
>
>
> Conceptually, it would be even cleaner to have here
>
> nr_pages = PFN_UP(memory_block_memmap_size());
>
> even though one can argue that mhp_supports_memmap_on_memory() will make 
> sure that the unaligned value (memory_block_memmap_size()) covers full 
> pages, but at least to me it looks cleaner that way. No strong opinion.
>
>
>> +
>> +/*
>> + * In "forced" memmap_on_memory mode, we add extra pages to align the
>> + * vmemmap size

[PATCH v5 3/7] mm/hotplug: Allow architecture to override memmap on memory support check

2023-07-25 Thread Aneesh Kumar K.V
Some architectures would want different restrictions. Hence add an
architecture-specific override.

The PMD_SIZE check is moved there.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index eca32ccd45cc..457824a6ecb8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,9 +1247,24 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
return device_online(&mem->dev);
 }
 
+#ifndef arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long size)
+{
+   unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
+   unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
+
+   /*
+* As default, we want the vmemmap to span a complete PMD such that we
+* can map the vmemmap using a single PMD if supported by the
+* architecture.
+*/
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+}
+#endif
+
 static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
-   unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
+   unsigned long nr_vmemmap_pages = size >> PAGE_SHIFT;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
unsigned long remaining_size = size - vmemmap_size;
 
@@ -1281,8 +1296,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long 
size)
 */
return mhp_memmap_on_memory() &&
   size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
+  IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) &&
+  arch_supports_memmap_on_memory(size);
 }
 
 /*
-- 
2.41.0



[PATCH v5 7/7] mm/hotplug: Enable runtime update of memmap_on_memory parameter

2023-07-25 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 mm/memory_hotplug.c | 27 +++
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 96e794f39313..6cb6eac1aee5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -95,7 +95,12 @@ static int set_memmap_mode(const char *val, const struct 
kernel_param *kp)
mode =  MEMMAP_ON_MEMORY_DISABLE;
 
 matched:
+   /*
+* Avoid changing memmap mode during hotplug.
+*/
+   get_online_mems();
*((int *)kp->arg) =  mode;
+   put_online_mems();
if (mode == MEMMAP_ON_MEMORY_FORCE) {
unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
 
@@ -116,7 +121,7 @@ static const struct kernel_param_ops memmap_mode_ops = {
.set = set_memmap_mode,
.get = get_memmap_mode,
 };
-module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0644);
 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug\n"
 "With value \"force\" it could result in memory wastage due "
 "to memmap size limitations (Y/N/force)");
@@ -2175,18 +2180,16 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
 * the same granularity it was added - a single memory block.
 */
-   if (mhp_memmap_on_memory()) {
-   ret = walk_memory_blocks(start, size, &mhp_altmap,
-get_vmemmap_altmap_cb);
-   if (ret) {
-   if (size != memory_block_size_bytes()) {
-   pr_warn("Refuse to remove %#llx - %#llx,"
-   "wrong granularity\n",
-   start, start + size);
-   return -EINVAL;
-   }
-   altmap = &mhp_altmap;
+   ret = walk_memory_blocks(start, size, &mhp_altmap,
+get_vmemmap_altmap_cb);
+   if (ret) {
+   if (size != memory_block_size_bytes()) {
+   pr_warn("Refuse to remove %#llx - %#llx,"
+   "wrong granularity\n",
+   start, start + size);
+   return -EINVAL;
}
+   altmap = &mhp_altmap;
}
 
/* remove memmap entry */
-- 
2.41.0



[PATCH v5 5/7] powerpc/book3s64/memhotplug: Enable memmap on memory for radix

2023-07-25 Thread Aneesh Kumar K.V
The radix vmemmap mapping can map things correctly at the PMD level or PTE
level based on different device boundary checks. Hence we skip the
restriction that the vmemmap size be a multiple of PMD_SIZE. This also
makes the feature widely useful, because using a PMD_SIZE vmemmap area
would require a memory block size of 2GiB.

We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature
can work with a memory block size of 256MB, using the altmap.reserve
feature to align things correctly at pageblock granularity. We can end up
losing some pages in memory with this. For example: with a 256MiB memory block
size, we require 4 pages to map the vmemmap pages. In order to align things
correctly we end up adding a reserve of 28 pages, i.e., for every 4096
pages, 28 pages get reserved.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/pgtable.h| 24 +++
 .../platforms/pseries/hotplug-memory.c|  3 ++-
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index d0497d13f5b4..938294c996dc 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index a4893b17705a..9b4a1fd24025 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -161,6 +161,30 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
 int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
 bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
   unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details
+ * some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fallback correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long size)
+{
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   unsigned long vmemmap_size = nr_pages * sizeof(struct page);
+
+   if (!radix_enabled())
+   return false;
+   /*
+* With 4K page size and 2M PMD_SIZE, we can align
+* things better with memory block size value
+* starting from 128MB. Hence align things with PMD_SIZE.
+*/
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+   return true;
+}
+
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..1447509357a7 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -617,6 +617,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
u32 drc_index)
 
 static int dlpar_add_lmb(struct drmem_lmb *lmb)
 {
+   mhp_t mhp_flags = MHP_NONE | MHP_MEMMAP_ON_MEMORY;
unsigned long block_sz;
int nid, rc;
 
@@ -637,7 +638,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, mhp_flags);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
-- 
2.41.0



[PATCH v5 6/7] mm/hotplug: Embed vmem_altmap details in memory block

2023-07-25 Thread Aneesh Kumar K.V
With memmap on memory, some architectures need more details w.r.t. the altmap,
such as base_pfn, end_pfn, etc., to unmap vmemmap memory. Instead of
computing them again when we remove a memory block, embed the vmem_altmap
details in struct memory_block if we are using the memmap on memory block
feature.

No functional change in this patch.
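
A standalone, simplified sketch (plain userspace C with made-up types) of the
ownership model this introduces, mirroring the kmalloc/memcpy in
add_memory_block() and the kfree in memory_block_release() below: the memory
block keeps its own copy of the caller's vmem_altmap and frees it on release.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct vmem_altmap {
        unsigned long base_pfn;
        unsigned long free;
};

struct memory_block {
        unsigned long block_id;
        struct vmem_altmap *altmap;     /* NULL unless memmap-on-memory is used */
};

static struct memory_block *add_memory_block(unsigned long block_id,
                                             struct vmem_altmap *altmap)
{
        struct memory_block *mem = calloc(1, sizeof(*mem));

        if (!mem)
                return NULL;
        mem->block_id = block_id;
        if (altmap) {
                mem->altmap = malloc(sizeof(*mem->altmap));
                if (!mem->altmap) {
                        free(mem);
                        return NULL;
                }
                memcpy(mem->altmap, altmap, sizeof(*altmap));
        }
        return mem;
}

static void memory_block_release(struct memory_block *mem)
{
        free(mem->altmap);      /* free(NULL) is a no-op, like kfree(NULL) */
        free(mem);
}

int main(void)
{
        struct vmem_altmap altmap = { .base_pfn = 0x1000, .free = 28 };
        struct memory_block *mem = add_memory_block(7, &altmap);

        if (mem) {
                printf("block %lu keeps %lu vmemmap pages in its altmap copy\n",
                       mem->block_id, mem->altmap->free);
                memory_block_release(mem);
        }
        return 0;
}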

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/base/memory.c  | 32 +++-
 include/linux/memory.h |  8 ++--
 mm/memory_hotplug.c| 41 ++---
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..0210ed7b7696 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -106,6 +106,7 @@ static void memory_block_release(struct device *dev)
 {
struct memory_block *mem = to_memory_block(dev);
 
+   kfree(mem->altmap);
kfree(mem);
 }
 
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
 
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
 * stage helps to keep accounting easier to follow - e.g vmemmaps
 * belong to the same zone as the memory they backed.
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, 
zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
int ret;
 
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
 * Unaccount before offlining, such that unpopulated zone and kthreads
 * can properly be torn down in offline_pages().
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->free;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
struct memory_block *mem;
@@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, 
unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
-   mem->nr_vmemmap_pages = nr_vmemmap_pages;
+   if (altmap) {
+   mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+   if (!mem->altmap) {
+   kfree(mem);
+   return -ENOMEM;
+   }
+   memcpy(mem->altmap, altmap, sizeof(*altmap));
+   }
INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +797,14 @@ static int __init add_boot_memory_block(unsigned long 
base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
-   MEM_ONLINE, 0,  NULL);
+   MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
-   return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+   return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +832,7 @@ static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-   unsigned long vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start));
@@ -832,7 +846,7 @@ int creat

[PATCH v5 4/7] mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks

2023-07-25 Thread Aneesh Kumar K.V
Currently, the memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full pageblocks. This is
because the memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch helps to lift that restriction by reserving more pages than
required for vmemmap space. This allows the start address to be
pageblock-aligned with different memory block sizes. Using this facility
implies the kernel will be reserving some pages for every memory block.
This makes the memmap on memory feature widely useful with
different memory block size values.

For example: with a 64K page size and a 256MiB memory block size, we require 4
pages to map the vmemmap pages. To align things correctly we end up adding a
reserve of 28 pages, i.e., for every 4096 pages, 28 pages get reserved.

Signed-off-by: Aneesh Kumar K.V 
---
 .../admin-guide/mm/memory-hotplug.rst |  12 ++
 mm/memory_hotplug.c   | 121 --
 2 files changed, 119 insertions(+), 14 deletions(-)

diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst 
b/Documentation/admin-guide/mm/memory-hotplug.rst
index bd77841041af..2994958c7ce8 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -433,6 +433,18 @@ The following module parameters are currently defined:
 memory in a way that huge pages in bigger
 granularity cannot be formed on hotplugged
 memory.
+
+With value "force" it could result in memory
+wastage due to memmap size limitations. For
+example, if the memmap for a memory block
+requires 1 MiB, but the pageblock size is 2
+MiB, 1 MiB of hotplugged memory will be wasted.
+Note that there are still cases where the
+feature cannot be enforced: for example, if the
+memmap is smaller than a single page, or if the
+architecture does not support the forced mode
+in all configurations.
+
 ``online_policy``   read-write: Set the basic policy used for
 automatic zone selection when onlining memory
 blocks without specifying a target zone.
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 457824a6ecb8..5b472e137898 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -41,17 +41,89 @@
 #include "internal.h"
 #include "shuffle.h"
 
+enum {
+   MEMMAP_ON_MEMORY_DISABLE = 0,
+   MEMMAP_ON_MEMORY_ENABLE,
+   MEMMAP_ON_MEMORY_FORCE,
+};
+
+static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE;
+
+static inline unsigned long memory_block_memmap_pages(void)
+{
+   unsigned long memmap_size;
+
+   memmap_size = PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page);
+   return memmap_size >> PAGE_SHIFT;
+}
+
+static inline unsigned long memory_block_memmap_on_memory_pages(void)
+{
+   unsigned long nr_pages = memory_block_memmap_pages();
+
+   /*
+* In "forced" memmap_on_memory mode, we add extra pages to align the
+* vmemmap size to cover full pageblocks. That way, we can add memory
+* even if the vmemmap size is not properly aligned, however, we might 
waste
+* memory.
+*/
+   if (memmap_mode == MEMMAP_ON_MEMORY_FORCE)
+   return pageblock_align(nr_pages);
+   return nr_pages;
+}
+
 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
 /*
  * memory_hotplug.memmap_on_memory parameter
  */
-static bool memmap_on_memory __ro_after_init;
-module_param(memmap_on_memory, bool, 0444);
-MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug");
+static int set_memmap_mode(const char *val, const struct kernel_param *kp)
+{
+   int ret, mode;
+   bool enabled;
+
+   if (sysfs_streq(val, "force") ||  sysfs_streq(val, "FORCE")) {
+   mode =  MEMMAP_ON_MEMORY_FORCE;
+   goto matched;
+   }
+
+   ret = kstrtobool(val, );
+   if (ret < 0)
+   return ret;
+   if (enabled)
+   mode =  MEMMAP_ON_MEMORY_ENABLE;
+   else
+   mode =  MEMMAP_ON_MEMORY_DISABLE;
+
+matched:
+   *((int *)kp->arg) =  mode;
+   if (mode == MEMMAP_ON_MEMORY_FORCE) {
+   unsigned long memmap_pages = 
memory_block_memmap_on_memory_pages();
+
+   pr_info("Memory hotplug will reserve %ld pages in each memory 
block\n",
+   memmap_pages - memory_block_memmap_pages

[PATCH v5 2/7] mm/hotplug: Allow memmap on memory hotplug request to fallback

2023-07-25 Thread Aneesh Kumar K.V
If not supported, fall back to not using memmap on memory. This avoids
the need for callers to do the fallback.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/acpi/acpi_memhotplug.c |  3 +--
 include/linux/memory_hotplug.h |  3 ++-
 mm/memory_hotplug.c| 13 ++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 24f662d8bd39..d0c1a71007d0 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (!info->length)
continue;
 
-   if (mhp_supports_memmap_on_memory(info->length))
-   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
+   mhp_flags |= MHP_MEMMAP_ON_MEMORY;
result = __add_memory(mgid, info->start_addr, info->length,
  mhp_flags);
 
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 013c69753c91..7d2076583494 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -97,6 +97,8 @@ typedef int __bitwise mhp_t;
  * To do so, we will use the beginning of the hot-added range to build
  * the page tables for the memmap array that describes the entire range.
  * Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on
+ * different alignment checks.
  */
 #define MHP_MEMMAP_ON_MEMORY   ((__force mhp_t)BIT(1))
 /*
@@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int 
nid,
 extern int arch_create_linear_mapping(int nid, u64 start, u64 size,
  struct mhp_params *params);
 void arch_remove_linear_mapping(u64 start, u64 size);
-extern bool mhp_supports_memmap_on_memory(unsigned long size);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #endif /* __LINUX_MEMORY_HOTPLUG_H */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7cfd13c91568..eca32ccd45cc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
return device_online(&mem->dev);
 }
 
-bool mhp_supports_memmap_on_memory(unsigned long size)
+static bool mhp_supports_memmap_on_memory(unsigned long size)
 {
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
@@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource 
*res, mhp_t mhp_flags)
 * Self hosted memmap array
 */
if (mhp_flags & MHP_MEMMAP_ON_MEMORY) {
-   if (!mhp_supports_memmap_on_memory(size)) {
-   ret = -EINVAL;
-   goto error;
+   if (mhp_supports_memmap_on_memory(size)) {
+   mhp_altmap.free = PHYS_PFN(size);
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   params.altmap = &mhp_altmap;
}
-   mhp_altmap.free = PHYS_PFN(size);
-   mhp_altmap.base_pfn = PHYS_PFN(start);
-   params.altmap = &mhp_altmap;
+   /* fallback to not using altmap  */
}
 
/* call arch's memory hotadd */
-- 
2.41.0



[PATCH v5 1/7] mm/hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig

2023-07-25 Thread Aneesh Kumar K.V
Instead of adding a menu entry with all supported architectures, add an
mm/Kconfig variable and select it from the supported architectures.

No functional change in this patch.

Acked-by: David Hildenbrand 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/arm64/Kconfig | 4 +---
 arch/x86/Kconfig   | 4 +---
 mm/Kconfig | 3 +++
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b1573257a4d6..0f749cfab8e6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -78,6 +78,7 @@ config ARM64
select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION
select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_USE_CMPXCHG_LOCKREF
select ARCH_USE_GNU_PROPERTY
select ARCH_USE_MEMTEST
@@ -347,9 +348,6 @@ config GENERIC_CSUM
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 78224aa76409..d0258e92a8af 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -102,6 +102,7 @@ config X86
select ARCH_HAS_DEBUG_WX
select ARCH_HAS_ZONE_DMA_SET if EXPERT
select ARCH_HAVE_NMI_SAFE_CMPXCHG
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
@@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES
def_bool y
depends on ARCH_ENABLE_MEMORY_HOTPLUG
 
-config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
-   def_bool y
-
 menu "Power management and ACPI options"
 
 config ARCH_HIBERNATION_HEADER
diff --git a/mm/Kconfig b/mm/Kconfig
index 5fe49c030961..721dc88423c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY
 
 endif # MEMORY_HOTPLUG
 
+config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
+   bool
+
 # Heavily threaded applications may benefit from splitting the mm-wide
 # page_table_lock, so that faults on different parts of the user address
 # space can be handled with less contention: split it at this NR_CPUS.
-- 
2.41.0



[PATCH v5 0/7] Add support for memmap on memory feature on ppc64

2023-07-25 Thread Aneesh Kumar K.V
This patch series updates the memmap on memory feature to fall back to
memmap allocation outside the memory block if the alignment rules are
not met. This makes the feature more useful on architectures like
ppc64, where the alignment rules are different with a 64K page size.

This patch series is dependent on dax vmemmap optimization series
posted here
https://lore.kernel.org/linux-mm/20230718022934.90447-1-aneesh.ku...@linux.ibm.com/

Changes from v4:
* Use altmap.free instead of altmap.reserve
* Address review feedback

Changes from v3:
* Extend the module parameter memmap_on_memory to force allocation even
  though we can waste hotplug memory.

Changes from v2:
* Rebase to latest linus tree
* Redo the series based on review feedback. Multiple changes to the patchset.

Changes from v1:
* update the memblock to store vmemmap_altmap details. This is required
so that when we remove the memory we can find the altmap details which
is needed on some architectures.
* rebase to latest linus tree



Aneesh Kumar K.V (7):
  mm/hotplug: Simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig
  mm/hotplug: Allow memmap on memory hotplug request to fallback
  mm/hotplug: Allow architecture to override memmap on memory support
check
  mm/hotplug: Support memmap_on_memory when memmap is not aligned to
pageblocks
  powerpc/book3s64/memhotplug: Enable memmap on memory for radix
  mm/hotplug: Embed vmem_altmap details in memory block
  mm/hotplug: Enable runtime update of memmap_on_memory parameter

 .../admin-guide/mm/memory-hotplug.rst |  12 ++
 arch/arm64/Kconfig|   4 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/pgtable.h|  24 +++
 .../platforms/pseries/hotplug-memory.c|   3 +-
 arch/x86/Kconfig  |   4 +-
 drivers/acpi/acpi_memhotplug.c|   3 +-
 drivers/base/memory.c |  32 ++-
 include/linux/memory.h|   8 +-
 include/linux/memory_hotplug.h|   3 +-
 mm/Kconfig|   3 +
 mm/memory_hotplug.c   | 201 ++
 12 files changed, 229 insertions(+), 69 deletions(-)

-- 
2.41.0



[PATCH v6 02/13] mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg

2023-07-24 Thread Aneesh Kumar K.V
We will use this in a later patch to do tlb flush when clearing pud entries
on powerpc. This is similar to commit 93a98695f2f9 ("mm: change
pmdp_huge_get_and_clear_full take vm_area_struct as arg")

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/pgtable.h | 4 ++--
 mm/debug_vm_pgtable.c   | 2 +-
 mm/huge_memory.c| 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5eb6bdf30c62..124427ece520 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -456,11 +456,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct 
vm_area_struct *vma,
 #endif
 
 #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL
-static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm,
+static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
int full)
 {
-   return pudp_huge_get_and_clear(mm, address, pudp);
+   return pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
 }
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index ee119e33fef1..ee2c4c1dcfc8 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct 
pgtable_debug_args *args)
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
 
 #ifndef __PAGETABLE_PMD_FOLDED
-   pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1);
+   pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1);
pud = READ_ONCE(*args->pudp);
WARN_ON(!pud_none(pud));
 #endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e0420de0e2e0..e371503f7746 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1981,7 +1981,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct 
vm_area_struct *vma,
if (!ptl)
return 0;
 
-   pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
+   pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
-- 
2.41.0



[PATCH v6 06/13] mm/huge pud: Use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE

2023-07-24 Thread Aneesh Kumar K.V
The pudp_set_wrprotect and move_huge_pud helpers are only used when
CONFIG_TRANSPARENT_HUGEPAGE is enabled. Similar to pmdp_set_wrprotect and
move_huge_pmd, use the architecture override only if
CONFIG_TRANSPARENT_HUGEPAGE is set.

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/pgtable.h | 2 ++
 mm/mremap.c | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 0af8bc4ce258..f34e0f2cb4d8 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -564,6 +564,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 #endif
 #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static inline void pudp_set_wrprotect(struct mm_struct *mm,
  unsigned long address, pud_t *pudp)
 {
@@ -577,6 +578,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm,
 {
BUILD_BUG();
 }
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 #endif
 
diff --git a/mm/mremap.c b/mm/mremap.c
index 11e06e4ab33b..056478c106ee 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct 
*vma,
 }
 #endif
 
-#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && 
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
 static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
  unsigned long new_addr, pud_t *old_pud, pud_t 
*new_pud)
 {
-- 
2.41.0



[PATCH v6 12/13] powerpc/book3s64/radix: Remove mmu_vmemmap_psize

2023-07-24 Thread Aneesh Kumar K.V
This is not used by radix anymore.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 11 ---
 arch/powerpc/mm/init_64.c| 21 ++---
 2 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e5356ac37e99..25b46058f556 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void)
 #else
mmu_virtual_psize = MMU_PAGE_4K;
 #endif
-
-#ifdef CONFIG_SPARSEMEM_VMEMMAP
-   /* vmemmap mapping */
-   if (mmu_psize_defs[MMU_PAGE_2M].shift) {
-   /*
-* map vmemmap using 2M if available
-*/
-   mmu_vmemmap_psize = MMU_PAGE_2M;
-   } else
-   mmu_vmemmap_psize = mmu_virtual_psize;
-#endif
 #endif
/*
 * initialize page table size
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 5701faca39ef..6db7a063ba63 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -198,17 +198,12 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, 
unsigned long start,
return false;
 }
 
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int 
node,
-   struct vmem_altmap *altmap)
+int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int 
node,
+struct vmem_altmap *altmap)
 {
bool altmap_alloc;
unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
 
-#ifdef CONFIG_PPC_BOOK3S_64
-   if (radix_enabled())
-   return radix__vmemmap_populate(start, end, node, altmap);
-#endif
-
/* Align to the page size of the linear mapping. */
start = ALIGN_DOWN(start, page_size);
 
@@ -277,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node,
return 0;
 }
 
+int __meminit vmemmap_populate(unsigned long start, unsigned long end, int 
node,
+  struct vmem_altmap *altmap)
+{
+
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (radix_enabled())
+   return radix__vmemmap_populate(start, end, node, altmap);
+#endif
+
+   return __vmemmap_populate(start, end, node, altmap);
+}
+
 #ifdef CONFIG_MEMORY_HOTPLUG
 static unsigned long vmemmap_list_free(unsigned long start)
 {
-- 
2.41.0



[PATCH v6 13/13] powerpc/book3s64/radix: Add debug message to give more details of vmemmap allocation

2023-07-24 Thread Aneesh Kumar K.V
Add some extra vmemmap pr_debug messages that indicate the type of
vmemmap allocation.

For example, with DAX vmemmap optimization we can see the following details:
[  187.166580] radix-mmu: PAGE_SIZE vmemmap mapping
[  187.166587] radix-mmu: PAGE_SIZE vmemmap mapping
[  187.166591] radix-mmu: Tail page reuse vmemmap mapping
[  187.166594] radix-mmu: Tail page reuse vmemmap mapping
[  187.166598] radix-mmu: Tail page reuse vmemmap mapping
[  187.166601] radix-mmu: Tail page reuse vmemmap mapping
[  187.166604] radix-mmu: Tail page reuse vmemmap mapping
[  187.166608] radix-mmu: Tail page reuse vmemmap mapping
[  187.166611] radix-mmu: Tail page reuse vmemmap mapping
[  187.166614] radix-mmu: Tail page reuse vmemmap mapping
[  187.166617] radix-mmu: Tail page reuse vmemmap mapping
[  187.166620] radix-mmu: Tail page reuse vmemmap mapping
[  187.166623] radix-mmu: Tail page reuse vmemmap mapping
[  187.166626] radix-mmu: Tail page reuse vmemmap mapping
[  187.166629] radix-mmu: Tail page reuse vmemmap mapping
[  187.166632] radix-mmu: Tail page reuse vmemmap mapping

And without vmemmap optimization
[  293.549931] radix-mmu: PMD_SIZE vmemmap mapping
[  293.549984] radix-mmu: PMD_SIZE vmemmap mapping
[  293.550032] radix-mmu: PMD_SIZE vmemmap mapping
[  293.550076] radix-mmu: PMD_SIZE vmemmap mapping
[  293.550117] radix-mmu: PMD_SIZE vmemmap mapping

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_pgtable.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 25b46058f556..59aaa30a7c0d 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -1033,6 +1033,7 @@ static pte_t * __meminit 
radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long
p = vmemmap_alloc_block_buf(PAGE_SIZE, node, 
NULL);
if (!p)
return NULL;
+   pr_debug("PAGE_SIZE vmemmap mapping\n");
} else {
/*
 * When a PTE/PMD entry is freed from the init_mm
@@ -1045,6 +1046,7 @@ static pte_t * __meminit 
radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long
 */
get_page(reuse);
p = page_to_virt(reuse);
+   pr_debug("Tail page reuse vmemmap mapping\n");
}
 
VM_BUG_ON(!PAGE_ALIGNED(addr));
@@ -1154,6 +1156,7 @@ int __meminit radix__vmemmap_populate(unsigned long 
start, unsigned long end, in
p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (p) {
vmemmap_set_pmd(pmd, p, node, addr, next);
+   pr_debug("PMD_SIZE vmemmap mapping\n");
continue;
} else if (altmap) {
/*
-- 
2.41.0



[PATCH v6 11/13] powerpc/book3s64/radix: Add support for vmemmap optimization for radix

2023-07-24 Thread Aneesh Kumar K.V
With a 2M PMD-level mapping, we require only 32 struct pages, while a single
vmemmap page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)).
Hence, with a 64K page size, we don't use vmemmap deduplication for PMD-level
mappings.
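
For reference, the arithmetic can be checked with a small user-space sketch
(illustrative only; the 64-byte sizeof(struct page) and 64K PAGE_SIZE are
assumptions here, not taken from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 64UL * 1024;		/* assumed 64K PAGE_SIZE */
	unsigned long sp = 64;				/* assumed sizeof(struct page) */
	unsigned long pmd = 2UL * 1024 * 1024;		/* 2M PMD-level mapping */
	unsigned long pud = 1UL * 1024 * 1024 * 1024;	/* 1G PUD-level mapping */

	/* struct pages needed vs. how many fit in one vmemmap page */
	printf("2M needs %lu struct pages; one vmemmap page holds %lu\n",
	       pmd / page_size, page_size / sp);
	printf("1G needs %lu struct pages -> %lu vmemmap pages\n",
	       pud / page_size, (pud / page_size) * sp / page_size);
	return 0;
}

The 2M case fits in a single vmemmap page, so there is no tail page to
share; the 1G case uses 16 vmemmap pages, which is what makes tail-page
deduplication worthwhile there.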

Signed-off-by: Aneesh Kumar K.V 
---
 Documentation/mm/vmemmap_dedup.rst |   1 +
 Documentation/powerpc/index.rst|   1 +
 Documentation/powerpc/vmemmap_dedup.rst| 101 ++
 arch/powerpc/Kconfig   |   1 +
 arch/powerpc/include/asm/book3s/64/radix.h |   9 +
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 203 +
 6 files changed, 316 insertions(+)
 create mode 100644 Documentation/powerpc/vmemmap_dedup.rst

diff --git a/Documentation/mm/vmemmap_dedup.rst 
b/Documentation/mm/vmemmap_dedup.rst
index a4b12ff906c4..c573e08b5043 100644
--- a/Documentation/mm/vmemmap_dedup.rst
+++ b/Documentation/mm/vmemmap_dedup.rst
@@ -210,6 +210,7 @@ the device (altmap).
 
 The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
 PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).
+For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst
 
 The differences with HugeTLB are relatively minor.
 
diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst
index d33b554ca7ba..a50834798454 100644
--- a/Documentation/powerpc/index.rst
+++ b/Documentation/powerpc/index.rst
@@ -36,6 +36,7 @@ powerpc
 ultravisor
 vas-api
 vcpudispatch_stats
+vmemmap_dedup
 
 features
 
diff --git a/Documentation/powerpc/vmemmap_dedup.rst 
b/Documentation/powerpc/vmemmap_dedup.rst
new file mode 100644
index ..dc4db59fdf87
--- /dev/null
+++ b/Documentation/powerpc/vmemmap_dedup.rst
@@ -0,0 +1,101 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==
+Device DAX
+==
+
+The device-dax interface uses the tail deduplication technique explained in
+Documentation/mm/vmemmap_dedup.rst
+
+On powerpc, vmemmap deduplication is only used with radix MMU translation. Also
+with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap
+deduplication.
+
+With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap
+page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no
+vmemmap deduplication possible.
+
+With 1G PUD level mapping, we require 16384 struct pages and a single 64K
+vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we
+require 16 64K pages in vmemmap to map the struct page for 1G PUD level 
mapping.
+
+Here's how things look like on device-dax after the sections are populated::
+ +---+ ---virt_to_page---> +---+   mapping to   +---+
+ |   | | 0 | -> | 0 |
+ |   | +---++---+
+ |   | | 1 | -> | 1 |
+ |   | +---++---+
+ |   | | 2 | ^ ^ ^ ^ ^ ^
+ |   | +---+   | | | | |
+ |   | | 3 | --+ | | | |
+ |   | +---+ | | | |
+ |   | | 4 | + | | |
+ |PUD| +---+   | | |
+ |   level   | | . | --+ | |
+ |  mapping  | +---+ | |
+ |   | | . | + |
+ |   | +---+   |
+ |   | | 15| --+
+ |   | +---+
+ |   |
+ |   |
+ |   |
+ +---+
+
+
+With 4K page size, 2M PMD level mapping requires 512 struct pages and a single
+4K vmemmap page contains 64 struct pages(4K/sizeof(struct page)). Hence we
+require 8 4K pages in vmemmap to map the struct page for 2M pmd level mapping.
+
+Here's how things look like on device-dax after the sections are populated::
+
+ +---+ ---virt_to_page---> +---+   mapping to   +---+
+ |   | | 0 | -> | 0 |
+ |   | +---++---+
+ |   | | 1 | -> | 1 |
+ |   | +---++---+
+ |   | | 2 | ^ ^ ^ ^ ^ ^
+ |   | +---+   | | | | |
+ |   |  

[PATCH v6 10/13] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

2023-07-24 Thread Aneesh Kumar K.V
This is in preparation for updating radix to implement vmemmap optimization
for devdax. Below are the rules w.r.t. radix vmemmap mapping:

1. First try to map things using PMD (2M)
2. With altmap if altmap cross-boundary check returns true, fall back to
   PAGE_SIZE
3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to
   PAGE_SIZE

On removing vmemmap mapping, check if every subsection that is using the
vmemmap area is invalid. If found to be invalid, that implies we can safely
free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86
because with 64K page size, we need to do the above check even at the
PAGE_SIZE granularity.
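
The three rules can be condensed into a small decision helper (a sketch
only, not code from this patch; the real logic lives in
radix__vmemmap_populate() below):

#include <stdbool.h>

enum vmemmap_map_size { VMEMMAP_MAP_PMD, VMEMMAP_MAP_PAGE };

/* Which mapping size the radix vmemmap populate path ends up using. */
static enum vmemmap_map_size pick_map_size(bool have_altmap,
					   bool altmap_crosses_boundary,
					   bool pmd_backing_available)
{
	if (have_altmap && altmap_crosses_boundary)
		return VMEMMAP_MAP_PAGE;	/* rule 2 */
	if (!pmd_backing_available)
		return VMEMMAP_MAP_PAGE;	/* rule 3 */
	return VMEMMAP_MAP_PMD;			/* rule 1: 2M mapping */
}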

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/radix.h |   2 +
 arch/powerpc/include/asm/pgtable.h |   6 +
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 325 +++--
 arch/powerpc/mm/init_64.c  |  26 +-
 4 files changed, 328 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 2ef92f36340f..f1461289643a 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -331,6 +331,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned 
long start,
 unsigned long phys);
 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
  int node, struct vmem_altmap *altmap);
+void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
+  struct vmem_altmap *altmap);
 extern void radix__vmemmap_remove_mapping(unsigned long start,
unsigned long page_size);
 
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 445a22987aa3..a4893b17705a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -157,6 +157,12 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd)
return (pgtable_t)pmd_page_vaddr(pmd);
 }
 
+#ifdef CONFIG_PPC64
+int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
+bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
+  unsigned long page_size);
+#endif /* CONFIG_PPC64 */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 227fea53c217..53f8340e390c 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -744,8 +744,58 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
p4d_clear(p4d);
 }
 
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long 
end)
+{
+   unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
+
+   return !vmemmap_populated(start, PMD_SIZE);
+}
+
+static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long 
end)
+{
+   unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+
+   return !vmemmap_populated(start, PAGE_SIZE);
+
+}
+#endif
+
+static void __meminit free_vmemmap_pages(struct page *page,
+struct vmem_altmap *altmap,
+int order)
+{
+   unsigned int nr_pages = 1 << order;
+
+   if (altmap) {
+   unsigned long alt_start, alt_end;
+   unsigned long base_pfn = page_to_pfn(page);
+
+   /*
+* with 2M vmemmap mapping we can have things set up
+* such that even though an altmap is specified we never
+* use the altmap.
+*/
+   alt_start = altmap->base_pfn;
+   alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
+
+   if (base_pfn >= alt_start && base_pfn < alt_end) {
+   vmem_altmap_free(altmap, nr_pages);
+   return;
+   }
+   }
+
+   if (PageReserved(page)) {
+   /* allocated from memblock */
+   while (nr_pages--)
+   free_reserved_page(page++);
+   } else
+   free_pages((unsigned long)page_address(page), order);
+}
+
 static void remove_pte_table(pte_t *pte_start, unsigned long addr,
-unsigned long end, bool direct)
+unsigned long end, bool direct,
+struct vmem_altmap *altmap)
 {
unsigned long next, pages = 0;
pte_t *pte;
@@ -759,24 +809,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned 
long addr,
if (!pte_present(*pte))
continue;
 
-   if (!PAGE_ALI

[PATCH v6 09/13] powerpc/book3s64/mm: Enable transparent pud hugepage

2023-07-24 Thread Aneesh Kumar K.V
This is enabled only with radix translation and 1G hugepage size. This will
be used with devdax device memory with a namespace alignment of 1G.

Anonymous transparent hugepages are not supported even though we do have
helpers checking pud_trans_huge(); we should never find those returning true.
The only expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP.

Some of the helpers are never expected to be called with hash translation
and are hence marked to call BUG() in such a case.
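
As a rough illustration of that last point (a sketch with stand-in bit
values, not the powerpc definitions):

#include <stdbool.h>

#define SK_PAGE_PTE	0x1UL	/* stand-in for _PAGE_PTE */
#define SK_PAGE_DEVMAP	0x2UL	/* stand-in for _PAGE_DEVMAP */

/* A leaf devmap pud must carry both bits; _PAGE_PTE alone (anon THP)
 * is never expected at the pud level with this series. */
static bool pud_is_expected_devmap(unsigned long pudval)
{
	return (pudval & (SK_PAGE_PTE | SK_PAGE_DEVMAP)) ==
	       (SK_PAGE_PTE | SK_PAGE_DEVMAP);
}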

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hash.h |   9 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 155 --
 arch/powerpc/include/asm/book3s/64/radix.h|  36 
 .../include/asm/book3s/64/tlbflush-radix.h|   2 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
 arch/powerpc/mm/book3s64/pgtable.c|  78 +
 arch/powerpc/mm/book3s64/radix_pgtable.c  |  28 
 arch/powerpc/mm/book3s64/radix_tlb.c  |   7 +
 arch/powerpc/platforms/Kconfig.cputype|   1 +
 include/trace/events/thp.h|  10 ++
 10 files changed, 323 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 17e7a778c856..efce6ef3e2a9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -133,7 +133,16 @@ static inline int get_region_id(unsigned long ea)
 }
 
 #definehash__pmd_bad(pmd)  (pmd_val(pmd) & H_PMD_BAD_BITS)
+
+/*
+ * pud comparison that will work with both pte and page table pointer.
+ */
+static inline int hash__pud_same(pud_t pud_a, pud_t pud_b)
+{
+   return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & 
~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0);
+}
 #definehash__pud_bad(pud)  (pud_val(pud) & H_PUD_BAD_BITS)
+
 static inline int hash__p4d_bad(p4d_t p4d)
 {
return (p4d_val(p4d) == 0);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4acc9690f599..a8204566cfd0 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte)
 {
return __pud_raw(pte_raw(pte));
 }
+
+static inline pte_t *pudp_ptep(pud_t *pud)
+{
+   return (pte_t *)pud;
+}
+
+#define pud_pfn(pud)   pte_pfn(pud_pte(pud))
+#define pud_dirty(pud) pte_dirty(pud_pte(pud))
+#define pud_young(pud) pte_young(pud_pte(pud))
+#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud)))
+#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud)))
+#define pud_mkdirty(pud)   pte_pud(pte_mkdirty(pud_pte(pud)))
+#define pud_mkclean(pud)   pte_pud(pte_mkclean(pud_pte(pud)))
+#define pud_mkyoung(pud)   pte_pud(pte_mkyoung(pud_pte(pud)))
+#define pud_mkwrite(pud)   pte_pud(pte_mkwrite(pud_pte(pud)))
 #define pud_write(pud) pte_write(pud_pte(pud))
 
+#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
+#define pud_soft_dirty(pmd)pte_soft_dirty(pud_pte(pud))
+#define pud_mksoft_dirty(pmd)  pte_pud(pte_mksoft_dirty(pud_pte(pud)))
+#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud)))
+#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
+
 static inline int pud_bad(pud_t pud)
 {
if (radix_enabled())
@@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool 
write)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot);
+extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot);
 extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
 extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
   pmd_t *pmdp, pmd_t pmd);
+extern void set_pud_at(struct mm_struct *mm, unsigned long addr,
+  pud_t *pudp, pud_t pud);
+
 static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd)
 {
 }
 
+static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
+   unsigned long addr, pud_t *pud)
+{
+}
+
 extern int hash__has_transparent_hugepage(void);
 static inline int has_transparent_hugepage(void)
 {
@@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void)
 }
 #define has_transparent_hugepage has_transparent_hugepage
 
+static inline int has_transparent_pud_hugepage(void)
+{
+   if (radix_enabled())
+   return radix__has_transparent_pud_hugepage();
+   return 0;
+}
+#define has_transparent_pud_hugepage has_transparent_pud_hugepage
+
 static inline unsigned long
 pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
unsigned long clr, unsigned long set)
@@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long 
addr,

[PATCH v6 08/13] powerpc/mm/trace: Convert trace event to trace event class

2023-07-24 Thread Aneesh Kumar K.V
A follow-up patch will add a pud variant for this same event.
Using event class makes that addition simpler.

No functional change in this patch.
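
For illustration, the pud variants added later in the series then reduce
to a couple of DEFINE_EVENT() lines, roughly:

DEFINE_EVENT(hugepage_set, hugepage_set_pud,
	TP_PROTO(unsigned long addr, unsigned long pud),
	TP_ARGS(addr, pud)
);

DEFINE_EVENT(hugepage_update, hugepage_update_pud,
	TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set),
	TP_ARGS(addr, pud, clr, set)
);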

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/hash_pgtable.c  |  2 +-
 arch/powerpc/mm/book3s64/radix_pgtable.c |  2 +-
 include/trace/events/thp.h   | 23 ---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c 
b/arch/powerpc/mm/book3s64/hash_pgtable.c
index 51f48984abca..988948d69bc1 100644
--- a/arch/powerpc/mm/book3s64/hash_pgtable.c
+++ b/arch/powerpc/mm/book3s64/hash_pgtable.c
@@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct 
*mm, unsigned long addr
 
old = be64_to_cpu(old_be);
 
-   trace_hugepage_update(addr, old, clr, set);
+   trace_hugepage_update_pmd(addr, old, clr, set);
if (old & H_PAGE_HASHPTE)
hpte_do_hugepage_flush(mm, addr, pmdp, old);
return old;
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index e7ea492ac510..02e185d2e4d6 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -962,7 +962,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct 
*mm, unsigned long add
 #endif
 
old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
-   trace_hugepage_update(addr, old, clr, set);
+   trace_hugepage_update_pmd(addr, old, clr, set);
 
return old;
 }
diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h
index 202b3e3e67ff..a95c78b10561 100644
--- a/include/trace/events/thp.h
+++ b/include/trace/events/thp.h
@@ -8,25 +8,29 @@
 #include 
 #include 
 
-TRACE_EVENT(hugepage_set_pmd,
+DECLARE_EVENT_CLASS(hugepage_set,
 
-   TP_PROTO(unsigned long addr, unsigned long pmd),
-   TP_ARGS(addr, pmd),
+   TP_PROTO(unsigned long addr, unsigned long pte),
+   TP_ARGS(addr, pte),
TP_STRUCT__entry(
__field(unsigned long, addr)
-   __field(unsigned long, pmd)
+   __field(unsigned long, pte)
),
 
TP_fast_assign(
__entry->addr = addr;
-   __entry->pmd = pmd;
+   __entry->pte = pte;
),
 
-   TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, 
__entry->pmd)
+   TP_printk("Set page table entry with 0x%lx with 0x%lx", 
__entry->addr, __entry->pte)
 );
 
+DEFINE_EVENT(hugepage_set, hugepage_set_pmd,
+   TP_PROTO(unsigned long addr, unsigned long pmd),
+   TP_ARGS(addr, pmd)
+);
 
-TRACE_EVENT(hugepage_update,
+DECLARE_EVENT_CLASS(hugepage_update,
 
TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, 
unsigned long set),
TP_ARGS(addr, pte, clr, set),
@@ -48,6 +52,11 @@ TRACE_EVENT(hugepage_update,
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 
0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
 );
 
+DEFINE_EVENT(hugepage_update, hugepage_update_pmd,
+   TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, 
unsigned long set),
+   TP_ARGS(addr, pmd, clr, set)
+);
+
 DECLARE_EVENT_CLASS(migration_pmd,
 
TP_PROTO(unsigned long addr, unsigned long pmd),
-- 
2.41.0



[PATCH v6 07/13] mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization

2023-07-24 Thread Aneesh Kumar K.V
Arm disabled hugetlb vmemmap optimization [1] because hugetlb vmemmap
optimization includes an update of both the permissions (writable to
read-only) and the output address (pfn) of the vmemmap ptes. That is not
supported without unmapping the pte (marking it invalid) on some
architectures.

With DAX vmemmap optimization we don't require such pte updates and
architectures can enable DAX vmemmap optimization while having hugetlb
vmemmap optimization disabled. Hence split DAX optimization support into a
different config.

s390, loongarch and riscv don't have devdax support, so the DAX config is
not enabled for them. With this change, arm64 should be able to select DAX
vmemmap optimization.

[1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable 
HUGETLB_PAGE_OPTIMIZE_VMEMMAP")

Signed-off-by: Aneesh Kumar K.V 
---
 arch/loongarch/Kconfig | 2 +-
 arch/riscv/Kconfig | 2 +-
 arch/s390/Kconfig  | 2 +-
 arch/x86/Kconfig   | 3 ++-
 fs/Kconfig | 2 +-
 include/linux/mm.h | 2 +-
 mm/Kconfig | 5 -
 7 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index e55511af4c77..537ca2a4005a 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -59,7 +59,7 @@ config LOONGARCH
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
select ARCH_WANT_LD_ORPHAN_WARN
-   select ARCH_WANT_OPTIMIZE_VMEMMAP
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_NO_INSTR
select BUILDTIME_TABLE_SORT
select COMMON_CLK
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4c07b9189c86..6943d34c1ec1 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -53,7 +53,7 @@ config RISCV
select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL
-   select ARCH_WANT_OPTIMIZE_VMEMMAP
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE
select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select BUILDTIME_TABLE_SORT if MMU
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 290b6f93b816..8ff6d1c21e38 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -127,7 +127,7 @@ config S390
select ARCH_WANTS_NO_INSTR
select ARCH_WANT_DEFAULT_BPF_JIT
select ARCH_WANT_IPC_PARSE_VERSION
-   select ARCH_WANT_OPTIMIZE_VMEMMAP
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
select BUILDTIME_TABLE_SORT
select CLONE_BACKWARDS2
select DMA_OPS if PCI
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7422db409770..78224aa76409 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -128,7 +128,8 @@ config X86
select ARCH_WANT_GENERAL_HUGETLB
select ARCH_WANT_HUGE_PMD_SHARE
select ARCH_WANT_LD_ORPHAN_WARN
-   select ARCH_WANT_OPTIMIZE_VMEMMAP   if X86_64
+   select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP   if X86_64
+   select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP   if X86_64
select ARCH_WANTS_THP_SWAP  if X86_64
select ARCH_HAS_PARANOID_L1D_FLUSH
select BUILDTIME_TABLE_SORT
diff --git a/fs/Kconfig b/fs/Kconfig
index 19975b104bc3..f3be721bab6d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -254,7 +254,7 @@ config HUGETLB_PAGE
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
def_bool HUGETLB_PAGE
-   depends on ARCH_WANT_OPTIMIZE_VMEMMAP
+   depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
 
 config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fe6741539740..f8899bda941a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3650,7 +3650,7 @@ void vmemmap_free(unsigned long start, unsigned long end,
 #endif
 
 #define VMEMMAP_RESERVE_NR 2
-#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
+#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
 static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
  struct dev_pagemap *pgmap)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index da681dda8af1..5fe49c030961 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -486,7 +486,10 @@ config SPARSEMEM_VMEMMAP
 # Select this config option from the architecture Kconfig, if it is preferred
 # to enable the feature of HugeTLB/dev_dax vmemmap optimization.
 #
-config ARCH_WANT_OPTIMIZE_VMEMMAP
+config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP
+   bool
+
+config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
bool
 
 config HAVE_MEMBLOCK_PHYS_MAP
-- 
2.41.0



[PATCH v6 05/13] mm: Add pud_same similar to __HAVE_ARCH_P4D_SAME

2023-07-24 Thread Aneesh Kumar K.V
This helps architectures to override pmd_same and pud_same independently.
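
A minimal sketch of how an architecture header can now take over pud_same()
(the ARCH_SW_PUD_BITS mask is made up for illustration; the powerpc version
appears later in this series):

#define pud_same pud_same
static inline int pud_same(pud_t pud_a, pud_t pud_b)
{
	/* e.g. ignore software-managed bits in the comparison */
	return (pud_val(pud_a) & ~ARCH_SW_PUD_BITS) ==
	       (pud_val(pud_b) & ~ARCH_SW_PUD_BITS);
}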

Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/pgtable.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 124427ece520..0af8bc4ce258 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -699,11 +699,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
return pmd_val(pmd_a) == pmd_val(pmd_b);
 }
+#endif
 
+#ifndef pud_same
 static inline int pud_same(pud_t pud_a, pud_t pud_b)
 {
return pud_val(pud_a) == pud_val(pud_b);
 }
+#define pud_same pud_same
 #endif
 
 #ifndef __HAVE_ARCH_P4D_SAME
-- 
2.41.0



[PATCH v6 04/13] mm/vmemmap: Allow architectures to override how vmemmap optimization works

2023-07-24 Thread Aneesh Kumar K.V
Architectures like powerpc would like to use different page table allocators
and mapping mechanisms to implement vmemmap optimization. Similar to
vmemmap_populate, allow architectures to implement
vmemmap_populate_compound_pages.
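
The mechanism mirrors the existing vmemmap_populate override: an arch header
defines the symbol and provides its own implementation, roughly like this
(sketch only; the prototype mirrors the generic function in
mm/sparse-vmemmap.c):

#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages
int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
					      unsigned long start,
					      unsigned long end, int node,
					      struct dev_pagemap *pgmap);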

Signed-off-by: Aneesh Kumar K.V 
---
 mm/sparse-vmemmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a044a130405b..a2cbe44c48e1 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long 
start, unsigned long end,
return 0;
 }
 
+#ifndef vmemmap_populate_compound_pages
 /*
  * For compound pages bigger than section size (e.g. x86 1G compound
  * pages with 2M subsection size) fill the rest of sections as tail
@@ -446,6 +447,8 @@ static int __meminit 
vmemmap_populate_compound_pages(unsigned long start_pfn,
return 0;
 }
 
+#endif
+
 struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
struct dev_pagemap *pgmap)
-- 
2.41.0



[PATCH v6 03/13] mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to override

2023-07-24 Thread Aneesh Kumar K.V
dax vmemmap optimization requires a minimum of two PAGE_SIZE areas within
vmemmap such that the tail page mapping can point to the second PAGE_SIZE
area. Enforce that in the vmemmap_can_optimize() function.

Architectures like powerpc also want to enable vmemmap optimization
conditionally (only with radix MMU translation). Hence allow an architecture
override.
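
The override hook then lets an architecture wrap the generic rule, roughly
like this (a sketch of the mechanism; radix_enabled() stands for the powerpc
condition):

#define vmemmap_can_optimize vmemmap_can_optimize
static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
					struct dev_pagemap *pgmap)
{
	if (!radix_enabled())
		return false;
	return __vmemmap_can_optimize(altmap, pgmap);
}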

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/mm.h | 27 +++
 mm/mm_init.c   |  2 +-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a5d68baea231..fe6741539740 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3649,13 +3649,32 @@ void vmemmap_free(unsigned long start, unsigned long 
end,
struct vmem_altmap *altmap);
 #endif
 
+#define VMEMMAP_RESERVE_NR 2
 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP
-static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
-  struct dev_pagemap *pgmap)
+static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap,
+ struct dev_pagemap *pgmap)
 {
-   return is_power_of_2(sizeof(struct page)) &&
-   pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap;
+   unsigned long nr_pages;
+   unsigned long nr_vmemmap_pages;
+
+   if (!pgmap || !is_power_of_2(sizeof(struct page)))
+   return false;
+
+   nr_pages = pgmap_vmemmap_nr(pgmap);
+   nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT);
+   /*
+* For vmemmap optimization with DAX we need minimum 2 vmemmap
+* pages. See layout diagram in Documentation/mm/vmemmap_dedup.rst
+*/
+   return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR);
 }
+/*
+ * If we don't have an architecture override, use the generic rule
+ */
+#ifndef vmemmap_can_optimize
+#define vmemmap_can_optimize __vmemmap_can_optimize
+#endif
+
 #else
 static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap,
   struct dev_pagemap *pgmap)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index acb0ac194672..641c56fd08a2 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1020,7 +1020,7 @@ static inline unsigned long compound_nr_pages(struct 
vmem_altmap *altmap,
if (!vmemmap_can_optimize(altmap, pgmap))
return pgmap_vmemmap_nr(pgmap);
 
-   return 2 * (PAGE_SIZE / sizeof(struct page));
+   return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page));
 }
 
 static void __ref memmap_init_compound(struct page *head,
-- 
2.41.0



[PATCH v6 01/13] mm/hugepage pud: Allow arch-specific helper function to check huge page pud support

2023-07-24 Thread Aneesh Kumar K.V
Architectures like powerpc would like to enable transparent huge page pud
support only with radix translation. To support that, add a
has_transparent_pud_hugepage() helper that architectures can override.

Reviewed-by: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/pfn_devs.c | 2 +-
 include/linux/pgtable.h   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index af7d9301520c..18ad315581ca 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -100,7 +100,7 @@ static unsigned long *nd_pfn_supported_alignments(unsigned 
long *alignments)
 
if (has_transparent_hugepage()) {
alignments[1] = HPAGE_PMD_SIZE;
-   if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
+   if (has_transparent_pud_hugepage())
alignments[2] = HPAGE_PUD_SIZE;
}
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5f36c055794b..5eb6bdf30c62 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1505,6 +1505,9 @@ typedef unsigned int pgtbl_mod_mask;
 #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE)
 #endif
 
+#ifndef has_transparent_pud_hugepage
+#define has_transparent_pud_hugepage() 
IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+#endif
 /*
  * On some architectures it depends on the mm if the p4d/pud or pmd
  * layer of the page table hierarchy is folded or not.
-- 
2.41.0



[PATCH v6 00/13] Add support for DAX vmemmap optimization for ppc64

2023-07-24 Thread Aneesh Kumar K.V
This patch series implements changes required to support DAX vmemmap
optimization for ppc64. The vmemmap optimization is only enabled with radix MMU
translation and 1GB PUD mapping with 64K page size. The patch series also
splits hugetlb vmemmap optimization out as a separate Kconfig option so that
architectures can enable DAX vmemmap optimization without enabling hugetlb
vmemmap optimization. This should let architectures like arm64 adopt DAX
vmemmap optimization even though they cannot enable hugetlb vmemmap
optimization. More details are in the patch "mm/vmemmap optimization: Split
hugetlb and devdax vmemmap optimization".

Changes from v5:
* rebase to mm-unstable branch

Changes from v4:
* Address review feedback
* Add the Reviewed-by:

Changes from v3:
* Rebase to latest linus tree
* Build fix with SPARSEMEM_VMEMMP disabled
* Add hash_pud_same outside THP Kconfig

Changes from v2:
* Rebase to latest linus tree
* Address review feedback

Changes from V1:
* Fix make htmldocs warning
* Fix vmemmap allocation bugs with different alignment values.
* Correctly check for section validity before we free the vmemmap area



Aneesh Kumar K.V (13):
  mm/hugepage pud: Allow arch-specific helper function to check huge
page pud support
  mm: Change pudp_huge_get_and_clear_full take vm_area_struct as arg
  mm/vmemmap: Improve vmemmap_can_optimize and allow architectures to
override
  mm/vmemmap: Allow architectures to override how vmemmap optimization
works
  mm: Add pud_same similar to __HAVE_ARCH_P4D_SAME
  mm/huge pud: Use transparent huge pud helpers only with
CONFIG_TRANSPARENT_HUGEPAGE
  mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization
  powerpc/mm/trace: Convert trace event to trace event class
  powerpc/book3s64/mm: Enable transparent pud hugepage
  powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap
handling function
  powerpc/book3s64/radix: Add support for vmemmap optimization for radix
  powerpc/book3s64/radix: Remove mmu_vmemmap_psize
  powerpc/book3s64/radix: Add debug message to give more details of
vmemmap allocation

 Documentation/mm/vmemmap_dedup.rst|   1 +
 Documentation/powerpc/index.rst   |   1 +
 Documentation/powerpc/vmemmap_dedup.rst   | 101 
 arch/loongarch/Kconfig|   2 +-
 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/book3s/64/hash.h |   9 +
 arch/powerpc/include/asm/book3s/64/pgtable.h  | 155 -
 arch/powerpc/include/asm/book3s/64/radix.h|  47 ++
 .../include/asm/book3s/64/tlbflush-radix.h|   2 +
 arch/powerpc/include/asm/book3s/64/tlbflush.h |   8 +
 arch/powerpc/include/asm/pgtable.h|   6 +
 arch/powerpc/mm/book3s64/hash_pgtable.c   |   2 +-
 arch/powerpc/mm/book3s64/pgtable.c|  78 +++
 arch/powerpc/mm/book3s64/radix_pgtable.c  | 572 --
 arch/powerpc/mm/book3s64/radix_tlb.c  |   7 +
 arch/powerpc/mm/init_64.c |  37 +-
 arch/powerpc/platforms/Kconfig.cputype|   1 +
 arch/riscv/Kconfig|   2 +-
 arch/s390/Kconfig |   2 +-
 arch/x86/Kconfig  |   3 +-
 drivers/nvdimm/pfn_devs.c |   2 +-
 fs/Kconfig|   2 +-
 include/linux/mm.h|  29 +-
 include/linux/pgtable.h   |  12 +-
 include/trace/events/thp.h|  33 +-
 mm/Kconfig|   5 +-
 mm/debug_vm_pgtable.c |   2 +-
 mm/huge_memory.c  |   2 +-
 mm/mm_init.c  |   2 +-
 mm/mremap.c   |   2 +-
 mm/sparse-vmemmap.c   |   3 +
 31 files changed, 1049 insertions(+), 82 deletions(-)
 create mode 100644 Documentation/powerpc/vmemmap_dedup.rst

-- 
2.41.0



Re: [PATCH v5 10/13] powerpc/book3s64/vmemmap: Switch radix to use a different vmemmap handling function

2023-07-24 Thread Aneesh Kumar K.V
"Aneesh Kumar K.V"  writes:

> This is in preparation to update radix to implement vmemmap optimization
> for devdax. Below are the rules w.r.t radix vmemmap mapping
>
> 1. First try to map things using PMD (2M)
> 2. With altmap if altmap cross-boundary check returns true, fall back to
>PAGE_SIZE
> 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fallback to
>PAGE_SIZE
>
> On removing vmemmap mapping, check if every subsection that is using the
> vmemmap area is invalid. If found to be invalid, that implies we can safely
> free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86
> because with 64K page size, we need to do the above check even at the
> PAGE_SIZE granularity.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |   2 +
>  arch/powerpc/include/asm/pgtable.h |   4 +
>  arch/powerpc/mm/book3s64/radix_pgtable.c   | 326 +++--
>  arch/powerpc/mm/init_64.c  |  26 +-
>  4 files changed, 327 insertions(+), 31 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 2ef92f36340f..f1461289643a 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -331,6 +331,8 @@ extern int __meminit 
> radix__vmemmap_create_mapping(unsigned long start,
>unsigned long phys);
>  int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end,
> int node, struct vmem_altmap *altmap);
> +void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
> +struct vmem_altmap *altmap);
>  extern void radix__vmemmap_remove_mapping(unsigned long start,
>   unsigned long page_size);
>  
> diff --git a/arch/powerpc/include/asm/pgtable.h 
> b/arch/powerpc/include/asm/pgtable.h
> index 6a88bfdaa69b..68817ea7f994 100644
> --- a/arch/powerpc/include/asm/pgtable.h
> +++ b/arch/powerpc/include/asm/pgtable.h
> @@ -165,6 +165,10 @@ static inline bool is_ioremap_addr(const void *x)
>  
>   return addr >= IOREMAP_BASE && addr < IOREMAP_END;
>  }
> +
> +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
> vmemmap_map_size);
> +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
> +unsigned long page_size);
>  #endif /* CONFIG_PPC64 */
>  
>  #endif /* __ASSEMBLY__ */
> diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
> b/arch/powerpc/mm/book3s64/radix_pgtable.c
> index 227fea53c217..9a7f3707b6fb 100644
> --- a/arch/powerpc/mm/book3s64/radix_pgtable.c
> +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
> @@ -744,8 +744,59 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
>   p4d_clear(p4d);
>  }
>  
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned 
> long end)
> +{
> + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
> +
> + return !vmemmap_populated(start, PMD_SIZE);
> +}
> +
> +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned 
> long end)
> +{
> + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
> +
> + return !vmemmap_populated(start, PAGE_SIZE);
> +
> +}
> +#endif
> +
> +static void __meminit free_vmemmap_pages(struct page *page,
> +  struct vmem_altmap *altmap,
> +  int order)
> +{
> + unsigned int nr_pages = 1 << order;
> +
> + if (altmap) {
> + unsigned long alt_start, alt_end;
> + unsigned long base_pfn = page_to_pfn(page);
> +
> + /*
> +  * with 2M vmemmap mmaping we can have things setup
> +  * such that even though atlmap is specified we never
> +  * used altmap.
> +  */
> + alt_start = altmap->base_pfn;
> + alt_end = altmap->base_pfn + altmap->reserve +
> + altmap->free + altmap->alloc + altmap->align;
> +
> + if (base_pfn >= alt_start && base_pfn < alt_end) {
> + vmem_altmap_free(altmap, nr_pages);
> + return;
> + }
> + }
> +

Please take this diff on top of this patch when adding this series to
-mm .

commit 613569d9517be60611a86bf4b9821b150c4c4954
Author: Aneesh Kumar K.V 
Date:   Mon Jul 24 22:49:29 2023 +0530

powerpc/mm/altmap: Fix altmap boundary check

altmap->free 

[PATCH] powerpc/mm/altmap: Fix altmap boundary check

2023-07-24 Thread Aneesh Kumar K.V
altmap->free includes the entire free space from which altmap blocks
can be allocated. So when checking whether the kernel is freeing an
altmap block, compute the boundary correctly.
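
The space being accounted can be pictured like this (illustrative, derived
from the struct vmem_altmap field descriptions):

/*
 *  base_pfn
 *     |<-- reserve -->|<----------------- free ----------------->|
 *                     |<-- alloc + align (consumed so far) -->|
 *
 * 'alloc' and 'align' are accounted out of 'free', so the end of the
 * altmap space is base_pfn + reserve + free, not
 * base_pfn + reserve + free + alloc + align.
 */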

Cc: David Hildenbrand 
Cc: Dan Williams 
Fixes: 9ef34630a461 ("powerpc/mm: Fallback to RAM if the altmap is unusable")
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/init_64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index fe1b83020e0d..0ec5b45b1e86 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -314,8 +314,7 @@ void __ref vmemmap_free(unsigned long start, unsigned long 
end,
start = ALIGN_DOWN(start, page_size);
if (altmap) {
alt_start = altmap->base_pfn;
-   alt_end = altmap->base_pfn + altmap->reserve +
- altmap->free + altmap->alloc + altmap->align;
+   alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
}
 
pr_debug("vmemmap_free %lx...%lx\n", start, end);
-- 
2.41.0



Re: [PATCH v4 4/6] mm/hotplug: Allow pageblock alignment via altmap reservation

2023-07-24 Thread Aneesh Kumar K.V
David Hildenbrand  writes:

> On 24.07.23 18:02, Aneesh Kumar K V wrote:
>> On 7/24/23 9:11 PM, David Hildenbrand wrote:
>>> On 24.07.23 17:16, Aneesh Kumar K V wrote:
>>>
>
> /*
>    * In "forced" memmap_on_memory mode, we always align the vmemmap size 
> up to cover
>    * full pageblocks. That way, we can add memory even if the vmemmap 
> size is not properly
>    * aligned, however, we might waste memory.
>    */

 I am finding that confusing. We do want things to be pageblock_nr_pages 
 aligned both ways.
 With MEMMAP_ON_MEMORY_FORCE, we do that by allocating more space for 
 memmap and
 in the default case we do that by making sure only memory blocks of 
 specific size supporting
 that alignment can use MEMMAP_ON_MEMORY feature.
>>>
>>> See the usage inm hp_supports_memmap_on_memory(), I guess that makes sense 
>>> then.
>>>
>>> But if you have any ideas on how to clarify that (terminology), I'm all 
>>> ears!
>>>
>> 
>> 
>> I updated the commit message
>> 
>> mm/hotplug: Support memmap_on_memory when memmap is not aligned to pageblocks
>> 
>> Currently, memmap_on_memory feature is only supported with memory block
>> sizes that result in vmemmap pages covering full page blocks. This is
>> because memory onlining/offlining code requires applicable ranges to be
>> pageblock-aligned, for example, to set the migratetypes properly.
>> 
>> This patch helps to lift that restriction by reserving more pages than
>> required for vmemmap space. This helps to align the start addr to be
>> page block aligned with different memory block sizes. This implies the
>> kernel will be reserving some pages for every memoryblock. This also
>> allows the memmap on memory feature to be widely useful with different
>> memory block size values.
>> 
>> For ex: with 64K page size and 256MiB memory block size, we require 4
>> pages to map vmemmap pages, To align things correctly we end up adding a
>> reserve of 28 pages. ie, for every 4096 pages 28 pages get reserved.
>> 
>> 
>
> Much better.
>
>> Also while implementing your  suggestion to use 
>> memory_block_memmap_on_memory_size()
>> I am finding it not really useful because in mhp_supports_memmap_on_memory() 
>> we are checking
>> if remaining_size is pageblock_nr_pages aligned (dax_kmem may want to use 
>> that helper
>> later).
>
> Let's focus on this patchset here first.
>
> Factoring out how manye memmap pages we actually need vs. how many pages 
> we need when aligning up sound very reasonable to me.
>
>
> Can you elaborate what the problem is?
>
>> Also I still think altmap.reserve is easier because of the start_pfn 
>> calculation.
>> (more on this below)
>
> Can you elaborate? Do you mean the try_remove_memory() change?
>
>> 
>> 
>>> [...]
>>>
>> +    return arch_supports_memmap_on_memory(size);
>>     }
>>       /*
>> @@ -1311,7 +1391,11 @@ int __ref add_memory_resource(int nid, struct 
>> resource *res, mhp_t mhp_flags)
>>     {
>>     struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
>>     enum memblock_flags memblock_flags = MEMBLOCK_NONE;
>> -    struct vmem_altmap mhp_altmap = {};
>> +    struct vmem_altmap mhp_altmap = {
>> +    .base_pfn =  PHYS_PFN(res->start),
>> +    .end_pfn  =  PHYS_PFN(res->end),
>> +    .reserve  = memory_block_align_base(resource_size(res)),
>
> Can you remind me why we have to set reserve here at all?
>
> IOW, can't we simply set
>
> .free = memory_block_memmap_on_memory_size();
>
> end then pass
>
> mhp_altmap.alloc + mhp_altmap.free
>
> to create_memory_block_devices() instead?
>

 But with the dax usage of altmap, altmap->reserve is what we use to 
 reserve things to get
 the required alignment. One difference is where we allocate the struct 
 page at. For this specific
 case it should not matter.

 static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap 
 *altmap)
 {
  return altmap->base_pfn + altmap->reserve + altmap->alloc
      + altmap->align;
 }

 And other is where we online a memory block

 We find the start pfn using mem->altmap->alloc + mem->altmap->reserve;

 Considering altmap->reserve is what dax pfn_dev use, is there a reason you 
 want to use altmap->free for this?
>>>
>>> "Reserve" is all about "reserving that much memory for driver usage".
>>>
>>> We don't care about that. We simply want vmemmap allocations coming from 
>>> the pageblock(s) we set aside. Where exactly, we don't care.
>>>
 I find it confusing to update free when we haven't allocated any altmap 
 blocks yet.
>>>
>>> "
>>> @reserve: pages mapped, but reserved for driver use (relative to @base)"
>>> @free: free pages set aside in the mapping for memmap storage
>>> @alloc: track pages consumed, private to vmemmap_populate()
>>> "
>>>
>>> To 

[PATCH] mm/hotplug: Enable runtime update of memmap_on_memory parameter

2023-07-21 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
This is dependent on patches posted at
https://lore.kernel.org/linux-mm/20230718024409.95742-1-aneesh.ku...@linux.ibm.com/

 mm/memory_hotplug.c | 27 +++
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a8adbe030f9..21a5113fe9be 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -86,7 +86,12 @@ static int set_memmap_mode(const char *val, const struct 
kernel_param *kp)
mode =  MEMMAP_ON_MEMORY_DISABLE;
 
 matched:
+   /*
+* Avoid changing memmap mode during hotplug.
+*/
+   get_online_mems();
*((int *)kp->arg) =  mode;
+   put_online_mems();
if (mode == MEMMAP_ON_MEMORY_FORCE) {
pr_info("Memory hotplug will reserve %ld pages in each memory 
block\n",
memory_block_align_base(memory_block_size_bytes()));
@@ -108,7 +113,7 @@ static const struct kernel_param_ops memmap_mode_ops = {
.set = set_memmap_mode,
.get = get_memmap_mode,
 };
-module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444);
+module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0644);
 MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory 
hotplug\n"
"With value \"force\" it could result in memory wastage due to memmap 
size limitations \n"
"For example, if the memmap for a memory block requires 1 MiB, but the 
pageblock \n"
@@ -2162,18 +2167,16 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
 * the same granularity it was added - a single memory block.
 */
-   if (mhp_memmap_on_memory()) {
-   ret = walk_memory_blocks(start, size, &mhp_altmap,
-get_vmemmap_altmap_cb);
-   if (ret) {
-   if (size != memory_block_size_bytes()) {
-   pr_warn("Refuse to remove %#llx - %#llx,"
-   "wrong granularity\n",
-   start, start + size);
-   return -EINVAL;
-   }
-   altmap = &mhp_altmap;
+   ret = walk_memory_blocks(start, size, &mhp_altmap,
+get_vmemmap_altmap_cb);
+   if (ret) {
+   if (size != memory_block_size_bytes()) {
+   pr_warn("Refuse to remove %#llx - %#llx,"
+   "wrong granularity\n",
+   start, start + size);
+   return -EINVAL;
}
+   altmap = &mhp_altmap;
}
 
/* remove memmap entry */
-- 
2.41.0



Re: [PATCH v3 04/13] powerpc: assert_pte_locked() use pte_offset_map_nolock()

2023-07-18 Thread Aneesh Kumar K.V
Hugh Dickins  writes:

> Instead of pte_lockptr(), use the recently added pte_offset_map_nolock()
> in assert_pte_locked().  BUG if pte_offset_map_nolock() fails: this is
> stricter than the previous implementation, which skipped when pmd_none()
> (with a comment on khugepaged collapse transitions): but wouldn't we want
> to know, if an assert_pte_locked() caller can be racing such transitions?
>

The reason we had that pmd_none check there was to handle khugepaged. In
the case of khugepaged we do pmdp_collapse_flush and then do a ptep_clear.
ppc64 had the assert_pte_locked check inside that ptep_clear.

_pmd = pmdp_collapse_flush(vma, address, pmd);
..
ptep_clear()
-> assert_pte_locked()
---> pmd_none
-> BUG


The problem is how assert_pte_locked() verifies whether we are holding the
ptl. It does that by walking the page table again, and in this specific
case, by the time we call the function we have already cleared the pmd.
>
> This mod might cause new crashes: which either expose my ignorance, or
> indicate issues to be fixed, or limit the usage of assert_pte_locked().
>
> Signed-off-by: Hugh Dickins 
> ---
>  arch/powerpc/mm/pgtable.c | 16 ++--
>  1 file changed, 6 insertions(+), 10 deletions(-)
>
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index cb2dcdb18f8e..16b061af86d7 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -311,6 +311,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned 
> long addr)
>   p4d_t *p4d;
>   pud_t *pud;
>   pmd_t *pmd;
> + pte_t *pte;
> + spinlock_t *ptl;
>  
>   if (mm == _mm)
>   return;
> @@ -321,16 +323,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned 
> long addr)
>   pud = pud_offset(p4d, addr);
>   BUG_ON(pud_none(*pud));
>   pmd = pmd_offset(pud, addr);
> - /*
> -  * khugepaged to collapse normal pages to hugepage, first set
> -  * pmd to none to force page fault/gup to take mmap_lock. After
> -  * pmd is set to none, we do a pte_clear which does this assertion
> -  * so if we find pmd none, return.
> -  */
> - if (pmd_none(*pmd))
> - return;
> - BUG_ON(!pmd_present(*pmd));
> - assert_spin_locked(pte_lockptr(mm, pmd));
> + pte = pte_offset_map_nolock(mm, pmd, addr, &ptl);
> + BUG_ON(!pte);
> + assert_spin_locked(ptl);
> + pte_unmap(pte);
>  }
>  #endif /* CONFIG_DEBUG_VM */
>  
> -- 
> 2.35.3


[PATCH v4 6/6] mm/hotplug: Embed vmem_altmap details in memory block

2023-07-17 Thread Aneesh Kumar K.V
With memmap on memory, some architectures need more details w.r.t. the
altmap, such as base_pfn, end_pfn, etc., to unmap the vmemmap memory.
Instead of computing them again when we remove a memory block, embed the
vmem_altmap details in struct memory_block if we are using the
memmap-on-memory feature.

No functional change in this patch

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/base/memory.c  | 32 +++-
 include/linux/memory.h |  8 ++--
 mm/memory_hotplug.c| 38 ++
 3 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index b456ac213610..cef6506f0209 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -106,6 +106,7 @@ static void memory_block_release(struct device *dev)
 {
struct memory_block *mem = to_memory_block(dev);
 
+   kfree(mem->altmap);
kfree(mem);
 }
 
@@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
struct zone *zone;
int ret;
 
@@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem)
 * stage helps to keep accounting easier to follow - e.g vmemmaps
 * belong to the same zone as the memory they backed.
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->alloc + mem->altmap->reserve;
+
if (nr_vmemmap_pages) {
ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, 
zone);
if (ret)
@@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem)
 {
unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
-   unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+   unsigned long nr_vmemmap_pages = 0;
int ret;
 
if (!mem->zone)
@@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem)
 * Unaccount before offlining, such that unpopulated zone and kthreads
 * can properly be torn down in offline_pages().
 */
+   if (mem->altmap)
+   nr_vmemmap_pages = mem->altmap->alloc + mem->altmap->reserve;
+
if (nr_vmemmap_pages)
adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
  -nr_vmemmap_pages);
@@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid,
 #endif
 
 static int add_memory_block(unsigned long block_id, unsigned long state,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
struct memory_block *mem;
@@ -744,7 +751,14 @@ static int add_memory_block(unsigned long block_id, 
unsigned long state,
mem->start_section_nr = block_id * sections_per_block;
mem->state = state;
mem->nid = NUMA_NO_NODE;
-   mem->nr_vmemmap_pages = nr_vmemmap_pages;
+   if (altmap) {
+   mem->altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL);
+   if (!mem->altmap) {
+   kfree(mem);
+   return -ENOMEM;
+   }
+   memcpy(mem->altmap, altmap, sizeof(*altmap));
+   }
INIT_LIST_HEAD(&mem->group_next);
 
 #ifndef CONFIG_NUMA
@@ -783,14 +797,14 @@ static int __init add_boot_memory_block(unsigned long 
base_section_nr)
if (section_count == 0)
return 0;
return add_memory_block(memory_block_id(base_section_nr),
-   MEM_ONLINE, 0,  NULL);
+   MEM_ONLINE, NULL,  NULL);
 }
 
 static int add_hotplug_memory_block(unsigned long block_id,
-   unsigned long nr_vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
-   return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group);
+   return add_memory_block(block_id, MEM_OFFLINE, altmap, group);
 }
 
 static void remove_memory_block(struct memory_block *memory)
@@ -818,7 +832,7 @@ static void remove_memory_block(struct memory_block *memory)
  * Called under device_hotplug_lock.
  */
 int create_memory_block_devices(unsigned long start, unsigned long size,
-   unsigned long vmemmap_pages,
+   struct vmem_altmap *altmap,
struct memory_group *group)
 {
const unsigned long start_block_id = pfn_to_blo

[PATCH v4 5/6] powerpc/book3s64/memhotplug: Enable memmap on memory for radix

2023-07-17 Thread Aneesh Kumar K.V
Radix vmemmap mapping can map things correctly at the PMD level or PTE
level based on different device boundary checks. Hence we skip the
restriction that the vmemmap size be a multiple of PMD_SIZE. This also
makes the feature widely useful, because using a PMD_SIZE vmemmap area
would require a memory block size of 2GiB.

We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature can
work with a memory block size of 256MB, using the altmap.reserve feature
to align things correctly at pageblock granularity. We can end up losing
some pages in memory with this. For ex: with a 256MiB memory block size,
we require 4 pages to map vmemmap pages. In order to align things
correctly we end up adding a reserve of 28 pages, i.e., for every 4096
pages 28 pages get reserved.
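
The 28-page number falls out of pageblock alignment; a quick sketch of the
arithmetic (illustrative; the 64-byte sizeof(struct page) and the 32-page
(2M) pageblock are assumptions for radix with 64K pages):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 64UL * 1024;		/* 64K pages */
	unsigned long block = 256UL * 1024 * 1024;	/* 256MiB memory block */
	unsigned long sp = 64;				/* assumed sizeof(struct page) */
	unsigned long pageblock_pages = 32;		/* assumed 2M pageblock */

	unsigned long nr_pages = block / page_size;		/* 4096 */
	unsigned long vmemmap = nr_pages * sp / page_size;	/* 4 */
	/* vmemmap fits within one pageblock here */
	unsigned long reserve = pageblock_pages - vmemmap;	/* 28 */

	printf("vmemmap %lu + reserve %lu = one %lu-page pageblock\n",
	       vmemmap, reserve, pageblock_pages);
	return 0;
}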

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/pgtable.h| 24 +++
 .../platforms/pseries/hotplug-memory.c|  3 ++-
 mm/memory_hotplug.c   |  2 ++
 4 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 116d6add0bb0..f890907e5bbf 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -157,6 +157,7 @@ config PPC
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
+   select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 68817ea7f994..3d35371395a9 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -169,6 +169,30 @@ static inline bool is_ioremap_addr(const void *x)
 int __meminit vmemmap_populated(unsigned long vmemmap_addr, int 
vmemmap_map_size);
 bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start,
   unsigned long page_size);
+/*
+ * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into the details
+ * of some of the restrictions. We don't check for PMD_SIZE because our
+ * vmemmap allocation code can fallback correctly. The pageblock
+ * alignment requirement is met using altmap->reserve blocks.
+ */
+#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory
+static inline bool arch_supports_memmap_on_memory(unsigned long size)
+{
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+   unsigned long vmemmap_size = nr_pages * sizeof(struct page);
+
+   if (!radix_enabled())
+   return false;
+
+   if (IS_ENABLED(CONFIG_PPC_4K_PAGES))
+   return IS_ALIGNED(vmemmap_size, PMD_SIZE);
+   /*
+* The pageblock alignment requirement is met by using
+* reserve blocks in altmap.
+*/
+   return true;
+}
+
 #endif /* CONFIG_PPC64 */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9c62c2c3b3d0..1447509357a7 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -617,6 +617,7 @@ static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, 
u32 drc_index)
 
 static int dlpar_add_lmb(struct drmem_lmb *lmb)
 {
+   mhp_t mhp_flags = MHP_NONE | MHP_MEMMAP_ON_MEMORY;
unsigned long block_sz;
int nid, rc;
 
@@ -637,7 +638,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = first_online_node;
 
/* Add the memory */
-   rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE);
+   rc = __add_memory(nid, lmb->base_addr, block_sz, mhp_flags);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index c409f5ff6a59..6da063c80733 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -2174,6 +2174,8 @@ static int __ref try_remove_memory(u64 start, u64 size)
 * right thing if we used vmem_altmap when hot-adding
 * the range.
 */
+   mhp_altmap.base_pfn = PHYS_PFN(start);
+   mhp_altmap.free = PHYS_PFN(size) - nr_vmemmap_pages;
mhp_altmap.alloc = nr_vmemmap_pages;
altmap = &mhp_altmap;
}
-- 
2.41.0


