Re: [RFC PATCH 2/5] mm, arch: unify vmemmap_populate altmap handling

2017-07-31 Thread Michal Hocko
On Mon 31-07-17 16:27:46, Gerald Schaefer wrote:
> On Mon, 31 Jul 2017 14:55:56 +0200
> Michal Hocko  wrote:
> 
> > On Mon 31-07-17 14:40:53, Gerald Schaefer wrote:
> > [...]
> > > > @@ -247,12 +248,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
> > > >  * use large frames even if they are only partially
> > > >  * used.
> > > >  * Otherwise we would have also page tables since
> > > > -* vmemmap_populate gets called for each section
> > > > +* __vmemmap_populate gets called for each section
> > > >  * separately. */
> > > > if (MACHINE_HAS_EDAT1) {
> > > > void *new_page;
> > > > 
> > > > -   new_page = vmemmap_alloc_block(PMD_SIZE, node);
> > > > +   new_page = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > > > if (!new_page)
> > > > goto out;
> > > > pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
> > > 
> > > There is another call to vmemmap_alloc_block() in this function, a couple
> > > of lines below; this should also be replaced by __vmemmap_alloc_block_buf().
> > 
> > I've noticed that one, but in general I have only transformed PMD
> > mappings because we shouldn't even get to the pte level if the former
> > works, AFAICS. Memory sections should always be 2MB aligned unless I am
> > missing something. Or is this not true?
> 
> vmemmap_populate() on s390 will only stop at pmd level if we have HW
> support for large pages (MACHINE_HAS_EDAT1). In that case we will allocate
> a PMD_SIZE block with vmemmap_alloc_block() and map it on pmd level as
> a large page.
> 
> Without HW large page support, we will continue to allocate a pte page,
> populate the pmd entry with that, and fall through to the pte_none()
> check below, with its PAGE_SIZE vmemmap_alloc_block() allocation. In this
> case we should use __vmemmap_alloc_block_buf().

OK, I see. I had assumed s390 supports large pages in general. I will
fold this in. Thanks!
---
commit df13e3a1237c3fef399e26b0f5a015715df12ede
Author: Michal Hocko 
Date:   Mon Jul 31 16:34:18 2017 +0200

fold me "mm, arch: unify vmemmap_populate altmap handling"

- use altmap even for ptes in case the HW doesn't support large pages
  as per Gerald Schaefer

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 07120bc137a1..764b6393e66c 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -273,7 +273,7 @@ int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node
if (pte_none(*pt_dir)) {
void *new_page;
 
-   new_page = vmemmap_alloc_block(PAGE_SIZE, node);
+   new_page = __vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!new_page)
goto out;
pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
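
For reference, this substitution stays a no-op whenever no altmap is in
play, because the altmap-aware allocator falls back to the plain buffered
allocation. A minimal sketch of that behavior (the altmap_alloc_block_buf()
helper name is an assumption; only __vmemmap_alloc_block_buf() itself
appears in this thread):

	void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node,
						   struct vmem_altmap *altmap)
	{
		if (altmap)
			/* carve the memmap out of the device-provided reservation */
			return altmap_alloc_block_buf(size, altmap);
		/* no altmap: regular bootmem/page-allocator backed buffer */
		return vmemmap_alloc_block_buf(size, node);
	}
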
-- 
Michal Hocko
SUSE Labs


Re: [RFC PATCH 2/5] mm, arch: unify vmemmap_populate altmap handling

2017-07-31 Thread Gerald Schaefer
On Mon, 31 Jul 2017 14:55:56 +0200
Michal Hocko  wrote:

> On Mon 31-07-17 14:40:53, Gerald Schaefer wrote:
> [...]
> > > @@ -247,12 +248,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
> > >* use large frames even if they are only partially
> > >* used.
> > >* Otherwise we would have also page tables since
> > > -  * vmemmap_populate gets called for each section
> > > +  * __vmemmap_populate gets called for each section
> > >* separately. */
> > >   if (MACHINE_HAS_EDAT1) {
> > >   void *new_page;
> > > 
> > > - new_page = vmemmap_alloc_block(PMD_SIZE, node);
> > > + new_page = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > >   if (!new_page)
> > >   goto out;
> > >   pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
> > 
> > There is another call to vmemmap_alloc_block() in this function, a couple
> > of lines below; this should also be replaced by __vmemmap_alloc_block_buf().
> 
> I've noticed that one, but in general I have only transformed PMD
> mappings because we shouldn't even get to the pte level if the former
> works, AFAICS. Memory sections should always be 2MB aligned unless I am
> missing something. Or is this not true?

vmemmap_populate() on s390 will only stop at pmd level if we have HW
support for large pages (MACHINE_HAS_EDAT1). In that case we will allocate
a PMD_SIZE block with vmemmap_alloc_block() and map it on pmd level as
a large page.

Without HW large page support, we will continue to allocate a pte page,
populate the pmd entry with that, and fall through to the pte_none()
check below, with its PAGE_SIZE vmemmap_alloc_block() allocation. In this
case we should use __vmemmap_alloc_block_buf().
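
Condensed, that control flow looks roughly like this (a simplified
sketch of arch/s390/mm/vmem.c, not the literal kernel code):

	/* pmd level first; the pte level is only reached without EDAT1 */
	if (pmd_none(*pm_dir)) {
		if (MACHINE_HAS_EDAT1) {
			/* HW large pages: map PMD_SIZE of memmap in one go */
			void *new_page = vmemmap_alloc_block(PMD_SIZE, node);

			if (!new_page)
				goto out;
			pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
			continue;	/* next address, no pte level needed */
		}
		/* no EDAT1: hook in a pte page and fall through */
		pmd_populate(&init_mm, pm_dir, vmem_pte_alloc());
	}

	pt_dir = pte_offset_kernel(pm_dir, address);
	if (pte_none(*pt_dir)) {
		/* the PAGE_SIZE allocation that also needs the altmap */
		void *new_page = vmemmap_alloc_block(PAGE_SIZE, node);

		if (!new_page)
			goto out;
		pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
	}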

Regards,
Gerald



Re: [RFC PATCH 2/5] mm, arch: unify vmemmap_populate altmap handling

2017-07-31 Thread Michal Hocko
On Mon 31-07-17 14:40:53, Gerald Schaefer wrote:
[...]
> > @@ -247,12 +248,12 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
> >  * use large frames even if they are only partially
> >  * used.
> >  * Otherwise we would have also page tables since
> > -* vmemmap_populate gets called for each section
> > +* __vmemmap_populate gets called for each section
> >  * separately. */
> > if (MACHINE_HAS_EDAT1) {
> > void *new_page;
> > 
> > -   new_page = vmemmap_alloc_block(PMD_SIZE, node);
> > +   new_page = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
> > if (!new_page)
> > goto out;
> > pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
> 
> There is another call to vmemmap_alloc_block() in this function, a couple
> of lines below; this should also be replaced by __vmemmap_alloc_block_buf().

I've noticed that one, but in general I have only transformed PMD
mappings because we shouldn't even get to the pte level if the former
works, AFAICS. Memory sections should always be 2MB aligned unless I am
missing something. Or is this not true?
-- 
Michal Hocko
SUSE Labs


Re: [RFC PATCH 2/5] mm, arch: unify vmemmap_populate altmap handling

2017-07-31 Thread Gerald Schaefer
On Wed, 26 Jul 2017 10:33:30 +0200
Michal Hocko  wrote:

> From: Michal Hocko 
> 
> vmem_altmap allows vmemmap_populate to allocate the memmap (the struct
> page array) from an alternative allocator rather than bootmem or
> kmalloc. Only x86 currently supports altmap handling, most likely
> because only the nvdimm code uses this mechanism so far and that code
> depends on ZONE_DEVICE, which is present only for x86_64. This will
> change in follow-up patches, so we would like other architectures to
> support it as well.
> 
> Provide a generic vmemmap_populate implementation which simply resolves
> the altmap and then calls into the arch-specific __vmemmap_populate.
> Architectures then only need to use __vmemmap_alloc_block_buf to
> allocate the memmap. vmemmap_free then needs to call vmem_altmap_free
> if there is any altmap associated with the address.
> 
> This patch shouldn't introduce any functional changes because
> to_vmem_altmap always returns NULL on !x86_64.
> 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Tony Luck 
> Cc: Fenghua Yu 
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: Martin Schwidefsky 
> Cc: Heiko Carstens 
> Cc: Thomas Gleixner 
> Cc: Ingo Molnar 
> Cc: "H. Peter Anvin" 
> Cc: linuxppc-...@lists.ozlabs.org
> Cc: linux-i...@vger.kernel.org
> Cc: x...@kernel.org
> Signed-off-by: Michal Hocko 
> ---
>  arch/arm64/mm/mmu.c   |  9 ++---
>  arch/ia64/mm/discontig.c  |  4 +++-
>  arch/powerpc/mm/init_64.c | 29 -
>  arch/s390/mm/vmem.c   |  7 ---
>  arch/sparc/mm/init_64.c   |  6 +++---
>  arch/x86/mm/init_64.c |  4 ++--
>  include/linux/memremap.h  | 13 ++---
>  include/linux/mm.h| 19 ++-
>  mm/sparse-vmemmap.c   |  2 +-
>  9 files changed, 59 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 0c429ec6fde8..5de1161e7a1b 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -649,12 +649,15 @@ int kern_addr_valid(unsigned long addr)
>  }
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
>  #if !ARM64_SWAPPER_USES_SECTION_MAPS
> -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
> +int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
> +	struct vmem_altmap *altmap)
>  {
> + WARN(altmap, "altmap unsupported\n");
>   return vmemmap_populate_basepages(start, end, node);
>  }
>  #else/* !ARM64_SWAPPER_USES_SECTION_MAPS */
> -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
> +int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
> +	struct vmem_altmap *altmap)
>  {
>   unsigned long addr = start;
>   unsigned long next;
> @@ -677,7 +680,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
>   if (pmd_none(*pmd)) {
>   void *p = NULL;
> 
> - p = vmemmap_alloc_block_buf(PMD_SIZE, node);
> + p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
>   if (!p)
>   return -ENOMEM;
> 
> diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
> index 878626805369..2a939e877ced 100644
> --- a/arch/ia64/mm/discontig.c
> +++ b/arch/ia64/mm/discontig.c
> @@ -753,8 +753,10 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
>  #endif
> 
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP
> -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
> +int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
> +	struct vmem_altmap *altmap)
>  {
> + WARN(altmap, "altmap unsupported\n");
>   return vmemmap_populate_basepages(start, end, node);
>  }
> 
> diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
> index ec84b31c6c86..5ea5e870a589 100644
> --- a/arch/powerpc/mm/init_64.c
> +++ b/arch/powerpc/mm/init_64.c
> @@ -44,6 +44,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
> 
>  #include 
>  #include 
> @@ -115,7 +116,8 @@ static struct vmemmap_backing *next;
>  static int num_left;
>  static int num_freed;
> 
> -static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
> +static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node,
> + struct vmem_altmap *altmap)
>  {
>   struct vmemmap_backing *vmem_back;
>   /* get from freed entries first */
> @@ -129,7 +131,7 @@ static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
> 
>   /* allocate a page when required and hand out chunks */
>   if (!num_left) {
> - next = vmemmap_alloc_block(PAGE_SIZE, node);
> + next = __vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
>   if (unlikely(!next)) {
>   WARN_ON(1);
>   return NULL;
> @@ 

[RFC PATCH 2/5] mm, arch: unify vmemmap_populate altmap handling

2017-07-26 Thread Michal Hocko
From: Michal Hocko 

vmem_altmap allows vmemmap_populate to allocate the memmap (the struct
page array) from an alternative allocator rather than bootmem or
kmalloc. Only x86 currently supports altmap handling, most likely
because only the nvdimm code uses this mechanism so far and that code
depends on ZONE_DEVICE, which is present only for x86_64. This will
change in follow-up patches, so we would like other architectures to
support it as well.

Provide a generic vmemmap_populate implementation which simply resolves
the altmap and then calls into the arch-specific __vmemmap_populate.
Architectures then only need to use __vmemmap_alloc_block_buf to
allocate the memmap. vmemmap_free then needs to call vmem_altmap_free
if there is any altmap associated with the address.

This patch shouldn't introduce any functional changes because
to_vmem_altmap always returns NULL on !x86_64.
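
The generic entry point can be pictured roughly as follows (a sketch
reconstructed from the description above, not the literal
mm/sparse-vmemmap.c hunk, which is cut off in this archive):

	/* sketch of the generic dispatch, reconstructed from the changelog */
	int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
	{
		/* resolve a device-provided altmap covering this range, if any */
		struct vmem_altmap *altmap = to_vmem_altmap(start);

		/* arch hook; memmap allocations go through __vmemmap_alloc_block_buf() */
		return __vmemmap_populate(start, end, node, altmap);
	}

Correspondingly, the teardown side is expected to hand altmap-backed
pages back via vmem_altmap_free() instead of the page allocator.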

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Tony Luck 
Cc: Fenghua Yu 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: "H. Peter Anvin" 
Cc: linuxppc-...@lists.ozlabs.org
Cc: linux-i...@vger.kernel.org
Cc: x...@kernel.org
Signed-off-by: Michal Hocko 
---
 arch/arm64/mm/mmu.c   |  9 ++---
 arch/ia64/mm/discontig.c  |  4 +++-
 arch/powerpc/mm/init_64.c | 29 -
 arch/s390/mm/vmem.c   |  7 ---
 arch/sparc/mm/init_64.c   |  6 +++---
 arch/x86/mm/init_64.c |  4 ++--
 include/linux/memremap.h  | 13 ++---
 include/linux/mm.h| 19 ++-
 mm/sparse-vmemmap.c   |  2 +-
 9 files changed, 59 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 0c429ec6fde8..5de1161e7a1b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -649,12 +649,15 @@ int kern_addr_valid(unsigned long addr)
 }
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
 #if !ARM64_SWAPPER_USES_SECTION_MAPS
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+	struct vmem_altmap *altmap)
 {
+   WARN(altmap, "altmap unsupported\n");
return vmemmap_populate_basepages(start, end, node);
 }
 #else  /* !ARM64_SWAPPER_USES_SECTION_MAPS */
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+	struct vmem_altmap *altmap)
 {
unsigned long addr = start;
unsigned long next;
@@ -677,7 +680,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
if (pmd_none(*pmd)) {
void *p = NULL;
 
-   p = vmemmap_alloc_block_buf(PMD_SIZE, node);
+   p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
if (!p)
return -ENOMEM;
 
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 878626805369..2a939e877ced 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -753,8 +753,10 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat)
 #endif
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
-int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
+int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node,
+	struct vmem_altmap *altmap)
 {
+   WARN(altmap, "altmap unsupported\n");
return vmemmap_populate_basepages(start, end, node);
 }
 
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index ec84b31c6c86..5ea5e870a589 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -115,7 +116,8 @@ static struct vmemmap_backing *next;
 static int num_left;
 static int num_freed;
 
-static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
+static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node,
+   struct vmem_altmap *altmap)
 {
struct vmemmap_backing *vmem_back;
/* get from freed entries first */
@@ -129,7 +131,7 @@ static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
 
/* allocate a page when required and hand out chunks */
if (!num_left) {
-   next = vmemmap_alloc_block(PAGE_SIZE, node);
+   next = __vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (unlikely(!next)) {
WARN_ON(1);
return NULL;
@@ -144,11 +146,12 @@ static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node)
 
 static __meminit void vmemmap_list_populate(unsigned long phys,
unsigned long start,
-   int node)
+