Re: [PATCH v5 3/6] mm/cma: populate ZONE_CMA

2016-09-27 Thread Joonsoo Kim
On Thu, Sep 22, 2016 at 05:59:46PM +0200, Vlastimil Babka wrote:
> On 09/22/2016 08:50 AM, Joonsoo Kim wrote:
> >On Thu, Sep 22, 2016 at 02:45:46PM +0900, Joonsoo Kim wrote:
> >>>
> >>> > /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
> >>> > void __init init_cma_reserved_pageblock(struct page *page)
> >>> > {
> >>> > unsigned i = pageblock_nr_pages;
> >>> >+unsigned long pfn = page_to_pfn(page);
> >>> > struct page *p = page;
> >>> >+int nid = page_to_nid(page);
> >>> >+
> >>> >+/*
> >>> >+ * ZONE_CMA will steal present pages from other zones by changing
> >>> >+ * page links so page_zone() is changed. Before that,
> >>> >+ * we need to adjust previous zone's page count first.
> >>> >+ */
> >>> >+adjust_present_page_count(page, -pageblock_nr_pages);
> >>> >
> >>> > do {
> >>> > __ClearPageReserved(p);
> >>> > set_page_count(p, 0);
> >>> >-} while (++p, --i);
> >>> >+
> >>> >+/* Steal pages from other zones */
> >>> >+set_page_links(p, ZONE_CMA, nid, pfn);
> >>> >+} while (++p, ++pfn, --i);
> >>> >+
> >>> >+adjust_present_page_count(page, pageblock_nr_pages);
> >>>
> >>> This seems to assign pages to ZONE_CMA on the proper node, which is
> >>> good. But then ZONE_CMA on multiple nodes will have unnecessary
> >>> holes in the spanned pages, as each will contain only a subset.
> >>
> >>True, I will fix it and respin the series.
> >
> >I now realize that it's too late to send the full series for the next
> >merge window. I will send the full series after the next merge window closes.
> 
> I think there might still be an rc8, thus another week.

Indeed. I will send the full series soon.

> 
> >Anyway, I'd like to confirm that the following incremental patch will solve
> >your concern.
> 
> Yeah that should work, as long as single cma areas don't include multiple 
> nodes?

A single cma area cannot include multiple nodes, at least for now.
There is a check that each cma area sits within a single zone.

Thanks.
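
(For reference, the zone check mentioned above is the per-pageblock comparison in
cma_activate_area(); trimmed from the hunk quoted at the bottom of this thread:)

	for (j = pageblock_nr_pages; j; --j, pfn++) {
		WARN_ON_ONCE(!pfn_valid(pfn));
		/* every pfn of a CMA area must sit in the same zone */
		if (page_zone(pfn_to_page(pfn)) != zone)
			goto err;
	}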

> 
> >Thanks.
> >
> >
> >-->8--
> > mm/cma.c | 25 ++++++++++++++++---------
> > 1 file changed, 16 insertions(+), 9 deletions(-)
> >
> >diff --git a/mm/cma.c b/mm/cma.c
> >index d69bdf7..8375554 100644
> >--- a/mm/cma.c
> >+++ b/mm/cma.c
> >@@ -146,22 +146,29 @@ static int __init cma_init_reserved_areas(void)
> > {
> >int i;
> >struct zone *zone;
> >-   unsigned long start_pfn = UINT_MAX, end_pfn = 0;
> >+   pg_data_t *pgdat;
> >
> >if (!cma_area_count)
> >return 0;
> >
> >-   for (i = 0; i < cma_area_count; i++) {
> >-   if (start_pfn > cma_areas[i].base_pfn)
> >-   start_pfn = cma_areas[i].base_pfn;
> >-   if (end_pfn < cma_areas[i].base_pfn + cma_areas[i].count)
> >-   end_pfn = cma_areas[i].base_pfn + cma_areas[i].count;
> >-   }
> >+   for_each_online_pgdat(pgdat) {
> >+   unsigned long start_pfn = UINT_MAX, end_pfn = 0;
> >
> >-   for_each_zone(zone) {
> >-   if (!is_zone_cma(zone))
> >+   for (i = 0; i < cma_area_count; i++) {
> >+   if (page_to_nid(pfn_to_page(cma_areas[i].base_pfn)) !=
> 
> We have pfn_to_nid() (although the implementation is just like this).

Will fix.

Thanks.



Re: [PATCH v5 3/6] mm/cma: populate ZONE_CMA

2016-09-22 Thread Vlastimil Babka

On 09/22/2016 08:50 AM, Joonsoo Kim wrote:

On Thu, Sep 22, 2016 at 02:45:46PM +0900, Joonsoo Kim wrote:

>
> > /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
> > void __init init_cma_reserved_pageblock(struct page *page)
> > {
> >   unsigned i = pageblock_nr_pages;
> >+  unsigned long pfn = page_to_pfn(page);
> >   struct page *p = page;
> >+  int nid = page_to_nid(page);
> >+
> >+  /*
> >+   * ZONE_CMA will steal present pages from other zones by changing
> >+   * page links so page_zone() is changed. Before that,
> >+   * we need to adjust previous zone's page count first.
> >+   */
> >+  adjust_present_page_count(page, -pageblock_nr_pages);
> >
> >   do {
> >   __ClearPageReserved(p);
> >   set_page_count(p, 0);
> >-  } while (++p, --i);
> >+
> >+  /* Steal pages from other zones */
> >+  set_page_links(p, ZONE_CMA, nid, pfn);
> >+  } while (++p, ++pfn, --i);
> >+
> >+  adjust_present_page_count(page, pageblock_nr_pages);
>
> This seems to assign pages to ZONE_CMA on the proper node, which is
> good. But then ZONE_CMA on multiple nodes will have unnecessary
> holes in the spanned pages, as each will contain only a subset.

True, I will fix it and respin the series.


I now realize that it's too late to send the full series for the next
merge window. I will send the full series after the next merge window closes.


I think there might still be an rc8, thus another week.


Anyway, I'd like to confirm that the following incremental patch will solve
your concern.


Yeah that should work, as long as single cma areas don't include multiple nodes?


Thanks.


-->8--
 mm/cma.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index d69bdf7..8375554 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -146,22 +146,29 @@ static int __init cma_init_reserved_areas(void)
 {
int i;
struct zone *zone;
-   unsigned long start_pfn = UINT_MAX, end_pfn = 0;
+   pg_data_t *pgdat;

if (!cma_area_count)
return 0;

-   for (i = 0; i < cma_area_count; i++) {
-   if (start_pfn > cma_areas[i].base_pfn)
-   start_pfn = cma_areas[i].base_pfn;
-   if (end_pfn < cma_areas[i].base_pfn + cma_areas[i].count)
-   end_pfn = cma_areas[i].base_pfn + cma_areas[i].count;
-   }
+   for_each_online_pgdat(pgdat) {
+   unsigned long start_pfn = UINT_MAX, end_pfn = 0;

-   for_each_zone(zone) {
-   if (!is_zone_cma(zone))
+   for (i = 0; i < cma_area_count; i++) {
+   if (page_to_nid(pfn_to_page(cma_areas[i].base_pfn)) !=


We have pfn_to_nid() (although the implementation is just like this).


+   pgdat->node_id)
+   continue;
+
+   start_pfn = min(start_pfn, cma_areas[i].base_pfn);
+   end_pfn = max(end_pfn, cma_areas[i].base_pfn +
+   cma_areas[i].count);
+   }
+
+   if (!end_pfn)
continue;

+   zone = &pgdat->node_zones[ZONE_CMA];
+
/* ZONE_CMA doesn't need to exceed CMA region */
zone->zone_start_pfn = max(zone->zone_start_pfn, start_pfn);
zone->spanned_pages = min(zone_end_pfn(zone), end_pfn) -
			zone->zone_start_pfn;

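(A minimal sketch of that node test rewritten with pfn_to_nid(), as suggested above;
illustrative only, the respun series may of course differ:)

	for (i = 0; i < cma_area_count; i++) {
		if (pfn_to_nid(cma_areas[i].base_pfn) != pgdat->node_id)
			continue;
		/* ... accumulate this node's start_pfn/end_pfn as in the patch above ... */
	}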




Re: [PATCH v5 3/6] mm/cma: populate ZONE_CMA

2016-09-22 Thread Joonsoo Kim
On Thu, Sep 22, 2016 at 02:45:46PM +0900, Joonsoo Kim wrote:
> On Wed, Sep 21, 2016 at 11:20:11AM +0200, Vlastimil Babka wrote:
> > On 08/29/2016 07:07 AM, js1...@gmail.com wrote:
> > >From: Joonsoo Kim 
> > >
> > >Until now, reserved pages for CMA have been managed in the ordinary zones
> > >that their pfns belong to. This approach has numerous problems and fixing
> > >them isn't easy. (They are mentioned in the previous patch.) To fix this
> > >situation, ZONE_CMA was introduced in the previous patch, but it is not
> > >yet populated. This patch implements population of ZONE_CMA by stealing
> > >reserved pages from the ordinary zones.
> > >
> > >Unlike the previous implementation, where a kernel allocation request with
> > >__GFP_MOVABLE could be serviced from the CMA region, only allocation
> > >requests with GFP_HIGHUSER_MOVABLE can be serviced from the CMA region in
> > >the new approach. This is an inevitable design decision when using the
> > >zone implementation, because ZONE_CMA could contain highmem. Due to this
> > >decision, ZONE_CMA will work like ZONE_HIGHMEM or ZONE_MOVABLE.
> > >
> > >I don't think this would be a problem, because most file cache pages and
> > >anonymous pages are requested with GFP_HIGHUSER_MOVABLE. This is supported
> > >by the fact that there are many systems with ZONE_HIGHMEM and they work
> > >fine. A notable disadvantage is that we cannot use these pages for the
> > >blockdev file cache, because it usually has __GFP_MOVABLE but not
> > >__GFP_HIGHMEM and __GFP_USER. But there are pros and cons here: in my
> > >experience, blockdev file cache pages are one of the top reasons that
> > >cause cma_alloc() to fail temporarily, so we can get a stronger guarantee
> > >of cma_alloc() success by discarding that case.
> > >
> > >The implementation itself is easy to understand: steal the pages when a
> > >CMA area is initialized and recalculate the various per-zone
> > >stats/thresholds.
> > >
> > >Signed-off-by: Joonsoo Kim 
> > 
> > ...
> > 
> > >@@ -145,6 +145,28 @@ err:
> > > static int __init cma_init_reserved_areas(void)
> > > {
> > >   int i;
> > >+  struct zone *zone;
> > >+  unsigned long start_pfn = UINT_MAX, end_pfn = 0;
> > >+
> > >+  if (!cma_area_count)
> > >+  return 0;
> > >+
> > >+  for (i = 0; i < cma_area_count; i++) {
> > >+  if (start_pfn > cma_areas[i].base_pfn)
> > >+  start_pfn = cma_areas[i].base_pfn;
> > >+  if (end_pfn < cma_areas[i].base_pfn + cma_areas[i].count)
> > >+  end_pfn = cma_areas[i].base_pfn + cma_areas[i].count;
> > >+  }
> > >+
> > >+  for_each_zone(zone) {
> > >+  if (!is_zone_cma(zone))
> > >+  continue;
> > >+
> > >+  /* ZONE_CMA doesn't need to exceed CMA region */
> > >+  zone->zone_start_pfn = max(zone->zone_start_pfn, start_pfn);
> > >+  zone->spanned_pages = min(zone_end_pfn(zone), end_pfn) -
> > >+  zone->zone_start_pfn;
> > >+  }
> > 
> > Hmm, so what happens on a system with multiple nodes? Each will have
> > its own ZONE_CMA, and all will have the same start pfn and spanned
> > pages?
> 
> zone_start_pfn and spanned_pages are each initialized in
> calculate_node_totalpages(), which respects node boundaries, so they will
> not have the same start pfn and spanned pages. However, each would still
> contain unnecessary holes.
> 
> > 
> > > /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
> > > void __init init_cma_reserved_pageblock(struct page *page)
> > > {
> > >   unsigned i = pageblock_nr_pages;
> > >+  unsigned long pfn = page_to_pfn(page);
> > >   struct page *p = page;
> > >+  int nid = page_to_nid(page);
> > >+
> > >+  /*
> > >+   * ZONE_CMA will steal present pages from other zones by changing
> > >+   * page links so page_zone() is changed. Before that,
> > >+   * we need to adjust previous zone's page count first.
> > >+   */
> > >+  adjust_present_page_count(page, -pageblock_nr_pages);
> > >
> > >   do {
> > >   __ClearPageReserved(p);
> > >   set_page_count(p, 0);
> > >-  } while (++p, --i);
> > >+
> > >+  /* Steal pages from other zones */
> > >+  set_page_links(p, ZONE_CMA, nid, pfn);
> > >+  } while (++p, ++pfn, --i);
> > >+
> > >+  adjust_present_page_count(page, pageblock_nr_pages);
> > 
> > This seems to assign pages to ZONE_CMA on the proper node, which is
> > good. But then ZONE_CMA on multiple nodes will have unnecessary
> > holes in the spanned pages, as each will contain only a subset.
> 
> True, I will fix it and respin the series.

I now realize that it's too late to send the full series for the next
merge window. I will send the full series after the next merge window closes.

Anyway, I'd like to confirm that the following incremental patch will solve
your concern.

Thanks.


-->8--
 mm/cma.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index d69bdf7..8375554 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -146,22 

Re: [PATCH v5 3/6] mm/cma: populate ZONE_CMA

2016-09-21 Thread Joonsoo Kim
On Wed, Sep 21, 2016 at 11:20:11AM +0200, Vlastimil Babka wrote:
> On 08/29/2016 07:07 AM, js1...@gmail.com wrote:
> >From: Joonsoo Kim 
> >
> >Until now, reserved pages for CMA have been managed in the ordinary zones
> >that their pfns belong to. This approach has numerous problems and fixing
> >them isn't easy. (They are mentioned in the previous patch.) To fix this
> >situation, ZONE_CMA was introduced in the previous patch, but it is not
> >yet populated. This patch implements population of ZONE_CMA by stealing
> >reserved pages from the ordinary zones.
> >
> >Unlike the previous implementation, where a kernel allocation request with
> >__GFP_MOVABLE could be serviced from the CMA region, only allocation
> >requests with GFP_HIGHUSER_MOVABLE can be serviced from the CMA region in
> >the new approach. This is an inevitable design decision when using the
> >zone implementation, because ZONE_CMA could contain highmem. Due to this
> >decision, ZONE_CMA will work like ZONE_HIGHMEM or ZONE_MOVABLE.
> >
> >I don't think this would be a problem, because most file cache pages and
> >anonymous pages are requested with GFP_HIGHUSER_MOVABLE. This is supported
> >by the fact that there are many systems with ZONE_HIGHMEM and they work
> >fine. A notable disadvantage is that we cannot use these pages for the
> >blockdev file cache, because it usually has __GFP_MOVABLE but not
> >__GFP_HIGHMEM and __GFP_USER. But there are pros and cons here: in my
> >experience, blockdev file cache pages are one of the top reasons that
> >cause cma_alloc() to fail temporarily, so we can get a stronger guarantee
> >of cma_alloc() success by discarding that case.
> >
> >The implementation itself is easy to understand: steal the pages when a
> >CMA area is initialized and recalculate the various per-zone
> >stats/thresholds.
> >
> >Signed-off-by: Joonsoo Kim 
> 
> ...
> 
> >@@ -145,6 +145,28 @@ err:
> > static int __init cma_init_reserved_areas(void)
> > {
> > int i;
> >+struct zone *zone;
> >+unsigned long start_pfn = UINT_MAX, end_pfn = 0;
> >+
> >+if (!cma_area_count)
> >+return 0;
> >+
> >+for (i = 0; i < cma_area_count; i++) {
> >+if (start_pfn > cma_areas[i].base_pfn)
> >+start_pfn = cma_areas[i].base_pfn;
> >+if (end_pfn < cma_areas[i].base_pfn + cma_areas[i].count)
> >+end_pfn = cma_areas[i].base_pfn + cma_areas[i].count;
> >+}
> >+
> >+for_each_zone(zone) {
> >+if (!is_zone_cma(zone))
> >+continue;
> >+
> >+/* ZONE_CMA doesn't need to exceed CMA region */
> >+zone->zone_start_pfn = max(zone->zone_start_pfn, start_pfn);
> >+zone->spanned_pages = min(zone_end_pfn(zone), end_pfn) -
> >+zone->zone_start_pfn;
> >+}
> 
> Hmm, so what happens on a system with multiple nodes? Each will have
> its own ZONE_CMA, and all will have the same start pfn and spanned
> pages?

zone_start_pfn and spanned_pages are each initialized in
calculate_node_totalpages(), which respects node boundaries, so they will
not have the same start pfn and spanned pages. However, each would still
contain unnecessary holes.

> 
> > /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
> > void __init init_cma_reserved_pageblock(struct page *page)
> > {
> > unsigned i = pageblock_nr_pages;
> >+unsigned long pfn = page_to_pfn(page);
> > struct page *p = page;
> >+int nid = page_to_nid(page);
> >+
> >+/*
> >+ * ZONE_CMA will steal present pages from other zones by changing
> >+ * page links so page_zone() is changed. Before that,
> >+ * we need to adjust previous zone's page count first.
> >+ */
> >+adjust_present_page_count(page, -pageblock_nr_pages);
> >
> > do {
> > __ClearPageReserved(p);
> > set_page_count(p, 0);
> >-} while (++p, --i);
> >+
> >+/* Steal pages from other zones */
> >+set_page_links(p, ZONE_CMA, nid, pfn);
> >+} while (++p, ++pfn, --i);
> >+
> >+adjust_present_page_count(page, pageblock_nr_pages);
> 
> This seems to assign pages to ZONE_CMA on the proper node, which is
> good. But then ZONE_CMA on multiple nodes will have unnecessary
> holes in the spanned pages, as each will contain only a subset.

True, I will fix it and respin the series.

Thanks.



Re: [PATCH v5 3/6] mm/cma: populate ZONE_CMA

2016-09-21 Thread Vlastimil Babka

On 08/29/2016 07:07 AM, js1...@gmail.com wrote:

From: Joonsoo Kim 

Until now, reserved pages for CMA have been managed in the ordinary zones
that their pfns belong to. This approach has numerous problems and fixing
them isn't easy. (They are mentioned in the previous patch.) To fix this
situation, ZONE_CMA was introduced in the previous patch, but it is not
yet populated. This patch implements population of ZONE_CMA by stealing
reserved pages from the ordinary zones.

Unlike the previous implementation, where a kernel allocation request with
__GFP_MOVABLE could be serviced from the CMA region, only allocation
requests with GFP_HIGHUSER_MOVABLE can be serviced from the CMA region in
the new approach. This is an inevitable design decision when using the
zone implementation, because ZONE_CMA could contain highmem. Due to this
decision, ZONE_CMA will work like ZONE_HIGHMEM or ZONE_MOVABLE.

I don't think this would be a problem, because most file cache pages and
anonymous pages are requested with GFP_HIGHUSER_MOVABLE. This is supported
by the fact that there are many systems with ZONE_HIGHMEM and they work
fine. A notable disadvantage is that we cannot use these pages for the
blockdev file cache, because it usually has __GFP_MOVABLE but not
__GFP_HIGHMEM and __GFP_USER. But there are pros and cons here: in my
experience, blockdev file cache pages are one of the top reasons that
cause cma_alloc() to fail temporarily, so we can get a stronger guarantee
of cma_alloc() success by discarding that case.

The implementation itself is easy to understand: steal the pages when a
CMA area is initialized and recalculate the various per-zone
stats/thresholds.

Signed-off-by: Joonsoo Kim 


...


@@ -145,6 +145,28 @@ err:
 static int __init cma_init_reserved_areas(void)
 {
int i;
+   struct zone *zone;
+   unsigned long start_pfn = UINT_MAX, end_pfn = 0;
+
+   if (!cma_area_count)
+   return 0;
+
+   for (i = 0; i < cma_area_count; i++) {
+   if (start_pfn > cma_areas[i].base_pfn)
+   start_pfn = cma_areas[i].base_pfn;
+   if (end_pfn < cma_areas[i].base_pfn + cma_areas[i].count)
+   end_pfn = cma_areas[i].base_pfn + cma_areas[i].count;
+   }
+
+   for_each_zone(zone) {
+   if (!is_zone_cma(zone))
+   continue;
+
+   /* ZONE_CMA doesn't need to exceed CMA region */
+   zone->zone_start_pfn = max(zone->zone_start_pfn, start_pfn);
+   zone->spanned_pages = min(zone_end_pfn(zone), end_pfn) -
+   zone->zone_start_pfn;
+   }


Hmm, so what happens on a system with multiple nodes? Each will have its 
own ZONE_CMA, and all will have the same start pfn and spanned pages?



 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
 {
unsigned i = pageblock_nr_pages;
+   unsigned long pfn = page_to_pfn(page);
struct page *p = page;
+   int nid = page_to_nid(page);
+
+   /*
+* ZONE_CMA will steal present pages from other zones by changing
+* page links so page_zone() is changed. Before that,
+* we need to adjust previous zone's page count first.
+*/
+   adjust_present_page_count(page, -pageblock_nr_pages);

do {
__ClearPageReserved(p);
set_page_count(p, 0);
-   } while (++p, --i);
+
+   /* Steal pages from other zones */
+   set_page_links(p, ZONE_CMA, nid, pfn);
+   } while (++p, ++pfn, --i);
+
+   adjust_present_page_count(page, pageblock_nr_pages);


This seems to assign pages to ZONE_CMA on the proper node, which is 
good. But then ZONE_CMA on multiple nodes will have unnecessary holes in 
the spanned pages, as each will contain only a subset.
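
(For context, set_page_links() used in the hunk above re-encodes the zone, node and,
with sparsemem, the section number into page->flags, which is why page_zone() reports
ZONE_CMA for the stolen pages afterwards. A simplified sketch of that helper, assuming
the reader's tree matches the include/linux/mm.h of this era:)

static inline void set_page_links(struct page *page, enum zone_type zone,
				  unsigned long node, unsigned long pfn)
{
	set_page_zone(page, zone);		/* zone index bits in page->flags */
	set_page_node(page, node);		/* node id bits in page->flags */
#ifdef SECTION_IN_PAGE_FLAGS
	set_page_section(page, pfn_to_section_nr(pfn));	/* sparsemem section bits */
#endif
}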





[PATCH v5 3/6] mm/cma: populate ZONE_CMA

2016-08-28 Thread js1304
From: Joonsoo Kim 

Until now, reserved pages for CMA have been managed in the ordinary zones
that their pfns belong to. This approach has numerous problems and fixing
them isn't easy. (They are mentioned in the previous patch.) To fix this
situation, ZONE_CMA was introduced in the previous patch, but it is not
yet populated. This patch implements population of ZONE_CMA by stealing
reserved pages from the ordinary zones.

Unlike the previous implementation, where a kernel allocation request with
__GFP_MOVABLE could be serviced from the CMA region, only allocation
requests with GFP_HIGHUSER_MOVABLE can be serviced from the CMA region in
the new approach. This is an inevitable design decision when using the
zone implementation, because ZONE_CMA could contain highmem. Due to this
decision, ZONE_CMA will work like ZONE_HIGHMEM or ZONE_MOVABLE.

I don't think this would be a problem, because most file cache pages and
anonymous pages are requested with GFP_HIGHUSER_MOVABLE. This is supported
by the fact that there are many systems with ZONE_HIGHMEM and they work
fine. A notable disadvantage is that we cannot use these pages for the
blockdev file cache, because it usually has __GFP_MOVABLE but not
__GFP_HIGHMEM and __GFP_USER. But there are pros and cons here: in my
experience, blockdev file cache pages are one of the top reasons that
cause cma_alloc() to fail temporarily, so we can get a stronger guarantee
of cma_alloc() success by discarding that case.

The implementation itself is easy to understand: steal the pages when a
CMA area is initialized and recalculate the various per-zone
stats/thresholds.

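(A tiny illustration of the allocation-flag distinction described above; the call
sites are hypothetical and only meant to show which flag combinations can reach
ZONE_CMA under this design:)

	/* GFP_HIGHUSER_MOVABLE = GFP_USER | __GFP_HIGHMEM | __GFP_MOVABLE */
	page = alloc_page(GFP_HIGHUSER_MOVABLE);	/* may be served from ZONE_CMA */
	page = alloc_page(GFP_KERNEL | __GFP_MOVABLE);	/* movable but lacks __GFP_HIGHMEM and
							 * the GFP_USER bits (the blockdev case
							 * above): not served from ZONE_CMA */
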
Signed-off-by: Joonsoo Kim 
---
 include/linux/memory_hotplug.h |  3 ---
 include/linux/mm.h             |  1 +
 mm/cma.c                       | 56 ++
 mm/internal.h                  |  3 +++
 mm/page_alloc.c                | 29 +++---
 5 files changed, 80 insertions(+), 12 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 01033fa..ea5af47 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -198,9 +198,6 @@ void put_online_mems(void);
 void mem_hotplug_begin(void);
 void mem_hotplug_done(void);
 
-extern void set_zone_contiguous(struct zone *zone);
-extern void clear_zone_contiguous(struct zone *zone);
-
 #else /* ! CONFIG_MEMORY_HOTPLUG */
 /*
  * Stub functions for when hotplug is off
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9d85402..f45e0e4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1933,6 +1933,7 @@ extern void setup_per_cpu_pageset(void);
 
 extern void zone_pcp_update(struct zone *zone);
 extern void zone_pcp_reset(struct zone *zone);
+extern void setup_zone_pageset(struct zone *zone);
 
 /* page_alloc.c */
 extern int min_free_kbytes;
diff --git a/mm/cma.c b/mm/cma.c
index 384c2cb..d69bdf7 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -38,6 +38,7 @@
 #include 
 
 #include "cma.h"
+#include "internal.h"
 
 struct cma cma_areas[MAX_CMA_AREAS];
 unsigned cma_area_count;
@@ -116,10 +117,9 @@ static int __init cma_activate_area(struct cma *cma)
for (j = pageblock_nr_pages; j; --j, pfn++) {
WARN_ON_ONCE(!pfn_valid(pfn));
/*
-* alloc_contig_range requires the pfn range
-* specified to be in the same zone. Make this
-* simple by forcing the entire CMA resv range
-* to be in the same zone.
+* In init_cma_reserved_pageblock(), present_pages is
+* adjusted with assumption that all pages come from
+* a single zone. It could be fixed but not yet done.
 */
if (page_zone(pfn_to_page(pfn)) != zone)
goto err;
@@ -145,6 +145,28 @@ err:
 static int __init cma_init_reserved_areas(void)
 {
int i;
+   struct zone *zone;
+   unsigned long start_pfn = UINT_MAX, end_pfn = 0;
+
+   if (!cma_area_count)
+   return 0;
+
+   for (i = 0; i < cma_area_count; i++) {
+   if (start_pfn > cma_areas[i].base_pfn)
+   start_pfn = cma_areas[i].base_pfn;
+   if (end_pfn < cma_areas[i].base_pfn + cma_areas[i].count)
+   end_pfn = cma_areas[i].base_pfn + cma_areas[i].count;
+   }
+
+   for_each_zone(zone) {
+   if (!is_zone_cma(zone))
+   continue;
+
+   /* ZONE_CMA doesn't need to exceed CMA region */
+   zone->zone_start_pfn = max(zone->zone_start_pfn, start_pfn);
+   zone->spanned_pages = min(zone_end_pfn(zone), end_pfn) -
+   zone->zone_start_pfn;
+   }
 
for (i = 0; i < cma_area_count; i++) {
int ret = cma_activate_area(&cma_areas[i]);
@@ -153,9 +175,33 @@ static int __init