Re: [PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-19 Thread kemi


On 2017年12月19日 20:28, Michal Hocko wrote:
> On Tue 19-12-17 14:39:22, Kemi Wang wrote:
>> There is not really any use in keeping NUMA stats separated by zone, and
>> the current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
>> cleanup purposes, we move NUMA stats from per-zone to per-node and reuse the
>> existing per-cpu infrastructure.
> 
> Let's hope that nobody really depends on the per-zone numbers. It would
> be really strange as those counters are inherently per-node and that is
> what users should care about but who knows...
> 
> Anyway, I hoped we could get rid of NR_VM_NUMA_STAT_ITEMS but your patch
> keeps it and follow up patches even use it further. I will comment on
> those separately but this still makes these few counters really special
> which I think is wrong.
> 

Well, that's the best I can think of to keep a balance between performance
and simplification. If you have a better idea, please post it and
I will certainly follow it.
 
>> Suggested-by: Andi Kleen 
>> Suggested-by: Michal Hocko 
>> Signed-off-by: Kemi Wang 
> 
> I have to fully grasp the rest of the series before I'll give my Ack,
> but I _really_ like the simplification this adds to the code. I believe
> it can be even simpler.
> 
>> ---
>>  drivers/base/node.c|  23 +++
>>  include/linux/mmzone.h |  27 
>>  include/linux/vmstat.h |  31 -
>>  mm/mempolicy.c |   2 +-
>>  mm/page_alloc.c|  16 +++--
>>  mm/vmstat.c| 177 
>> +
>>  6 files changed, 46 insertions(+), 230 deletions(-)
>>
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index ee090ab..a045ea1 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
>> "interleave_hit %lu\n"
>> "local_node %lu\n"
>> "other_node %lu\n",
>> -   sum_zone_numa_state(dev->id, NUMA_HIT),
>> -   sum_zone_numa_state(dev->id, NUMA_MISS),
>> -   sum_zone_numa_state(dev->id, NUMA_FOREIGN),
>> -   sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
>> -   sum_zone_numa_state(dev->id, NUMA_LOCAL),
>> -   sum_zone_numa_state(dev->id, NUMA_OTHER));
>> +   node_page_state(NODE_DATA(dev->id), NUMA_HIT),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_MISS),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
>>  }
>> +
>>  static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
>>  
>>  static ssize_t node_read_vmstat(struct device *dev,
>> @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
>>  n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
>>   sum_zone_node_page_state(nid, i));
>>  
>> -#ifdef CONFIG_NUMA
>> -for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
>> -n += sprintf(buf+n, "%s %lu\n",
>> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
>> - sum_zone_numa_state(nid, i));
>> -#endif
>> -
>>  for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
>>  n += sprintf(buf+n, "%s %lu\n",
>> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
>> - NR_VM_NUMA_STAT_ITEMS],
>> + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
>>   node_page_state(pgdat, i));
>>  
>>  return n;
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 67f2e3c..c06d880 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -115,20 +115,6 @@ struct zone_padding {
>>  #define ZONE_PADDING(name)
>>  #endif
>>  
>> -#ifdef CONFIG_NUMA
>> -enum numa_stat_item {
>> -NUMA_HIT,   /* allocated in intended node */
>> -NUMA_MISS,  /* allocated in non intended node */
>> -NUMA_FOREIGN,   /* was intended here, hit elsewhere */
>> -NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
>> -NUMA_LOCAL, /* allocation from local node */
>> -NUMA_OTHER, /* allocation from other node */
>> -NR_VM_NUMA_STAT_ITEMS
>> -};
>> -#else
>> -#define NR_VM_NUMA_STAT_ITEMS 0
>> -#endif
>> -
>>  enum zone_stat_item {
>>  /* First 128 byte cacheline (assuming 64 bit words) */
>>  NR_FREE_PAGES,
>> @@ -151,7 +137,18 @@ enum zone_stat_item {
>>  NR_VM_ZONE_STAT_ITEMS };
>>  
>>  enum node_stat_item {
>> -NR_LRU_BASE,
>> +#ifdef CONFIG_NUMA
>> +NUMA_HIT,   /* allocated in intended node */
>> +NUMA_MISS, 

Re: [PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-19 Thread kemi


On 2017年12月19日 20:28, Michal Hocko wrote:
> On Tue 19-12-17 14:39:22, Kemi Wang wrote:
>> There is not really any use in keeping NUMA stats separated by zone, and
>> the current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
>> cleanup purposes, we move NUMA stats from per-zone to per-node and reuse the
>> existing per-cpu infrastructure.
> 
> Let's hope that nobody really depends on the per-zone numbers. It would
> be really strange as those counters are inherently per-node and that is
> what users should care about but who knows...
> 
> Anyway, I hoped we could get rid of NR_VM_NUMA_STAT_ITEMS but your patch
> keeps it and follow up patches even use it further. I will comment on
> those separately but this still makes these few counters really special
> which I think is wrong.
> 

Well, that's the best I can think of to keep a balance between performance
and simplification. If you have a better idea, please post it and
I will certainly follow it.
 
>> Suggested-by: Andi Kleen 
>> Suggested-by: Michal Hocko 
>> Signed-off-by: Kemi Wang 
> 
> I have to fully grasp the rest of the series before I'll give my Ack,
> but I _really_ like the simplification this adds to the code. I believe
> it can be even simpler.
> 
>> ---
>>  drivers/base/node.c|  23 +++
>>  include/linux/mmzone.h |  27 
>>  include/linux/vmstat.h |  31 -
>>  mm/mempolicy.c |   2 +-
>>  mm/page_alloc.c|  16 +++--
>>  mm/vmstat.c| 177 
>> +
>>  6 files changed, 46 insertions(+), 230 deletions(-)
>>
>> diff --git a/drivers/base/node.c b/drivers/base/node.c
>> index ee090ab..a045ea1 100644
>> --- a/drivers/base/node.c
>> +++ b/drivers/base/node.c
>> @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
>> "interleave_hit %lu\n"
>> "local_node %lu\n"
>> "other_node %lu\n",
>> -   sum_zone_numa_state(dev->id, NUMA_HIT),
>> -   sum_zone_numa_state(dev->id, NUMA_MISS),
>> -   sum_zone_numa_state(dev->id, NUMA_FOREIGN),
>> -   sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
>> -   sum_zone_numa_state(dev->id, NUMA_LOCAL),
>> -   sum_zone_numa_state(dev->id, NUMA_OTHER));
>> +   node_page_state(NODE_DATA(dev->id), NUMA_HIT),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_MISS),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
>> +   node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
>>  }
>> +
>>  static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
>>  
>>  static ssize_t node_read_vmstat(struct device *dev,
>> @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
>>  n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
>>   sum_zone_node_page_state(nid, i));
>>  
>> -#ifdef CONFIG_NUMA
>> -for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
>> -n += sprintf(buf+n, "%s %lu\n",
>> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
>> - sum_zone_numa_state(nid, i));
>> -#endif
>> -
>>  for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
>>  n += sprintf(buf+n, "%s %lu\n",
>> - vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
>> - NR_VM_NUMA_STAT_ITEMS],
>> + vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
>>   node_page_state(pgdat, i));
>>  
>>  return n;
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 67f2e3c..c06d880 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -115,20 +115,6 @@ struct zone_padding {
>>  #define ZONE_PADDING(name)
>>  #endif
>>  
>> -#ifdef CONFIG_NUMA
>> -enum numa_stat_item {
>> -NUMA_HIT,   /* allocated in intended node */
>> -NUMA_MISS,  /* allocated in non intended node */
>> -NUMA_FOREIGN,   /* was intended here, hit elsewhere */
>> -NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
>> -NUMA_LOCAL, /* allocation from local node */
>> -NUMA_OTHER, /* allocation from other node */
>> -NR_VM_NUMA_STAT_ITEMS
>> -};
>> -#else
>> -#define NR_VM_NUMA_STAT_ITEMS 0
>> -#endif
>> -
>>  enum zone_stat_item {
>>  /* First 128 byte cacheline (assuming 64 bit words) */
>>  NR_FREE_PAGES,
>> @@ -151,7 +137,18 @@ enum zone_stat_item {
>>  NR_VM_ZONE_STAT_ITEMS };
>>  
>>  enum node_stat_item {
>> -NR_LRU_BASE,
>> +#ifdef CONFIG_NUMA
>> +NUMA_HIT,   /* allocated in intended node */
>> +NUMA_MISS,  /* allocated in non intended node */
>> +

Re: [PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-19 Thread Michal Hocko
On Tue 19-12-17 14:39:22, Kemi Wang wrote:
> There is not really any use in keeping NUMA stats separated by zone, and
> the current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
> cleanup purposes, we move NUMA stats from per-zone to per-node and reuse the
> existing per-cpu infrastructure.

Let's hope that nobody really depends on the per-zone numbers. It would
be really strange as those counters are inherently per-node and that is
what users should care about but who knows...

Anyway, I hoped we could get rid of NR_VM_NUMA_STAT_ITEMS but your patch
keeps it and follow-up patches even use it further. I will comment on
those separately but this still makes these few counters really special,
which I think is wrong.

> Suggested-by: Andi Kleen 
> Suggested-by: Michal Hocko 
> Signed-off-by: Kemi Wang 

I have to fully grasp the rest of the series before I'll give my Ack,
but I _really_ like the simplification this adds to the code. I believe
it can be even simpler.

> ---
>  drivers/base/node.c|  23 +++
>  include/linux/mmzone.h |  27 
>  include/linux/vmstat.h |  31 -
>  mm/mempolicy.c |   2 +-
>  mm/page_alloc.c|  16 +++--
>  mm/vmstat.c| 177 
> +
>  6 files changed, 46 insertions(+), 230 deletions(-)
> 
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index ee090ab..a045ea1 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
>  "interleave_hit %lu\n"
>  "local_node %lu\n"
>  "other_node %lu\n",
> -sum_zone_numa_state(dev->id, NUMA_HIT),
> -sum_zone_numa_state(dev->id, NUMA_MISS),
> -sum_zone_numa_state(dev->id, NUMA_FOREIGN),
> -sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
> -sum_zone_numa_state(dev->id, NUMA_LOCAL),
> -sum_zone_numa_state(dev->id, NUMA_OTHER));
> +node_page_state(NODE_DATA(dev->id), NUMA_HIT),
> +node_page_state(NODE_DATA(dev->id), NUMA_MISS),
> +node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
> +node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
> +node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
> +node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
>  }
> +
>  static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
>  
>  static ssize_t node_read_vmstat(struct device *dev,
> @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
>   n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
>sum_zone_node_page_state(nid, i));
>  
> -#ifdef CONFIG_NUMA
> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
> - n += sprintf(buf+n, "%s %lu\n",
> -  vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
> -  sum_zone_numa_state(nid, i));
> -#endif
> -
>   for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
>   n += sprintf(buf+n, "%s %lu\n",
> -  vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
> -  NR_VM_NUMA_STAT_ITEMS],
> +  vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
>node_page_state(pgdat, i));
>  
>   return n;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 67f2e3c..c06d880 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -115,20 +115,6 @@ struct zone_padding {
>  #define ZONE_PADDING(name)
>  #endif
>  
> -#ifdef CONFIG_NUMA
> -enum numa_stat_item {
> - NUMA_HIT,   /* allocated in intended node */
> - NUMA_MISS,  /* allocated in non intended node */
> - NUMA_FOREIGN,   /* was intended here, hit elsewhere */
> - NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
> - NUMA_LOCAL, /* allocation from local node */
> - NUMA_OTHER, /* allocation from other node */
> - NR_VM_NUMA_STAT_ITEMS
> -};
> -#else
> -#define NR_VM_NUMA_STAT_ITEMS 0
> -#endif
> -
>  enum zone_stat_item {
>   /* First 128 byte cacheline (assuming 64 bit words) */
>   NR_FREE_PAGES,
> @@ -151,7 +137,18 @@ enum zone_stat_item {
>   NR_VM_ZONE_STAT_ITEMS };
>  
>  enum node_stat_item {
> - NR_LRU_BASE,
> +#ifdef CONFIG_NUMA
> + NUMA_HIT,   /* allocated in intended node */
> + NUMA_MISS,  /* allocated in non intended node */
> + NUMA_FOREIGN,   /* was intended here, hit elsewhere */
> + NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
> + NUMA_LOCAL, /* allocation from local node */
> + NUMA_OTHER, /* 

Re: [PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-19 Thread Michal Hocko
On Tue 19-12-17 14:39:22, Kemi Wang wrote:
> There is not really any use in keeping NUMA stats separated by zone, and
> the current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
> cleanup purposes, we move NUMA stats from per-zone to per-node and reuse the
> existing per-cpu infrastructure.

Let's hope that nobody really depends on the per-zone numbers. It would
be really strange as those counters are inherently per-node and that is
what users should care about but who knows...

Anyway, I hoped we could get rid of NR_VM_NUMA_STAT_ITEMS but your patch
keeps it and follow-up patches even use it further. I will comment on
those separately but this still makes these few counters really special,
which I think is wrong.

> Suggested-by: Andi Kleen 
> Suggested-by: Michal Hocko 
> Signed-off-by: Kemi Wang 

I have to fully grasp the rest of the series before I'll give my Ack,
but I _really_ like the simplification this adds to the code. I believe
it can be even simpler.

> ---
>  drivers/base/node.c|  23 +++
>  include/linux/mmzone.h |  27 
>  include/linux/vmstat.h |  31 -
>  mm/mempolicy.c |   2 +-
>  mm/page_alloc.c|  16 +++--
>  mm/vmstat.c| 177 
> +
>  6 files changed, 46 insertions(+), 230 deletions(-)
> 
> diff --git a/drivers/base/node.c b/drivers/base/node.c
> index ee090ab..a045ea1 100644
> --- a/drivers/base/node.c
> +++ b/drivers/base/node.c
> @@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
>  "interleave_hit %lu\n"
>  "local_node %lu\n"
>  "other_node %lu\n",
> -sum_zone_numa_state(dev->id, NUMA_HIT),
> -sum_zone_numa_state(dev->id, NUMA_MISS),
> -sum_zone_numa_state(dev->id, NUMA_FOREIGN),
> -sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
> -sum_zone_numa_state(dev->id, NUMA_LOCAL),
> -sum_zone_numa_state(dev->id, NUMA_OTHER));
> +node_page_state(NODE_DATA(dev->id), NUMA_HIT),
> +node_page_state(NODE_DATA(dev->id), NUMA_MISS),
> +node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
> +node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
> +node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
> +node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
>  }
> +
>  static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
>  
>  static ssize_t node_read_vmstat(struct device *dev,
> @@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
>   n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
>sum_zone_node_page_state(nid, i));
>  
> -#ifdef CONFIG_NUMA
> - for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
> - n += sprintf(buf+n, "%s %lu\n",
> -  vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
> -  sum_zone_numa_state(nid, i));
> -#endif
> -
>   for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
>   n += sprintf(buf+n, "%s %lu\n",
> -  vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
> -  NR_VM_NUMA_STAT_ITEMS],
> +  vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
>node_page_state(pgdat, i));
>  
>   return n;
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 67f2e3c..c06d880 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -115,20 +115,6 @@ struct zone_padding {
>  #define ZONE_PADDING(name)
>  #endif
>  
> -#ifdef CONFIG_NUMA
> -enum numa_stat_item {
> - NUMA_HIT,   /* allocated in intended node */
> - NUMA_MISS,  /* allocated in non intended node */
> - NUMA_FOREIGN,   /* was intended here, hit elsewhere */
> - NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
> - NUMA_LOCAL, /* allocation from local node */
> - NUMA_OTHER, /* allocation from other node */
> - NR_VM_NUMA_STAT_ITEMS
> -};
> -#else
> -#define NR_VM_NUMA_STAT_ITEMS 0
> -#endif
> -
>  enum zone_stat_item {
>   /* First 128 byte cacheline (assuming 64 bit words) */
>   NR_FREE_PAGES,
> @@ -151,7 +137,18 @@ enum zone_stat_item {
>   NR_VM_ZONE_STAT_ITEMS };
>  
>  enum node_stat_item {
> - NR_LRU_BASE,
> +#ifdef CONFIG_NUMA
> + NUMA_HIT,   /* allocated in intended node */
> + NUMA_MISS,  /* allocated in non intended node */
> + NUMA_FOREIGN,   /* was intended here, hit elsewhere */
> + NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
> + NUMA_LOCAL, /* allocation from local node */
> + NUMA_OTHER, /* allocation from other node */
> + NR_VM_NUMA_STAT_ITEMS,
> 

[PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-18 Thread Kemi Wang
There is not really any use in keeping NUMA stats separated by zone, and
the current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
cleanup purposes, we move NUMA stats from per-zone to per-node and reuse the
existing per-cpu infrastructure.

Suggested-by: Andi Kleen 
Suggested-by: Michal Hocko 
Signed-off-by: Kemi Wang 
---
 drivers/base/node.c|  23 +++
 include/linux/mmzone.h |  27 
 include/linux/vmstat.h |  31 -
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  16 +++--
 mm/vmstat.c| 177 +
 6 files changed, 46 insertions(+), 230 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index ee090ab..a045ea1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  sum_zone_numa_state(dev->id, NUMA_HIT),
-  sum_zone_numa_state(dev->id, NUMA_MISS),
-  sum_zone_numa_state(dev->id, NUMA_FOREIGN),
-  sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
-  sum_zone_numa_state(dev->id, NUMA_LOCAL),
-  sum_zone_numa_state(dev->id, NUMA_OTHER));
+  node_page_state(NODE_DATA(dev->id), NUMA_HIT),
+  node_page_state(NODE_DATA(dev->id), NUMA_MISS),
+  node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
+  node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
+  node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
+  node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
 }
+
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
 static ssize_t node_read_vmstat(struct device *dev,
@@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
 sum_zone_node_page_state(nid, i));
 
-#ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-   n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-sum_zone_numa_state(nid, i));
-#endif
-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
-NR_VM_NUMA_STAT_ITEMS],
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
 node_page_state(pgdat, i));
 
return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 67f2e3c..c06d880 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -115,20 +115,6 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
-#ifdef CONFIG_NUMA
-enum numa_stat_item {
-   NUMA_HIT,   /* allocated in intended node */
-   NUMA_MISS,  /* allocated in non intended node */
-   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
-   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
-   NUMA_LOCAL, /* allocation from local node */
-   NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
-};
-#else
-#define NR_VM_NUMA_STAT_ITEMS 0
-#endif
-
 enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
@@ -151,7 +137,18 @@ enum zone_stat_item {
NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
-   NR_LRU_BASE,
+#ifdef CONFIG_NUMA
+   NUMA_HIT,   /* allocated in intended node */
+   NUMA_MISS,  /* allocated in non intended node */
+   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
+   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
+   NUMA_LOCAL, /* allocation from local node */
+   NUMA_OTHER, /* allocation from other node */
+   NR_VM_NUMA_STAT_ITEMS,
+#else
+#defineNR_VM_NUMA_STAT_ITEMS 0
+#endif
+   NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS,
NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
NR_ACTIVE_ANON, /*  " " "   "   " */
NR_INACTIVE_FILE,   /*  " " "   "   " */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1779c98..80bf290 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -118,37 +118,8 @@ static inline void vm_events_fold_cpu(int cpu)
  * Zone and node-based page accounting with per cpu differentials.
  */
 extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
-extern 

[PATCH v2 1/5] mm: migrate NUMA stats from per-zone to per-node

2017-12-18 Thread Kemi Wang
There is not really any use in keeping NUMA stats separated by zone, and
the current per-zone NUMA stats are only consumed in /proc/zoneinfo. For code
cleanup purposes, we move NUMA stats from per-zone to per-node and reuse the
existing per-cpu infrastructure.

Suggested-by: Andi Kleen 
Suggested-by: Michal Hocko 
Signed-off-by: Kemi Wang 
---
 drivers/base/node.c|  23 +++
 include/linux/mmzone.h |  27 
 include/linux/vmstat.h |  31 -
 mm/mempolicy.c |   2 +-
 mm/page_alloc.c|  16 +++--
 mm/vmstat.c| 177 +
 6 files changed, 46 insertions(+), 230 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index ee090ab..a045ea1 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -169,13 +169,14 @@ static ssize_t node_read_numastat(struct device *dev,
   "interleave_hit %lu\n"
   "local_node %lu\n"
   "other_node %lu\n",
-  sum_zone_numa_state(dev->id, NUMA_HIT),
-  sum_zone_numa_state(dev->id, NUMA_MISS),
-  sum_zone_numa_state(dev->id, NUMA_FOREIGN),
-  sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
-  sum_zone_numa_state(dev->id, NUMA_LOCAL),
-  sum_zone_numa_state(dev->id, NUMA_OTHER));
+  node_page_state(NODE_DATA(dev->id), NUMA_HIT),
+  node_page_state(NODE_DATA(dev->id), NUMA_MISS),
+  node_page_state(NODE_DATA(dev->id), NUMA_FOREIGN),
+  node_page_state(NODE_DATA(dev->id), NUMA_INTERLEAVE_HIT),
+  node_page_state(NODE_DATA(dev->id), NUMA_LOCAL),
+  node_page_state(NODE_DATA(dev->id), NUMA_OTHER));
 }
+
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
 static ssize_t node_read_vmstat(struct device *dev,
@@ -190,17 +191,9 @@ static ssize_t node_read_vmstat(struct device *dev,
n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
 sum_zone_node_page_state(nid, i));
 
-#ifdef CONFIG_NUMA
-   for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
-   n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
-sum_zone_numa_state(nid, i));
-#endif
-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
n += sprintf(buf+n, "%s %lu\n",
-vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
-NR_VM_NUMA_STAT_ITEMS],
+vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
 node_page_state(pgdat, i));
 
return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 67f2e3c..c06d880 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -115,20 +115,6 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
-#ifdef CONFIG_NUMA
-enum numa_stat_item {
-   NUMA_HIT,   /* allocated in intended node */
-   NUMA_MISS,  /* allocated in non intended node */
-   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
-   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
-   NUMA_LOCAL, /* allocation from local node */
-   NUMA_OTHER, /* allocation from other node */
-   NR_VM_NUMA_STAT_ITEMS
-};
-#else
-#define NR_VM_NUMA_STAT_ITEMS 0
-#endif
-
 enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
@@ -151,7 +137,18 @@ enum zone_stat_item {
NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
-   NR_LRU_BASE,
+#ifdef CONFIG_NUMA
+   NUMA_HIT,   /* allocated in intended node */
+   NUMA_MISS,  /* allocated in non intended node */
+   NUMA_FOREIGN,   /* was intended here, hit elsewhere */
+   NUMA_INTERLEAVE_HIT,/* interleaver preferred this zone */
+   NUMA_LOCAL, /* allocation from local node */
+   NUMA_OTHER, /* allocation from other node */
+   NR_VM_NUMA_STAT_ITEMS,
+#else
+#defineNR_VM_NUMA_STAT_ITEMS 0
+#endif
+   NR_LRU_BASE = NR_VM_NUMA_STAT_ITEMS,
NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
NR_ACTIVE_ANON, /*  " " "   "   " */
NR_INACTIVE_FILE,   /*  " " "   "   " */
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 1779c98..80bf290 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -118,37 +118,8 @@ static inline void vm_events_fold_cpu(int cpu)
  * Zone and node-based page accounting with per cpu differentials.
  */
 extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
-extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 extern