[PATCH 58/63] sched: numa: adjust scan rate in task_numa_placement

2013-10-07 Thread Mel Gorman
From: Rik van Riel 

Adjust numa_scan_period in task_numa_placement, depending on how much
useful work the numa code can do. The more local faults there are in a
given scan window the longer the period (and hence the slower the scan rate)
during the next window. If there are excessive shared faults then the scan
period will decrease, with the amount of scaling depending on the
ratio of shared to private faults. If the preferred node changes then the
scan rate is reset to recheck if the task is properly placed.

Signed-off-by: Rik van Riel 
Signed-off-by: Mel Gorman 
---
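
Illustration, not part of the patch: a minimal user-space sketch of the
slot-based period adjustment described in the changelog, using made-up fault
counts. The mostly-local branch mirrors the fair.c hunk below; the damping of
the speed-up by the private/shared ratio is inferred from the changelog and
is an approximation rather than verbatim kernel code.

#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Return the change (in ms) to apply to a task's scan period. */
static int scan_period_diff(unsigned int period, unsigned long local,
			    unsigned long remote, unsigned long priv,
			    unsigned long shared)
{
	unsigned int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	int ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int diff;

	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		/* Mostly local faults: lengthen the period (scan slower) */
		int slot = ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/*
		 * Mostly remote faults: shorten the period, but damp the
		 * speed-up when shared faults dominate the private ones
		 */
		int pratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS,
					  priv + shared);

		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		diff = (diff * pratio) / NUMA_PERIOD_SLOTS;
	}
	return diff;
}

int main(void)
{
	/* 80% local faults on a 1000ms period: 5 slots slower, +500ms */
	printf("mostly local:  %+dms\n", scan_period_diff(1000, 80, 20, 60, 40));
	/* 10% local and mostly shared: the 2-slot speed-up is damped to -20ms */
	printf("mostly remote: %+dms\n", scan_period_diff(1000, 10, 90, 10, 90));
	return 0;
}

Built stand-alone, the two calls print +500ms and -20ms: the period stretches
freely when faults are local, but only nudges down when the remote faults are
mostly shared.
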
 include/linux/sched.h |   9 
 kernel/sched/fair.c   | 112 +++---
 mm/huge_memory.c  |   4 +-
 mm/memory.c   |   9 ++--
 4 files changed, 105 insertions(+), 29 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b859621..c1bd367 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1355,6 +1355,14 @@ struct task_struct {
 */
unsigned long *numa_faults_buffer;
 
+   /*
+* numa_faults_locality tracks if faults recorded during the last
+* scan window were remote/local. The task scan period is adapted
+* based on the locality of the faults with different weights
+* depending on whether they were shared or private faults
+*/
+   unsigned long numa_faults_locality[2];
+
int numa_preferred_nid;
unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
@@ -1445,6 +1453,7 @@ struct task_struct {
 #define TNF_MIGRATED   0x01
 #define TNF_NO_GROUP   0x02
 #define TNF_SHARED 0x04
+#define TNF_FAULT_LOCAL 0x08
 
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 03698f5..d8514c8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
 
sched_setnuma(p, env.dst_nid);
 
+   /*
+* Reset the scan period if the task is being rescheduled on an
+* alternative node to recheck if the task is now properly placed.
+*/
+   p->numa_scan_period = task_scan_min(p);
+
if (env.best_task == NULL) {
int ret = migrate_task_to(p, env.best_cpu);
return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
p->numa_migrate_retry = jiffies + HZ*5;
 }
 
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+   unsigned long shared, unsigned long private)
+{
+   unsigned int period_slot;
+   int ratio;
+   int diff;
+
+   unsigned long remote = p->numa_faults_locality[0];
+   unsigned long local = p->numa_faults_locality[1];
+
+   /*
+* If there were no recorded hinting faults then either the task is
+* completely idle or all activity is in areas that are not of interest
+* to automatic numa balancing. Scan slower
+*/
+   if (local + shared == 0) {
+   p->numa_scan_period = min(p->numa_scan_period_max,
+   p->numa_scan_period << 1);
+
+   p->mm->numa_next_scan = jiffies +
+   msecs_to_jiffies(p->numa_scan_period);
+
+   return;
+   }
+
+   /*
+* Prepare to scale scan period relative to the current period.
+*   == NUMA_PERIOD_THRESHOLD scan period stays the same
+*   <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+*   >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+*/
+   period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+   ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+   if (ratio >= NUMA_PERIOD_THRESHOLD) {
+   int slot = ratio - NUMA_PERIOD_THRESHOLD;
+   if (!slot)
+   slot = 1;
+   diff = slot * period_slot;
+   } else {
+   diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+   /*
+* Scale scan rate increases based on sharing. There is an
+* inverse relationship between the degree of sharing and
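
The archived message truncates inside this comment. Going by the changelog,
the rest of update_task_scan_period would finish scaling the speed-up by the
private/shared fault ratio, clamp the new period between its minimum and
maximum, and clear the locality counters for the next window. A hedged sketch
of that shape follows (shown as plain code rather than diff lines;
task_scan_max() and the clamp()/memset() details are assumptions based on the
surrounding context, not quoted patch content):

		 * the adjustment made to the scan period: there is little
		 * point scanning faster when shared accesses dominate.
		 */
		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
				     private + shared);
		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
	}

	/* Apply the adjustment but keep the period within its allowed bounds */
	p->numa_scan_period = clamp(p->numa_scan_period + diff,
				    task_scan_min(p), task_scan_max(p));

	/* Start the next scan window with fresh locality statistics */
	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}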