[PATCH 07/10] workqueue: simplify flush_workqueue_prep_cwqs()

2012-09-24 Thread Lai Jiangshan
Move the advance of wq->work_color into flush_workqueue_prep_cwqs()
and rename flush_workqueue_prep_cwqs() to workqueue_start_flush().
This simplifies the caller.

Since @flush_color and @work_color can no longer be -1, remove those
tests in workqueue_start_flush() and fix the indentation.

Since @flush_color is always equal to wq->work_color (before the
advance), remove the @flush_color argument; @flush_color becomes a
local variable.

Since @work_color is always equal to work_next_color(flush_color),
remove the @work_color argument; @work_color is renamed to
@next_color and becomes a local variable.

This patch doesn't make any functional difference.
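
For illustration (not part of the patch), a minimal userspace model of
the color arithmetic that workqueue_start_flush() now computes locally;
WORK_NR_COLORS and work_next_color() are assumed to follow the mainline
definitions:

#include <assert.h>
#include <stdio.h>

#define WORK_NR_COLORS  15      /* assumed value from workqueue.c */

static int work_next_color(int color)
{
        return (color + 1) % WORK_NR_COLORS;
}

struct wq_model {
        int work_color;         /* color new works are queued with */
        int flush_color;        /* color currently being flushed */
};

int main(void)
{
        struct wq_model wq = { .work_color = 0, .flush_color = 0 };

        /* what the function now derives from @wq alone: */
        int flush_color = wq.work_color;                  /* was the @flush_color argument */
        int next_color = work_next_color(wq.work_color);  /* was the @work_color argument */

        assert(next_color != wq.flush_color);   /* caller guarantees color space not full */
        wq.work_color = next_color;

        printf("flushing color %d, new work color %d\n",
               flush_color, wq.work_color);
        return 0;
}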

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   69 +++
 1 files changed, 26 insertions(+), 43 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 803a22c..be407e1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2531,40 +2531,37 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 }
 
 /**
- * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * workqueue_start_flush - start workqueue flushing
  * @wq: workqueue being flushed
- * @flush_color: new flush color, < 0 for no-op
- * @work_color: new work color, < 0 for no-op
  *
- * Prepare cwqs for workqueue flushing.
+ * Start a new flush color and prepare cwqs for workqueue flushing.
  *
- * The caller should have initialized @wq->first_flusher prior to
- * calling this function with non-negative @flush_color.  If
- * @flush_color is negative, no flush color update is done and %false
- * is returned.
+ * Called with the color space not full.  The current work_color
+ * becomes the new flush_color and work_color is advanced by one.
+ * All cwqs' work_color are set to the new work_color (advanced by one).
 *
- * If @work_color is non-negative, all cwqs should have the same
- * work_color which is previous to @work_color and all will be
- * advanced to @work_color.
+ * The caller should have initialized @wq->first_flusher prior to
+ * calling this function.
  *
  * CONTEXT:
 * mutex_lock(wq->flush_mutex).
  *
  * RETURNS:
- * %true if @flush_color >= 0 and there's something to flush.  %false
- * otherwise.
+ * %true if there are some cwqs to flush.  %false otherwise.
  */
-static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
-                                     int flush_color, int work_color)
+static bool workqueue_start_flush(struct workqueue_struct *wq)
 {
+       int flush_color = wq->work_color;
+       int next_color = work_next_color(wq->work_color);
        bool wait = false;
        unsigned int cpu;
 
-       if (flush_color >= 0) {
-               BUG_ON(atomic_read(&wq->nr_cwqs_to_flush[flush_color]));
-               /* this ref is held by first flusher */
-               atomic_set(&wq->nr_cwqs_to_flush[flush_color], 1);
-       }
+       BUG_ON(next_color == wq->flush_color);
+       wq->work_color = next_color;
+
+       BUG_ON(atomic_read(&wq->nr_cwqs_to_flush[flush_color]));
+       /* this ref is held by first flusher */
+       atomic_set(&wq->nr_cwqs_to_flush[flush_color], 1);
 
for_each_cwq_cpu(cpu, wq) {
struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
@@ -2572,17 +2569,13 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
 
                spin_lock_irq(&gcwq->lock);
 
-               if (flush_color >= 0) {
-                       if (cwq->nr_in_flight[flush_color]) {
-                               atomic_inc(&wq->nr_cwqs_to_flush[flush_color]);
-                               wait = true;
-                       }
+               if (cwq->nr_in_flight[flush_color]) {
+                       atomic_inc(&wq->nr_cwqs_to_flush[flush_color]);
+                       wait = true;
                }
 
-               if (work_color >= 0) {
-                       BUG_ON(work_color != work_next_color(cwq->work_color));
-                       cwq->work_color = work_color;
-               }
+               BUG_ON(next_color != work_next_color(cwq->work_color));
+               cwq->work_color = next_color;
 
                spin_unlock_irq(&gcwq->lock);
}
@@ -2622,14 +2615,9 @@ void flush_workqueue(struct workqueue_struct *wq)
        next_color = work_next_color(wq->work_color);
 
        if (next_color != wq->flush_color) {
-               /*
-                * Color space is not full.  The current work_color
-                * becomes our flush_color and work_color is advanced
-                * by one.
-                */
+               /* Color space is not full */
                BUG_ON(!list_empty(&wq->flusher_overflow));
                this_flusher.flush_color = flush_color;
-               wq->work_color = next_color;
 
                if (!wq->first_flusher) {
/* no flush in progress, become the first flusher */
@@ -2637,8 +2625,7 @@ void flush_workqueue(struct workqueue_struct *wq

[PATCH 10/10] workqueue: remove wq-flush_color

2012-09-24 Thread Lai Jiangshan
Use wq->first_flusher->flush_color instead.

If the current task is the first_flusher, use @flush_color
or work_next_color(flush_color) directly.

This patch doesn't make any functional difference.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   52 
 1 files changed, 20 insertions(+), 32 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d78fe08..e703659 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -248,7 +248,6 @@ struct workqueue_struct {
 
        struct mutex            flush_mutex;    /* protects wq flushing */
        int                     work_color;     /* F: current work color */
-       int                     flush_color;    /* F: current flush color */
        atomic_t                nr_cwqs_to_flush[WORK_NR_COLORS];
        struct wq_flusher       *first_flusher; /* F: first flusher */
        struct list_head        flusher_queue;  /* F: flush waiters */
@@ -2555,7 +2554,7 @@ static bool workqueue_start_flush(struct workqueue_struct *wq)
bool wait = false;
unsigned int cpu;
 
-       BUG_ON(next_color == wq->flush_color);
+       BUG_ON(next_color == wq->first_flusher->flush_color);
        wq->work_color = next_color;
 
        BUG_ON(atomic_read(&wq->nr_cwqs_to_flush[flush_color]));
@@ -2614,29 +2613,23 @@ void flush_workqueue(struct workqueue_struct *wq)
        next_color = work_next_color(wq->work_color);
        this_flusher.flush_color = flush_color;
 
-       if (next_color != wq->flush_color) {
-               /* Color space is not full */
-               if (!wq->first_flusher) {
-                       /* no flush in progress, become the first flusher */
-                       BUG_ON(wq->flush_color != flush_color);
-
-                       wq->first_flusher = &this_flusher;
-
-                       if (!workqueue_start_flush(wq)) {
-                               /* nothing to flush, done */
-                               wq_dec_flusher_ref(wq, flush_color);
-                               wq->flush_color = next_color;
-                               wq->first_flusher = NULL;
-                               goto out_unlock;
-                       }
+       if (!wq->first_flusher) {
+               /* no flush in progress, become the first flusher */
+               wq->first_flusher = &this_flusher;
 
+               if (!workqueue_start_flush(wq)) {
+                       /* nothing to flush, done */
                        wq_dec_flusher_ref(wq, flush_color);
-               } else {
-                       /* wait in queue */
-                       BUG_ON(wq->flush_color == this_flusher.flush_color);
-                       list_add_tail(&this_flusher.list, &wq->flusher_queue);
-                       workqueue_start_flush(wq);
+                       wq->first_flusher = NULL;
+                       goto out_unlock;
                }
 
+               wq_dec_flusher_ref(wq, flush_color);
+       } else if (next_color != wq->first_flusher->flush_color) {
+               /* Color space is not full, wait in queue */
+               BUG_ON(wq->first_flusher->flush_color == flush_color);
+               list_add_tail(&this_flusher.list, &wq->flusher_queue);
+               workqueue_start_flush(wq);
} else {
/*
 * Oops, color space is full, queue it without starting flush.
@@ -2663,21 +2656,17 @@ void flush_workqueue(struct workqueue_struct *wq)
 
        BUG_ON(wq->first_flusher != &this_flusher);
        BUG_ON(!list_empty(&this_flusher.list));
-       BUG_ON(wq->flush_color != this_flusher.flush_color);
 
        /* complete all the flushers sharing the current flush color */
        list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
-               if (next->flush_color != wq->flush_color)
+               if (next->flush_color != flush_color)
                        break;
                list_del_init(&next->list);
                complete(&next->done);
        }
 
-       /* this flush_color is finished, advance by one */
-       wq->flush_color = work_next_color(wq->flush_color);
 
        if (list_empty(&wq->flusher_queue)) {
-               BUG_ON(wq->flush_color != wq->work_color);
+               BUG_ON(work_next_color(flush_color) != wq->work_color);
                wq->first_flusher = NULL;
                goto out_unlock;
        }
@@ -2686,9 +2675,6 @@ void flush_workqueue(struct workqueue_struct *wq)
 * Need to flush more colors.  Make the next flusher
 * the new first flusher and arm it.
 */
-       BUG_ON(wq->flush_color == wq->work_color);
-       BUG_ON(wq->flush_color != next->flush_color);
-
        last = list_entry(wq->flusher_queue.prev, struct wq_flusher, list);
        list_del_init(&next->list);
        wq->first_flusher = next;
@@ -2698,9 +2684,11 @@ void flush_workqueue(struct workqueue_struct *wq)
workqueue_start_flush(wq);
 
        BUG_ON(work_next_color(last->flush_color

[PATCH 00/10] workqueue: restructure flush_workqueue() and start all flusher at the same time

2012-09-24 Thread Lai Jiangshan
The core patch is patch 6; it allows all flushers to start at the
same time and lets us do more cleanup.

Only patch 1 and patch 6 change the behavior of the code.
All other patches do not change any behavior.

Lai Jiangshan (10):
  workqueue: always pass flush responsibility to next
  workqueue: remove unneeded check
  workqueue: remove while(true)
  workqueue: use nr_cwqs_to_flush array
  workqueue: add wq_dec_flusher_ref()
  workqueue: start all flusher at the same time
  workqueue: simplify flush_workqueue_prep_cwqs()
  workqueue: assign overflowed flushers's flush color when queued
  workqueue: remove flusher_overflow
  workqueue: remove wq-flush_color

 kernel/workqueue.c |  243 +++
 1 files changed, 91 insertions(+), 152 deletions(-)

-- 
1.7.4.4



[PATCH 08/10] workqueue: assign overflowed flushers's flush color when queued

2012-09-24 Thread Lai Jiangshan
wq->work_color is unchanged between the time an overflowed flusher is
queued on ->flusher_overflow and the time it is requeued on
->flusher_queue.

So we can assign the overflowed flushers' flush color when they are
queued on ->flusher_overflow.

This patch makes the flusher's flush color clearer:
a flusher's flush color is the work color of the workqueue
when flush_workqueue() starts.

Remove an unneeded loop.

This patch doesn't make any functional difference.
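
For illustration (not part of the patch), a tiny userspace model of the
invariant this change relies on -- the flusher's color is recorded once,
at flush_workqueue() entry, whether or not the color space happens to be
full at that moment:

#include <stdio.h>

struct flusher_model {
        int flush_color;        /* fixed at flush_workqueue() entry */
};

int main(void)
{
        int wq_work_color = 3;          /* hypothetical current work color */
        struct flusher_model f;

        /* assigned before the "color space full?" branch */
        f.flush_color = wq_work_color;

        /*
         * Even if the flusher has to wait on ->flusher_overflow,
         * wq->work_color is not advanced while it waits, so the value
         * recorded above is still correct when the flusher is moved
         * to ->flusher_queue.
         */
        printf("flusher waits for color %d\n", f.flush_color);
        return 0;
}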

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   19 +--
 1 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index be407e1..f687893 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2613,11 +2613,11 @@ void flush_workqueue(struct workqueue_struct *wq)
 */
        flush_color = wq->work_color;
        next_color = work_next_color(wq->work_color);
+       this_flusher.flush_color = flush_color;
 
        if (next_color != wq->flush_color) {
                /* Color space is not full */
                BUG_ON(!list_empty(&wq->flusher_overflow));
-               this_flusher.flush_color = flush_color;
 
                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
@@ -2643,8 +2643,8 @@ void flush_workqueue(struct workqueue_struct *wq)
} else {
/*
                 * Oops, color space is full, wait on overflow queue.
-                * The next flush completion will assign us
-                * flush_color and transfer to flusher_queue.
+                * The next flush completion will start flush for us
+                * with freed flush color and transfer us to flusher_queue.
                 */
                list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
@@ -2684,15 +2684,14 @@ void flush_workqueue(struct workqueue_struct *wq)
 
/* one color has been freed, handle overflow queue */
        if (!list_empty(&wq->flusher_overflow)) {
+               BUG_ON(list_first_entry(&wq->flusher_overflow,
+                                       struct wq_flusher,
+                                       list)->flush_color
+                      != wq->work_color);
                /*
-                * Assign the same color to all overflowed
-                * flushers, advance work_color and append to
-                * flusher_queue.  This is the start-to-wait
-                * phase for these overflowed flushers.
+                * start flush with the freed color and append
+                * overflowed flushers to the flusher_queue.
                 */
-               list_for_each_entry(tmp, &wq->flusher_overflow, list)
-                       tmp->flush_color = wq->work_color;
-
                list_splice_tail_init(&wq->flusher_overflow,
                                      &wq->flusher_queue);
                workqueue_start_flush(wq);
-- 
1.7.4.4



[PATCH 09/10] workqueue: remove flusher_overflow

2012-09-24 Thread Lai Jiangshan
We can detect whether a flusher has been started by comparing its
flush_color with wq->work_color.

We move all overflowed flushers to flusher_queue and then start the
flush for them when a color has been freed.  We detect this from the
color of the last flusher on the flusher_queue.

This patch doesn't make any functional difference.
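
For illustration (not part of the patch), the detection rule as a small
hedged sketch -- a queued flusher has been started iff its color is no
longer the current work color, because workqueue_start_flush() advances
wq->work_color past every color it has begun flushing:

#include <stdbool.h>

static bool flusher_started(int flusher_flush_color, int wq_work_color)
{
        return flusher_flush_color != wq_work_color;
}

In the cascade path only the last flusher on ->flusher_queue needs this
test: if it has not been started, a color has just been freed for it and
workqueue_start_flush() is called, as the hunk below does.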

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   39 +--
 1 files changed, 13 insertions(+), 26 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f687893..d78fe08 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -252,7 +252,6 @@ struct workqueue_struct {
        atomic_t                nr_cwqs_to_flush[WORK_NR_COLORS];
        struct wq_flusher       *first_flusher; /* F: first flusher */
        struct list_head        flusher_queue;  /* F: flush waiters */
-       struct list_head        flusher_overflow; /* F: flush overflow list */
 
mayday_mask_t   mayday_mask;/* cpus requesting rescue */
struct worker   *rescuer;   /* I: rescue worker */
@@ -2600,7 +2599,7 @@ void flush_workqueue(struct workqueue_struct *wq)
.flush_color = -1,
.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
};
-   struct wq_flusher *next, *tmp;
+   struct wq_flusher *next, *tmp, *last;
int flush_color, next_color;
 
lock_map_acquire(wq-lockdep_map);
@@ -2617,8 +2616,6 @@ void flush_workqueue(struct workqueue_struct *wq)
 
        if (next_color != wq->flush_color) {
                /* Color space is not full */
-               BUG_ON(!list_empty(&wq->flusher_overflow));
-
                if (!wq->first_flusher) {
                        /* no flush in progress, become the first flusher */
                        BUG_ON(wq->flush_color != flush_color);
@@ -2642,11 +2639,11 @@ void flush_workqueue(struct workqueue_struct *wq)
}
} else {
/*
-* Oops, color space is full, wait on overflow queue.
+* Oops, color space is full, queue it without starting flush.
 * The next flush completion will start flush for us
-* with freed flush color and transfer us to flusher_queue.
+* with freed flush color.
 */
-               list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+               list_add_tail(&this_flusher.list, &wq->flusher_queue);
}
 
        mutex_unlock(&wq->flush_mutex);
@@ -2676,27 +2673,9 @@ void flush_workqueue(struct workqueue_struct *wq)
                complete(&next->done);
        }
 
-       BUG_ON(!list_empty(&wq->flusher_overflow) &&
-              wq->flush_color != work_next_color(wq->work_color));
-
        /* this flush_color is finished, advance by one */
        wq->flush_color = work_next_color(wq->flush_color);
 
-       /* one color has been freed, handle overflow queue */
-       if (!list_empty(&wq->flusher_overflow)) {
-               BUG_ON(list_first_entry(&wq->flusher_overflow,
-                                       struct wq_flusher,
-                                       list)->flush_color
-                      != wq->work_color);
-               /*
-                * start flush with the freed color and append
-                * overflowed flushers to the flusher_queue.
-                */
-               list_splice_tail_init(&wq->flusher_overflow,
-                                     &wq->flusher_queue);
-               workqueue_start_flush(wq);
-       }
-
        if (list_empty(&wq->flusher_queue)) {
                BUG_ON(wq->flush_color != wq->work_color);
                wq->first_flusher = NULL;
@@ -2710,8 +2689,17 @@ void flush_workqueue(struct workqueue_struct *wq)
        BUG_ON(wq->flush_color == wq->work_color);
        BUG_ON(wq->flush_color != next->flush_color);
 
+       last = list_entry(wq->flusher_queue.prev, struct wq_flusher, list);
        list_del_init(&next->list);
        wq->first_flusher = next;
+
+       /* if unstarted flushers were appended, start the flush for them */
+       if (last->flush_color == wq->work_color)
+               workqueue_start_flush(wq);
+
+       BUG_ON(work_next_color(last->flush_color) != wq->work_color);
+
+       /* arm new first flusher */
        wq_dec_flusher_ref(wq, wq->flush_color);
 
 out_unlock:
@@ -3221,7 +3209,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char 
*fmt,
        for (color = 0; color < WORK_NR_COLORS; color++)
                atomic_set(&wq->nr_cwqs_to_flush[color], 0);
        INIT_LIST_HEAD(&wq->flusher_queue);
-       INIT_LIST_HEAD(&wq->flusher_overflow);
 
        lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
        INIT_LIST_HEAD(&wq->list);
-- 
1.7.4.4


[PATCH 04/10] workqueue: use nr_cwqs_to_flush array

2012-09-24 Thread Lai Jiangshan
Each color uses its own nr_cwqs_to_flush[color].

This patch doesn't make any functional difference.
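
For illustration (not part of the patch), a userspace sketch of the idea
-- one counter per color instead of one shared counter, so the completion
bookkeeping of different colors never touches the same counter; the
WORK_NR_COLORS value is an assumption taken from workqueue.c:

#include <stdatomic.h>

#define WORK_NR_COLORS  15      /* assumed value from workqueue.c */

static atomic_int nr_cwqs_to_flush[WORK_NR_COLORS];

static void cwq_done(int color, void (*wake_first_flusher)(void))
{
        /* only the counter of @color is touched */
        if (atomic_fetch_sub(&nr_cwqs_to_flush[color], 1) == 1)
                wake_first_flusher();
}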

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   21 -
 1 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5439fb6..861b4c7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -250,7 +250,7 @@ struct workqueue_struct {
        struct mutex            flush_mutex;    /* protects wq flushing */
        int                     work_color;     /* F: current work color */
        int                     flush_color;    /* F: current flush color */
-       atomic_t                nr_cwqs_to_flush; /* flush in progress */
+       atomic_t                nr_cwqs_to_flush[WORK_NR_COLORS];
        struct wq_flusher       *first_flusher; /* F: first flusher */
        struct list_head        flusher_queue;  /* F: flush waiters */
        struct list_head        flusher_overflow; /* F: flush overflow list */
@@ -1036,7 +1036,7 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
         * If this was the last cwq, wake up the first flusher.  It
         * will handle the rest.
         */
-       if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+       if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush[color]))
                complete(&cwq->wq->first_flusher->done);
 }
 
@@ -2540,8 +2540,8 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
  * -1.  If no cwq has in-flight commands at the specified color, all
  * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
  * has in flight commands, its cwq->flush_color is set to
- * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
- * wakeup logic is armed and %true is returned.
+ * @flush_color, @wq->nr_cwqs_to_flush[flush_color] is updated accordingly,
+ * cwq wakeup logic is armed and %true is returned.
  *
  * The caller should have initialized @wq->first_flusher prior to
  * calling this function with non-negative @flush_color.  If
@@ -2566,8 +2566,8 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
        unsigned int cpu;
 
        if (flush_color >= 0) {
-               BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
-               atomic_set(&wq->nr_cwqs_to_flush, 1);
+               BUG_ON(atomic_read(&wq->nr_cwqs_to_flush[flush_color]));
+               atomic_set(&wq->nr_cwqs_to_flush[flush_color], 1);
        }
 
for_each_cwq_cpu(cpu, wq) {
@@ -2581,7 +2581,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
 
                        if (cwq->nr_in_flight[flush_color]) {
                                cwq->flush_color = flush_color;
-                               atomic_inc(&wq->nr_cwqs_to_flush);
+                               atomic_inc(&wq->nr_cwqs_to_flush[flush_color]);
                                wait = true;
                        }
}
@@ -2594,7 +2594,8 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
                spin_unlock_irq(&gcwq->lock);
        }
 
-       if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+       if (flush_color >= 0 &&
+           atomic_dec_and_test(&wq->nr_cwqs_to_flush[flush_color]))
                complete(&wq->first_flusher->done);
 
return wait;
@@ -3211,6 +3212,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
unsigned int cpu;
size_t namelen;
+   int color;
 
/* determine namelen, allocate wq and format name */
va_start(args, lock_name);
@@ -3239,7 +3241,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
        wq->flags = flags;
        wq->saved_max_active = max_active;
        mutex_init(&wq->flush_mutex);
-       atomic_set(&wq->nr_cwqs_to_flush, 0);
+       for (color = 0; color < WORK_NR_COLORS; color++)
+               atomic_set(&wq->nr_cwqs_to_flush[color], 0);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
 
-- 
1.7.4.4



[PATCH 01/10] workqueue: always pass flush responsibility to next

2012-09-24 Thread Lai Jiangshan
Depriving the next flusher of the flush responsibility makes the code
complex; pass the responsibility to the next flusher unconditionally.

After this change, we don't need to go back and repeat the cascading,
so we use break to exit the loop.

The loop will be removed in a later patch.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   11 ++-
 1 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a59171a..360b7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2740,15 +2740,8 @@ void flush_workqueue(struct workqueue_struct *wq)
 
                list_del_init(&next->list);
                wq->first_flusher = next;
-
-               if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
-                       break;
-
-               /*
-                * Meh... this color is already done, clear first
-                * flusher and repeat cascading.
-                */
-               wq->first_flusher = NULL;
+               flush_workqueue_prep_cwqs(wq, wq->flush_color, -1);
+               break;
}
 
 out_unlock:
-- 
1.7.4.4



[PATCH 06/10] workqueue: start all flusher at the same time

2012-09-24 Thread Lai Jiangshan
Start all flushers at the same time (except the overflowed flushers).

Since we have an nr_cwqs_to_flush array with an entry per color, any
flush color can start at the same time; the colors don't interfere
with each other.  The progress of flushing for any color is the same
as in the old code.

One small difference: a later color can't finish until the previous
color has finished, so the top flusher of the previous color always
holds a reference on the later color.  This is done by having
flush_workqueue_prep_cwqs() take a ref on the flush color it is
starting and not release it; the ownership of this ref is assigned to
the first flusher.  (Thus wq_dec_flusher_ref() is moved out of
flush_workqueue_prep_cwqs().)

When the first flusher finishes its flushing, it releases the ref of
the next color, and then the next color can finish.

Since all flush colors can start at the same time,
cwq_dec_nr_in_flight() must be able to drop the ref of any color, so
it can't use cwq->flush_color; it must compare cwq->work_color with
@color to detect whether flushing of @color has started.
cwq->flush_color is not used any more, so remove it.

For any flush color, flush_workqueue_prep_cwqs() is called only once
(the old code might call it twice).  This saves a big loop (when we
have many CPUs) and avoids touching all gcwqs again.

The wake-up-and-cascade phase becomes simpler and returns quickly.
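
For illustration (not part of the patch), a userspace model of the
per-color reference counting described above; the names mirror the
patch, while the locking and per-CPU iteration are left out:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int nr_cwqs_to_flush;     /* one instance per color in the patch */

static void wq_dec_flusher_ref(void (*complete_first_flusher)(void))
{
        /* the last reference wakes the first flusher */
        if (atomic_fetch_sub(&nr_cwqs_to_flush, 1) == 1)
                complete_first_flusher();
}

static bool workqueue_start_flush_model(int busy_cwqs)
{
        /* base reference, owned by the first flusher */
        atomic_store(&nr_cwqs_to_flush, 1);

        /* one extra reference per cwq that still has works in flight */
        for (int i = 0; i < busy_cwqs; i++)
                atomic_fetch_add(&nr_cwqs_to_flush, 1);

        /*
         * Unlike the old prep function, the base reference is NOT dropped
         * here; the first flusher keeps it and releases it only when it is
         * done with this color, which is what makes a later color wait for
         * an earlier one.
         */
        return busy_cwqs > 0;
}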

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   46 ++
 1 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e5ba08c..803a22c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -196,7 +196,6 @@ struct cpu_workqueue_struct {
        struct worker_pool      *pool;          /* I: the associated pool */
        struct workqueue_struct *wq;            /* I: the owning workqueue */
        int                     work_color;     /* L: current color */
-       int                     flush_color;    /* L: flushing color */
        int                     nr_in_flight[WORK_NR_COLORS];
                                                /* L: nr of in_flight works */
        int                     nr_active;      /* L: nr of active works */
@@ -1031,17 +1030,15 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color)
                cwq_activate_first_delayed(cwq);
        }
 
-       /* is flush in progress and are we at the flushing tip? */
-       if (likely(cwq->flush_color != color))
+       /* is flush in progress? */
+       if (likely(cwq->work_color == color))
                return;
 
        /* are there still in-flight works? */
        if (cwq->nr_in_flight[color])
                return;
 
-       /* this cwq is done, clear flush_color */
-       cwq->flush_color = -1;
-
+       /* this cwq is done, release the flusher ref of the color */
        wq_dec_flusher_ref(cwq->wq, color);
 }
 
@@ -2541,13 +2538,6 @@ static void insert_wq_barrier(struct 
cpu_workqueue_struct *cwq,
  *
  * Prepare cwqs for workqueue flushing.
  *
- * If @flush_color is non-negative, flush_color on all cwqs should be
- * -1.  If no cwq has in-flight commands at the specified color, all
- * cwq-flush_color's stay at -1 and %false is returned.  If any cwq
- * has in flight commands, its cwq-flush_color is set to
- * @flush_color, @wq-nr_cwqs_to_flush[flush_color] is updated accordingly,
- * cwq wakeup logic is armed and %true is returned.
- *
  * The caller should have initialized @wq-first_flusher prior to
  * calling this function with non-negative @flush_color.  If
  * @flush_color is negative, no flush color update is done and %false
@@ -2572,6 +2562,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
 
        if (flush_color >= 0) {
                BUG_ON(atomic_read(&wq->nr_cwqs_to_flush[flush_color]));
+               /* this ref is held by first flusher */
                atomic_set(&wq->nr_cwqs_to_flush[flush_color], 1);
        }
 
@@ -2582,10 +2573,7 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
                spin_lock_irq(&gcwq->lock);
 
                if (flush_color >= 0) {
-                       BUG_ON(cwq->flush_color != -1);
-
                        if (cwq->nr_in_flight[flush_color]) {
-                               cwq->flush_color = flush_color;
                                atomic_inc(&wq->nr_cwqs_to_flush[flush_color]);
                                wait = true;
                        }
@@ -2599,9 +2587,6 @@ static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
                spin_unlock_irq(&gcwq->lock);
        }
 
-       if (flush_color >= 0)
-               wq_dec_flusher_ref(wq, flush_color);
-
return wait;
 }
 
@@ -2623,7 +2608,7 @@ void flush_workqueue(struct workqueue_struct *wq)
.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
};
struct wq_flusher *next, *tmp;
-   int next_color;
+   int flush_color

[PATCH 03/10] workqueue: remove while(true)

2012-09-24 Thread Lai Jiangshan
The loop count is always 1; remove the while (true) and fix the indentation.

This patch doesn't make any functional difference.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   86 +---
 1 files changed, 41 insertions(+), 45 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index acd9e2f..5439fb6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2617,6 +2617,7 @@ void flush_workqueue(struct workqueue_struct *wq)
.flush_color = -1,
.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
};
+   struct wq_flusher *next, *tmp;
int next_color;
 
lock_map_acquire(wq-lockdep_map);
@@ -2686,59 +2687,54 @@ void flush_workqueue(struct workqueue_struct *wq)
BUG_ON(!list_empty(this_flusher.list));
BUG_ON(wq-flush_color != this_flusher.flush_color);
 
-   while (true) {
-   struct wq_flusher *next, *tmp;
-
-   /* complete all the flushers sharing the current flush color */
-   list_for_each_entry_safe(next, tmp, wq-flusher_queue, list) {
-   if (next-flush_color != wq-flush_color)
-   break;
-   list_del_init(next-list);
-   complete(next-done);
-   }
+   /* complete all the flushers sharing the current flush color */
+   list_for_each_entry_safe(next, tmp, wq-flusher_queue, list) {
+   if (next-flush_color != wq-flush_color)
+   break;
+   list_del_init(next-list);
+   complete(next-done);
+   }
 
-   BUG_ON(!list_empty(wq-flusher_overflow) 
-  wq-flush_color != work_next_color(wq-work_color));
+   BUG_ON(!list_empty(wq-flusher_overflow) 
+  wq-flush_color != work_next_color(wq-work_color));
 
-   /* this flush_color is finished, advance by one */
-   wq-flush_color = work_next_color(wq-flush_color);
+   /* this flush_color is finished, advance by one */
+   wq-flush_color = work_next_color(wq-flush_color);
 
-   /* one color has been freed, handle overflow queue */
-   if (!list_empty(wq-flusher_overflow)) {
-   /*
-* Assign the same color to all overflowed
-* flushers, advance work_color and append to
-* flusher_queue.  This is the start-to-wait
-* phase for these overflowed flushers.
-*/
-   list_for_each_entry(tmp, wq-flusher_overflow, list)
-   tmp-flush_color = wq-work_color;
+   /* one color has been freed, handle overflow queue */
+   if (!list_empty(wq-flusher_overflow)) {
+   /*
+* Assign the same color to all overflowed
+* flushers, advance work_color and append to
+* flusher_queue.  This is the start-to-wait
+* phase for these overflowed flushers.
+*/
+   list_for_each_entry(tmp, wq-flusher_overflow, list)
+   tmp-flush_color = wq-work_color;
 
-   wq-work_color = work_next_color(wq-work_color);
+   wq-work_color = work_next_color(wq-work_color);
 
-   list_splice_tail_init(wq-flusher_overflow,
- wq-flusher_queue);
-   flush_workqueue_prep_cwqs(wq, -1, wq-work_color);
-   }
+   list_splice_tail_init(wq-flusher_overflow,
+ wq-flusher_queue);
+   flush_workqueue_prep_cwqs(wq, -1, wq-work_color);
+   }
 
-   if (list_empty(wq-flusher_queue)) {
-   BUG_ON(wq-flush_color != wq-work_color);
-   wq-first_flusher = NULL;
-   break;
-   }
+   if (list_empty(wq-flusher_queue)) {
+   BUG_ON(wq-flush_color != wq-work_color);
+   wq-first_flusher = NULL;
+   goto out_unlock;
+   }
 
-   /*
-* Need to flush more colors.  Make the next flusher
-* the new first flusher and arm cwqs.
-*/
-   BUG_ON(wq-flush_color == wq-work_color);
-   BUG_ON(wq-flush_color != next-flush_color);
+   /*
+* Need to flush more colors.  Make the next flusher
+* the new first flusher and arm cwqs.
+*/
+   BUG_ON(wq-flush_color == wq-work_color);
+   BUG_ON(wq-flush_color != next-flush_color);
 
-   list_del_init(next-list);
-   wq-first_flusher = next;
-   flush_workqueue_prep_cwqs(wq, wq-flush_color, -1);
-   break;
-   }
+   list_del_init(next-list

[PATCH 05/10] workqueue: add wq_dec_flusher_ref()

2012-09-24 Thread Lai Jiangshan
It is a helper to release the reference.

This patch doesn't make any functional difference.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   22 +-
 1 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 861b4c7..e5ba08c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -995,6 +995,16 @@ static void cwq_activate_first_delayed(struct 
cpu_workqueue_struct *cwq)
cwq_activate_delayed_work(work);
 }
 
+static void wq_dec_flusher_ref(struct workqueue_struct *wq, int color)
+{
+       /*
+        * If this was the last reference, wake up the first flusher.
+        * It will handle the rest.
+        */
+       if (atomic_dec_and_test(&wq->nr_cwqs_to_flush[color]))
+               complete(&wq->first_flusher->done);
+}
+
 /**
  * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
  * @cwq: cwq of interest
@@ -1032,12 +1042,7 @@ static void cwq_dec_nr_in_flight(struct 
cpu_workqueue_struct *cwq, int color)
        /* this cwq is done, clear flush_color */
        cwq->flush_color = -1;
 
-       /*
-        * If this was the last cwq, wake up the first flusher.  It
-        * will handle the rest.
-        */
-       if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush[color]))
-               complete(&cwq->wq->first_flusher->done);
+       wq_dec_flusher_ref(cwq->wq, color);
 }
 
 /**
@@ -2594,9 +2599,8 @@ static bool flush_workqueue_prep_cwqs(struct 
workqueue_struct *wq,
                spin_unlock_irq(&gcwq->lock);
        }
 
-       if (flush_color >= 0 &&
-           atomic_dec_and_test(&wq->nr_cwqs_to_flush[flush_color]))
-               complete(&wq->first_flusher->done);
+       if (flush_color >= 0)
+               wq_dec_flusher_ref(wq, flush_color);
 
return wait;
 }
-- 
1.7.4.4



[PATCH 02/10] workqueue: remove unneeded check

2012-09-24 Thread Lai Jiangshan
Since we always pass the flush responsibility to the next flusher and
never deprive it, ->first_flusher can now only be changed by the
first flusher itself; nobody else can change it, so there is no race
as described in 4ce48b37.

Remove the check introduced by 4ce48b37 and use BUG_ON() instead.

Also set wq->first_flusher = NULL later.

This patch doesn't make any functional difference.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |8 ++--
 1 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 360b7e2..acd9e2f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2682,12 +2682,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 
        mutex_lock(&wq->flush_mutex);
 
-       /* we might have raced, check again with mutex held */
-       if (wq->first_flusher != &this_flusher)
-               goto out_unlock;
-
-       wq->first_flusher = NULL;
-
+       BUG_ON(wq->first_flusher != &this_flusher);
        BUG_ON(!list_empty(&this_flusher.list));
        BUG_ON(wq->flush_color != this_flusher.flush_color);
 
@@ -2728,6 +2723,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 
        if (list_empty(&wq->flusher_queue)) {
                BUG_ON(wq->flush_color != wq->work_color);
+               wq->first_flusher = NULL;
break;
}
 
-- 
1.7.4.4



Re: [PATCH 00/10] workqueue: restructure flush_workqueue() and start all flusher at the same time

2012-09-25 Thread Lai Jiangshan
On 09/25/2012 04:39 AM, Tejun Heo wrote:
 Hello, Lai.
 
 On Mon, Sep 24, 2012 at 06:07:02PM +0800, Lai Jiangshan wrote:
 The core patch is patch6, it makes all flusher can start and the same time
 and allow us do more cleanup.

 Only patch1 and patch6 change the behavior of the code.
 All other patches do not change any behavior.
 
 It would have been nice if you described what this patchset tries to
 achieve how in the head message.
 
 I don't see anything wrong with the patchset but flush_workqueue() is
 quite hairy before this patchset and I'm not sure the situation
 improves a lot afterwards.  The current code is known / verified to
 work for quite some time and I'd *much* prefer to keep it stable
 unless it can be vastly simpler.

Hi, Tejun

I know your attitude; it is OK if you reject it.

I find flush_workqueue() unnatural, especially the usage of the
colors and flush_workqueue_prep_cwqs(), so I tried to improve it
without changing too many things/behaviors.

(This patchset delays my other simple patches; I think I should send
the simple patches first.)

 
 I do like the removal of explicit cascading and would have gone that
 direction if this code is just being implemented but I'm quite
 skeptical whether changing over to that now is justifiable.  Flush
 bugs tend to be nasty and often difficult to track down.
 
 I'll think more about it.  How confident are you about the change?
 How did you test them?  For changes like this, it usually helps a lot
 to describe how things were tested as part of head and/or commit
 messages.
 

I always check the code by careful review.  I try to imagine many
threads running in my head in arbitrary order and write down all the
possible transitions on paper.  This process is the most important;
no test can replace it.

But a human brain can be wrong, so the attached patch is my testing
code.  It verifies flush_workqueue() with cookie numbers.

Type "make test" to start the test.
Type Ctrl-C to stop the test.

I also need your testing code for workqueue. ^_^

Thanks,
Lai

diff --git a/workqueue_flush_test/Makefile b/workqueue_flush_test/Makefile
new file mode 100644
index 000..3ecc7aa
--- /dev/null
+++ b/workqueue_flush_test/Makefile
@@ -0,0 +1,10 @@
+obj-m += wflush.o
+
+all:
+   make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
+
+clean:
+   make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean
+
+test: all
+   bash ./test.sh
diff --git a/workqueue_flush_test/test.sh b/workqueue_flush_test/test.sh
new file mode 100644
index 000..a9d2d6a
--- /dev/null
+++ b/workqueue_flush_test/test.sh
@@ -0,0 +1,12 @@
+#/bin/bash
+
+make
+sync
+sync
+echo testing...
+trap 'echo interrupt test' INT
+sudo insmod wflush.ko
+sleep 6
+sudo rmmod wflush.ko
+echo test done
+
diff --git a/workqueue_flush_test/wflush.c b/workqueue_flush_test/wflush.c
new file mode 100644
index 000..971e73d
--- /dev/null
+++ b/workqueue_flush_test/wflush.c
@@ -0,0 +1,129 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <linux/delay.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/random.h>
+
+struct cookie_struct {
+   struct list_head head;
+   unsigned long cookie;
+};
+
+static DEFINE_MUTEX(cookie_lock);
+static unsigned long test_cookie;
+static LIST_HEAD(cookie_head);
+
+static unsigned long last_cookie(void)
+{
+   unsigned long cookie;
+
+       mutex_lock(&cookie_lock);
+       cookie = test_cookie - 1; /* c->cookie = test_cookie++; */
+       mutex_unlock(&cookie_lock);
+
+   return cookie;
+}
+
+static unsigned long tip_cookie(void)
+{
+   unsigned long cookie;
+
+       mutex_lock(&cookie_lock);
+       if (list_empty(&cookie_head))
+               cookie = test_cookie;
+       else
+               cookie = list_first_entry(&cookie_head, struct cookie_struct,
+                                         head)->cookie;
+       mutex_unlock(&cookie_lock);
+
+   return cookie;
+}
+
+struct test_work {
+   struct work_struct w;
+   struct cookie_struct c;
+};
+
+static void add_test_work(struct test_work *t)
+{
+       struct cookie_struct *c = &t->c;
+
+       mutex_lock(&cookie_lock);
+       c->cookie = test_cookie++;
+       BUG_ON(!list_empty(&c->head));
+       list_add_tail(&c->head, &cookie_head);
+       schedule_work(&t->w);
+       mutex_unlock(&cookie_lock);
+
+       udelay(1 + random32() % 50);
+}
+
+static void test_work_fn(struct work_struct *w)
+{
+       struct test_work *t = container_of(w, struct test_work, w);
+
+       mutex_lock(&cookie_lock);
+       list_del_init(&t->c.head);
+       mutex_unlock(&cookie_lock);
+
+       udelay(1 + random32() % 50);
+}
+
+static int test_thread(void *arg)
+{
+   unsigned long lcookie, tcookie;
+   struct test_work t[10];
+   int i;
+
+       for (i = 0; i < ARRAY_SIZE(t); i++) {
+               INIT_WORK_ONSTACK(&t[i].w, test_work_fn);
+               INIT_LIST_HEAD(&t[i].c.head);
+       }
+
+       do {
+               for (i = 0; i < ARRAY_SIZE(t); i

Re: [PATCH 00/10] workqueue: restructure flush_workqueue() and start all flusher at the same time

2012-09-25 Thread Lai Jiangshan
On 09/25/2012 04:39 AM, Tejun Heo wrote:
 
 I do like the removal of explicit cascading and would have gone that
 direction if this code is just being implemented but I'm quite
 skeptical whether changing over to that now is justifiable.  Flush
 bugs tend to be nasty and often difficult to track down.
 

Hi, Tejun

I know your attitude, it is OK if you reject it.

It is not possible to remove cascading.  If the cascading code is not
in flush_workqueue(), it must be somewhere else.

If you force overflow flushers to wait for a freed color before doing
the flush (which also forces only one flusher per color), and force
the sole flush_workqueue() to grab ->flush_mutex twice, we can
simplify flush_workqueue().  (See the attached patch; it removes
100 LOC, and the cascading code becomes only 3 LOC.)  But these two
constraints slow down the caller a little.

(And if you allow SRCU to be used (which has only TWO colors), you can
remove another 150 LOC and flush_workqueue() becomes a single line.
But it will add some more overhead in flush_workqueue() because
SRCU's read side is lockless.)

Thanks,
Lai

This patch is applied on top of patch 7; it replaces patches 8-10.

 workqueue.c |  168 ++--
 1 file changed, 30 insertions(+), 138 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index be407e1..bff0ae0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -204,15 +204,6 @@ struct cpu_workqueue_struct {
 };
 
 /*
- * Structure used to wait for workqueue flush.
- */
-struct wq_flusher {
-   struct list_headlist;   /* F: list of flushers */
-   int flush_color;/* F: flush color waiting for */
-   struct completion   done;   /* flush completion */
-};
-
-/*
  * All cpumasks are assumed to be always set on UP and thus can't be
  * used to determine whether there's something to be done.
  */
@@ -250,9 +241,8 @@ struct workqueue_struct {
int work_color; /* F: current work color */
int flush_color;/* F: current flush color */
atomic_tnr_cwqs_to_flush[WORK_NR_COLORS];
-   struct wq_flusher   *first_flusher; /* F: first flusher */
-   struct list_headflusher_queue;  /* F: flush waiters */
-   struct list_headflusher_overflow; /* F: flush overflow list */
+   struct completion   *flusher[WORK_NR_COLORS]; /* F: flusers */
+   wait_queue_head_t   flusher_overflow; /* flush overflow queue */
 
mayday_mask_t   mayday_mask;/* cpus requesting rescue */
struct worker   *rescuer;   /* I: rescue worker */
@@ -1001,7 +991,7 @@ static void wq_dec_flusher_ref(struct workqueue_struct 
*wq, int color)
 * It will handle the rest.
 */
        if (atomic_dec_and_test(&wq->nr_cwqs_to_flush[color]))
-               complete(&wq->first_flusher->done);
+               complete(wq->flusher[color]);
 }
 
 /**
@@ -2540,27 +2530,20 @@ static void insert_wq_barrier(struct 
cpu_workqueue_struct *cwq,
  * becomes new flush_color and work_color is advanced by one.
  * All cwq's work_color are set to new work_color(advanced by one).
  *
- * The caller should have initialized @wq-first_flusher prior to
- * calling this function.
- *
  * CONTEXT:
  * mutex_lock(wq-flush_mutex).
- *
- * RETURNS:
- * %true if there's some cwqs to flush.  %false otherwise.
  */
-static bool workqueue_start_flush(struct workqueue_struct *wq)
+static void workqueue_start_flush(struct workqueue_struct *wq)
 {
int flush_color = wq-work_color;
int next_color = work_next_color(wq-work_color);
-   bool wait = false;
unsigned int cpu;
 
BUG_ON(next_color == wq-flush_color);
wq-work_color = next_color;
 
BUG_ON(atomic_read(wq-nr_cwqs_to_flush[flush_color]));
-   /* this ref is held by first flusher */
+   /* this ref is held by previous flusher */
atomic_set(wq-nr_cwqs_to_flush[flush_color], 1);
 
for_each_cwq_cpu(cpu, wq) {
@@ -2569,18 +2552,14 @@ static bool workqueue_start_flush(struct 
workqueue_struct *wq)
 
spin_lock_irq(gcwq-lock);
 
-   if (cwq-nr_in_flight[flush_color]) {
+   if (cwq-nr_in_flight[flush_color])
atomic_inc(wq-nr_cwqs_to_flush[flush_color]);
-   wait = true;
-   }
 
BUG_ON(next_color != work_next_color(cwq-work_color));
cwq-work_color = next_color;
 
spin_unlock_irq(gcwq-lock);
}
-
-   return wait;
 }
 
 /**
@@ -2595,127 +2574,41 @@ static bool workqueue_start_flush(struct 
workqueue_struct *wq)
  */
 void flush_workqueue(struct workqueue_struct *wq)
 {
-   struct wq_flusher this_flusher = {
-   .list = LIST_HEAD_INIT(this_flusher.list),
-   .flush_color = -1,
-   .done = 

Re: [PATCH 00/10] workqueue: restructure flush_workqueue() and start all flusher at the same time

2012-09-25 Thread Lai Jiangshan
On 09/26/2012 04:24 AM, Tejun Heo wrote:
 Hello, Lai.
 
 On Tue, Sep 25, 2012 at 05:02:43PM +0800, Lai Jiangshan wrote:
 It is not possible to remove cascading. If cascading code is
 not in flush_workqueue(), it must be in some where else.
 
 Yeah, sure, I liked that it didn't have to be done explicitly as a
 separate step.
 
 If you force overflow to wait for freed color before do flush(which also
 force only one flusher for one color), and force the sole flush_workqueue()
 to grab -flush_mutex twice, we can simplify the flush_workqueue().
 (see the attached patch, it remove 100 LOC, and the cascading code becomes
 only 3 LOC). But these two forcing slow down the caller a little.
 
 Hmmm... so, that's a lot simpler.  flush_workqueue() isn't a super-hot
 code path and I don't think grabbing mutex twice is too big a deal.  I
 haven't actually reviewed the code but if it can be much simpler and
 thus easier to understand and verify, I might go for that.

I updated it; the new version is attached.  It forces flush_workqueue()
to grab the mutex twice (no other forcing).  The overflow queue is
implemented in a different way.  This new algorithm will likely become
our choice; please review this one.

 
 (And if you allow to use SRCU(which is only TWO colors), you can remove 
 another
 150 LOC. flush_workqueue() will become single line. But it will add some 
 more overhead
 in flush_workqueue() because SRCU's readsite is lockless)
 
 I'm not really following how SRCU would factor into this but
 supporting multiple colors was something explicitly requested by
 Linus.  The initial implementation was a lot simpler which supported
 only two colors.  Linus was worried that the high possibility of
 flusher clustering could lead to chaining of latencies.
 

I did not know this history, thank you.

But the number of colors is not what is essential; whether the
algorithm chains flushers is what is essential.

If we can have multiple flushers for each color, it is not chained.
If we have only one flusher per color, it is chained.  Even with
multiple colors it is still partially chained (imagine very frequent
flush_workqueue() calls).

The initial implementation of flush_workqueue() is a chained
algorithm.  The initial implementation of SRCU is also a chained
algorithm, but the current SRCU, which was implemented by me, is not
chained.  (I don't propose to use SRCU for flush_workqueue(); I just
discuss it.)

The simple version of flush_workqueue() which I sent yesterday is
chained, because it forces overflow flushers to wait for a free color
and forces only one flusher per color.

Since not chaining is important/essential, I sent a new draft
implementation today.  It uses multiple queues, one for each color
(like SRCU).  This version is also simple; it removes 90 LOC.

Thanks,
Lai

This patch is still applied on top of patch 7; it replaces patches 8-10.

 workqueue.c |  152
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index be407e1..00f02ba 100644

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -208,7 +208,6 @@ struct cpu_workqueue_struct {
  */
 struct wq_flusher {
struct list_headlist;   /* F: list of flushers */
-   int flush_color;/* F: flush color waiting for */
struct completion   done;   /* flush completion */
 };
 
@@ -250,9 +249,7 @@ struct workqueue_struct {
int work_color; /* F: current work color */
int flush_color;/* F: current flush color */
atomic_tnr_cwqs_to_flush[WORK_NR_COLORS];
-   struct wq_flusher   *first_flusher; /* F: first flusher */
-   struct list_headflusher_queue;  /* F: flush waiters */
-   struct list_headflusher_overflow; /* F: flush overflow list */
+   struct list_headflusher[WORK_NR_COLORS]; /* F: flushers */
 
mayday_mask_t   mayday_mask;/* cpus requesting rescue */
struct worker   *rescuer;   /* I: rescue worker */
@@ -1000,8 +997,11 @@ static void wq_dec_flusher_ref(struct workqueue_struct 
*wq, int color)
 * If this was the last reference, wake up the first flusher.
 * It will handle the rest.
 */
-       if (atomic_dec_and_test(&wq->nr_cwqs_to_flush[color]))
-               complete(&wq->first_flusher->done);
+       if (atomic_dec_and_test(&wq->nr_cwqs_to_flush[color])) {
+               BUG_ON(color != wq->flush_color);
+               complete(&list_first_entry(&wq->flusher[color],
+                                          struct wq_flusher, list)->done);
+       }
 }
 
 /**
@@ -2540,27 +2540,20 @@ static void insert_wq_barrier(struct 
cpu_workqueue_struct *cwq,
  * becomes new flush_color and work_color is advanced by one.
  * All cwq's work_color are set to new work_color(advanced by one).
  *
- * The caller should have initialized @wq-first_flusher prior to
- * calling this function.
- *
  * CONTEXT:
  * mutex_lock(wq-flush_mutex

[PATCH 00/12] workqueue: simple cleanups

2012-09-26 Thread Lai Jiangshan
These are all different cleanups for workqueue.

depends:
Patch2 depends on Patch1
Patch3 depends on Patch1
Patch7 depends on Patch6

Patch7 needs to be merged after Patch3 is merged (not a real dependency).

Lai Jiangshan (12):
  workqueue: add WORKER_RESCUER
  workqueue: disallow set_cpus_allowed_ptr() from work item
  workqueue: remove WORKER_PREP from rescuer
  workqueue: simplify is_chained_work()
  workqueue: don't wake up other workers in rescuer
  workqueue: destroy_worker() can only destroy idle worker not just
created worker
  workqueue: remove WORKER_STARTED
  workqueue: fix comments of insert_work()
  workqueue: declare system_highpri_wq
  cpu-hotplug.txt: fix comments of work_on_cpu()
  workqueue: add WQ_CPU_INTENSIVE to system_long_wq
  workqueue: avoid work_on_cpu() to interfere system_wq

 Documentation/cpu-hotplug.txt |2 +-
 include/linux/workqueue.h |4 ++
 kernel/sched/core.c   |8 ++-
 kernel/workqueue.c|   82 +
 4 files changed, 44 insertions(+), 52 deletions(-)

-- 
1.7.7.6



[PATCH 05/12] workqueue: don't wake up other workers in rescuer

2012-09-26 Thread Lai Jiangshan
The rescuer is NOT_RUNNING, so it makes no sense for it to wake up
other workers; if normal workers are available, they are already woken
up when needed.
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |8 
 1 files changed, 0 insertions(+), 8 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c718b94..6c339bf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2438,14 +2438,6 @@ repeat:
 
process_scheduled_works(rescuer);
 
-               /*
-                * Leave this gcwq.  If keep_working() is %true, notify a
-                * regular worker; otherwise, we end up with 0 concurrency
-                * and stalling the execution.
-                */
-               if (keep_working(pool))
-                       wake_up_worker(pool);
-
                spin_unlock_irq(&gcwq->lock);
}
 
-- 
1.7.7.6



[RFC PATCH 11/12] workqueue: add WQ_CPU_INTENSIVE to system_long_wq

2012-09-26 Thread Lai Jiangshan
Works queued on system_long_wq may run for a long time.  Add
WQ_CPU_INTENSIVE to system_long_wq so that these kinds of works do not
occupy the running workers and delay the normal works.

If system_long_wq is designed only for works that sleep for a long
time, not works that run for a long time, this patch makes no sense.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 89fd1b2..ccb1d60 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3850,7 +3850,7 @@ static int __init init_workqueues(void)
 
        system_wq = alloc_workqueue("events", 0, 0);
        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
-       system_long_wq = alloc_workqueue("events_long", 0, 0);
+       system_long_wq = alloc_workqueue("events_long", WQ_CPU_INTENSIVE, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_UNBOUND_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
-- 
1.7.7.6



[PATCH 06/12] workqueue: destroy_worker() can only destroy idle worker not just created worker

2012-09-26 Thread Lai Jiangshan
After we reimplement the hotplug code, we will no longer destroy
just-created workers; all created workers will enter idle soon.
destroy_worker() is not used to destroy just-created workers, it
always destroys idle workers, so fix destroy_worker() and update the
comments accordingly.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   14 +++---
 1 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6c339bf..fe3b1d3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1756,8 +1756,7 @@ static struct worker *alloc_worker(void)
  * @pool: pool the new worker will belong to
  *
  * Create a new worker which is bound to @pool.  The returned worker
- * can be started by calling start_worker() or destroyed using
- * destroy_worker().
+ * can be started by calling start_worker().
  *
  * CONTEXT:
  * Might sleep.  Does GFP_KERNEL allocations.
@@ -1847,7 +1846,7 @@ static void start_worker(struct worker *worker)
 }
 
 /**
- * destroy_worker - destroy a workqueue worker
+ * destroy_worker - destroy an idle workqueue worker
  * @worker: worker to be destroyed
  *
  * Destroy @worker and adjust @gcwq stats accordingly.
@@ -1864,11 +1863,12 @@ static void destroy_worker(struct worker *worker)
        /* sanity check frenzy */
        BUG_ON(worker->current_work);
        BUG_ON(!list_empty(&worker->scheduled));
+       BUG_ON(!(worker->flags & WORKER_STARTED));
+       BUG_ON(!(worker->flags & WORKER_IDLE));
+       BUG_ON(list_empty(&worker->entry));
 
-       if (worker->flags & WORKER_STARTED)
-               pool->nr_workers--;
-       if (worker->flags & WORKER_IDLE)
-               pool->nr_idle--;
+       pool->nr_workers--;
+       pool->nr_idle--;
 
        list_del_init(&worker->entry);
        worker->flags |= WORKER_DIE;
-- 
1.7.7.6



[PATCH 10/12] cpu-hotplug.txt: fix comments of work_on_cpu()

2012-09-26 Thread Lai Jiangshan
This is a tiny fix of the comments on work_on_cpu(), which was changed
back to use a workqueue and can no longer fail; it always runs at some time.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 Documentation/cpu-hotplug.txt |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/Documentation/cpu-hotplug.txt b/Documentation/cpu-hotplug.txt
index 66ef8f3..d19c5fd 100644
--- a/Documentation/cpu-hotplug.txt
+++ b/Documentation/cpu-hotplug.txt
@@ -317,7 +317,7 @@ Q: I need to ensure that a particular cpu is not removed 
when there is some
work specific to this cpu is in progress.
 A: There are two ways.  If your code can be run in interrupt context, use
smp_call_function_single(), otherwise use work_on_cpu().  Note that
-   work_on_cpu() is slow, and can fail due to out of memory:
+   work_on_cpu() is slow:
 
int my_func_on_cpu(int cpu)
{
-- 
1.7.7.6



[PATCH 03/12] workqueue: remove WORKER_PREP from rescuer

2012-09-26 Thread Lai Jiangshan
There is no reason for the rescuer to use WORKER_PREP; remove it from
the rescuer.

And there is no reason to set it so early in alloc_worker(); move
worker->flags = WORKER_PREP to start_worker().

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c55884d..e41c562 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1759,8 +1759,6 @@ static struct worker *alloc_worker(void)
                INIT_LIST_HEAD(&worker->entry);
                INIT_LIST_HEAD(&worker->scheduled);
                INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
-               /* on creation a worker is in !idle && prep state */
-               worker->flags = WORKER_PREP;
}
return worker;
 }
@@ -1854,6 +1852,7 @@ fail:
 static void start_worker(struct worker *worker)
 {
        worker->flags |= WORKER_STARTED;
+       worker->flags |= WORKER_PREP;
        worker->pool->nr_workers++;
worker_enter_idle(worker);
wake_up_process(worker-task);
-- 
1.7.7.6



[PATCH 12/12] workqueue: avoid work_on_cpu() to interfere system_wq

2012-09-26 Thread Lai Jiangshan
We can't predict how long the work passed to work_on_cpu() will run,
so move it to system_long_wq to avoid interfering with system_wq.

Note:
The initial implementation (2d3854a3) of work_on_cpu() used its own workqueue.
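
For illustration, a hedged usage sketch of the call being moved (the
helper and data names below are made up; only the work_on_cpu()
signature comes from the hunk):

static long read_local_counter(void *arg)
{
        unsigned long *sum = arg;

        /* runs in a worker bound to the requested CPU */
        *sum = 42;      /* placeholder for a per-CPU read */
        return 0;
}

static void example_caller(unsigned int cpu)
{
        unsigned long sum;

        /* sleeps until the work has executed on @cpu */
        work_on_cpu(cpu, read_local_counter, &sum);
}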

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccb1d60..c14d94c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3638,7 +3638,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
        struct work_for_cpu wfc = { .fn = fn, .arg = arg };
 
        INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn);
-       schedule_work_on(cpu, &wfc.work);
+       queue_work_on(cpu, system_long_wq, &wfc.work);
        flush_work(&wfc.work);
return wfc.ret;
 }
-- 
1.7.7.6



[PATCH 09/12] workqueue: declare system_highpri_wq

2012-09-26 Thread Lai Jiangshan
system_highpri_wq is missing from workqueue.h; add it back.
Also add a short comment for it.

CC: Joonsoo Kim js1...@gmail.com
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/workqueue.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 2b58905..68b1d2a 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -294,6 +294,9 @@ enum {
  * short queue flush time.  Don't queue works which can run for too
  * long.
  *
+ * system_highpri_wq is similar to system_wq but services for urgent works
+ * and works will be processed in high priority workers.
+ *
  * system_long_wq is similar to system_wq but may host long running
  * works.  Queue flushing might take relatively long.
  *
@@ -306,6 +309,7 @@ enum {
  * freezable.
  */
 extern struct workqueue_struct *system_wq;
+extern struct workqueue_struct *system_highpri_wq;
 extern struct workqueue_struct *system_long_wq;
 extern struct workqueue_struct *system_unbound_wq;
 extern struct workqueue_struct *system_freezable_wq;
-- 
1.7.7.6



[PATCH 08/12] workqueue: fix comments of insert_work()

2012-09-26 Thread Lai Jiangshan
This comment is important for understanding CMWQ; fix the stale function name
so it does not confuse new reviewers reading workqueue.c for the first time.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d37f446..89fd1b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1169,7 +1169,7 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
list_add_tail(work-entry, head);
 
/*
-* Ensure either worker_sched_deactivated() sees the above
+* Ensure either wq_worker_sleeping() sees the above
 * list_add_tail() or we see zero nr_running to avoid workers
 * lying around lazily while there are works to be processed.
 */
-- 
1.7.7.6



[PATCH 07/12] workqueue: remove WORKER_STARTED

2012-09-26 Thread Lai Jiangshan
All newly created workers enter idle soon after being started, so
WORKER_STARTED is not used any more; remove it.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |3 ---
 1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fe3b1d3..d37f446 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -69,7 +69,6 @@ enum {
POOL_MANAGING_WORKERS   = 1  1,   /* managing workers */
 
/* worker flags */
-   WORKER_STARTED  = 1  0,   /* started */
WORKER_DIE  = 1  1,   /* die die die */
WORKER_IDLE = 1  2,   /* is idle */
WORKER_PREP = 1  3,   /* preparing to run works */
@@ -1838,7 +1837,6 @@ fail:
  */
 static void start_worker(struct worker *worker)
 {
-   worker-flags |= WORKER_STARTED;
worker-flags |= WORKER_PREP;
worker-pool-nr_workers++;
worker_enter_idle(worker);
@@ -1863,7 +1861,6 @@ static void destroy_worker(struct worker *worker)
/* sanity check frenzy */
BUG_ON(worker-current_work);
BUG_ON(!list_empty(worker-scheduled));
-   BUG_ON(!(worker-flags  WORKER_STARTED));
BUG_ON(!(worker-flags  WORKER_IDLE));
BUG_ON(list_empty(worker-entry));
 
-- 
1.7.7.6



[PATCH 01/12] workqueue: add WORKER_RESCUER

2012-09-26 Thread Lai Jiangshan
The rescuer thread must be a worker which is WORKER_NOT_RUNNING: if it were
*not* WORKER_NOT_RUNNING, it would increase nr_running and wrongly disable the
normal workers.

So the rescuer thread must be WORKER_NOT_RUNNING.

Currently the code implements this by always setting WORKER_PREP on the rescuer
thread, but this kind of implementation is ugly:
A)  It reuses WORKER_PREP, which is meant for a different purpose.
B)  It does not tell us that the rescuer thread is WORKER_NOT_RUNNING.

So we add WORKER_RESCUER to fix these two semantic problems.
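
A simplified sketch of the accounting rule being discussed (not the actual
workqueue code): a worker only counts toward the pool's nr_running when none of
the WORKER_NOT_RUNNING bits are set, which is why the rescuer must always carry
one of them.

    static bool worker_counts_as_running(struct worker *worker)
    {
        /* any WORKER_NOT_RUNNING bit excludes the worker from nr_running */
        return !(worker->flags & WORKER_NOT_RUNNING);
    }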

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 737ab01..ec882a6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -73,11 +73,12 @@ enum {
WORKER_DIE  = 1  1,   /* die die die */
WORKER_IDLE = 1  2,   /* is idle */
WORKER_PREP = 1  3,   /* preparing to run works */
+   WORKER_RESCUER  = 1  4,   /* rescuer thread */
WORKER_CPU_INTENSIVE= 1  6,   /* cpu intensive */
WORKER_UNBOUND  = 1  7,   /* worker is unbound */
 
WORKER_NOT_RUNNING  = WORKER_PREP | WORKER_UNBOUND |
- WORKER_CPU_INTENSIVE,
+ WORKER_RESCUER | WORKER_CPU_INTENSIVE,
 
NR_WORKER_POOLS = 2,/* # worker pools per gcwq */
 
@@ -2405,6 +2406,7 @@ static int rescuer_thread(void *__wq)
bool is_unbound = wq-flags  WQ_UNBOUND;
unsigned int cpu;
 
+   rescuer-flags |= WORKER_RESCUER;
set_user_nice(current, RESCUER_NICE_LEVEL);
 repeat:
set_current_state(TASK_INTERRUPTIBLE);
-- 
1.7.7.6



[PATCH 04/12] workqueue: simplify is_chained_work()

2012-09-26 Thread Lai Jiangshan
is_chained_work() is too complicated. We can simply find out whether the
current task is a worker via PF_WQ_WORKER or wq->rescuer.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   36 
 1 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e41c562..c718b94 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1182,34 +1182,22 @@ static void insert_work(struct cpu_workqueue_struct 
*cwq,
 
 /*
  * Test whether @work is being queued from another work executing on the
- * same workqueue.  This is rather expensive and should only be used from
- * cold paths.
+ * same workqueue.
  */
 static bool is_chained_work(struct workqueue_struct *wq)
 {
-   unsigned long flags;
-   unsigned int cpu;
+   struct worker *worker = NULL;
 
-   for_each_gcwq_cpu(cpu) {
-   struct global_cwq *gcwq = get_gcwq(cpu);
-   struct worker *worker;
-   struct hlist_node *pos;
-   int i;
+   if (wq-rescuer  current == wq-rescuer-task) /* rescuer_thread() */
+   worker = wq-rescuer;
+   else if (current-flags  PF_WQ_WORKER) /* worker_thread() */
+   worker = kthread_data(current);
 
-   spin_lock_irqsave(gcwq-lock, flags);
-   for_each_busy_worker(worker, i, pos, gcwq) {
-   if (worker-task != current)
-   continue;
-   spin_unlock_irqrestore(gcwq-lock, flags);
-   /*
-* I'm @worker, no locking necessary.  See if @work
-* is headed to the same workqueue.
-*/
-   return worker-current_cwq-wq == wq;
-   }
-   spin_unlock_irqrestore(gcwq-lock, flags);
-   }
-   return false;
+   /*
+* I'm @worker, no locking necessary.  See if @work
+* is headed to the same workqueue.
+*/
+   return worker  worker-current_cwq-wq == wq;
 }
 
 static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
@@ -1231,7 +1219,7 @@ static void __queue_work(unsigned int cpu, struct 
workqueue_struct *wq,
 
debug_work_activate(work);
 
-   /* if dying, only works from the same workqueue are allowed */
+   /* if draining, only works from the same workqueue are allowed */
if (unlikely(wq-flags  WQ_DRAINING) 
WARN_ON_ONCE(!is_chained_work(wq)))
return;
-- 
1.7.7.6



[PATCH 02/12] workqueue: disallow set_cpus_allowed_ptr() from work item

2012-09-26 Thread Lai Jiangshan
Workers depend on local wake-ups; if a work function changes its CPU, it will
corrupt the workqueue, so disallow this behavior.

When set_cpus_allowed_ptr() is called from workqueue.c in worker_thread(), we
clear PF_WQ_WORKER before set_cpus_allowed_ptr() and set it back afterwards.
(The rescuer thread has no PF_WQ_WORKER, so skip this for it.)

It prevents other/future BUGs like
https://bugzilla.kernel.org/show_bug.cgi?id=47301.
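
For illustration only, this is the kind of (buggy) work function the new check
rejects; the function name is made up:

    #include <linux/cpumask.h>
    #include <linux/sched.h>
    #include <linux/workqueue.h>

    static void bad_work_fn(struct work_struct *work)
    {
        /*
         * A work item runs on a worker bound to one CPU; migrating the
         * worker breaks the pool's local wake-up assumption.  With this
         * patch, on a CPU-bound worker the call below triggers
         * WARN_ON_ONCE() and fails with -EINVAL.
         */
        set_cpus_allowed_ptr(current, cpumask_of(0));
    }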

CC: tangchen tangc...@cn.fujitsu.com
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/sched/core.c |8 +---
 kernel/workqueue.c  |   10 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d325c4b..355d3cc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5187,9 +5187,11 @@ int set_cpus_allowed_ptr(struct task_struct *p, const 
struct cpumask *new_mask)
goto out;
}
 
-   if (unlikely((p-flags  PF_THREAD_BOUND)  p != current)) {
-   ret = -EINVAL;
-   goto out;
+   if (unlikely(p-flags  PF_THREAD_BOUND)) {
+   if (WARN_ON_ONCE(p-flags  PF_WQ_WORKER) || p != current) {
+   ret = -EINVAL;
+   goto out;
+   }
}
 
do_set_cpus_allowed(p, new_mask);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ec882a6..c55884d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1600,6 +1600,7 @@ __acquires(gcwq-lock)
struct global_cwq *gcwq = worker-pool-gcwq;
struct task_struct *task = worker-task;
 
+   BUG_ON(task != current);
while (true) {
/*
 * The following call may fail, succeed or succeed
@@ -1607,9 +1608,16 @@ __acquires(gcwq-lock)
 * it races with cpu hotunplug operation.  Verify
 * against GCWQ_DISASSOCIATED.
 */
-   if (!(gcwq-flags  GCWQ_DISASSOCIATED))
+   if (!(gcwq-flags  GCWQ_DISASSOCIATED)) {
+   if (!(worker-flags  WORKER_RESCUER))
+   task-flags = ~PF_WQ_WORKER;
+
set_cpus_allowed_ptr(task, get_cpu_mask(gcwq-cpu));
 
+   if (!(worker-flags  WORKER_RESCUER))
+   task-flags |= PF_WQ_WORKER;
+   }
+
spin_lock_irq(gcwq-lock);
if (gcwq-flags  GCWQ_DISASSOCIATED)
return false;
-- 
1.7.7.6



[PATCH 0/3] memory_hotplug: fix memory hotplug bug

2012-09-27 Thread Lai Jiangshan
We found 3 bugs while testing and developing memory hotplug.

PATCH 1~2: the old code does not handle node_states[N_NORMAL_MEMORY] correctly,
which can corrupt memory.

PATCH 3: move the modification of zone_start_pfn under the corresponding lock.

CC: Rob Landley r...@landley.net
CC: Andrew Morton a...@linux-foundation.org
CC: Jiang Liu jiang@huawei.com
CC: Jianguo Wu wujian...@huawei.com
CC: Kay Sievers kay.siev...@vrfy.org
CC: Greg Kroah-Hartman gre...@suse.de
CC: Xishi Qiu qiuxi...@huawei.com
CC: Mel Gorman mgor...@suse.de
CC: linux-...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux...@kvack.org

Lai Jiangshan (3):
  memory_hotplug: fix missing nodemask management
  slub, hotplug: ignore unrelated node's hot-adding and hot-removing
  memory,hotplug: Don't modify the zone_start_pfn outside of
zone_span_writelock()

 Documentation/memory-hotplug.txt |5 ++-
 include/linux/memory.h   |1 +
 mm/memory_hotplug.c  |   96 +++---
 mm/page_alloc.c  |3 +-
 mm/slub.c|4 +-
 5 files changed, 87 insertions(+), 22 deletions(-)

-- 
1.7.4.4



[PATCH 2/3] slub, hotplug: ignore unrelated node's hot-adding and hot-removing

2012-09-27 Thread Lai Jiangshan
SLUB only focuses on the nodes which have normal memory, so ignore the other
nodes' hot-adding and hot-removing.

In other words: if some memory of a node (which has no onlined memory) is
onlined, but this newly onlined memory is not normal memory (HIGH memory, for
example), we should not allocate a kmem_cache_node for SLUB.

And if the last normal memory is offlined but the node still has memory, we
should remove the kmem_cache_node for that node. (The current code delays that
until all of the node's memory is offlined.)

So we only do something when marg->status_change_nid_normal >= 0;
marg->status_change_nid is not suitable here.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/slub.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index 2fdd96f..2d78639 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3577,7 +3577,7 @@ static void slab_mem_offline_callback(void *arg)
struct memory_notify *marg = arg;
int offline_node;
 
-   offline_node = marg-status_change_nid;
+   offline_node = marg-status_change_nid_normal;
 
/*
 * If the node still has available memory. we need kmem_cache_node
@@ -3610,7 +3610,7 @@ static int slab_mem_going_online_callback(void *arg)
struct kmem_cache_node *n;
struct kmem_cache *s;
struct memory_notify *marg = arg;
-   int nid = marg-status_change_nid;
+   int nid = marg-status_change_nid_normal;
int ret = 0;
 
/*
-- 
1.7.4.4



[PATCH 3/3] memory_hotplug: Don't modify the zone_start_pfn outside of zone_span_writelock()

2012-09-27 Thread Lai Jiangshan
__add_zone() may call the sleepable init_currently_empty_zone() to initialize
wait_table, but this function also modifies zone_start_pfn without holding any
lock. That is buggy.

So we move this modification out, and we ensure that zone_start_pfn is only
modified with zone_span_writelock() held or during boot.

Since zone_start_pfn is no longer modified by init_currently_empty_zone(),
grow_zone_span() needs to check zone_start_pfn before updating it.
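
In other words, after this patch every writer of zone_start_pfn follows the
pattern below. This is a simplified sketch using the zone-span seqlock helpers
from mm/memory_hotplug.c; the wrapper name is made up:

    static void set_zone_start(struct zone *zone, unsigned long start_pfn)
    {
        zone_span_writelock(zone);
        zone->zone_start_pfn = start_pfn;
        zone_span_writeunlock(zone);
    }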

CC: Mel Gorman m...@csn.ul.ie
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Reported-by: Yasuaki ISIMATU isimatu.yasu...@jp.fujitsu.com
Tested-by: Wen Congyang we...@cn.fujitsu.com
---
 mm/memory_hotplug.c |2 +-
 mm/page_alloc.c |3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b62d429b..790561f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long 
start_pfn,
zone_span_writelock(zone);
 
old_zone_end_pfn = zone-zone_start_pfn + zone-spanned_pages;
-   if (start_pfn  zone-zone_start_pfn)
+   if (!zone-zone_start_pfn || start_pfn  zone-zone_start_pfn)
zone-zone_start_pfn = start_pfn;
 
zone-spanned_pages = max(old_zone_end_pfn, end_pfn) -
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c13ea75..2545013 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3997,8 +3997,6 @@ int __meminit init_currently_empty_zone(struct zone *zone,
return ret;
pgdat-nr_zones = zone_idx(zone) + 1;
 
-   zone-zone_start_pfn = zone_start_pfn;
-
mminit_dprintk(MMINIT_TRACE, memmap_init,
Initialising map node %d zone %lu pfns %lu - %lu\n,
pgdat-node_id,
@@ -4465,6 +4463,7 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat,
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
+   zone-zone_start_pfn = zone_start_pfn;
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
-- 
1.7.4.4



[PATCH 1/3] memory_hotplug: fix stale node_states[N_NORMAL_MEMORY]

2012-09-27 Thread Lai Jiangshan
Currently memory_hotplug only manages node_states[N_HIGH_MEMORY]; it forgets to
manage node_states[N_NORMAL_MEMORY], which causes node_states[N_NORMAL_MEMORY]
to become stale.

We add check_nodemasks_changes_online() and check_nodemasks_changes_offline()
to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY]
are changed during hotplug.

Also add @status_change_nid_normal to struct memory_notify, so the memory
hotplug callbacks know whether node_states[N_NORMAL_MEMORY] has changed.
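
As a rough sketch (not part of the patch) of how a subsystem's memory-hotplug
callback can consume the new field; my_mem_callback() is a hypothetical name:

    #include <linux/memory.h>
    #include <linux/notifier.h>

    static int my_mem_callback(struct notifier_block *nb,
                               unsigned long action, void *arg)
    {
        struct memory_notify *marg = arg;
        int nid = marg->status_change_nid_normal;

        /* nid < 0 means node_states[N_NORMAL_MEMORY] is unchanged */
        if (action == MEM_GOING_ONLINE && nid >= 0) {
            /* allocate per-node structures for @nid here */
        }
        return NOTIFY_OK;
    }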

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 Documentation/memory-hotplug.txt |5 ++-
 include/linux/memory.h   |1 +
 mm/memory_hotplug.c  |   94 +++--
 3 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 6d0c251..6e6cbc7 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -377,15 +377,18 @@ The third argument is passed by pointer of struct 
memory_notify.
 struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
+   int status_change_nid_normal;
int status_change_nid;
 }
 
 start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
+status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
 status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
-If status_changed_nid = 0, callback should create/discard structures for the
+If status_changed_nid* = 0, callback should create/discard structures for the
 node if necessary.
 
 --
diff --git a/include/linux/memory.h b/include/linux/memory.h
index ff9a9f8..a09216d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
 struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
+   int status_change_nid_normal;
int status_change_nid;
 };
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6a5b90d..b62d429b 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -460,6 +460,34 @@ static int online_pages_range(unsigned long start_pfn, 
unsigned long nr_pages,
return 0;
 }
 
+static void check_nodemasks_changes_online(unsigned long nr_pages,
+   struct zone *zone, struct memory_notify *arg)
+{
+   int nid = zone_to_nid(zone);
+   enum zone_type zone_last = ZONE_NORMAL;
+
+   if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
+   zone_last = ZONE_MOVABLE;
+
+   if (zone_idx(zone) = zone_last  !node_state(nid, N_NORMAL_MEMORY))
+   arg-status_change_nid_normal = nid;
+   else
+   arg-status_change_nid_normal = -1;
+
+   if (!node_state(nid, N_HIGH_MEMORY))
+   arg-status_change_nid = nid;
+   else
+   arg-status_change_nid = -1;
+}
+
+static void set_nodemasks(int node, struct memory_notify *arg)
+{
+   if (arg-status_change_nid_normal = 0)
+   node_set_state(node, N_NORMAL_MEMORY);
+
+   node_set_state(node, N_HIGH_MEMORY);
+}
+
 
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
 {
@@ -471,13 +499,18 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
struct memory_notify arg;
 
lock_memory_hotplug();
+   /*
+* This doesn't need a lock to do pfn_to_page().
+* The section can't be removed here because of the
+* memory_block-state_mutex.
+*/
+   zone = page_zone(pfn_to_page(pfn));
+
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
-   arg.status_change_nid = -1;
+   check_nodemasks_changes_online(nr_pages, zone, arg);
 
nid = page_to_nid(pfn_to_page(pfn));
-   if (node_present_pages(nid) == 0)
-   arg.status_change_nid = nid;
 
ret = memory_notify(MEM_GOING_ONLINE, arg);
ret = notifier_to_errno(ret);
@@ -487,12 +520,6 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
return ret;
}
/*
-* This doesn't need a lock to do pfn_to_page().
-* The section can't be removed here because of the
-* memory_block-state_mutex.
-*/
-   zone = page_zone(pfn_to_page(pfn));
-   /*
 * If this zone is not populated, then it is not in zonelist.
 * This means the page allocator ignores this zone.
 * So, zonelist must be updated after online.
@@ -517,7 +544,7 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages)
zone-present_pages += onlined_pages;
zone-zone_pgdat-node_present_pages += onlined_pages

Re: [PATCH 1/3] memory_hotplug: fix stale node_states[N_NORMAL_MEMORY]

2012-09-28 Thread Lai Jiangshan
On 09/27/2012 10:32 PM, Ni zhan Chen wrote:
 On 09/27/2012 02:47 PM, Lai Jiangshan wrote:
 Currently memory_hotplug only manages the node_states[N_HIGH_MEMORY],
 it forgets to manage node_states[N_NORMAL_MEMORY]. it causes
 node_states[N_NORMAL_MEMORY] becomes stale.

 We add check_nodemasks_changes_online() and check_nodemasks_changes_offline()
 to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY]
 are changed while hotpluging.

 Also add @status_change_nid_normal to struct memory_notify, thus
 the memory hotplug callbacks know whether the node_states[N_NORMAL_MEMORY]
 are changed.
 
 I still don't understand why need care N_NORMAL_MEMORY here, could you explain
 in details?

Hi, Chen

In short, node_states[N_NORMAL_MEMORY] can become wrong in some situations, and
a lot of memory management code reads node_states[N_NORMAL_MEMORY].

I will add more detail to the changelog in the next round.

Thanks,
Lai

 

 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 ---
   Documentation/memory-hotplug.txt |5 ++-
   include/linux/memory.h   |1 +
   mm/memory_hotplug.c  |   94 
 +++--
   3 files changed, 83 insertions(+), 17 deletions(-)

 diff --git a/Documentation/memory-hotplug.txt 
 b/Documentation/memory-hotplug.txt
 index 6d0c251..6e6cbc7 100644
 --- a/Documentation/memory-hotplug.txt
 +++ b/Documentation/memory-hotplug.txt
 @@ -377,15 +377,18 @@ The third argument is passed by pointer of struct 
 memory_notify.
   struct memory_notify {
  unsigned long start_pfn;
  unsigned long nr_pages;
 +   int status_change_nid_normal;
  int status_change_nid;
   }
 start_pfn is start_pfn of online/offline memory.
   nr_pages is # of pages of online/offline memory.
 +status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
 +is (will be) set/clear, if this is -1, then nodemask status is not changed.
   status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will 
 be)
   set/clear. It means a new(memoryless) node gets new memory by online and a
   node loses all memory. If this is -1, then nodemask status is not changed.
 -If status_changed_nid = 0, callback should create/discard structures for 
 the
 +If status_changed_nid* = 0, callback should create/discard structures for 
 the
   node if necessary.
 --
 diff --git a/include/linux/memory.h b/include/linux/memory.h
 index ff9a9f8..a09216d 100644
 --- a/include/linux/memory.h
 +++ b/include/linux/memory.h
 @@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
   struct memory_notify {
   unsigned long start_pfn;
   unsigned long nr_pages;
 +int status_change_nid_normal;
   int status_change_nid;
   };
   diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
 index 6a5b90d..b62d429b 100644
 --- a/mm/memory_hotplug.c
 +++ b/mm/memory_hotplug.c
 @@ -460,6 +460,34 @@ static int online_pages_range(unsigned long start_pfn, 
 unsigned long nr_pages,
   return 0;
   }
   +static void check_nodemasks_changes_online(unsigned long nr_pages,
 +struct zone *zone, struct memory_notify *arg)
 +{
 +int nid = zone_to_nid(zone);
 +enum zone_type zone_last = ZONE_NORMAL;
 +
 +if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
 +zone_last = ZONE_MOVABLE;
 +
 +if (zone_idx(zone) = zone_last  !node_state(nid, N_NORMAL_MEMORY))
 +arg-status_change_nid_normal = nid;
 +else
 +arg-status_change_nid_normal = -1;
 +
 +if (!node_state(nid, N_HIGH_MEMORY))
 +arg-status_change_nid = nid;
 +else
 +arg-status_change_nid = -1;
 +}
 +
 +static void set_nodemasks(int node, struct memory_notify *arg)
 +{
 +if (arg-status_change_nid_normal = 0)
 +node_set_state(node, N_NORMAL_MEMORY);
 +
 +node_set_state(node, N_HIGH_MEMORY);
 +}
 +
 int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
   {
 @@ -471,13 +499,18 @@ int __ref online_pages(unsigned long pfn, unsigned 
 long nr_pages)
   struct memory_notify arg;
 lock_memory_hotplug();
 +/*
 + * This doesn't need a lock to do pfn_to_page().
 + * The section can't be removed here because of the
 + * memory_block-state_mutex.
 + */
 +zone = page_zone(pfn_to_page(pfn));
 +
   arg.start_pfn = pfn;
   arg.nr_pages = nr_pages;
 -arg.status_change_nid = -1;
 +check_nodemasks_changes_online(nr_pages, zone, arg);
 nid = page_to_nid(pfn_to_page(pfn));
 -if (node_present_pages(nid) == 0)
 -arg.status_change_nid = nid;
 ret = memory_notify(MEM_GOING_ONLINE, arg);
   ret = notifier_to_errno(ret);
 @@ -487,12 +520,6 @@ int __ref online_pages(unsigned long pfn, unsigned long 
 nr_pages)
   return ret;
   }
   /*
 - * This doesn't need a lock to do pfn_to_page().
 - * The section can't be removed here because of the
 - * memory_block-state_mutex.
 - */
 -zone = page_zone

Re: [PATCH 3/3] memory_hotplug: Don't modify the zone_start_pfn outside of zone_span_writelock()

2012-09-28 Thread Lai Jiangshan
Hi, KOSAKI

On 09/28/2012 06:30 AM, KOSAKI Motohiro wrote:
 (9/27/12 2:47 AM), Lai Jiangshan wrote:
 The __add_zone() maybe call sleep-able init_currently_empty_zone()
 to init wait_table,
 
 This doesn't explain why sleepable is critical important. I think sleepable
 is jsut unrelated. The fact is only: to write zone-zone_start_pfn require
 zone_span_writelock, but init_currently_empty_zone() doesn't take it.

You are right, sleepable is not critically important, but the lock is.

I am sorry that I brought up sleepable and misled people.

What I actually wanted to say is:

1) writing zone->zone_start_pfn requires zone_span_writelock;
2) init_currently_empty_zone() is sleepable, so we can't hold
   zone_span_writelock() across the whole of init_currently_empty_zone();
3) so we have to move the modification code out of init_currently_empty_zone(),
   as this patch does.

 
 

 But this function also modifies the zone_start_pfn without any lock.
 It is bugy.
 
 buggy?
 
 
 So we move this modification out, and we ensure the modification
 of zone_start_pfn is only done with zone_span_writelock() held or in booting.

 Since zone_start_pfn is not modified by init_currently_empty_zone()
 grow_zone_span() needs to check zone_start_pfn before update it.

 CC: Mel Gorman m...@csn.ul.ie
 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 Reported-by: Yasuaki ISIMATU isimatu.yasu...@jp.fujitsu.com
 Tested-by: Wen Congyang we...@cn.fujitsu.com
 ---
  mm/memory_hotplug.c |2 +-
  mm/page_alloc.c |3 +--
  2 files changed, 2 insertions(+), 3 deletions(-)

 diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
 index b62d429b..790561f 100644
 --- a/mm/memory_hotplug.c
 +++ b/mm/memory_hotplug.c
 @@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned 
 long start_pfn,
  zone_span_writelock(zone);
  
  old_zone_end_pfn = zone-zone_start_pfn + zone-spanned_pages;
 -if (start_pfn  zone-zone_start_pfn)
 +if (!zone-zone_start_pfn || start_pfn  zone-zone_start_pfn)
  zone-zone_start_pfn = start_pfn;
 
 Wrong. zone-zone_start_pfn==0 may be valid pfn. You shouldn't assume it is 
 uninitialized
 value.

Good catch, I will use zone-spanned_pages instead.


Thanks,
Lai

 
 
  
  zone-spanned_pages = max(old_zone_end_pfn, end_pfn) -
 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 index c13ea75..2545013 100644
 --- a/mm/page_alloc.c
 +++ b/mm/page_alloc.c
 @@ -3997,8 +3997,6 @@ int __meminit init_currently_empty_zone(struct zone 
 *zone,
  return ret;
  pgdat-nr_zones = zone_idx(zone) + 1;
  
 -zone-zone_start_pfn = zone_start_pfn;
 -
  mminit_dprintk(MMINIT_TRACE, memmap_init,
  Initialising map node %d zone %lu pfns %lu - %lu\n,
  pgdat-node_id,
 @@ -4465,6 +4463,7 @@ static void __paginginit free_area_init_core(struct 
 pglist_data *pgdat,
  ret = init_currently_empty_zone(zone, zone_start_pfn,
  size, MEMMAP_EARLY);
  BUG_ON(ret);
 +zone-zone_start_pfn = zone_start_pfn;
  memmap_init(size, nid, j, zone_start_pfn);
  zone_start_pfn += size;
  }

 
 --
 To unsubscribe from this list: send the line unsubscribe linux-kernel in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 Please read the FAQ at  http://www.tux.org/lkml/
 



Re: [PATCH 2/3] slub, hotplug: ignore unrelated node's hot-adding and hot-removing

2012-09-28 Thread Lai Jiangshan
Hi, Christoph, KOSAKI

SLAB always allocates kmem_list3 for all nodes (N_HIGH_MEMORY), so no bug or
bad behavior happens there.
SLUB always requires the kmem_cache_node to be on the correct node, so this fix
is needed.

SLAB uses for_each_online_node() to traverse nodes for maintenance,
and it tolerates kmem_list3 on alien nodes.
SLUB uses for_each_node_state(node, N_NORMAL_MEMORY) to traverse nodes for
maintenance, and it does not tolerate kmem_cache_node on alien nodes.

Maybe we need to change SLAB in the future to use
for_each_node_state(node, N_NORMAL_MEMORY), but I don't want to change SLAB
until I find something actually wrong in SLAB.
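
The traversal difference described above boils down to the following sketch
(illustrative only, not actual slab/slub code):

    #include <linux/nodemask.h>

    static void slab_style_walk(void)
    {
        int nid;

        /* SLAB: every online node, even ones without normal memory */
        for_each_online_node(nid) {
            /* maintain the per-node structure for @nid */
        }
    }

    static void slub_style_walk(void)
    {
        int nid;

        /* SLUB: only nodes that currently have normal memory */
        for_each_node_state(nid, N_NORMAL_MEMORY) {
            /* maintain the per-node structure for @nid */
        }
    }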

Thanks,
Lai

On 09/28/2012 06:35 AM, Christoph wrote:
 While you are at it: Could you move the code into slab_common.c so that there 
 is only one version to maintain?
 
 On Sep 27, 2012, at 17:04, KOSAKI Motohiro kosaki.motoh...@gmail.com wrote:
 
 (9/27/12 2:47 AM), Lai Jiangshan wrote:
 SLUB only fucus on the nodes which has normal memory, so ignore the other
 node's hot-adding and hot-removing.

 Aka: if some memroy of a node(which has no onlined memory) is online,
 but this new memory onlined is not normal memory(HIGH memory example),
 we should not allocate kmem_cache_node for SLUB.

 And if the last normal memory is offlined, but the node still has memroy,
 we should remove kmem_cache_node for that node.(current code delay it when
 all of the memory is offlined)

 so we only do something when marg-status_change_nid_normal  0.
 marg-status_change_nid is not suitable here.

 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 ---
 mm/slub.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

 diff --git a/mm/slub.c b/mm/slub.c
 index 2fdd96f..2d78639 100644
 --- a/mm/slub.c
 +++ b/mm/slub.c
 @@ -3577,7 +3577,7 @@ static void slab_mem_offline_callback(void *arg)
struct memory_notify *marg = arg;
int offline_node;

 -offline_node = marg-status_change_nid;
 +offline_node = marg-status_change_nid_normal;

/*
 * If the node still has available memory. we need kmem_cache_node
 @@ -3610,7 +3610,7 @@ static int slab_mem_going_online_callback(void *arg)
struct kmem_cache_node *n;
struct kmem_cache *s;
struct memory_notify *marg = arg;
 -int nid = marg-status_change_nid;
 +int nid = marg-status_change_nid_normal;
int ret = 0;

 Looks reasonable. I think slab need similar fix too.



 



Re: [PATCH 3/3] memory_hotplug: Don't modify the zone_start_pfn outside of zone_span_writelock()

2012-09-28 Thread Lai Jiangshan
Hi, Chen,

On 09/27/2012 09:19 PM, Ni zhan Chen wrote:
 On 09/27/2012 02:47 PM, Lai Jiangshan wrote:
 The __add_zone() maybe call sleep-able init_currently_empty_zone()
 to init wait_table,

 But this function also modifies the zone_start_pfn without any lock.
 It is bugy.

 So we move this modification out, and we ensure the modification
 of zone_start_pfn is only done with zone_span_writelock() held or in booting.

 Since zone_start_pfn is not modified by init_currently_empty_zone()
 grow_zone_span() needs to check zone_start_pfn before update it.

 CC: Mel Gorman m...@csn.ul.ie
 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 Reported-by: Yasuaki ISIMATU isimatu.yasu...@jp.fujitsu.com
 Tested-by: Wen Congyang we...@cn.fujitsu.com
 ---
   mm/memory_hotplug.c |2 +-
   mm/page_alloc.c |3 +--
   2 files changed, 2 insertions(+), 3 deletions(-)

 diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
 index b62d429b..790561f 100644
 --- a/mm/memory_hotplug.c
 +++ b/mm/memory_hotplug.c
 @@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned 
 long start_pfn,
   zone_span_writelock(zone);
 old_zone_end_pfn = zone-zone_start_pfn + zone-spanned_pages;
 -if (start_pfn  zone-zone_start_pfn)
 +if (!zone-zone_start_pfn || start_pfn  zone-zone_start_pfn)
   zone-zone_start_pfn = start_pfn;
 zone-spanned_pages = max(old_zone_end_pfn, end_pfn) -
 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 index c13ea75..2545013 100644
 --- a/mm/page_alloc.c
 +++ b/mm/page_alloc.c
 @@ -3997,8 +3997,6 @@ int __meminit init_currently_empty_zone(struct zone 
 *zone,
   return ret;
   pgdat-nr_zones = zone_idx(zone) + 1;
   -zone-zone_start_pfn = zone_start_pfn;
 -
 
 then how can mminit_dprintk print zone-zone_start_pfn ? always print 0 make 
 no sense.


The full code here:

mminit_dprintk(MMINIT_TRACE, memmap_init,
Initialising map node %d zone %lu pfns %lu - %lu\n,
pgdat-node_id,
(unsigned long)zone_idx(zone),
zone_start_pfn, (zone_start_pfn + size));


It doesn't always print 0; it still behaves as I expected.
Could you elaborate?

Thanks,
Lai 


 
   mminit_dprintk(MMINIT_TRACE, memmap_init,
   Initialising map node %d zone %lu pfns %lu - %lu\n,
   pgdat-node_id,
 @@ -4465,6 +4463,7 @@ static void __paginginit free_area_init_core(struct 
 pglist_data *pgdat,
   ret = init_currently_empty_zone(zone, zone_start_pfn,
   size, MEMMAP_EARLY);
   BUG_ON(ret);
 +zone-zone_start_pfn = zone_start_pfn;
   memmap_init(size, nid, j, zone_start_pfn);
   zone_start_pfn += size;
   }
 
 



Re: [PATCH] Do not change worker's running cpu in cmci_rediscover().

2012-09-28 Thread Lai Jiangshan
Add CC: Tejun Heo, Peter Zijlstra.

Hi, Tejun

This is a bug whose root cause is the same as
https://bugzilla.kernel.org/show_bug.cgi?id=47301.

Acked-by: Lai Jiangshan la...@cn.fujitsu.com

thanks,
Lai


On 09/27/2012 05:19 PM, Tang Chen wrote:
 1. cmci_rediscover() is only called by the CPU_POST_DEAD event handler, which
 means the corresponding cpu has already dead. As a result, it won't be 
 accessed
 in the for_each_online_cpu loop.
 So, we could change the if(cpu == dying) statement into a BUG_ON().
 
 2. cmci_rediscover() used set_cpus_allowed_ptr() to change the current 
 process's
 running cpu, and migrate itself to the dest cpu. But worker processes are not
 allowed to be migrated. If current is a worker, the worker will be migrated to
 another cpu, but the corresponding  worker_pool is still on the original cpu.
 
 In this case, the following BUG_ON in try_to_wake_up_local() will be 
 triggered:
 BUG_ON(rq != this_rq());
 
 This will cause the kernel panic.
 
 This patch removes the set_cpus_allowed_ptr() call, and put the cmci 
 rediscover
 jobs onto all the other cpus using system_wq. This could bring some delay for
 the jobs.
 
 The following is call trace.
 
 [ 6155.451107] [ cut here ]
 [ 6155.452019] kernel BUG at kernel/sched/core.c:1654!
 ..
 [ 6155.452019] RIP: 0010:[810add15]  [810add15] 
 try_to_wake_up_local+0x115/0x130
 ..
 [ 6155.452019] Call Trace:
 [ 6155.452019]  [8166fc14] __schedule+0x764/0x880
 [ 6155.452019]  [81670059] schedule+0x29/0x70
 [ 6155.452019]  [8166de65] schedule_timeout+0x235/0x2d0
 [ 6155.452019]  [810db57d] ? mark_held_locks+0x8d/0x140
 [ 6155.452019]  [810dd463] ? __lock_release+0x133/0x1a0
 [ 6155.452019]  [81671c50] ? _raw_spin_unlock_irq+0x30/0x50
 [ 6155.452019]  [810db8f5] ? trace_hardirqs_on_caller+0x105/0x190
 [ 6155.452019]  [8166fefb] wait_for_common+0x12b/0x180
 [ 6155.452019]  [810b0b30] ? try_to_wake_up+0x2f0/0x2f0
 [ 6155.452019]  [8167002d] wait_for_completion+0x1d/0x20
 [ 6155.452019]  [8110008a] stop_one_cpu+0x8a/0xc0
 [ 6155.452019]  [810abd40] ? __migrate_task+0x1a0/0x1a0
 [ 6155.452019]  [810a6ab8] ? complete+0x28/0x60
 [ 6155.452019]  [810b0fd8] set_cpus_allowed_ptr+0x128/0x130
 [ 6155.452019]  [81036785] cmci_rediscover+0xf5/0x140
 [ 6155.452019]  [816643c0] mce_cpu_callback+0x18d/0x19d
 [ 6155.452019]  [81676187] notifier_call_chain+0x67/0x150
 [ 6155.452019]  [810a03de] __raw_notifier_call_chain+0xe/0x10
 [ 6155.452019]  [81070470] __cpu_notify+0x20/0x40
 [ 6155.452019]  [810704a5] cpu_notify_nofail+0x15/0x30
 [ 6155.452019]  [81655182] _cpu_down+0x262/0x2e0
 [ 6155.452019]  [81655236] cpu_down+0x36/0x50
 [ 6155.452019]  [813d3eaa] acpi_processor_remove+0x50/0x11e
 [ 6155.452019]  [813a6978] acpi_device_remove+0x90/0xb2
 [ 6155.452019]  [8143cbec] __device_release_driver+0x7c/0xf0
 [ 6155.452019]  [8143cd6f] device_release_driver+0x2f/0x50
 [ 6155.452019]  [813a7870] acpi_bus_remove+0x32/0x6d
 [ 6155.452019]  [813a7932] acpi_bus_trim+0x87/0xee
 [ 6155.452019]  [813a7a21] acpi_bus_hot_remove_device+0x88/0x16b
 [ 6155.452019]  [813a33ee] acpi_os_execute_deferred+0x27/0x34
 [ 6155.452019]  [81090589] process_one_work+0x219/0x680
 [ 6155.452019]  [81090528] ? process_one_work+0x1b8/0x680
 [ 6155.452019]  [813a33c7] ? acpi_os_wait_events_complete+0x23/0x23
 [ 6155.452019]  [810923be] worker_thread+0x12e/0x320
 [ 6155.452019]  [81092290] ? manage_workers+0x110/0x110
 [ 6155.452019]  [81098396] kthread+0xc6/0xd0
 [ 6155.452019]  [8167c4c4] kernel_thread_helper+0x4/0x10
 [ 6155.452019]  [81671f30] ? retint_restore_args+0x13/0x13
 [ 6155.452019]  [810982d0] ? __init_kthread_worker+0x70/0x70
 [ 6155.452019]  [8167c4c0] ? gs_change+0x13/0x13
 
 Signed-off-by: Tang Chen tangc...@cn.fujitsu.com
 Signed-off-by: Miao Xie mi...@cn.fujitsu.com
 ---
  arch/x86/kernel/cpu/mcheck/mce_intel.c |   34 +--
  1 files changed, 19 insertions(+), 15 deletions(-)
 
 diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c 
 b/arch/x86/kernel/cpu/mcheck/mce_intel.c
 index 38e49bc..f7d9795 100644
 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
 +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
 @@ -163,34 +163,38 @@ void cmci_clear(void)
   raw_spin_unlock_irqrestore(cmci_discover_lock, flags);
  }
  
 +static long cmci_rediscover_work_func(void *arg)
 +{
 + int banks;
 +
 + /* Recheck banks in case CPUs don't all have the same */
 + if (cmci_supported(banks))
 + cmci_discover(banks, 0);
 +
 + return 0;
 +}
 +
  /*
   * After a CPU went down cycle through all the others and rediscover
   * Must run in process context.
   */
  void cmci_rediscover(int dying

[PATCH] task_work: avoid unneeded cmpxchg() in task_work_run()

2012-10-08 Thread Lai Jiangshan
We only require the cmpxchg()+retry loop when the task is exiting; xchg() is
enough in the other cases, as in the original code in ac3d0da8.

So we try our best to use xchg() and avoid the contention/latency caused by
racing with task_work_add().

Also remove the inner loop.
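
The difference can be seen in a small user-space analogue (C11 atomics, not
kernel code): detaching the whole list only needs an unconditional exchange,
while a compare-and-swap is only needed to install a sentinel on a list that
must still be empty.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct cb { struct cb *next; };

    static _Atomic(struct cb *) head;

    /* "xchg() is enough": detach every queued callback in one shot */
    static struct cb *detach_all(void)
    {
        return atomic_exchange(&head, NULL);
    }

    /* cmpxchg() is only needed to mark an *empty* list as exited */
    static bool mark_exited(struct cb *sentinel)
    {
        struct cb *expected = NULL;

        return atomic_compare_exchange_strong(&head, &expected, sentinel);
    }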

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c9..82a42e7 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -56,14 +56,13 @@ void task_work_run(void)
 * work-func() can do task_work_add(), do not set
 * work_exited unless the list is empty.
 */
-   do {
-   work = ACCESS_ONCE(task-task_works);
-   head = !work  (task-flags  PF_EXITING) ?
-   work_exited : NULL;
-   } while (cmpxchg(task-task_works, work, head) != work);
-
-   if (!work)
+   if (!ACCESS_ONCE(task-task_works) ||
+   !(work = xchg(task-task_works, NULL))) {
+   if ((task-flags  PF_EXITING) 
+   cmpxchg(task-task_works, NULL, work_exited))
+   continue;
break;
+   }
/*
 * Synchronize with task_work_cancel(). It can't remove
 * the first entry == work, cmpxchg(task_works) should


Re: [PATCH 2/3] slub, hotplug: ignore unrelated node's hot-adding and hot-removing

2012-10-24 Thread Lai Jiangshan
On 09/29/2012 06:26 AM, KOSAKI Motohiro wrote:
 On Fri, Sep 28, 2012 at 3:19 AM, Lai Jiangshan la...@cn.fujitsu.com wrote:
 HI, Christoph, KOSAKI

 SLAB always allocates kmem_list3 for all nodes(N_HIGH_MEMORY), also node 
 bug/bad things happens.
 SLUB always requires kmem_cache_node on the correct node, so these fix is 
 needed.

 SLAB uses for_each_online_node() to travel nodes and do maintain,
 and it tolerates kmem_list3 on alien nodes.
 SLUB uses for_each_node_state(node, N_NORMAL_MEMORY) to travel nodes and do 
 maintain,
 and it does not tolerate kmem_cache_node on alien nodes.

 Maybe we need to change SLAB future and let it use
 for_each_node_state(node, N_NORMAL_MEMORY), But I don't want to change SLAB
 until I find something bad in SLAB.
 
 SLAB can't use highmem. then traverse zones which don't have normal
 memory is silly IMHO.

SLAB tolerates dummy kmem_list3 on alien nodes.

 If this is not bug, current slub behavior is also not bug. Is there
 any difference?

SLUB can't tolerate a dummy kmem_cache_node on alien nodes; otherwise
n->nr_slabs will be corrupted when we online a node which doesn't have normal
memory, and a WARN_ON() triggers. And a BUG_ON() triggers when we remove the
node.

Since SLUB always uses for_each_node_state(node, N_NORMAL_MEMORY), we should
make all the other code in slub.c compatible with it; otherwise we break the
design of SLUB.

Since SLAB always uses for_each_online_node(), it accepts some silly behavior
in its design; we don't need to change it before we decide to remove the whole
silly thing at once. There is no warning and no bug in SLAB from this point of
view.

 
 If I understand correctly, current code may waste some additional
 memory on corner case. but it doesn't make memory leak both when slab
 and slub.
 



[PATCH 2/2 V2] slub, hotplug: ignore unrelated node's hot-adding and hot-removing

2012-10-24 Thread Lai Jiangshan
SLUB only focuses on the nodes which have normal memory, so ignore the other
nodes' hot-adding and hot-removing.

In other words: if some memory of a node (which has no onlined memory) is
onlined, but this newly onlined memory is not normal memory (HIGH memory, for
example), we should not allocate a kmem_cache_node for SLUB.

And if the last normal memory is offlined but the node still has memory, we
should remove the kmem_cache_node for that node. (The current code delays that
until all of the node's memory is offlined.)

So we only do something when marg->status_change_nid_normal >= 0;
marg->status_change_nid is not suitable here.

The same problem doesn't exist in SLAB, because SLAB allocates kmem_list3 for
every node even if the node doesn't have normal memory; SLAB tolerates
kmem_list3 on alien nodes. SLUB only focuses on the nodes which have normal
memory and doesn't tolerate alien kmem_cache_node; this patch makes SLUB
self-consistent and avoids the WARN and BUG in a rare condition.

CC: David Rientjes rient...@google.com
Cc: Minchan Kim minchan@gmail.com
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
CC: Rob Landley r...@landley.net
CC: Andrew Morton a...@linux-foundation.org
CC: Jiang Liu jiang@huawei.com
CC: Kay Sievers kay.siev...@vrfy.org
CC: Greg Kroah-Hartman gre...@suse.de
CC: Mel Gorman mgor...@suse.de
CC: 'FNST-Wen Congyang' we...@cn.fujitsu.com
CC: linux-...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux...@kvack.org
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/slub.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index a0d6984..487f0bd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg)
struct memory_notify *marg = arg;
int offline_node;
 
-   offline_node = marg-status_change_nid;
+   offline_node = marg-status_change_nid_normal;
 
/*
 * If the node still has available memory. we need kmem_cache_node
@@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg)
struct kmem_cache_node *n;
struct kmem_cache *s;
struct memory_notify *marg = arg;
-   int nid = marg-status_change_nid;
+   int nid = marg-status_change_nid_normal;
int ret = 0;
 
/*
-- 
1.7.4.4



[PATCH 1/2 V2] memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY]

2012-10-24 Thread Lai Jiangshan
Currently memory_hotplug only manages node_states[N_HIGH_MEMORY]; it forgets to
manage node_states[N_NORMAL_MEMORY], which may cause
node_states[N_NORMAL_MEMORY] to become incorrect.

For example, suppose a node is empty before online and we online memory which
is in ZONE_NORMAL. After online, node_states[N_HIGH_MEMORY] is correct, but
node_states[N_NORMAL_MEMORY] is incorrect, because the online code doesn't add
the newly onlined node to node_states[N_NORMAL_MEMORY].

The same kind of thing happens on offline (the offline code doesn't clear the
node from node_states[N_NORMAL_MEMORY] when needed). Some memory management
code depends on node_states[N_NORMAL_MEMORY], so we have to fix up
node_states[N_NORMAL_MEMORY].

We add node_states_check_changes_online() and
node_states_check_changes_offline() to detect whether
node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY] are changed during
hotplug.

Also add @status_change_nid_normal to struct memory_notify, so the memory
hotplug callbacks know whether node_states[N_NORMAL_MEMORY] has changed. (We
could add a @flags field and reuse @status_change_nid instead of introducing
@status_change_nid_normal, but that would add much more complexity to the
memory hotplug callback in every subsystem. So introducing
@status_change_nid_normal is better, and it doesn't change the semantics of
@status_change_nid.)

Changed from V1:
add more comments
change the function name

CC: David Rientjes rient...@google.com
Cc: Minchan Kim minchan@gmail.com
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
CC: Rob Landley r...@landley.net
CC: Andrew Morton a...@linux-foundation.org
CC: Jiang Liu jiang@huawei.com
CC: Kay Sievers kay.siev...@vrfy.org
CC: Greg Kroah-Hartman gre...@suse.de
CC: Mel Gorman mgor...@suse.de
CC: 'FNST-Wen Congyang' we...@cn.fujitsu.com
CC: linux-...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux...@kvack.org
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 Documentation/memory-hotplug.txt |5 +-
 include/linux/memory.h   |1 +
 mm/memory_hotplug.c  |  136 +-
 3 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 6d0c251..6e6cbc7 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -377,15 +377,18 @@ The third argument is passed by pointer of struct 
memory_notify.
 struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
+   int status_change_nid_normal;
int status_change_nid;
 }
 
 start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
+status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
 status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
-If status_changed_nid = 0, callback should create/discard structures for the
+If status_changed_nid* = 0, callback should create/discard structures for the
 node if necessary.
 
 --
diff --git a/include/linux/memory.h b/include/linux/memory.h
index ff9a9f8..a09216d 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
 struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
+   int status_change_nid_normal;
int status_change_nid;
 };
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ec899a2..a1920fb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -467,6 +467,53 @@ static int online_pages_range(unsigned long start_pfn, 
unsigned long nr_pages,
return 0;
 }
 
+/* check which state of node_states will be changed when online memory */
+static void node_states_check_changes_online(unsigned long nr_pages,
+   struct zone *zone, struct memory_notify *arg)
+{
+   int nid = zone_to_nid(zone);
+   enum zone_type zone_last = ZONE_NORMAL;
+
+   /*
+* If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
+* which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
+*
+* If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
+* which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+*/
+   if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
+   zone_last = ZONE_MOVABLE;
+
+   /*
+* if the memory to be online is in a zone of 0...zone_last, and
+* the zones of 0...zone_last don't have memory before online, we will
+* need to set the node to node_states[N_NORMAL_MEMORY] after
+* the memory is online

[PATCH 0/2 V2] memory_hotplug: fix memory hotplug bug

2012-10-24 Thread Lai Jiangshan
We found 2 bugs while testing and developing memory hotplug.

The hotplug code does not handle node_states[N_NORMAL_MEMORY] correctly, which
may corrupt memory.

And we ensure that SLUB does NOT respond when node_states[N_NORMAL_MEMORY]
is not changed.

The patchset is based on mainline(3d0ceac129f3ea0b125289055a3aa7519d38df77)


CC: David Rientjes rient...@google.com
Cc: Minchan Kim minchan@gmail.com
CC: KOSAKI Motohiro kosaki.motoh...@jp.fujitsu.com
CC: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
CC: Rob Landley r...@landley.net
CC: Andrew Morton a...@linux-foundation.org
CC: Jiang Liu jiang@huawei.com
CC: Kay Sievers kay.siev...@vrfy.org
CC: Greg Kroah-Hartman gre...@suse.de
CC: Mel Gorman mgor...@suse.de
CC: 'FNST-Wen Congyang' we...@cn.fujitsu.com
CC: linux-...@vger.kernel.org
CC: linux-kernel@vger.kernel.org
CC: linux...@kvack.org

Lai Jiangshan (2):
  memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY]
  slub, hotplug: ignore unrelated node's hot-adding and hot-removing

 Documentation/memory-hotplug.txt |5 +-
 include/linux/memory.h   |1 +
 mm/memory_hotplug.c  |  136 +-
 mm/slub.c|4 +-
 4 files changed, 127 insertions(+), 19 deletions(-)

-- 
1.7.4.4



[PATCH 1/3] lglock: remove unused DEFINE_LGLOCK_LOCKDEP()

2012-10-08 Thread Lai Jiangshan
lglocks use their own lock_key/lock_dep_map, which are defined inside struct
lglock. DEFINE_LGLOCK_LOCKDEP() is now unused, so remove it and save a small
piece of memory.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/lglock.h |9 -
 1 files changed, 0 insertions(+), 9 deletions(-)

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index f01e5f6..45eff71 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -36,16 +36,8 @@
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
-
-#define DEFINE_LGLOCK_LOCKDEP(name)\
- struct lock_class_key name##_lock_key;
\
- struct lockdep_map name##_lock_dep_map;   \
- EXPORT_SYMBOL(name##_lock_dep_map)
-
 #else
 #define LOCKDEP_INIT_MAP(a, b, c, d)
-
-#define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
 struct lglock {
@@ -57,7 +49,6 @@ struct lglock {
 };
 
 #define DEFINE_LGLOCK(name)\
-   DEFINE_LGLOCK_LOCKDEP(name);\
DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)  \
= __ARCH_SPIN_LOCK_UNLOCKED;\
struct lglock name = { .lock = name ## _lock }
-- 
1.7.4.4



[PATCH 2/3] lglock: make the per_cpu locks static

2012-10-08 Thread Lai Jiangshan
The per_cpu locks are neither used outside the defining file nor exported.
Add the static linkage keyword to them.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/lglock.h |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 45eff71..8f97451 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -49,7 +49,7 @@ struct lglock {
 };
 
 #define DEFINE_LGLOCK(name)\
-   DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)  \
+   static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)   \
= __ARCH_SPIN_LOCK_UNLOCKED;\
struct lglock name = { .lock = name ## _lock }
 
-- 
1.7.4.4



[PATCH 3/3] lglock: add DEFINE_STATIC_LGLOCK()

2012-10-08 Thread Lai Jiangshan
When the lglock does not need to be exported,
we can use DEFINE_STATIC_LGLOCK().
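
A minimal usage sketch (the lock name is hypothetical, not from the patch):

    #include <linux/lglock.h>

    DEFINE_STATIC_LGLOCK(my_lglock);

    static void touch_this_cpu_data(void)
    {
        lg_local_lock(&my_lglock);
        /* update this CPU's part of the protected data */
        lg_local_unlock(&my_lglock);
    }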

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 fs/file_table.c|2 +-
 include/linux/lglock.h |8 +++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 701985e..e26fd31 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -36,7 +36,7 @@ struct files_stat_struct files_stat = {
.max_files = NR_FILE
 };
 
-DEFINE_LGLOCK(files_lglock);
+DEFINE_STATIC_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
diff --git a/include/linux/lglock.h b/include/linux/lglock.h
index 8f97451..0d24e93 100644
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -32,7 +32,8 @@
 #define br_write_lock(name)lg_global_lock(name)
 #define br_write_unlock(name)  lg_global_unlock(name)
 
-#define DEFINE_BRLOCK(name)DEFINE_LGLOCK(name)
+#define DEFINE_BRLOCK(name)DEFINE_LGLOCK(name)
+#define DEFINE_STATIC_BRLOCK(name) DEFINE_STATIC_LGLOCK(name)
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
@@ -53,6 +54,11 @@ struct lglock {
= __ARCH_SPIN_LOCK_UNLOCKED;\
struct lglock name = { .lock = name ## _lock }
 
+#define DEFINE_STATIC_LGLOCK(name) \
+   static DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)   \
+   = __ARCH_SPIN_LOCK_UNLOCKED;\
+   static struct lglock name = { .lock = name ## _lock }
+
 void lg_lock_init(struct lglock *lg, char *name);
 void lg_local_lock(struct lglock *lg);
 void lg_local_unlock(struct lglock *lg);
-- 
1.7.4.4



[PATCH V2] task_work: avoid unneeded cmpxchg() in task_work_run()

2012-10-09 Thread Lai Jiangshan
On 10/09/2012 07:04 PM, Peter Zijlstra wrote:
 On Mon, 2012-10-08 at 14:38 +0200, Oleg Nesterov wrote:
 But the code looks more complex, and the only advantage is that
 non-exiting task does xchg() instead of cmpxchg(). Not sure this
 worth the trouble, in this case task_work_run() will likey run
 the callbacks (the caller checks -task_works != NULL), I do not
 think this can add any noticeable speedup. 
 
 Yeah, I agree, the patch doesn't seem worth the trouble. It makes tricky
 code unreadable at best.
 

To gain better readability, we need to move the work_exited handling out of
task_work_run() too.

Thanks,
Lai

Subject: task_work: avoid unneeded cmpxchg() in task_work_run()

We only require the cmpxchg()+retry loop when the task is exiting; xchg() is
enough in the other cases, as in the original code in ac3d0da8.

So we use xchg() in task_work_run() and move the exit logic out of
task_work_run() into exit_task_work().

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---

diff --git a/include/linux/task_work.h b/include/linux/task_work.h
index ca5a1cf..1e686a5 100644
--- a/include/linux/task_work.h
+++ b/include/linux/task_work.h
@@ -15,10 +15,6 @@ init_task_work(struct callback_head *twork, task_work_func_t 
func)
 int task_work_add(struct task_struct *task, struct callback_head *twork, bool);
 struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t);
 void task_work_run(void);
-
-static inline void exit_task_work(struct task_struct *task)
-{
-   task_work_run();
-}
+void exit_task_work(struct task_struct *task);
 
 #endif /* _LINUX_TASK_WORK_H */
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 65bd3c9..87ef3b7 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -52,16 +52,7 @@ void task_work_run(void)
struct callback_head *work, *head, *next;
 
for (;;) {
-	/*
-	 * work->func() can do task_work_add(), do not set
-	 * work_exited unless the list is empty.
-	 */
-	do {
-		work = ACCESS_ONCE(task->task_works);
-		head = !work && (task->flags & PF_EXITING) ?
-			&work_exited : NULL;
-	} while (cmpxchg(&task->task_works, work, head) != work);
-
+	work = xchg(&task->task_works, NULL);
if (!work)
break;
/*
@@ -90,3 +81,17 @@ void task_work_run(void)
} while (work);
}
 }
+
+void exit_task_work(struct task_struct *task)
+{
+	for (;;) {
+		/*
+		 * work->func() can do task_work_add(), do not set
+		 * work_exited unless the list is empty.
+		 */
+		if (unlikely(task->task_works))
+			task_work_run();
+		if (cmpxchg(&task->task_works, NULL, &work_exited) == NULL)
+			break;
+	}
+}


[PATCH 4/4] rcutorture: use DEFINE_STATIC_SRCU()

2012-10-12 Thread Lai Jiangshan
Use DEFINE_STATIC_SRCU() to simplify rcutorture.c.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/rcutorture.c |   41 ++---
 1 files changed, 6 insertions(+), 35 deletions(-)

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 25b1503..7939edf 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -332,7 +332,6 @@ rcu_stutter_wait(char *title)
 
 struct rcu_torture_ops {
void (*init)(void);
-   void (*cleanup)(void);
int (*readlock)(void);
void (*read_delay)(struct rcu_random_state *rrsp);
void (*readunlock)(int idx);
@@ -424,7 +423,6 @@ static void rcu_torture_deferred_free(struct rcu_torture *p)
 
 static struct rcu_torture_ops rcu_ops = {
.init   = NULL,
-   .cleanup= NULL,
.readlock   = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
@@ -468,7 +466,6 @@ static void rcu_sync_torture_init(void)
 
 static struct rcu_torture_ops rcu_sync_ops = {
.init   = rcu_sync_torture_init,
-   .cleanup= NULL,
.readlock   = rcu_torture_read_lock,
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
@@ -486,7 +483,6 @@ static struct rcu_torture_ops rcu_sync_ops = {
 
 static struct rcu_torture_ops rcu_expedited_ops = {
.init   = rcu_sync_torture_init,
-   .cleanup= NULL,
.readlock   = rcu_torture_read_lock,
.read_delay = rcu_read_delay,  /* just reuse rcu's version. */
.readunlock = rcu_torture_read_unlock,
@@ -529,7 +525,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture 
*p)
 
 static struct rcu_torture_ops rcu_bh_ops = {
.init   = NULL,
-   .cleanup= NULL,
.readlock   = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay,  /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
@@ -546,7 +541,6 @@ static struct rcu_torture_ops rcu_bh_ops = {
 
 static struct rcu_torture_ops rcu_bh_sync_ops = {
.init   = rcu_sync_torture_init,
-   .cleanup= NULL,
.readlock   = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay,  /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
@@ -563,7 +557,6 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
 
 static struct rcu_torture_ops rcu_bh_expedited_ops = {
.init   = rcu_sync_torture_init,
-   .cleanup= NULL,
.readlock   = rcu_bh_torture_read_lock,
.read_delay = rcu_read_delay,  /* just reuse rcu's version. */
.readunlock = rcu_bh_torture_read_unlock,
@@ -582,19 +575,7 @@ static struct rcu_torture_ops rcu_bh_expedited_ops = {
  * Definitions for srcu torture testing.
  */
 
-static struct srcu_struct srcu_ctl;
-
-static void srcu_torture_init(void)
-{
-   init_srcu_struct(srcu_ctl);
-   rcu_sync_torture_init();
-}
-
-static void srcu_torture_cleanup(void)
-{
-   synchronize_srcu(srcu_ctl);
-   cleanup_srcu_struct(srcu_ctl);
-}
+DEFINE_STATIC_SRCU(srcu_ctl);
 
 static int srcu_torture_read_lock(void) __acquires(srcu_ctl)
 {
@@ -665,8 +646,7 @@ static int srcu_torture_stats(char *page)
 }
 
 static struct rcu_torture_ops srcu_ops = {
-   .init   = srcu_torture_init,
-   .cleanup= srcu_torture_cleanup,
+   .init   = rcu_sync_torture_init,
.readlock   = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
@@ -680,8 +660,7 @@ static struct rcu_torture_ops srcu_ops = {
 };
 
 static struct rcu_torture_ops srcu_sync_ops = {
-   .init   = srcu_torture_init,
-   .cleanup= srcu_torture_cleanup,
+   .init   = rcu_sync_torture_init,
.readlock   = srcu_torture_read_lock,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock,
@@ -705,8 +684,7 @@ static void srcu_torture_read_unlock_raw(int idx) 
__releases(srcu_ctl)
 }
 
 static struct rcu_torture_ops srcu_raw_ops = {
-   .init   = srcu_torture_init,
-   .cleanup= srcu_torture_cleanup,
+   .init   = rcu_sync_torture_init,
.readlock   = srcu_torture_read_lock_raw,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock_raw,
@@ -720,8 +698,7 @@ static struct rcu_torture_ops srcu_raw_ops = {
 };
 
 static struct rcu_torture_ops srcu_raw_sync_ops = {
-   .init   = srcu_torture_init,
-   .cleanup= srcu_torture_cleanup,
+   .init   = rcu_sync_torture_init,
.readlock   = srcu_torture_read_lock_raw,
.read_delay = srcu_read_delay,
.readunlock = srcu_torture_read_unlock_raw,
@@ -740,8

[PATCH 1/4] srcu: add my name

2012-10-12 Thread Lai Jiangshan
I have changed a lot in SRCU; add my name here so that anyone can
blame/contact me when needed.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/srcu.h |2 ++
 kernel/srcu.c|2 ++
 2 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 55a5c52..a55ddb1 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -16,8 +16,10 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
  *
  * Author: Paul McKenney paul...@us.ibm.com
+ *Lai Jiangshan la...@cn.fujitsu.com
  *
  * For detailed explanation of Read-Copy Update mechanism see -
  * Documentation/RCU/ *.txt
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2095be3..610486d 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -16,8 +16,10 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  * Copyright (C) IBM Corporation, 2006
+ * Copyright (C) Fujitsu, 2012
  *
  * Author: Paul McKenney paul...@us.ibm.com
+ *Lai Jiangshan la...@cn.fujitsu.com
  *
  * For detailed explanation of Read-Copy Update mechanism see -
  * Documentation/RCU/ *.txt
-- 
1.7.7.6

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] srcu: add DEFINE_SRCU()

2012-10-12 Thread Lai Jiangshan
In the old days, we had two different API sets: one for dynamically-allocated
per_cpu data and one for DEFINE_PER_CPU()-defined per_cpu data. Since SRCU used
dynamically-allocated per_cpu data, it could not switch to DEFINE_PER_CPU()-defined
per_cpu data without introducing a lot of duplicated code.

Nowadays, we have a single API set for both types of per_cpu data, so we can use
DEFINE_PER_CPU() to implement DEFINE_SRCU(), which defines and initializes the
srcu_struct at build time and allows the SRCU APIs to be used in very early boot.

We also provide DEFINE_STATIC_SRCU(), which defines an srcu_struct internal to a
single *.c file.
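
As an illustration (hypothetical SRCU domain and functions, not part of this
patch), assuming the usual srcu_read_lock()/synchronize_srcu() API:

#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);	/* usable even in very early boot */

static void my_reader(void)
{
	int idx = srcu_read_lock(&my_srcu);
	/* ... dereference data protected by my_srcu ... */
	srcu_read_unlock(&my_srcu, idx);
}

static void my_updater(void)
{
	/* ... unlink the old data ... */
	synchronize_srcu(&my_srcu);	/* wait for pre-existing readers */
	/* ... free the old data ... */
}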

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/srcu.h |   30 ++
 1 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 5cce128..f986df1 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -42,6 +42,8 @@ struct rcu_batch {
struct rcu_head *head, **tail;
 };
 
+#define RCU_BATCH_INIT(name) { NULL, &(name.head) }
+
 struct srcu_struct {
unsigned completed;
struct srcu_struct_array __percpu *per_cpu_ref;
@@ -72,14 +74,42 @@ int __init_srcu_struct(struct srcu_struct *sp, const char 
*name,
__init_srcu_struct((sp), #sp, __srcu_key); \
 })
 
+#define __SRCU_DEP_MAP_INIT(srcu_name) .dep_map = { .name = #srcu_name },
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 int init_srcu_struct(struct srcu_struct *sp);
 
+#define __SRCU_DEP_MAP_INIT(srcu_name)
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 void process_srcu(struct work_struct *work);
 
+#define __SRCU_STRUCT_INIT(name)   \
+   {   \
+   .completed = -300,  \
+   .per_cpu_ref = &name##_srcu_array,  \
+   .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),\
+   .running = false,   \
+   .batch_queue = RCU_BATCH_INIT(name.batch_queue),\
+   .batch_check0 = RCU_BATCH_INIT(name.batch_check0),  \
+   .batch_check1 = RCU_BATCH_INIT(name.batch_check1),  \
+   .batch_done = RCU_BATCH_INIT(name.batch_done),  \
+   .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu),\
+   __SRCU_DEP_MAP_INIT(name)   \
+   }
+
+/*
+ * define and init a srcu struct at build time.
+ * don't call init_srcu_struct() nor cleanup_srcu_struct() on it.
+ */
+#define DEFINE_SRCU(name)  \
+   static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
+   struct srcu_struct name = __SRCU_STRUCT_INIT(name);
+
+#define DEFINE_STATIC_SRCU(name)   \
+   static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
+   static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
+
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
-- 
1.7.7.6



[PATCH 2/4] srcu: export process_srcu()

2012-10-12 Thread Lai Jiangshan
process_srcu() will be used only by DEFINE_SRCU().
Although it is exported, it remains internal to srcu.h.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/srcu.h |2 ++
 kernel/srcu.c|6 ++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index a55ddb1..5cce128 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -78,6 +78,8 @@ int init_srcu_struct(struct srcu_struct *sp);
 
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
+void process_srcu(struct work_struct *work);
+
 /**
  * call_srcu() - Queue a callback for invocation after an SRCU grace period
  * @sp: srcu_struct in queue the callback
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 610486d..bfe4c5a 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -94,9 +94,6 @@ static inline void rcu_batch_move(struct rcu_batch *to, 
struct rcu_batch *from)
}
 }
 
-/* single-thread state-machine */
-static void process_srcu(struct work_struct *work);
-
 static int init_srcu_struct_fields(struct srcu_struct *sp)
 {
sp-completed = 0;
@@ -639,7 +636,7 @@ static void srcu_reschedule(struct srcu_struct *sp)
 /*
  * This is the work-queue function that handles SRCU grace periods.
  */
-static void process_srcu(struct work_struct *work)
+void process_srcu(struct work_struct *work)
 {
struct srcu_struct *sp;
 
@@ -650,3 +647,4 @@ static void process_srcu(struct work_struct *work)
srcu_invoke_callbacks(sp);
srcu_reschedule(sp);
 }
+EXPORT_SYMBOL_GPL(process_srcu);
-- 
1.7.7.6



[PATCH 0/4] srcu: Add DEFINE_SRCU()

2012-10-12 Thread Lai Jiangshan
These patches add a simple DEFINE_SRCU() which defines and initializes
the srcu_struct at build time, allowing SRCU to be used in very early
boot.

Lai Jiangshan (4):
  srcu: add my name
  srcu: export process_srcu()
  srcu: add DEFINE_SRCU
  rcutorture: use DEFINE_STATIC_SRCU()

 include/linux/srcu.h |   34 ++
 kernel/rcutorture.c  |   41 ++---
 kernel/srcu.c|8 
 3 files changed, 44 insertions(+), 39 deletions(-)

-- 
1.7.7.6



Re: [PATCH 1/3] memory_hotplug: fix stale node_states[N_NORMAL_MEMORY]

2012-10-25 Thread Lai Jiangshan
Hi, KOSAKI 

On 09/28/2012 06:03 AM, KOSAKI Motohiro wrote:
 (9/27/12 2:47 AM), Lai Jiangshan wrote:
 Currently memory_hotplug only manages the node_states[N_HIGH_MEMORY],
 it forgets to manage node_states[N_NORMAL_MEMORY]. it causes
 node_states[N_NORMAL_MEMORY] becomes stale.
 
 What's mean 'stale'? I guess
 
 : Currently memory_hotplug doesn't turn on/off node_states[N_NORMAL_MEMORY]


Right.

 and
 : then it will be invalid if the platform has highmem. Luckily, almost memory 
 : hotplug aware platform don't have highmem, but are not all.
 
 right?

Some (32-bit) platforms support logical memory hotplug.
Some platforms have movable memory.
They are all considered.

 I supporse this patch only meaningful on ARM platform practically.
 

Any platform which supports memory hotplug.

 
 
 We add check_nodemasks_changes_online() and check_nodemasks_changes_offline()
 to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY]
 are changed while hotpluging.
 
 
 Also add @status_change_nid_normal to struct memory_notify, thus
 the memory hotplug callbacks know whether the node_states[N_NORMAL_MEMORY]
 are changed.
 
 status_change_nid_normal is very ugly to me. When status_change_nid and 
 status_change_nid_normal has positive value, they are always the same.
 nid and flags value are more natual to me.

If we use flags, the semantics of status_change_nid change, we need to
modify more of the current code, and we add complexity to the memory hotplug
callbacks.

like this:

-   node = arg-status_change_nid;
+   if (arg-status_change_flags  (1UL  N_HIGH_MEMORY))
+   node = arg-status_change_nid;
+   else
+   node = -1;

 
 
 

 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 ---
  Documentation/memory-hotplug.txt |5 ++-
  include/linux/memory.h   |1 +
  mm/memory_hotplug.c  |   94 
 +++--
  3 files changed, 83 insertions(+), 17 deletions(-)

 diff --git a/Documentation/memory-hotplug.txt 
 b/Documentation/memory-hotplug.txt
 index 6d0c251..6e6cbc7 100644
 --- a/Documentation/memory-hotplug.txt
 +++ b/Documentation/memory-hotplug.txt
 @@ -377,15 +377,18 @@ The third argument is passed by pointer of struct 
 memory_notify.
  struct memory_notify {
 unsigned long start_pfn;
 unsigned long nr_pages;
 +   int status_change_nid_normal;
 int status_change_nid;
  }
  
  start_pfn is start_pfn of online/offline memory.
  nr_pages is # of pages of online/offline memory.
 +status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
 +is (will be) set/clear, if this is -1, then nodemask status is not changed.
  status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
  set/clear. It means a new(memoryless) node gets new memory by online and a
  node loses all memory. If this is -1, then nodemask status is not changed.
 -If status_changed_nid = 0, callback should create/discard structures for 
 the
 +If status_changed_nid* = 0, callback should create/discard structures for 
 the
  node if necessary.
  
  --
 diff --git a/include/linux/memory.h b/include/linux/memory.h
 index ff9a9f8..a09216d 100644
 --- a/include/linux/memory.h
 +++ b/include/linux/memory.h
 @@ -53,6 +53,7 @@ int arch_get_memory_phys_device(unsigned long start_pfn);
  struct memory_notify {
  unsigned long start_pfn;
  unsigned long nr_pages;
 +int status_change_nid_normal;
  int status_change_nid;
  };
  
 diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
 index 6a5b90d..b62d429b 100644
 --- a/mm/memory_hotplug.c
 +++ b/mm/memory_hotplug.c
 @@ -460,6 +460,34 @@ static int online_pages_range(unsigned long start_pfn, 
 unsigned long nr_pages,
  return 0;
  }
  
 +static void check_nodemasks_changes_online(unsigned long nr_pages,
 +struct zone *zone, struct memory_notify *arg)
 +{
 +int nid = zone_to_nid(zone);
 +enum zone_type zone_last = ZONE_NORMAL;
 +
 +if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
 +zone_last = ZONE_MOVABLE;
 
 This is very strange (or ugly) code. ZONE_MOVABLE don't depend on high mem.

If we don't have HIGHMEM,
any node in N_NORMAL_MEMORY has zones 0...ZONE_MOVABLE.

If we have HIGHMEM,
any node in N_NORMAL_MEMORY has zones 0...ZONE_NORMAL.

 
 
 +
 +if (zone_idx(zone) = zone_last  !node_state(nid, N_NORMAL_MEMORY))
 +arg-status_change_nid_normal = nid;
 +else
 +arg-status_change_nid_normal = -1;
 
 Wrong. The onlined node may only have high mem zone. IOW, think fake numa 
 case etc.

The zone_idx(zone) <= zone_last check handles this case; the result is the else branch.

 
 
 +
 +if (!node_state(nid, N_HIGH_MEMORY))
 +arg-status_change_nid = nid;
 +else
 +arg-status_change_nid = -1;
 +}
 +
 +static void set_nodemasks(int node, struct memory_notify *arg)
 
 Too ugly. just remove this and use node_set_state() directly

Re: [PATCH 1/2 V2] memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY]

2012-10-25 Thread Lai Jiangshan
On 10/25/2012 12:17 PM, KOSAKI Motohiro wrote:
 On Wed, Oct 24, 2012 at 5:43 AM, Lai Jiangshan la...@cn.fujitsu.com wrote:
 Currently memory_hotplug only manages the node_states[N_HIGH_MEMORY],
 it forgets to manage node_states[N_NORMAL_MEMORY]. it may cause
 node_states[N_NORMAL_MEMORY] becomes incorrect.

 Example, if a node is empty before online, and we online a memory
 which is in ZONE_NORMAL. And after online,  node_states[N_HIGH_MEMORY]
 is correct, but node_states[N_NORMAL_MEMORY] is incorrect,
 the online code don't set the new online node to
 node_states[N_NORMAL_MEMORY].

 The same things like it will happen when offline(the offline code
 don't clear the node from node_states[N_NORMAL_MEMORY] when needed).
 Some memory managment code depends node_states[N_NORMAL_MEMORY],
 so we have to fix up the node_states[N_NORMAL_MEMORY].

 We add node_states_check_changes_online() and 
 node_states_check_changes_offline()
 to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY]
 are changed while hotpluging.

 Also add @status_change_nid_normal to struct memory_notify, thus
 the memory hotplug callbacks know whether the node_states[N_NORMAL_MEMORY]
 are changed. (We can add a @flags and reuse @status_change_nid instead of
 introducing @status_change_nid_normal, but it will add much more complicated
 in memory hotplug callback in every subsystem. So introdcing
 @status_change_nid_normal is better and it don't change the sematic
 of @status_change_nid)

 Changed from V1:
 add more comments
 change the function name
 
 Your patch didn't fix my previous comments and don't works correctly.
 Please test your own patch before resubmitting. You should consider both
 zone normal only node and zone high only node.
 

The comments in the code already answered/explained your previous comments.

Thanks,
Lai


[PATCH] page_alloc: fix the incorrect adjustment to zone-present_pages

2012-10-26 Thread Lai Jiangshan
The current free_area_init_core() has incorrect adjustment code for
->present_pages. It can cause ->present_pages to overflow, making the
system unusable (we could not create any process/thread in our test) and
causing further problems.

Details:
1) Some/many zones don't have memory that is used for the memmap.
   { Or all the actual memory used for the memmap is much less than memmap_pages
   (memmap_pages = PAGE_ALIGN(span_size * sizeof(struct page)) >> PAGE_SHIFT);
   CONFIG_SPARSEMEM is an example. }

2) Incorrect adjustment in free_area_init_core(): zone->present_pages -= memmap_pages
3) But the zone has a big hole, so the resulting zone->present_pages becomes much smaller.
4) When we offline one/several memory sections of the zone: zone->present_pages -= offline_size
5) Now zone->present_pages will/may *OVERFLOW* (wrap around).

So the adjustment is dangerous and incorrect.
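
To make the failure concrete, a sketch with hypothetical numbers (assuming 4K
pages and a 64-byte struct page; the values are made up for illustration):

	unsigned long spanned = 4UL << 20;	/* zone spans 4M pages (16GB) */
	unsigned long present = 256UL << 10;	/* only 256K pages (1GB) present */
	unsigned long memmap_pages;

	memmap_pages = (spanned * 64) >> 12;	/* ~64K pages for the memmap */
	present -= memmap_pages;		/* 256K - 64K = 192K */

	/* offline the whole 1GB section: 192K - 256K wraps on unsigned long */
	present -= 256UL << 10;			/* the overflow described above */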

Addition 1:
In the current kernel, the memmaps are not related/bound to any zone:
FLATMEM: global memmap
CONFIG_DISCONTIGMEM: node-specific memmap
CONFIG_SPARSEMEM: memory-section-specific memmap
None of them is a zone-specific memmap, and the memory used for the memmap
is not bound to any zone. So the adjustment "zone->present_pages -= memmap_pages"
subtracts an unrelated value and makes no sense.

Addition 2:
We introduced this adjustment to make page reclaim/watermarks happier, but the
adjustment is wrong in the current kernel and even makes page reclaim/watermarks
worse, which is against its original purpose.

This adjustment is incorrect/buggy, subtracts an unrelated value and violates
its original purpose, so we simply remove it.

CC: Mel Gorman mgor...@suse.de
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/page_alloc.c |   20 +---
 1 files changed, 1 insertions(+), 19 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bb90971..6bf72e3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4455,30 +4455,12 @@ static void __paginginit free_area_init_core(struct 
pglist_data *pgdat,
 
for (j = 0; j  MAX_NR_ZONES; j++) {
struct zone *zone = pgdat-node_zones + j;
-   unsigned long size, realsize, memmap_pages;
+   unsigned long size, realsize;
 
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
 
-   /*
-* Adjust realsize so that it accounts for how much memory
-* is used by this zone for memmap. This affects the watermark
-* and per-cpu initialisations
-*/
-   memmap_pages =
-   PAGE_ALIGN(size * sizeof(struct page))  PAGE_SHIFT;
-   if (realsize >= memmap_pages) {
-   realsize -= memmap_pages;
-   if (memmap_pages)
-   printk(KERN_DEBUG
-%s zone: %lu pages used for memmap\n,
-  zone_names[j], memmap_pages);
-   } else
-   printk(KERN_WARNING
- %s zone: %lu pages exceeds realsize %lu\n,
-   zone_names[j], memmap_pages, realsize);
-
/* Account for reserved pages */
if (j == 0  realsize  dma_reserve) {
realsize -= dma_reserve;
-- 
1.7.4.4



[V5 PATCH 00/26] mm, memory-hotplug: dynamic configure movable memory and introduce movable node

2012-10-29 Thread Lai Jiangshan
Movable memory is a very important concept in memory management;
we need to consolidate it and make use of it on systems.

Movable memory is needed for
o   anti-fragmentation (hugepages, big-order allocations, ...)
o   logical hot-remove (virtualization, Memory capacity on Demand)
o   physical hot-remove (power saving, hardware partitioning, hardware fault
    management)

All of these require dynamically configuring memory and making memory usage
better and safer. We also need physical hot-remove, so we need movable nodes too.
(Although some systems support physical memory migration, we don't require all
memory on a physical node to be movable; a movable node is still needed for the
logical node if we want to make physical migration transparent.)

We add the dynamic configuration commands online_movable and online_kernel.
We also add the non-dynamic boot option kernelcore_max_addr.
We may add more dynamic/non-dynamic configuration in the future.


The patchset is based on 3.7-rc3 with these three patches already applied:
https://lkml.org/lkml/2012/10/24/151
https://lkml.org/lkml/2012/10/26/150

You can also simply pull all the patches from:
git pull https://github.com/laijs/linux.git hotplug-next



Issues):

mempolicy (M_BIND) doesn't act well when the nodemask contains only movable
nodes: kernel allocations fail and the task can't create new tasks or other
kernel objects.

So we change the strategy/policy:
	when the bound nodemask has movable node(s) only, we apply
	the mempolicy to userspace allocations only and do not apply it
	to kernel allocations.

CPUSET also has the same problem, but that code is spread across page_alloc.c
and we haven't fixed it yet; we can/will change the allocation strategy to one of
these 3 strategies:
1) the same strategy as mempolicy
2) change cpuset, make nodemask always has at least a normal node
3) split nodemask: nodemask_user and nodemask_kernel

Thoughts?



Patches):

Patch 1-3:   add online_movable and online_kernel, but they don't yet allow a
             movable node
Patch 4:     cleanup for node_state_attr
Patch 5:     introduce N_MEMORY
Patch 6-17:  use N_MEMORY instead of N_HIGH_MEMORY.
             The patches are separated by subsystem;
             Patch 17 also changes the node_states initialization
Patch 18-20: add the MOVABLE-dedicated node
Patch 21-25: add kernelcore_max_addr
Patch 26:    mempolicy handles movable nodes




Changes):

change V5-V4:
consolidate online_movable/online_kernel
nodemask management

change V4-v3
rebase.
online_movable/online_kernel can create a zone from empty
or empty a zone

change V3-v2:
Proper nodemask management

change V2-V1:

The original V1 patchset of MOVABLE-dedicated node is here:
http://comments.gmane.org/gmane.linux.kernel.mm/78122

The new V2 adds N_MEMORY and the notion of a MOVABLE-dedicated node,
and fixes some related problems.

The orignal V1 patchset of add online_movable is here:
https://lkml.org/lkml/2012/7/4/145

The new V2 discards the MIGRATE_HOTREMOVE approach and uses a more
straightforward implementation (only 1 patch).



Lai Jiangshan (22):
  mm, memory-hotplug: dynamic configure movable memory and portion
memory
  memory_hotplug: handle empty zone when online_movable/online_kernel
  memory_hotplug: ensure every online node has NORMAL memory
  node: cleanup node_state_attr
  node_states: introduce N_MEMORY
  cpuset: use N_MEMORY instead N_HIGH_MEMORY
  procfs: use N_MEMORY instead N_HIGH_MEMORY
  memcontrol: use N_MEMORY instead N_HIGH_MEMORY
  oom: use N_MEMORY instead N_HIGH_MEMORY
  mm,migrate: use N_MEMORY instead N_HIGH_MEMORY
  mempolicy: use N_MEMORY instead N_HIGH_MEMORY
  hugetlb: use N_MEMORY instead N_HIGH_MEMORY
  vmstat: use N_MEMORY instead N_HIGH_MEMORY
  kthread: use N_MEMORY instead N_HIGH_MEMORY
  init: use N_MEMORY instead N_HIGH_MEMORY
  vmscan: use N_MEMORY instead N_HIGH_MEMORY
  page_alloc: use N_MEMORY instead N_HIGH_MEMORY change the node_states
initialization
  hotplug: update nodemasks management
  numa: add CONFIG_MOVABLE_NODE for movable-dedicated node
  memory_hotplug: allow online/offline memory to result movable node
  page_alloc: add kernelcore_max_addr
  mempolicy: fix is_valid_nodemask()

Yasuaki Ishimatsu (4):
  x86: get pg_data_t's memory from other node
  x86: use memblock_set_current_limit() to set memblock.current_limit
  memblock: limit memory address from memblock
  memblock: compare current_limit with end variable at
memblock_find_in_range_node()

 Documentation/cgroups/cpusets.txt   |2 +-
 Documentation/kernel-parameters.txt |9 +
 Documentation/memory-hotplug.txt|   19 ++-
 arch/x86/kernel/setup.c |4 +-
 arch/x86/mm/init_64.c   |4 +-
 arch/x86/mm/numa.c  |8 +-
 drivers/base/memory.c   |   27 ++--
 drivers/base/node.c |   28 ++--
 fs/proc/kcore.c |2 +-
 fs/proc/task_mmu.c

[V5 PATCH 01/26] mm, memory-hotplug: dynamic configure movable memory and portion memory

2012-10-29 Thread Lai Jiangshan
Add online_movable and online_kernel for logical memory hotplug.
This is the dynamic version of movablecore & kernelcore.

We have the same reason to introduce it as we had for introducing
movablecore & kernelcore: it has the same motive, but it is
dynamic/run-time:

o   We can configure memory as kernelcore or movablecore after boot.

    When the userspace workload increases and we need more hugepages, we
    can use online_movable to add memory and allow the system to use more
    THP (transparent huge pages), and vice versa when the kernel workload
    increases.

    This also helps virtualization to dynamically configure the host/guest's
    memory, to save memory (reduce waste).

    Memory capacity on Demand.

o   When a new node is physically onlined after boot, we need to use
    online_movable or online_kernel to configure/partition it
    as we expect when we logically online it.

    This configuration also helps physical memory migration.

o   All the same benefits as the existing movablecore & kernelcore.

o   Preparing for movable nodes, which are very important for power saving,
    hardware partitioning and highly-available systems (hardware fault
    management).

(Note, we don't introduce movable-node here.)


Action behavior:
When a memory block/section is onlined by online_movable, the kernel
will have no direct references to the pages of that memory block,
thus we can remove the memory at any time when needed.

When it is onlined by online_kernel, the kernel can use it.
When it is onlined by online, the zone type is not changed.

Current constraints:
Only a memory block which is adjacent to ZONE_MOVABLE
can be onlined from ZONE_NORMAL to ZONE_MOVABLE.


Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 Documentation/memory-hotplug.txt |   14 +-
 drivers/base/memory.c|   27 ++
 include/linux/memory_hotplug.h   |   13 +-
 mm/memory_hotplug.c  |  101 +-
 4 files changed, 142 insertions(+), 13 deletions(-)

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 6e6cbc7..c6f993d 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -161,7 +161,8 @@ a recent addition and not present on older kernels.
in the memory block.
 'state'   : read-write
 at read:  contains online/offline state of memory.
-at write: user can specify online, offline command
+at write: user can specify online_kernel,
+online_movable, online, offline command
 which will be performed on al sections in the block.
 'phys_device' : read-only: designed to show the name of physical memory
 device.  This is not well implemented now.
@@ -255,6 +256,17 @@ For onlining, you have to write online to the section's 
state file as:
 
 % echo online  /sys/devices/system/memory/memoryXXX/state
 
+This onlining will not change the ZONE type of the target memory section,
+If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE:
+
+% echo online_movable  /sys/devices/system/memory/memoryXXX/state
+(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE)
+
+And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL:
+
+% echo online_kernel  /sys/devices/system/memory/memoryXXX/state
+(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL)
+
 After this, section memoryXXX's state will be 'online' and the amount of
 available memory will be increased.
 
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 86c8821..15a1dd7 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -246,7 +246,7 @@ static bool pages_correctly_reserved(unsigned long 
start_pfn,
  * OK to have direct references to sparsemem variables in here.
  */
 static int
-memory_block_action(unsigned long phys_index, unsigned long action)
+memory_block_action(unsigned long phys_index, unsigned long action, int 
online_type)
 {
unsigned long start_pfn;
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
@@ -261,7 +261,7 @@ memory_block_action(unsigned long phys_index, unsigned long 
action)
if (!pages_correctly_reserved(start_pfn, nr_pages))
return -EBUSY;
 
-   ret = online_pages(start_pfn, nr_pages);
+   ret = online_pages(start_pfn, nr_pages, online_type);
break;
case MEM_OFFLINE:
ret = offline_pages(start_pfn, nr_pages);
@@ -276,7 +276,8 @@ memory_block_action(unsigned long phys_index, unsigned long 
action)
 }
 
 static int __memory_block_change_state(struct memory_block *mem,
-   unsigned long to_state, unsigned long from_state_req

[V5 PATCH 03/26] memory_hotplug: ensure every online node has NORMAL memory

2012-10-29 Thread Lai Jiangshan
The old memory hotplug code and the new online_movable may leave an online
node without any normal memory, but memory management behaves badly when we
have nodes which are online but have no normal memory.
Example: a bound task may fail on all kernel allocations and then be unable
to create new tasks or other kernel objects.

So we disallow such non-normal-memory nodes here; we will enable them
when we are prepared.


Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/memory_hotplug.c |   40 
 1 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bdcdaf6..9af9641 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -589,6 +589,12 @@ static int online_pages_range(unsigned long start_pfn, 
unsigned long nr_pages,
return 0;
 }
 
+/* ensure every online node has NORMAL memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+   return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+}
+
 /* check which state of node_states will be changed when online memory */
 static void node_states_check_changes_online(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -654,6 +660,12 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages, int online_typ
 */
zone = page_zone(pfn_to_page(pfn));
 
+   if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+       !can_online_high_movable(zone)) {
+   unlock_memory_hotplug();
+   return -1;
+   }
+
if (online_type == ONLINE_KERNEL  zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
unlock_memory_hotplug();
@@ -1058,6 +1070,30 @@ check_pages_isolated(unsigned long start_pfn, unsigned 
long end_pfn)
return offlined;
 }
 
+/* ensure the node has NORMAL memory if it is still online */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+   struct pglist_data *pgdat = zone->zone_pgdat;
+   unsigned long present_pages = 0;
+   enum zone_type zt;
+
+   for (zt = 0; zt <= ZONE_NORMAL; zt++)
+       present_pages += pgdat->node_zones[zt].present_pages;
+
+   if (present_pages > nr_pages)
+       return true;
+
+   present_pages = 0;
+   for (; zt <= ZONE_MOVABLE; zt++)
+       present_pages += pgdat->node_zones[zt].present_pages;
+
+   /*
+    * we can't offline the last normal memory until all
+    * higher memory is offlined.
+    */
+   return present_pages == 0;
+}
+
 /* check which state of node_states will be changed when offline memory */
 static void node_states_check_changes_offline(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1145,6 +1181,10 @@ static int __ref __offline_pages(unsigned long start_pfn,
node = zone_to_nid(zone);
nr_pages = end_pfn - start_pfn;
 
+   ret = -EINVAL;
+   if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+   goto out;
+
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, 
true);
if (ret)
-- 
1.7.4.4



[V5 PATCH 09/26] oom: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory; we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Hillf Danton dhi...@gmail.com
---
 mm/oom_kill.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 79e0f3e..aa2d89c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -257,7 +257,7 @@ static enum oom_constraint constrained_alloc(struct 
zonelist *zonelist,
 * the page allocator means a mempolicy is in effect.  Cpuset policy
 * is enforced in get_page_from_freelist().
 */
-   if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+   if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) {
*totalpages = total_swap_pages;
for_each_node_mask(nid, *nodemask)
*totalpages += node_spanned_pages(nid);
-- 
1.7.4.4



[V5 PATCH 20/26] memory_hotplug: allow online/offline memory to result movable node

2012-10-29 Thread Lai Jiangshan
Now memory management can handle movable nodes and nodes which don't have
any normal memory, so we can dynamically configure and add a movable node by:
	onlining ZONE_MOVABLE memory on a previously offline node
	offlining the last normal memory, which results in a non-normal-memory node

A movable node is very important for power saving,
hardware partitioning and highly-available systems (hardware fault management).


Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/memory_hotplug.c |   16 
 1 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a55b547..756744c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -589,11 +589,19 @@ static int online_pages_range(unsigned long start_pfn, 
unsigned long nr_pages,
return 0;
 }
 
+#ifdef CONFIG_MOVABLE_NODE
+/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+   return true;
+}
+#else /* #ifdef CONFIG_MOVABLE_NODE */
 /* ensure every online node has NORMAL memory */
 static bool can_online_high_movable(struct zone *zone)
 {
return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
 }
+#endif /* #ifdef CONFIG_MOVABLE_NODE */
 
 /* check which state of node_states will be changed when online memory */
 static void node_states_check_changes_online(unsigned long nr_pages,
@@ -1097,6 +1105,13 @@ check_pages_isolated(unsigned long start_pfn, unsigned 
long end_pfn)
return offlined;
 }
 
+#ifdef CONFIG_MOVABLE_NODE
+/* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+   return true;
+}
+#else /* #ifdef CONFIG_MOVABLE_NODE */
 /* ensure the node has NORMAL memory if it is still online */
 static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
 {
@@ -1120,6 +1135,7 @@ static bool can_offline_normal(struct zone *zone, 
unsigned long nr_pages)
 */
return present_pages == 0;
 }
+#endif /* #ifdef CONFIG_MOVABLE_NODE */
 
 /* check which state of node_states will be changed when offline memory */
 static void node_states_check_changes_offline(unsigned long nr_pages,
-- 
1.7.4.4



[V5 PATCH 04/26] node: cleanup node_state_attr

2012-10-29 Thread Lai Jiangshan
Use "[index] = init_value" designated initializers and use the N_x names
instead of hardcoded indexes.

This makes the table more readable and makes it easy to add new states.
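
For readers unfamiliar with the idiom, a tiny sketch (hypothetical enum and
array, not from this patch) of "[index] = value" designated initializers:

enum color { RED, GREEN, BLUE, NR_COLORS };

/* each entry lands at the slot named by the enum, regardless of order */
static const char *color_name[NR_COLORS] = {
	[GREEN]	= "green",
	[RED]	= "red",
	[BLUE]	= "blue",
};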

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 drivers/base/node.c |   20 ++--
 1 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index af1a177..5d7731e 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -614,23 +614,23 @@ static ssize_t show_node_state(struct device *dev,
{ __ATTR(name, 0444, show_node_state, NULL), state }
 
 static struct node_attr node_state_attr[] = {
-   _NODE_ATTR(possible, N_POSSIBLE),
-   _NODE_ATTR(online, N_ONLINE),
-   _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
-   _NODE_ATTR(has_cpu, N_CPU),
+   [N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
+   [N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
+   [N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
 #ifdef CONFIG_HIGHMEM
-   _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
+   [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
+   [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 };
 
 static struct attribute *node_state_attrs[] = {
-   node_state_attr[0].attr.attr,
-   node_state_attr[1].attr.attr,
-   node_state_attr[2].attr.attr,
-   node_state_attr[3].attr.attr,
+   node_state_attr[N_POSSIBLE].attr.attr,
+   node_state_attr[N_ONLINE].attr.attr,
+   node_state_attr[N_NORMAL_MEMORY].attr.attr,
 #ifdef CONFIG_HIGHMEM
-   node_state_attr[4].attr.attr,
+   node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
+   node_state_attr[N_CPU].attr.attr,
NULL
 };
 
-- 
1.7.4.4



[V5 PATCH 13/26] vmstat: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory; we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Christoph Lameter c...@linux.com
---
 mm/vmstat.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index c737057..1b5cacd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -930,7 +930,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
 
/* check memoryless node */
-   if (!node_state(pgdat-node_id, N_HIGH_MEMORY))
+   if (!node_state(pgdat-node_id, N_MEMORY))
return 0;
 
seq_printf(m, Page block order: %d\n, pageblock_order);
@@ -1292,7 +1292,7 @@ static int unusable_show(struct seq_file *m, void *arg)
pg_data_t *pgdat = (pg_data_t *)arg;
 
/* check memoryless node */
-   if (!node_state(pgdat-node_id, N_HIGH_MEMORY))
+   if (!node_state(pgdat-node_id, N_MEMORY))
return 0;
 
walk_zones_in_node(m, pgdat, unusable_show_print);
-- 
1.7.4.4



[V5 PATCH 17/26] page_alloc: use N_MEMORY instead N_HIGH_MEMORY change the node_states initialization

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory; we should
use N_MEMORY instead.

Since we introduced N_MEMORY, we update the initialization of node_states.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 arch/x86/mm/init_64.c |4 +++-
 mm/page_alloc.c   |   40 ++--
 2 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3baff25..2ead3c8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -630,7 +630,9 @@ void __init paging_init(void)
 *   numa support is not compiled in, and later node_set_state
 *   will not set it back.
 */
-   node_clear_state(0, N_NORMAL_MEMORY);
+   node_clear_state(0, N_MEMORY);
+   if (N_MEMORY != N_NORMAL_MEMORY)
+   node_clear_state(0, N_NORMAL_MEMORY);
 
zone_sizes_init();
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b1ef9b0..b70c929 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1692,7 +1692,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, 
unsigned long mark,
  *
  * If the zonelist cache is present in the passed in zonelist, then
  * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
+ * tasks mems_allowed, or node_states[N_MEMORY].)
  *
  * If the zonelist cache is not available for this zonelist, does
  * nothing and returns NULL.
@@ -1721,7 +1721,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, 
int alloc_flags)
 
	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
					&cpuset_current_mems_allowed :
-					&node_states[N_HIGH_MEMORY];
+					&node_states[N_MEMORY];
return allowednodes;
 }
 
@@ -3194,7 +3194,7 @@ static int find_next_best_node(int node, nodemask_t 
*used_node_mask)
return node;
}
 
-   for_each_node_state(n, N_HIGH_MEMORY) {
+   for_each_node_state(n, N_MEMORY) {
 
/* Don't want a node to appear more than once */
if (node_isset(n, *used_node_mask))
@@ -3336,7 +3336,7 @@ static int default_zonelist_order(void)
 * local memory, NODE_ORDER may be suitable.
  */
average_size = total_size /
-   (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+   (nodes_weight(node_states[N_MEMORY]) + 1);
for_each_online_node(nid) {
low_kmem_size = 0;
total_size = 0;
@@ -4669,7 +4669,7 @@ unsigned long __init 
find_min_pfn_with_active_regions(void)
 /*
  * early_calculate_totalpages()
  * Sum pages in active regions for movable zone.
- * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ * Populate N_MEMORY for calculating usable_nodes.
  */
 static unsigned long __init early_calculate_totalpages(void)
 {
@@ -4682,7 +4682,7 @@ static unsigned long __init 
early_calculate_totalpages(void)
 
totalpages += pages;
if (pages)
-   node_set_state(nid, N_HIGH_MEMORY);
+   node_set_state(nid, N_MEMORY);
}
return totalpages;
 }
@@ -4699,9 +4699,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
-   nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
+   nodemask_t saved_node_state = node_states[N_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
-   int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+   int usable_nodes = nodes_weight(node_states[N_MEMORY]);
 
/*
 * If movablecore was specified, calculate what size of
@@ -4736,7 +4736,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
 
/*
@@ -4828,23 +4828,27 @@ restart:
 
 out:
/* restore the node_state */
-   node_states[N_HIGH_MEMORY] = saved_node_state;
+   node_states[N_MEMORY] = saved_node_state;
 }
 
-/* Any regular memory on that node ? */
-static void __init check_for_regular_memory(pg_data_t *pgdat)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
 {
-#ifdef CONFIG_HIGHMEM
enum zone_type zone_type;
 
-   for (zone_type = 0; zone_type = ZONE_NORMAL; zone_type

[V5 PATCH 08/26] memcontrol: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that has normal or high memory.
N_MEMORY stands for the nodes that has any memory.

The code here need to handle with the nodes which have memory, we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/memcontrol.c  |   18 +-
 mm/page_cgroup.c |2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7acf43b..1b69665 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -800,7 +800,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct 
mem_cgroup *memcg,
int nid;
u64 total = 0;
 
-   for_each_node_state(nid, N_HIGH_MEMORY)
+   for_each_node_state(nid, N_MEMORY)
total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
return total;
 }
@@ -1611,9 +1611,9 @@ static void mem_cgroup_may_update_nodemask(struct 
mem_cgroup *memcg)
return;
 
/* make a nodemask where this memcg uses memory from */
-   memcg-scan_nodes = node_states[N_HIGH_MEMORY];
+   memcg-scan_nodes = node_states[N_MEMORY];
 
-   for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+   for_each_node_mask(nid, node_states[N_MEMORY]) {
 
if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
node_clear(nid, memcg-scan_nodes);
@@ -1684,7 +1684,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup 
*memcg, bool noswap)
/*
 * Check rest of nodes.
 */
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
if (node_isset(nid, memcg-scan_nodes))
continue;
if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
@@ -3759,7 +3759,7 @@ move_account:
drain_all_stock_sync(memcg);
ret = 0;
mem_cgroup_start_move(memcg);
-   for_each_node_state(node, N_HIGH_MEMORY) {
+   for_each_node_state(node, N_MEMORY) {
for (zid = 0; !ret  zid  MAX_NR_ZONES; zid++) {
enum lru_list lru;
for_each_lru(lru) {
@@ -4087,7 +4087,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, 
struct cftype *cft,
 
total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
seq_printf(m, total=%lu, total_nr);
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
seq_printf(m,  N%d=%lu, nid, node_nr);
}
@@ -4095,7 +4095,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, 
struct cftype *cft,
 
file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
seq_printf(m, file=%lu, file_nr);
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_FILE);
seq_printf(m,  N%d=%lu, nid, node_nr);
@@ -4104,7 +4104,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, 
struct cftype *cft,
 
anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
seq_printf(m, anon=%lu, anon_nr);
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
LRU_ALL_ANON);
seq_printf(m,  N%d=%lu, nid, node_nr);
@@ -4113,7 +4113,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, 
struct cftype *cft,
 
unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
seq_printf(m, unevictable=%lu, unevictable_nr);
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
BIT(LRU_UNEVICTABLE));
seq_printf(m,  N%d=%lu, nid, node_nr);
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5ddad0c..c1054ad 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -271,7 +271,7 @@ void __init page_cgroup_init(void)
if (mem_cgroup_disabled())
return;
 
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
unsigned long start_pfn, end_pfn;
 
start_pfn = node_start_pfn(nid);
-- 
1.7.4.4



[V5 PATCH 19/26] numa: add CONFIG_MOVABLE_NODE for movable-dedicated node

2012-10-29 Thread Lai Jiangshan
All the preparations are done, so we can actually introduce N_MEMORY.
Add CONFIG_MOVABLE_NODE so that we can use it for a movable-dedicated node.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 drivers/base/node.c  |6 ++
 include/linux/nodemask.h |4 
 mm/Kconfig   |8 
 mm/page_alloc.c  |3 +++
 4 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 4c3aa7c..9cdd66f 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -620,6 +620,9 @@ static struct node_attr node_state_attr[] = {
 #ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+   [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+#endif
[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 };
 
@@ -630,6 +633,9 @@ static struct attribute *node_state_attrs[] = {
 #ifdef CONFIG_HIGHMEM
node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+   node_state_attr[N_MEMORY].attr.attr,
+#endif
node_state_attr[N_CPU].attr.attr,
NULL
 };
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index c6ebdc9..4e2cbfa 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -380,7 +380,11 @@ enum node_states {
 #else
N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+   N_MEMORY,   /* The node has memory(regular, high, movable) 
*/
+#else
N_MEMORY = N_HIGH_MEMORY,
+#endif
N_CPU,  /* The node has one or more cpus */
NR_NODE_STATES
 };
diff --git a/mm/Kconfig b/mm/Kconfig
index a3f8ddd..957ebd5 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,14 @@ config NO_BOOTMEM
 config MEMORY_ISOLATION
boolean
 
+config MOVABLE_NODE
+   boolean Enable to assign a node has only movable memory
+   depends on HAVE_MEMBLOCK
+   depends on NO_BOOTMEM
+   depends on X86_64
+   depends on NUMA
+   default y
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
bool Allow for memory hot-add
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b70c929..a42337f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 #ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+   [N_MEMORY] = { { [0] = 1UL } },
+#endif
[N_CPU] = { { [0] = 1UL } },
 #endif /* NUMA */
 };
-- 
1.7.4.4



[V5 PATCH 24/26] memblock: limit memory address from memblock

2012-10-29 Thread Lai Jiangshan
From: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

Setting kernelcore_max_pfn means all memory above the boot parameter
is allocated as ZONE_MOVABLE, so memory allocated by memblock should
also be limited by the parameter.

The patch limits the memory allocated from memblock accordingly.

Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/memblock.h |1 +
 mm/memblock.c|5 -
 mm/page_alloc.c  |6 +-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index d452ee1..3e52911 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -42,6 +42,7 @@ struct memblock {
 
 extern struct memblock memblock;
 extern int memblock_debug;
+extern phys_addr_t memblock_limit;
 
 #define memblock_dbg(fmt, ...) \
if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
diff --git a/mm/memblock.c b/mm/memblock.c
index 6259055..ee2e307 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -957,7 +957,10 @@ void __init_memblock memblock_trim_memory(phys_addr_t 
align)
 
 void __init_memblock memblock_set_current_limit(phys_addr_t limit)
 {
-   memblock.current_limit = limit;
+   if (!memblock_limit || (memblock_limit > limit))
+   memblock.current_limit = limit;
+   else
+   memblock.current_limit = memblock_limit;
 }
 
 static void __init_memblock memblock_dump(struct memblock_type *type, char 
*name)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 11df8b5..f76b696 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -208,6 +208,8 @@ static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 
+phys_addr_t memblock_limit;
+
 /* movable_zone is the real zone pages in ZONE_MOVABLE are taken from */
 int movable_zone;
 EXPORT_SYMBOL(movable_zone);
@@ -4976,7 +4978,9 @@ static int __init cmdline_parse_core(char *p, unsigned 
long *core)
  */
 static int __init cmdline_parse_kernelcore_max_addr(char *p)
 {
-   return cmdline_parse_core(p, required_kernelcore_max_pfn);
+   cmdline_parse_core(p, required_kernelcore_max_pfn);
+   memblock_limit = required_kernelcore_max_pfn  PAGE_SHIFT;
+   return 0;
 }
 early_param(kernelcore_max_addr, cmdline_parse_kernelcore_max_addr);
 #endif
-- 
1.7.4.4



[V5 PATCH 18/26] hotplug: update nodemasks management

2012-10-29 Thread Lai Jiangshan
Update nodemask management for N_MEMORY.
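
A rough sketch of how a hotplug notifier might consume the new field (the
callback and the prepare_highmem_structs() helper are hypothetical, not part
of this series):

	static int foo_mem_callback(struct notifier_block *nb,
				    unsigned long action, void *arg)
	{
		struct memory_notify *mn = arg;

		/* the node is about to gain its first high memory */
		if (action == MEM_GOING_ONLINE && mn->status_change_nid_high >= 0)
			prepare_highmem_structs(mn->status_change_nid_high);
		return NOTIFY_OK;
	}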

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 Documentation/memory-hotplug.txt |5 ++-
 include/linux/memory.h   |1 +
 mm/memory_hotplug.c  |   87 +++---
 3 files changed, 77 insertions(+), 16 deletions(-)

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index c6f993d..8e5eacb 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -390,6 +390,7 @@ struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
+   int status_change_nid_high;
int status_change_nid;
 }
 
@@ -397,7 +398,9 @@ start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
 status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
 is (will be) set/clear, if this is -1, then nodemask status is not changed.
-status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
+status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
+status_change_nid is set node id when N_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
 If status_changed_nid* = 0, callback should create/discard structures for the
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a09216d..45e93b4 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -54,6 +54,7 @@ struct memory_notify {
unsigned long start_pfn;
unsigned long nr_pages;
int status_change_nid_normal;
+   int status_change_nid_high;
int status_change_nid;
 };
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9af9641..a55b547 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -603,13 +603,15 @@ static void node_states_check_changes_online(unsigned 
long nr_pages,
enum zone_type zone_last = ZONE_NORMAL;
 
/*
-* If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
-* which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL.
+* If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+* contains nodes which have zones of 0...ZONE_NORMAL,
+* set zone_last to ZONE_NORMAL.
 *
-* If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes
-* which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+* If we don't have HIGHMEM nor movable node,
+* node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+* 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
 */
-   if (N_HIGH_MEMORY == N_NORMAL_MEMORY)
+   if (N_MEMORY == N_NORMAL_MEMORY)
zone_last = ZONE_MOVABLE;
 
/*
@@ -623,12 +625,34 @@ static void node_states_check_changes_online(unsigned 
long nr_pages,
else
arg-status_change_nid_normal = -1;
 
+#ifdef CONFIG_HIGHMEM
+   /*
+* If we have movable node, node_states[N_HIGH_MEMORY]
+* contains nodes which have zones of 0...ZONE_HIGH,
+* set zone_last to ZONE_HIGH.
+*
+* If we don't have movable node, node_states[N_NORMAL_MEMORY]
+* contains nodes which have zones of 0...ZONE_MOVABLE,
+* set zone_last to ZONE_MOVABLE.
+*/
+   zone_last = ZONE_HIGH;
+   if (N_MEMORY == N_HIGH_MEMORY)
+   zone_last = ZONE_MOVABLE;
+
+   if (zone_idx(zone) = zone_last  !node_state(nid, N_HIGH_MEMORY))
+   arg-status_change_nid_high = nid;
+   else
+   arg-status_change_nid_high = -1;
+#else
+   arg-status_change_nid_high = arg-status_change_nid_normal;
+#endif
+
/*
 * if the node don't have memory befor online, we will need to
-* set the node to node_states[N_HIGH_MEMORY] after the memory
+* set the node to node_states[N_MEMORY] after the memory
 * is online.
 */
-   if (!node_state(nid, N_HIGH_MEMORY))
+   if (!node_state(nid, N_MEMORY))
arg-status_change_nid = nid;
else
arg-status_change_nid = -1;
@@ -639,7 +663,10 @@ static void node_states_set_node(int node, struct 
memory_notify *arg)
if (arg-status_change_nid_normal = 0)
node_set_state(node, N_NORMAL_MEMORY);
 
-   node_set_state(node, N_HIGH_MEMORY);
+   if (arg-status_change_nid_high = 0)
+   node_set_state(node, N_HIGH_MEMORY);
+
+   node_set_state(node, N_MEMORY);
 }
 
 
@@ -1103,13 +1130,15 @@ static void node_states_check_changes_offline(unsigned 
long nr_pages,
enum zone_type zt, zone_last = ZONE_NORMAL;
 
/*
-* If we have HIGHMEM, node_states[N_NORMAL_MEMORY

[V5 PATCH 05/26] node_states: introduce N_MEMORY

2012-10-29 Thread Lai Jiangshan
We have N_NORMAL_MEMORY, standing for the nodes that have normal memory
(zone_type <= ZONE_NORMAL).

And we have N_HIGH_MEMORY, standing for the nodes that have normal or high
memory.

But we don't have any word to stand for the nodes that have *any* memory.

And we have N_CPU but no N_MEMORY.

Current code reuses N_HIGH_MEMORY for this purpose because, currently, any
node which has memory must have high or normal memory.

A)  But this reuse is bad for *readability*, because the name
N_HIGH_MEMORY just stands for high or normal memory:

A.example 1)
mem_cgroup_nr_lru_pages():
for_each_node_state(nid, N_HIGH_MEMORY)

The user will be confused (why does this function only count high or
normal memory nodes? does it count ZONE_MOVABLE's lru pages?)
until someone else tells them that N_HIGH_MEMORY is reused to stand for
nodes that have any memory.

A.cont) If we introduce N_MEMORY, we can reduce this confusion
AND make the code clearer:

A.example 2) mm/page_cgroup.c uses N_HIGH_MEMORY twice:

One is in page_cgroup_init(void):
for_each_node_state(nid, N_HIGH_MEMORY) {

It means that if the node has memory, we will allocate the page_cgroup map
for the node. We should use N_MEMORY here instead to gain clarity.

The second use is in alloc_page_cgroup():
if (node_state(nid, N_HIGH_MEMORY))
addr = vzalloc_node(size, nid);

It means the node has high or normal memory that the kernel can allocate
from. We should keep N_HIGH_MEMORY here, and it would be better
if the 'any memory' semantic of N_HIGH_MEMORY were removed.

B)  This reuse becomes outdated once we introduce MOVABLE-dedicated nodes.
A MOVABLE-dedicated node should appear in neither
node_states[N_HIGH_MEMORY] nor node_states[N_NORMAL_MEMORY],
because a MOVABLE-dedicated node has no high or normal memory.

On x86_64, N_HIGH_MEMORY == N_NORMAL_MEMORY, so if a MOVABLE-dedicated node
is in node_states[N_HIGH_MEMORY], it is also in
node_states[N_NORMAL_MEMORY], which makes SLUB misbehave.

SLUB uses
for_each_node_state(nid, N_NORMAL_MEMORY)
and would create a kmem_cache_node for the MOVABLE-dedicated node, causing
problems.

In one word, we need N_MEMORY. We introduce it here as an alias of
N_HIGH_MEMORY and fix all improper usages of N_HIGH_MEMORY in later patches.
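
A small sketch of the intended split once the whole series is applied (the
two helpers below, init_node_data() and setup_kmem_cache_node(), are made up
for illustration):

	/* nodes that have any memory, including movable-only nodes */
	for_each_node_state(nid, N_MEMORY)
		init_node_data(nid);

	/* nodes the kernel itself (e.g. slab) can allocate from */
	for_each_node_state(nid, N_NORMAL_MEMORY)
		setup_kmem_cache_node(nid);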

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Christoph Lameter c...@linux.com
Acked-by: Hillf Danton dhi...@gmail.com
---
 include/linux/nodemask.h |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 7afc363..c6ebdc9 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -380,6 +380,7 @@ enum node_states {
 #else
N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
+   N_MEMORY = N_HIGH_MEMORY,
N_CPU,  /* The node has one or more cpus */
NR_NODE_STATES
 };
-- 
1.7.4.4



[V5 PATCH 07/26] procfs: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Hillf Danton dhi...@gmail.com
---
 fs/proc/kcore.c|2 +-
 fs/proc/task_mmu.c |4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67ee..e96d4f1 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
/* Not inializedupdate now */
/* find out max pfn */
end_pfn = 0;
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
unsigned long node_end;
node_end  = NODE_DATA(nid)-node_start_pfn +
NODE_DATA(nid)-node_spanned_pages;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9..2d89601 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, 
struct vm_area_struct *vma,
return NULL;
 
nid = page_to_nid(page);
-   if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+   if (!node_isset(nid, node_states[N_MEMORY]))
return NULL;
 
return page;
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int 
is_pid)
if (md-writeback)
seq_printf(m,  writeback=%lu, md-writeback);
 
-   for_each_node_state(n, N_HIGH_MEMORY)
+   for_each_node_state(n, N_MEMORY)
if (md-node[n])
seq_printf(m,  N%d=%lu, n, md-node[n]);
 out:
-- 
1.7.4.4



[V5 PATCH 12/26] hugetlb: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Hillf Danton dhi...@gmail.com
---
 drivers/base/node.c |2 +-
 mm/hugetlb.c|   24 
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/base/node.c b/drivers/base/node.c
index 5d7731e..4c3aa7c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -227,7 +227,7 @@ static node_registration_func_t __hugetlb_unregister_node;
 static inline bool hugetlb_register_node(struct node *node)
 {
if (__hugetlb_register_node 
-   node_state(node-dev.id, N_HIGH_MEMORY)) {
+   node_state(node-dev.id, N_MEMORY)) {
__hugetlb_register_node(node);
return true;
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 59a0059..7720ade 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 * on-line nodes with memory and will handle the hstate accounting.
 */
while (nr_pages--) {
-   if (!free_pool_huge_page(h, node_states[N_HIGH_MEMORY], 1))
+   if (!free_pool_huge_page(h, node_states[N_MEMORY], 1))
break;
}
 }
@@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct 
vm_area_struct *vma,
 int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
struct huge_bootmem_page *m;
-   int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+   int nr_nodes = nodes_weight(node_states[N_MEMORY]);
 
while (nr_nodes) {
void *addr;
 
addr = __alloc_bootmem_node_nopanic(
NODE_DATA(hstate_next_node_to_alloc(h,
-   node_states[N_HIGH_MEMORY])),
+   node_states[N_MEMORY])),
huge_page_size(h), huge_page_size(h), 0);
 
if (addr) {
@@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct 
hstate *h)
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_fresh_huge_page(h,
-node_states[N_HIGH_MEMORY]))
+node_states[N_MEMORY]))
break;
}
h-max_huge_pages = i;
@@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool 
obey_mempolicy,
if (!(obey_mempolicy 
init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
-   nodes_allowed = node_states[N_HIGH_MEMORY];
+   nodes_allowed = node_states[N_MEMORY];
}
} else if (nodes_allowed) {
/*
@@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool 
obey_mempolicy,
count += h-nr_huge_pages - h-nr_huge_pages_node[nid];
init_nodemask_of_node(nodes_allowed, nid);
} else
-   nodes_allowed = node_states[N_HIGH_MEMORY];
+   nodes_allowed = node_states[N_MEMORY];
 
h-max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
 
-   if (nodes_allowed != node_states[N_HIGH_MEMORY])
+   if (nodes_allowed != node_states[N_MEMORY])
NODEMASK_FREE(nodes_allowed);
 
return len;
@@ -1844,7 +1844,7 @@ static void hugetlb_register_all_nodes(void)
 {
int nid;
 
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
struct node *node = node_devices[nid];
if (node-dev.id == nid)
hugetlb_register_node(node);
@@ -1939,8 +1939,8 @@ void __init hugetlb_add_hstate(unsigned order)
for (i = 0; i  MAX_NUMNODES; ++i)
INIT_LIST_HEAD(h-hugepage_freelists[i]);
INIT_LIST_HEAD(h-hugepage_activelist);
-   h-next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
-   h-next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
+   h-next_nid_to_alloc = first_node(node_states[N_MEMORY]);
+   h-next_nid_to_free = first_node(node_states[N_MEMORY]);
snprintf(h-name, HSTATE_NAME_LEN, hugepages-%lukB,
huge_page_size(h)/1024);
/*
@@ -2035,11 +2035,11 @@ static int hugetlb_sysctl_handler_common(bool 
obey_mempolicy,
if (!(obey_mempolicy 
   init_nodemask_of_mempolicy(nodes_allowed))) {
NODEMASK_FREE(nodes_allowed);
-   nodes_allowed = node_states[N_HIGH_MEMORY

[V5 PATCH 23/26] x86: use memblock_set_current_limit() to set memblock.current_limit

2012-10-29 Thread Lai Jiangshan
From: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

memblock.current_limit is set directly even though
memblock_set_current_limit() is provided for this purpose. So fix it to use
the helper.

Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 arch/x86/kernel/setup.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..ab3017a 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -890,7 +890,7 @@ void __init setup_arch(char **cmdline_p)
 
cleanup_highmap();
 
-   memblock.current_limit = get_max_mapped();
+   memblock_set_current_limit(get_max_mapped());
memblock_x86_fill();
 
/*
@@ -940,7 +940,7 @@ void __init setup_arch(char **cmdline_p)
max_low_pfn = max_pfn;
}
 #endif
-   memblock.current_limit = get_max_mapped();
+   memblock_set_current_limit(get_max_mapped());
dma_contiguous_reserve(0);
 
/*
-- 
1.7.4.4



[V5 PATCH 14/26] kthread: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/kthread.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60c..691dc2e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -428,7 +428,7 @@ int kthreadd(void *unused)
set_task_comm(tsk, kthreadd);
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
-   set_mems_allowed(node_states[N_HIGH_MEMORY]);
+   set_mems_allowed(node_states[N_MEMORY]);
 
current-flags |= PF_NOFREEZE;
 
-- 
1.7.4.4



[V5 PATCH 25/26] memblock: compare current_limit with end variable at memblock_find_in_range_node()

2012-10-29 Thread Lai Jiangshan
From: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

memblock_find_in_range_node() does not compare memblock.current_limit
with the end variable. Thus even if memblock.current_limit is smaller than
the end variable, the function may allocate a memory address that is bigger
than memblock.current_limit.

The patch adds the check to memblock_find_in_range_node().
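
A worked example with made-up numbers: if memblock.current_limit is 4G and a
caller passes end = 8G, the added test clamps the search range:

	/* end == 8G, current_limit == 4G  =>  end becomes 4G */
	if ((end == MEMBLOCK_ALLOC_ACCESSIBLE) || (end > current_limit))
		end = current_limit;

so candidate ranges above 4G are never returned.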

Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/memblock.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/memblock.c b/mm/memblock.c
index ee2e307..50ab53c 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -100,11 +100,12 @@ phys_addr_t __init_memblock 
memblock_find_in_range_node(phys_addr_t start,
phys_addr_t align, int nid)
 {
phys_addr_t this_start, this_end, cand;
+   phys_addr_t current_limit = memblock.current_limit;
u64 i;
 
/* pump up @end */
-   if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
-   end = memblock.current_limit;
+   if ((end == MEMBLOCK_ALLOC_ACCESSIBLE) || (end  current_limit))
+   end = current_limit;
 
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
-- 
1.7.4.4



[V5 PATCH 22/26] x86: get pg_data_t's memory from other node

2012-10-29 Thread Lai Jiangshan
From: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com

If the system can create a movable node, in which all of the node's memory
is allocated as ZONE_MOVABLE, setup_node_data() cannot allocate memory for
the node's pg_data_t from that node.
So when memblock_alloc_nid() fails, setup_node_data() retries with
memblock_alloc().
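
The resulting allocation order is roughly (a simplified sketch of the patched
path, not the literal code):

	nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); /* try the node itself */
	if (!nd_pa)
		nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);  /* fall back to any node */
	if (!nd_pa)
		return;                                            /* give up */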

Signed-off-by: Yasuaki Ishimatsu isimatu.yasu...@jp.fujitsu.com
Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 arch/x86/mm/numa.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 2d125be..a86e315 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -223,9 +223,13 @@ static void __init setup_node_data(int nid, u64 start, u64 
end)
remapped = true;
} else {
nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid);
-   if (!nd_pa) {
-   pr_err(Cannot find %zu bytes in node %d\n,
+   if (!nd_pa)
+   printk(KERN_WARNING Cannot find %zu bytes in node 
%d\n,
   nd_size, nid);
+   nd_pa = memblock_alloc(nd_size, SMP_CACHE_BYTES);
+   if (!nd_pa) {
+   pr_err(Cannot find %zu bytes in other node\n,
+  nd_size);
return;
}
nd = __va(nd_pa);
-- 
1.7.4.4



[V5 PATCH 10/26] mm,migrate: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Christoph Lameter c...@linux.com
---
 mm/migrate.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 77ed2d7..d595e58 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1201,7 +1201,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t 
task_nodes,
if (node  0 || node = MAX_NUMNODES)
goto out_pm;
 
-   if (!node_state(node, N_HIGH_MEMORY))
+   if (!node_state(node, N_MEMORY))
goto out_pm;
 
err = -EACCES;
-- 
1.7.4.4



[V5 PATCH 26/26] mempolicy: fix is_valid_nodemask()

2012-10-29 Thread Lai Jiangshan
is_valid_nodemask() was introduced by 19770b32, but it does not match
its comment, because it does not check the zones above policy_zone.

Also, commit b377fd told us that if the highest zone is ZONE_MOVABLE,
we should also apply memory policies to it, so ZONE_MOVABLE should be a
valid zone for policies. is_valid_nodemask() needs to be changed to match
that.

Fix: check all zones, even those whose zone id is above policy_zone.
Use nodes_intersects() instead of open-coding the check.
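
For example (hypothetical setup): if policy->v.nodes contains only a
movable-only node, it does not intersect node_states[N_HIGH_MEMORY], so

	dynamic_policy_zone = ZONE_MOVABLE;

and the MPOL_BIND nodemask is then applied only to allocations with
gfp_zone(gfp) == ZONE_MOVABLE, i.e. movable user pages.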

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Reported-by: Wen Congyang we...@cn.fujitsu.com
---
 mm/mempolicy.c |   36 ++--
 1 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d4a084c..ed7c249 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -140,19 +140,7 @@ static const struct mempolicy_operations {
 /* Check that the nodemask contains at least one populated zone */
 static int is_valid_nodemask(const nodemask_t *nodemask)
 {
-   int nd, k;
-
-   for_each_node_mask(nd, *nodemask) {
-   struct zone *z;
-
-   for (k = 0; k = policy_zone; k++) {
-   z = NODE_DATA(nd)-node_zones[k];
-   if (z-present_pages  0)
-   return 1;
-   }
-   }
-
-   return 0;
+   return nodes_intersects(*nodemask, node_states[N_MEMORY]);
 }
 
 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
@@ -1572,6 +1560,26 @@ struct mempolicy *get_vma_policy(struct task_struct 
*task,
return pol;
 }
 
+static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
+{
+   enum zone_type dynamic_policy_zone = policy_zone;
+
+   BUG_ON(dynamic_policy_zone == ZONE_MOVABLE);
+
+   /*
+* if policy-v.nodes has movable memory only,
+* we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
+*
+* policy-v.nodes is intersect with node_states[N_MEMORY].
+* so if the following test faile, it implies
+* policy-v.nodes has movable memory only.
+*/
+   if (!nodes_intersects(policy-v.nodes, node_states[N_HIGH_MEMORY]))
+   dynamic_policy_zone = ZONE_MOVABLE;
+
+   return zone = dynamic_policy_zone;
+}
+
 /*
  * Return a nodemask representing a mempolicy for filtering nodes for
  * page allocation
@@ -1580,7 +1588,7 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct 
mempolicy *policy)
 {
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy-mode == MPOL_BIND) 
-   gfp_zone(gfp) = policy_zone 
+   apply_policy_zone(policy, gfp_zone(gfp)) 
cpuset_nodemask_valid_mems_allowed(policy-v.nodes))
return policy-v.nodes;
 
-- 
1.7.4.4



[V5 PATCH 15/26] init: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 init/main.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/init/main.c b/init/main.c
index 9cf77ab..9595968 100644
--- a/init/main.c
+++ b/init/main.c
@@ -855,7 +855,7 @@ static void __init kernel_init_freeable(void)
/*
 * init can allocate pages on any node
 */
-   set_mems_allowed(node_states[N_HIGH_MEMORY]);
+   set_mems_allowed(node_states[N_MEMORY]);
/*
 * init can run on any cpu.
 */
-- 
1.7.4.4



[V5 PATCH 06/26] cpuset: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Hillf Danton dhi...@gmail.com
---
 Documentation/cgroups/cpusets.txt |2 +-
 include/linux/cpuset.h|2 +-
 kernel/cpuset.c   |   32 
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/Documentation/cgroups/cpusets.txt 
b/Documentation/cgroups/cpusets.txt
index cefd3d8..12e01d4 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -218,7 +218,7 @@ and name space for cpusets, with a minimum of additional 
kernel code.
 The cpus and mems files in the root (top_cpuset) cpuset are
 read-only.  The cpus file automatically tracks the value of
 cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
 nodes with memory--using the cpuset_track_online_nodes() hook.
 
 
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 838320f..8c8a60d29 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -144,7 +144,7 @@ static inline nodemask_t cpuset_mems_allowed(struct 
task_struct *p)
return node_possible_map;
 }
 
-#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
+#define cpuset_current_mems_allowed (node_states[N_MEMORY])
 static inline void cpuset_init_current_mems_allowed(void) {}
 
 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f33c715..2b133db 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
  * are online, with memory.  If none are online with memory, walk
  * up the cpuset hierarchy until we find one that does have some
  * online mems.  If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_HIGH_MEMORY].
+ * found any online mems, return node_states[N_MEMORY].
  *
  * One way or another, we guarantee to return some non-empty subset
- * of node_states[N_HIGH_MEMORY].
+ * of node_states[N_MEMORY].
  *
  * Call with callback_mutex held.
  */
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 {
while (cs  !nodes_intersects(cs-mems_allowed,
-   node_states[N_HIGH_MEMORY]))
+   node_states[N_MEMORY]))
cs = cs-parent;
if (cs)
nodes_and(*pmask, cs-mems_allowed,
-   node_states[N_HIGH_MEMORY]);
+   node_states[N_MEMORY]);
else
-   *pmask = node_states[N_HIGH_MEMORY];
-   BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+   *pmask = node_states[N_MEMORY];
+   BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
 }
 
 /*
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct 
cpuset *trialcs,
return -ENOMEM;
 
/*
-* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+* top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
 * it's read-only
 */
if (cs == top_cpuset) {
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct 
cpuset *trialcs,
goto done;
 
if (!nodes_subset(trialcs-mems_allowed,
-   node_states[N_HIGH_MEMORY])) {
+   node_states[N_MEMORY])) {
retval =  -EINVAL;
goto done;
}
@@ -2034,7 +2034,7 @@ static struct cpuset *cpuset_next(struct list_head *queue)
  * before dropping down to the next.  It always processes a node before
  * any of its children.
  *
- * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
  * if all present pages from a node are offlined.
  */
 static void
@@ -2073,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum 
hotplug_event event)
 
/* Continue past cpusets with all mems online */
if (nodes_subset(cp-mems_allowed,
-   node_states[N_HIGH_MEMORY]))
+   node_states[N_MEMORY]))
continue;
 
oldmems = cp-mems_allowed;
@@ -2081,7 +2081,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum 
hotplug_event event

[V5 PATCH 02/26] memory_hotplug: handle empty zone when online_movable/online_kernel

2012-10-29 Thread Lai Jiangshan
Make online_movable/online_kernel able to empty a zone, or to move memory
into an empty zone.
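
For context, the intended user-visible operation looks roughly like this
(the memory block number is hypothetical):

	# move a block into ZONE_MOVABLE, even if ZONE_MOVABLE on that
	# node is currently empty
	echo offline        > /sys/devices/system/memory/memory32/state
	echo online_movable > /sys/devices/system/memory/memory32/state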

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/memory_hotplug.c |   51 +--
 1 files changed, 45 insertions(+), 6 deletions(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6d3bec4..bdcdaf6 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -227,8 +227,17 @@ static void resize_zone(struct zone *zone, unsigned long 
start_pfn,
 
zone_span_writelock(zone);
 
-   zone-zone_start_pfn = start_pfn;
-   zone-spanned_pages = end_pfn - start_pfn;
+   if (end_pfn - start_pfn) {
+   zone-zone_start_pfn = start_pfn;
+   zone-spanned_pages = end_pfn - start_pfn;
+   } else {
+   /*
+* make it consist as free_area_init_core(),
+* if spanned_pages = 0, then keep start_pfn = 0
+*/
+   zone-zone_start_pfn = 0;
+   zone-spanned_pages = 0;
+   }
 
zone_span_writeunlock(zone);
 }
@@ -244,10 +253,19 @@ static void fix_zone_id(struct zone *zone, unsigned long 
start_pfn,
set_page_links(pfn_to_page(pfn), zid, nid, pfn);
 }
 
-static int move_pfn_range_left(struct zone *z1, struct zone *z2,
+static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
unsigned long start_pfn, unsigned long end_pfn)
 {
+   int ret;
unsigned long flags;
+   unsigned long z1_start_pfn;
+
+   if (!z1-wait_table) {
+   ret = init_currently_empty_zone(z1, start_pfn,
+   end_pfn - start_pfn, MEMMAP_HOTPLUG);
+   if (ret)
+   return ret;
+   }
 
pgdat_resize_lock(z1-zone_pgdat, flags);
 
@@ -261,7 +279,13 @@ static int move_pfn_range_left(struct zone *z1, struct 
zone *z2,
if (end_pfn = z2-zone_start_pfn)
goto out_fail;
 
-   resize_zone(z1, z1-zone_start_pfn, end_pfn);
+   /* use start_pfn for z1's start_pfn if z1 is empty */
+   if (z1-spanned_pages)
+   z1_start_pfn = z1-zone_start_pfn;
+   else
+   z1_start_pfn = start_pfn;
+
+   resize_zone(z1, z1_start_pfn, end_pfn);
resize_zone(z2, end_pfn, z2-zone_start_pfn + z2-spanned_pages);
 
pgdat_resize_unlock(z1-zone_pgdat, flags);
@@ -274,10 +298,19 @@ out_fail:
return -1;
 }
 
-static int move_pfn_range_right(struct zone *z1, struct zone *z2,
+static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
unsigned long start_pfn, unsigned long end_pfn)
 {
+   int ret;
unsigned long flags;
+   unsigned long z2_end_pfn;
+
+   if (!z2-wait_table) {
+   ret = init_currently_empty_zone(z2, start_pfn,
+   end_pfn - start_pfn, MEMMAP_HOTPLUG);
+   if (ret)
+   return ret;
+   }
 
pgdat_resize_lock(z1-zone_pgdat, flags);
 
@@ -291,8 +324,14 @@ static int move_pfn_range_right(struct zone *z1, struct 
zone *z2,
if (start_pfn = z1-zone_start_pfn + z1-spanned_pages)
goto out_fail;
 
+   /* use end_pfn for z2's end_pfn if z2 is empty */
+   if (z2-spanned_pages)
+   z2_end_pfn = z2-zone_start_pfn + z2-spanned_pages;
+   else
+   z2_end_pfn = end_pfn;
+
resize_zone(z1, z1-zone_start_pfn, start_pfn);
-   resize_zone(z2, start_pfn, z2-zone_start_pfn + z2-spanned_pages);
+   resize_zone(z2, start_pfn, z2_end_pfn);
 
pgdat_resize_unlock(z1-zone_pgdat, flags);
 
-- 
1.7.4.4



[V5 PATCH 16/26] vmscan: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
Acked-by: Hillf Danton dhi...@gmail.com
---
 mm/vmscan.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2624edc..98a2e11 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3135,7 +3135,7 @@ static int __devinit cpu_callback(struct notifier_block 
*nfb,
int nid;
 
if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
pg_data_t *pgdat = NODE_DATA(nid);
const struct cpumask *mask;
 
@@ -3191,7 +3191,7 @@ static int __init kswapd_init(void)
int nid;
 
swap_setup();
-   for_each_node_state(nid, N_HIGH_MEMORY)
+   for_each_node_state(nid, N_MEMORY)
kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
-- 
1.7.4.4



[V5 PATCH 11/26] mempolicy: use N_MEMORY instead N_HIGH_MEMORY

2012-10-29 Thread Lai Jiangshan
N_HIGH_MEMORY stands for the nodes that have normal or high memory.
N_MEMORY stands for the nodes that have any memory.

The code here needs to handle the nodes which have memory, so we should
use N_MEMORY instead.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 mm/mempolicy.c |   12 ++--
 1 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d04a8a5..d4a084c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -212,9 +212,9 @@ static int mpol_set_nodemask(struct mempolicy *pol,
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
-   /* Check N_HIGH_MEMORY */
+   /* Check N_MEMORY */
nodes_and(nsc-mask1,
- cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
+ cpuset_current_mems_allowed, node_states[N_MEMORY]);
 
VM_BUG_ON(!nodes);
if (pol-mode == MPOL_PREFERRED  nodes_empty(*nodes))
@@ -1388,7 +1388,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, 
maxnode,
goto out_put;
}
 
-   if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
+   if (!nodes_subset(*new, node_states[N_MEMORY])) {
err = -EINVAL;
goto out_put;
}
@@ -2361,7 +2361,7 @@ void __init numa_policy_init(void)
 * fall back to the largest node if they're all smaller.
 */
nodes_clear(interleave_nodes);
-   for_each_node_state(nid, N_HIGH_MEMORY) {
+   for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
 
/* Preserve the largest node */
@@ -2442,7 +2442,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, 
int no_context)
*nodelist++ = '\0';
if (nodelist_parse(nodelist, nodes))
goto out;
-   if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
+   if (!nodes_subset(nodes, node_states[N_MEMORY]))
goto out;
} else
nodes_clear(nodes);
@@ -2476,7 +2476,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, 
int no_context)
 * Default to online nodes with memory if no nodelist
 */
if (!nodelist)
-   nodes = node_states[N_HIGH_MEMORY];
+   nodes = node_states[N_MEMORY];
break;
case MPOL_LOCAL:
/*
-- 
1.7.4.4



[V5 PATCH 21/26] page_alloc: add kernelcore_max_addr

2012-10-29 Thread Lai Jiangshan
The current ZONE_MOVABLE (kernelcore=) setting policy via boot option doesn't
meet our requirement. We need something like a kernelcore_max_addr=XX boot
option to limit the upper address of the kernelcore.

Memory at higher addresses will be migratable (movable), and it is easier to
offline (always ready to be offlined when the system doesn't require so much
memory).

This makes things easy when we dynamically hot-add/remove memory, makes
better use of memory, and helps THP.

kernelcore_max_addr=, kernelcore= and movablecore= can all be safely
specified at the same time (or any 2 of them).
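
For example (addresses are hypothetical): on a 16G machine, booting with

	kernelcore_max_addr=4G

keeps non-movable kernel allocations below 4G, while the 4G..16G range goes
to ZONE_MOVABLE and can later be offlined or hot-removed.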

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 Documentation/kernel-parameters.txt |9 +
 mm/page_alloc.c |   29 -
 2 files changed, 37 insertions(+), 1 deletions(-)

diff --git a/Documentation/kernel-parameters.txt 
b/Documentation/kernel-parameters.txt
index 9776f06..2b72ffb 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1223,6 +1223,15 @@ bytes respectively. Such letter suffixes can also be 
entirely omitted.
use the HighMem zone if it exists, and the Normal
zone if it does not.
 
+   kernelcore_max_addr=nn[KMG] [KNL,X86,IA-64,PPC] This parameter
+   is the same effect as kernelcore parameter, except it
+   specifies the up physical address of memory range
+   usable by the kernel for non-movable allocations.
+   If both kernelcore and kernelcore_max_addr are
+   specified, this requested's priority is higher than
+   kernelcore's.
+   See the kernelcore parameter.
+
kgdbdbgp=   [KGDB,HW] kgdb over EHCI usb debug port.
Format: Controller#[,poll interval]
The controller # is the number of the ehci usb debug
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a42337f..11df8b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -203,6 +203,7 @@ static unsigned long __meminitdata dma_reserve;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 static unsigned long __meminitdata 
arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __initdata required_kernelcore_max_pfn;
 static unsigned long __initdata required_kernelcore;
 static unsigned long __initdata required_movablecore;
 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -4700,6 +4701,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 {
int i, nid;
unsigned long usable_startpfn;
+   unsigned long kernelcore_max_pfn;
unsigned long kernelcore_node, kernelcore_remaining;
/* save the state before borrow the nodemask */
nodemask_t saved_node_state = node_states[N_MEMORY];
@@ -4728,6 +4730,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
required_kernelcore = max(required_kernelcore, corepages);
}
 
+   if (required_kernelcore_max_pfn  !required_kernelcore)
+   required_kernelcore = totalpages;
+
/* If kernelcore was not specified, there is no ZONE_MOVABLE */
if (!required_kernelcore)
goto out;
@@ -4736,6 +4741,12 @@ static void __init find_zone_movable_pfns_for_nodes(void)
find_usable_zone_for_movable();
usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
 
+   if (required_kernelcore_max_pfn)
+   kernelcore_max_pfn = required_kernelcore_max_pfn;
+   else
+   kernelcore_max_pfn = ULONG_MAX  PAGE_SHIFT;
+   kernelcore_max_pfn = max(kernelcore_max_pfn, usable_startpfn);
+
 restart:
/* Spread kernelcore memory as evenly as possible throughout nodes */
kernelcore_node = required_kernelcore / usable_nodes;
@@ -4762,8 +4773,12 @@ restart:
unsigned long size_pages;
 
start_pfn = max(start_pfn, zone_movable_pfn[nid]);
-   if (start_pfn = end_pfn)
+   end_pfn = min(kernelcore_max_pfn, end_pfn);
+   if (start_pfn = end_pfn) {
+   if (!zone_movable_pfn[nid])
+   zone_movable_pfn[nid] = start_pfn;
continue;
+   }
 
/* Account for what is only usable for kernelcore */
if (start_pfn  usable_startpfn) {
@@ -4954,6 +4969,18 @@ static int __init cmdline_parse_core(char *p, unsigned 
long *core)
return 0;
 }
 
+#ifdef CONFIG_MOVABLE_NODE
+/*
+ * kernelcore_max_addr=addr sets the up physical address of memory range
+ * for use for allocations that cannot be reclaimed or migrated.
+ */
+static int __init

Re: [PATCH] percpu-rwsem: use barrier in unlock path

2012-10-16 Thread Lai Jiangshan
On 10/17/2012 10:23 AM, Linus Torvalds wrote:
 [ Architecture people, note the potential new SMP barrier! ]
 
 On Tue, Oct 16, 2012 at 4:30 PM, Mikulas Patocka mpato...@redhat.com wrote:
 +   /*
 +* The lock is considered unlocked when p-locked is set to false.
 +* Use barrier prevent reordering of operations around p-locked.
 +*/
 +#if defined(CONFIG_X86)  (!defined(CONFIG_X86_PPRO_FENCE)  
 !defined(CONFIG_X86_OOSTORE))
 +   barrier();
 +#else
 +   smp_mb();
 +#endif
 p-locked = false;
 
 Ugh. The #if is too ugly to live.

Even if the previous patch is applied, percpu_down_read() still
needs an mb() to pair with it.

 
 This is a classic case of people who write their own serialization
 primitives invariably get them wrong. And this fix is just horrible,
 and code like this should not be allowed.

One of the biggest problems of 62ac665ff9fc07497ca524bd20d6a96893d11071 is
that it was merged without an Acked-by or Reviewed-by from Paul or Peter or
someone else who is an expert on synchronization/arch memory models.

I suggest any new synchronization primitive should stay in -tip for 2 or
more cycles before being merged to mainline.

Thanks,
Lai


Re: [PATCH] percpu-rwsem: use barrier in unlock path

2012-10-17 Thread Lai Jiangshan
On 10/18/2012 04:28 AM, Steven Rostedt wrote:
 On Wed, Oct 17, 2012 at 11:07:21AM -0400, Mikulas Patocka wrote:

 Even the previous patch is applied, percpu_down_read() still
 needs mb() to pair with it.

 percpu_down_read uses rcu_read_lock which should guarantee that memory 
 accesses don't escape in front of a rcu-protected section.
 
 You do realize that rcu_read_lock() does nothing more that a barrier(),
 right?
 
 Paul worked really hard to get rcu_read_locks() to not call HW barriers.
 

 If rcu_read_unlock has only an unlock barrier and not a full barrier, 
 memory accesses could be moved in front of rcu_read_unlock and reordered 
 with this_cpu_inc(*p-counters), but it doesn't matter because 
 percpu_down_write does synchronize_rcu(), so it never sees these accesses 
 halfway through.
 
 Looking at the patch, you are correct. The read side doesn't need the
 memory barrier as the worse thing that will happen is that it sees the
 locked = false, and will just grab the mutex unnecessarily.

-
A memory barrier can be added iff these two things are known:
1) which reordering it prevents (between what and what).
2) which mb() it pairs with.

You tried to add an mb() in percpu_up_write(). OK, I know it prevents the
reordering between the writes to the protected data and the statement
p->locked = false, but I can't find the corresponding mb() that it pairs
with.

reader                                          writer
------                                          ------
percpu_down_read()                              writes to the data
    the cpu caches/prefetches the data          writes to the data
    (which is chaos)                            writes to the data
                                                percpu_up_write()
                                                    mb()
                                                    p->locked = false;
    unlikely(p->locked)
        /* the cpu sees p->locked == false, so it does not
           discard the cached/prefetched data */
    this_cpu_inc(*p->counters);
    the read-access code runs
    *and it uses the chaos data*

So you need to add an mb() after unlikely(p->locked).

-

The RCU you use doesn't protect any data. It protects only the code of the
fast path:
unlikely(p->locked);
this_cpu_inc(*p->counters);

and synchronize_rcu() ensures all previous fast paths have fully finished
this_cpu_inc(*p->counters);.

It doesn't protect other code/data; if you want to protect other code or
other data, please add more synchronization or mb()s.

---

I extremely hate synchronization that protects code instead of data,
but sometimes I also have to do it.

---

A very rough draft example of paired mb()s is here:


diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index cf80f7e..84a93c0 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -12,6 +12,14 @@ struct percpu_rw_semaphore {
struct mutex mtx;
 };
 
+#if 1
+#define light_mb() barrier()
+#define heavy_mb() synchronize_sched()
+#else
+#define light_mb() smp_mb()
+#define heavy_mb() smp_mb();
+#endif
+
 static inline void percpu_down_read(struct percpu_rw_semaphore *p)
 {
rcu_read_lock();
@@ -24,22 +32,12 @@ static inline void percpu_down_read(struct 
percpu_rw_semaphore *p)
}
this_cpu_inc(*p-counters);
rcu_read_unlock();
+   light_mb(); /* A, between read of p-locked and read of data, paired 
with D */
 }
 
 static inline void percpu_up_read(struct percpu_rw_semaphore *p)
 {
-   /*
-* On X86, write operation in this_cpu_dec serves as a memory unlock
-* barrier (i.e. memory accesses may be moved before the write, but
-* no memory accesses are moved past the write).
-* On other architectures this may not be the case, so we need smp_mb()
-* there.
-*/
-#if defined(CONFIG_X86)  (!defined(CONFIG_X86_PPRO_FENCE)  
!defined(CONFIG_X86_OOSTORE))
-   barrier();
-#else
-   smp_mb();
-#endif
+   light_mb(); /* B, between read of the data and write to p-counter, 
paired with C */
this_cpu_dec(*p-counters);
 }
 
@@ -61,11 +59,12 @@ static inline void percpu_down_write(struct 
percpu_rw_semaphore *p)
synchronize_rcu();
while (__percpu_count(p-counters))
msleep(1);
-   smp_rmb(); /* paired with smp_mb() in percpu_sem_up_read() */
+   heavy_mb(); /* C, between read of p-counter and write to data, paired 
with B */
 }
 
 static inline void percpu_up_write(struct percpu_rw_semaphore *p)
 {
+   heavy_mb(); /* D, between write to data and write to p-locked, paired 
with A */
p-locked = false;
mutex_unlock(p-mtx);
 }

Re: [PATCH 0/8] workqueue: advance concurrency management

2013-04-18 Thread Lai Jiangshan
Ping.

On Mon, Apr 15, 2013 at 12:41 AM, Lai Jiangshan la...@cn.fujitsu.com wrote:
 I found the early-increasing nr_running in wq_worker_waking_up() is useless
 in many cases. it tries to avoid waking up idle workers for pending work item.
 but delay increasing nr_running does not increase waking up idle workers.

 so we delay increasing and remove wq_worker_waking_up() and ...

 enjoy a simpler concurrency management.

 Lai Jiangshan (8):
   workqueue: remove @cpu from wq_worker_sleeping()
   workqueue: use create_and_start_worker() in manage_workers()
   workqueue: remove cpu_intensive from process_one_work()
   workqueue: quit cm mode when sleeping
   workqueue: remove disabled wq_worker_waking_up()
   workqueue: make nr_running non-atomic
   workqueue: move worker-flags up
   workqueue: rename -nr_running to -nr_cm_workers

  kernel/sched/core.c |6 +-
  kernel/workqueue.c  |  234 +++---
  kernel/workqueue_internal.h |9 +-
  3 files changed, 89 insertions(+), 160 deletions(-)

 --
 1.7.7.6



Re: [PATCH 0/8] workqueue: advance concurrency management

2013-04-20 Thread Lai Jiangshan
On Sat, Apr 20, 2013 at 2:11 AM, Tejun Heo t...@kernel.org wrote:
 Hey,

 On Fri, Apr 19, 2013 at 06:10:57AM +0800, Lai Jiangshan wrote:
 Ping.

 Sorry, I've been at collab summit / lsf.  Plus, it's a bit too late
 for for-3.10 anyway.  Anyways, after glancing over it, here are my
 preliminary thoughts.  The first one looks good but I'm not sure about
 dropping nr_running adjustment.  The only real benefit coming from
 that is dropping a sched callback and if there's any performance /
 overhead impact, I'm afraid it's gonna be negative.  There are actual
 benefits in using as few tasks as possible -

The waking_up() callback doesn't win us much here.


 the cache footprint gets smaller,

The cache footprint is also reduced, in a different way, by this patchset,
and atomic memory operations are reduced.

 so unless there's a clear indication that the suggested

Only simplicity,
and the removal of an optimization that only matters in rare cases.

 behavior is better in some way, I'm not sure what we're buying with
 the proposed changes.

 Thanks.

 --
 tejun
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] workqueue: add __WQ_FREEZING and remove POOL_FREEZING

2013-04-20 Thread Lai Jiangshan
Please forget all my other patches.

But these 1/7 and 2/7 __WQ_FREEZING patches can still go into 3.10.

On Thu, Apr 4, 2013 at 10:12 PM, Tejun Heo t...@kernel.org wrote:
 Hello, Lai.

 On Thu, Apr 04, 2013 at 10:05:32AM +0800, Lai Jiangshan wrote:
 @@ -4757,25 +4747,16 @@ void thaw_workqueues(void)
  {
   struct workqueue_struct *wq;
   struct pool_workqueue *pwq;
 - struct worker_pool *pool;
 - int pi;

   mutex_lock(wq_pool_mutex);

   if (!workqueue_freezing)
   goto out_unlock;

 - /* clear FREEZING */
 - for_each_pool(pool, pi) {
 - spin_lock_irq(pool-lock);
 - WARN_ON_ONCE(!(pool-flags  POOL_FREEZING));
 - pool-flags = ~POOL_FREEZING;
 - spin_unlock_irq(pool-lock);
 - }
 -
   /* restore max_active and repopulate worklist */
   list_for_each_entry(wq, workqueues, list) {
   mutex_lock(wq-mutex);
 + wq-flags = ~__WQ_FREEZING;

 I want an assertion here.

The freezing code is very simple to verify.

 Maybe we can fold the next patch into this
 one and add WARN_ON_ONCE() here?

I consider the two patches to have different intents.

Thanks,
Lai


   for_each_pwq(pwq, wq)
   pwq_adjust_max_active(pwq);
   mutex_unlock(wq-mutex);

 Thanks.

 --
 tejun


[PATCH 3/7] workqueue: rename rebind_workers() to associate_cpu_pool()

2013-04-03 Thread Lai Jiangshan
Merge the code that clears POOL_DISASSOCIATED into rebind_workers(), and
rename rebind_workers() to associate_cpu_pool().

This merges highly related code together and simplifies
workqueue_cpu_up_callback().

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   21 ++---
 1 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 66a9d71..b4369de 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2270,7 +2270,7 @@ recheck:
 * worker or that someone else has already assumed the manager
 * role.  This is where @worker starts participating in concurrency
 * management if applicable and concurrency management is restored
-* after being rebound.  See rebind_workers() for details.
+* after being rebound.  See associate_cpu_pool() for details.
 */
worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
 
@@ -4431,12 +4431,13 @@ static void wq_unbind_fn(struct work_struct *work)
 }
 
 /**
- * rebind_workers - rebind all workers of a pool to the associated CPU
+ * associate_cpu_pool - rebind all workers of a pool to the associated CPU
  * @pool: pool of interest
  *
- * @pool-cpu is coming online.  Rebind all workers to the CPU.
+ * @pool-cpu is coming online.  Rebind all workers to the CPU and
+ * set the pool associated
  */
-static void rebind_workers(struct worker_pool *pool)
+static void associate_cpu_pool(struct worker_pool *pool)
 {
struct worker *worker;
int wi;
@@ -4451,8 +4452,9 @@ static void rebind_workers(struct worker_pool *pool)
 * from CPU_ONLINE, the following shouldn't fail.
 */
for_each_pool_worker(worker, wi, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker-task,
- pool-attrs-cpumask)  0);
+   if (WARN_ON_ONCE(set_cpus_allowed_ptr(worker-task,
+   pool-attrs-cpumask)  0))
+   return;
 
spin_lock_irq(pool-lock);
 
@@ -4491,6 +4493,7 @@ static void rebind_workers(struct worker_pool *pool)
ACCESS_ONCE(worker-flags) = worker_flags;
}
 
+   pool-flags = ~POOL_DISASSOCIATED;
spin_unlock_irq(pool-lock);
 }
 
@@ -4558,11 +4561,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct 
notifier_block *nfb,
mutex_lock(pool-manager_mutex);
 
if (pool-cpu == cpu) {
-   spin_lock_irq(pool-lock);
-   pool-flags = ~POOL_DISASSOCIATED;
-   spin_unlock_irq(pool-lock);
-
-   rebind_workers(pool);
+   associate_cpu_pool(pool);
} else if (pool-cpu  0) {
restore_unbound_workers_cpumask(pool, cpu);
}
-- 
1.7.7.6



[PATCH 2/7] workqueue: set __WQ_FREEZING only when freezable

2013-04-03 Thread Lai Jiangshan
Simplify pwq_adjust_max_active().
Make freeze_workqueues_begin() and thaw_workqueues() quickly skip
non-freezable wqs.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   13 ++---
 1 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e06a5b0..66a9d71 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3644,18 +3644,13 @@ static void pwq_unbound_release_workfn(struct 
work_struct *work)
 static void pwq_adjust_max_active(struct pool_workqueue *pwq)
 {
struct workqueue_struct *wq = pwq-wq;
-   bool freezable = wq-flags  WQ_FREEZABLE;
 
/* for @wq-saved_max_active and @wq-flags */
lockdep_assert_held(wq-mutex);
 
-   /* fast exit for non-freezable wqs */
-   if (!freezable  pwq-max_active == wq-saved_max_active)
-   return;
-
spin_lock_irq(pwq-pool-lock);
 
-   if (!freezable || !(wq-flags  __WQ_FREEZING)) {
+   if (!(wq-flags  __WQ_FREEZING)) {
pwq-max_active = wq-saved_max_active;
 
while (!list_empty(pwq-delayed_works) 
@@ -4151,7 +4146,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char 
*fmt,
mutex_lock(wq_pool_mutex);
 
mutex_lock(wq-mutex);
-   if (workqueue_freezing)
+   if ((wq-flags  WQ_FREEZABLE)  workqueue_freezing)
wq-flags |= __WQ_FREEZING;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
@@ -4677,6 +4672,8 @@ void freeze_workqueues_begin(void)
workqueue_freezing = true;
 
list_for_each_entry(wq, workqueues, list) {
+   if (!(wq-flags  WQ_FREEZABLE))
+   continue;
mutex_lock(wq-mutex);
WARN_ON_ONCE(wq-flags  __WQ_FREEZING);
wq-flags |= __WQ_FREEZING;
@@ -4755,6 +4752,8 @@ void thaw_workqueues(void)
 
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, workqueues, list) {
+   if (!(wq-flags  WQ_FREEZABLE))
+   continue;
mutex_lock(wq-mutex);
wq-flags = ~__WQ_FREEZING;
for_each_pwq(pwq, wq)
-- 
1.7.7.6



[PATCH 4/7] workqueue: simplify workqueue_cpu_up_callback(CPU_ONLINE)

2013-04-03 Thread Lai Jiangshan
If we have 4096 CPUs, workqueue_cpu_up_callback() will traverse too many
pools; to avoid that, we use for_each_cpu_worker_pool() for the per-cpu
pools and for_each_unbound_pool() for the unbound pools.

After this, for_each_pool() becomes unused, but we keep it for possible
future usage.
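
A usage sketch of the new iterator, mirroring the CPU_ONLINE path in the
patch below (locking shown for context):

	int bkt;
	struct worker_pool *pool;

	mutex_lock(&wq_pool_mutex);
	for_each_unbound_pool(pool, bkt)
		restore_unbound_workers_cpumask(pool, cpu);
	mutex_unlock(&wq_pool_mutex);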

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   53 ++-
 1 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b4369de..a383eaf 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -354,6 +354,23 @@ static void copy_workqueue_attrs(struct workqueue_attrs 
*to,
else
 
 /**
+ * for_each_unbound_pool - iterate through all unbound worker_pools in the 
system
+ * @pool: iteration cursor
+ * @bkt: bucket (of integer) used for iteration
+ *
+ * This must be called either with wq_pool_mutex held or sched RCU read
+ * locked.  If the pool needs to be used beyond the locking in effect, the
+ * caller is responsible for guaranteeing that the pool stays online.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_unbound_pool(pool, bkt)   \
+   hash_for_each(unbound_pool_hash, bkt, pool, hash_node)  \
+   if (({ assert_rcu_or_pool_mutex(); false; })) { }   \
+   else
+
+/**
  * for_each_pool_worker - iterate through all workers of a worker_pool
  * @worker: iteration cursor
  * @wi: integer used for iteration
@@ -4442,7 +4459,7 @@ static void associate_cpu_pool(struct worker_pool *pool)
struct worker *worker;
int wi;
 
-   lockdep_assert_held(pool-manager_mutex);
+   mutex_lock(pool-manager_mutex);
 
/*
 * Restore CPU affinity of all workers.  As all idle workers should
@@ -4454,7 +4471,7 @@ static void associate_cpu_pool(struct worker_pool *pool)
for_each_pool_worker(worker, wi, pool)
if (WARN_ON_ONCE(set_cpus_allowed_ptr(worker-task,
pool-attrs-cpumask)  0))
-   return;
+   goto out_unlock;
 
spin_lock_irq(pool-lock);
 
@@ -4495,6 +4512,9 @@ static void associate_cpu_pool(struct worker_pool *pool)
 
pool-flags = ~POOL_DISASSOCIATED;
spin_unlock_irq(pool-lock);
+
+out_unlock:
+   mutex_unlock(pool-manager_mutex);
 }
 
 /**
@@ -4509,25 +4529,28 @@ static void associate_cpu_pool(struct worker_pool *pool)
  */
 static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 {
-   static cpumask_t cpumask;
+   static cpumask_t cpumask; /* protected by wq_pool_mutex */
struct worker *worker;
int wi;
 
-   lockdep_assert_held(pool-manager_mutex);
+   mutex_lock(pool-manager_mutex);
 
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool-attrs-cpumask))
-   return;
+   goto out_unlock;
 
/* is @cpu the only online CPU? */
cpumask_and(cpumask, pool-attrs-cpumask, cpu_online_mask);
if (cpumask_weight(cpumask) != 1)
-   return;
+   goto out_unlock;
 
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, wi, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker-task,
  pool-attrs-cpumask)  0);
+
+out_unlock:
+   mutex_unlock(pool-manager_mutex);
 }
 
 /*
@@ -4541,7 +4564,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct 
notifier_block *nfb,
int cpu = (unsigned long)hcpu;
struct worker_pool *pool;
struct workqueue_struct *wq;
-   int pi;
+   int bkt;
 
switch (action  ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
@@ -4555,19 +4578,13 @@ static int __cpuinit workqueue_cpu_up_callback(struct 
notifier_block *nfb,
 
case CPU_DOWN_FAILED:
case CPU_ONLINE:
-   mutex_lock(wq_pool_mutex);
+   for_each_cpu_worker_pool(pool, cpu)
+   associate_cpu_pool(pool);
 
-   for_each_pool(pool, pi) {
-   mutex_lock(pool-manager_mutex);
-
-   if (pool-cpu == cpu) {
-   associate_cpu_pool(pool);
-   } else if (pool-cpu  0) {
-   restore_unbound_workers_cpumask(pool, cpu);
-   }
+   mutex_lock(wq_pool_mutex);
 
-   mutex_unlock(pool-manager_mutex);
-   }
+   for_each_unbound_pool(pool, bkt)
+   restore_unbound_workers_cpumask(pool, cpu);
 
/* update NUMA affinity of unbound workqueues */
list_for_each_entry(wq, workqueues, list)
-- 
1.7.7.6
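
A note on the "if/else clause exists only for the lockdep assertion" remark in
the for_each_unbound_pool() definition above: the statement expression runs the
assertion and evaluates to false, so the empty if-body is never taken and the
statement supplied by the caller attaches to the trailing else. A minimal
sketch of the same idiom with hypothetical names (struct widget, widget_list,
widget_mutex):

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/mutex.h>

struct widget {
        struct list_head node;
        /* ... */
};

static LIST_HEAD(widget_list);
static DEFINE_MUTEX(widget_mutex);

/* assert the lock on every iteration without consuming the loop body */
#define for_each_widget(w)                                                \
        list_for_each_entry(w, &widget_list, node)                        \
                if (({ lockdep_assert_held(&widget_mutex); false; })) { } \
                else

static void print_all_widgets(void)
{
        struct widget *w;

        mutex_lock(&widget_mutex);
        for_each_widget(w)
                pr_info("widget %p\n", w);      /* this is the 'else' body */
        mutex_unlock(&widget_mutex);
}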


[PATCH 1/7] workqueue: add __WQ_FREEZING and remove POOL_FREEZING

2013-04-03 Thread Lai Jiangshan
Freezing is not related to pools, but POOL_FREEZING ties the two together
and makes freeze_workqueues_begin() and thaw_workqueues() more complicated
than necessary.

Since freezing is an attribute of the workqueue instance, introduce
__WQ_FREEZING in wq->flags instead and remove POOL_FREEZING.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 include/linux/workqueue.h |1 +
 kernel/workqueue.c|   33 +++--
 2 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 7179756..672b51e 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -300,6 +300,7 @@ enum {
WQ_CPU_INTENSIVE= 1  5, /* cpu instensive workqueue */
WQ_SYSFS= 1  6, /* visible in sysfs, see 
wq_sysfs_register() */
 
+   __WQ_FREEZING   = 1  15, /* internal: workqueue is freezing */
__WQ_DRAINING   = 1  16, /* internal: workqueue is draining */
__WQ_ORDERED= 1  17, /* internal: workqueue is ordered */
 
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dd2a4c4..e06a5b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,7 +68,6 @@ enum {
 */
POOL_MANAGE_WORKERS = 1  0,   /* need to manage workers */
POOL_DISASSOCIATED  = 1  2,   /* cpu can't serve workers */
-   POOL_FREEZING   = 1  3,   /* freeze in progress */
 
/* worker flags */
WORKER_STARTED  = 1  0,   /* started */
@@ -3556,9 +3555,6 @@ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs)
if (!pool || init_worker_pool(pool)  0)
goto fail;
 
-   if (workqueue_freezing)
-   pool-flags |= POOL_FREEZING;
-
lockdep_set_subclass(pool-lock, 1);   /* see put_pwq() */
copy_workqueue_attrs(pool-attrs, attrs);
 
@@ -3650,7 +3646,7 @@ static void pwq_adjust_max_active(struct pool_workqueue 
*pwq)
struct workqueue_struct *wq = pwq-wq;
bool freezable = wq-flags  WQ_FREEZABLE;
 
-   /* for @wq-saved_max_active */
+   /* for @wq-saved_max_active and @wq-flags */
lockdep_assert_held(wq-mutex);
 
/* fast exit for non-freezable wqs */
@@ -3659,7 +3655,7 @@ static void pwq_adjust_max_active(struct pool_workqueue 
*pwq)
 
spin_lock_irq(pwq-pool-lock);
 
-   if (!freezable || !(pwq-pool-flags  POOL_FREEZING)) {
+   if (!freezable || !(wq-flags  __WQ_FREEZING)) {
pwq-max_active = wq-saved_max_active;
 
while (!list_empty(pwq-delayed_works) 
@@ -4155,6 +4151,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char 
*fmt,
mutex_lock(wq_pool_mutex);
 
mutex_lock(wq-mutex);
+   if (workqueue_freezing)
+   wq-flags |= __WQ_FREEZING;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(wq-mutex);
@@ -4670,26 +4668,18 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
  */
 void freeze_workqueues_begin(void)
 {
-   struct worker_pool *pool;
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
-   int pi;
 
mutex_lock(wq_pool_mutex);
 
WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
 
-   /* set FREEZING */
-   for_each_pool(pool, pi) {
-   spin_lock_irq(pool-lock);
-   WARN_ON_ONCE(pool-flags  POOL_FREEZING);
-   pool-flags |= POOL_FREEZING;
-   spin_unlock_irq(pool-lock);
-   }
-
list_for_each_entry(wq, workqueues, list) {
mutex_lock(wq-mutex);
+   WARN_ON_ONCE(wq-flags  __WQ_FREEZING);
+   wq-flags |= __WQ_FREEZING;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(wq-mutex);
@@ -4757,25 +4747,16 @@ void thaw_workqueues(void)
 {
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
-   struct worker_pool *pool;
-   int pi;
 
mutex_lock(wq_pool_mutex);
 
if (!workqueue_freezing)
goto out_unlock;
 
-   /* clear FREEZING */
-   for_each_pool(pool, pi) {
-   spin_lock_irq(pool-lock);
-   WARN_ON_ONCE(!(pool-flags  POOL_FREEZING));
-   pool-flags = ~POOL_FREEZING;
-   spin_unlock_irq(pool-lock);
-   }
-
/* restore max_active and repopulate worklist */
list_for_each_entry(wq, workqueues, list) {
mutex_lock(wq-mutex);
+   wq-flags = ~__WQ_FREEZING;
for_each_pwq(pwq, wq)
pwq_adjust_max_active(pwq);
mutex_unlock(wq-mutex);
-- 
1.7.7.6
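
For context, __WQ_FREEZING is internal state; the caller-visible knob is
WQ_FREEZABLE, which marks a workqueue whose work items must not run while the
system is frozen for suspend. A minimal sketch of a freezable-workqueue user,
with hypothetical names (foo_wq, foo_init):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *foo_wq;

static int __init foo_init(void)
{
        /* during freeze_workqueues_begin(), this wq stops starting new items */
        foo_wq = alloc_workqueue("foo_wq", WQ_FREEZABLE, 0);
        if (!foo_wq)
                return -ENOMEM;
        return 0;
}
module_init(foo_init);

static void __exit foo_exit(void)
{
        destroy_workqueue(foo_wq);
}
module_exit(foo_exit);

With this series, only such WQ_FREEZABLE workqueues are visited by
freeze_workqueues_begin() and thaw_workqueues().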


[PATCH 7/7] workqueue: avoid false negative WARN_ON()

2013-04-03 Thread Lai Jiangshan
It is very common that wq->dfl_pwq->refcnt > 1, since a workqueue may hold
multiple references to its default pwq, so the WARN_ON(pwq->refcnt > 1) check
in destroy_workqueue() triggers spuriously:

[7.939873] WARNING: at kernel/workqueue.c:4201 
destroy_workqueue+0x6a/0x13e()
[7.943601] Hardware name: 4286C12
[7.947250] Modules linked in: sdhci_pci sdhci mmc_core usb_storage i915 
drm_kms_helper drm i2c_algo_bit i2c_core video
[7.951313] Pid: 361, comm: umount Not tainted 3.9.0-rc5+ #29
[7.955309] Call Trace:
[7.959346]  [c04314a7] warn_slowpath_common+0x7c/0x93
[7.963506]  [c044796a] ? destroy_workqueue+0x6a/0x13e
[7.967748]  [c044796a] ? destroy_workqueue+0x6a/0x13e
[7.971981]  [c04314e0] warn_slowpath_null+0x22/0x24
[7.976383]  [c044796a] destroy_workqueue+0x6a/0x13e
[7.980875]  [c056dc01] ext4_put_super+0x43/0x2c4
[7.985407]  [c050bd48] ? dispose_list+0x28/0x32
[7.989987]  [c050c652] ? evict_inodes+0xcf/0xd7
[7.994509]  [c04fb7b8] generic_shutdown_super+0x4b/0xb9
[7.999130]  [c04fb848] kill_block_super+0x22/0x60
[8.003594]  [c04fb960] deactivate_locked_super+0x2f/0x56
[8.008077]  [c04fc41b] deactivate_super+0x2e/0x31
[8.012523]  [c050f1e6] mntput_no_expire+0x103/0x108
[8.017050]  [c050fdce] sys_umount+0x2a2/0x2c4
[8.021429]  [c050fe0e] sys_oldumount+0x1e/0x20
[8.025678]  [c085ba4d] sysenter_do_call+0x12/0x38

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3f33077..f015c38 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4198,7 +4198,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
}
}
 
-   if (WARN_ON(pwq-refcnt  1) ||
+   if (WARN_ON((pwq != wq-dfl_pwq)  (pwq-refcnt  1)) ||
WARN_ON(pwq-nr_active) ||
WARN_ON(!list_empty(pwq-delayed_works))) {
mutex_unlock(wq-mutex);
-- 
1.7.7.6



[PATCH 6/7] workqueue: node-aware allocation for unbound pool

2013-04-03 Thread Lai Jiangshan
Calculate the node of the pool earlier, and allocate the pool from that node.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   29 +++--
 1 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 737646d..3f33077 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -539,7 +539,7 @@ static int worker_pool_assign_id(struct worker_pool *pool)
  * @wq: the target workqueue
  * @node: the node ID
  *
- * This must be called either with pwq_lock held or sched RCU read locked.
+ * This must be called either with wq-mutex held or sched RCU read locked.
  * If the pwq needs to be used beyond the locking in effect, the caller is
  * responsible for guaranteeing that the pwq stays online.
  */
@@ -3555,7 +3555,7 @@ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs)
 {
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
-   int node;
+   int pool_node = NUMA_NO_NODE, node;
 
lockdep_assert_held(wq_pool_mutex);
 
@@ -3563,29 +3563,30 @@ static struct worker_pool *get_unbound_pool(const 
struct workqueue_attrs *attrs)
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
if (wqattrs_equal(pool-attrs, attrs)) {
pool-refcnt++;
-   goto out_unlock;
+   goto out_pool;
}
}
 
-   /* nope, create a new one */
-   pool = kzalloc(sizeof(*pool), GFP_KERNEL);
-   if (!pool || init_worker_pool(pool)  0)
-   goto fail;
-
-   lockdep_set_subclass(pool-lock, 1);   /* see put_pwq() */
-   copy_workqueue_attrs(pool-attrs, attrs);
-
/* if cpumask is contained inside a NUMA node, we belong to that node */
if (wq_numa_enabled) {
for_each_node(node) {
-   if (cpumask_subset(pool-attrs-cpumask,
+   if (cpumask_subset(attrs-cpumask,
   wq_numa_possible_cpumask[node])) {
-   pool-node = node;
+   pool_node = node;
break;
}
}
}
 
+   /* create a new one */
+   pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, pool_node);
+   if (!pool || init_worker_pool(pool)  0)
+   goto fail;
+
+   lockdep_set_subclass(pool-lock, 1);   /* see put_pwq() */
+   copy_workqueue_attrs(pool-attrs, attrs);
+   pool-node = pool_node;
+
if (worker_pool_assign_id(pool)  0)
goto fail;
 
@@ -3595,7 +3596,7 @@ static struct worker_pool *get_unbound_pool(const struct 
workqueue_attrs *attrs)
 
/* install */
hash_add(unbound_pool_hash, pool-hash_node, hash);
-out_unlock:
+out_pool:
return pool;
 fail:
if (pool)
-- 
1.7.7.6



[PATCH 5/7] workqueue: use default pwq when failing to allocate a node pwq

2013-04-03 Thread Lai Jiangshan
When we fail to allocate a node's pwq, we can fall back to the default pwq
for that node.

Thus we avoid failing after the default pwq has already been allocated, and
we can remove some of the failure-path code.

Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
---
 kernel/workqueue.c |   28 +++-
 1 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a383eaf..737646d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3751,17 +3751,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct 
workqueue_struct *wq,
return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-   lockdep_assert_held(wq_pool_mutex);
-
-   if (pwq) {
-   put_unbound_pool(pwq-pool);
-   kfree(pwq);
-   }
-}
-
 /**
  * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of interest
@@ -3891,12 +3880,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
for_each_node(node) {
if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs-cpumask)) {
pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-   if (!pwq_tbl[node])
-   goto enomem_pwq;
-   } else {
-   dfl_pwq-refcnt++;
-   pwq_tbl[node] = dfl_pwq;
+   if (pwq_tbl[node])
+   continue;
+   /* fallback to dfl_pwq if the allocation failed */
}
+   dfl_pwq-refcnt++;
+   pwq_tbl[node] = dfl_pwq;
}
 
mutex_unlock(wq_pool_mutex);
@@ -3931,10 +3920,6 @@ out_free:
return ret;
 
 enomem_pwq:
-   free_unbound_pwq(dfl_pwq);
-   for_each_node(node)
-   if (pwq_tbl  pwq_tbl[node] != dfl_pwq)
-   free_unbound_pwq(pwq_tbl[node]);
mutex_unlock(wq_pool_mutex);
put_online_cpus();
 enomem:
@@ -4017,7 +4002,8 @@ static void wq_update_unbound_numa(struct 
workqueue_struct *wq, int cpu,
if (!pwq) {
pr_warning(workqueue: allocation failed while updating NUMA 
affinity of \%s\\n,
   wq-name);
-   goto out_unlock;
+   mutex_lock(wq-mutex);
+   goto use_dfl_pwq;
}
 
/*
-- 
1.7.7.6



Re: Word-at-a-time dcache name accesses (was Re: .. anybody know of any filesystems that depend on the exact VFS 'namehash' implementation?)

2013-04-04 Thread Lai Jiangshan
[resend in plain text mode (I did not notice that gmail had changed the
default mode, sorry)]

On Fri, Apr 5, 2013 at 12:17 AM, Lai Jiangshan la...@cn.fujitsu.com wrote:
 Hi, ALL

 I also encountered the same problem.

 git bisect:

 14134f6584212d585b310ce95428014b653dfaf6 is the first bad commit
 commit 14134f6584212d585b310ce95428014b653dfaf6
 Author: dingtianhong dingtianh...@huawei.com
 Date:   Mon Mar 25 17:02:04 2013 +

 af_unix: dont send SCM_CREDENTIAL when dest socket is NULL

 SCM_SCREDENTIALS should apply to write() syscalls only either source or
 destination
 socket asserted SOCK_PASSCRED. The original implememtation in
 maybe_add_creds is wrong,
 and breaks several LSB testcases ( i.e.
 /tset/LSB.os/netowkr/recvfrom/T.recvfrom).

 Origionally-authored-by: Karel Srot ks...@redhat.com
 Signed-off-by: Ding Tianhong dingtianh...@huawei.com
 Acked-by: Eric Dumazet eduma...@google.com
 Signed-off-by: David S. Miller da...@davemloft.net

 :04 04 ef0356cc0fc168a39c0f94cff0ba27c46c4d0048
 ae34e59f235c379f04d6145f0103cccd5b3a307a M net

 ===
 Like Brian Gerst, no obvious bug, but the system can't boot, service udev
 start fails when boot
 (also DEBUG_PAGEALLOC=n, I did not try to test with it=y)

 [   11.022976] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   11.023293] systemd[1]: Unit udev-control.socket entered failed state.
 [   11.182478] systemd-readahead-replay[399]: Bumped block_nr parameter of
 8:16 to 16384. This is a temporary hack and should be removed one day.
 [   14.473283] udevd[410]: bind failed: Address already in use
 [   14.478630] udevd[410]: error binding udev control socket
 [   15.201158] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   16.900792] udevd[427]: error binding udev control socket
 [   18.356484] EXT4-fs (sdb7): re-mounted. Opts: (null)
 [   19.738401] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   19.742494] systemd[1]: Job pending for unit, delaying automatic restart.
 [   19.747764] systemd[1]: Unit udev.service entered failed state.
 [   19.752303] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   19.770723] udevd[459]: bind failed: Address already in use
 [   19.771027] udevd[459]: error binding udev control socket
 [   19.771175] udevd[459]: error binding udev control socket
 [   19.813256] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   19.914450] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   19.918374] systemd[1]: Job pending for unit, delaying automatic restart.
 [   19.923392] systemd[1]: Unit udev.service entered failed state.
 [   19.923808] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   19.943792] udevd[465]: bind failed: Address already in use
 [   19.944056] udevd[465]: error binding udev control socket
 [   19.944210] udevd[465]: error binding udev control socket
 [   19.946071] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   20.047524] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   20.051939] systemd[1]: Job pending for unit, delaying automatic restart.
 [   20.057539] systemd[1]: Unit udev.service entered failed state.
 [   20.058069] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   20.081141] udevd[467]: bind failed: Address already in use
 [   20.087120] udevd[467]: error binding udev control socket
 [   20.092040] udevd[467]: error binding udev control socket
 [   20.096519] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   20.184910] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   20.189863] systemd[1]: Job pending for unit, delaying automatic restart.
 [   20.195440] systemd[1]: Unit udev.service entered failed state.
 [   20.196012] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   20.220543] udevd[469]: bind failed: Address already in use
 [   20.220584] udevd[469]: error binding udev control socket
 [   20.220780] udevd[469]: error binding udev control socket
 [   20.222830] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   20.323906] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   20.329170] systemd[1]: Job pending for unit, delaying automatic restart.
 [   20.334785] systemd[1]: Unit udev.service entered failed state.
 [   20.335318] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   20.360255] udevd[471]: bind failed: Address already in use
 [   20.360294] udevd[471]: error binding udev control socket
 [   20.360401] udevd[471]: error binding udev control socket
 [   20.362359] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   20.463651] systemd[1]: udev.service holdoff time over

Re: [PATCH 1/3] kernel/srcu: merge common code into a macro

2013-04-05 Thread Lai Jiangshan

On 03/19/2013 10:16 PM, Sebastian Andrzej Siewior wrote:
 DEFINE_SRCU() and DEFINE_STATIC_SRCU() does the same thing except for
 the static attribute. This patch moves the common pieces into
 _DEFINE_SRCU() which is used by the the former macros either adding the
 static attribute or not.
 
 Signed-off-by: Sebastian Andrzej Siewior bige...@linutronix.de
 ---
  include/linux/srcu.h |   10 +-
  1 file changed, 5 insertions(+), 5 deletions(-)

Hi, Sebastian

The patch hurts readability.
The original code is simple enough; merging it into one macro
gives us no benefit.

Thanks
Lai.


 
 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
 index 6eb691b..d04acb8 100644
 --- a/include/linux/srcu.h
 +++ b/include/linux/srcu.h
 @@ -102,13 +102,13 @@ void process_srcu(struct work_struct *work);
   * define and init a srcu struct at build time.
   * dont't call init_srcu_struct() nor cleanup_srcu_struct() on it.
   */
 -#define DEFINE_SRCU(name)\
 +#define _DEFINE_SRCU(name, mod)  
 \
   static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
 - struct srcu_struct name = __SRCU_STRUCT_INIT(name);
 + mod struct srcu_struct name =   \
 + __SRCU_STRUCT_INIT(name);
  
 -#define DEFINE_STATIC_SRCU(name) \
 - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
 - static struct srcu_struct name = __SRCU_STRUCT_INIT(name);
 +#define DEFINE_SRCU(name)_DEFINE_SRCU(name, )
 +#define DEFINE_STATIC_SRCU(name) _DEFINE_SRCU(name, static)
  
  /**
   * call_srcu() - Queue a callback for invocation after an SRCU grace period



Re: [PATCH 2/3] kernel/SRCU: provide a static initializer

2013-04-05 Thread Lai Jiangshan


On 03/19/2013 10:16 PM, Sebastian Andrzej Siewior wrote:
 There are macros for static initializer for the three out of four
 possible notifier types, that are:
   ATOMIC_NOTIFIER_HEAD()
   BLOCKING_NOTIFIER_HEAD()
   RAW_NOTIFIER_HEAD()
 
 This patch provides a static initilizer for the forth type to make it
 complete.
 
 Signed-off-by: Sebastian Andrzej Siewior bige...@linutronix.de
 ---
  include/linux/notifier.h |   26 +-
  include/linux/srcu.h |6 +++---
  2 files changed, 24 insertions(+), 8 deletions(-)
 
 diff --git a/include/linux/notifier.h b/include/linux/notifier.h
 index d65746e..6bfd703 100644
 --- a/include/linux/notifier.h
 +++ b/include/linux/notifier.h
 @@ -42,9 +42,7 @@
   * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
   * As compensation, srcu_notifier_chain_unregister() is rather expensive.
   * SRCU notifier chains should be used when the chain will be called very
 - * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
 - * chains are slightly more difficult to use because they require special
 - * runtime initialization.
 + * often but notifier_blocks will seldom be removed.
   */
  
  struct notifier_block {
 @@ -85,7 +83,7 @@ struct srcu_notifier_head {
   (name)-head = NULL;\
   } while (0)
  
 -/* srcu_notifier_heads must be initialized and cleaned up dynamically */
 +/* srcu_notifier_heads must be cleaned up dynamically */
  extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
  #define srcu_cleanup_notifier_head(name) \
   cleanup_srcu_struct((name)-srcu);
 @@ -98,7 +96,13 @@ extern void srcu_init_notifier_head(struct 
 srcu_notifier_head *nh);
   .head = NULL }
  #define RAW_NOTIFIER_INIT(name)  {   \
   .head = NULL }
 -/* srcu_notifier_heads cannot be initialized statically */
 +
 +#define SRCU_NOTIFIER_INIT(name, pcpu)   \
 + {   \
 + .mutex = __MUTEX_INITIALIZER(name.mutex),   \
 + .head = NULL,   \
 + .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu),\
 + }


Hi, Sebastian

I don't want to expose __SRCU_STRUCT_INIT(), because it is strongly coupled
with the per-cpu array.

I hope other structures that use SRCU will use init_srcu_struct().

Thanks,
Lai
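
A minimal sketch of the run-time initialization pattern referred to above,
with hypothetical names (struct my_subsys, my_subsys_init); init_srcu_struct()
allocates the per-cpu array internally, so nothing SRCU-internal needs to be
exposed:

#include <linux/init.h>
#include <linux/srcu.h>

struct my_subsys {
        struct srcu_struct srcu;
        /* ... */
};

static struct my_subsys subsys;

static int __init my_subsys_init(void)
{
        /* allocates the per-cpu array; pair with cleanup_srcu_struct() */
        return init_srcu_struct(&subsys.srcu);
}
core_initcall(my_subsys_init);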

  
  #define ATOMIC_NOTIFIER_HEAD(name)   \
   struct atomic_notifier_head name =  \
 @@ -110,6 +114,18 @@ extern void srcu_init_notifier_head(struct 
 srcu_notifier_head *nh);
   struct raw_notifier_head name = \
   RAW_NOTIFIER_INIT(name)
  
 +#define _SRCU_NOTIFIER_HEAD(name, mod)   \
 + static DEFINE_PER_CPU(struct srcu_struct_array, \
 + name##_head_srcu_array);\
 + mod struct srcu_notifier_head name =\
 + SRCU_NOTIFIER_INIT(name, name##_head_srcu_array)
 +
 +#define SRCU_NOTIFIER_HEAD(name) \
 + _SRCU_NOTIFIER_HEAD(name, )
 +
 +#define SRCU_NOTIFIER_HEAD_STATIC(name)  \
 + _SRCU_NOTIFIER_HEAD(name, static)
 +
  #ifdef __KERNEL__
  
  extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
 diff --git a/include/linux/srcu.h b/include/linux/srcu.h
 index d04acb8..fe9efd4 100644
 --- a/include/linux/srcu.h
 +++ b/include/linux/srcu.h
 @@ -84,10 +84,10 @@ int init_srcu_struct(struct srcu_struct *sp);
  
  void process_srcu(struct work_struct *work);
  
 -#define __SRCU_STRUCT_INIT(name) \
 +#define __SRCU_STRUCT_INIT(name, pcpu_name)  \
   {   \
   .completed = -300,  \
 - .per_cpu_ref = name##_srcu_array,  \
 + .per_cpu_ref = pcpu_name,  \
   .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock),\
   .running = false,   \
   .batch_queue = RCU_BATCH_INIT(name.batch_queue),\
 @@ -105,7 +105,7 @@ void process_srcu(struct work_struct *work);
  #define _DEFINE_SRCU(name, mod)  
 \
   static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\
   mod struct srcu_struct name =   \
 - __SRCU_STRUCT_INIT(name);
 + __SRCU_STRUCT_INIT(name, name##_srcu_array);
  
  #define DEFINE_SRCU(name)_DEFINE_SRCU(name, )
  #define DEFINE_STATIC_SRCU(name) _DEFINE_SRCU(name, static)


Re: [PATCH 7/9] fsnotify: use existed call_srcu()

2013-04-05 Thread Lai Jiangshan
[Ping]

Hi, Eric Paris

Could you review this patch?

Thanks,
Lai

On 03/16/2013 12:50 AM, Lai Jiangshan wrote:
 fsnotify implements its own call_srcu() by:
   dedicated thread + synchronize_srcu()
 
 But srcu provides call_srcu() now, so we should convert them to use
 existed call_srcu() and remove the thread.
 
 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 Cc: Eric Paris epa...@parisplace.org
 ---
  fs/notify/mark.c |   59 ++---
  include/linux/fsnotify_backend.h |2 +-
  2 files changed, 11 insertions(+), 50 deletions(-)
 
 diff --git a/fs/notify/mark.c b/fs/notify/mark.c
 index aeededc..af5f0e1 100644
 --- a/fs/notify/mark.c
 +++ b/fs/notify/mark.c
 @@ -98,9 +98,6 @@
  #include fsnotify.h
  
  DEFINE_SRCU(fsnotify_mark_srcu);
 -static DEFINE_SPINLOCK(destroy_lock);
 -static LIST_HEAD(destroy_list);
 -static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
  
  void fsnotify_get_mark(struct fsnotify_mark *mark)
  {
 @@ -116,6 +113,14 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
   }
  }
  
 +static void fsnotify_destroy_mark_rcu(struct rcu_head *rcu)
 +{
 + struct fsnotify_mark *mark;
 +
 + mark = container_of(rcu, struct fsnotify_mark, rcu);
 + fsnotify_put_mark(mark);
 +}
 +
  /*
   * Any time a mark is getting freed we end up here.
   * The caller had better be holding a reference to this mark so we don't 
 actually
 @@ -155,10 +160,7 @@ void fsnotify_destroy_mark_locked(struct fsnotify_mark 
 *mark,
   /* release lock temporarily */
   mutex_unlock(group-mark_mutex);
  
 - spin_lock(destroy_lock);
 - list_add(mark-destroy_list, destroy_list);
 - spin_unlock(destroy_lock);
 - wake_up(destroy_waitq);
 + call_srcu(fsnotify_mark_srcu, mark-rcu, fsnotify_destroy_mark_rcu);
   /*
* We don't necessarily have a ref on mark from caller so the above 
 destroy
* may have actually freed it, unless this group provides a 
 'freeing_mark'
 @@ -273,11 +275,7 @@ err:
   atomic_dec(group-num_marks);
  
   spin_unlock(mark-lock);
 -
 - spin_lock(destroy_lock);
 - list_add(mark-destroy_list, destroy_list);
 - spin_unlock(destroy_lock);
 - wake_up(destroy_waitq);
 + call_srcu(fsnotify_mark_srcu, mark-rcu, fsnotify_destroy_mark_rcu);
  
   return ret;
  }
 @@ -342,40 +340,3 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
   atomic_set(mark-refcnt, 1);
   mark-free_mark = free_mark;
  }
 -
 -static int fsnotify_mark_destroy(void *ignored)
 -{
 - struct fsnotify_mark *mark, *next;
 - LIST_HEAD(private_destroy_list);
 -
 - for (;;) {
 - spin_lock(destroy_lock);
 - /* exchange the list head */
 - list_replace_init(destroy_list, private_destroy_list);
 - spin_unlock(destroy_lock);
 -
 - synchronize_srcu(fsnotify_mark_srcu);
 -
 - list_for_each_entry_safe(mark, next, private_destroy_list, 
 destroy_list) {
 - list_del_init(mark-destroy_list);
 - fsnotify_put_mark(mark);
 - }
 -
 - wait_event_interruptible(destroy_waitq, 
 !list_empty(destroy_list));
 - }
 -
 - return 0;
 -}
 -
 -static int __init fsnotify_mark_init(void)
 -{
 - struct task_struct *thread;
 -
 - thread = kthread_run(fsnotify_mark_destroy, NULL,
 -  fsnotify_mark);
 - if (IS_ERR(thread))
 - panic(unable to start fsnotify mark destruction thread.);
 -
 - return 0;
 -}
 -device_initcall(fsnotify_mark_init);
 diff --git a/include/linux/fsnotify_backend.h 
 b/include/linux/fsnotify_backend.h
 index d5b0910..3d435eb 100644
 --- a/include/linux/fsnotify_backend.h
 +++ b/include/linux/fsnotify_backend.h
 @@ -296,7 +296,7 @@ struct fsnotify_mark {
  #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY   0x08
  #define FSNOTIFY_MARK_FLAG_ALIVE 0x10
   unsigned int flags; /* vfsmount or inode mark? */
 - struct list_head destroy_list;
 + struct rcu_head rcu;
   void (*free_mark)(struct fsnotify_mark *mark); /* called on final 
 put+free */
  };
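
For readers unfamiliar with call_srcu(): the conversion above replaces the
dedicated destruction thread with the generic SRCU callback API. A minimal
sketch of the pattern with hypothetical names (my_srcu, struct my_obj):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_SRCU(my_srcu);

struct my_obj {
        struct rcu_head rcu;
        /* ... payload ... */
};

static void my_obj_free_rcu(struct rcu_head *rcu)
{
        struct my_obj *obj = container_of(rcu, struct my_obj, rcu);

        kfree(obj);
}

static void my_obj_release(struct my_obj *obj)
{
        /* freed only after all current srcu_read_lock(&my_srcu) readers finish */
        call_srcu(&my_srcu, &obj->rcu, my_obj_free_rcu);
}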
  



Re: [PATCH] workqueue: avoid false negative WARN_ON() in destroy_workqueue()

2013-04-05 Thread Lai Jiangshan
On 04/04/2013 10:55 PM, Tejun Heo wrote:
From 5c529597e922c26910fe49b8d5f93aeaca9a2415 Mon Sep 17 00:00:00 2001
 From: Lai Jiangshan la...@cn.fujitsu.com
 Date: Thu, 4 Apr 2013 10:05:38 +0800
 
 destroy_workqueue() performs several sanity checks before proceeding
 with destruction of a workqueue.  One of the checks verifies that
 refcnt of each pwq (pool_workqueue) is over 1 as at that point there
 should be no in-flight work items and the only holder of pwq refs is
 the workqueue itself.
 
 This worked fine as a workqueue used to hold only one reference to its
 pwqs; however, since 4c16bd327c (workqueue: implement NUMA affinity
 for unbound workqueues), a workqueue may hold multiple references to
 its default pwq triggering this sanity check spuriously.
 
 Fix it by not triggering the pwq-refcnt assertion on default pwqs.
 
 An example spurious WARN trigger follows.
 
  WARNING: at kernel/workqueue.c:4201 destroy_workqueue+0x6a/0x13e()
  Hardware name: 4286C12
  Modules linked in: sdhci_pci sdhci mmc_core usb_storage i915 drm_kms_helper 
 drm i2c_algo_bit i2c_core video
  Pid: 361, comm: umount Not tainted 3.9.0-rc5+ #29
  Call Trace:
   [c04314a7] warn_slowpath_common+0x7c/0x93
   [c04314e0] warn_slowpath_null+0x22/0x24
   [c044796a] destroy_workqueue+0x6a/0x13e
   [c056dc01] ext4_put_super+0x43/0x2c4
   [c04fb7b8] generic_shutdown_super+0x4b/0xb9
   [c04fb848] kill_block_super+0x22/0x60
   [c04fb960] deactivate_locked_super+0x2f/0x56
   [c04fc41b] deactivate_super+0x2e/0x31
   [c050f1e6] mntput_no_expire+0x103/0x108
   [c050fdce] sys_umount+0x2a2/0x2c4
   [c050fe0e] sys_oldumount+0x1e/0x20
   [c085ba4d] sysenter_do_call+0x12/0x38
 
 tj: Rewrote description.
 
 Signed-off-by: Lai Jiangshan la...@cn.fujitsu.com
 Signed-off-by: Tejun Heo t...@kernel.org
 Reported-by: Fengguang Wu fengguang...@intel.com

Hi, Wu

Could you also send workqueue regression reports to me?

Thanks,
Lai

 ---
 Applied to wq/for-3.10.
 
 Thanks.
 
  kernel/workqueue.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)
 
 diff --git a/kernel/workqueue.c b/kernel/workqueue.c
 index dd2a4c4..c273376 100644
 --- a/kernel/workqueue.c
 +++ b/kernel/workqueue.c
 @@ -4201,7 +4201,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
   }
   }
  
 - if (WARN_ON(pwq-refcnt  1) ||
 + if (WARN_ON((pwq != wq-dfl_pwq)  (pwq-refcnt  1)) ||
   WARN_ON(pwq-nr_active) ||
   WARN_ON(!list_empty(pwq-delayed_works))) {
   mutex_unlock(wq-mutex);



Re: af_unix udev startup regression

2013-04-07 Thread Lai Jiangshan
On 04/05/2013 02:03 AM, Linus Torvalds wrote:
 [ Fixed odd legacy subject line that has nothing to do with the actual bug ]
 
 Hmm. Can you double-check and verify that reverting that commit makes
 things work again for you?

Reverting 14134f6584212d585b310ce95428014b653dfaf6 works.

14134f6584212d585b310ce95428014b653dfaf6 has already been reverted upstream.
(and sorry for the late reply)

 
 Also, what's your distribution and setup? 

Fedora 16

Thanks,
Lai

 I'd like this to get
 verified, just to see that it's not some timing-dependent thing or a
 bisection mistake, but if so, then the LSB test-cases obviously have
 to be fixed, and the commit that causes the problem needs to be
 reverted. Test-cases count for nothing compared to actual users.
 
 Linus
 
 On Thu, Apr 4, 2013 at 9:17 AM, Lai Jiangshan la...@cn.fujitsu.com wrote:
 Hi, ALL

 I also encountered the same problem.

 git bisect:

 14134f6584212d585b310ce95428014b653dfaf6 is the first bad commit
 commit 14134f6584212d585b310ce95428014b653dfaf6
 Author: dingtianhong dingtianh...@huawei.com
 Date:   Mon Mar 25 17:02:04 2013 +

 af_unix: dont send SCM_CREDENTIAL when dest socket is NULL

 SCM_SCREDENTIALS should apply to write() syscalls only either source or
 destination
 socket asserted SOCK_PASSCRED. The original implememtation in
 maybe_add_creds is wrong,
 and breaks several LSB testcases ( i.e.
 /tset/LSB.os/netowkr/recvfrom/T.recvfrom).

 Origionally-authored-by: Karel Srot ks...@redhat.com
 Signed-off-by: Ding Tianhong dingtianh...@huawei.com
 Acked-by: Eric Dumazet eduma...@google.com
 Signed-off-by: David S. Miller da...@davemloft.net

 :04 04 ef0356cc0fc168a39c0f94cff0ba27c46c4d0048
 ae34e59f235c379f04d6145f0103cccd5b3a307a M net

 ===
 Like Brian Gerst, no obvious bug, but the system can't boot, service udev
 start fails when boot
 (also DEBUG_PAGEALLOC=n, I did not try to test with it=y)

 [   11.022976] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   11.023293] systemd[1]: Unit udev-control.socket entered failed state.
 [   11.182478] systemd-readahead-replay[399]: Bumped block_nr parameter of
 8:16 to 16384. This is a temporary hack and should be removed one day.
 [   14.473283] udevd[410]: bind failed: Address already in use
 [   14.478630] udevd[410]: error binding udev control socket
 [   15.201158] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   16.900792] udevd[427]: error binding udev control socket
 [   18.356484] EXT4-fs (sdb7): re-mounted. Opts: (null)
 [   19.738401] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   19.742494] systemd[1]: Job pending for unit, delaying automatic restart.
 [   19.747764] systemd[1]: Unit udev.service entered failed state.
 [   19.752303] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   19.770723] udevd[459]: bind failed: Address already in use
 [   19.771027] udevd[459]: error binding udev control socket
 [   19.771175] udevd[459]: error binding udev control socket
 [   19.813256] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   19.914450] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   19.918374] systemd[1]: Job pending for unit, delaying automatic restart.
 [   19.923392] systemd[1]: Unit udev.service entered failed state.
 [   19.923808] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   19.943792] udevd[465]: bind failed: Address already in use
 [   19.944056] udevd[465]: error binding udev control socket
 [   19.944210] udevd[465]: error binding udev control socket
 [   19.946071] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   20.047524] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   20.051939] systemd[1]: Job pending for unit, delaying automatic restart.
 [   20.057539] systemd[1]: Unit udev.service entered failed state.
 [   20.058069] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   20.081141] udevd[467]: bind failed: Address already in use
 [   20.087120] udevd[467]: error binding udev control socket
 [   20.092040] udevd[467]: error binding udev control socket
 [   20.096519] systemd[1]: udev.service: main process exited, code=exited,
 status=1
 [   20.184910] systemd[1]: udev.service holdoff time over, scheduling
 restart.
 [   20.189863] systemd[1]: Job pending for unit, delaying automatic restart.
 [   20.195440] systemd[1]: Unit udev.service entered failed state.
 [   20.196012] systemd[1]: udev-control.socket failed to listen on sockets:
 Address already in use
 [   20.220543] udevd[469]: bind failed: Address already in use
 [   20.220584] udevd[469]: error binding udev control socket
 [   20.220780] udevd[469]: error binding udev control socket
 [   20.222830] systemd[1]: udev.service: main process exited

Re: [PATCH 2/3] kernel/SRCU: provide a static initializer

2013-04-08 Thread Lai Jiangshan
On 04/08/2013 06:03 PM, Sebastian Andrzej Siewior wrote:
 On 04/05/2013 09:21 AM, Lai Jiangshan wrote:
 Hi, Sebastian
 
 Hi Lai,
 
 I don't want to expose __SRCU_STRUCT_INIT(),
 due to it has strong coupling with the percpu array.

 I hope other structure which uses SRCU should use init_srcu_struct().
 
 I need a static initialization for this kind. Patch #3 shows one
 example I have another one pending for crypto.

If the per-cpu array could be defined inside __SRCU_STRUCT_INIT(),
I would be happy to expose it, but currently it cannot.

Why can't crypto use boot-time initialization?

 Do you have any idea how I could get it done without this? Do you want
 to move/merge header files?

If crypto has to use static initialization, I will find some way,
or use your patch.

Thanks,
Lai

 

 Thanks,
 Lai
 
 Sebastian
 



Re: [PATCH 2/3] kernel/SRCU: provide a static initializer

2013-04-11 Thread Lai Jiangshan
On 04/12/2013 01:04 AM, Sebastian Andrzej Siewior wrote:
 * Lai Jiangshan | 2013-04-09 09:09:56 [+0800]:
 
 If the percpu array can be defined in __SRCU_STRUCT_INIT(),
 I'm happy to expose it. but it is not currently.
 
 I have no idea how to achieve this.
 
 Why crypto can't use boot time initialization?
 
 It would require something like this:
 --- linux-stable.orig/crypto/Kconfig
 +++ linux-stable/crypto/Kconfig
 @@ -13,7 +13,7 @@ source crypto/async_tx/Kconfig
  # Cryptographic API Configuration
  #
  menuconfig CRYPTO
 -   tristate Cryptographic API
 +   bool Cryptographic API
 help
   This option provides the core Cryptographic API.

Why convert it to bool?
srcu_init_notifier_head() can be called at module load time, so a module can
keep using run-time initialization.
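
A minimal sketch of that alternative, assuming a hypothetical module with its
own notifier chain (my_chain, my_module_init), so the Kconfig entry can stay
tristate:

#include <linux/module.h>
#include <linux/notifier.h>

static struct srcu_notifier_head my_chain;

static int __init my_module_init(void)
{
        /* run-time init sets up the SRCU state backing the chain */
        srcu_init_notifier_head(&my_chain);
        return 0;
}
module_init(my_module_init);

static void __exit my_module_exit(void)
{
        srcu_cleanup_notifier_head(&my_chain);
}
module_exit(my_module_exit);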

 
 --- linux-stable.orig/crypto/api.c
 +++ linux-stable/crypto/api.c
 @@ -34,6 +34,13 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem);
  struct srcu_notifier_head crypto_chain;
  EXPORT_SYMBOL_GPL(crypto_chain);
 
 +static int __init crypto_api_init(void)
 +{
 +   srcu_init_notifier_head(crypto_chain);
 +   return 0;
 +}
 +core_initcall(crypto_api_init);
 +
  static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg)
  {
 atomic_inc(alg-cra_refcnt);

And again, why can't crypto use boot-time or module-load-time initialization?

 
 and there is no need for this.
 
 Do you have any idea how I could get it done without this? Do you want
 to move/merge header files?

 if crypto has to use static initialization, I will find out some way
 or use your patch.
 
 The crypto would like this:
 
 Subject: crypto: Convert crypto notifier chain to SRCU
 From: Peter Zijlstra pet...@infradead.org
 Date: Fri, 05 Oct 2012 09:03:24 +0100
 
 The crypto notifier deadlocks on RT. Though this can be a real deadlock
 on mainline as well due to fifo fair rwsems.
 
 The involved parties here are:
 
 [   82.172678] swapper/0   S 0001 0 1  0 
 0x
 [   82.172682]  88042f18fcf0 0046 88042f18fc80 
 81491238
 [   82.172685]  00011cc0 00011cc0 88042f18c040 
 88042f18ffd8
 [   82.172688]  00011cc0 00011cc0 88042f18ffd8 
 00011cc0
 [   82.172689] Call Trace:
 [   82.172697]  [81491238] ? _raw_spin_unlock_irqrestore+0x6c/0x7a
 [   82.172701]  [8148fd3f] schedule+0x64/0x66
 [   82.172704]  [8148ec6b] schedule_timeout+0x27/0xd0
 [   82.172708]  [81043c0c] ? unpin_current_cpu+0x1a/0x6c
 [   82.172713]  [8106e491] ? migrate_enable+0x12f/0x141
 [   82.172716]  [8148fbbd] wait_for_common+0xbb/0x11f
 [   82.172719]  [810709f2] ? try_to_wake_up+0x182/0x182
 [   82.172722]  [8148fc96] 
 wait_for_completion_interruptible+0x1d/0x2e
 [   82.172726]  [811debfd] crypto_wait_for_test+0x49/0x6b
 [   82.172728]  [811ded32] crypto_register_alg+0x53/0x5a
 [   82.172730]  [811ded6c] crypto_register_algs+0x33/0x72
 [   82.172734]  [81ad7686] ? aes_init+0x12/0x12
 [   82.172737]  [81ad76ea] aesni_init+0x64/0x66
 [   82.172741]  [81000318] do_one_initcall+0x7f/0x13b
 [   82.172744]  [81ac4d34] kernel_init+0x199/0x22c
 [   82.172747]  [81ac44ef] ? loglevel+0x31/0x31
 [   82.172752]  [814987c4] kernel_thread_helper+0x4/0x10
 [   82.172755]  [81491574] ? retint_restore_args+0x13/0x13
 [   82.172759]  [81ac4b9b] ? start_kernel+0x3ca/0x3ca
 [   82.172761]  [814987c0] ? gs_change+0x13/0x13
 
 [   82.174186] cryptomgr_test  S 0001 041  2 
 0x
 [   82.174189]  88042c971980 0046 81d74830 
 0292
 [   82.174192]  00011cc0 00011cc0 88042c96eb80 
 88042c971fd8
 [   82.174195]  00011cc0 00011cc0 88042c971fd8 
 00011cc0
 [   82.174195] Call Trace:
 [   82.174198]  [8148fd3f] schedule+0x64/0x66
 [   82.174201]  [8148ec6b] schedule_timeout+0x27/0xd0
 [   82.174204]  [81043c0c] ? unpin_current_cpu+0x1a/0x6c
 [   82.174206]  [8106e491] ? migrate_enable+0x12f/0x141
 [   82.174209]  [8148fbbd] wait_for_common+0xbb/0x11f
 [   82.174212]  [810709f2] ? try_to_wake_up+0x182/0x182
 [   82.174215]  [8148fc96] 
 wait_for_completion_interruptible+0x1d/0x2e
 [   82.174218]  [811e4883] cryptomgr_notify+0x280/0x385
 [   82.174221]  [814943de] notifier_call_chain+0x6b/0x98
 [   82.174224]  [8108a11c] ? rt_down_read+0x10/0x12
 [   82.174227]  [810677cd] __blocking_notifier_call_chain+0x70/0x8d
 [   82.174230]  [810677fe] blocking_notifier_call_chain+0x14/0x16
 [   82.174234]  [811dd272] crypto_probing_notify+0x24/0x50
 [   82.174236]  [811dd7a1] crypto_alg_mod_lookup+0x3e/0x74
 [   82.174238]  [811dd949] crypto_alloc_base+0x36/0x8f
 [   82.174241]  [811e9408] cryptd_alloc_ablkcipher+0x6e/0xb5
 [   82.174243
