[PATCH 6/7] sched: rt-group: per group period

2008-01-04 Thread Peter Zijlstra
Steven asked for per group periods in order to get closer to RMA or EDF
scheduling.

Use the fancy new hrtimers to provide a per group period
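To put some (purely illustrative) numbers on it: with a per group period each
group becomes a (runtime, period) server, so the usual utilisation style
admission checks apply. E.g. one group running 10ms out of every 50ms and
another running 5ms out of every 20ms together claim 10/50 + 5/20 = 0.45 of a
cpu, comfortably below the default 95% rt ratio.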

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h|2 
 kernel/sched.c   |  225 +--
 kernel/sched_rt.c|   61 ++--
 kernel/sysctl.c  |2 
 kernel/time/tick-sched.c |5 -
 5 files changed, 232 insertions(+), 63 deletions(-)

Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,8 +230,6 @@ static inline int select_nohz_load_balan
 }
 #endif
 
-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
struct rt_rq **rt_rq;
 
unsigned int rt_ratio;
+   ktime_t rt_period;
 
/*
 * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
int rt_throttled;
u64 rt_time;
+   struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
struct cfs_rq cfs;
struct rt_rq rt;
-   u64 rt_period_expire;
-   int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p) cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)  (cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-   struct rq *rq = cpu_rq(cpu);
-   u64 delta;
-
-   if (!rq->rt_throttled)
-   return 0;
-
-   if (rq->clock > rq->rt_period_expire)
-   return 1;
-
-   delta = rq->rt_period_expire - rq->clock;
-   do_div(delta, NSEC_PER_SEC / HZ);
-
-   return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT 16
 #define SCHED_RT_FRAC  (1UL << SCHED_RT_FRAC_SHIFT)
@@ -664,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
  * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+   static const ktime_t ktime_zero = { .tv64 = 0 };
+   return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
-   update_sched_rt_period(rq);
spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,152 @@ static inline void sched_init_granularit
sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+   struct rt_rq *rt_rq =
+   container_of(timer, struct rt_rq, rt_period_timer);
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+   ktime_t now = ktime_get();
+
+   WARN_ON(smp_processor_id() != cpu_of(rq));
+   WARN_ON(!in_irq());
+
+   spin_lock(&rq->lock);
+   update_sched_rt_period(rt_rq);
+   spin_unlock(&rq->lock);
+
+   hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+   return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+   ktime_t period = sched_rt_period(rt_rq);
+
+   WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+   for (;;) {
+   ktime_t now = ktime_get();
+   hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+   hrtimer_start(&rt_rq->rt_period_timer,
+   rt_rq->rt_period_timer.expires,
+   HRTIMER_MODE_ABS);
+   if (hrtimer_active(&rt_rq->rt_period_timer))
+   break;
+   }
+}
+
+static void sched_rt_period_stop(struct rt_rq *rt_rq

[PATCH 1/7] sched: rt throttling vs no_hz

2008-01-04 Thread Peter Zijlstra
We need to teach no_hz about the rt throttling because it's tick driven.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h|2 ++
 kernel/sched.c   |   23 ++-
 kernel/sched_rt.c|   30 --
 kernel/time/tick-sched.c |5 +
 4 files changed, 45 insertions(+), 15 deletions(-)
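
(The tick-sched.c hunk at the end of this mail is truncated in the archive;
as a rough sketch of the intent only -- not the literal patch text -- the
no_hz path uses the new hook to bound how long the tick may be stopped:

	/* cap the sleep by the jiffies left in the rt period */
	rt_jiffies = rt_needs_cpu(cpu);
	if (rt_jiffies && rt_jiffies < delta_jiffies)
		delta_jiffies = rt_jiffies;

so a throttled runqueue still gets a tick in time to be unthrottled.)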

Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,6 +230,8 @@ static inline int select_nohz_load_balan
 }
 #endif
 
+extern unsigned long rt_needs_cpu(int cpu);
+
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
u64 rt_period_expire;
+   int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p) cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)  (cpu_rq(cpu)->curr)
 
+unsigned long rt_needs_cpu(int cpu)
+{
+   struct rq *rq = cpu_rq(cpu);
+   u64 delta;
+
+   if (!rq->rt_throttled)
+   return 0;
+
+   if (rq->clock > rq->rt_period_expire)
+   return 1;
+
+   delta = rq->rt_period_expire - rq->clock;
+   do_div(delta, NSEC_PER_SEC / HZ);
+
+   return (unsigned long)delta;
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
 
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+   rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
 #ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
-   rt_rq->highest_prio = MAX_RT_PRIO;
rt_rq->overloaded = 0;
 #endif
 
@@ -7186,6 +7206,7 @@ void __init sched_init(void)
list_add(&init_task_group.list, &task_groups);
 #endif
rq->rt_period_expire = 0;
+   rq->rt_throttled = 0;
 
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
Index: linux-2.6/kernel/sched_rt.c
===
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
if (rt_rq->rt_time > ratio) {
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+
+   rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
+
sched_rt_ratio_dequeue(rt_rq);
return 1;
}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc
return 0;
 }
 
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
-   unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-   u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-   rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-   if (rt_rq->rt_throttled) {
-   rt_rq->rt_throttled = 0;
-   sched_rt_ratio_enqueue(rt_rq);
-   }
-}
-
 static void update_sched_rt_period(struct rq *rq)
 {
struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struc
period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
rq->rt_period_expire += period;
 
-   for_each_leaf_rt_rq(rt_rq, rq)
-   __update_sched_rt_period(rt_rq, period);
+   for_each_leaf_rt_rq(rt_rq, rq) {
+   unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+   u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+   rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+   if (rt_rq->rt_throttled) {
+   rt_rq->rt_throttled = 0;
+   sched_rt_ratio_enqueue(rt_rq);
+   }
+   }
+
+   rq->rt_throttled = 0;
}
 }
 
Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+   unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies

Re: [PATCH 6/7] sched: rt-group: per group period

2008-01-05 Thread Peter Zijlstra
Could you please fold this into the 6/7 patch.

It reverts a wandering chunk (the 32768 thing), but more importantly
it fixes !FAIR_GROUP_SCHED compilation.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sched.c |   10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -647,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt
  * ratio of time -rt tasks may consume.
  * default: 95%
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259;
+const_debug unsigned int sysctl_sched_rt_ratio = 62259;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -5379,6 +5379,7 @@ static void __init sched_rt_period_init(
hotcpu_notifier(sched_rt_period_hotplug, 0);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void __sched_rt_period_init_tg(void *arg)
 {
struct task_group *tg = arg;
@@ -5404,12 +5405,14 @@ static void sched_rt_period_destroy_tg(s
 {
on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
 }
-#else
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#else /* CONFIG_SMP */
 static void __init sched_rt_period_init(void)
 {
sched_rt_period_start_cpu(0);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static void sched_rt_period_init_tg(struct task_group *tg)
 {
sched_rt_period_start(tg->rt_rq[0]);
@@ -5419,7 +5422,8 @@ static void sched_rt_period_destroy_tg(s
 {
sched_rt_period_stop(tg->rt_rq[0]);
 }
-#endif
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_SMP
 /*




Re: 2.6.24-rc6: possible recursive locking detected

2008-01-05 Thread Peter Zijlstra

On Sat, 2008-01-05 at 18:12 +1100, Herbert Xu wrote:
 On Fri, Jan 04, 2008 at 09:30:49AM +0100, Ingo Molnar wrote:
 
[ 1310.670986] =
[ 1310.671690] [ INFO: possible recursive locking detected ]
[ 1310.672097] 2.6.24-rc6 #1
[ 1310.672421] -
[ 1310.672828] FahCore_a0.exe/3692 is trying to acquire lock:
[ 1310.673238]  (&q->lock){++..}, at: [<c011544b>] __wake_up+0x1b/0x50
[ 1310.673869]
[ 1310.673870] but task is already holding lock:
[ 1310.674567]  (&q->lock){++..}, at: [<c011544b>] __wake_up+0x1b/0x50
[ 1310.675267]
[ 1310.675268] other info that might help us debug this:
[ 1310.675952] 5 locks held by FahCore_a0.exe/3692:
[ 1310.676334]  #0:  (rcu_read_lock){..--}, at: [<c038b620>] 
net_rx_action+0x60/0x1b0
[ 1310.677251]  #1:  (rcu_read_lock){..--}, at: [<c0388d60>] 
netif_receive_skb+0x100/0x470
[ 1310.677924]  #2:  (rcu_read_lock){..--}, at: [<c03a7fb2>] 
ip_local_deliver_finish+0x32/0x210
[ 1310.678460]  #3:  (clock-AF_INET){-.-?}, at: [<c038164e>] 
sock_def_readable+0x1e/0x80
[ 1310.679250]  #4:  (&q->lock){++..}, at: [<c011544b>] 
__wake_up+0x1b/0x50
 
 The net part might just be a red herring, since the problem is that
 __wake_up is somehow reentering itself.

/*
 * Perform a safe wake up of the poll wait list. The problem is that
 * with the new callback'd wake up system, it is possible that the
 * poll callback is reentered from inside the call to wake_up() done
 * on the poll wait queue head. The rule is that we cannot reenter the
 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
 * and we cannot reenter the same wait queue head at all. This will
 * enable to have a hierarchy of epoll file descriptor of no more than
 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
 * because this one gets called by the poll callback, that in turn is called
 * from inside a wake_up(), that might be called from irq context.
 */

Seems to suggest that the epoll code can indeed recurse into wakeup.

Davide, Johannes, any ideas?



Re: 2.6.24-rc6: possible recursive locking detected

2008-01-05 Thread Peter Zijlstra

On Sat, 2008-01-05 at 17:53 +0100, Peter Zijlstra wrote:
 On Sat, 2008-01-05 at 18:12 +1100, Herbert Xu wrote:
  On Fri, Jan 04, 2008 at 09:30:49AM +0100, Ingo Molnar wrote:
  
 [ 1310.670986] =
 [ 1310.671690] [ INFO: possible recursive locking detected ]
 [ 1310.672097] 2.6.24-rc6 #1
 [ 1310.672421] -
 [ 1310.672828] FahCore_a0.exe/3692 is trying to acquire lock:
 [ 1310.673238]  (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50
 [ 1310.673869]
 [ 1310.673870] but task is already holding lock:
 [ 1310.674567]  (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50
 [ 1310.675267]
 [ 1310.675268] other info that might help us debug this:
 [ 1310.675952] 5 locks held by FahCore_a0.exe/3692:
 [ 1310.676334]  #0:  (rcu_read_lock){..--}, at: [c038b620] 
 net_rx_action+0x60/0x1b0
 [ 1310.677251]  #1:  (rcu_read_lock){..--}, at: [c0388d60] 
 netif_receive_skb+0x100/0x470
 [ 1310.677924]  #2:  (rcu_read_lock){..--}, at: [c03a7fb2] 
 ip_local_deliver_finish+0x32/0x210
 [ 1310.678460]  #3:  (clock-AF_INET){-.-?}, at: [c038164e] 
 sock_def_readable+0x1e/0x80
 [ 1310.679250]  #4:  (q-lock){++..}, at: [c011544b] 
 __wake_up+0x1b/0x50
  
  The net part might just be a red herring, since the problem is that
  __wake_up is somehow reentering itself.
 
 /*
  * Perform a safe wake up of the poll wait list. The problem is that
  * with the new callback'd wake up system, it is possible that the
  * poll callback is reentered from inside the call to wake_up() done
  * on the poll wait queue head. The rule is that we cannot reenter the
  * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
  * and we cannot reenter the same wait queue head at all. This will
  * enable to have a hierarchy of epoll file descriptor of no more than
  * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
  * because this one gets called by the poll callback, that in turn is called
  * from inside a wake_up(), that might be called from irq context.
  */
 
 Seems to suggest that the epoll code can indeed recurse into wakeup.
 
 Davide, Johannes, any ideas?

Since EP_MAX_POLLWAKE_NESTS < MAX_LOCKDEP_SUBCLASSES we could perhaps do
something like:

  wake_up_nested(..., wake_nests);

although I'm not quite sure that is correct, my understanding of this
code is still fragile at best.
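
Roughly this shape -- a sketch only, not a tested patch; __wake_up_common()
and spin_lock_irqsave_nested() are the existing helpers, the _nested wrapper
is the new bit:

void __wake_up_nested(wait_queue_head_t *q, unsigned int mode,
		      int nr_exclusive, void *key, int subclass)
{
	unsigned long flags;

	/* tell lockdep which nesting level this wait queue head is at */
	spin_lock_irqsave_nested(&q->lock, flags, subclass);
	__wake_up_common(q, mode, nr_exclusive, 0, key);
	spin_unlock_irqrestore(&q->lock, flags);
}

epoll would then pass its wake_nests depth as the subclass.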



Re: [PATCH] block2mtd lockdep_init_map warning

2008-01-06 Thread Peter Zijlstra

On Sun, 2008-01-06 at 14:13 +0100, Jörn Engel wrote:

 Ingo, Peter, does either of you actually care about this problem?  In
 the last round when I debugged this problem there was a notable lack of
 reaction from either of you.

Yeah I do, I just know very little about the module stuff and haven't
come around to looking into it.

I agree that Erez's patch is quite horrible.



[PATCH 11/11] sched: rt-group: interface

2008-01-06 Thread Peter Zijlstra
Change the rt_ratio interface to rt_runtime_us, to match rt_period_us.
This avoids picking a granularity for the ratio.
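
For reference, the old ratio and the new pair are related by runtime/period in
16 bit fixed point: the previous default of 62259 is just 0.95 * 65536
(rounded), i.e. 950000us of runtime out of a 1000000us period. The to_ratio()
helper below does exactly that conversion for the schedulability check.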

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h |8 +++
 kernel/sched.c|  116 ++
 kernel/sched_rt.c |   42 +++---
 kernel/sysctl.c   |4 -
 4 files changed, 106 insertions(+), 64 deletions(-)

Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1518,7 +1518,7 @@ extern unsigned int sysctl_sched_feature
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_rt_period;
-extern unsigned int sysctl_sched_rt_ratio;
+extern unsigned int sysctl_sched_rt_runtime;
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 extern unsigned int sysctl_sched_min_bal_int_shares;
 extern unsigned int sysctl_sched_max_bal_int_shares;
@@ -2014,6 +2014,12 @@ extern void sched_destroy_group(struct t
 extern void sched_move_task(struct task_struct *tsk);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
+extern int sched_group_set_rt_runtime(struct task_group *tg,
+ unsigned long rt_runtime_us);
+extern unsigned long sched_group_rt_runtime(struct task_group *tg);
+extern int sched_group_set_rt_period(struct task_group *tg,
+unsigned long rt_runtime_us);
+extern unsigned long sched_group_rt_period(struct task_group *tg);
 
 #endif
 
Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -176,7 +176,7 @@ struct task_group {
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;
 
-   unsigned int rt_ratio;
+   u64 rt_runtime;
ktime_t rt_period;
 
/*
@@ -646,19 +646,16 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in us.
+ * period over which we measure rt task cpu usage in us.
  * default: 1s
  */
 const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
-#define SCHED_RT_FRAC_SHIFT 16
-#define SCHED_RT_FRAC  (1UL << SCHED_RT_FRAC_SHIFT)
-
 /*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+const_debug unsigned int sysctl_sched_rt_runtime = 950000;
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -7209,7 +7206,8 @@ void __init sched_init(void)
per_cpu(init_sched_entity, i), i, 1);
 
rq->rt.rt_rq_type = RT_RQ_EDF;
-   init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+   init_task_group.rt_runtime =
+   sysctl_sched_rt_runtime * NSEC_PER_USEC;
init_task_group.rt_period =
ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
@@ -7606,7 +7604,7 @@ struct task_group *sched_create_group(vo
goto err;
 
tg->shares = NICE_0_LOAD;
-   tg->rt_ratio = 0; /* XXX */
+   tg->rt_runtime = 0; /* XXX */
tg->rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
 
for_each_possible_cpu(i) {
@@ -7801,41 +7799,87 @@ unsigned long sched_group_shares(struct 
 }
 
 /*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
  */
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+   u64 r = runtime * (1ULL << 16);
+   do_div(r, period);
+   return r;
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
struct task_group *tgi;
unsigned long total = 0;
+   unsigned long global_ratio =
+   to_ratio(sysctl_sched_rt_period, sysctl_sched_rt_runtime);
 
rcu_read_lock();
-   list_for_each_entry_rcu(tgi, &task_groups, list)
-   total += tgi->rt_ratio;
+   list_for_each_entry_rcu(tgi, &task_groups, list) {
+   if (tgi == tg)
+   continue;
+
+   total += to_ratio(ktime_to_ns(tgi->rt_period), tgi->rt_runtime);
+   }
rcu_read_unlock();
 
-   if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
-   return -EINVAL;
+   return total + to_ratio(period, runtime) < global_ratio;
+}
 
-   tg-rt_ratio = rt_ratio

[PATCH 02/11] sched: load_balance_monitor rename

2008-01-06 Thread Peter Zijlstra
Don't start the load_balance_monitor when there is only a single cpu.
Rename the kthread because its name is currently longer than TASK_COMM_LEN.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sched.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7070,8 +7070,11 @@ void __init sched_init_smp(void)
sched_init_granularity();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+   if (nr_cpu_ids == 1)
+   return;
+
lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-"load_balance_monitor");
+"group_balance");
if (!IS_ERR(lb_monitor_task)) {
lb_monitor_task->flags |= PF_NOFREEZE;
wake_up_process(lb_monitor_task);

--



[PATCH 10/11] sched: rt-group: EDF

2008-01-06 Thread Peter Zijlstra
Use a simple Earliest Deadline First implementation to schedule the realtime
groups.
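
(Quick illustration, not from the patch itself: EDF simply means always run
the group whose current period ends soonest -- if group A's period expires at
t=5ms and group B's at t=12ms, A runs first. The rbtree added below keeps the
entities sorted on that expiry, so the leftmost node is always the next
deadline.)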

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h |1 
 kernel/sched.c|   13 +
 kernel/sched_rt.c |  115 +++---
 3 files changed, 124 insertions(+), 5 deletions(-)

Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -942,6 +942,7 @@ struct sched_rt_entity {
int nr_cpus_allowed;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+   struct rb_node  run_node;
struct sched_rt_entity  *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq*rt_rq;
Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -360,6 +360,11 @@ struct cfs_rq {
 #endif
 };
 
+enum rt_rq_type {
+   RT_RQ_PRIO,
+   RT_RQ_EDF,
+};
+
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
struct rt_prio_array active;
@@ -376,6 +381,10 @@ struct rt_rq {
struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+   enum rt_rq_type rt_rq_type;
+   struct rb_root deadlines;
+   struct rb_node *rb_leftmost;
+
unsigned long rt_nr_boosted;
 
struct rq *rq;
@@ -7127,6 +7136,9 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+   rt_rq->rt_rq_type = RT_RQ_PRIO;
+   rt_rq->deadlines = RB_ROOT;
+   rt_rq->rb_leftmost = NULL;
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
 #endif
@@ -7196,6 +7208,7 @@ void __init sched_init(void)
per_cpu(init_cfs_rq, i),
per_cpu(init_sched_entity, i), i, 1);
 
+   rq->rt.rt_rq_type = RT_RQ_EDF;
init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
init_task_group.rt_period =
ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC);
Index: linux-2.6/kernel/sched_rt.c
===
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -138,6 +138,84 @@ static int rt_se_boosted(struct sched_rt
return p->prio != p->normal_prio;
 }
 
+static inline u64 rt_deadline(struct sched_rt_entity *rt_se)
+{
+   struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+   BUG_ON(!group_rq);
+   return ktime_to_ns(group_rq->rt_period_timer.expires);
+}
+
+static void enqueue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+   struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+   struct rb_node **link;
+   struct rb_node *parent;
+   struct sched_rt_entity *entry;
+   u64 deadline;
+   int leftmost = 1;
+
+   if (rt_rq->rt_rq_type != RT_RQ_EDF)
+   return;
+
+   link = &rt_rq->deadlines.rb_node;
+   parent = NULL;
+   deadline = rt_deadline(rt_se);
+
+   while (*link) {
+   parent = *link;
+   entry = rb_entry(parent, struct sched_rt_entity, run_node);
+
+   if (deadline < rt_deadline(entry)) {
+   link = &parent->rb_left;
+   } else {
+   link = &parent->rb_right;
+   leftmost = 0;
+   }
+   }
+
+   if (leftmost)
+   rt_rq->rb_leftmost = &rt_se->run_node;
+
+   rb_link_node(&rt_se->run_node, parent, link);
+   rb_insert_color(&rt_se->run_node, &rt_rq->deadlines);
+}
+
+static void dequeue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+   struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+   if (rt_rq->rt_rq_type != RT_RQ_EDF)
+   return;
+
+   if (rt_rq->rb_leftmost == &rt_se->run_node)
+   rt_rq->rb_leftmost = rb_next(&rt_se->run_node);
+
+   rb_erase(&rt_se->run_node, &rt_rq->deadlines);
+}
+
+static void requeue_rt_deadline(struct rt_rq *rt_rq)
+{
+   struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+   BUG_ON(!rt_se);
+   if (on_rt_rq(rt_se)) {
+   dequeue_rt_deadline(rt_se);
+   enqueue_rt_deadline(rt_se);
+   }
+}
+
+static struct sched_rt_entity *next_rt_deadline(struct rt_rq *rt_rq)
+{
+   if (rt_rq->rt_rq_type != RT_RQ_EDF)
+   return NULL;
+
+   if (!rt_rq->rb_leftmost)
+   return NULL;
+
+   return rb_entry(rt_rq->rb_leftmost, struct sched_rt_entity, run_node);
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -191,6 +269,23 @@ static inline int rt_rq_throttled(struct
 {
return rt_rq->rt_throttled;
 }
+
+static inline void enqueue_rt_deadline(struct sched_rt_entity *rt_se)
+{
+}
+
+static inline void dequeue_rt_deadline(struct sched_rt_entity *rt_se

[PATCH 08/11] sched: rt-group: deal with PI

2008-01-06 Thread Peter Zijlstra
Steven mentioned the fun case where a lock-holding task will be throttled.

Simple fix: allow groups that have boosted tasks to run anyway.
This is of course not quite correct. Needs more tricks.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sched.c|3 +++
 kernel/sched_rt.c |   48 
 2 files changed, 43 insertions(+), 8 deletions(-)

Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -376,6 +376,8 @@ struct rt_rq {
struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+   unsigned long rt_nr_boosted;
+
struct rq *rq;
struct list_head leaf_rt_rq_list;
struct task_group *tg;
@@ -7279,6 +7281,7 @@ static void init_rt_rq(struct rt_rq *rt_
rt_rq->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+   rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
 #endif
 }
Index: linux-2.6/kernel/sched_rt.c
===
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -121,6 +121,23 @@ static void sched_rt_ratio_dequeue(struc
dequeue_rt_entity(rt_se);
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+   return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+
+static int rt_se_boosted(struct sched_rt_entity *rt_se)
+{
+   struct rt_rq *rt_rq = group_rt_rq(rt_se);
+   struct task_struct *p;
+
+   if (rt_rq)
+   return !!rt_rq->rt_nr_boosted;
+
+   p = rt_task_of(rt_se);
+   return p->prio != p->normal_prio;
+}
+
 #else
 
 static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
@@ -170,6 +187,10 @@ static inline void sched_rt_ratio_dequeu
 {
 }
 
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+   return rt_rq->rt_throttled;
+}
 #endif
 
 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
@@ -190,21 +211,22 @@ static int sched_rt_ratio_exceeded(struc
u64 period, ratio;
 
if (rt_ratio == SCHED_RT_FRAC)
-   return 0;
+   goto out;
 
if (rt_rq->rt_throttled)
-   return 1;
+   goto out;
 
period = sched_rt_period_ns(rt_rq);
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
if (rt_rq->rt_time > ratio) {
rt_rq->rt_throttled = 1;
-   sched_rt_ratio_dequeue(rt_rq);
-   return 1;
+   if (rt_rq_throttled(rt_rq))
+   sched_rt_ratio_dequeue(rt_rq);
}
 
-   return 0;
+out:
+   return rt_rq_throttled(rt_rq);
 }
 
 static void update_sched_rt_period(struct rt_rq *rt_rq)
@@ -265,6 +287,10 @@ void inc_rt_tasks(struct sched_rt_entity
 
update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
+   if (rt_se_boosted(rt_se))
+   rt_rq->rt_nr_boosted++;
+#endif
 }
 
 static inline
@@ -295,6 +321,12 @@ void dec_rt_tasks(struct sched_rt_entity
 
update_rt_migration(rq_of_rt_rq(rt_rq));
 #endif /* CONFIG_SMP */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+   if (rt_se_boosted(rt_se))
+   rt_rq->rt_nr_boosted--;
+
+   WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
+#endif
 }
 
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -303,7 +335,7 @@ static void enqueue_rt_entity(struct sch
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
 
-   if (group_rq && group_rq->rt_throttled)
+   if (group_rq && rt_rq_throttled(group_rq))
return;
 
list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
@@ -476,7 +508,7 @@ static struct sched_rt_entity *pick_next
struct list_head *queue;
int idx;
 
-   if (sched_rt_ratio_exceeded(rt_rq))
+   if (rt_rq_throttled(rt_rq))
goto out;
 
idx = sched_find_first_bit(array-bitmap);
@@ -500,7 +532,7 @@ static struct task_struct *pick_next_tas
if (unlikely(!rt_rq->rt_nr_running))
return NULL;
 
-   if (sched_rt_ratio_exceeded(rt_rq))
+   if (rt_rq_throttled(rt_rq))
return NULL;
 
do {

--



[PATCH 01/11] sched: rt throttling vs no_hz

2008-01-06 Thread Peter Zijlstra
We need to teach no_hz about the rt throttling because it's tick driven.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h|2 ++
 kernel/sched.c   |   23 ++-
 kernel/sched_rt.c|   30 --
 kernel/time/tick-sched.c |5 +
 4 files changed, 45 insertions(+), 15 deletions(-)

Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -230,6 +230,8 @@ static inline int select_nohz_load_balan
 }
 #endif
 
+extern unsigned long rt_needs_cpu(int cpu);
+
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */
Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -442,6 +442,7 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
u64 rt_period_expire;
+   int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p) cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)  (cpu_rq(cpu)->curr)
 
+unsigned long rt_needs_cpu(int cpu)
+{
+   struct rq *rq = cpu_rq(cpu);
+   u64 delta;
+
+   if (!rq->rt_throttled)
+   return 0;
+
+   if (rq->clock > rq->rt_period_expire)
+   return 1;
+
+   delta = rq->rt_period_expire - rq->clock;
+   do_div(delta, NSEC_PER_SEC / HZ);
+
+   return (unsigned long)delta;
+}
+
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_
/* delimiter for bitsearch: */
__set_bit(MAX_RT_PRIO, array->bitmap);
 
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+   rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
 #ifdef CONFIG_SMP
rt_rq->rt_nr_migratory = 0;
-   rt_rq->highest_prio = MAX_RT_PRIO;
rt_rq->overloaded = 0;
 #endif
 
@@ -7186,6 +7206,7 @@ void __init sched_init(void)
list_add(&init_task_group.list, &task_groups);
 #endif
rq->rt_period_expire = 0;
+   rq->rt_throttled = 0;
 
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
Index: linux-2.6/kernel/sched_rt.c
===
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc
ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
 
if (rt_rq->rt_time > ratio) {
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+
+   rq->rt_throttled = 1;
rt_rq->rt_throttled = 1;
+
sched_rt_ratio_dequeue(rt_rq);
return 1;
}
@@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc
return 0;
 }
 
-static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period)
-{
-   unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-   u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-   rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-   if (rt_rq->rt_throttled) {
-   rt_rq->rt_throttled = 0;
-   sched_rt_ratio_enqueue(rt_rq);
-   }
-}
-
 static void update_sched_rt_period(struct rq *rq)
 {
struct rt_rq *rt_rq;
@@ -204,8 +196,18 @@ static void update_sched_rt_period(struc
period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
rq->rt_period_expire += period;
 
-   for_each_leaf_rt_rq(rt_rq, rq)
-   __update_sched_rt_period(rt_rq, period);
+   for_each_leaf_rt_rq(rt_rq, rq) {
+   unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+   u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+   rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+   if (rt_rq->rt_throttled) {
+   rt_rq->rt_throttled = 0;
+   sched_rt_ratio_enqueue(rt_rq);
+   }
+   }
+
+   rq->rt_throttled = 0;
}
 }
 
Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 {
unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+   unsigned long rt_jiffies;
struct tick_sched *ts;
ktime_t last_update, expires, now, delta;
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
next_jiffies

[PATCH 00/11] another rt group sched update

2008-01-06 Thread Peter Zijlstra
this time compile tested on all 16 combinations of:

  CONFIG_SMP
  CONFIG_FAIR_GROUP_SCHED
  CONFIG_HIGH_RES_TIMERS
  CONFIG_NO_HZ

ran some but not all combinations
--



[PATCH 03/11] hrtimer: clean up cpu-base locking tricks

2008-01-06 Thread Peter Zijlstra
In order to more easily allow for the scheduler to use timers, clean up
the locking a bit.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/hrtimer.c |  109 +++
 kernel/time/tick-sched.c |8 ---
 2 files changed, 102 insertions(+), 15 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_even
basenow = ktime_add(now, base->offset);
 
while ((node = base->first)) {
+   enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
+   int restart;
 
timer = rb_entry(node, struct hrtimer, node);
 
@@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_even
 HRTIMER_STATE_CALLBACK, 0);
timer_stats_account_hrtimer(timer);
 
+   fn = timer->function;
+   if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+   /*
+* Used for scheduler timers, avoid lock
+* inversion with rq->lock and tasklist_lock.
+*
+* These timers are required to deal with
+* enqueue expiry themselves and are not
+* allowed to migrate.
+*/
+   spin_unlock(&cpu_base->lock);
+   restart = fn(timer);
+   spin_lock(&cpu_base->lock);
+   } else
+   restart = fn(timer);
+
/*
 * Note: We clear the CALLBACK bit after
 * enqueue_hrtimer to avoid reprogramming of
 * the event hardware. This happens at the end
 * of this function anyway.
 */
-   if (timer->function(timer) != HRTIMER_NORESTART) {
+   if (restart != HRTIMER_NORESTART) {
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base, 0);
}
Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_t
 {
struct tick_sched *ts =
container_of(timer, struct tick_sched, sched_timer);
-   struct hrtimer_cpu_base *base = timer->base->cpu_base;
struct pt_regs *regs = get_irq_regs();
ktime_t now = ktime_get();
int cpu = smp_processor_id();
@@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_t
touch_softlockup_watchdog();
ts->idle_jiffies++;
}
-   /*
-* update_process_times() might take tasklist_lock, hence
-* drop the base lock. sched-tick hrtimers are per-CPU and
-* never accessible by userspace APIs, so this is safe to do.
-*/
-   spin_unlock(&base->lock);
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
-   spin_lock(&base->lock);
}
 
/* Do not restart, when we are in the idle loop */

--



[PATCH 05/11] hrtimer: unlock hrtimer_wakeup

2008-01-06 Thread Peter Zijlstra
hrtimer_wakeup creates a

  base->lock
    rq->lock

lock dependency. Avoid this by switching to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
which doesn't hold base->lock.

This fully untangles hrtimer locks from the scheduler locks, and allows
hrtimer usage in the scheduler proper.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/hrtimer.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/hrtimer.c
===
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -1296,7 +1296,7 @@ void hrtimer_init_sleeper(struct hrtimer
sl->timer.function = hrtimer_wakeup;
sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-   sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+   sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 #endif
 }
 
@@ -1307,6 +1307,8 @@ static int __sched do_nanosleep(struct h
do {
set_current_state(TASK_INTERRUPTIBLE);
hrtimer_start(&t->timer, t->timer.expires, mode);
+   if (!hrtimer_active(&t->timer))
+   t->task = NULL;
 
if (likely(t->task))
schedule();

--



[PATCH 09/11] sched: rt-group: dynamic period ticks

2008-01-06 Thread Peter Zijlstra
Disable the period updates for inactive groups.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sched.c|  158 --
 kernel/sched_rt.c |   54 ++
 2 files changed, 53 insertions(+), 159 deletions(-)

Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -5277,158 +5277,6 @@ static inline void sched_init_granularit
sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
-static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
-{
-   struct rt_rq *rt_rq =
-   container_of(timer, struct rt_rq, rt_period_timer);
-   struct rq *rq = rq_of_rt_rq(rt_rq);
-   ktime_t now = ktime_get();
-
-   WARN_ON(smp_processor_id() != cpu_of(rq));
-   WARN_ON(!in_irq());
-
-   spin_lock(&rq->lock);
-   update_sched_rt_period(rt_rq);
-   spin_unlock(&rq->lock);
-
-   hrtimer_forward(timer, now, sched_rt_period(rt_rq));
-   return HRTIMER_RESTART;
-}
-
-static void sched_rt_period_start(struct rt_rq *rt_rq)
-{
-   ktime_t period = sched_rt_period(rt_rq);
-
-   WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
-
-   for (;;) {
-   ktime_t now = ktime_get();
-   hrtimer_forward(&rt_rq->rt_period_timer, now, period);
-   hrtimer_start(&rt_rq->rt_period_timer,
-   rt_rq->rt_period_timer.expires,
-   HRTIMER_MODE_ABS);
-   if (hrtimer_active(&rt_rq->rt_period_timer))
-   break;
-   }
-}
-
-#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
-static void sched_rt_period_stop(struct rt_rq *rt_rq)
-{
-   hrtimer_cancel(&rt_rq->rt_period_timer);
-}
-#endif
-
-static void sched_rt_period_start_cpu(int cpu)
-{
-   struct rq *rq = cpu_rq(cpu);
-   struct rt_rq *rt_rq;
-
-   for_each_leaf_rt_rq(rt_rq, rq)
-   sched_rt_period_start(rt_rq);
-}
-
-#ifdef CONFIG_SMP
-static void sched_rt_period_stop_cpu(int cpu)
-{
-   struct rq *rq = cpu_rq(cpu);
-   struct rt_rq *rt_rq;
-
-   for_each_leaf_rt_rq(rt_rq, rq)
-   sched_rt_period_stop(rt_rq);
-}
-
-static int sched_rt_period_hotplug(struct notifier_block *nfb,
-   unsigned long action, void *hcpu)
-{
-   int cpu = (unsigned long)hcpu;
-
-   switch (action) {
-   case CPU_UP_PREPARE:
-   case CPU_UP_PREPARE_FROZEN:
-   case CPU_DOWN_FAILED:
-   case CPU_DOWN_FAILED_FROZEN:
-   sched_rt_period_start_cpu(cpu);
-   return NOTIFY_OK;
-
-   case CPU_DOWN_PREPARE:
-   case CPU_DOWN_PREPARE_FROZEN:
-   case CPU_UP_CANCELED:
-   case CPU_UP_CANCELED_FROZEN:
-   sched_rt_period_stop_cpu(cpu);
-   return NOTIFY_OK;
-
-   case CPU_ONLINE:
-   case CPU_ONLINE_FROZEN:
-   case CPU_DEAD:
-   case CPU_DEAD_FROZEN:
-   return NOTIFY_OK;
-
-   default:
-   return NOTIFY_DONE;
-   }
-
-   return NOTIFY_OK;
-}
-
-static void __init __sched_rt_period_init(void *arg)
-{
-   int cpu = smp_processor_id();
-   sched_rt_period_start_cpu(cpu);
-}
-
-static void __init sched_rt_period_init(void)
-{
-   on_each_cpu(__sched_rt_period_init, NULL, 0, 1);
-   hotcpu_notifier(sched_rt_period_hotplug, 0);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void __sched_rt_period_init_tg(void *arg)
-{
-   struct task_group *tg = arg;
-   int cpu = smp_processor_id();
-
-   sched_rt_period_start(tg->rt_rq[cpu]);
-}
-
-static void sched_rt_period_init_tg(struct task_group *tg)
-{
-   on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1);
-}
-
-static void __sched_rt_period_destroy_tg(void *arg)
-{
-   struct task_group *tg = arg;
-   int cpu = smp_processor_id();
-
-   sched_rt_period_stop(tg->rt_rq[cpu]);
-}
-
-static void sched_rt_period_destroy_tg(struct task_group *tg)
-{
-   on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1);
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#else /* CONFIG_SMP */
-static void __init sched_rt_period_init(void)
-{
-   sched_rt_period_start_cpu(0);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void sched_rt_period_init_tg(struct task_group *tg)
-{
-   sched_rt_period_start(tg->rt_rq[0]);
-}
-
-static void sched_rt_period_destroy_tg(struct task_group *tg)
-{
-   sched_rt_period_stop(tg->rt_rq[0]);
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#endif /* CONFIG_SMP */
-
 #ifdef CONFIG_SMP
 /*
  * This is how migration works:
@@ -7210,7 +7058,6 @@ void __init sched_init_smp(void)
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
-   sched_rt_period_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
if (nr_cpu_ids == 1)
@@ -7231,7 +7078,6 @@ void __init sched_init_smp(void)
 void __init

[PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback

2008-01-06 Thread Peter Zijlstra
Currently all highres=off timers are run from softirq context, but
HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context.

Fix this up by splitting it similar to the highres=on case.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/hrtimer.h |5 -
 kernel/hrtimer.c|  232 +---
 kernel/timer.c  |3 
 3 files changed, 125 insertions(+), 115 deletions(-)

Index: linux-2.6/kernel/hrtimer.c
===
--- linux-2.6.orig/kernel/hrtimer.c
+++ linux-2.6/kernel/hrtimer.c
@@ -622,6 +622,11 @@ static inline int hrtimer_cb_pending(str
 static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+   struct hrtimer_clock_base *base)
+{
+   return 0;
+}
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
@@ -1030,6 +1035,85 @@ int hrtimer_get_res(const clockid_t whic
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+   spin_lock_irq(&cpu_base->lock);
+
+   while (!list_empty(&cpu_base->cb_pending)) {
+   enum hrtimer_restart (*fn)(struct hrtimer *);
+   struct hrtimer *timer;
+   int restart;
+
+   timer = list_entry(cpu_base->cb_pending.next,
+  struct hrtimer, cb_entry);
+
+   timer_stats_account_hrtimer(timer);
+
+   fn = timer->function;
+   __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+   spin_unlock_irq(&cpu_base->lock);
+
+   restart = fn(timer);
+
+   spin_lock_irq(&cpu_base->lock);
+
+   timer->state &= ~HRTIMER_STATE_CALLBACK;
+   if (restart == HRTIMER_RESTART) {
+   BUG_ON(hrtimer_active(timer));
+   /*
+* Enqueue the timer, allow reprogramming of the event
+* device
+*/
+   enqueue_hrtimer(timer, timer->base, 1);
+   } else if (hrtimer_active(timer)) {
+   /*
+* If the timer was rearmed on another CPU, reprogram
+* the event device.
+*/
+   if (timer->base->first == &timer->node)
+   hrtimer_reprogram(timer, timer->base);
+   }
+   }
+   spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+   struct hrtimer_clock_base *base = timer->base;
+   struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+   enum hrtimer_restart (*fn)(struct hrtimer *);
+   int restart;
+
+   __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+   timer_stats_account_hrtimer(timer);
+
+   fn = timer->function;
+   if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+   /*
+* Used for scheduler timers, avoid lock inversion with
+* rq->lock and tasklist_lock.
+*
+* These timers are required to deal with enqueue expiry
+* themselves and are not allowed to migrate.
+*/
+   spin_unlock(&cpu_base->lock);
+   restart = fn(timer);
+   spin_lock(&cpu_base->lock);
+   } else
+   restart = fn(timer);
+
+   /*
+* Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+* reprogramming of the event hardware. This happens at the end of this
+* function anyway.
+*/
+   if (restart != HRTIMER_NORESTART) {
+   BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+   enqueue_hrtimer(timer, base, 0);
+   }
+   timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 /*
@@ -1063,9 +1147,7 @@ void hrtimer_interrupt(struct clock_even
basenow = ktime_add(now, base->offset);
 
while ((node = base->first)) {
-   enum hrtimer_restart (*fn)(struct hrtimer *);
struct hrtimer *timer;
-   int restart;
 
timer = rb_entry(node, struct hrtimer, node);
 
@@ -1089,37 +1171,7 @@ void hrtimer_interrupt(struct clock_even
continue;
}
 
-   __remove_hrtimer(timer, base,
-HRTIMER_STATE_CALLBACK, 0);
-   timer_stats_account_hrtimer(timer);
-
-   fn = timer->function;
-   if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ

[PATCH 06/11] sched: rt-group: reduce rescheduling

2008-01-06 Thread Peter Zijlstra
Only reschedule if the new group has a higher prio task.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sched_rt.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/kernel/sched_rt.c
===
--- linux-2.6.orig/kernel/sched_rt.c
+++ linux-2.6/kernel/sched_rt.c
@@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struc
struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+   struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
enqueue_rt_entity(rt_se);
-   resched_task(rq_of_rt_rq(rt_rq)->curr);
+   if (rt_rq->highest_prio < curr->prio)
+   resched_task(curr);
}
 }
 

--



[PATCH 07/11] sched: rt-group: per group period

2008-01-06 Thread Peter Zijlstra
Steven asked for per group periods in order to get closer to RMA or EDF
scheduling.

Use the fancy new hrtimers to provide a per group period

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h|2 
 kernel/sched.c   |  229 ++-
 kernel/sched_rt.c|   61 ++--
 kernel/sysctl.c  |2 
 kernel/time/tick-sched.c |5 -
 5 files changed, 237 insertions(+), 62 deletions(-)

Index: linux-2.6/kernel/sched.c
===
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -177,6 +177,7 @@ struct task_group {
struct rt_rq **rt_rq;
 
unsigned int rt_ratio;
+   ktime_t rt_period;
 
/*
 * shares assigned to a task group governs how much of cpu bandwidth
@@ -372,6 +373,7 @@ struct rt_rq {
 #endif
int rt_throttled;
u64 rt_time;
+   struct hrtimer rt_period_timer;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq;
@@ -441,8 +443,6 @@ struct rq {
 
struct cfs_rq cfs;
struct rt_rq rt;
-   u64 rt_period_expire;
-   int rt_throttled;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r
 #define task_rq(p) cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)  (cpu_rq(cpu)->curr)
 
-unsigned long rt_needs_cpu(int cpu)
-{
-   struct rq *rq = cpu_rq(cpu);
-   u64 delta;
-
-   if (!rq->rt_throttled)
-   return 0;
-
-   if (rq->clock > rq->rt_period_expire)
-   return 1;
-
-   delta = rq->rt_period_expire - rq->clock;
-   do_div(delta, NSEC_PER_SEC / HZ);
-
-   return (unsigned long)delta;
-}
-
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
@@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+const_debug unsigned int sysctl_sched_rt_period = 1000000;
 
 #define SCHED_RT_FRAC_SHIFT 16
 #define SCHED_RT_FRAC  (1UL << SCHED_RT_FRAC_SHIFT)
@@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #endif /* CONFIG_SMP */
 
+static inline ktime_t ns_to_ktime(u64 ns)
+{
+   static const ktime_t ktime_zero = { .tv64 = 0 };
+   return ktime_add_ns(ktime_zero, ns);
+}
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -3741,7 +3730,6 @@ void scheduler_tick(void)
rq->tick_timestamp = rq->clock;
update_cpu_load(rq);
curr->sched_class->task_tick(rq, curr, 0);
-   update_sched_rt_period(rq);
spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
@@ -5287,6 +5275,158 @@ static inline void sched_init_granularit
sysctl_sched_batch_wakeup_granularity *= factor;
 }
 
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
+{
+   struct rt_rq *rt_rq =
+   container_of(timer, struct rt_rq, rt_period_timer);
+   struct rq *rq = rq_of_rt_rq(rt_rq);
+   ktime_t now = ktime_get();
+
+   WARN_ON(smp_processor_id() != cpu_of(rq));
+   WARN_ON(!in_irq());
+
+   spin_lock(&rq->lock);
+   update_sched_rt_period(rt_rq);
+   spin_unlock(&rq->lock);
+
+   hrtimer_forward(timer, now, sched_rt_period(rt_rq));
+   return HRTIMER_RESTART;
+}
+
+static void sched_rt_period_start(struct rt_rq *rt_rq)
+{
+   ktime_t period = sched_rt_period(rt_rq);
+
+   WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq)));
+
+   for (;;) {
+   ktime_t now = ktime_get();
+   hrtimer_forward(&rt_rq->rt_period_timer, now, period);
+   hrtimer_start(&rt_rq->rt_period_timer,
+   rt_rq->rt_period_timer.expires,
+   HRTIMER_MODE_ABS);
+   if (hrtimer_active(&rt_rq->rt_period_timer))
+   break;
+   }
+}
+
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+static void sched_rt_period_stop(struct rt_rq *rt_rq)
+{
+   hrtimer_cancel(&rt_rq->rt_period_timer);
+}
+#endif
+
+static void sched_rt_period_start_cpu(int cpu)
+{
+   struct rq *rq = cpu_rq(cpu);
+   struct rt_rq *rt_rq;
+
+   for_each_leaf_rt_rq(rt_rq, rq)
+   sched_rt_period_start(rt_rq);
+}
+
+#ifdef CONFIG_SMP
+static void sched_rt_period_stop_cpu(int cpu)
+{
+   struct rq *rq = cpu_rq(cpu);
+   struct rt_rq *rt_rq;
+
+   for_each_leaf_rt_rq(rt_rq, rq)
+   sched_rt_period_stop(rt_rq);
+}
+
+static int sched_rt_period_hotplug(struct notifier_block *nfb,
+   unsigned long action, void *hcpu)
+{
+   int cpu

Re: [stable] [PATCH] lockdep: fix mismatched lockdep_depth/curr_chain_hash

2007-10-25 Thread Peter Zijlstra
On Mon, 2007-10-08 at 10:39 -0700, Greg KH wrote:
 On Mon, Oct 08, 2007 at 07:36:10PM +0200, Peter Zijlstra wrote:
  
  On Mon, 2007-10-08 at 10:24 -0700, Greg KH wrote:
   On Fri, Oct 05, 2007 at 11:31:26AM +0200, Peter Zijlstra wrote:

Stable team,

please consider this patch for the next 22-stable.
   
   I don't see this patch in Linus's upstream tree.  We need it there to be
   able to accept it for -stable.  Or is this just a bugfix of other things
   that are already in his tree?
  
  I sent Linus a similar patch, haven't seen him pick it up yet.
  I'll notify you when and if he picks it up.
 
 Great, that would be great for us -stable monkeys...

3aa416b07f0adf01c090baab26fb70c35ec17623




Re: aim7 -30% regression in 2.6.24-rc1

2007-10-26 Thread Peter Zijlstra
On Fri, 2007-10-26 at 17:43 +0800, Zhang, Yanmin wrote:
 I tested 2.6.24-rc1 on my x86_64 machine which has 2 quad-core processors.
 
 Comparing with 2.6.23, aim7 has about -30% regression. I did a bisect and 
 found
 patch 
 http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=b5869ce7f68b233ceb81465a7644be0d9a5f3dbb
 caused the issue.

Bit weird that you point to a merge commit, and not an actual patch. Are
you sure git bisect pointed at this one?




Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-26 Thread Peter Zijlstra
On Wed, 2007-10-03 at 15:35 +0200, Kay Sievers wrote:
 On Wed, 2007-10-03 at 12:37 +0200, Peter Zijlstra wrote:
  On Wed, 2007-10-03 at 12:15 +0200, Kay Sievers wrote:
   On Tue, 2007-10-02 at 22:05 +1000, Nick Piggin wrote:
On Tuesday 02 October 2007 21:40, Peter Zijlstra wrote:
 On Tue, 2007-10-02 at 13:21 +0200, Kay Sievers wrote:

  How about adding this information to the tree then, instead of
  creating a new top-level hack, just because something that you think
  you need doesn't exist.

 So you suggest adding all the various network filesystems in there
 (where?), and adding the concept of a BDI, and ensuring all are 
 properly
 linked together - somehow. Feel free to do so.

Would something fit better under /sys/fs/? At least filesystems are
already an existing concept to userspace.
   
   Sounds at least less messy than an new top-level directory.
   
   But again, if it's device releated, like the name suggests, it should
   be reachable from the device tree.
   Which userspace tool is supposed to set these values, and at what time?
   An init-script, something at device discovery/setup? If that is is ever
   going to be used in a hotplug setup, you really don't want to go look
   for directories with magic device names in another disconnected tree.
  
  Filesystems don't really map to BDIs either. One can have multiple FSs
  per BDI.
  
  'Normally' a BDI relates to a block device, but networked (and other
  non-block device) filesystems have to create a BDI too. So these need to
  be represented some place as well.
  
  The typical usage would indeed be init scripts. The typical example
  would be setting the read-ahead window. Currently that cannot be done
  for NFS mounts.
 
 What kind of context for a non-block based fs will get the bdi controls
 added? Is there a generic place, or does every non-block based
 filesystem needs to be adapted individually to use it?

---
Subject: bdi: debugfs interface

Expose the BDI stats (and readahead window) in /debug/bdi/

I'm still thinking it should go into /sys somewhere, however I just noticed
not all block devices that have a queue have a /queue directory. Notably
those that use make_request_fn() as opposed to request_fn(). And then of
course there are the non-block/non-queue BDIs.

A BDI is basically the object that represents the 'thing' you dirty pages
against. For block devices that is related to the block device (and is
typically embedded in the queue object), for NFS mounts it's the remote server
object of the client. For FUSE, yet again something else.

I appreciate the sysfs people's opinion that /sys/bdi/ might not be the
best from their POV; however, I'm not seeing where to hook the BDI object from
so that it all makes sense. A few of the things are currently not exposed in
sysfs at all, like the NFS and FUSE things.

So, for now, I've exposed the thing in debugfs. Please suggest a better
alternative.
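
The debugfs side is roughly the following shape (sketch only -- the
mm/backing-dev.c hunk isn't quoted here, and bdi->debug_dir plus
bdi_stats_fops are illustrative names):

static struct dentry *bdi_debug_root;	/* /debug/bdi/ */

static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
	/* one directory per BDI, named after the thing you dirty pages against */
	bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
	if (!bdi->debug_dir)
		return -ENOMEM;

	/* each stat/knob becomes a file, e.g. the stats and the readahead window */
	debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_stats_fops);
	return 0;
}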

Miklos, Trond: could you suggest a better fmt for the bdi_init_fmt() for your
respective filesystems?

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
CC: Miklos Szeredi [EMAIL PROTECTED]
CC: Trond Myklebust [EMAIL PROTECTED]
---
 block/genhd.c   |2 
 block/ll_rw_blk.c   |1 
 drivers/block/loop.c|7 ++
 drivers/md/dm.c |2 
 drivers/md/md.c |2 
 fs/fuse/inode.c |2 
 fs/nfs/client.c |2 
 include/linux/backing-dev.h |   15 
 include/linux/debugfs.h |   11 +++
 include/linux/writeback.h   |3 
 mm/backing-dev.c|  153 
 mm/page-writeback.c |2 
 12 files changed, 199 insertions(+), 3 deletions(-)

Index: linux-2.6-2/fs/fuse/inode.c
===================================================================
--- linux-2.6-2.orig/fs/fuse/inode.c
+++ linux-2.6-2/fs/fuse/inode.c
@@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void)
 	atomic_set(&fc->num_waiting, 0);
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	fc->bdi.unplug_io_fn = default_unplug_io_fn;
-	err = bdi_init(&fc->bdi);
+	err = bdi_init_fmt(&fc->bdi, "fuse-%p", fc);
 	if (err) {
 		kfree(fc);
 		fc = NULL;
Index: linux-2.6-2/fs/nfs/client.c
===================================================================
--- linux-2.6-2.orig/fs/nfs/client.c
+++ linux-2.6-2/fs/nfs/client.c
@@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, fsinfo);
-	error = bdi_init(&server->backing_dev_info);
+	error = bdi_init_fmt(&server->backing_dev_info, "nfs-%s-%p", clp->cl_hostname, server);
 	if (error)
 		goto out_error;
 
Index: linux-2.6-2/include/linux/backing-dev.h

Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-26 Thread Peter Zijlstra
On Fri, 2007-10-26 at 17:33 +0200, Kay Sievers wrote:
 On Fri, 2007-10-26 at 17:22 +0200, Peter Zijlstra wrote:
  On Fri, 2007-10-26 at 17:10 +0200, Kay Sievers wrote:
   On Fri, 2007-10-26 at 16:48 +0200, Peter Zijlstra wrote:
   
I appreciate the sysfs people their opinion that /sys/bdi/ might not be 
the
best from their POV, however I'm not seeing where to hook the BDI 
object from
so that it all makes sense, a few of the things are currently not 
exposed in
sysfs at all, like the NFS and FUSE things.
   
   What happended to the idea to create a bdi class, and have the
   existing devices as parents, and for stuff that is not (not now, or
   never) in sysfs, no parent is set.
  
  Must have forgotten about that, mainly because I'm not sure I fully
  understand it.
  
  So we create a class,
 
 Yes.
 
  create these objects,
 
 Yes, struct device objects, assigned to the bdi class. (Don't use
 class_device, that will be removed soon.)
 
  which are all called bdi
 
 Probably not. You can name it how you want, you can inherit the name of
 the parent, or prefix it with whatever fits, they just need to be
 unique. Things like the fuse-%llu name would work just fine. I guess
 you already solved that problem in the debugfs directory.
 
  and have children with these attributes in it.
 
 The attributes would just be files in the device object.
 
  Now, I supposed there is a directory that lists all unparented thingies,
  how do I locate the one that matches my nfs mount?
 
 You look for the name (prefix), try: ls /sys/class/sound/, it's the
 same model all over the place.

Ok, will try that. Is there a 'simple uncluttered' example I could look
at to copy from?
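
A minimal sketch of the model described above, using the driver-core calls as
they exist around 2.6.24; error unwinding is trimmed and the device/attribute
names are invented for illustration:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/module.h>

/* sketch: a 'bdi' class, one named device per object, attributes as files */
static struct class *example_bdi_class;

static ssize_t read_ahead_kb_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "%d\n", 128);	/* placeholder value */
}
static DEVICE_ATTR(read_ahead_kb, 0444, read_ahead_kb_show, NULL);

static int __init example_bdi_class_init(void)
{
	struct device *dev;

	example_bdi_class = class_create(THIS_MODULE, "bdi");
	if (IS_ERR(example_bdi_class))
		return PTR_ERR(example_bdi_class);

	/* NULL parent: the device shows up under /sys/devices/virtual/bdi/ */
	dev = device_create(example_bdi_class, NULL, MKDEV(0, 0), "nfs-example");
	if (IS_ERR(dev))
		return PTR_ERR(dev);

	return device_create_file(dev, &dev_attr_read_ahead_kb);
}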




Re: 2.6.24-rc1: First impressions

2007-10-26 Thread Peter Zijlstra
On Fri, 2007-10-26 at 17:22 +0200, Ingo Molnar wrote:
 * Martin Knoblauch [EMAIL PROTECTED] wrote:
 
  Hi ,
  
   just to give some feedback on 2.6.24-rc1. For some time I am tracking 
   IO/writeback problems that hurt system responsiveness big-time. I 
   tested Peters stuff together with Fenguangs additions and it looked 
   promising. Therefore I was very happy to see Peters stuff going into 
   2.6.24 and waited eagerly for rc1. In short, I am impressed. This 
   really looks good. IO throughput is great and I could not reproduce 
   the responsiveness problems so far.
  
   Below are a some numbers of my brute-force I/O tests that I can use 
   to bring responsiveness down. My platform is a HP/DL380g4, dual CPUs, 
   HT-enabled, 8 GB Memory, SmartaArray6i controller with 4x72GB SCSI 
   disks as RAID5 (battery protected writeback cahe enabled) and gigabit 
   networking (tg3). User space is 64-bit RHEL4.3
  
   I am basically doing copies using dd with 1MB blocksize. Local 
   Filesystem ist ext2 (noatime). IO-Scheduler is dealine, as it tends 
   to give best results. NFS3 Server is a Sun/T2000/Solaris10. The tests 
   are:
  
  dd1 - copy 16 GB from /dev/zero to local FS
  dd1-dir - same, but using O_DIRECT for output
  dd2/dd2-dir - copy 2x7.6 GB in parallel from /dev/zero to local FS
  dd3/dd3-dir - copy 3x5.2 GB in parallel from /dev/zero lo local FS
  net1 - copy 5.2 GB from NFS3 share to local FS
  mix3 - copy 3x5.2 GB from /dev/zero to local disk and two NFS3 shares
  
   I did the numbers for 2.6.19.2, 2.6.22.6 and 2.6.24-rc1. All units 
   are MB/sec.
  
  test      2.6.19.2   2.6.22.6   2.6.24-rc1
  
  dd1       28         50         96
  dd1-dir   88         88         86
  dd2       2x16.5     2x11       2x44.5
  dd2-dir   2x44       2x44       2x43
  dd3       3x9.8      3x8.7      3x30
  dd3-dir   3x29.5     3x29.5     3x28.5
  net1      30-33      50-55      37-52
  mix3      17/32      25/50      96/35 (disk/combined-network)
 
 wow, really nice results! Peter does know how to make stuff fast :) Now 
 lets pick up some of Peter's other, previously discarded patches as well
 :-)
 
 Such as the rewritten reclaim (clockpro) patches:
 
   http://programming.kicks-ass.net/kernel-patches/page-replace/

I think riel is taking over that stuff with his split vm and policies
per type.

 The improve-swap-performance (swap-token) patches:
 
   http://programming.kicks-ass.net/kernel-patches/swap_token/

Ashwin's version did get upstreamed.

 His enable-swap-over-NFS [and other complex IO transports] patches:
 
   http://programming.kicks-ass.net/kernel-patches/vm_deadlock/

Will post that one again soonish, esp. after Linus professed a liking for
swap over NFS.

I've been working on improving the changelogs and comments in that code.

latest code (somewhat raw, as rushed by ingo posting this) in:
http://programming.kicks-ass.net/kernel-patches/vm_deadlock/v2.6.23-mm1/

 And the concurrent pagecache patches:
 
   http://programming.kicks-ass.net/kernel-patches/concurrent-pagecache/
 
 as a starter :-) I think the MM should get out of deep-feature-freeze 
 mode - there's tons of room to improve :-/

Yeah, that one would be cool, but it depends on Nick getting his
lockless pagecache upstream. For those who don't know, both are in -rt
(and have been for some time) so it's not unproven code.




Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-26 Thread Peter Zijlstra
On Fri, 2007-10-26 at 17:10 +0200, Kay Sievers wrote:
 On Fri, 2007-10-26 at 16:48 +0200, Peter Zijlstra wrote:
 
  I appreciate the sysfs people their opinion that /sys/bdi/ might not be the
  best from their POV, however I'm not seeing where to hook the BDI object 
  from
  so that it all makes sense, a few of the things are currently not exposed in
  sysfs at all, like the NFS and FUSE things.
 
 What happended to the idea to create a bdi class, and have the
 existing devices as parents, and for stuff that is not (not now, or
 never) in sysfs, no parent is set.

Must have forgotten about that, mainly because I'm not sure I fully
understand it.

So we create a class, create these objects, which are all called bdi and
have children with these attributes in it.

Now, I supposed there is a directory that lists all unparented thingies,
how do I locate the one that matches my nfs mount?





Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-26 Thread Peter Zijlstra
This crashes and burns on bootup, but I'm too tired to figure out what I
did wrong... will give it another try tomorrow..


Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 block/genhd.c   |2 
 fs/fuse/inode.c |2 
 fs/nfs/client.c |2 
 include/linux/backing-dev.h |   33 
 include/linux/writeback.h   |3 +
 mm/backing-dev.c|  121 
 mm/page-writeback.c |2 
 7 files changed, 162 insertions(+), 3 deletions(-)

Index: linux-2.6-2/fs/fuse/inode.c
===================================================================
--- linux-2.6-2.orig/fs/fuse/inode.c
+++ linux-2.6-2/fs/fuse/inode.c
@@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void)
 	atomic_set(&fc->num_waiting, 0);
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	fc->bdi.unplug_io_fn = default_unplug_io_fn;
-	err = bdi_init(&fc->bdi);
+	err = bdi_init_fmt(&fc->bdi, "fuse-%llu", (unsigned long long)fc->id);
 	if (err) {
 		kfree(fc);
 		fc = NULL;
Index: linux-2.6-2/fs/nfs/client.c
===================================================================
--- linux-2.6-2.orig/fs/nfs/client.c
+++ linux-2.6-2/fs/nfs/client.c
@@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, fsinfo);
-	error = bdi_init(&server->backing_dev_info);
+	error = bdi_init_fmt(&server->backing_dev_info, "nfs-%s-%p", clp->cl_hostname, server);
 	if (error)
 		goto out_error;
 
Index: linux-2.6-2/include/linux/backing-dev.h
===================================================================
--- linux-2.6-2.orig/include/linux/backing-dev.h
+++ linux-2.6-2/include/linux/backing-dev.h
@@ -11,6 +11,8 @@
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
 #include <linux/proportions.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -48,11 +50,42 @@ struct backing_dev_info {
 
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
+
+#ifdef CONFIG_SYSFS
+	struct device kdev;
+#endif
 };
 
 int bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
+int __bdi_register(struct backing_dev_info *bdi);
+void bdi_unregister(struct backing_dev_info *bdi);
+
+#ifdef CONFIG_SYSFS
+#define bdi_init_fmt(bdi, fmt...)				\
+	({							\
+		int ret;					\
+		kobject_set_name(&(bdi)->kdev.kobj, ##fmt);	\
+		ret = bdi_init(bdi);				\
+		if (!ret) {					\
+			ret = __bdi_register(bdi);		\
+			if (ret)				\
+				bdi_destroy(bdi);		\
+		}						\
+		ret;						\
+	})
+
+#define bdi_register(bdi, fmt...)				\
+	({							\
+		kobject_set_name(&(bdi)->kdev.kobj, ##fmt);	\
+		__bdi_register(bdi);				\
+	})
+#else
+#define bdi_init_fmt(bdi, fmt...)	bdi_init(bdi)
+#define bdi_register(bdi, fmt...)	__bdi_register(bdi)
+#endif
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, s64 amount)
 {
Index: linux-2.6-2/include/linux/writeback.h
===================================================================
--- linux-2.6-2.orig/include/linux/writeback.h
+++ linux-2.6-2/include/linux/writeback.h
@@ -113,6 +113,9 @@ struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
 				      void __user *, size_t *, loff_t *);
 
+void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+		struct backing_dev_info *bdi);
+
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied);
Index: linux-2.6-2/mm/backing-dev.c
===================================================================
--- linux-2.6-2.orig/mm/backing-dev.c
+++ linux-2.6-2/mm/backing-dev.c
@@ -4,12 +4,130 @@
 #include <linux/fs.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/device.h>
+
+#ifdef CONFIG_SYSFS
+
+static void bdi_release(struct device *dev)
+{
+}
+
+static int bdi_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+	return 0;
+}
+
+static struct class bdi_class = {
+	.name

Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-26 Thread Peter Zijlstra

On Fri, 2007-10-26 at 22:04 +0200, Peter Zijlstra wrote:
 This crashes and burns on bootup, but I'm too tired to figure out what I
 did wrong... will give it another try tomorrow..

Ok, can't sleep.. took a look. I have several problems here.

The thing that makes it go *boom* is the __ATTR_NULL. Removing that makes it
boot, although it then warns me about multiple duplicate sysfs objects, all
named 'bdi'.

For some obscure reason this device interface insists on using the
bus_id as name (?!), and further reduces usability by limiting that to
20 odd characters.

This makes it quite useless. I tried fudging around that limit by using
device_rename and kobject_rename, but to no avail.

Really, it should not be this hard to use; trying to expose a handful of
simple integers to userspace should not take 8h+ and still not work.

Peter, who thinks sysfs is a contorted mess beyond his skill. I'll stick
to VM and scheduler code, which actually makes sense.



Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-27 Thread Peter Zijlstra
On Fri, 2007-10-26 at 19:40 -0700, Greg KH wrote:
 On Sat, Oct 27, 2007 at 03:18:08AM +0200, Peter Zijlstra wrote:
  
  On Fri, 2007-10-26 at 22:04 +0200, Peter Zijlstra wrote:
   This crashes and burns on bootup, but I'm too tired to figure out what I
   did wrong... will give it another try tomorrow..
  
  Ok, can't sleep.. took a look. I have several problems here.
  
  The thing that makes it go *boom* is the __ATTR_NULL. Removing that
  makes it boot. Albeit it then warns me of multiple duplicate sysfs
  objects, all named bdi.
  
  For some obscure reason this device interface insists on using the
  bus_id as name (?!), and further reduces usability by limiting that to
  20 odd characters.
  
  This makes it quite useless. I tried fudging around that limit by using
  device_rename and kobject_rename, but to no avail.
  
  Really, it should not be this hard to use, trying to expose a handfull
  of simple integers to userspace should not take 8h+ and still not work.
  
  Peter, who thinks sysfs is contorted mess beyond his skill. I'll stick
  to VM and scheduler code, that actually makes sense.
 
 Heh, that's funny :)
 
 I'll look at this and see what I can come up with.  Would you just like
 a whole new patch, or one against this one?

Sorry for the grumpy note, I get that way at 3.30 am. Maybe I ought not
have mailed :-/

This is the code I had at that time.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 block/genhd.c   |2 
 fs/fuse/inode.c |2 
 fs/nfs/client.c |2 
 include/linux/backing-dev.h |   21 ++
 include/linux/string.h  |4 +
 include/linux/writeback.h   |3 
 mm/backing-dev.c|  144 
 mm/page-writeback.c |2 
 mm/util.c   |   42 
 9 files changed, 219 insertions(+), 3 deletions(-)

Index: linux-2.6-2/fs/fuse/inode.c
===================================================================
--- linux-2.6-2.orig/fs/fuse/inode.c
+++ linux-2.6-2/fs/fuse/inode.c
@@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void)
 	atomic_set(&fc->num_waiting, 0);
 	fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 	fc->bdi.unplug_io_fn = default_unplug_io_fn;
-	err = bdi_init(&fc->bdi);
+	err = bdi_init_fmt(&fc->bdi, "bdi-fuse-%llu", (unsigned long long)fc->id);
 	if (err) {
 		kfree(fc);
 		fc = NULL;
Index: linux-2.6-2/fs/nfs/client.c
===================================================================
--- linux-2.6-2.orig/fs/nfs/client.c
+++ linux-2.6-2/fs/nfs/client.c
@@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s
 		goto out_error;
 
 	nfs_server_set_fsinfo(server, fsinfo);
-	error = bdi_init(&server->backing_dev_info);
+	error = bdi_init_fmt(&server->backing_dev_info, "bdi-nfs-%s-%p", clp->cl_hostname, server);
 	if (error)
 		goto out_error;
 
Index: linux-2.6-2/include/linux/backing-dev.h
===================================================================
--- linux-2.6-2.orig/include/linux/backing-dev.h
+++ linux-2.6-2/include/linux/backing-dev.h
@@ -11,6 +11,8 @@
 #include <linux/percpu_counter.h>
 #include <linux/log2.h>
 #include <linux/proportions.h>
+#include <linux/kernel.h>
+#include <linux/device.h>
 #include <asm/atomic.h>
 
 struct page;
@@ -48,11 +50,30 @@ struct backing_dev_info {
 
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
+
+#ifdef CONFIG_SYSFS
+	struct device kdev;
+#endif
 };
 
 int bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
 
+int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...);
+void bdi_unregister(struct backing_dev_info *bdi);
+
+#define bdi_init_fmt(bdi, fmt...)			\
+	({						\
+		int ret;				\
+		ret = bdi_init(bdi);			\
+		if (!ret) {				\
+			ret = bdi_register(bdi, ##fmt);	\
+			if (ret)			\
+				bdi_destroy(bdi);	\
+		}					\
+		ret;					\
+})
+
 static inline void __add_bdi_stat(struct backing_dev_info *bdi,
 		enum bdi_stat_item item, s64 amount)
 {
Index: linux-2.6-2/include/linux/writeback.h
===================================================================
--- linux-2.6-2.orig/include/linux/writeback.h
+++ linux-2.6-2/include/linux/writeback.h
@@ -113,6 +113,9 @@ struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file

Networked filesystems vs backing_dev_info

2007-10-27 Thread Peter Zijlstra
Hi,

I had me a little look at bdi usage in networked filesystems.

 NFS, CIFS, (smbfs), AFS, CODA and NCP

And of those, NFS is the only one that I could find that creates
backing_dev_info structures. The rest seems to fall back to
default_backing_dev_info.

With my recent per-BDI dirty limit patches the bdi has become more important
than it has been in the past. While falling back to the
default_backing_dev_info isn't wrong per se, it isn't right either.

Could I implore the various maintainers to look into this issue for their
respective filesystems? I'll try and come up with some patches to address
this, but feel free to beat me to it.

peterz



Re: BUG: lock held when returning to user space

2007-10-27 Thread Peter Zijlstra

On Sat, 2007-10-27 at 17:12 +0200, Jiri Kosina wrote:
 On Sat, 27 Oct 2007, Gabriel C wrote:
 
  I found that today in dmesg after booting current git ( 
  ec3b67c11df42362ccda81261d62829042f223f0 ) :
  ...
  [  592.752777]
  [  592.752781] 
  [  592.753478] [ BUG: lock held when returning to user space! ]
  [  592.753880] 
  [  592.754262] hwclock/1452 is leaving the kernel with locks still held!
  [  592.754655] 1 lock held by hwclock/1452:
  [  592.755007]  #0:  (rtc-char_lock){--..}, at: [c02a7ebb] 
  rtc_dev_open+0x2e/0x7e
 
 Yes, this is because rtc keeps a char_lock mutex locked as long as the 
 device is open, to avoid concurrent accessess.
 
 It could be easily substituted by some counting -- setting and clearing 
 bit in struct rtc_device instead of using char_lock, but doing this just 
 to shut the lockdep off is questionable imho.
 
 Peter, what is the preferred way to annotate these kinds of locking for 
 lockdep to express that it is intended?

Not sure, I'd not thought that anyone would actually want to do this.
I'm also not sure how I stand on this, I'd prefer to say: don't do this!

I think, in this case, the lock is associated with a kernel object that
is properly cleaned up if the holding tasks gets a SIGKILL. But in
general I'd like to see this kind of thing go away.

Now I could probably come up with an annotation to hide it, but what do
other people think, Ingo, Linus, Andrew, do we want to keep kernel locks
held over userspace?
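
For what it's worth, the counting/bit alternative mentioned above would be
something like the sketch below; the RTC_DEV_BUSY bit and the flags word are
assumptions for illustration, not existing rtc_device fields:

#include <linux/fs.h>
#include <linux/rtc.h>
#include <linux/bitops.h>

/* sketch: exclude concurrent opens without holding a mutex across syscalls */
#define RTC_DEV_BUSY	0	/* hypothetical bit in a hypothetical rtc->flags */

static int example_rtc_dev_open(struct inode *inode, struct file *file)
{
	struct rtc_device *rtc = container_of(inode->i_cdev,
					      struct rtc_device, char_dev);

	if (test_and_set_bit(RTC_DEV_BUSY, &rtc->flags))
		return -EBUSY;	/* the chardev is already open elsewhere */

	file->private_data = rtc;
	return 0;
}

static int example_rtc_dev_release(struct inode *inode, struct file *file)
{
	struct rtc_device *rtc = file->private_data;

	clear_bit(RTC_DEV_BUSY, &rtc->flags);
	return 0;
}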





Re: Networked filesystems vs backing_dev_info

2007-10-27 Thread Peter Zijlstra

On Sat, 2007-10-27 at 11:22 -0400, Jan Harkes wrote:
 On Sat, Oct 27, 2007 at 11:34:26AM +0200, Peter Zijlstra wrote:
  I had me a little look at bdi usage in networked filesystems.
  
   NFS, CIFS, (smbfs), AFS, CODA and NCP
  
  And of those, NFS is the only one that I could find that creates
  backing_dev_info structures. The rest seems to fall back to
  default_backing_dev_info.
 
 While a file is opened in Coda we associate the open file handle with a
 local cache file. All read and write operations are redirected to this
 local file and we even redirect inode-i_mapping. Actual reads and
 writes are completely handled by the underlying file system. We send the
 new file contents back to the servers only after all local references
 have been released (last-close semantics).
 
 As a result, there is no need for backing_dev_info structures in Coda,
 if any congestion control is needed it will be handled by the underlying
 file system where our locally cached copies are stored.

Ok, that works. Thanks for this explanation!



Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-27 Thread Peter Zijlstra

On Sat, 2007-10-27 at 09:02 -0700, Greg KH wrote:

 Ah, I see a few problems.  Here, try this version instead.  It's
 compile-tested only, and should be a lot simpler.
 
 Note, we still are not setting the parent to the new bdi structure
 properly, so the devices will show up in /sys/devices/virtual/ instead
 of in their proper location.  To do this, we need the parent of the
 device, which I'm not so sure what it should be (block device?  block
 device controller?)

The problem is that not every BDI has a parent represented in sysfs, hence
the class suggestion. For block devices it is indeed the block device
itself, but the NFS client's server descriptor, for example, does not have
a sysfs representation.

 Let me know if this works better, I'm off to a kids birthday party for
 the day, but will be around this evening...

Hehe, do enjoy! Thanks.




Re: BUG: lock held when returning to user space

2007-10-27 Thread Peter Zijlstra

On Sat, 2007-10-27 at 08:47 -0700, Arjan van de Ven wrote:
 On Sat, 27 Oct 2007 17:12:41 +0200 (CEST)
 Jiri Kosina [EMAIL PROTECTED] wrote:
 
  On Sat, 27 Oct 2007, Gabriel C wrote:
  
   I found that today in dmesg after booting current git ( 
   ec3b67c11df42362ccda81261d62829042f223f0 ) :
   ...
   [  592.752777]
   [  592.752781] 
   [  592.753478] [ BUG: lock held when returning to user space! ]
   [  592.753880] 
   [  592.754262] hwclock/1452 is leaving the kernel with locks still
   held! [  592.754655] 1 lock held by hwclock/1452:
   [  592.755007]  #0:  (rtc-char_lock){--..}, at: [c02a7ebb]
   rtc_dev_open+0x2e/0x7e
  
  Yes, this is because rtc keeps a char_lock mutex locked as long as
  the device is open, to avoid concurrent accessess.
  
  It could be easily substituted by some counting -- setting and
  clearing bit in struct rtc_device instead of using char_lock, but
  doing this just to shut the lockdep off is questionable imho.
 
 it's not about lockdep; what this code doing is not valid use of a
 mutex:
 A mutex is required to have a clear process as owner, and in this case
 it doesn't have that... at all. This is a violation of the kernel mutex
 semantics.. and should be fixed.

Right, the fd could be transferred using unix sockets or fork(). That
would indeed seriously break a mutex.




Re: [PATCH 1/2] irq_flags_t: intro and core annotations

2007-10-27 Thread Peter Zijlstra
On Sun, 2007-10-28 at 00:14 +0400, Alexey Dobriyan wrote:
 On Sat, Oct 27, 2007 at 09:20:43PM +0200, Roman Zippel wrote:
  On Sun, 21 Oct 2007, Alexey Dobriyan wrote:
  
   So far remedies were:
   a) grep(1) -- obviously fragile. I tried at some point grepping for
  spin_lock_irqsave(), found quite a few, but it became bring 
   quickly.
   b) BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long)) -- was tried,
  brutally broke some arches, survived one commit before revert :^)
  Doesn't work on i386 where sizeof(unsigned int) == sizeof(unsigned 
   long).
   
   So it would be nice to have something more robust.
  
  If it's just about the type checking, something like below should pretty 
  much do the same.
 
 It won't catch, the following if both variables are unsigned long:
 
   spin_lock_irqsave(lock, flags);
   [stuff]
   spin_unlock_irqrestore(lock, foo-flags);
 
 It won't catch static unsigned long flags;. With sparse, we can
 eventually mark type as on-stack only.

  +static __always_inline void __irq_flags_check(unsigned long *flags)
  +{
+   BUILD_BUG_ON(!__builtin_stack_addr(flags));
  +}
  +

obviously gcc doesn't (yet) support that __builtin function, but you
could make it work for sparse and define a dummy for gcc.



Re: Networked filesystems vs backing_dev_info

2007-10-27 Thread Peter Zijlstra
On Sat, 2007-10-27 at 16:02 -0500, Steve French wrote:
 On 10/27/07, Peter Zijlstra [EMAIL PROTECTED] wrote:
  Hi,
 
  I had me a little look at bdi usage in networked filesystems.
 
   NFS, CIFS, (smbfs), AFS, CODA and NCP
 
  And of those, NFS is the only one that I could find that creates
  backing_dev_info structures. The rest seems to fall back to
  default_backing_dev_info.
 
  With my recent per bdi dirty limit patches the bdi has become more
  important than it has been in the past. While falling back to the
  default_backing_dev_info isn't wrong per-se, it isn't right either.
 
  Could I implore the various maintainers to look into this issue for
  their respective filesystem. I'll try and come up with some patches to
  address this, but feel free to beat me to it.
 
 I would like to understand more about your patches to see what bdi
 values makes sense for CIFS and how to report possible congestion back
 to the page manager. 

So, what my recent patches do is carve up the total writeback cache size, or
dirty page limit as we call it, proportionally to a BDI's writeout speed. So
a fast device gets more than a slow device, but will not starve it.

However, for this to work, each device, or remote backing store in the case
of networked filesystems, needs to have a BDI.
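
As a rough illustration (not the actual implementation, which tracks a
floating per-BDI proportion of completions), the per-BDI threshold is the
global limit scaled by the BDI's share of recent writeout:

/*
 * Illustrative sketch only: scale the global dirty limit by this BDI's
 * share of recently completed writeback.  bdi_completions and
 * total_completions stand in for the floating proportion the real
 * patches maintain; overflow handling is omitted for clarity.
 */
static unsigned long example_bdi_dirty_limit(unsigned long dirty_total,
					     unsigned long bdi_completions,
					     unsigned long total_completions)
{
	if (!total_completions)
		return dirty_total;	/* no writeback history yet */

	/* a faster BDI completes more writeback and so gets a bigger slice */
	return dirty_total * bdi_completions / total_completions;
}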

   I had been thinking about setting bdi-ra_pages
 so that we do more sensible readahead and writebehind - better
 matching what is possible over the network and what the server
 prefers.  

Well, you'd first have to create backing_dev_info instances before
setting that value :-)

   SMB/CIFS Servers typically allow a maximum of 50 requests
 in parallel at one time from one client (although this is adjustable
 for some).

That seems like a perfect point to set congestion.

So in short, stick a struct backing_dev_info into whatever represents a
client, initialize it using bdi_init(), destroy using bdi_destroy().

Mark it congested once you have 50 (or more) outstanding requests, clear
congestion when you drop below 50.

and you should be set.
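
For CIFS that could look roughly like the sketch below. The server structure,
in-flight counter and request limit are invented names; bdi_init()/bdi_destroy()
and set_bdi_congested()/clear_bdi_congested() are the existing helpers from
<linux/backing-dev.h>:

#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

/* sketch: one BDI per remote server, congestion tied to in-flight requests */
struct example_cifs_server {
	struct backing_dev_info	bdi;
	atomic_t		in_flight;
	int			max_requests;	/* typically 50 */
};

static int example_cifs_server_init(struct example_cifs_server *server)
{
	server->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
	server->bdi.unplug_io_fn = default_unplug_io_fn;
	return bdi_init(&server->bdi);	/* bdi_destroy() on teardown */
}

static void example_cifs_request_sent(struct example_cifs_server *server)
{
	if (atomic_inc_return(&server->in_flight) >= server->max_requests)
		set_bdi_congested(&server->bdi, WRITE);
}

static void example_cifs_request_done(struct example_cifs_server *server)
{
	if (atomic_dec_return(&server->in_flight) < server->max_requests)
		clear_bdi_congested(&server->bdi, WRITE);
}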





Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)

2007-10-27 Thread Peter Zijlstra
On Sat, 2007-10-27 at 23:08 +0200, Kay Sievers wrote:
 On Sat, 2007-10-27 at 09:02 -0700, Greg KH wrote:

  Ah, I see a few problems.  Here, try this version instead.  It's
  compile-tested only, and should be a lot simpler.
  
  Note, we still are not setting the parent to the new bdi structure
  properly, so the devices will show up in /sys/devices/virtual/ instead
  of in their proper location.  To do this, we need the parent of the
  device, which I'm not so sure what it should be (block device?  block
  device controller?)
 
 Assigning a parent device will only work with the upcoming conversion of
 the raw kobjects in the block subsystem to struct device.
 
 A few comments to the patch:
 
  --- a/include/linux/string.h
  +++ b/include/linux/string.h
  @@ -8,6 +8,7 @@
   #include linux/compiler.h/* for inline */
   #include linux/types.h   /* for size_t */
   #include linux/stddef.h  /* for NULL */
  +#include stdarg.h
   
   #ifdef __cplusplus
   extern C {
  @@ -111,6 +112,9 @@ extern void *kmemdup(const void *src, si
   extern char **argv_split(gfp_t gfp, const char *str, int *argcp);
   extern void argv_free(char **argv);
   
  +char *kvprintf(const char *fmt, va_list args);
  +char *kprintf(const char *fmt, ...);
 
 Why is that here? I don't think we need this when we use the existing:
   kvasprintf(GFP_KERNEL, fmt, args)

Ignorance of the existence of said function. Thanks for pointing it out.
(kobject_set_name ought to use it too, I guess.)

  --- a/mm/backing-dev.c
  +++ b/mm/backing-dev.c
 
  +
  +static struct device_attribute bdi_dev_attrs[] = {
  +   __ATTR(readahead, 0644, readahead_show, readahead_store),
  +   __ATTR_RO(reclaimable),
  +   __ATTR_RO(writeback),
  +   __ATTR_RO(dirty),
  +   __ATTR_RO(bdi_dirty),
  +};
 
 Default attributes will need the NULL termination back (see below).
 
  +static __init int bdi_class_init(void)
  +{
  +   bdi_class = class_create(THIS_MODULE, bdi);
  +   return 0;
  +}
  +
  +__initcall(bdi_class_init);
  +
  +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
 
 This function should accept a: struct device *parent and all callers
 just pass NULL until the block layer conversion gets merged.

Yeah, you're right, but I wanted to just get something working before
bothering with the parent thing.

  +{
  +   char *name;
  +   va_list args;
  +   int ret = -ENOMEM;
  +   int i;
  +
  +   va_start(args, fmt);
  +   name = kvprintf(fmt, args);
 
 kvasprintf(GFP_KERNEL, fmt, args);
 
  +   va_end(args);
  +
  +   if (!name)
  +   return -ENOMEM;
  +
  +   bdi-dev = device_create(bdi_class, NULL, MKDEV(0,0), name);
 
 The parent should be passed here.
 
  +   for (i = 0; i  ARRAY_SIZE(bdi_dev_attrs); i++) {
  +   ret = device_create_file(bdi-dev, bdi_dev_attrs[i]);
  +   if (ret)
  +   break;
  +   }
  +   if (ret) {
  +   while (--i = 0)
  +   device_remove_file(bdi-dev, bdi_dev_attrs[i]);
  +   device_unregister(bdi-dev);
  +   bdi-dev = NULL;
  +   }
 
 All this open-coded attribute stuff should go away and be replaced by:
   bdi_class-dev_attrs = bdi_dev_attrs;
 Otherwise at event time the attributes are not created and stuff hooking
 into the events will not be able to set values. Also, the core will do
 proper add/remove and error handling then.

ok, that's good to know. someone ought to write a book on how to use all
this... really, even the functions are bare of documentation or
comments.



Re: Networked filesystems vs backing_dev_info

2007-10-27 Thread Peter Zijlstra
On Sat, 2007-10-27 at 23:30 +0200, Peter Zijlstra wrote:
 On Sat, 2007-10-27 at 16:02 -0500, Steve French wrote:
  On 10/27/07, Peter Zijlstra [EMAIL PROTECTED] wrote:
   Hi,
  
   I had me a little look at bdi usage in networked filesystems.
  
NFS, CIFS, (smbfs), AFS, CODA and NCP
  
   And of those, NFS is the only one that I could find that creates
   backing_dev_info structures. The rest seems to fall back to
   default_backing_dev_info.
  
   With my recent per bdi dirty limit patches the bdi has become more
   important than it has been in the past. While falling back to the
   default_backing_dev_info isn't wrong per-se, it isn't right either.
  
   Could I implore the various maintainers to look into this issue for
   their respective filesystem. I'll try and come up with some patches to
   address this, but feel free to beat me to it.
  
  I would like to understand more about your patches to see what bdi
  values makes sense for CIFS and how to report possible congestion back
  to the page manager. 
 
 So, what my recent patches do is carve up the total writeback cache
 size, or dirty page limit as we call it, proportionally to a BDIs
 writeout speed. So a fast device gets more than a slow device, but will
 not starve it.
 
 However, for this to work, each device, or remote backing store in the
 case of networked filesystems, need to have a BDI.
 
I had been thinking about setting bdi-ra_pages
  so that we do more sensible readahead and writebehind - better
  matching what is possible over the network and what the server
  prefers.  
 
 Well, you'd first have to create backing_dev_info instances before
 setting that value :-)
 
SMB/CIFS Servers typically allow a maximum of 50 requests
  in parallel at one time from one client (although this is adjustable
  for some).
 
 That seems like a perfect point to set congestion.
 
 So in short, stick a struct backing_dev_info into whatever represents a
 client, initialize it using bdi_init(), destroy using bdi_destroy().

Oh, and the most important point, make your fresh I_NEW inodes point to
this bdi struct.
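
A minimal sketch of that last step, assuming the filesystem can get at its
per-client BDI wherever it sets up a new inode:

/* sketch: make dirty/writeback accounting for this inode hit the client's
 * BDI instead of default_backing_dev_info */
static void example_set_inode_bdi(struct inode *inode,
				  struct backing_dev_info *bdi)
{
	inode->i_mapping->backing_dev_info = bdi;
}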

 Mark it congested once you have 50 (or more) outstanding requests, clear
 congestion when you drop below 50.
 
 and you should be set.
 



Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-29 Thread Peter Zijlstra
On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote:
 On 10/29/07, Andrew Morton [EMAIL PROTECTED] wrote:
  On Mon, 22 Oct 2007 16:40:57 +0200 Stefani Seibold [EMAIL PROTECTED] 
  wrote:
  
   The problem original occurs with the fb_defio driver 
   (driver/video/fb_defio.c).
   This driver use the vm_ops.page_mkwrite() handler for tracking the 
   modified pages,
   which will be in an extra thread handled, to perform the IO and clean and
   write protect all pages with page_clean().
  
 
 Hi,
 
 An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255.
 
 I understood from the thread that PeterZ is looking into page_mkclean
 changes which I guess went into 2.6.23. I'm also happy to help in any
 way if the way we're doing fb_defio needs to change.

Yeah, it's the truncate race stuff introduced by Nick in
  d0217ac04ca6591841e5665f518e38064f4e65bd

I'm a bit at a loss on how to go about fixing this. One ugly idea I had was
to check page->mapping before going into page_mkwrite() and, when that is
NULL, not bother with the truncate check.





Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-29 Thread Peter Zijlstra
On Mon, 2007-10-29 at 11:11 +0100, Peter Zijlstra wrote:
 On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote:
  On 10/29/07, Andrew Morton [EMAIL PROTECTED] wrote:
   On Mon, 22 Oct 2007 16:40:57 +0200 Stefani Seibold [EMAIL PROTECTED] 
   wrote:
   
The problem original occurs with the fb_defio driver 
(driver/video/fb_defio.c).
This driver use the vm_ops.page_mkwrite() handler for tracking the 
modified pages,
which will be in an extra thread handled, to perform the IO and clean 
and
write protect all pages with page_clean().
   
  
  Hi,
  
  An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255.
  
  I understood from the thread that PeterZ is looking into page_mkclean
  changes which I guess went into 2.6.23. I'm also happy to help in any
  way if the way we're doing fb_defio needs to change.
 
 Yeah, its the truncate race stuff introduced by Nick in
   d0217ac04ca6591841e5665f518e38064f4e65bd
 
 I'm a bit at a loss on how to go around fixing this. One ugly idea I had
 was to check page-mapping before going into page_mkwrite() and when
 that is null, don't bother with the truncate check.

Something like this

---
 mm/memory.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -2300,6 +2300,8 @@ static int __do_fault(struct mm_struct *
 		 * to become writable
 		 */
 		if (vma->vm_ops->page_mkwrite) {
+			struct address_space *mapping = page->mapping;
+
 			unlock_page(page);
 			if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
 				ret = VM_FAULT_SIGBUS;
@@ -2314,7 +2316,7 @@ static int __do_fault(struct mm_struct *
 			 * reworking page_mkwrite locking API, which
 			 * is better done later.
 			 */
-			if (!page->mapping) {
+			if (mapping != page->mapping) {
 				ret = 0;
 				anon = 1; /* no anon but release vmf.page */
 				goto out;




Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-29 Thread Peter Zijlstra
On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote:
 On 10/29/07, Andrew Morton [EMAIL PROTECTED] wrote:
  On Mon, 22 Oct 2007 16:40:57 +0200 Stefani Seibold [EMAIL PROTECTED] 
  wrote:
  
   The problem original occurs with the fb_defio driver 
   (driver/video/fb_defio.c).
   This driver use the vm_ops.page_mkwrite() handler for tracking the 
   modified pages,
   which will be in an extra thread handled, to perform the IO and clean and
   write protect all pages with page_clean().
  

 An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255.
 
 I understood from the thread that PeterZ is looking into page_mkclean
 changes which I guess went into 2.6.23. I'm also happy to help in any
 way if the way we're doing fb_defio needs to change.

OK, seems I can't read. Or at least, I missed a large part of the
problem.

page_mkclean() hasn't changed, it was ->page_mkwrite() that changed. And
looking at the fb_defio code, I'm not sure I understand how its
page_mkclean() use could ever have worked.

The proposed patch [1] only fixes the issue of ->page_mkwrite() on
vmalloc()'ed memory. Not page_mkclean(), and that has never worked from
what I can make of it.

Jaya, could you shed some light on this? I presume you had your display
working.


[1] which I will clean up and resend after this issue is cleared up -
and preferably tested by someone who has this hardware.



Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-29 Thread Peter Zijlstra
On Mon, 2007-10-29 at 13:51 -0400, Jaya Kumar wrote:
 On 10/29/07, Peter Zijlstra [EMAIL PROTECTED] wrote:
  On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote:
   An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255.
  
   I understood from the thread that PeterZ is looking into page_mkclean
   changes which I guess went into 2.6.23. I'm also happy to help in any
   way if the way we're doing fb_defio needs to change.
 
  OK, seems I can't read. Or at least, I missed a large part of the
  problem.
 
  page_mkclean() hasn't changed, it was -page_mkwrite() that changed. And
  looking at the fb_defio code, I'm not sure I understand how its
  page_mkclean() use could ever have worked.
 
  The proposed patch [1] only fixes the issue of -page_mkwrite() on
  vmalloc()'ed memory. Not page_mkclean(), and that has never worked from
  what I can make of it.
 
  Jaya, could you shed some light on this? I presume you had your display
  working.
 
 
 I thought I had it working. I saw the display update after each
 mmap/write sequence to the framebuffer. I need to check if there's an
 munmap or anything else going on in between write sequences that would
 cause it to behave like page_mkclean was working.
 
 Is it correct to assume that page_mkclean should mark the pages
 read-only so that the next write would again trigger mkwrite?

Well, yes, that is the intended behaviour.

  Even if the page was from a vmalloc_to_page()?

That is the crux, I only ever implemented it for file pages.





Re: [stable] 2.6.23 regression: top displaying 9999% CPU usage

2007-10-29 Thread Peter Zijlstra

On Mon, 2007-10-29 at 21:41 +0100, Ingo Molnar wrote:
 * Christian Borntraeger [EMAIL PROTECTED] wrote:
 
   - return clock_t_to_cputime(utime);
   + p-prev_utime = max(p-prev_utime, clock_t_to_cputime(utime));
   + return p-prev_utime;
}
  [...]
  
  I dont think it will work. It will make utime monotic, but stime can 
  still decrease. For example let sum_exec_runtime increase by a tiny 
  little bit while utime will get a full additional tick. stime is 
  sum-utime. So stime can still go backwards. So I think that we need 
  this kind of logic for stime as well, no?
 
 yeah, probably. Peter?

/me dons the brown paper bag while mumbling an agreement of sorts.

I'll not attempt to come up with a patch as I fear I'll just make a
bigger mess in my current state, hope to feel better tomorrow..



Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-29 Thread Peter Zijlstra

On Mon, 2007-10-29 at 19:17 +0100, Peter Zijlstra wrote:
 On Mon, 2007-10-29 at 13:51 -0400, Jaya Kumar wrote:
  On 10/29/07, Peter Zijlstra [EMAIL PROTECTED] wrote:
   On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote:
An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255.
   
I understood from the thread that PeterZ is looking into page_mkclean
changes which I guess went into 2.6.23. I'm also happy to help in any
way if the way we're doing fb_defio needs to change.
  
   OK, seems I can't read. Or at least, I missed a large part of the
   problem.
  
   page_mkclean() hasn't changed, it was -page_mkwrite() that changed. And
   looking at the fb_defio code, I'm not sure I understand how its
   page_mkclean() use could ever have worked.
  
   The proposed patch [1] only fixes the issue of -page_mkwrite() on
   vmalloc()'ed memory. Not page_mkclean(), and that has never worked from
   what I can make of it.
  
   Jaya, could you shed some light on this? I presume you had your display
   working.
  
  
  I thought I had it working. I saw the display update after each
  mmap/write sequence to the framebuffer. I need to check if there's an
  munmap or anything else going on in between write sequences that would
  cause it to behave like page_mkclean was working.
  
  Is it correct to assume that page_mkclean should mark the pages
  read-only so that the next write would again trigger mkwrite?
 
 Well, yes, that is the intended behaviour.
 
   Even if the page was from a vmalloc_to_page()?
 
 That is the crux, I only ever implemented it for file pages.

Hmm, so these vmalloc pages are mapped into user-space with
remap_pfn_range(), which doesn't have any form of rmap. That is, given a
pfn there is no way to obtain all ptes for it. So the interface to
page_mkclean() could never work for these (as it only provides a struct
page *).

[ also, remap_vmalloc_range() suffers similar issues, only file and anon
  have proper rmap ]

I'm not sure we want full rmap for remap_pfn/vmalloc_range, but perhaps
we could assist drivers in maintaining and using vma lists.

I think page_mkclean_one() would work if you'd manually set page->index
and iterate the vmas yourself. Although atm I'm not sure of anything, so
don't pin me on it.



Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-30 Thread Peter Zijlstra
On Mon, 2007-10-29 at 21:22 -0400, Jaya Kumar wrote:
 On 10/29/07, Peter Zijlstra [EMAIL PROTECTED] wrote:
 
  [ also, remap_vmalloc_range() suffers similar issues, only file and anon
have proper rmap ]
 
  I'm not sure we want full rmap for remap_pfn/vmalloc_range, but perhaps
  we could assist drivers in maintaining and using vma lists.
 
  I think page_mkclean_one() would work if you'd manually set page-index
  and iterate the vmas yourself. Although atm I'm not sure of anything so
  don't pin me on it.
 
 :-) If it's anybody's fault, it's mine for not testing properly. My bad.
 
 In the case of defio, I think it's no trouble to build a list of vmas
 at mmap time and then to iterate through them when it's ready for
 mkclean time as you suggested. I don't fully understand page-index
 yet. I had thought it was only used by swap cache or file map.
 
 On an unrelated note, I was looking for somewhere to stuff a 16 bit
 offset (so that I have a cheap way to know which struct page
 corresponds to which framebuffer block or offset) for another driver.
 I had thought page-index was it but I think I am wrong now.

Yeah, page->index is used along with vma->vm_pgoff and vma->vm_start to
determine the address of the page in the given vma:

  address = vma->vm_start + ((page->index - vma->vm_pgoff) << PAGE_SHIFT);

and from that address the pte can be found by walking the vma->vm_mm
page tables.

So page->index does what you want it to: identify which part of the
framebuffer this particular page belongs to.
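
Spelled out as a helper, the formula plus a range check looks roughly like
this (a sketch of what mm/rmap.c does internally, not a drop-in function):

/* sketch: user-space address of @page inside @vma, or -EFAULT if outside */
static unsigned long example_vma_address(struct page *page,
					 struct vm_area_struct *vma)
{
	unsigned long address;

	address = vma->vm_start +
		((page->index - vma->vm_pgoff) << PAGE_SHIFT);
	if (address < vma->vm_start || address >= vma->vm_end)
		return -EFAULT;	/* this vma does not map the page */
	return address;
}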




Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-30 Thread Peter Zijlstra
On Tue, 2007-10-30 at 12:39 +, Hugh Dickins wrote:
 On Tue, 30 Oct 2007, Stefani Seibold wrote:
  
  the question is how can i get all pte's from a vmalloc'ed memory. Due to
  the zeroed mapping pointer i dont see how to do this?
 
 The mapping pointer is zeroed because you've done nothing to set it.
 Below is how I answered you a week ago.  But this is new territory
 (extending page_mkclean to work on more than just pagecache pages),
 I'm still unsure what would be the safest way to do it.

Quite, I think manual usage of page_mkclean_one() on the vma gotten from
mmap() along with properly setting page-index is the simplest solution
to make work.

Making page_mkclean(struct page *) work for remap_pfn/vmalloc_range()
style mmaps would require extending rmap to work with those, which
includes setting page-mapping to point to a anon_vma like object.

But that sounds like a lot of work, and I'm not sure its worth the
overhead, because so far all users of remap_pfn/vmalloc_range() have
survived without.






Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-30 Thread Peter Zijlstra
On Tue, 2007-10-30 at 09:16 -0400, Jaya Kumar wrote:
 On 10/30/07, Peter Zijlstra [EMAIL PROTECTED] wrote:
  So page-index does what you want it to, identify which part of the
  framebuffer this particular page belongs to.
 
 Ok. I'm attempting to walk the code sequence. Here's what I think:
 
 - driver loads
 - driver vmalloc()s its fb
 - this creates the necessary pte entries

well, one set thereof, the kernel mappings, which for this purpose are
the least interesting.

 then...
 - app mmap(/dev/fb0)
 - vma is created
 - defio mmap adds this vma to private list (equivalent of
 address_space or anon_vma)

 - app touches base + pixel(128,128) = base + 16k
 - page fault
 - defio nopage gets called
 - defio nopage does vmalloc_to_page(base+16k)

this installs a user space page table entry for your page; this is the
interesting one as it carries the user-dirty state.

 - that finds the correct struct page corresponding to that vaddr.
 page-index has not been set by anyone so far, right?
 * ah... i see, you are suggesting that this is where I could set the
 index since i know the offset i want it to represent. right?

Not quite, you would set that right after vmallocing: just set an
increasing page->index, starting with 0 for the first page.

Then ensure your vma->vm_pgoff is 0 (which should be the case since
userspace will most likely mmap the whole thing, and if not it still
gets what it expects).
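
A sketch of that indexing step, assuming the framebuffer was allocated with
vmalloc() and fb_size is its length in bytes:

/* sketch: give every vmalloc'ed framebuffer page a linear page->index */
static void example_index_fb_pages(void *fb_mem, size_t fb_size)
{
	unsigned long i;

	for (i = 0; i < fb_size >> PAGE_SHIFT; i++) {
		struct page *page = vmalloc_to_page(fb_mem + i * PAGE_SIZE);

		page->index = i;	/* page offset within the framebuffer */
	}
}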

 - defio mkwrite get called. defio adds page to its list. schedules delayed 
 work
 - app keeps writing the page
 - delayed work occurs
 - foreach vma { foreach page { page_mkclean_one(page, vma) }

Yeah, page_mkclean_one(page, vma) will use vma_address() to obtain a
user-space address for the page in this vma, using page->index and the
formula from the last email; this address is then used to walk the page
tables and obtain a pte.

This will be the user-space pte installed by your nopfn handler, not the
kernel vmap pte resulting from the vmalloc() call.

 - cycle repeats...






Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23

2007-10-30 Thread Peter Zijlstra
On Tue, 2007-10-30 at 15:47 +, Hugh Dickins wrote:
 On Tue, 30 Oct 2007, Peter Zijlstra wrote:
  On Tue, 2007-10-30 at 09:16 -0400, Jaya Kumar wrote:
 
   - defio mmap adds this vma to private list (equivalent of
   address_space or anon_vma)
 
   - foreach vma { foreach page { page_mkclean_one(page, vma) }
  
  Yeah, page_mkclean_one(page, vma) will use vma_address() to obtain an
  user-space address for the page in this vma using page-index and the
  formula from the last email, this address is then used to walk the page
  tables and obtain a pte.
 
 I don't understand why you suggested an anon_vma, nor why Jaya is
 suggesting a private list.  All vmas mapping /dev/fb0 will be kept
 in the prio_tree rooted in its struct address_space (__vma_link_file
 in mm/mmap.c).  And page_mkclean gets page_mkclean_file to walk that
 very tree.  The missing part is just the setting of page-mapping to
 point to that struct address_space (and clearing it before finally
 freeing the pages), and the setting of page-index as you described.
 Isn't it?

Hmm, there is a thought. I had not considered that mapping a chardev
would have that effect.

I'd have to have a look at the actual code, but yeah, that might very
well work out. How silly of me.

Thanks!
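
Roughly, the scheme described above could look like this in a defio-style
driver; the function names are made up, while vmalloc_to_page(), lock_page()
and page_mkclean() are the existing interfaces:

/* sketch: let page_mkclean() find the user mappings through the chardev's
 * address_space rather than a private vma list */
static void example_defio_setup_pages(struct address_space *mapping,
				      void *fb_mem, size_t fb_size)
{
	unsigned long i;

	for (i = 0; i < fb_size >> PAGE_SHIFT; i++) {
		struct page *page = vmalloc_to_page(fb_mem + i * PAGE_SIZE);

		page->mapping = mapping;	/* cleared again before the fb is freed */
		page->index = i;
	}
}

/* delayed work: after writing a dirty page out, write-protect it again */
static void example_defio_clean_page(struct page *page)
{
	lock_page(page);
	page_mkclean(page);	/* walks mapping->i_mmap for all vmas of the fb */
	unlock_page(page);
}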




[PATCH 32/33] nfs: fix various memory recursions possible with swap over NFS.

2007-10-30 Thread Peter Zijlstra
GFP_NOFS is not enough, since swap traffic is IO, hence fall back to GFP_NOIO.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/nfs/pagelist.c |2 +-
 fs/nfs/write.c|6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

Index: linux-2.6/fs/nfs/write.c
===================================================================
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -44,7 +44,7 @@ static struct kmem_cache *nfs_wdata_cach
 
 struct nfs_write_data *nfs_commit_alloc(void)
 {
-	struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS);
+	struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOIO);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -68,7 +68,7 @@ void nfs_commit_free(struct nfs_write_da
 
 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
-	struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS);
+	struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOIO);
 
 	if (p) {
 		memset(p, 0, sizeof(*p));
@@ -77,7 +77,7 @@ struct nfs_write_data *nfs_writedata_all
 		if (pagecount <= ARRAY_SIZE(p->page_array))
 			p->pagevec = p->page_array;
 		else {
-			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS);
+			p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOIO);
 			if (!p->pagevec) {
 				kmem_cache_free(nfs_wdata_cachep, p);
 				p = NULL;
Index: linux-2.6/fs/nfs/pagelist.c
===================================================================
--- linux-2.6.orig/fs/nfs/pagelist.c
+++ linux-2.6/fs/nfs/pagelist.c
@@ -27,7 +27,7 @@ static inline struct nfs_page *
 nfs_page_alloc(void)
 {
 	struct nfs_page	*p;
-	p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL);
+	p = kmem_cache_alloc(nfs_page_cachep, GFP_NOIO);
 	if (p) {
 		memset(p, 0, sizeof(*p));
 		INIT_LIST_HEAD(&p->wb_list);

--



[PATCH 05/33] mm: kmem_estimate_pages()

2007-10-30 Thread Peter Zijlstra
Provide a method to get the upper bound on the pages needed to allocate
a given number of objects from a given kmem_cache.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/slab.h |3 +
 mm/slub.c|   82 +++
 2 files changed, 85 insertions(+)

Index: linux-2.6/include/linux/slab.h
===================================================================
--- linux-2.6.orig/include/linux/slab.h
+++ linux-2.6/include/linux/slab.h
@@ -60,6 +60,7 @@ void kmem_cache_free(struct kmem_cache *
 unsigned int kmem_cache_size(struct kmem_cache *);
 const char *kmem_cache_name(struct kmem_cache *);
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr);
+unsigned kmem_estimate_pages(struct kmem_cache *cachep, gfp_t flags, int objects);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -94,6 +95,8 @@ int kmem_ptr_validate(struct kmem_cache 
 void * __must_check krealloc(const void *, size_t, gfp_t);
 void kfree(const void *);
 size_t ksize(const void *);
+unsigned kestimate_single(size_t, gfp_t, int);
+unsigned kestimate(gfp_t, size_t);
 
 /*
  * Allocator specific definitions. These are mainly used to establish optimized
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -2293,6 +2293,37 @@ const char *kmem_cache_name(struct kmem_
 EXPORT_SYMBOL(kmem_cache_name);
 
 /*
+ * return the max number of pages required to allocate count
+ * objects from the given cache
+ */
+unsigned kmem_estimate_pages(struct kmem_cache *s, gfp_t flags, int objects)
+{
+	unsigned long slabs;
+
+	if (WARN_ON(!s) || WARN_ON(!s->objects))
+		return 0;
+
+	slabs = DIV_ROUND_UP(objects, s->objects);
+
+	/*
+	 * Account the possible additional overhead if the slab holds more than
+	 * one object.
+	 */
+	if (s->objects > 1) {
+		/*
+		 * Account the possible additional overhead if per cpu slabs
+		 * are currently empty and have to be allocated. This is very
+		 * unlikely but a possible scenario immediately after
+		 * kmem_cache_shrink.
+		 */
+		slabs += num_online_cpus();
+	}
+
+	return slabs << s->order;
+}
+EXPORT_SYMBOL_GPL(kmem_estimate_pages);
+
+/*
  * Attempt to free all slabs on a node. Return the number of slabs we
  * were unable to free.
  */
@@ -2630,6 +2661,57 @@ void kfree(const void *x)
 EXPORT_SYMBOL(kfree);
 
 /*
+ * return the max number of pages required to allocate @count objects
+ * of @size bytes from kmalloc given @flags.
+ */
+unsigned kestimate_single(size_t size, gfp_t flags, int count)
+{
+	struct kmem_cache *s = get_slab(size, flags);
+	if (!s)
+		return 0;
+
+	return kmem_estimate_pages(s, flags, count);
+
+}
+EXPORT_SYMBOL_GPL(kestimate_single);
+
+/*
+ * return the max number of pages required to allocate @bytes from kmalloc
+ * in an unspecified number of allocations of heterogeneous size.
+ */
+unsigned kestimate(gfp_t flags, size_t bytes)
+{
+	int i;
+	unsigned long pages;
+
+	/*
+	 * multiply by two, in order to account the worst case slack space
+	 * due to the power-of-two allocation sizes.
+	 */
+	pages = DIV_ROUND_UP(2 * bytes, PAGE_SIZE);
+
+	/*
+	 * add the kmem_cache overhead of each possible kmalloc cache
+	 */
+	for (i = 1; i < PAGE_SHIFT; i++) {
+		struct kmem_cache *s;
+
+#ifdef CONFIG_ZONE_DMA
+		if (unlikely(flags & SLUB_DMA))
+			s = dma_kmalloc_cache(i, flags);
+		else
+#endif
+			s = kmalloc_caches[i];
+
+		if (s)
+			pages += kmem_estimate_pages(s, flags, 0);
+	}
+
+	return pages;
+}
+EXPORT_SYMBOL_GPL(kestimate);
+
+/*
  * kmem_cache_shrink removes empty slabs from the partial lists and sorts
  * the remaining slabs by the number of items in use. The slabs with the
  * most items in use come first. New allocations will then fill those up

--



[PATCH 24/33] mm: prepare swap entry methods for use in page methods

2007-10-30 Thread Peter Zijlstra
Move around the swap entry methods in preparation for use from
page methods.

Also provide a function to obtain the swap_info_struct backing
a swap cache page.
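
As a quick illustration (not part of this patch), the new helper lets later
code reach the file backing a swapcache page; the snippet below is only a
sketch and assumes the page is already known to be PageSwapCache:

	/* sketch: find the address_space backing a swapcache page */
	struct swap_info_struct *sis = page_swap_info(page);
	struct address_space *mapping = sis->swap_file->f_mapping;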

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/mm.h  |8 
 include/linux/swap.h|   48 
 include/linux/swapops.h |   44 
 mm/swapfile.c   |1 +
 4 files changed, 57 insertions(+), 44 deletions(-)

Index: linux-2.6/include/linux/mm.h
===
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -12,6 +12,7 @@
 #include linux/prio_tree.h
 #include linux/debug_locks.h
 #include linux/mm_types.h
+#include linux/swap.h
 
 struct mempolicy;
 struct anon_vma;
@@ -573,6 +574,13 @@ static inline struct address_space *page
return mapping;
 }
 
+static inline struct swap_info_struct *page_swap_info(struct page *page)
+{
+   swp_entry_t swap = { .val = page_private(page) };
+   BUG_ON(!PageSwapCache(page));
+   return get_swap_info_struct(swp_type(swap));
+}
+
 static inline int PageAnon(struct page *page)
 {
return ((unsigned long)page-mapping  PAGE_MAPPING_ANON) != 0;
Index: linux-2.6/include/linux/swap.h
===
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -80,6 +80,50 @@ typedef struct {
 } swp_entry_t;
 
 /*
+ * swapcache pages are stored in the swapper_space radix tree.  We want to
+ * get good packing density in that tree, so the index should be dense in
+ * the low-order bits.
+ *
+ * We arrange the `type' and `offset' fields so that `type' is at the five
+ * high-order bits of the swp_entry_t and `offset' is right-aligned in the
+ * remaining bits.
+ *
+ * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
+ */
+#define SWP_TYPE_SHIFT(e)  (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
+#define SWP_OFFSET_MASK(e) ((1UL  SWP_TYPE_SHIFT(e)) - 1)
+
+/*
+ * Store a type+offset into a swp_entry_t in an arch-independent format
+ */
+static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
+{
+   swp_entry_t ret;
+
+   ret.val = (type  SWP_TYPE_SHIFT(ret)) |
+   (offset  SWP_OFFSET_MASK(ret));
+   return ret;
+}
+
+/*
+ * Extract the `type' field from a swp_entry_t.  The swp_entry_t is in
+ * arch-independent format
+ */
+static inline unsigned swp_type(swp_entry_t entry)
+{
+   return (entry.val  SWP_TYPE_SHIFT(entry));
+}
+
+/*
+ * Extract the `offset' field from a swp_entry_t.  The swp_entry_t is in
+ * arch-independent format
+ */
+static inline pgoff_t swp_offset(swp_entry_t entry)
+{
+   return entry.val  SWP_OFFSET_MASK(entry);
+}
+
+/*
  * current-reclaim_state points to one of these when a task is running
  * memory reclaim
  */
@@ -326,6 +370,10 @@ static inline int valid_swaphandles(swp_
return 0;
 }
 
+static inline struct swap_info_struct *get_swap_info_struct(unsigned type)
+{
+   return NULL;
+}
 #define can_share_swap_page(p) (page_mapcount(p) == 1)
 
 static inline int move_to_swap_cache(struct page *page, swp_entry_t entry)
Index: linux-2.6/include/linux/swapops.h
===
--- linux-2.6.orig/include/linux/swapops.h
+++ linux-2.6/include/linux/swapops.h
@@ -1,48 +1,4 @@
 /*
- * swapcache pages are stored in the swapper_space radix tree.  We want to
- * get good packing density in that tree, so the index should be dense in
- * the low-order bits.
- *
- * We arrange the `type' and `offset' fields so that `type' is at the five
- * high-order bits of the swp_entry_t and `offset' is right-aligned in the
- * remaining bits.
- *
- * swp_entry_t's are *never* stored anywhere in their arch-dependent format.
- */
-#define SWP_TYPE_SHIFT(e)  (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
-#define SWP_OFFSET_MASK(e) ((1UL  SWP_TYPE_SHIFT(e)) - 1)
-
-/*
- * Store a type+offset into a swp_entry_t in an arch-independent format
- */
-static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset)
-{
-   swp_entry_t ret;
-
-   ret.val = (type  SWP_TYPE_SHIFT(ret)) |
-   (offset  SWP_OFFSET_MASK(ret));
-   return ret;
-}
-
-/*
- * Extract the `type' field from a swp_entry_t.  The swp_entry_t is in
- * arch-independent format
- */
-static inline unsigned swp_type(swp_entry_t entry)
-{
-   return (entry.val  SWP_TYPE_SHIFT(entry));
-}
-
-/*
- * Extract the `offset' field from a swp_entry_t.  The swp_entry_t is in
- * arch-independent format
- */
-static inline pgoff_t swp_offset(swp_entry_t entry)
-{
-   return entry.val  SWP_OFFSET_MASK(entry);
-}
-
-/*
  * Convert the arch-dependent pte representation of a swp_entry_t into an
  * arch-independent swp_entry_t.
  */
Index: linux-2.6/mm/swapfile.c

[PATCH 03/33] mm: slub: add knowledge of reserve pages

2007-10-30 Thread Peter Zijlstra
Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
contexts that are entitled to it.

Care is taken to only touch the SLUB slow path.

This is done to ensure reserve pages don't leak out and get consumed.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/slub_def.h |1 +
 mm/slub.c|   31 +++
 2 files changed, 24 insertions(+), 8 deletions(-)

Index: linux-2.6/mm/slub.c
===
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -20,11 +20,12 @@
 #include linux/mempolicy.h
 #include linux/ctype.h
 #include linux/kallsyms.h
+#include internal.h
 
 /*
  * Lock order:
  *   1. slab_lock(page)
- *   2. slab-list_lock
+ *   2. node-list_lock
  *
  *   The slab_lock protects operations on the object of a particular
  *   slab and its metadata in the page struct. If the slab lock
@@ -1074,7 +1075,7 @@ static void setup_object(struct kmem_cac
s-ctor(s, object);
 }
 
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
 {
struct page *page;
struct kmem_cache_node *n;
@@ -1090,6 +1091,7 @@ static struct page *new_slab(struct kmem
if (!page)
goto out;
 
+   *reserve = page-reserve;
n = get_node(s, page_to_nid(page));
if (n)
atomic_long_inc(n-nr_slabs);
@@ -1468,10 +1470,22 @@ static void *__slab_alloc(struct kmem_ca
 {
void **object;
struct page *new;
+   int reserve = 0;
 
if (!c-page)
goto new_slab;
 
+   if (unlikely(c-reserve)) {
+   /*
+* If the current slab is a reserve slab and the current
+* allocation context does not allow access to the reserves
+* we must force an allocation to test the current levels.
+*/
+   if (!(gfp_to_alloc_flags(gfpflags)  ALLOC_NO_WATERMARKS))
+   goto alloc_slab;
+   reserve = 1;
+   }
+
slab_lock(c-page);
if (unlikely(!node_match(c, node)))
goto another_slab;
@@ -1479,10 +1493,9 @@ load_freelist:
object = c-page-freelist;
if (unlikely(!object))
goto another_slab;
-   if (unlikely(SlabDebug(c-page)))
+   if (unlikely(SlabDebug(c-page) || reserve))
goto debug;
 
-   object = c-page-freelist;
c-freelist = object[c-offset];
c-page-inuse = s-objects;
c-page-freelist = NULL;
@@ -1500,16 +1513,18 @@ new_slab:
goto load_freelist;
}
 
+alloc_slab:
if (gfpflags  __GFP_WAIT)
local_irq_enable();
 
-   new = new_slab(s, gfpflags, node);
+   new = new_slab(s, gfpflags, node, reserve);
 
if (gfpflags  __GFP_WAIT)
local_irq_disable();
 
if (new) {
c = get_cpu_slab(s, smp_processor_id());
+   c-reserve = reserve;
if (c-page) {
/*
 * Someone else populated the cpu_slab while we
@@ -1537,8 +1552,7 @@ new_slab:
}
return NULL;
 debug:
-   object = c-page-freelist;
-   if (!alloc_debug_processing(s, c-page, object, addr))
+	if (SlabDebug(c->page) && !alloc_debug_processing(s, c->page, object, addr))
goto another_slab;
 
c-page-inuse++;
@@ -2010,10 +2024,11 @@ static struct kmem_cache_node *early_kme
 {
struct page *page;
struct kmem_cache_node *n;
+   int reserve;
 
BUG_ON(kmalloc_caches-size  sizeof(struct kmem_cache_node));
 
-   page = new_slab(kmalloc_caches, gfpflags, node);
+   page = new_slab(kmalloc_caches, gfpflags, node, reserve);
 
BUG_ON(!page);
if (page_to_nid(page) != node) {
Index: linux-2.6/include/linux/slub_def.h
===
--- linux-2.6.orig/include/linux/slub_def.h
+++ linux-2.6/include/linux/slub_def.h
@@ -17,6 +17,7 @@ struct kmem_cache_cpu {
int node;
unsigned int offset;
unsigned int objsize;
+   int reserve;
 };
 
 struct kmem_cache_node {

[PATCH 16/33] netvm: network reserve infrastructure

2007-10-30 Thread Peter Zijlstra
Provide the basic infrastructure to reserve and charge/account network memory.

We provide the following reserve tree:

1)  total network reserve
2)network TX reserve
3)  protocol TX pages
4)network RX reserve
5)  SKB data reserve

[1] is used to make all the network reserves a single subtree, for easy
manipulation.

[2] and [4] are merely for aesthetic reasons.

The TX pages reserve [3] is assumed to be bounded, since it is sized to the upper
bound of memory that can be used for sending pages (not quite true, but good enough).

The SKB reserve [5] is an aggregate reserve, which is used to charge SKB data
against in the fallback path.

The consumers for these reserves are sockets marked with:
  SOCK_MEMALLOC

Such sockets are to be used to service the VM (iow. to swap over). They
must be handled kernel-side; exposing such a socket to user-space is a BUG.
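
For illustration only, a kernel-side user (say a network swap client) would
flip its transport socket into this mode roughly as sketched below; the
surrounding helper names are assumptions, while sk_set_memalloc() and
sk_clear_memalloc() are the interface added here:

	/* sketch: mark a kernel-internal transport socket as VM-critical */
	static int swapdev_mark_socket(struct socket *sock)
	{
		return sk_set_memalloc(sock->sk);	/* sets SOCK_MEMALLOC */
	}

	static void swapdev_unmark_socket(struct socket *sock)
	{
		sk_clear_memalloc(sock->sk);
	}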

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/net/sock.h |   35 +++-
 net/Kconfig|3 +
 net/core/sock.c|  113 +
 3 files changed, 150 insertions(+), 1 deletion(-)

Index: linux-2.6/include/net/sock.h
===
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -50,6 +50,7 @@
 #include linux/skbuff.h  /* struct sk_buff */
 #include linux/mm.h
 #include linux/security.h
+#include linux/reserve.h
 
 #include linux/filter.h
 
@@ -397,6 +398,7 @@ enum sock_flags {
SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */
SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+   SOCK_MEMALLOC, /* the VM depends on us - make sure we're serviced */
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -419,9 +421,40 @@ static inline int sock_flag(struct sock 
return test_bit(flag, sk-sk_flags);
 }
 
+static inline int sk_has_memalloc(struct sock *sk)
+{
+   return sock_flag(sk, SOCK_MEMALLOC);
+}
+
+/*
+ * Guestimate the per request queue TX upper bound.
+ *
+ * Max packet size is 64k, and we need to reserve that much since the data
+ * might need to be bounced. Double it to be on the safe side.
+ */
+#define TX_RESERVE_PAGES DIV_ROUND_UP(2*65536, PAGE_SIZE)
+
+extern atomic_t memalloc_socks;
+
+extern struct mem_reserve net_rx_reserve;
+extern struct mem_reserve net_skb_reserve;
+
+static inline int sk_memalloc_socks(void)
+{
+   return atomic_read(memalloc_socks);
+}
+
+extern int rx_emergency_get(int bytes);
+extern int rx_emergency_get_overcommit(int bytes);
+extern void rx_emergency_put(int bytes);
+
+extern int sk_adjust_memalloc(int socks, long tx_reserve_pages);
+extern int sk_set_memalloc(struct sock *sk);
+extern int sk_clear_memalloc(struct sock *sk);
+
 static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
 {
-   return gfp_mask;
+   return gfp_mask | (sk-sk_allocation  __GFP_MEMALLOC);
 }
 
 static inline void sk_acceptq_removed(struct sock *sk)
Index: linux-2.6/net/core/sock.c
===
--- linux-2.6.orig/net/core/sock.c
+++ linux-2.6/net/core/sock.c
@@ -112,6 +112,7 @@
 #include linux/tcp.h
 #include linux/init.h
 #include linux/highmem.h
+#include linux/reserve.h
 
 #include asm/uaccess.h
 #include asm/system.h
@@ -213,6 +214,111 @@ __u32 sysctl_rmem_default __read_mostly 
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
+atomic_t memalloc_socks;
+
+static struct mem_reserve net_reserve;
+struct mem_reserve net_rx_reserve;
+struct mem_reserve net_skb_reserve;
+static struct mem_reserve net_tx_reserve;
+static struct mem_reserve net_tx_pages;
+
+EXPORT_SYMBOL_GPL(net_rx_reserve); /* modular ipv6 only */
+EXPORT_SYMBOL_GPL(net_skb_reserve); /* modular ipv6 only */
+
+/*
+ * is there room for another emergency packet?
+ */
+static int __rx_emergency_get(int bytes, bool overcommit)
+{
+   return mem_reserve_kmalloc_charge(net_skb_reserve, bytes, overcommit);
+}
+
+int rx_emergency_get(int bytes)
+{
+   return __rx_emergency_get(bytes, false);
+}
+
+int rx_emergency_get_overcommit(int bytes)
+{
+   return __rx_emergency_get(bytes, true);
+}
+
+void rx_emergency_put(int bytes)
+{
+   mem_reserve_kmalloc_charge(net_skb_reserve, -bytes, 0);
+}
+
+/**
+ * sk_adjust_memalloc - adjust the global memalloc reserve for critical RX
+ * @socks: number of new %SOCK_MEMALLOC sockets
+ * @tx_reserve_pages: number of pages to (un)reserve for TX
+ *
+ * This function adjusts the memalloc reserve based on system demand.
+ * The RX reserve is a limit, and only added once, not for each socket.
+ *
+ * NOTE:
+ *@tx_reserve_pages is an upper-bound of memory used for TX hence
+ *we need not account the pages like we do for RX pages

[PATCH 08/33] mm: emergency pool

2007-10-30 Thread Peter Zijlstra
Provide a means to reserve a specific number of pages.

The emergency pool is separated from the min watermark because ALLOC_HARDER
and ALLOC_HIGH modify the watermark in a relative way and thus do not ensure
a strict minimum.
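
A minimal usage sketch of the new interface (the surrounding functions are
hypothetical): a subsystem grows the emergency pool before it starts relying
on reserve allocations and shrinks it again when done.

	/* sketch: reserve/release emergency pages around a critical phase */
	static int example_setup_reserve(int nr_pages)
	{
		return adjust_memalloc_reserve(nr_pages);
	}

	static void example_teardown_reserve(int nr_pages)
	{
		adjust_memalloc_reserve(-nr_pages);
	}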

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/mmzone.h |3 +
 mm/page_alloc.c|   82 +++--
 mm/vmstat.c|6 +--
 3 files changed, 78 insertions(+), 13 deletions(-)

Index: linux-2.6/include/linux/mmzone.h
===
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -213,7 +213,7 @@ enum zone_type {
 
 struct zone {
/* Fields commonly accessed by the page allocator */
-   unsigned long   pages_min, pages_low, pages_high;
+   unsigned long   pages_emerg, pages_min, pages_low, pages_high;
/*
	 * We don't know if the memory that we're going to allocate will be freeable
	 * or/and it will be released eventually, so to avoid totally wasting several
@@ -682,6 +682,7 @@ int sysctl_min_unmapped_ratio_sysctl_han
struct file *, void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
struct file *, void __user *, size_t *, loff_t *);
+int adjust_memalloc_reserve(int pages);
 
 extern int numa_zonelist_order_handler(struct ctl_table *, int,
struct file *, void __user *, size_t *, loff_t *);
Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -118,6 +118,8 @@ static char * const zone_names[MAX_NR_ZO
 
 static DEFINE_SPINLOCK(min_free_lock);
 int min_free_kbytes = 1024;
+static DEFINE_MUTEX(var_free_mutex);
+int var_free_kbytes;
 
 unsigned long __meminitdata nr_kernel_pages;
 unsigned long __meminitdata nr_all_pages;
@@ -1252,7 +1254,7 @@ int zone_watermark_ok(struct zone *z, in
if (alloc_flags  ALLOC_HARDER)
min -= min / 4;
 
-	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+	if (free_pages <= min + z->lowmem_reserve[classzone_idx] + z->pages_emerg)
return 0;
for (o = 0; o  order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -1733,8 +1735,8 @@ nofail_alloc:
 nopage:
 	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
 		printk(KERN_WARNING "%s: page allocation failure."
-			" order:%d, mode:0x%x\n",
-			p->comm, order, gfp_mask);
+			" order:%d, mode:0x%x, alloc_flags:0x%x, pflags:0x%x\n",
+			p->comm, order, gfp_mask, alloc_flags, p->flags);
dump_stack();
show_mem();
}
@@ -1952,9 +1954,9 @@ void show_free_areas(void)
\n,
zone-name,
K(zone_page_state(zone, NR_FREE_PAGES)),
-   K(zone-pages_min),
-   K(zone-pages_low),
-   K(zone-pages_high),
+   K(zone-pages_emerg + zone-pages_min),
+   K(zone-pages_emerg + zone-pages_low),
+   K(zone-pages_emerg + zone-pages_high),
K(zone_page_state(zone, NR_ACTIVE)),
K(zone_page_state(zone, NR_INACTIVE)),
K(zone-present_pages),
@@ -4113,7 +4115,7 @@ static void calculate_totalreserve_pages
}
 
/* we treat pages_high as reserved pages. */
-   max += zone-pages_high;
+   max += zone-pages_high + zone-pages_emerg;
 
if (max  zone-present_pages)
max = zone-present_pages;
@@ -4170,7 +4172,8 @@ static void setup_per_zone_lowmem_reserv
  */
 static void __setup_per_zone_pages_min(void)
 {
-   unsigned long pages_min = min_free_kbytes  (PAGE_SHIFT - 10);
+   unsigned pages_min = min_free_kbytes  (PAGE_SHIFT - 10);
+   unsigned pages_emerg = var_free_kbytes  (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
@@ -4182,11 +4185,13 @@ static void __setup_per_zone_pages_min(v
}
 
for_each_zone(zone) {
-   u64 tmp;
+   u64 tmp, tmp_emerg;
 
spin_lock_irqsave(zone-lru_lock, flags);
tmp = (u64)pages_min * zone-present_pages;
do_div(tmp, lowmem_pages);
+   tmp_emerg = (u64)pages_emerg * zone-present_pages;
+   do_div(tmp_emerg, lowmem_pages);
if (is_highmem(zone)) {
/*
 * __GFP_HIGH and PF_MEMALLOC allocations usually don't

[PATCH 30/33] nfs: swap vs nfs_writepage

2007-10-30 Thread Peter Zijlstra
For now just use the ->writepage() path for swap traffic. Trond would like
to see ->swap_page() or some such additional a_op.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/nfs/write.c |   23 +++
 1 file changed, 23 insertions(+)

Index: linux-2.6/fs/nfs/write.c
===
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -336,6 +336,29 @@ static int nfs_do_writepage(struct page 
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
 
+   if (unlikely(IS_SWAPFILE(inode))) {
+   struct rpc_cred *cred;
+   struct nfs_open_context *ctx;
+   int status;
+
+   cred = rpcauth_lookupcred(NFS_CLIENT(inode)-cl_auth, 0);
+   if (IS_ERR(cred))
+   return PTR_ERR(cred);
+
+   ctx = nfs_find_open_context(inode, cred, FMODE_WRITE);
+   if (!ctx)
+   return -EBADF;
+
+		status = nfs_writepage_setup(ctx, page, 0, nfs_page_length(page));
+
+   put_nfs_open_context(ctx);
+
+   if (status  0) {
+   nfs_set_pageerror(page);
+   return status;
+   }
+   }
+
nfs_pageio_cond_complete(pgio, page-index);
return nfs_page_async_flush(pgio, page);
 }

[PATCH 26/33] mm: methods for teaching filesystems about PG_swapcache pages

2007-10-30 Thread Peter Zijlstra
In order to teach filesystems to handle swap cache pages, two new page
functions are introduced:

  pgoff_t page_file_index(struct page *);
  struct address_space *page_file_mapping(struct page *);

page_file_index - gives the offset of this page in the file in PAGE_CACHE_SIZE
blocks. Like page->index is for mapped pages, this function also gives the
correct index for PG_swapcache pages.

page_file_mapping - gives the mapping backing the actual page; that is for
swap cache pages it will give swap_file->f_mapping.

page_offset() is modified to use page_file_index(), so that it will give the
expected result, even for PG_swapcache pages.
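
To illustrate the intended call pattern (the writepage body below is
hypothetical, only the two helpers are from this patch), a filesystem that
wants to handle swapcache pages uses the new helpers instead of touching
page->mapping and page->index directly:

	/* sketch: swapcache-aware use of the new page helpers */
	static int example_writepage(struct page *page, struct writeback_control *wbc)
	{
		struct inode *inode = page_file_mapping(page)->host;
		loff_t pos = (loff_t)page_file_index(page) << PAGE_CACHE_SHIFT;

		/* correct for both regular pagecache and PG_swapcache pages */
		return example_write_range(inode, page, pos, wbc);
	}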

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/mm.h  |   26 ++
 include/linux/pagemap.h |2 +-
 2 files changed, 27 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/mm.h
===
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include linux/debug_locks.h
 #include linux/mm_types.h
 #include linux/swap.h
+#include linux/fs.h
 
 struct mempolicy;
 struct anon_vma;
@@ -581,6 +582,16 @@ static inline struct swap_info_struct *p
return get_swap_info_struct(swp_type(swap));
 }
 
+static inline
+struct address_space *page_file_mapping(struct page *page)
+{
+#ifdef CONFIG_SWAP_FILE
+   if (unlikely(PageSwapCache(page)))
+   return page_swap_info(page)-swap_file-f_mapping;
+#endif
+   return page-mapping;
+}
+
 static inline int PageAnon(struct page *page)
 {
return ((unsigned long)page-mapping  PAGE_MAPPING_ANON) != 0;
@@ -598,6 +609,21 @@ static inline pgoff_t page_index(struct 
 }
 
 /*
+ * Return the file index of the page. Regular pagecache pages use -index
+ * whereas swapcache pages use swp_offset(-private)
+ */
+static inline pgoff_t page_file_index(struct page *page)
+{
+#ifdef CONFIG_SWAP_FILE
+   if (unlikely(PageSwapCache(page))) {
+   swp_entry_t swap = { .val = page_private(page) };
+   return swp_offset(swap);
+   }
+#endif
+   return page-index;
+}
+
+/*
  * The atomic page-_mapcount, like _count, starts from -1:
  * so that transitions both from it and to it can be tracked,
  * using atomic_inc_and_test and atomic_add_negative(-1).
Index: linux-2.6/include/linux/pagemap.h
===
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -145,7 +145,7 @@ extern void __remove_from_page_cache(str
  */
 static inline loff_t page_offset(struct page *page)
 {
-   return ((loff_t)page-index)  PAGE_CACHE_SHIFT;
+   return ((loff_t)page_file_index(page))  PAGE_CACHE_SHIFT;
 }
 
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,

[PATCH 02/33] mm: tag reserve pages

2007-10-30 Thread Peter Zijlstra
Tag pages allocated from the reserves with a non-zero page->reserve.
This allows us to distinguish and account reserve pages.
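
As a sketch of how a caller can consume the tag (the accounting hook is
hypothetical; note that page->reserve shares a union with page->index, so it
is only meaningful right after allocation):

	/* sketch: detect that an allocation came from the reserves */
	struct page *page = alloc_page(GFP_ATOMIC);

	if (page && page->reserve)
		example_account_reserve_page(page);	/* hypothetical accounting */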

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/mm_types.h |1 +
 mm/page_alloc.c  |4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/mm_types.h
===
--- linux-2.6.orig/include/linux/mm_types.h
+++ linux-2.6/include/linux/mm_types.h
@@ -70,6 +70,7 @@ struct page {
union {
pgoff_t index;  /* Our offset within mapping. */
void *freelist; /* SLUB: freelist req. slab lock */
+   int reserve;/* page_alloc: page is a reserve page */
};
struct list_head lru;   /* Pageout list, eg. active_list
 * protected by zone-lru_lock !
Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1448,8 +1448,10 @@ zonelist_scan:
}
 
page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-   if (page)
+   if (page) {
+   page-reserve = !!(alloc_flags  ALLOC_NO_WATERMARKS);
break;
+   }
 this_zone_full:
if (NUMA_BUILD)
zlc_mark_zone_full(zonelist, z);

[PATCH 28/33] nfs: teach the NFS client how to treat PG_swapcache pages

2007-10-30 Thread Peter Zijlstra
Replace all relevant occurrences of page->index and page->mapping in the NFS
client with the new page_file_index() and page_file_mapping() functions.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/nfs/file.c |8 
 fs/nfs/internal.h |7 ---
 fs/nfs/pagelist.c |6 +++---
 fs/nfs/read.c |6 +++---
 fs/nfs/write.c|   49 +
 5 files changed, 39 insertions(+), 37 deletions(-)

Index: linux-2.6/fs/nfs/file.c
===
--- linux-2.6.orig/fs/nfs/file.c
+++ linux-2.6/fs/nfs/file.c
@@ -357,7 +357,7 @@ static void nfs_invalidate_page(struct p
if (offset != 0)
return;
/* Cancel any unstarted writes on this page */
-   nfs_wb_page_cancel(page-mapping-host, page);
+   nfs_wb_page_cancel(page_file_mapping(page)-host, page);
 }
 
 static int nfs_release_page(struct page *page, gfp_t gfp)
@@ -368,7 +368,7 @@ static int nfs_release_page(struct page 
 
 static int nfs_launder_page(struct page *page)
 {
-   return nfs_wb_page(page-mapping-host, page);
+   return nfs_wb_page(page_file_mapping(page)-host, page);
 }
 
 const struct address_space_operations nfs_file_aops = {
@@ -397,13 +397,13 @@ static int nfs_vm_page_mkwrite(struct vm
loff_t offset;
 
lock_page(page);
-   mapping = page-mapping;
+   mapping = page_file_mapping(page);
if (mapping != vma-vm_file-f_path.dentry-d_inode-i_mapping) {
unlock_page(page);
return -EINVAL;
}
pagelen = nfs_page_length(page);
-   offset = (loff_t)page-index  PAGE_CACHE_SHIFT;
+   offset = (loff_t)page_file_index(page)  PAGE_CACHE_SHIFT;
unlock_page(page);
 
/*
Index: linux-2.6/fs/nfs/pagelist.c
===
--- linux-2.6.orig/fs/nfs/pagelist.c
+++ linux-2.6/fs/nfs/pagelist.c
@@ -77,11 +77,11 @@ nfs_create_request(struct nfs_open_conte
 * update_nfs_request below if the region is not locked. */
req-wb_page= page;
atomic_set(req-wb_complete, 0);
-   req-wb_index   = page-index;
+   req-wb_index   = page_file_index(page);
page_cache_get(page);
BUG_ON(PagePrivate(page));
BUG_ON(!PageLocked(page));
-   BUG_ON(page-mapping-host != inode);
+   BUG_ON(page_file_mapping(page)-host != inode);
req-wb_offset  = offset;
req-wb_pgbase  = offset;
req-wb_bytes   = count;
@@ -383,7 +383,7 @@ void nfs_pageio_cond_complete(struct nfs
  * nfs_scan_list - Scan a list for matching requests
  * @nfsi: NFS inode
  * @dst: Destination list
- * @idx_start: lower bound of page-index to scan
+ * @idx_start: lower bound of page_file_index(page) to scan
  * @npages: idx_start + npages sets the upper bound to scan.
  * @tag: tag to scan for
  *
Index: linux-2.6/fs/nfs/read.c
===
--- linux-2.6.orig/fs/nfs/read.c
+++ linux-2.6/fs/nfs/read.c
@@ -460,11 +460,11 @@ static const struct rpc_call_ops nfs_rea
 int nfs_readpage(struct file *file, struct page *page)
 {
struct nfs_open_context *ctx;
-   struct inode *inode = page-mapping-host;
+   struct inode *inode = page_file_mapping(page)-host;
int error;
 
dprintk(NFS: nfs_readpage (%p [EMAIL PROTECTED])\n,
-   page, PAGE_CACHE_SIZE, page-index);
+   page, PAGE_CACHE_SIZE, page_file_index(page));
nfs_inc_stats(inode, NFSIOS_VFSREADPAGE);
nfs_add_stats(inode, NFSIOS_READPAGES, 1);
 
@@ -511,7 +511,7 @@ static int
 readpage_async_filler(void *data, struct page *page)
 {
struct nfs_readdesc *desc = (struct nfs_readdesc *)data;
-   struct inode *inode = page-mapping-host;
+   struct inode *inode = page_file_mapping(page)-host;
struct nfs_page *new;
unsigned int len;
int error;
Index: linux-2.6/fs/nfs/write.c
===
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -126,7 +126,7 @@ static struct nfs_page *nfs_page_find_re
 
 static struct nfs_page *nfs_page_find_request(struct page *page)
 {
-   struct inode *inode = page-mapping-host;
+   struct inode *inode = page_file_mapping(page)-host;
struct nfs_page *req = NULL;
 
spin_lock(inode-i_lock);
@@ -138,13 +138,13 @@ static struct nfs_page *nfs_page_find_re
 /* Adjust the file length if we're writing beyond the end */
 static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int 
count)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = page_file_mapping(page)->host;
 	loff_t end, i_size = i_size_read(inode);
 	pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
 
-	if (i_size > 0 && page->index < end_index)
+	if (i_size > 0 && page_file_index(page) < end_index)

[PATCH 22/33] netfilter: NF_QUEUE vs emergency skbs

2007-10-30 Thread Peter Zijlstra
To avoid memory getting stuck waiting for userspace, drop all emergency packets.
This of course requires the regular storage route to not include an NF_QUEUE
target ;-)

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 net/netfilter/core.c |3 +++
 1 file changed, 3 insertions(+)

Index: linux-2.6/net/netfilter/core.c
===
--- linux-2.6.orig/net/netfilter/core.c
+++ linux-2.6/net/netfilter/core.c
@@ -181,9 +181,12 @@ next_hook:
ret = 1;
goto unlock;
} else if (verdict == NF_DROP) {
+drop:
kfree_skb(*pskb);
ret = -EPERM;
} else if ((verdict  NF_VERDICT_MASK)  == NF_QUEUE) {
+   if (skb_emergency(*pskb))
+   goto drop;
NFDEBUG(nf_hook: Verdict = QUEUE.\n);
if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn,
  verdict  NF_VERDICT_BITS))

[PATCH 19/33] netvm: hook skb allocation to reserves

2007-10-30 Thread Peter Zijlstra
Change the skb allocation api to indicate RX usage and use this to fall back to
the reserve when needed. SKBs allocated from the reserve are tagged in
skb->emergency.

Teach all other skb ops about emergency skbs and the reserve accounting.

Use the (new) packet split API to allocate and track fragment pages from the
emergency reserve. Do this using an atomic counter in page->index. This is
needed because the fragments have a different sharing semantic than that
indicated by skb_shinfo()->dataref.

Note that the decision to distinguish between regular and emergency SKBs allows
the accounting overhead to be limited to the latter kind.
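
A sketch of the RX-side usage (error handling and the length variable are
illustrative; only the flag and the skb_emergency() test are from this
patch): SKB_ALLOC_RX asks for reserve fallback, and skb_emergency() reports
whether the fallback was actually taken.

	/* sketch: allocate an RX skb that may fall back to the reserve */
	struct sk_buff *skb;

	skb = __alloc_skb(length + NET_SKB_PAD, GFP_ATOMIC, SKB_ALLOC_RX, -1);
	if (skb) {
		skb_reserve(skb, NET_SKB_PAD);
		/*
		 * skb_emergency(skb) now tells us whether the reserve was used;
		 * such skbs must only be delivered to SOCK_MEMALLOC sockets.
		 */
	}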

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/mm_types.h |1 
 include/linux/skbuff.h   |   25 +-
 net/core/skbuff.c|  173 +--
 3 files changed, 173 insertions(+), 26 deletions(-)

Index: linux-2.6/include/linux/skbuff.h
===
--- linux-2.6.orig/include/linux/skbuff.h
+++ linux-2.6/include/linux/skbuff.h
@@ -289,7 +289,8 @@ struct sk_buff {
__u8pkt_type:3,
fclone:2,
ipvs_property:1,
-   nf_trace:1;
+   nf_trace:1,
+   emergency:1;
__be16  protocol;
 
void(*destructor)(struct sk_buff *skb);
@@ -341,10 +342,22 @@ struct sk_buff {
 
 #include asm/system.h
 
+#define SKB_ALLOC_FCLONE   0x01
+#define SKB_ALLOC_RX   0x02
+
+static inline bool skb_emergency(const struct sk_buff *skb)
+{
+#ifdef CONFIG_NETVM
+   return unlikely(skb-emergency);
+#else
+   return false;
+#endif
+}
+
 extern void kfree_skb(struct sk_buff *skb);
 extern void   __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-  gfp_t priority, int fclone, int node);
+  gfp_t priority, int flags, int node);
 static inline struct sk_buff *alloc_skb(unsigned int size,
gfp_t priority)
 {
@@ -354,7 +367,7 @@ static inline struct sk_buff *alloc_skb(
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
   gfp_t priority)
 {
-   return __alloc_skb(size, priority, 1, -1);
+   return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, -1);
 }
 
 extern void   kfree_skbmem(struct sk_buff *skb);
@@ -1297,7 +1310,8 @@ static inline void __skb_queue_purge(str
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
  gfp_t gfp_mask)
 {
-   struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask);
+   struct sk_buff *skb =
+   __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1);
if (likely(skb))
skb_reserve(skb, NET_SKB_PAD);
return skb;
@@ -1343,6 +1357,7 @@ static inline struct sk_buff *netdev_all
 }
 
 extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
+extern void __netdev_free_page(struct net_device *dev, struct page *page);
 
 /**
  * netdev_alloc_page - allocate a page for ps-rx on a specific device
@@ -1359,7 +1374,7 @@ static inline struct page *netdev_alloc_
 
 static inline void netdev_free_page(struct net_device *dev, struct page *page)
 {
-   __free_page(page);
+   __netdev_free_page(dev, page);
 }
 
 /**
Index: linux-2.6/net/core/skbuff.c
===
--- linux-2.6.orig/net/core/skbuff.c
+++ linux-2.6/net/core/skbuff.c
@@ -179,21 +179,28 @@ EXPORT_SYMBOL(skb_truesize_bug);
  * %GFP_ATOMIC.
  */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-   int fclone, int node)
+   int flags, int node)
 {
struct kmem_cache *cache;
struct skb_shared_info *shinfo;
struct sk_buff *skb;
u8 *data;
+   int emergency = 0, memalloc = sk_memalloc_socks();
 
-   cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+   size = SKB_DATA_ALIGN(size);
+   cache = (flags  SKB_ALLOC_FCLONE)
+   ? skbuff_fclone_cache : skbuff_head_cache;
+#ifdef CONFIG_NETVM
+   if (memalloc  (flags  SKB_ALLOC_RX))
+   gfp_mask |= __GFP_NOMEMALLOC|__GFP_NOWARN;
 
+retry_alloc:
+#endif
/* Get the HEAD */
skb = kmem_cache_alloc_node(cache, gfp_mask  ~__GFP_DMA, node);
if (!skb)
-   goto out;
+   goto noskb;
 
-   size = SKB_DATA_ALIGN(size);
data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
gfp_mask, node);
if (!data)
@@ -203,6 +210,7 @@ struct sk_buff *__alloc_skb(unsigned int
 * See

[PATCH 13/33] net: wrap sk-sk_backlog_rcv()

2007-10-30 Thread Peter Zijlstra
Wrap calling sk->sk_backlog_rcv() in a function. This will allow extending the
generic sk_backlog_rcv behaviour.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/net/sock.h   |5 +
 net/core/sock.c  |4 ++--
 net/ipv4/tcp.c   |2 +-
 net/ipv4/tcp_timer.c |2 +-
 4 files changed, 9 insertions(+), 4 deletions(-)

Index: linux-2.6/include/net/sock.h
===
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -485,6 +485,11 @@ static inline void sk_add_backlog(struct
skb-next = NULL;
 }
 
+static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+   return sk-sk_backlog_rcv(sk, skb);
+}
+
 #define sk_wait_event(__sk, __timeo, __condition)  \
({  int __rc;   \
release_sock(__sk); \
Index: linux-2.6/net/core/sock.c
===
--- linux-2.6.orig/net/core/sock.c
+++ linux-2.6/net/core/sock.c
@@ -320,7 +320,7 @@ int sk_receive_skb(struct sock *sk, stru
 */
mutex_acquire(sk-sk_lock.dep_map, 0, 1, _RET_IP_);
 
-   rc = sk-sk_backlog_rcv(sk, skb);
+   rc = sk_backlog_rcv(sk, skb);
 
mutex_release(sk-sk_lock.dep_map, 1, _RET_IP_);
} else
@@ -1312,7 +1312,7 @@ static void __release_sock(struct sock *
struct sk_buff *next = skb-next;
 
skb-next = NULL;
-   sk-sk_backlog_rcv(sk, skb);
+   sk_backlog_rcv(sk, skb);
 
/*
 * We are in process context here with softirqs
Index: linux-2.6/net/ipv4/tcp.c
===
--- linux-2.6.orig/net/ipv4/tcp.c
+++ linux-2.6/net/ipv4/tcp.c
@@ -1134,7 +1134,7 @@ static void tcp_prequeue_process(struct 
 * necessary */
local_bh_disable();
while ((skb = __skb_dequeue(tp-ucopy.prequeue)) != NULL)
-   sk-sk_backlog_rcv(sk, skb);
+   sk_backlog_rcv(sk, skb);
local_bh_enable();
 
/* Clear memory counter. */
Index: linux-2.6/net/ipv4/tcp_timer.c
===
--- linux-2.6.orig/net/ipv4/tcp_timer.c
+++ linux-2.6/net/ipv4/tcp_timer.c
@@ -196,7 +196,7 @@ static void tcp_delack_timer(unsigned lo
NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);
 
while ((skb = __skb_dequeue(tp-ucopy.prequeue)) != NULL)
-   sk-sk_backlog_rcv(sk, skb);
+   sk_backlog_rcv(sk, skb);
 
tp-ucopy.memory = 0;
}

[PATCH 23/33] netvm: skb processing

2007-10-30 Thread Peter Zijlstra
In order to make sure emergency packets receive all the memory they need to
proceed, ensure processing of emergency SKBs happens under PF_MEMALLOC.

Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing.

Skip taps, since those are user-space again.
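
The flag-scoping pattern used in netif_receive_skb() below boils down to the
following sketch; process_one_skb() is a placeholder, and tsk_restore_flags()
is introduced elsewhere in this series.

	/* sketch: give emergency skb processing access to the reserves */
	unsigned long pflags = current->flags;
	int ret;

	if (skb_emergency(skb))
		current->flags |= PF_MEMALLOC;

	ret = process_one_skb(skb);	/* placeholder for the real work */

	/* restore only the bit we may have set */
	tsk_restore_flags(current, pflags, PF_MEMALLOC);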

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/net/sock.h |5 +
 net/core/dev.c |   44 ++--
 net/core/sock.c|   18 ++
 3 files changed, 61 insertions(+), 6 deletions(-)

Index: linux-2.6/net/core/dev.c
===
--- linux-2.6.orig/net/core/dev.c
+++ linux-2.6/net/core/dev.c
@@ -1976,10 +1976,23 @@ int netif_receive_skb(struct sk_buff *sk
struct net_device *orig_dev;
int ret = NET_RX_DROP;
__be16 type;
+   unsigned long pflags = current-flags;
+
+   /* Emergency skbs are special, they should
+*  - be delivered to SOCK_MEMALLOC sockets only
+*  - stay away from userspace
+*  - have bounded memory usage
+*
+* Use PF_MEMALLOC as a poor man's memory pool - the grouping kind.
+* This saves us from propagating the allocation context down to all
+* allocation sites.
+*/
+   if (skb_emergency(skb))
+   current-flags |= PF_MEMALLOC;
 
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb(skb))
-   return NET_RX_DROP;
+   goto out;
 
if (!skb-tstamp.tv64)
net_timestamp(skb);
@@ -1990,7 +2003,7 @@ int netif_receive_skb(struct sk_buff *sk
orig_dev = skb_bond(skb);
 
if (!orig_dev)
-   return NET_RX_DROP;
+   goto out;
 
__get_cpu_var(netdev_rx_stat).total++;
 
@@ -2009,6 +2022,9 @@ int netif_receive_skb(struct sk_buff *sk
}
 #endif
 
+   if (skb_emergency(skb))
+   goto skip_taps;
+
list_for_each_entry_rcu(ptype, ptype_all, list) {
if (!ptype-dev || ptype-dev == skb-dev) {
if (pt_prev)
@@ -2017,6 +2033,7 @@ int netif_receive_skb(struct sk_buff *sk
}
}
 
+skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -2029,19 +2046,31 @@ int netif_receive_skb(struct sk_buff *sk
 
if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
kfree_skb(skb);
-   goto out;
+   goto unlock;
}
 
skb-tc_verd = 0;
 ncls:
 #endif
 
+   if (skb_emergency(skb))
+   switch(skb-protocol) {
+   case __constant_htons(ETH_P_ARP):
+   case __constant_htons(ETH_P_IP):
+   case __constant_htons(ETH_P_IPV6):
+   case __constant_htons(ETH_P_8021Q):
+   break;
+
+   default:
+   goto drop;
+   }
+
skb = handle_bridge(skb, pt_prev, ret, orig_dev);
if (!skb)
-   goto out;
+   goto unlock;
skb = handle_macvlan(skb, pt_prev, ret, orig_dev);
if (!skb)
-   goto out;
+   goto unlock;
 
type = skb-protocol;
list_for_each_entry_rcu(ptype, ptype_base[ntohs(type)15], list) {
@@ -2056,6 +2085,7 @@ ncls:
if (pt_prev) {
ret = pt_prev-func(skb, skb-dev, pt_prev, orig_dev);
} else {
+drop:
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
 * me how you were going to use this. :-)
@@ -2063,8 +2093,10 @@ ncls:
ret = NET_RX_DROP;
}
 
-out:
+unlock:
rcu_read_unlock();
+out:
+   tsk_restore_flags(current, pflags, PF_MEMALLOC);
return ret;
 }
 
Index: linux-2.6/include/net/sock.h
===
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -523,8 +523,13 @@ static inline void sk_add_backlog(struct
skb-next = NULL;
 }
 
+extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
+
 static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 {
+   if (skb_emergency(skb))
+   return __sk_backlog_rcv(sk, skb);
+
return sk-sk_backlog_rcv(sk, skb);
 }
 
Index: linux-2.6/net/core/sock.c
===
--- linux-2.6.orig/net/core/sock.c
+++ linux-2.6/net/core/sock.c
@@ -319,6 +319,24 @@ int sk_clear_memalloc(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 
+#ifdef CONFIG_NETVM
+int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
+{
+   int ret;
+   unsigned long pflags = current-flags;
+
+   /* these should have been dropped before queueing */
+   BUG_ON(!sk_has_memalloc(sk

[PATCH 25/33] mm: add support for non block device backed swap files

2007-10-30 Thread Peter Zijlstra
A new address_space_operations method is added:
  int swapfile(struct address_space *, int)

When, during sys_swapon(), this method is found and returns no error, the
swapper_space.a_ops will proxy to sis->swap_file->f_mapping->a_ops.

The swapfile method will be used to communicate to the address_space that the
VM relies on it, and that the address_space should take adequate measures
(such as reserving memory for mempools).
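
For illustration, a filesystem would wire the new method up roughly as below;
the reserve/release helpers are assumptions about what 'adequate measures'
could look like, they are not part of this patch.

	/* sketch: a filesystem opting in to being used as swap backing */
	static int example_swapfile(struct address_space *mapping, int enable)
	{
		if (enable)
			return example_reserve_write_resources(mapping->host);

		example_release_write_resources(mapping->host);
		return 0;
	}

	static const struct address_space_operations example_aops = {
		.writepage	= example_writepage,
		.readpage	= example_readpage,
		.swapfile	= example_swapfile,
	};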

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 Documentation/filesystems/Locking |9 +
 include/linux/buffer_head.h   |2 -
 include/linux/fs.h|1 
 include/linux/swap.h  |3 +
 mm/Kconfig|3 +
 mm/page_io.c  |   58 ++
 mm/swap_state.c   |5 +++
 mm/swapfile.c |   22 +-
 8 files changed, 101 insertions(+), 2 deletions(-)

Index: linux-2.6/include/linux/swap.h
===
--- linux-2.6.orig/include/linux/swap.h
+++ linux-2.6/include/linux/swap.h
@@ -164,6 +164,7 @@ enum {
SWP_USED= (1  0), /* is slot in swap_info[] used? */
SWP_WRITEOK = (1  1), /* ok to write to this swap?*/
SWP_ACTIVE  = (SWP_USED | SWP_WRITEOK),
+   SWP_FILE= (1  2), /* file swap area */
/* add others here before... */
SWP_SCANNING= (1  8), /* refcount in scan_swap_map */
 };
@@ -264,6 +265,8 @@ extern void swap_unplug_io_fn(struct bac
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
 extern int swap_writepage(struct page *page, struct writeback_control *wbc);
+extern void swap_sync_page(struct page *page);
+extern int swap_set_page_dirty(struct page *page);
 extern void end_swap_bio_read(struct bio *bio, int err);
 
 /* linux/mm/swap_state.c */
Index: linux-2.6/mm/page_io.c
===
--- linux-2.6.orig/mm/page_io.c
+++ linux-2.6/mm/page_io.c
@@ -17,6 +17,7 @@
 #include linux/bio.h
 #include linux/swapops.h
 #include linux/writeback.h
+#include linux/buffer_head.h
 #include asm/pgtable.h
 
 static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
@@ -102,6 +103,18 @@ int swap_writepage(struct page *page, st
unlock_page(page);
goto out;
}
+#ifdef CONFIG_SWAP_FILE
+   {
+   struct swap_info_struct *sis = page_swap_info(page);
+   if (sis-flags  SWP_FILE) {
+   ret = sis-swap_file-f_mapping-
+   a_ops-writepage(page, wbc);
+   if (!ret)
+   count_vm_event(PSWPOUT);
+   return ret;
+   }
+   }
+#endif
bio = get_swap_bio(GFP_NOIO, page_private(page), page,
end_swap_bio_write);
if (bio == NULL) {
@@ -120,6 +133,39 @@ out:
return ret;
 }
 
+#ifdef CONFIG_SWAP_FILE
+void swap_sync_page(struct page *page)
+{
+   struct swap_info_struct *sis = page_swap_info(page);
+
+   if (sis-flags  SWP_FILE) {
+   const struct address_space_operations * a_ops =
+   sis-swap_file-f_mapping-a_ops;
+   if (a_ops-sync_page)
+   a_ops-sync_page(page);
+   } else
+   block_sync_page(page);
+}
+
+int swap_set_page_dirty(struct page *page)
+{
+   struct swap_info_struct *sis = page_swap_info(page);
+
+   if (sis-flags  SWP_FILE) {
+   const struct address_space_operations * a_ops =
+   sis-swap_file-f_mapping-a_ops;
+   int (*spd)(struct page *) = a_ops-set_page_dirty;
+#ifdef CONFIG_BLOCK
+   if (!spd)
+   spd = __set_page_dirty_buffers;
+#endif
+   return (*spd)(page);
+   }
+
+   return __set_page_dirty_nobuffers(page);
+}
+#endif
+
 int swap_readpage(struct file *file, struct page *page)
 {
struct bio *bio;
@@ -127,6 +173,18 @@ int swap_readpage(struct file *file, str
 
BUG_ON(!PageLocked(page));
ClearPageUptodate(page);
+#ifdef CONFIG_SWAP_FILE
+   {
+   struct swap_info_struct *sis = page_swap_info(page);
+   if (sis-flags  SWP_FILE) {
+   ret = sis-swap_file-f_mapping-
+   a_ops-readpage(sis-swap_file, page);
+   if (!ret)
+   count_vm_event(PSWPIN);
+   return ret;
+   }
+   }
+#endif
bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
end_swap_bio_read);
if (bio == NULL) {
Index: linux-2.6/mm/swap_state.c
===
--- linux-2.6.orig/mm/swap_state.c

[PATCH 20/33] netvm: filter emergency skbs.

2007-10-30 Thread Peter Zijlstra
Toss all emergency packets not for a SOCK_MEMALLOC socket. This ensures our
precious memory reserve doesn't get stuck waiting for user-space.

The correctness of this approach relies on the fact that networks must be
assumed lossy.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/net/sock.h |3 +++
 1 file changed, 3 insertions(+)

Index: linux-2.6/include/net/sock.h
===
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -930,6 +930,9 @@ static inline int sk_filter(struct sock 
 {
int err;
struct sk_filter *filter;
+
+   if (skb_emergency(skb)  !sk_has_memalloc(sk))
+   return -ENOMEM;

err = security_sock_rcv_skb(sk, skb);
if (err)

[PATCH 01/33] mm: gfp_to_alloc_flags()

2007-10-30 Thread Peter Zijlstra
Factor out the gfp to alloc_flags mapping so it can be used in other places.
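
A sketch of the kind of reuse this enables: a caller can now ask whether a
given gfp mask is entitled to dip below the watermarks without duplicating
the policy (this is how the slub reserve patch in this series uses it).

	/* sketch: query the allocation policy for a given gfp mask */
	if (gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS) {
		/* this context may be served from the emergency reserves */
	}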

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 mm/internal.h   |   11 ++
 mm/page_alloc.c |   98 
 2 files changed, 67 insertions(+), 42 deletions(-)

Index: linux-2.6/mm/internal.h
===
--- linux-2.6.orig/mm/internal.h
+++ linux-2.6/mm/internal.h
@@ -47,4 +47,15 @@ static inline unsigned long page_order(s
VM_BUG_ON(!PageBuddy(page));
return page_private(page);
 }
+
+#define ALLOC_HARDER   0x01 /* try to alloc harder */
+#define ALLOC_HIGH 0x02 /* __GFP_HIGH set */
+#define ALLOC_WMARK_MIN0x04 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW0x08 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH   0x10 /* use pages_high watermark */
+#define ALLOC_NO_WATERMARKS0x20 /* don't check watermarks at all */
+#define ALLOC_CPUSET   0x40 /* check for correct cpuset */
+
+int gfp_to_alloc_flags(gfp_t gfp_mask);
+
 #endif
Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1139,14 +1139,6 @@ failed:
return NULL;
 }
 
-#define ALLOC_NO_WATERMARKS0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH   0x08 /* use pages_high watermark */
-#define ALLOC_HARDER   0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET   0x40 /* check for correct cpuset */
-
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
 static struct fail_page_alloc_attr {
@@ -1535,6 +1527,44 @@ static void set_page_owner(struct page *
 #endif /* CONFIG_PAGE_OWNER */
 
 /*
+ * get the deepest reaching allocation flags for the given gfp_mask
+ */
+int gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+   struct task_struct *p = current;
+   int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+   const gfp_t wait = gfp_mask  __GFP_WAIT;
+
+   /*
+* The caller may dip into page reserves a bit more if the caller
+* cannot run direct reclaim, or if the caller has realtime scheduling
+* policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+*/
+   if (gfp_mask  __GFP_HIGH)
+   alloc_flags |= ALLOC_HIGH;
+
+   if (!wait) {
+   alloc_flags |= ALLOC_HARDER;
+   /*
+* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+*/
+   alloc_flags = ~ALLOC_CPUSET;
+   } else if (unlikely(rt_task(p))  !in_interrupt())
+   alloc_flags |= ALLOC_HARDER;
+
+   if (likely(!(gfp_mask  __GFP_NOMEMALLOC))) {
+   if (!in_interrupt() 
+   ((p-flags  PF_MEMALLOC) ||
+unlikely(test_thread_flag(TIF_MEMDIE
+   alloc_flags |= ALLOC_NO_WATERMARKS;
+   }
+
+   return alloc_flags;
+}
+
+/*
  * This is the 'heart' of the zoned buddy allocator.
  */
 struct page * fastcall
@@ -1589,48 +1619,28 @@ restart:
 * OK, we're below the kswapd watermark and have kicked background
 * reclaim. Now things get more complex, so set up alloc_flags according
 * to how we want to proceed.
-*
-* The caller may dip into page reserves a bit more if the caller
-* cannot run direct reclaim, or if the caller has realtime scheduling
-* policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
 */
-   alloc_flags = ALLOC_WMARK_MIN;
-   if ((unlikely(rt_task(p))  !in_interrupt()) || !wait)
-   alloc_flags |= ALLOC_HARDER;
-   if (gfp_mask  __GFP_HIGH)
-   alloc_flags |= ALLOC_HIGH;
-   if (wait)
-   alloc_flags |= ALLOC_CPUSET;
+   alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
-   /*
-* Go through the zonelist again. Let __GFP_HIGH and allocations
-* coming from realtime tasks go deeper into reserves.
-*
-* This is the last chance, in general, before the goto nopage.
-* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
-*/
-   page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+   /* This is the last chance, in general, before the goto nopage. */
+   page = get_page_from_freelist(gfp_mask, order, zonelist

[PATCH 10/33] mm: __GFP_MEMALLOC

2007-10-30 Thread Peter Zijlstra
__GFP_MEMALLOC will allow the allocation to disregard the watermarks, 
much like PF_MEMALLOC.
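
Usage is a one-liner; as a sketch, an allocation site that may dip into the
reserves regardless of the calling task's flags would do:

	/* sketch: explicitly allow this allocation to use the reserves */
	struct page *page = alloc_page(GFP_ATOMIC | __GFP_MEMALLOC);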

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/gfp.h |3 ++-
 mm/page_alloc.c |4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

Index: linux-2.6/include/linux/gfp.h
===
--- linux-2.6.orig/include/linux/gfp.h
+++ linux-2.6/include/linux/gfp.h
@@ -43,6 +43,7 @@ struct vm_area_struct;
 #define __GFP_REPEAT	((__force gfp_t)0x400u)	/* Retry the allocation.  Might fail */
 #define __GFP_NOFAIL	((__force gfp_t)0x800u)	/* Retry for ever.  Cannot fail */
 #define __GFP_NORETRY	((__force gfp_t)0x1000u)/* Do not retry.  Might fail */
+#define __GFP_MEMALLOC  ((__force gfp_t)0x2000u)/* Use emergency reserves */
 #define __GFP_COMP	((__force gfp_t)0x4000u)/* Add compound page metadata */
 #define __GFP_ZERO	((__force gfp_t)0x8000u)/* Return zeroed page on success */
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
@@ -88,7 +89,7 @@ struct vm_area_struct;
 /* Control page allocator reclaim behavior */
 #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
-   __GFP_NORETRY|__GFP_NOMEMALLOC)
+   __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)
 
 /* Control allocation constraints */
 #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)
Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1560,7 +1560,9 @@ int gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
 
if (likely(!(gfp_mask  __GFP_NOMEMALLOC))) {
-   if (!in_irq()  (p-flags  PF_MEMALLOC))
+   if (gfp_mask  __GFP_MEMALLOC)
+   alloc_flags |= ALLOC_NO_WATERMARKS;
+   else if (!in_irq()  (p-flags  PF_MEMALLOC))
alloc_flags |= ALLOC_NO_WATERMARKS;
else if (!in_interrupt() 
unlikely(test_thread_flag(TIF_MEMDIE)))

[PATCH 17/33] sysctl: propagate conv errors

2007-10-30 Thread Peter Zijlstra
Currently the conv routines will only generate -EINVAL; allow other
errors to be propagated.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sysctl.c |   11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

Index: linux-2.6/kernel/sysctl.c
===
--- linux-2.6.orig/kernel/sysctl.c
+++ linux-2.6/kernel/sysctl.c
@@ -1732,6 +1732,7 @@ static int __do_proc_dointvec(void *tbl_
int *i, vleft, first=1, neg, val;
unsigned long lval;
size_t left, len;
+   int ret = 0;

char buf[TMPBUFLEN], *p;
char __user *s = buffer;
@@ -1787,14 +1788,16 @@ static int __do_proc_dointvec(void *tbl_
s += len;
left -= len;
 
-   if (conv(neg, lval, i, 1, data))
+   ret = conv(neg, lval, i, 1, data);
+   if (ret)
break;
} else {
p = buf;
if (!first)
*p++ = '\t';

-   if (conv(neg, lval, i, 0, data))
+   ret = conv(neg, lval, i, 0, data);
+   if (ret)
break;
 
sprintf(p, %s%lu, neg ? - : , lval);
@@ -1823,11 +1826,9 @@ static int __do_proc_dointvec(void *tbl_
left--;
}
}
-   if (write  first)
-   return -EINVAL;
*lenp -= left;
*ppos += *lenp;
-   return 0;
+   return ret;
 #undef TMPBUFLEN
 }
 

[PATCH 14/33] net: packet split receive api

2007-10-30 Thread Peter Zijlstra
Add some packet-split receive hooks.

For one, this allows NUMA-node-affine page allocations. Later on these hooks
will be extended to do emergency reserve allocations for fragments.
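
A sketch of the driver-side pattern (refill and completion paths condensed;
the surrounding driver variables are illustrative):

	/* refill: allocate a node-affine page for packet-split RX */
	page = netdev_alloc_page(netdev);
	if (!page)
		goto no_buffers;

	/* completion: attach the page as a fragment and fix skb accounting */
	skb_add_rx_frag(skb, frag_idx, page, 0, length);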

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 drivers/net/e1000/e1000_main.c |8 ++--
 drivers/net/sky2.c |   16 ++--
 include/linux/skbuff.h |   23 +++
 net/core/skbuff.c  |   20 
 4 files changed, 51 insertions(+), 16 deletions(-)

Index: linux-2.6/drivers/net/e1000/e1000_main.c
===
--- linux-2.6.orig/drivers/net/e1000/e1000_main.c
+++ linux-2.6/drivers/net/e1000/e1000_main.c
@@ -4407,12 +4407,8 @@ e1000_clean_rx_irq_ps(struct e1000_adapt
pci_unmap_page(pdev, ps_page_dma-ps_page_dma[j],
PAGE_SIZE, PCI_DMA_FROMDEVICE);
ps_page_dma-ps_page_dma[j] = 0;
-   skb_fill_page_desc(skb, j, ps_page-ps_page[j], 0,
-  length);
+   skb_add_rx_frag(skb, j, ps_page-ps_page[j], 0, length);
ps_page-ps_page[j] = NULL;
-   skb-len += length;
-   skb-data_len += length;
-   skb-truesize += length;
}
 
/* strip the ethernet crc, problem is we're using pages now so
@@ -4618,7 +4614,7 @@ e1000_alloc_rx_buffers_ps(struct e1000_a
if (j  adapter-rx_ps_pages) {
if (likely(!ps_page-ps_page[j])) {
ps_page-ps_page[j] =
-   alloc_page(GFP_ATOMIC);
+   netdev_alloc_page(netdev);
if (unlikely(!ps_page-ps_page[j])) {
adapter-alloc_rx_buff_failed++;
goto no_buffers;
Index: linux-2.6/include/linux/skbuff.h
===
--- linux-2.6.orig/include/linux/skbuff.h
+++ linux-2.6/include/linux/skbuff.h
@@ -846,6 +846,9 @@ static inline void skb_fill_page_desc(st
skb_shinfo(skb)-nr_frags = i + 1;
 }
 
+extern void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page,
+   int off, int size);
+
 #define SKB_PAGE_ASSERT(skb)   BUG_ON(skb_shinfo(skb)-nr_frags)
 #define SKB_FRAG_ASSERT(skb)   BUG_ON(skb_shinfo(skb)-frag_list)
 #define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))
@@ -1339,6 +1342,26 @@ static inline struct sk_buff *netdev_all
return __netdev_alloc_skb(dev, length, GFP_ATOMIC);
 }
 
+extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask);
+
+/**
+ * netdev_alloc_page - allocate a page for ps-rx on a specific device
+ * @dev: network device to receive on
+ *
+ * Allocate a new page node local to the specified device.
+ *
+ * %NULL is returned if there is no free memory.
+ */
+static inline struct page *netdev_alloc_page(struct net_device *dev)
+{
+   return __netdev_alloc_page(dev, GFP_ATOMIC);
+}
+
+static inline void netdev_free_page(struct net_device *dev, struct page *page)
+{
+   __free_page(page);
+}
+
 /**
  * skb_clone_writable - is the header of a clone writable
  * @skb: buffer to check
Index: linux-2.6/net/core/skbuff.c
===
--- linux-2.6.orig/net/core/skbuff.c
+++ linux-2.6/net/core/skbuff.c
@@ -263,6 +263,24 @@ struct sk_buff *__netdev_alloc_skb(struc
return skb;
 }
 
+struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask)
+{
+   int node = dev-dev.parent ? dev_to_node(dev-dev.parent) : -1;
+   struct page *page;
+
+   page = alloc_pages_node(node, gfp_mask, 0);
+   return page;
+}
+
+void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
+   int size)
+{
+   skb_fill_page_desc(skb, i, page, off, size);
+   skb-len += size;
+   skb-data_len += size;
+   skb-truesize += size;
+}
+
 static void skb_drop_list(struct sk_buff **listp)
 {
struct sk_buff *list = *listp;
@@ -2464,6 +2482,8 @@ EXPORT_SYMBOL(kfree_skb);
 EXPORT_SYMBOL(__pskb_pull_tail);
 EXPORT_SYMBOL(__alloc_skb);
 EXPORT_SYMBOL(__netdev_alloc_skb);
+EXPORT_SYMBOL(__netdev_alloc_page);
+EXPORT_SYMBOL(skb_add_rx_frag);
 EXPORT_SYMBOL(pskb_copy);
 EXPORT_SYMBOL(pskb_expand_head);
 EXPORT_SYMBOL(skb_checksum);
Index: linux-2.6/drivers/net/sky2.c
===
--- linux-2.6.orig/drivers/net/sky2.c
+++ linux-2.6/drivers/net/sky2.c
@@ -1173,7 +1173,7 @@ static struct sk_buff *sky2_rx_alloc(str
skb_reserve(skb, ALIGN(p, RX_SKB_ALIGN) - p

[PATCH 15/33] net: sk_allocation() - concentrate socket related allocations

2007-10-30 Thread Peter Zijlstra
Introduce sk_allocation(); this function allows injecting socket-specific
flags into each socket-related allocation.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/net/sock.h|7 ++-
 net/ipv4/tcp_output.c |   11 ++-
 net/ipv6/tcp_ipv6.c   |   14 +-
 3 files changed, 21 insertions(+), 11 deletions(-)

Index: linux-2.6/net/ipv4/tcp_output.c
===
--- linux-2.6.orig/net/ipv4/tcp_output.c
+++ linux-2.6/net/ipv4/tcp_output.c
@@ -2081,7 +2081,7 @@ void tcp_send_fin(struct sock *sk)
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
-   skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
+			skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
if (skb)
break;
yield();
@@ -2114,7 +2114,7 @@ void tcp_send_active_reset(struct sock *
struct sk_buff *skb;
 
/* NOTE: No TCP options attached and we never retransmit this. */
-   skb = alloc_skb(MAX_TCP_HEADER, priority);
+   skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, priority));
if (!skb) {
NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
return;
@@ -2187,7 +2187,8 @@ struct sk_buff * tcp_make_synack(struct 
__u8 *md5_hash_location;
 #endif
 
-   skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+   skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1,
+   sk_allocation(sk, GFP_ATOMIC));
if (skb == NULL)
return NULL;
 
@@ -2446,7 +2447,7 @@ void tcp_send_ack(struct sock *sk)
 * tcp_transmit_skb() will set the ownership to this
 * sock.
 */
-   buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+   buff = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
if (buff == NULL) {
inet_csk_schedule_ack(sk);
inet_csk(sk)-icsk_ack.ato = TCP_ATO_MIN;
@@ -2488,7 +2489,7 @@ static int tcp_xmit_probe_skb(struct soc
struct sk_buff *skb;
 
/* We don't queue it, tcp_transmit_skb() sets ownership. */
-   skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+   skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC));
if (skb == NULL)
return -1;
 
Index: linux-2.6/include/net/sock.h
===
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -419,6 +419,11 @@ static inline int sock_flag(struct sock 
return test_bit(flag, sk-sk_flags);
 }
 
+static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask)
+{
+   return gfp_mask;
+}
+
 static inline void sk_acceptq_removed(struct sock *sk)
 {
sk-sk_ack_backlog--;
@@ -1212,7 +1217,7 @@ static inline struct sk_buff *sk_stream_
int hdr_len;
 
hdr_len = SKB_DATA_ALIGN(sk-sk_prot-max_header);
-   skb = alloc_skb_fclone(size + hdr_len, gfp);
+   skb = alloc_skb_fclone(size + hdr_len, sk_allocation(sk, gfp));
if (skb) {
skb-truesize += mem;
if (sk_stream_wmem_schedule(sk, skb-truesize)) {
Index: linux-2.6/net/ipv6/tcp_ipv6.c
===
--- linux-2.6.orig/net/ipv6/tcp_ipv6.c
+++ linux-2.6/net/ipv6/tcp_ipv6.c
@@ -573,7 +573,8 @@ static int tcp_v6_md5_do_add(struct sock
} else {
/* reallocate new list if current one is full. */
if (!tp-md5sig_info) {
-   tp-md5sig_info = kzalloc(sizeof(*tp-md5sig_info), 
GFP_ATOMIC);
+   tp-md5sig_info = kzalloc(sizeof(*tp-md5sig_info),
+   sk_allocation(sk, GFP_ATOMIC));
if (!tp-md5sig_info) {
kfree(newkey);
return -ENOMEM;
@@ -583,7 +584,8 @@ static int tcp_v6_md5_do_add(struct sock
tcp_alloc_md5sig_pool();
if (tp-md5sig_info-alloced6 == tp-md5sig_info-entries6) {
keys = kmalloc((sizeof (tp-md5sig_info-keys6[0]) *
-  (tp-md5sig_info-entries6 + 1)), 
GFP_ATOMIC);
+  (tp-md5sig_info-entries6 + 1)),
+  sk_allocation(sk, GFP_ATOMIC));
 
if (!keys) {
tcp_free_md5sig_pool();
@@ -709,7 +711,7 @@ static int tcp_v6_parse_md5_keys (struct
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *p;
 
-   p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL);
+   p = kzalloc(sizeof(struct tcp_md5sig_info), sk-sk_allocation

[PATCH 07/33] mm: serialize access to min_free_kbytes

2007-10-30 Thread Peter Zijlstra
There is a small race between the procfs caller and the memory hotplug caller
of setup_per_zone_pages_min(). Not a big deal, but the next patch will add yet
another caller. Time to close the gap.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 mm/page_alloc.c |   16 +---
 1 file changed, 13 insertions(+), 3 deletions(-)

Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -116,6 +116,7 @@ static char * const zone_names[MAX_NR_ZO
 Movable,
 };
 
+static DEFINE_SPINLOCK(min_free_lock);
 int min_free_kbytes = 1024;
 
 unsigned long __meminitdata nr_kernel_pages;
@@ -4162,12 +4163,12 @@ static void setup_per_zone_lowmem_reserv
 }
 
 /**
- * setup_per_zone_pages_min - called when min_free_kbytes changes.
+ * __setup_per_zone_pages_min - called when min_free_kbytes changes.
  *
  * Ensures that the pages_{min,low,high} values for each zone are set correctly
  * with respect to min_free_kbytes.
  */
-void setup_per_zone_pages_min(void)
+static void __setup_per_zone_pages_min(void)
 {
unsigned long pages_min = min_free_kbytes  (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
@@ -4222,6 +4223,15 @@ void setup_per_zone_pages_min(void)
calculate_totalreserve_pages();
 }
 
+void setup_per_zone_pages_min(void)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(min_free_lock, flags);
+   __setup_per_zone_pages_min();
+   spin_unlock_irqrestore(min_free_lock, flags);
+}
+
 /*
  * Initialise min_free_kbytes.
  *
@@ -4257,7 +4267,7 @@ static int __init init_per_zone_pages_mi
min_free_kbytes = 128;
if (min_free_kbytes  65536)
min_free_kbytes = 65536;
-   setup_per_zone_pages_min();
+   __setup_per_zone_pages_min();
setup_per_zone_lowmem_reserve();
return 0;
 }

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 11/33] mm: memory reserve management

2007-10-30 Thread Peter Zijlstra
Generic reserve management code. 

It provides methods to reserve and charge. Upon this, generic alloc/free style
reserve pools could be built, which could fully replace mempool_t
functionality.
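
As a rough usage sketch (not part of this patch; the 0-on-success return
convention of the charge routine and the "my-rx" node are assumptions for
illustration), based on the API declared in reserve.h below:

#include <linux/reserve.h>
#include <linux/slab.h>

static struct mem_reserve my_rx_reserve;

static int __init my_rx_reserve_init(void)
{
	int err;

	/* create a node and hook it into the reserve tree under the root */
	mem_reserve_init(&my_rx_reserve, "my-rx", NULL);
	err = mem_reserve_connect(&my_rx_reserve, &mem_reserve_root);
	if (err)
		return err;

	/* reserve enough pages to back 256KB worth of kmalloc() objects */
	return mem_reserve_kmalloc_set(&my_rx_reserve, 256 * 1024);
}

/* charge usage against the reserve before actually allocating */
static void *my_rx_alloc(size_t bytes)
{
	if (mem_reserve_kmalloc_charge(&my_rx_reserve, bytes, 0))
		return NULL;
	return kmalloc(bytes, GFP_ATOMIC | __GFP_MEMALLOC);
}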

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/reserve.h |   54 +
 mm/Makefile |2 
 mm/reserve.c|  436 
 3 files changed, 491 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/reserve.h
===
--- /dev/null
+++ linux-2.6/include/linux/reserve.h
@@ -0,0 +1,54 @@
+/*
+ * Memory reserve management.
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra [EMAIL PROTECTED]
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_RESERVE_H
+#define _LINUX_RESERVE_H
+
+#include linux/list.h
+#include linux/spinlock.h
+
+struct mem_reserve {
+   struct mem_reserve *parent;
+   struct list_head children;
+   struct list_head siblings;
+
+   const char *name;
+
+   long pages;
+   long limit;
+   long usage;
+   spinlock_t lock;/* protects limit and usage */
+};
+
+extern struct mem_reserve mem_reserve_root;
+
+void mem_reserve_init(struct mem_reserve *res, const char *name,
+ struct mem_reserve *parent);
+int mem_reserve_connect(struct mem_reserve *new_child,
+   struct mem_reserve *node);
+int mem_reserve_disconnect(struct mem_reserve *node);
+
+int mem_reserve_pages_set(struct mem_reserve *res, long pages);
+int mem_reserve_pages_add(struct mem_reserve *res, long pages);
+int mem_reserve_pages_charge(struct mem_reserve *res, long pages,
+int overcommit);
+
+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes);
+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes,
+  int overcommit);
+
+struct kmem_cache;
+
+int mem_reserve_kmem_cache_set(struct mem_reserve *res,
+  struct kmem_cache *s,
+  int objects);
+int mem_reserve_kmem_cache_charge(struct mem_reserve *res,
+ long objs,
+ int overcommit);
+
+#endif /* _LINUX_RESERVE_H */
Index: linux-2.6/mm/Makefile
===
--- linux-2.6.orig/mm/Makefile
+++ linux-2.6/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o
   page_alloc.o page-writeback.o pdflush.o \
   readahead.o swap.o truncate.o vmscan.o \
   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-  page_isolation.o $(mmu-y)
+  page_isolation.o reserve.o $(mmu-y)
 
 obj-$(CONFIG_BOUNCE)   += bounce.o
 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
Index: linux-2.6/mm/reserve.c
===
--- /dev/null
+++ linux-2.6/mm/reserve.c
@@ -0,0 +1,436 @@
+/*
+ * Memory reserve management.
+ *
+ *  Copyright (C) 2007, Red Hat, Inc., Peter Zijlstra [EMAIL PROTECTED]
+ *
+ * Description:
+ *
+ * Manage a set of memory reserves.
+ *
+ * A memory reserve is a reserve for a specified number of object of specified
+ * size. Since memory is managed in pages, this reserve demand is then
+ * translated into a page unit.
+ *
+ * So each reserve has a specified object limit, an object usage count and a
+ * number of pages required to back these objects.
+ *
+ * Usage is charged against a reserve, if the charge fails, the resource must
+ * not be allocated/used.
+ *
+ * The reserves are managed in a tree, and the resource demands (pages and
+ * limit) are propagated up the tree. Obviously the object limit will be
+ * meaningless as soon as the unit starts mixing, but the required page reserve
+ * (being of one unit) is still valid at the root.
+ *
+ * It is the page demand of the root node that is used to set the global
+ * reserve (adjust_memalloc_reserve() which sets zone-pages_emerg).
+ *
+ * As long as a subtree has the same usage unit, an aggregate node can be used
+ * to charge against, instead of the leaf nodes. However, do be consistent with
+ * who is charged, resource usage is not propagated up the tree (for
+ * performance reasons).
+ */
+
+#include linux/reserve.h
+#include linux/mutex.h
+#include linux/mmzone.h
+#include linux/log2.h
+#include linux/proc_fs.h
+#include linux/seq_file.h
+#include linux/module.h
+#include linux/slab.h
+
+static DEFINE_MUTEX(mem_reserve_mutex);
+
+/**
+ * @mem_reserve_root - the global reserve root
+ *
+ * The global reserve is empty, and has no limit unit, it merely
+ * acts as an aggregation point for reserves and an interface to
+ * adjust_memalloc_reserve().
+ */
+struct mem_reserve mem_reserve_root

[PATCH 21/33] netvm: prevent a TCP specific deadlock

2007-10-30 Thread Peter Zijlstra
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data
that we're over the global rmem limit. This will prevent SOCK_MEMALLOC buffers
from receiving data, which will prevent userspace from running, which is needed
to reduce the buffered data.

Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit.
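
In essence the change boils down to the following check (a simplified
illustration, not the literal patch text; skb_emergency() comes from the
earlier netvm patches):

/* should a buffer be refused memory when we are over the hard limit? */
static int sk_over_hard_limit_suppress(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(sk->sk_prot->memory_allocated) <=
	    sk->sk_prot->sysctl_mem[2])
		return 0;	/* not over the hard limit at all */

	sk->sk_prot->enter_memory_pressure();

	/* emergency (SOCK_MEMALLOC) skbs are exempt from the rmem limit,
	 * otherwise writeback completion could never be received */
	return !skb || !skb_emergency(skb);
}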

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/net/sock.h |7 ---
 net/core/stream.c  |5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

Index: linux-2.6/include/net/sock.h
===
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -743,7 +743,8 @@ static inline struct inode *SOCK_INODE(s
 }
 
 extern void __sk_stream_mem_reclaim(struct sock *sk);
-extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind);
+extern int sk_stream_mem_schedule(struct sock *sk, struct sk_buff *skb,
+   int size, int kind);
 
 #define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE)
 
@@ -761,13 +762,13 @@ static inline void sk_stream_mem_reclaim
 static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb)
 {
return (int)skb-truesize = sk-sk_forward_alloc ||
-   sk_stream_mem_schedule(sk, skb-truesize, 1);
+   sk_stream_mem_schedule(sk, skb, skb-truesize, 1);
 }
 
 static inline int sk_stream_wmem_schedule(struct sock *sk, int size)
 {
return size = sk-sk_forward_alloc ||
-  sk_stream_mem_schedule(sk, size, 0);
+  sk_stream_mem_schedule(sk, NULL, size, 0);
 }
 
 /* Used by processes to lock a socket state, so that
Index: linux-2.6/net/core/stream.c
===
--- linux-2.6.orig/net/core/stream.c
+++ linux-2.6/net/core/stream.c
@@ -207,7 +207,7 @@ void __sk_stream_mem_reclaim(struct sock
 
 EXPORT_SYMBOL(__sk_stream_mem_reclaim);
 
-int sk_stream_mem_schedule(struct sock *sk, int size, int kind)
+int sk_stream_mem_schedule(struct sock *sk, struct sk_buff *skb, int size, int 
kind)
 {
int amt = sk_stream_pages(size);
 
@@ -224,7 +224,8 @@ int sk_stream_mem_schedule(struct sock *
/* Over hard limit. */
if (atomic_read(sk-sk_prot-memory_allocated)  
sk-sk_prot-sysctl_mem[2]) {
sk-sk_prot-enter_memory_pressure();
-   goto suppress_allocation;
+   if (!skb || (skb  !skb_emergency(skb)))
+   goto suppress_allocation;
}
 
/* Under pressure. */

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 29/33] nfs: disable data cache revalidation for swapfiles

2007-10-30 Thread Peter Zijlstra
Do as Trond suggested:
  http://lkml.org/lkml/2006/8/25/348

Disable NFS data cache revalidation on swap files since it doesn't really 
make sense to have other clients change the file while you are using it.

Thereby we can stop setting PG_private on swap pages, since there ought to
be no further races with invalidate_inode_pages2() to deal with.

And since we cannot set PG_private we cannot use page->private (which is
already used by PG_swapcache pages anyway) to store the nfs_page. Thus
augment the new nfs_page_find_request logic.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/nfs/inode.c |6 
 fs/nfs/write.c |   73 ++---
 2 files changed, 65 insertions(+), 14 deletions(-)

Index: linux-2.6/fs/nfs/inode.c
===
--- linux-2.6.orig/fs/nfs/inode.c
+++ linux-2.6/fs/nfs/inode.c
@@ -744,6 +744,12 @@ int nfs_revalidate_mapping_nolock(struct
struct nfs_inode *nfsi = NFS_I(inode);
int ret = 0;
 
+   /*
+* swapfiles are not supposed to be shared.
+*/
+   if (IS_SWAPFILE(inode))
+   goto out;
+
if ((nfsi-cache_validity  NFS_INO_REVAL_PAGECACHE)
|| nfs_attribute_timeout(inode) || NFS_STALE(inode)) {
ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
Index: linux-2.6/fs/nfs/write.c
===
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -112,25 +112,62 @@ static void nfs_context_set_write_error(
set_bit(NFS_CONTEXT_ERROR_WRITE, ctx-flags);
 }
 
-static struct nfs_page *nfs_page_find_request_locked(struct page *page)
+static struct nfs_page *
+__nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page, int 
get)
 {
struct nfs_page *req = NULL;
 
-   if (PagePrivate(page)) {
+   if (PagePrivate(page))
req = (struct nfs_page *)page_private(page);
-   if (req != NULL)
-   kref_get(req-wb_kref);
-   }
+   else if (unlikely(PageSwapCache(page)))
+   req = radix_tree_lookup(nfsi-nfs_page_tree, 
page_file_index(page));
+
+   if (get  req)
+   kref_get(req-wb_kref);
+
return req;
 }
 
+static inline struct nfs_page *
+nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page)
+{
+   return __nfs_page_find_request_locked(nfsi, page, 1);
+}
+
+static int __nfs_page_has_request(struct page *page)
+{
+   struct inode *inode = page_file_mapping(page)-host;
+   struct nfs_page *req = NULL;
+
+   spin_lock(inode-i_lock);
+   req = __nfs_page_find_request_locked(NFS_I(inode), page, 0);
+   spin_unlock(inode-i_lock);
+
+   /*
+* hole here plugged by the caller holding onto PG_locked
+*/
+
+   return req != NULL;
+}
+
+static inline int nfs_page_has_request(struct page *page)
+{
+   if (PagePrivate(page))
+   return 1;
+
+   if (unlikely(PageSwapCache(page)))
+   return __nfs_page_has_request(page);
+
+   return 0;
+}
+
 static struct nfs_page *nfs_page_find_request(struct page *page)
 {
struct inode *inode = page_file_mapping(page)-host;
struct nfs_page *req = NULL;
 
spin_lock(inode-i_lock);
-   req = nfs_page_find_request_locked(page);
+   req = nfs_page_find_request_locked(NFS_I(inode), page);
spin_unlock(inode-i_lock);
return req;
 }
@@ -255,7 +292,7 @@ static int nfs_page_async_flush(struct n
 
spin_lock(inode-i_lock);
for(;;) {
-   req = nfs_page_find_request_locked(page);
+   req = nfs_page_find_request_locked(nfsi, page);
if (req == NULL) {
spin_unlock(inode-i_lock);
return 0;
@@ -374,8 +411,14 @@ static int nfs_inode_add_request(struct 
if (nfs_have_delegation(inode, FMODE_WRITE))
nfsi-change_attr++;
}
-   SetPagePrivate(req-wb_page);
-   set_page_private(req-wb_page, (unsigned long)req);
+   /*
+* Swap-space should not get truncated. Hence no need to plug the race
+* with invalidate/truncate.
+*/
+   if (likely(!PageSwapCache(req-wb_page))) {
+   SetPagePrivate(req-wb_page);
+   set_page_private(req-wb_page, (unsigned long)req);
+   }
nfsi-npages++;
kref_get(req-wb_kref);
return 0;
@@ -392,8 +435,10 @@ static void nfs_inode_remove_request(str
BUG_ON (!NFS_WBACK_BUSY(req));
 
spin_lock(inode-i_lock);
-   set_page_private(req-wb_page, 0);
-   ClearPagePrivate(req-wb_page);
+   if (likely(!PageSwapCache(req-wb_page))) {
+   set_page_private(req-wb_page, 0);
+   ClearPagePrivate(req-wb_page);
+   }
radix_tree_delete(nfsi-nfs_page_tree, req-wb_index

[PATCH 00/33] Swap over NFS -v14

2007-10-30 Thread Peter Zijlstra

Hi,

Another posting of the full swap over NFS series. 

[ I tried posting just the first part last time around, but
  that only caused more confusion for lack of a general picture ]

[ patches against 2.6.23-mm1, also to be found online at:
  http://programming.kicks-ass.net/kernel-patches/vm_deadlock/v2.6.23-mm1/ ]

The patch-set can be split into roughly 5 parts, for each of which I shall give
a description.


  Part 1, patches 1-12

The problem with swap over network is the generic swap problem: needing memory
to free memory. Normally this is solved using mempools, as can be seen in the
BIO layer.

Swap over network has the problem that the network subsystem does not use
fixed-size allocations, but heavily relies on kmalloc(). This makes mempools
unusable.

This first part provides a generic reserve framework.

Care is taken to only affect the slow paths - when we're low on memory.

Caveats: it is currently SLUB only.

 1 - mm: gfp_to_alloc_flags()
 2 - mm: tag reserve pages
 3 - mm: slub: add knowledge of reserve pages
 4 - mm: allow mempool to fall back to memalloc reserves
 5 - mm: kmem_estimate_pages()
 6 - mm: allow PF_MEMALLOC from softirq context
 7 - mm: serialize access to min_free_kbytes
 8 - mm: emergency pool
 9 - mm: system wide ALLOC_NO_WATERMARK
10 - mm: __GFP_MEMALLOC
11 - mm: memory reserve management
12 - selinux: tag avc cache alloc as non-critical


  Part 2, patches 13-15

Provide some generic network infrastructure needed later on.

13 - net: wrap sk-sk_backlog_rcv()
14 - net: packet split receive api
15 - net: sk_allocation() - concentrate socket related allocations


  Part 3, patches 16-23

Now that we have a generic memory reserve system, use it on the network stack.
The thing that makes this interesting is that, contrary to BIO, both the
transmit and receive paths require memory allocations.

That is, in the BIO layer write back completion is usually just an ISR flipping
a bit and waking stuff up. A network write back completion involves receiving
packets, which, when there is no memory, is rather hard. And even when there is
memory there is no guarantee that the required packet arrives in the window
that that memory buys us.

The solution to this problem is found in the fact that the network is to be
assumed lossy. Even now, when there is no memory to receive packets, the
network card will have to discard packets. What we do is move this into the
network stack.

So we reserve a little pool to act as a receive buffer, this allows us to
inspect packets before tossing them. This way, we can filter out those packets
that ensure progress (writeback completion) and disregard the others (as would
have happened anyway). [ NOTE: this is a stable mode of operation with limited
memory usage, exactly the kind of thing we need ]

Again, care is taken to keep much of the overhead of this to only affect the
slow path. Only packets allocated from the reserves will suffer the extra
atomic overhead needed for accounting.
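
Condensed, the receive path change this leads to (patch 23, quoted in the
discussion further down) looks roughly like this:

int netif_receive_skb(struct sk_buff *skb)
{
	unsigned long pflags = current->flags;
	int ret = NET_RX_DROP;

	/* emergency skbs may need the reserves while being processed;
	 * borrow PF_MEMALLOC for the duration and restore it afterwards */
	if (skb_emergency(skb))
		current->flags |= PF_MEMALLOC;

	/* ... deliver to the protocol handlers, skipping taps and
	 * non-SOCK_MEMALLOC sockets for emergency skbs ... */

	tsk_restore_flags(current, pflags, PF_MEMALLOC);
	return ret;
}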

16 - netvm: network reserve infrastructure
17 - sysctl: propagate conv errors
18 - netvm: INET reserves.
19 - netvm: hook skb allocation to reserves
20 - netvm: filter emergency skbs.
21 - netvm: prevent a TCP specific deadlock
22 - netfilter: NF_QUEUE vs emergency skbs
23 - netvm: skb processing


  Part 4, patches 24-26

Generic VM infrastructure to handle swapping to a filesystem instead of a block
device. The approach here has been questioned; people would like to see a less
invasive approach.

One suggestion is to create and use a_ops-swap_{in,out}().

24 - mm: prepare swap entry methods for use in page methods
25 - mm: add support for non block device backed swap files
26 - mm: methods for teaching filesystems about PG_swapcache pages


  Part 5, patches 27-33

Finally, convert NFS to make use of the new network and vm infrastructure to
provide swap over NFS.

27 - nfs: remove mempools
28 - nfs: teach the NFS client how to treat PG_swapcache pages
29 - nfs: disable data cache revalidation for swapfiles
30 - nfs: swap vs nfs_writepage
31 - nfs: enable swap on NFS
32 - nfs: fix various memory recursions possible with swap over NFS.
33 - nfs: do not warn on radix tree node allocation failures



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 27/33] nfs: remove mempools

2007-10-30 Thread Peter Zijlstra
With the introduction of the shared dirty page accounting in .19, NFS should
not be able to surprise the VM with all dirty pages. Thus it should always be
able to free some memory. Hence there is no more need for mempools.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/nfs/read.c  |   15 +++
 fs/nfs/write.c |   27 +--
 2 files changed, 8 insertions(+), 34 deletions(-)

Index: linux-2.6/fs/nfs/read.c
===
--- linux-2.6.orig/fs/nfs/read.c
+++ linux-2.6/fs/nfs/read.c
@@ -33,13 +33,10 @@ static const struct rpc_call_ops nfs_rea
 static const struct rpc_call_ops nfs_read_full_ops;
 
 static struct kmem_cache *nfs_rdata_cachep;
-static mempool_t *nfs_rdata_mempool;
-
-#define MIN_POOL_READ  (32)
 
 struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
 {
-   struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS);
+   struct nfs_read_data *p = kmem_cache_alloc(nfs_rdata_cachep, GFP_NOFS);
 
if (p) {
memset(p, 0, sizeof(*p));
@@ -50,7 +47,7 @@ struct nfs_read_data *nfs_readdata_alloc
else {
p-pagevec = kcalloc(pagecount, sizeof(struct page *), 
GFP_NOFS);
if (!p-pagevec) {
-   mempool_free(p, nfs_rdata_mempool);
+   kmem_cache_free(nfs_rdata_cachep, p);
p = NULL;
}
}
@@ -63,7 +60,7 @@ static void nfs_readdata_rcu_free(struct
struct nfs_read_data *p = container_of(head, struct nfs_read_data, 
task.u.tk_rcu);
if (p  (p-pagevec != p-page_array[0]))
kfree(p-pagevec);
-   mempool_free(p, nfs_rdata_mempool);
+   kmem_cache_free(nfs_rdata_cachep, p);
 }
 
 static void nfs_readdata_free(struct nfs_read_data *rdata)
@@ -597,16 +594,10 @@ int __init nfs_init_readpagecache(void)
if (nfs_rdata_cachep == NULL)
return -ENOMEM;
 
-   nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ,
-nfs_rdata_cachep);
-   if (nfs_rdata_mempool == NULL)
-   return -ENOMEM;
-
return 0;
 }
 
 void nfs_destroy_readpagecache(void)
 {
-   mempool_destroy(nfs_rdata_mempool);
kmem_cache_destroy(nfs_rdata_cachep);
 }
Index: linux-2.6/fs/nfs/write.c
===
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -28,9 +28,6 @@
 
 #define NFSDBG_FACILITYNFSDBG_PAGECACHE
 
-#define MIN_POOL_WRITE (32)
-#define MIN_POOL_COMMIT(4)
-
 /*
  * Local function declarations
  */
@@ -44,12 +41,10 @@ static const struct rpc_call_ops nfs_wri
 static const struct rpc_call_ops nfs_commit_ops;
 
 static struct kmem_cache *nfs_wdata_cachep;
-static mempool_t *nfs_wdata_mempool;
-static mempool_t *nfs_commit_mempool;
 
 struct nfs_write_data *nfs_commit_alloc(void)
 {
-   struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS);
+   struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS);
 
if (p) {
memset(p, 0, sizeof(*p));
@@ -63,7 +58,7 @@ static void nfs_commit_rcu_free(struct r
struct nfs_write_data *p = container_of(head, struct nfs_write_data, 
task.u.tk_rcu);
if (p  (p-pagevec != p-page_array[0]))
kfree(p-pagevec);
-   mempool_free(p, nfs_commit_mempool);
+   kmem_cache_free(nfs_wdata_cachep, p);
 }
 
 void nfs_commit_free(struct nfs_write_data *wdata)
@@ -73,7 +68,7 @@ void nfs_commit_free(struct nfs_write_da
 
 struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
 {
-   struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS);
+   struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS);
 
if (p) {
memset(p, 0, sizeof(*p));
@@ -84,7 +79,7 @@ struct nfs_write_data *nfs_writedata_all
else {
p-pagevec = kcalloc(pagecount, sizeof(struct page *), 
GFP_NOFS);
if (!p-pagevec) {
-   mempool_free(p, nfs_wdata_mempool);
+   kmem_cache_free(nfs_wdata_cachep, p);
p = NULL;
}
}
@@ -97,7 +92,7 @@ static void nfs_writedata_rcu_free(struc
struct nfs_write_data *p = container_of(head, struct nfs_write_data, 
task.u.tk_rcu);
if (p  (p-pagevec != p-page_array[0]))
kfree(p-pagevec);
-   mempool_free(p, nfs_wdata_mempool);
+   kmem_cache_free(nfs_wdata_cachep, p);
 }
 
 static void nfs_writedata_free(struct nfs_write_data *wdata)
@@ -1474,16 +1469,6 @@ int __init nfs_init_writepagecache(void)
if (nfs_wdata_cachep == NULL)
return -ENOMEM

[PATCH 31/33] nfs: enable swap on NFS

2007-10-30 Thread Peter Zijlstra
Provide an a_ops->swapfile() implementation for NFS. This will set the
NFS socket to SOCK_MEMALLOC and run socket reconnect under PF_MEMALLOC as well
as reset SOCK_MEMALLOC before engaging the protocol ->connect() method.

PF_MEMALLOC should allow the allocation of struct socket and related objects
and the early (re)setting of SOCK_MEMALLOC should allow us to receive the
packets required for the TCP connection buildup.

(swapping continues over a server reset during heavy network traffic)
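
The xprtsock.c hunks are truncated below; roughly, the new xs_swapper()
helper is expected to look something like the following sketch (not the
literal patch text - the reconnect handling is omitted and the
sk_clear_memalloc() name on the disable path is an assumption):

int xs_swapper(struct rpc_xprt *xprt, int enable)
{
	struct sock_xprt *transport =
		container_of(xprt, struct sock_xprt, xprt);

	/* remember that this transport backs swap ... */
	xprt->swapper = !!enable;

	/* ... and flag any existing socket so it may use the reserves */
	if (transport->inet) {
		if (enable)
			sk_set_memalloc(transport->inet);
		else
			sk_clear_memalloc(transport->inet);
	}
	return 0;
}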

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/Kconfig  |   18 
 fs/nfs/file.c   |   10 ++
 include/linux/sunrpc/xprt.h |5 ++-
 net/sunrpc/sched.c  |9 --
 net/sunrpc/xprtsock.c   |   63 
 5 files changed, 102 insertions(+), 3 deletions(-)

Index: linux-2.6/fs/nfs/file.c
===
--- linux-2.6.orig/fs/nfs/file.c
+++ linux-2.6/fs/nfs/file.c
@@ -371,6 +371,13 @@ static int nfs_launder_page(struct page 
return nfs_wb_page(page_file_mapping(page)-host, page);
 }
 
+#ifdef CONFIG_NFS_SWAP
+static int nfs_swapfile(struct address_space *mapping, int enable)
+{
+   return xs_swapper(NFS_CLIENT(mapping-host)-cl_xprt, enable);
+}
+#endif
+
 const struct address_space_operations nfs_file_aops = {
.readpage = nfs_readpage,
.readpages = nfs_readpages,
@@ -385,6 +392,9 @@ const struct address_space_operations nf
.direct_IO = nfs_direct_IO,
 #endif
.launder_page = nfs_launder_page,
+#ifdef CONFIG_NFS_SWAP
+   .swapfile = nfs_swapfile,
+#endif
 };
 
 static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
Index: linux-2.6/include/linux/sunrpc/xprt.h
===
--- linux-2.6.orig/include/linux/sunrpc/xprt.h
+++ linux-2.6/include/linux/sunrpc/xprt.h
@@ -143,7 +143,9 @@ struct rpc_xprt {
unsigned intmax_reqs;   /* total slots */
unsigned long   state;  /* transport state */
unsigned char   shutdown   : 1, /* being shut down */
-   resvport   : 1; /* use a reserved port */
+   resvport   : 1, /* use a reserved port */
+   swapper: 1; /* we're swapping over this
+  transport */
unsigned intbind_index; /* bind function index */
 
/*
@@ -246,6 +248,7 @@ struct rpc_rqst *   xprt_lookup_rqst(struc
 void   xprt_complete_rqst(struct rpc_task *task, int copied);
 void   xprt_release_rqst_cong(struct rpc_task *task);
 void   xprt_disconnect(struct rpc_xprt *xprt);
+intxs_swapper(struct rpc_xprt *xprt, int enable);
 
 /*
  * Reserved bit positions in xprt-state
Index: linux-2.6/net/sunrpc/sched.c
===
--- linux-2.6.orig/net/sunrpc/sched.c
+++ linux-2.6/net/sunrpc/sched.c
@@ -761,7 +761,10 @@ struct rpc_buffer {
 void *rpc_malloc(struct rpc_task *task, size_t size)
 {
struct rpc_buffer *buf;
-   gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT;
+   gfp_t gfp = GFP_NOWAIT;
+
+   if (RPC_IS_SWAPPER(task))
+   gfp |= __GFP_MEMALLOC;
 
size += sizeof(struct rpc_buffer);
if (size = RPC_BUFFER_MAXSIZE)
@@ -817,6 +820,8 @@ void rpc_init_task(struct rpc_task *task
atomic_set(task-tk_count, 1);
task-tk_client = clnt;
task-tk_flags  = flags;
+   if (clnt-cl_xprt-swapper)
+   task-tk_flags |= RPC_TASK_SWAPPER;
task-tk_ops = tk_ops;
if (tk_ops-rpc_call_prepare != NULL)
task-tk_action = rpc_prepare_task;
@@ -853,7 +858,7 @@ void rpc_init_task(struct rpc_task *task
 static struct rpc_task *
 rpc_alloc_task(void)
 {
-   return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS);
+   return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
 }
 
 static void rpc_free_task(struct rcu_head *rcu)
Index: linux-2.6/net/sunrpc/xprtsock.c
===
--- linux-2.6.orig/net/sunrpc/xprtsock.c
+++ linux-2.6/net/sunrpc/xprtsock.c
@@ -1397,6 +1397,9 @@ static void xs_udp_finish_connecting(str
transport-sock = sock;
transport-inet = sk;
 
+   if (xprt-swapper)
+   sk_set_memalloc(sk);
+
write_unlock_bh(sk-sk_callback_lock);
}
xs_udp_do_set_buffer_size(xprt);
@@ -1414,11 +1417,15 @@ static void xs_udp_connect_worker4(struc
container_of(work, struct sock_xprt, connect_worker.work);
struct rpc_xprt *xprt = transport-xprt;
struct socket *sock = transport-sock;
+   unsigned long pflags

[PATCH 04/33] mm: allow mempool to fall back to memalloc reserves

2007-10-30 Thread Peter Zijlstra
Allow the mempool to use the memalloc reserves when all else fails and
the allocation context would otherwise allow it.
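
As a usage illustration (not part of the patch): callers need no changes;
a pool user running in an entitled context (PF_MEMALLOC, or a later
__GFP_MEMALLOC) simply gets an element again once the pool retries the
allocator with __GFP_NOMEMALLOC cleared. For example:

#include <linux/mempool.h>
#include <linux/errno.h>

static int example(void)
{
	mempool_t *pool = mempool_create_kmalloc_pool(4, 512);
	void *elem;

	if (!pool)
		return -ENOMEM;

	/* with this patch, when the pool is empty and the calling context
	 * is entitled to the reserves, this may now be satisfied from the
	 * memalloc reserves instead of blocking or failing */
	elem = mempool_alloc(pool, GFP_NOIO);
	if (elem)
		mempool_free(elem, pool);

	mempool_destroy(pool);
	return 0;
}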

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 mm/mempool.c |   12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

Index: linux-2.6/mm/mempool.c
===
--- linux-2.6.orig/mm/mempool.c
+++ linux-2.6/mm/mempool.c
@@ -14,6 +14,7 @@
 #include linux/mempool.h
 #include linux/blkdev.h
 #include linux/writeback.h
+#include internal.h
 
 static void add_element(mempool_t *pool, void *element)
 {
@@ -204,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, gf
void *element;
unsigned long flags;
wait_queue_t wait;
-   gfp_t gfp_temp;
+   gfp_t gfp_temp, gfp_orig = gfp_mask;
 
might_sleep_if(gfp_mask  __GFP_WAIT);
 
@@ -228,6 +229,15 @@ repeat_alloc:
}
spin_unlock_irqrestore(pool-lock, flags);
 
+   /* if we really had right to the emergency reserves try those */
+   if (gfp_to_alloc_flags(gfp_orig)  ALLOC_NO_WATERMARKS) {
+   if (gfp_temp  __GFP_NOMEMALLOC) {
+   gfp_temp = ~(__GFP_NOMEMALLOC|__GFP_NOWARN);
+   goto repeat_alloc;
+   } else
+   gfp_temp |= __GFP_NOMEMALLOC|__GFP_NOWARN;
+   }
+
/* We must not sleep in the GFP_ATOMIC case */
if (!(gfp_mask  __GFP_WAIT))
return NULL;

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 06/33] mm: allow PF_MEMALLOC from softirq context

2007-10-30 Thread Peter Zijlstra
Allow PF_MEMALLOC to be set in softirq context. When running softirqs from
a borrowed context, save current->flags; ksoftirqd will have its own
task_struct.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sched.h |4 
 kernel/softirq.c  |3 +++
 mm/page_alloc.c   |7 ---
 3 files changed, 11 insertions(+), 3 deletions(-)

Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1557,9 +1557,10 @@ int gfp_to_alloc_flags(gfp_t gfp_mask)
alloc_flags |= ALLOC_HARDER;
 
if (likely(!(gfp_mask  __GFP_NOMEMALLOC))) {
-   if (!in_interrupt() 
-   ((p-flags  PF_MEMALLOC) ||
-unlikely(test_thread_flag(TIF_MEMDIE
+   if (!in_irq()  (p-flags  PF_MEMALLOC))
+   alloc_flags |= ALLOC_NO_WATERMARKS;
+   else if (!in_interrupt() 
+   unlikely(test_thread_flag(TIF_MEMDIE)))
alloc_flags |= ALLOC_NO_WATERMARKS;
}
 
Index: linux-2.6/kernel/softirq.c
===
--- linux-2.6.orig/kernel/softirq.c
+++ linux-2.6/kernel/softirq.c
@@ -211,6 +211,8 @@ asmlinkage void __do_softirq(void)
__u32 pending;
int max_restart = MAX_SOFTIRQ_RESTART;
int cpu;
+   unsigned long pflags = current-flags;
+   current-flags = ~PF_MEMALLOC;
 
pending = local_softirq_pending();
account_system_vtime(current);
@@ -249,6 +251,7 @@ restart:
 
account_system_vtime(current);
_local_bh_enable();
+   tsk_restore_flags(current, pflags, PF_MEMALLOC);
 }
 
 #ifndef __ARCH_HAS_DO_SOFTIRQ
Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -1389,6 +1389,10 @@ static inline void put_task_struct(struc
 #define tsk_used_math(p) ((p)-flags  PF_USED_MATH)
 #define used_math() tsk_used_math(current)
 
+#define tsk_restore_flags(p, pflags, mask) \
+   do {(p)-flags = ~(mask); \
+   (p)-flags |= ((pflags)  (mask)); } while (0)
+
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask);
 #else

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 12/33] selinux: tag avc cache alloc as non-critical

2007-10-30 Thread Peter Zijlstra
Failing to allocate a cache entry will only harm performance, not correctness.
Do not consume valuable reserve pages for something like that.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
Acked-by: James Morris [EMAIL PROTECTED]
---
 security/selinux/avc.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: linux-2.6-2/security/selinux/avc.c
===
--- linux-2.6-2.orig/security/selinux/avc.c
+++ linux-2.6-2/security/selinux/avc.c
@@ -334,7 +334,7 @@ static struct avc_node *avc_alloc_node(v
 {
struct avc_node *node;
 
-   node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC);
+   node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC);
if (!node)
goto out;
 

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 18/33] netvm: INET reserves.

2007-10-30 Thread Peter Zijlstra
Add reserves for INET.

The two big users seem to be the route cache and ip-fragment cache.

Reserve the route cache under the generic RX reserve; its usage is bounded by
the high reclaim watermark, and thus does not need further accounting.

Reserve the ip-fragment caches under the SKB data reserve; these add to the
SKB RX limit. By ensuring we can at least receive as much data as fits in
the reassembly line we avoid fragment attack deadlocks.

Use proc conv() routines to update these limits and return -ENOMEM to user
space.

Adds to the reserve tree:

  total network reserve  
network TX reserve   
  protocol TX pages  
network RX reserve   
+ IPv6 route cache   
+ IPv4 route cache   
  SKB data reserve   
+   IPv6 fragment cache  
+   IPv4 fragment cache  
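
In code, hooking a cache into that tree is just a matter of connecting a
reserve node and setting its demand. Roughly (illustrative only; the variable
name used here for the "SKB data reserve" parent node is an assumption, and
the real ip_fragment.c hunks are truncated below):

#include <linux/reserve.h>

struct mem_reserve ipv4_frag_reserve;
extern struct mem_reserve mem_reserve_skb_data;	/* assumed name of the
						   "SKB data reserve" node */

static int __init ipfrag_reserve_init(void)
{
	mem_reserve_init(&ipv4_frag_reserve, "IPv4 fragment cache", NULL);
	mem_reserve_connect(&ipv4_frag_reserve, &mem_reserve_skb_data);

	/* back the reassembly high threshold with reserve pages */
	return mem_reserve_kmalloc_set(&ipv4_frag_reserve,
				       sysctl_ipfrag_high_thresh);
}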

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 include/linux/sysctl.h |   11 +++
 kernel/sysctl.c|8 ++--
 net/ipv4/ip_fragment.c |7 +++
 net/ipv4/route.c   |   30 +-
 net/ipv4/sysctl_net_ipv4.c |   24 +++-
 net/ipv6/reassembly.c  |7 +++
 net/ipv6/route.c   |   31 ++-
 net/ipv6/sysctl_net_ipv6.c |   24 +++-
 8 files changed, 136 insertions(+), 6 deletions(-)

Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c
===
--- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c
+++ linux-2.6/net/ipv4/sysctl_net_ipv4.c
@@ -18,6 +18,7 @@
 #include net/route.h
 #include net/tcp.h
 #include net/cipso_ipv4.h
+#include linux/reserve.h
 
 /* From af_inet.c */
 extern int sysctl_ip_nonlocal_bind;
@@ -186,6 +187,27 @@ static int strategy_allowed_congestion_c
 
 }
 
+extern struct mem_reserve ipv4_frag_reserve;
+
+static int do_proc_dointvec_fragment_conv(int *negp, unsigned long *lvalp,
+int *valp, int write, void *data)
+{
+   if (write) {
+   long value = *negp ? -*lvalp : *lvalp;
+   int err = mem_reserve_kmalloc_set(ipv4_frag_reserve, value);
+   if (err)
+   return err;
+   }
+   return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+}
+
+static int proc_dointvec_fragment(ctl_table *table, int write, struct file 
*filp,
+void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+   return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+   do_proc_dointvec_fragment_conv, NULL);
+}
+
 ctl_table ipv4_table[] = {
{
.ctl_name   = NET_IPV4_TCP_TIMESTAMPS,
@@ -291,7 +313,7 @@ ctl_table ipv4_table[] = {
.data   = sysctl_ipfrag_high_thresh,
.maxlen = sizeof(int),
.mode   = 0644,
-   .proc_handler   = proc_dointvec
+   .proc_handler   = proc_dointvec_fragment
},
{
.ctl_name   = NET_IPV4_IPFRAG_LOW_THRESH,
Index: linux-2.6/net/ipv6/sysctl_net_ipv6.c
===
--- linux-2.6.orig/net/ipv6/sysctl_net_ipv6.c
+++ linux-2.6/net/ipv6/sysctl_net_ipv6.c
@@ -12,9 +12,31 @@
 #include net/ndisc.h
 #include net/ipv6.h
 #include net/addrconf.h
+#include linux/reserve.h
 
 #ifdef CONFIG_SYSCTL
 
+extern struct mem_reserve ipv6_frag_reserve;
+
+static int do_proc_dointvec_fragment_conv(int *negp, unsigned long *lvalp,
+int *valp, int write, void *data)
+{
+   if (write) {
+   long value = *negp ? -*lvalp : *lvalp;
+   int err = mem_reserve_kmalloc_set(ipv6_frag_reserve, value);
+   if (err)
+   return err;
+   }
+   return do_proc_dointvec_conv(negp, lvalp, valp, write, data);
+}
+
+static int proc_dointvec_fragment(ctl_table *table, int write, struct file 
*filp,
+void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+   return do_proc_dointvec(table, write, filp, buffer, lenp, ppos,
+   do_proc_dointvec_fragment_conv, NULL);
+}
+
 static ctl_table ipv6_table[] = {
{
.ctl_name   = NET_IPV6_ROUTE,
@@ -44,7 +66,7 @@ static ctl_table ipv6_table[] = {
.data   = sysctl_ip6frag_high_thresh,
.maxlen = sizeof(int),
.mode   = 0644,
-   .proc_handler   = proc_dointvec
+   .proc_handler   = proc_dointvec_fragment
},
{
.ctl_name   = NET_IPV6_IP6FRAG_LOW_THRESH,
Index: linux-2.6/net/ipv4/ip_fragment.c
===
--- linux-2.6.orig/net/ipv4/ip_fragment.c
+++ linux-2.6/net/ipv4/ip_fragment.c
@@ -43,6 +43,7 @@
 #include linux/udp.h
 #include linux/inet.h
 #include linux/netfilter_ipv4.h
+#include linux/reserve.h

[PATCH 33/33] nfs: do not warn on radix tree node allocation failures

2007-10-30 Thread Peter Zijlstra
GFP_ATOMIC failures are rather common, so do not warn about them.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 fs/nfs/inode.c |2 +-
 fs/nfs/write.c |   10 ++
 2 files changed, 11 insertions(+), 1 deletion(-)

Index: linux-2.6/fs/nfs/inode.c
===
--- linux-2.6.orig/fs/nfs/inode.c
+++ linux-2.6/fs/nfs/inode.c
@@ -1172,7 +1172,7 @@ static void init_once(struct kmem_cache 
INIT_LIST_HEAD(nfsi-open_files);
INIT_LIST_HEAD(nfsi-access_cache_entry_lru);
INIT_LIST_HEAD(nfsi-access_cache_inode_lru);
-   INIT_RADIX_TREE(nfsi-nfs_page_tree, GFP_ATOMIC);
+   INIT_RADIX_TREE(nfsi-nfs_page_tree, GFP_ATOMIC|__GFP_NOWARN);
nfsi-ncommit = 0;
nfsi-npages = 0;
nfs4_init_once(nfsi);
Index: linux-2.6/fs/nfs/write.c
===
--- linux-2.6.orig/fs/nfs/write.c
+++ linux-2.6/fs/nfs/write.c
@@ -652,6 +652,7 @@ static struct nfs_page * nfs_update_requ
struct inode *inode = mapping-host;
struct nfs_page *req, *new = NULL;
pgoff_t rqend, end;
+   int error;
 
end = offset + bytes;
 
@@ -659,6 +660,10 @@ static struct nfs_page * nfs_update_requ
/* Loop over all inode entries and see if we find
 * A request for the page we wish to update
 */
+   error = radix_tree_preload(GFP_NOIO);
+   if (error)
+   return ERR_PTR(error);
+
spin_lock(inode-i_lock);
req = nfs_page_find_request_locked(NFS_I(inode), page);
if (req) {
@@ -666,6 +671,7 @@ static struct nfs_page * nfs_update_requ
int error;
 
spin_unlock(inode-i_lock);
+   radix_tree_preload_end();
error = nfs_wait_on_request(req);
nfs_release_request(req);
if (error  0) {
@@ -676,6 +682,7 @@ static struct nfs_page * nfs_update_requ
continue;
}
spin_unlock(inode-i_lock);
+   radix_tree_preload_end();
if (new)
nfs_release_request(new);
break;
@@ -687,13 +694,16 @@ static struct nfs_page * nfs_update_requ
error = nfs_inode_add_request(inode, new);
if (error) {
spin_unlock(inode-i_lock);
+   radix_tree_preload_end();
nfs_unlock_request(new);
return ERR_PTR(error);
}
spin_unlock(inode-i_lock);
+   radix_tree_preload_end();
return new;
}
spin_unlock(inode-i_lock);
+   radix_tree_preload_end();
 
new = nfs_create_request(ctx, inode, page, offset, bytes);
if (IS_ERR(new))

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 09/33] mm: system wide ALLOC_NO_WATERMARK

2007-10-30 Thread Peter Zijlstra
Change ALLOC_NO_WATERMARKS page allocation such that the reserves are system
wide - which they are per setup_per_zone_pages_min(). When we scrape the
barrel, do it properly.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 mm/page_alloc.c |6 ++
 1 file changed, 6 insertions(+)

Index: linux-2.6/mm/page_alloc.c
===
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -1638,6 +1638,12 @@ restart:
 rebalance:
if (alloc_flags  ALLOC_NO_WATERMARKS) {
 nofail_alloc:
+   /*
+* break out of mempolicy boundaries
+*/
+   zonelist = NODE_DATA(numa_node_id())-node_zonelists +
+   gfp_zone(gfp_mask);
+
/* go through the zonelist yet again, ignoring mins */
page = get_page_from_freelist(gfp_mask, order, zonelist,
ALLOC_NO_WATERMARKS);

--

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC] Create kinst/ or ki/ directory ?

2007-10-30 Thread Peter Zijlstra
On Tue, 2007-10-30 at 13:24 -0400, Mathieu Desnoyers wrote:
 * Jeff Garzik ([EMAIL PROTECTED]) wrote:
 ...
  Pick a shorter word like probes or profile or what... or better yet... 
  just leave most things in their current directories.
 ...
 
 
 How about something along the
 
 kinst or ki
 
 lines ?
 
 (for kernel instrumentation)

I think I'm with jgarzik on this; let's not do this until it's clear where
the generalized instrumentation is going.

That is, i386/x86_64 -> x86 was part of a full integration plan, one
that was immediately followed up by a series of integration patches.

With this, I see no such plan. Please draft this generic instrumentation
you talk about; if after that we all like it, we can go about moving files
with the immediate purpose of integrating them.



-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 23/33] netvm: skb processing

2007-10-30 Thread Peter Zijlstra
On Tue, 2007-10-30 at 14:26 -0700, Stephen Hemminger wrote:
 On Tue, 30 Oct 2007 17:04:24 +0100
 Peter Zijlstra [EMAIL PROTECTED] wrote:
 
  In order to make sure emergency packets receive all memory needed to proceed
  ensure processing of emergency SKBs happens under PF_MEMALLOC.
  
  Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog 
  processing.
  
  Skip taps, since those are user-space again.
  
  Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
  ---
   include/net/sock.h |5 +
   net/core/dev.c |   44 ++--
   net/core/sock.c|   18 ++
   3 files changed, 61 insertions(+), 6 deletions(-)
  
  Index: linux-2.6/net/core/dev.c
  ===
  --- linux-2.6.orig/net/core/dev.c
  +++ linux-2.6/net/core/dev.c
  @@ -1976,10 +1976,23 @@ int netif_receive_skb(struct sk_buff *sk
  struct net_device *orig_dev;
  int ret = NET_RX_DROP;
  __be16 type;
  +   unsigned long pflags = current-flags;
  +
  +   /* Emergency skb are special, they should
  +*  - be delivered to SOCK_MEMALLOC sockets only
  +*  - stay away from userspace
  +*  - have bounded memory usage
  +*
  +* Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
  +* This saves us from propagating the allocation context down to all
  +* allocation sites.
  +*/
  +   if (skb_emergency(skb))
  +   current-flags |= PF_MEMALLOC;
   
  /* if we've gotten here through NAPI, check netpoll */
  if (netpoll_receive_skb(skb))
  -   return NET_RX_DROP;
  +   goto out;
 
 Why the change? doesn't gcc optimize the common exit case anyway?

It needs to unset PF_MEMALLOC at the exit.

  @@ -2029,19 +2046,31 @@ int netif_receive_skb(struct sk_buff *sk
   
  if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
  kfree_skb(skb);
  -   goto out;
  +   goto unlock;
  }
   
  skb-tc_verd = 0;
   ncls:
   #endif
   
  +   if (skb_emergency(skb))
  +   switch(skb-protocol) {
  +   case __constant_htons(ETH_P_ARP):
  +   case __constant_htons(ETH_P_IP):
  +   case __constant_htons(ETH_P_IPV6):
  +   case __constant_htons(ETH_P_8021Q):
  +   break;
 
 Indentation is wrong, and hard coding protocol values as spcial case
 seems bad here. What about vlan's, etc?

The other protocols need analysis of what memory allocations occur
during packet processing; if anything is done that is not yet accounted
for (skb, route cache) then that needs to be added to a reserve, and if
there are any paths that could touch user-space, those need to be
handled.

I've started looking at a few others, but it's hard and tedious work if
one is not familiar with the protocols.


  @@ -2063,8 +2093,10 @@ ncls:
  ret = NET_RX_DROP;
  }
   
  -out:
  +unlock:
  rcu_read_unlock();
  +out:
  +   tsk_restore_flags(current, pflags, PF_MEMALLOC);
  return ret;
   }

It's that tsk_restore_flags() there that requires the s/return/goto/
change you noted earlier.

 I am still not convinced that this solves the problem well enough
 to be useful.  Can you really survive a heavy memory overcommit?

On a machine with mem=128M, I've run 4 processes of 64M each, 2 file-backed
with the files on NFS, 2 anonymous. The processes just cycle through the
memory using writes. This is a 100% overcommit.

During these tests I've run various network loads.

I've shut down the NFS server, waited for say 15 minutes, and restarted
the NFS server, and the machine came back up and continued.

 In other words, can you prove that the added complexity causes the system
 to survive a real test where otherwise it would not?

I've put some statistics in the skb reserve allocations; those are most
definitely used. I'm quite certain the machine would lock up solid
without it.

-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 00/33] Swap over NFS -v14

2007-10-31 Thread Peter Zijlstra
On Tue, 2007-10-30 at 21:37 -0700, David Miller wrote:
 From: Nick Piggin [EMAIL PROTECTED]
 Date: Wed, 31 Oct 2007 14:26:32 +1100
 
  Is it really worth all the added complexity of making swap
  over NFS files work, given that you could use a network block
  device instead?
 
 Don't be misled.  Swapping over NFS is just a scarecrow for the
 seemingly real impetus behind these changes which is network storage
 stuff like iSCSI.

Not quite; yes, iSCSI is also on the 'want' list of quite a few people,
but swap over NFS on its own is also a feature in great demand.


signature.asc
Description: This is a digitally signed message part


Re: aim7 -30% regression in 2.6.24-rc1

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 17:57 +0800, Zhang, Yanmin wrote:
 On Tue, 2007-10-30 at 16:36 +0800, Zhang, Yanmin wrote:
  On Tue, 2007-10-30 at 08:26 +0100, Ingo Molnar wrote:
   * Zhang, Yanmin [EMAIL PROTECTED] wrote:
   
sub-bisecting captured patch 
38ad464d410dadceda1563f36bdb0be7fe4c8938(sched: uniform tunings) 
caused 20% regression of aim7.

The last 10% should be also related to sched parameters, such like 
sysctl_sched_min_granularity.
   
   ah, interesting. Since you have CONFIG_SCHED_DEBUG enabled, could you 
   please try to figure out what the best value for 
   /proc/sys/kernel_sched_latency, /proc/sys/kernel_sched_nr_latency and 
   /proc/sys/kernel_sched_min_granularity is?
   
   there's a tuning constraint for kernel_sched_nr_latency: 
   
   - kernel_sched_nr_latency should always be set to 
 kernel_sched_latency/kernel_sched_min_granularity. (it's not a free 
 tunable)
   
   i suspect a good approach would be to double the value of 
   kernel_sched_latency and kernel_sched_nr_latency in each tuning 
   iteration, while keeping kernel_sched_min_granularity unchanged. That 
   will exercise the tuning values of the 2.6.23 kernel as well.
  I followed your idea to test 2.6.24-rc1. The improvement is slow.
  When sched_nr_latency=2560 and sched_latency_ns=64000, the performance
  is still about 15% less than 2.6.23.
 
 I got the aim7 30% regression on my new upgraded Stoakley machine. I found
 this machine is slower than the old one. Maybe the BIOS has issues, or the
 memory (might not be dual-channel?) is slow. So I retested it on the old
 Stoakley machine and found that on the old Stoakley machine the regression
 is about 6%, quite similar to the regression on the Tigerton machine.
 
 With sched_nr_latency=640 and sched_latency_ns=64000 on the old Stoakley
 machine, the regression becomes about 2%. Other latency values regress more.
 
 On my tulsa machine, by sched_nr_latency=640 and sched_latency_ns=64000,
 the regression becomes less than 1% (The original regression is about 20%).
 
 When I ran a bad script to change the values of sched_nr_latency and 
 sched_latency_ns,
 I hit OOPS on my tulsa machine. Below is the log. It looks like 
 sched_nr_latency becomes
 0.

Oops, yeah, I think I overlooked that case :-/
I think limiting the sysctl parameters makes the most sense, as a 0 value
really doesn't make any.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b4efbe..0f34c91 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -94,6 +94,7 @@ static int two = 2;
 
 static int zero;
 static int one_hundred = 100;
+static int int_max = INT_MAX;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -239,7 +240,10 @@ static struct ctl_table kern_table[] = {
.data   = sysctl_sched_nr_latency,
.maxlen = sizeof(unsigned int),
.mode   = 0644,
-   .proc_handler   = proc_dointvec,
+   .proc_handler   = proc_dointvec_minmax,
+   .strategy   = sysctl_intvec,
+   .extra1 = one,
+   .extra2 = int_max,
},
{
.ctl_name   = CTL_UNNUMBERED,



signature.asc
Description: This is a digitally signed message part


Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 14:37 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
  Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
  contexts that are entitled to it.
 
  Care is taken to only touch the SLUB slow path.
 
  This is done to ensure reserve pages don't leak out and get consumed.
 
 I think this is generally a good idea (to prevent slab allocators
 from stealing reserve). However I naively think the implementation
 is a bit overengineered and thus has a few holes.
 
 Humour me, what was the problem with failing the slab allocation
 (actually, not fail but just call into the page allocator to do
 correct waiting  / reclaim) in the slowpath if the process fails the
 watermark checks?

Ah, we actually need slabs below the watermarks. It's just that once I've
allocated those slabs using __GFP_MEMALLOC/PF_MEMALLOC, I don't want
allocation contexts that do not have rights to those pages to walk off
with objects.

So, this generic reserve framework still uses the slab allocator to
provide certain kinds of objects (kmalloc, kmem_cache_alloc); it just
separates the contexts that are entitled to the reserves from those that
are not.


signature.asc
Description: This is a digitally signed message part


Re: [PATCH 05/33] mm: kmem_estimate_pages()

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 14:43 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
  Provide a method to get the upper bound on the pages needed to allocate
  a given number of objects from a given kmem_cache.
 
 
 Fair enough, but just to make it a bit easier, can you provide a
 little reason of why in this patch (or reference the patch number
 where you use it, or put it together with the patch where you use
 it, etc.).

A generic reserve framework, as seen in patch 11/33, needs to be able to
convert an object demand (kmalloc() bytes, kmem_cache_alloc() objects)
into a page reserve.
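
For example (illustrative only; the exact kmem_estimate_pages() prototype is
not quoted in this mail, so the argument order here is an assumption, and the
cache/reserve names are placeholders):

static struct kmem_cache *my_cachep;	/* example cache */
static struct mem_reserve my_reserve;	/* example reserve node */

static int my_reserve_setup(void)
{
	int pages;

	/* upper bound on the pages needed to back 512 objects from this
	 * cache (assumed signature: cache, gfp flags, object count) */
	pages = kmem_estimate_pages(my_cachep, GFP_ATOMIC, 512);

	return mem_reserve_pages_add(&my_reserve, pages);
}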



signature.asc
Description: This is a digitally signed message part


Re: [PATCH 06/33] mm: allow PF_MEMALLOC from softirq context

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 14:51 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
  Allow PF_MEMALLOC to be set in softirq context. When running softirqs from
  a borrowed context save current-flags, ksoftirqd will have its own
  task_struct.
 
 
 What's this for? Why would ksoftirqd pick up PF_MEMALLOC? (I guess
 that some networking thing must be picking it up in a subsequent patch,
 but I'm too lazy to look!)... Again, can you have more of a rationale in
 your patch headers, or ref the patch that uses it... thanks

Right, I knew I was forgetting something in these changelogs.

The network stack does quite a bit of packet processing from softirq
context. Once you start swapping over the network, some of the packets
want to be processed under PF_MEMALLOC.

See patch 23/33.


signature.asc
Description: This is a digitally signed message part


Re: [PATCH 09/33] mm: system wide ALLOC_NO_WATERMARK

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 14:52 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
  Change ALLOC_NO_WATERMARK page allocation such that the reserves are system
  wide - which they are per setup_per_zone_pages_min(), when we scrape the
  barrel, do it properly.
 
 
 IIRC it's actually not too uncommon to have allocations coming here via
 page reclaim. It's not exactly clear that you want to break mempolicies
 at this point.

Hmm, the way I see it, mempolicies are mainly for user-space
allocations, while reserve allocations are always kernel allocations. These
already break mempolicies - for example hardirq context allocations.

Also, as it stands, the reserve is spread out evenly over all
zones/nodes (excluding highmem), so by restricting ourselves to a
subset, we don't have access to the full reserve.



signature.asc
Description: This is a digitally signed message part


Re: [PATCH 00/33] Swap over NFS -v14

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 08:50 +, Christoph Hellwig wrote:
 On Tue, Oct 30, 2007 at 09:37:53PM -0700, David Miller wrote:
  Don't be misled.  Swapping over NFS is just a scarecrow for the
  seemingly real impetus behind these changes which is network storage
  stuff like iSCSI.
 
 So can we please do swap over network storage only first?  All these
 VM bits look conceptually sane to me, while the changes to the swap
 code to support nfs are real crackpipe material.

Yeah, I know how you stand on that. I just wanted to post all this
before going off into the woods reworking it all.

 Then again doing
 that part properly by adding address_space methods for swap I/O without
 the abuse might be a really good idea, especially as the way we
 do swapfiles on block-based filesystems is an horrible hack already.

Is planned. What do you think of the proposed a_ops extension to
accomplish this? That is,

->swapfile() - is this address space willing to back swap
->swapout()  - write out a page
->swapin()   - read in a page
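
Something like the following, say (a sketch of the proposed extension, not
existing code; the swapout/swapin prototypes in particular are open for
discussion):

struct address_space_operations {
	/* ... existing methods ... */

	/* is this mapping willing to back swap; switch it into/out of
	   swap mode */
	int (*swapfile)(struct address_space *mapping, int enable);

	/* swap I/O on a page of this mapping */
	int (*swapout)(struct address_space *mapping, struct page *page,
		       struct writeback_control *wbc);
	int (*swapin)(struct address_space *mapping, struct page *page);
};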

 So please get the VM bits for swap over network blockdevices in first,

Trouble with that part is that we don't have any sane network block
devices atm, NBD is utter crap, and iSCSI is too complex to be called
sane.

Maybe Evgeniy's Distributed Storage thingy would work; I'll have a look
at that.

 and then we can look into a complete revamp of the swapfile support
 that cleans up the current mess and adds support for nfs insted of
 making the mess even worse.

Sure, concrete suggestions are always welcome. Just being told something
is utter crap only goes so far.


signature.asc
Description: This is a digitally signed message part


Re: NBD was Re: [PATCH 00/33] Swap over NFS -v14

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 12:18 +0100, Pavel Machek wrote:
 Hi!
 
   So please get the VM bits for swap over network blockdevices in first,
  
  Trouble with that part is that we don't have any sane network block
  devices atm, NBD is utter crap, and iSCSI is too complex to be called
  sane.
 
 Hey, NBD was designed to be _simple_. And I think it works okay in
 that area.. so can you elaborate on utter crap? [Ok, performance is
 not great.]

Yeah, sorry, perhaps I was overly strong.

It doesn't work for me, because:

  - it does connection management in user-space, which makes it
    impossible to reconnect. I'd want a fully kernel-based client.

  - it had some plugging issues, and after talking to Jens about it
    he suggested a rewrite using ->make_request() a la AoE. [ sorry if
    I'm short on details here, it was a long time ago and I forgot,
    maybe Jens remembers ]

 Plus, I'd suggest you to look at ata-over-ethernet. It is in tree
 today, quite simple, but should have better performance than nbd.

Ah, right, I keep forgetting about that one. The only drawback to that
one is that it's raw ethernet, and not an IP protocol.


signature.asc
Description: This is a digitally signed message part


Re: [PATCH 00/33] Swap over NFS -v14

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 14:26 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
  Hi,
 
  Another posting of the full swap over NFS series.
 
 Hi,
 
 Is it really worth all the added complexity of making swap
 over NFS files work, given that you could use a network block
 device instead?

As it stands, we don't have a usable network block device IMHO.
NFS is by far the most used and usable network storage solution out
there, anybody with half a brain knows how to set it up and use it.

 Also, have you ensured that page_file_index, page_file_mapping
 and page_offset are only ever used on anonymous pages when the
 page is locked? (otherwise PageSwapCache could change)

Good point. I hope so; both ->readpage() and ->writepage() take a locked
page, but I'd have to check whether it remains locked throughout the NFS
call chain.

Then again, it might become obsolete with the extended swap a_ops.
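
For reference, the helpers in question are tiny; schematically they do
something like the following (page_swapfile_mapping() is a made-up
stand-in for however the backing file's mapping ends up being resolved):

/*
 * Sketch: these are only safe if PageSwapCache() cannot change under us,
 * hence the requirement that the page be locked (or otherwise pinned).
 */
static inline struct address_space *page_file_mapping(struct page *page)
{
	if (unlikely(PageSwapCache(page)))
		return page_swapfile_mapping(page);	/* made-up helper */

	return page->mapping;
}

static inline pgoff_t page_file_index(struct page *page)
{
	if (unlikely(PageSwapCache(page))) {
		swp_entry_t entry = { .val = page_private(page) };
		return swp_offset(entry);
	}

	return page->index;
}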





Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 21:46 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 21:42, Peter Zijlstra wrote:
  On Wed, 2007-10-31 at 14:37 +1100, Nick Piggin wrote:
   On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation
contexts that are entitled to it.
   
Care is taken to only touch the SLUB slow path.
   
This is done to ensure reserve pages don't leak out and get consumed.
  
   I think this is generally a good idea (to prevent slab allocators
   from stealing reserve). However I naively think the implementation
   is a bit overengineered and thus has a few holes.
  
   Humour me, what was the problem with failing the slab allocation
   (actually, not fail but just call into the page allocator to do
   correct waiting  / reclaim) in the slowpath if the process fails the
   watermark checks?
 
  Ah, we actually need slabs below the watermarks.
 
 Right, I'd still allow those guys to allocate slabs. Provided they
 have the right allocation context, right?
 
 
  Its just that once I 
  allocated those slabs using __GFP_MEMALLOC/PF_MEMALLOC I don't want
  allocation contexts that do not have rights to those pages to walk off
  with objects.
 
 And I'd prevent these ones from doing so.
 
 Without keeping track of reserve pages, which doesn't feel
 too clean.

The problem with that is that once a slab was allocated with the right
allocation context, anybody can get objects from these slabs.


low memory, and empty slab:

task A                              task B

kmem_cache_alloc() => NULL

                                    current->flags |= PF_MEMALLOC
                                    kmem_cache_alloc() => obj
                                      (slab != NULL)

kmem_cache_alloc() => obj
kmem_cache_alloc() => obj
kmem_cache_alloc() => obj


And now task A, who doesn't have the right permissions, walks
away with all our reserve memory.

So we either reserve a page per object, which for 32 byte objects is a
large waste, or we stop anybody who doesn't have the right permissions
from obtaining objects. I took the latter approach.
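
In sketch form (all names below are made up for the example; the real
patch keeps this state in the per-cpu slab structures), the slow path
does something like:

struct cpu_slab_sketch {
	struct page *page;	/* current cpu slab */
	int reserve;		/* allocated via ALLOC_NO_WATERMARKS? */
};

static void *alloc_object_slow_sketch(struct cpu_slab_sketch *c, gfp_t gfpflags)
{
	/*
	 * If this slab only exists courtesy of the reserves, hand out
	 * objects solely to contexts that themselves have access to the
	 * reserves - otherwise task A above drains our emergency memory.
	 */
	if (c->reserve &&
	    !(current->flags & PF_MEMALLOC) && !(gfpflags & __GFP_MEMALLOC))
		return NULL;	/* push the caller back into the page allocator */

	return take_object_from(c->page);	/* made-up helper */
}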





Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 22:25 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 23:17, Peter Zijlstra wrote:
  On Wed, 2007-10-31 at 21:46 +1100, Nick Piggin wrote:
 
   And I'd prevent these ones from doing so.
  
   Without keeping track of reserve pages, which doesn't feel
   too clean.
 
  The problem with that is that once a slab was allocated with the right
  allocation context, anybody can get objects from these slabs.
 
 [snip]
 
 I understand that.
 
 
  So we either reserve a page per object, which for 32 byte objects is a
  large waste, or we stop anybody who doesn't have the right permissions
  from obtaining objects. I took the latter approach.
 
 What I'm saying is that the slab allocator slowpath should always
 just check watermarks against the current task. Instead of this
  ->reserve stuff.

So what you say is to allocate a slab every time we take the slow path,
even when we already have one?

That sounds rather sub-optimal.




Re: [PATCH 00/33] Swap over NFS -v14

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 08:16 -0400, Jeff Garzik wrote:
 Thoughts:
 
 1) I absolutely agree that NFS is far more prominent and useful than any 
 network block device, at the present time.
 
 
 2) Nonetheless, swap over NFS is a pretty rare case.  I view this work 
 as interesting, but I really don't see a huge need, for swapping over 
 NBD or swapping over NFS.  I tend to think swapping to a remote resource 
 starts to approach migration rather than merely swapping.  Yes, we can 
 do it...  but given the lack of burning need one must examine the price.

There is a large corporate demand for this, which is why I'm doing this.

The typical usage scenarios are:
 - cluster/blades, where having local disks is a cost issue (maintenance
   of failures, heat, etc)
 - virtualisation, where dumping the storage on a networked storage unit
   makes for trivial migration and what not..

But please, people who want this (I'm sure some of you are reading) do
speak up. I'm just the motivated corporate drone implementing the
feature :-)

 3) You note
  Swap over network has the problem that the network subsystem does not use 
  fixed
  sized allocations, but heavily relies on kmalloc(). This makes mempools
  unusable.
 
 True, but IMO there are mitigating factors that should be researched and 
 taken into account:
 
 a) To give you some net driver background/history, most mainstream net 
 drivers were coded to allocate RX skbs of size 1538, under the theory 
 that they would all be allocating out of the same underlying slab cache. 
   It would not be difficult to update a great many of the [non-jumbo] 
 cases to create a fixed size allocation pattern.

One issue that comes to mind is how to ensure we'd still overflow the
IP-reassembly buffers. Currently those are managed on the number of
bytes present, not the number of fragments.

One of the goals of my approach was to not rewrite the network subsystem
to accommodate this feature (and I hope I succeeded).

 b) Spare-time experiments and anecdotal evidence points to RX and TX skb 
 recycling as a potentially valuable area of research.  If you are able 
 to do something like that, then memory suddenly becomes a lot more 
 bounded and predictable.
 
 
 So my gut feeling is that taking a hard look at how net drivers function 
 in the field should give you a lot of good ideas that approach the 
 shared goal of making network memory allocations more predictable and 
 bounded.

Note that being bounded only comes from dropping most packets before
tying them to a socket. That is the crucial part of the RX path: receive
all packets from the NIC (regardless of their size) but do not pass them
on to the network stack - unless they belong to a 'special' socket that
promises undelayed processing.
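
Sketched out (the two helpers are made-up names for whatever ends up
testing the skb and socket state), the RX-side rule is simply:

/*
 * Sketch: packets built from reserve memory are only delivered to
 * sockets that are allowed to consume that reserve (i.e. the ones
 * backing the swap file); everything else is dropped before it can
 * pin down emergency memory.
 */
static int skb_reserve_filter_sketch(struct sock *sk, struct sk_buff *skb)
{
	if (skb_from_reserve(skb) && !sk_allows_reserve(sk)) {	/* made up */
		kfree_skb(skb);
		return -ENOMEM;
	}

	return 0;
}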

Thanks for these ideas, I'll look into them.




Re: [PATCH 06/33] mm: allow PF_MEMALLOC from softirq context

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 21:49 +1100, Nick Piggin wrote:
 On Wednesday 31 October 2007 21:42, Peter Zijlstra wrote:
  On Wed, 2007-10-31 at 14:51 +1100, Nick Piggin wrote:
   On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote:
Allow PF_MEMALLOC to be set in softirq context. When running softirqs
from a borrowed context save current->flags, ksoftirqd will have its
own task_struct.
  
   What's this for? Why would ksoftirqd pick up PF_MEMALLOC? (I guess
   that some networking thing must be picking it up in a subsequent patch,
   but I'm too lazy to look!)... Again, can you have more of a rationale in
   your patch headers, or ref the patch that uses it... thanks
 
  Right, I knew I was forgetting something in these changelogs.
 
  The network stack does quite a bit of packet processing from softirq
  context. Once you start swapping over network, some of the packets want
  to be processed under PF_MEMALLOC.
 
 Hmm... what about processing from interrupt context?

From what I could tell that is not done; the ISR just fills the skb and
sticks it on an RX queue to be further processed by the softirq.
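
For the softirq side, the change boils down to something like this sketch
(helper name made up; ksoftirqd, having its own task_struct, can simply
keep the flag set):

static void net_rx_action_memalloc_sketch(void)
{
	unsigned long pflags = current->flags;

	/* packet processing below may need to dip into the reserves */
	current->flags |= PF_MEMALLOC;

	do_softirq_rx_work();	/* made-up stand-in for the real processing */

	/* restore only the PF_MEMALLOC bit of the borrowed context */
	current->flags = (current->flags & ~PF_MEMALLOC) |
			 (pflags & PF_MEMALLOC);
}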




Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 13:54 +0100, Peter Zijlstra wrote:
 On Wed, 2007-10-31 at 22:25 +1100, Nick Piggin wrote:

  What I'm saying is that the slab allocator slowpath should always
  just check watermarks against the current task. Instead of this
   ->reserve stuff.
 
 So what you say is to allocate a slab every time we take the slow path,
 even when we already have one?

BTW, a task that does not have reserve permissions will already attempt
to allocate a new slab - this is done to probe the current watermarks.
If this succeeds the reserve status is lifted.
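
In the same made-up notation as the sketch in my earlier mail, the probe
looks roughly like:

static void *probe_watermarks_sketch(struct kmem_cache *s, gfp_t gfpflags,
				     struct cpu_slab_sketch *c)
{
	if (c->reserve && !(current->flags & PF_MEMALLOC)) {
		/* regular, watermark-obeying slab allocation */
		struct page *page = new_slab_checked(s, gfpflags); /* made up */

		if (!page)
			return NULL;	/* still below the watermarks */

		c->page = page;		/* pressure has eased: switch slabs */
		c->reserve = 0;		/* and lift the reserve state */
	}

	return take_object_from(c->page);	/* made-up helper */
}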




Re: sched: fix new task startup crash

2007-10-31 Thread Peter Zijlstra
Hi,

Commit: b9dca1e0fcb696716840a3bc8f20a6941b484dbf

seems to me that calling enqueue_fair_task() from task_new_fair() is
wrong. The wakeup=1 in enqueue_fair_task() will cause all non-top
sched_entities to be re-positioned by place_entity().

Although the current implementation thereof seems to avoid doing
something horrible.





Re: [PATCH] lockdep: fix mismatched lockdep_depth/curr_chain_hash

2007-10-31 Thread Peter Zijlstra
On Wed, 2007-10-31 at 11:44 -0400, Gregory Haskins wrote:
 Hi Greg,
Here is the backported version of the patch.  I applied it on top of
2.6.22.10.  Let me know if you have any issues.
 
 -Greg

Thanks Gregory!





[PATCH 6/6] sched: place_entity() comments

2007-10-31 Thread Peter Zijlstra
Add a few comments to place_entity().

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
---
 kernel/sched_fair.c |   11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -582,19 +582,26 @@ place_entity(struct cfs_rq *cfs_rq, stru
} else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
vruntime += sched_vslice(cfs_rq)/2;
 
+   /*
+* The 'current' period is already promised to the current tasks,
+* however the extra weight of the new task will slow them down a
+* little, place the new task so that it fits in the slot that
+* stays open at the end.
+*/
if (initial && sched_feat(START_DEBIT))
vruntime += sched_vslice_add(cfs_rq, se);
 
if (!initial) {
+   /* sleeps upto a single latency don't count. */
if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) &&
task_of(se)->policy != SCHED_BATCH)
vruntime -= sysctl_sched_latency;
 
-   vruntime = max_t(s64, vruntime, se->vruntime);
+   /* ensure we never gain time by being placed backwards. */
+   vruntime = max_vruntime(se->vruntime, vruntime);
}
 
se->vruntime = vruntime;
-
 }
 
 static void

--



[PATCH 2/6] sched: make sched_slice() group scheduling savvy

2007-10-31 Thread Peter Zijlstra
Currently the ideal slice length does not take group scheduling into account.
Change it so that it properly takes all the runnable tasks on this cpu into
account and calculates the weight according to the grouping hierarchy.

Also fixes a bug in vslice which missed a factor NICE_0_LOAD.

Signed-off-by: Peter Zijlstra [EMAIL PROTECTED]
CC: Srivatsa Vaddagiri [EMAIL PROTECTED]
---
 kernel/sched_fair.c |   42 +++---
 1 file changed, 31 insertions(+), 11 deletions(-)

Index: linux-2.6/kernel/sched_fair.c
===
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long 
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-   u64 slice = __sched_period(cfs_rq->nr_running);
+   unsigned long nr_running = rq_of(cfs_rq)->nr_running;
+   u64 slice = __sched_period(nr_running);
 
-   slice *= se->load.weight;
-   do_div(slice, cfs_rq->load.weight);
+   for_each_sched_entity(se) {
+   cfs_rq = cfs_rq_of(se);
+
+   slice *= se->load.weight;
+   do_div(slice, cfs_rq->load.weight);
+   }
 
return slice;
 }
@@ -344,24 +349,39 @@ static u64 sched_slice(struct cfs_rq *cf
  *
  * vs = s/w = p/rw
  */
-static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
+static u64 __sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *new)
 {
-   u64 vslice = __sched_period(nr_running);
+   struct sched_entity *se = cfs_rq->curr;
+   unsigned long nr_running = rq_of(cfs_rq)->nr_running;
+   unsigned long weight = 0;
+   u64 vslice;
+
+   if (new) {
+   nr_running++;
+   weight = new->load.weight;
+   }
 
-   do_div(vslice, rq_weight);
+   vslice = __sched_period(nr_running);
+
+   for_each_sched_entity(se) {
+   cfs_rq = cfs_rq_of(se);
+
+   vslice *= NICE_0_LOAD;
+   do_div(vslice, cfs_rq->load.weight + weight);
+   weight = 0;
+   }
 
return vslice;
 }
 
-static u64 sched_vslice(struct cfs_rq *cfs_rq)
+static inline u64 sched_vslice(struct cfs_rq *cfs_rq)
 {
-   return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running);
+   return __sched_vslice(cfs_rq, NULL);
 }
 
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static inline u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *new)
 {
-   return __sched_vslice(cfs_rq->load.weight + se->load.weight,
-   cfs_rq->nr_running + 1);
+   return __sched_vslice(cfs_rq, new);
 }
 
 /*

--


