[PATCH 6/7] sched: rt-group: per group period
Steven asked for per group periods in order to get closer to RMA or EDF scheduling. Use the fancy new hrtimers to provide a per group period Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h|2 kernel/sched.c | 225 +-- kernel/sched_rt.c| 61 ++-- kernel/sysctl.c |2 kernel/time/tick-sched.c |5 - 5 files changed, 232 insertions(+), 63 deletions(-) Index: linux-2.6/include/linux/sched.h === --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -230,8 +230,6 @@ static inline int select_nohz_load_balan } #endif -extern unsigned long rt_needs_cpu(int cpu); - /* * Only dump TASK_* tasks. (0 for all tasks) */ Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -177,6 +177,7 @@ struct task_group { struct rt_rq **rt_rq; unsigned int rt_ratio; + ktime_t rt_period; /* * shares assigned to a task group governs how much of cpu bandwidth @@ -372,6 +373,7 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + struct hrtimer rt_period_timer; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; @@ -441,8 +443,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)-curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq-rt_throttled) - return 0; - - if (rq-clock rq-rt_period_expire) - return 1; - - delta = rq-rt_period_expire - rq-clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in ms. 
+ * period over which we measure -rt task cpu usage in us. * default: 1s */ -const_debug unsigned int sysctl_sched_rt_period = 1000; +const_debug unsigned int sysctl_sched_rt_period = 100; #define SCHED_RT_FRAC_SHIFT16 #define SCHED_RT_FRAC (1UL SCHED_RT_FRAC_SHIFT) @@ -664,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt * ratio of time -rt tasks may consume. * default: 95% */ -const_debug unsigned int sysctl_sched_rt_ratio = 62259; +const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259; /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #endif /* CONFIG_SMP */ +static inline ktime_t ns_to_ktime(u64 ns) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); +} + #include sched_stats.h #include sched_idletask.c #include sched_fair.c @@ -3741,7 +3730,6 @@ void scheduler_tick(void) rq-tick_timestamp = rq-clock; update_cpu_load(rq); curr-sched_class-task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(rq-lock); #ifdef CONFIG_SMP @@ -5287,6 +5275,152 @@ static inline void sched_init_granularit sysctl_sched_batch_wakeup_granularity *= factor; } +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_rq *rt_rq = + container_of(timer, struct rt_rq, rt_period_timer); + struct rq *rq = rq_of_rt_rq(rt_rq); + ktime_t now = ktime_get(); + + WARN_ON(smp_processor_id() != cpu_of(rq)); + WARN_ON(!in_irq()); + + spin_lock(rq-lock); + update_sched_rt_period(rt_rq); + spin_unlock(rq-lock); + + hrtimer_forward(timer, now, sched_rt_period(rt_rq)); + return HRTIMER_RESTART; +} + +static void sched_rt_period_start(struct rt_rq *rt_rq) +{ + ktime_t period = sched_rt_period(rt_rq); + + WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq))); + + for (;;) { + ktime_t now = ktime_get(); + hrtimer_forward(rt_rq-rt_period_timer, 
now, period); + hrtimer_start(rt_rq-rt_period_timer, + rt_rq-rt_period_timer.expires, + HRTIMER_MODE_ABS); + if (hrtimer_active(rt_rq-rt_period_timer)) + break; + } +} + +static void sched_rt_period_stop(struct rt_rq *rt_rq
[PATCH 1/7] sched: rt throttling vs no_hz
We need to teach no_hz about the rt throttling because its tick driven. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h|2 ++ kernel/sched.c | 23 ++- kernel/sched_rt.c| 30 -- kernel/time/tick-sched.c |5 + 4 files changed, 45 insertions(+), 15 deletions(-) Index: linux-2.6/include/linux/sched.h === --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -230,6 +230,8 @@ static inline int select_nohz_load_balan } #endif +extern unsigned long rt_needs_cpu(int cpu); + /* * Only dump TASK_* tasks. (0 for all tasks) */ Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -442,6 +442,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; u64 rt_period_expire; + int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)-curr) +unsigned long rt_needs_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 delta; + + if (!rq-rt_throttled) + return 0; + + if (rq-clock rq-rt_period_expire) + return 1; + + delta = rq-rt_period_expire - rq-clock; + do_div(delta, NSEC_PER_SEC / HZ); + + return (unsigned long)delta; +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_ /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array-bitmap); +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + rt_rq-highest_prio = MAX_RT_PRIO; +#endif #ifdef CONFIG_SMP rt_rq-rt_nr_migratory = 0; - rt_rq-highest_prio = MAX_RT_PRIO; rt_rq-overloaded = 0; #endif @@ -7186,6 +7206,7 @@ void __init sched_init(void) list_add(init_task_group.list, task_groups); #endif rq-rt_period_expire = 0; + rq-rt_throttled = 0; for (j = 0; j CPU_LOAD_IDX_MAX; j++) rq-cpu_load[j] = 0; Index: linux-2.6/kernel/sched_rt.c === --- linux-2.6.orig/kernel/sched_rt.c +++ 
linux-2.6/kernel/sched_rt.c @@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; if (rt_rq-rt_time ratio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + + rq-rt_throttled = 1; rt_rq-rt_throttled = 1; + sched_rt_ratio_dequeue(rt_rq); return 1; } @@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc return 0; } -static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period) -{ - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; - - rt_rq-rt_time -= min(rt_rq-rt_time, ratio); - if (rt_rq-rt_throttled) { - rt_rq-rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); - } -} - static void update_sched_rt_period(struct rq *rq) { struct rt_rq *rt_rq; @@ -204,8 +196,18 @@ static void update_sched_rt_period(struc period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; rq-rt_period_expire += period; - for_each_leaf_rt_rq(rt_rq, rq) - __update_sched_rt_period(rt_rq, period); + for_each_leaf_rt_rq(rt_rq, rq) { + unsigned long rt_ratio = sched_rt_ratio(rt_rq); + u64 ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; + + rt_rq-rt_time -= min(rt_rq-rt_time, ratio); + if (rt_rq-rt_throttled) { + rt_rq-rt_throttled = 0; + sched_rt_ratio_enqueue(rt_rq); + } + } + + rq-rt_throttled = 0; } } Index: linux-2.6/kernel/time/tick-sched.c === --- linux-2.6.orig/kernel/time/tick-sched.c +++ linux-2.6/kernel/time/tick-sched.c @@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now, delta; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void) next_jiffies
Re: [PATCH 6/7] sched: rt-group: per group period
Could you please fold this into the 6/7 patch. It reverts a wandering chunk (the 32768 thing), but more importantly it fixes !FAIR_GROUP_SCHED compilation. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sched.c | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -647,7 +647,7 @@ const_debug unsigned int sysctl_sched_rt * ratio of time -rt tasks may consume. * default: 95% */ -const_debug unsigned int sysctl_sched_rt_ratio = 32768; //62259; +const_debug unsigned int sysctl_sched_rt_ratio = 62259; /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -5379,6 +5379,7 @@ static void __init sched_rt_period_init( hotcpu_notifier(sched_rt_period_hotplug, 0); } +#ifdef CONFIG_FAIR_GROUP_SCHED static void __sched_rt_period_init_tg(void *arg) { struct task_group *tg = arg; @@ -5404,12 +5405,14 @@ static void sched_rt_period_destroy_tg(s { on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1); } -#else +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#else /* CONFIG_SMP */ static void __init sched_rt_period_init(void) { sched_rt_period_start_cpu(0); } +#ifdef CONFIG_FAIR_GROUP_SCHED static void sched_rt_period_init_tg(struct task_group *tg) { sched_rt_period_start(tg-rt_rq[0]); @@ -5419,7 +5422,8 @@ static void sched_rt_period_destroy_tg(s { sched_rt_period_stop(tg-rt_rq[0]); } -#endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_SMP */ #ifdef CONFIG_SMP /* -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.24-rc6: possible recursive locking detected
On Sat, 2008-01-05 at 18:12 +1100, Herbert Xu wrote: On Fri, Jan 04, 2008 at 09:30:49AM +0100, Ingo Molnar wrote: [ 1310.670986] = [ 1310.671690] [ INFO: possible recursive locking detected ] [ 1310.672097] 2.6.24-rc6 #1 [ 1310.672421] - [ 1310.672828] FahCore_a0.exe/3692 is trying to acquire lock: [ 1310.673238] (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50 [ 1310.673869] [ 1310.673870] but task is already holding lock: [ 1310.674567] (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50 [ 1310.675267] [ 1310.675268] other info that might help us debug this: [ 1310.675952] 5 locks held by FahCore_a0.exe/3692: [ 1310.676334] #0: (rcu_read_lock){..--}, at: [c038b620] net_rx_action+0x60/0x1b0 [ 1310.677251] #1: (rcu_read_lock){..--}, at: [c0388d60] netif_receive_skb+0x100/0x470 [ 1310.677924] #2: (rcu_read_lock){..--}, at: [c03a7fb2] ip_local_deliver_finish+0x32/0x210 [ 1310.678460] #3: (clock-AF_INET){-.-?}, at: [c038164e] sock_def_readable+0x1e/0x80 [ 1310.679250] #4: (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50 The net part might just be a red herring, since the problem is that __wake_up is somehow reentering itself. /* * Perform a safe wake up of the poll wait list. The problem is that * with the new callback'd wake up system, it is possible that the * poll callback is reentered from inside the call to wake_up() done * on the poll wait queue head. The rule is that we cannot reenter the * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, * and we cannot reenter the same wait queue head at all. This will * enable to have a hierarchy of epoll file descriptor of no more than * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock * because this one gets called by the poll callback, that in turn is called * from inside a wake_up(), that might be called from irq context. */ Seems to suggest that the epoll code can indeed recurse into wakeup. Davide, Johannes, any ideas? 
-- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.24-rc6: possible recursive locking detected
On Sat, 2008-01-05 at 17:53 +0100, Peter Zijlstra wrote: On Sat, 2008-01-05 at 18:12 +1100, Herbert Xu wrote: On Fri, Jan 04, 2008 at 09:30:49AM +0100, Ingo Molnar wrote: [ 1310.670986] = [ 1310.671690] [ INFO: possible recursive locking detected ] [ 1310.672097] 2.6.24-rc6 #1 [ 1310.672421] - [ 1310.672828] FahCore_a0.exe/3692 is trying to acquire lock: [ 1310.673238] (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50 [ 1310.673869] [ 1310.673870] but task is already holding lock: [ 1310.674567] (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50 [ 1310.675267] [ 1310.675268] other info that might help us debug this: [ 1310.675952] 5 locks held by FahCore_a0.exe/3692: [ 1310.676334] #0: (rcu_read_lock){..--}, at: [c038b620] net_rx_action+0x60/0x1b0 [ 1310.677251] #1: (rcu_read_lock){..--}, at: [c0388d60] netif_receive_skb+0x100/0x470 [ 1310.677924] #2: (rcu_read_lock){..--}, at: [c03a7fb2] ip_local_deliver_finish+0x32/0x210 [ 1310.678460] #3: (clock-AF_INET){-.-?}, at: [c038164e] sock_def_readable+0x1e/0x80 [ 1310.679250] #4: (q-lock){++..}, at: [c011544b] __wake_up+0x1b/0x50 The net part might just be a red herring, since the problem is that __wake_up is somehow reentering itself. /* * Perform a safe wake up of the poll wait list. The problem is that * with the new callback'd wake up system, it is possible that the * poll callback is reentered from inside the call to wake_up() done * on the poll wait queue head. The rule is that we cannot reenter the * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, * and we cannot reenter the same wait queue head at all. This will * enable to have a hierarchy of epoll file descriptor of no more than * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock * because this one gets called by the poll callback, that in turn is called * from inside a wake_up(), that might be called from irq context. */ Seems to suggest that the epoll code can indeed recurse into wakeup. Davide, Johannes, any ideas? 
Since EP_MAX_POLLWAKE_NESTS MAX_LOCKDEP_SUBCLASSES we could perhaps do something like: wake_up_nested(..., wake_nests); although I'm not quite sure that is correct, my understanding of this code is still fragile at best. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block2mtd lockdep_init_map warning
On Sun, 2008-01-06 at 14:13 +0100, Jörn Engel wrote: Ingo, Peter, does either of you actually care about this problem? In the last round when I debugged this problem there was a notable lack of reaction from either of you. Yeah I do, I just know very little about the module stuff and haven't gotten around to looking into it. I agree that Erez's patch is quite horrible. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/11] sched: rt-group: interface
Change the rt_ratio interface to rt_runtime_us, to match rt_period_us. This avoids picking a granularity for the ratio. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h |8 +++ kernel/sched.c| 116 ++ kernel/sched_rt.c | 42 +++--- kernel/sysctl.c |4 - 4 files changed, 106 insertions(+), 64 deletions(-) Index: linux-2.6/include/linux/sched.h === --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -1518,7 +1518,7 @@ extern unsigned int sysctl_sched_feature extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_rt_period; -extern unsigned int sysctl_sched_rt_ratio; +extern unsigned int sysctl_sched_rt_runtime; #if defined(CONFIG_FAIR_GROUP_SCHED) defined(CONFIG_SMP) extern unsigned int sysctl_sched_min_bal_int_shares; extern unsigned int sysctl_sched_max_bal_int_shares; @@ -2014,6 +2014,12 @@ extern void sched_destroy_group(struct t extern void sched_move_task(struct task_struct *tsk); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern unsigned long sched_group_shares(struct task_group *tg); +extern int sched_group_set_rt_runtime(struct task_group *tg, + unsigned long rt_runtime_us); +extern unsigned long sched_group_rt_runtime(struct task_group *tg); +extern int sched_group_set_rt_period(struct task_group *tg, +unsigned long rt_runtime_us); +extern unsigned long sched_group_rt_period(struct task_group *tg); #endif Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -176,7 +176,7 @@ struct task_group { struct sched_rt_entity **rt_se; struct rt_rq **rt_rq; - unsigned int rt_ratio; + u64 rt_runtime; ktime_t rt_period; /* @@ -646,19 +646,16 @@ const_debug unsigned int sysctl_sched_fe const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in us. + * period over which we measure rt task cpu usage in us. 
* default: 1s */ const_debug unsigned int sysctl_sched_rt_period = 100; -#define SCHED_RT_FRAC_SHIFT16 -#define SCHED_RT_FRAC (1UL SCHED_RT_FRAC_SHIFT) - /* - * ratio of time -rt tasks may consume. - * default: 95% + * part of the period that we allow rt tasks to run in us. + * default: 0.95s */ -const_debug unsigned int sysctl_sched_rt_ratio = 62259; +const_debug unsigned int sysctl_sched_rt_runtime = 95; /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu @@ -7209,7 +7206,8 @@ void __init sched_init(void) per_cpu(init_sched_entity, i), i, 1); rq-rt.rt_rq_type = RT_RQ_EDF; - init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ + init_task_group.rt_runtime = + sysctl_sched_rt_runtime * NSEC_PER_USEC; init_task_group.rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC); INIT_LIST_HEAD(rq-leaf_rt_rq_list); @@ -7606,7 +7604,7 @@ struct task_group *sched_create_group(vo goto err; tg-shares = NICE_0_LOAD; - tg-rt_ratio = 0; /* XXX */ + tg-rt_runtime = 0; /* XXX */ tg-rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC); for_each_possible_cpu(i) { @@ -7801,41 +7799,87 @@ unsigned long sched_group_shares(struct } /* - * Ensure the total rt_ratio = sysctl_sched_rt_ratio + * Ensure that the real time constraints are schedulable. 
*/ -int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + u64 r = runtime * (1ULL 16); + do_div(r, period); + return r; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { struct task_group *tgi; unsigned long total = 0; + unsigned long global_ratio = + to_ratio(sysctl_sched_rt_period, sysctl_sched_rt_runtime); rcu_read_lock(); - list_for_each_entry_rcu(tgi, task_groups, list) - total += tgi-rt_ratio; + list_for_each_entry_rcu(tgi, task_groups, list) { + if (tgi == tg) + continue; + + total += to_ratio(ktime_to_ns(tgi-rt_period), tgi-rt_runtime); + } rcu_read_unlock(); - if (total + rt_ratio - tg-rt_ratio sysctl_sched_rt_ratio) - return -EINVAL; + return total + to_ratio(period, runtime) global_ratio; +} - tg-rt_ratio = rt_ratio
[PATCH 02/11] sched: load_balance_monitor rename
don't start the load_balance_monitor when there is only a single cpu. rename the kthread because its currently longer than TASK_COMM_LEN Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sched.c |5 - 1 file changed, 4 insertions(+), 1 deletion(-) Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -7070,8 +7070,11 @@ void __init sched_init_smp(void) sched_init_granularity(); #ifdef CONFIG_FAIR_GROUP_SCHED + if (nr_cpu_ids == 1) + return; + lb_monitor_task = kthread_create(load_balance_monitor, NULL, -load_balance_monitor); +group_balance); if (!IS_ERR(lb_monitor_task)) { lb_monitor_task-flags |= PF_NOFREEZE; wake_up_process(lb_monitor_task); -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 10/11] sched: rt-group: EDF
Use a simple Ealiest Deadline First implementation to schedule the realtime groups. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h |1 kernel/sched.c| 13 + kernel/sched_rt.c | 115 +++--- 3 files changed, 124 insertions(+), 5 deletions(-) Index: linux-2.6/include/linux/sched.h === --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -942,6 +942,7 @@ struct sched_rt_entity { int nr_cpus_allowed; #ifdef CONFIG_FAIR_GROUP_SCHED + struct rb_node run_node; struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq*rt_rq; Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -360,6 +360,11 @@ struct cfs_rq { #endif }; +enum rt_rq_type { + RT_RQ_PRIO, + RT_RQ_EDF, +}; + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -376,6 +381,10 @@ struct rt_rq { struct hrtimer rt_period_timer; #ifdef CONFIG_FAIR_GROUP_SCHED + enum rt_rq_type rt_rq_type; + struct rb_root deadlines; + struct rb_node *rb_leftmost; + unsigned long rt_nr_boosted; struct rq *rq; @@ -7127,6 +7136,9 @@ static void init_rt_rq(struct rt_rq *rt_ rt_rq-rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; #ifdef CONFIG_FAIR_GROUP_SCHED + rt_rq-rt_rq_type = RT_RQ_PRIO; + rt_rq-deadlines = RB_ROOT; + rt_rq-rb_leftmost = NULL; rt_rq-rt_nr_boosted = 0; rt_rq-rq = rq; #endif @@ -7196,6 +7208,7 @@ void __init sched_init(void) per_cpu(init_cfs_rq, i), per_cpu(init_sched_entity, i), i, 1); + rq-rt.rt_rq_type = RT_RQ_EDF; init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ init_task_group.rt_period = ns_to_ktime(sysctl_sched_rt_period * NSEC_PER_USEC); Index: linux-2.6/kernel/sched_rt.c === --- linux-2.6.orig/kernel/sched_rt.c +++ linux-2.6/kernel/sched_rt.c @@ -138,6 +138,84 @@ static int rt_se_boosted(struct sched_rt return p-prio != p-normal_prio; } +static inline u64 rt_deadline(struct sched_rt_entity *rt_se) +{ + struct 
rt_rq *group_rq = group_rt_rq(rt_se); + + BUG_ON(!group_rq); + return ktime_to_ns(group_rq-rt_period_timer.expires); +} + +static void enqueue_rt_deadline(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rb_node **link; + struct rb_node *parent; + struct sched_rt_entity *entry; + u64 deadline; + int leftmost = 1; + + if (rt_rq-rt_rq_type != RT_RQ_EDF) + return; + + link = rt_rq-deadlines.rb_node; + parent = NULL; + deadline = rt_deadline(rt_se); + + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_rt_entity, run_node); + + if (deadline rt_deadline(entry)) { + link = parent-rb_left; + } else { + link = parent-rb_right; + leftmost = 0; + } + } + + if (leftmost) + rt_rq-rb_leftmost = rt_se-run_node; + + rb_link_node(rt_se-run_node, parent, link); + rb_insert_color(rt_se-run_node, rt_rq-deadlines); +} + +static void dequeue_rt_deadline(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + + if (rt_rq-rt_rq_type != RT_RQ_EDF) + return; + + if (rt_rq-rb_leftmost == rt_se-run_node) + rt_rq-rb_leftmost = rb_next(rt_se-run_node); + + rb_erase(rt_se-run_node, rt_rq-deadlines); +} + +static void requeue_rt_deadline(struct rt_rq *rt_rq) +{ + struct sched_rt_entity *rt_se = rt_rq-rt_se; + + BUG_ON(!rt_se); + if (on_rt_rq(rt_se)) { + dequeue_rt_deadline(rt_se); + enqueue_rt_deadline(rt_se); + } +} + +static struct sched_rt_entity *next_rt_deadline(struct rt_rq *rt_rq) +{ + if (rt_rq-rt_rq_type != RT_RQ_EDF) + return NULL; + + if (!rt_rq-rb_leftmost) + return NULL; + + return rb_entry(rt_rq-rb_leftmost, struct sched_rt_entity, run_node); +} + #else static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) @@ -191,6 +269,23 @@ static inline int rt_rq_throttled(struct { return rt_rq-rt_throttled; } + +static inline void enqueue_rt_deadline(struct sched_rt_entity *rt_se) +{ +} + +static inline void dequeue_rt_deadline(struct sched_rt_entity *rt_se
[PATCH 08/11] sched: rt-group: deal with PI
Steven mentioned the fun case where a lock holding task will be throttled. Simple fix: allow groups that have boosted tasks to run anyway. This is ofcourse not quite correct. Needs more tricks. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sched.c|3 +++ kernel/sched_rt.c | 48 2 files changed, 43 insertions(+), 8 deletions(-) Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -376,6 +376,8 @@ struct rt_rq { struct hrtimer rt_period_timer; #ifdef CONFIG_FAIR_GROUP_SCHED + unsigned long rt_nr_boosted; + struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; @@ -7279,6 +7281,7 @@ static void init_rt_rq(struct rt_rq *rt_ rt_rq-rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; #ifdef CONFIG_FAIR_GROUP_SCHED + rt_rq-rt_nr_boosted = 0; rt_rq-rq = rq; #endif } Index: linux-2.6/kernel/sched_rt.c === --- linux-2.6.orig/kernel/sched_rt.c +++ linux-2.6/kernel/sched_rt.c @@ -121,6 +121,23 @@ static void sched_rt_ratio_dequeue(struc dequeue_rt_entity(rt_se); } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq-rt_throttled !rt_rq-rt_nr_boosted; +} + +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq-rt_nr_boosted; + + p = rt_task_of(rt_se); + return p-prio != p-normal_prio; +} + #else static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) @@ -170,6 +187,10 @@ static inline void sched_rt_ratio_dequeu { } +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq-rt_throttled; +} #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -190,21 +211,22 @@ static int sched_rt_ratio_exceeded(struc u64 period, ratio; if (rt_ratio == SCHED_RT_FRAC) - return 0; + goto out; if (rt_rq-rt_throttled) - return 1; + goto out; period = sched_rt_period_ns(rt_rq); ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; if (rt_rq-rt_time ratio) { 
rt_rq-rt_throttled = 1; - sched_rt_ratio_dequeue(rt_rq); - return 1; + if (rt_rq_throttled(rt_rq)) + sched_rt_ratio_dequeue(rt_rq); } - return 0; +out: + return rt_rq_throttled(rt_rq); } static void update_sched_rt_period(struct rt_rq *rt_rq) @@ -265,6 +287,10 @@ void inc_rt_tasks(struct sched_rt_entity update_rt_migration(rq_of_rt_rq(rt_rq)); #endif +#ifdef CONFIG_FAIR_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq-rt_nr_boosted++; +#endif } static inline @@ -295,6 +321,12 @@ void dec_rt_tasks(struct sched_rt_entity update_rt_migration(rq_of_rt_rq(rt_rq)); #endif /* CONFIG_SMP */ +#ifdef CONFIG_FAIR_GROUP_SCHED + if (rt_se_boosted(rt_se)) + rt_rq-rt_nr_boosted--; + + WARN_ON(!rt_rq-rt_nr_running rt_rq-rt_nr_boosted); +#endif } static void enqueue_rt_entity(struct sched_rt_entity *rt_se) @@ -303,7 +335,7 @@ static void enqueue_rt_entity(struct sch struct rt_prio_array *array = rt_rq-active; struct rt_rq *group_rq = group_rt_rq(rt_se); - if (group_rq group_rq-rt_throttled) + if (group_rq rt_rq_throttled(group_rq)) return; list_add_tail(rt_se-run_list, array-queue + rt_se_prio(rt_se)); @@ -476,7 +508,7 @@ static struct sched_rt_entity *pick_next struct list_head *queue; int idx; - if (sched_rt_ratio_exceeded(rt_rq)) + if (rt_rq_throttled(rt_rq)) goto out; idx = sched_find_first_bit(array-bitmap); @@ -500,7 +532,7 @@ static struct task_struct *pick_next_tas if (unlikely(!rt_rq-rt_nr_running)) return NULL; - if (sched_rt_ratio_exceeded(rt_rq)) + if (rt_rq_throttled(rt_rq)) return NULL; do { -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 01/11] sched: rt throttling vs no_hz
We need to teach no_hz about the rt throttling because its tick driven. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h|2 ++ kernel/sched.c | 23 ++- kernel/sched_rt.c| 30 -- kernel/time/tick-sched.c |5 + 4 files changed, 45 insertions(+), 15 deletions(-) Index: linux-2.6/include/linux/sched.h === --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -230,6 +230,8 @@ static inline int select_nohz_load_balan } #endif +extern unsigned long rt_needs_cpu(int cpu); + /* * Only dump TASK_* tasks. (0 for all tasks) */ Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -442,6 +442,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; u64 rt_period_expire; + int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -594,6 +595,23 @@ static void update_rq_clock(struct rq *r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)-curr) +unsigned long rt_needs_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 delta; + + if (!rq-rt_throttled) + return 0; + + if (rq-clock rq-rt_period_expire) + return 1; + + delta = rq-rt_period_expire - rq-clock; + do_div(delta, NSEC_PER_SEC / HZ); + + return (unsigned long)delta; +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -7099,9 +7117,11 @@ static void init_rt_rq(struct rt_rq *rt_ /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array-bitmap); +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + rt_rq-highest_prio = MAX_RT_PRIO; +#endif #ifdef CONFIG_SMP rt_rq-rt_nr_migratory = 0; - rt_rq-highest_prio = MAX_RT_PRIO; rt_rq-overloaded = 0; #endif @@ -7186,6 +7206,7 @@ void __init sched_init(void) list_add(init_task_group.list, task_groups); #endif rq-rt_period_expire = 0; + rq-rt_throttled = 0; for (j = 0; j CPU_LOAD_IDX_MAX; j++) rq-cpu_load[j] = 0; Index: linux-2.6/kernel/sched_rt.c === --- linux-2.6.orig/kernel/sched_rt.c +++ 
linux-2.6/kernel/sched_rt.c @@ -175,7 +175,11 @@ static int sched_rt_ratio_exceeded(struc ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; if (rt_rq-rt_time ratio) { + struct rq *rq = rq_of_rt_rq(rt_rq); + + rq-rt_throttled = 1; rt_rq-rt_throttled = 1; + sched_rt_ratio_dequeue(rt_rq); return 1; } @@ -183,18 +187,6 @@ static int sched_rt_ratio_exceeded(struc return 0; } -static void __update_sched_rt_period(struct rt_rq *rt_rq, u64 period) -{ - unsigned long rt_ratio = sched_rt_ratio(rt_rq); - u64 ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; - - rt_rq-rt_time -= min(rt_rq-rt_time, ratio); - if (rt_rq-rt_throttled) { - rt_rq-rt_throttled = 0; - sched_rt_ratio_enqueue(rt_rq); - } -} - static void update_sched_rt_period(struct rq *rq) { struct rt_rq *rt_rq; @@ -204,8 +196,18 @@ static void update_sched_rt_period(struc period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; rq-rt_period_expire += period; - for_each_leaf_rt_rq(rt_rq, rq) - __update_sched_rt_period(rt_rq, period); + for_each_leaf_rt_rq(rt_rq, rq) { + unsigned long rt_ratio = sched_rt_ratio(rt_rq); + u64 ratio = (period * rt_ratio) SCHED_RT_FRAC_SHIFT; + + rt_rq-rt_time -= min(rt_rq-rt_time, ratio); + if (rt_rq-rt_throttled) { + rt_rq-rt_throttled = 0; + sched_rt_ratio_enqueue(rt_rq); + } + } + + rq-rt_throttled = 0; } } Index: linux-2.6/kernel/time/tick-sched.c === --- linux-2.6.orig/kernel/time/tick-sched.c +++ linux-2.6/kernel/time/tick-sched.c @@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void) void tick_nohz_stop_sched_tick(void) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + unsigned long rt_jiffies; struct tick_sched *ts; ktime_t last_update, expires, now, delta; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; @@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void) next_jiffies
[PATCH 00/11] another rt group sched update
this time compile tested on all 16 combinations of: CONFIG_SMP CONFIG_FAIR_GROUP_SCHED CONFIG_HIGH_RES_TIMERS CONFIG_NO_HZ ran some but not all combinations -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 03/11] hrtimer: clean up cpu-base locking tricks
In order to more easily allow for the scheduler to use timers, clean up the locking a bit. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/hrtimer.c | 109 +++ kernel/time/tick-sched.c |8 --- 2 files changed, 102 insertions(+), 15 deletions(-) Index: linux-2.6/kernel/hrtimer.c === --- linux-2.6.orig/kernel/hrtimer.c +++ linux-2.6/kernel/hrtimer.c @@ -1063,7 +1063,9 @@ void hrtimer_interrupt(struct clock_even basenow = ktime_add(now, base-offset); while ((node = base-first)) { + enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; + int restart; timer = rb_entry(node, struct hrtimer, node); @@ -1091,13 +1093,29 @@ void hrtimer_interrupt(struct clock_even HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); + fn = timer-function; + if (timer-cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + /* +* Used for scheduler timers, avoid lock +* inversion with rq-lock and tasklist_lock. +* +* These timers are required to deal with +* enqueue expiry themselves and are not +* allowed to migrate. +*/ + spin_unlock(cpu_base-lock); + restart = fn(timer); + spin_lock(cpu_base-lock); + } else + restart = fn(timer); + /* * Note: We clear the CALLBACK bit after * enqueue_hrtimer to avoid reprogramming of * the event hardware. This happens at the end * of this function anyway. 
*/ - if (timer-function(timer) != HRTIMER_NORESTART) { + if (restart != HRTIMER_NORESTART) { BUG_ON(timer-state != HRTIMER_STATE_CALLBACK); enqueue_hrtimer(timer, base, 0); } Index: linux-2.6/kernel/time/tick-sched.c === --- linux-2.6.orig/kernel/time/tick-sched.c +++ linux-2.6/kernel/time/tick-sched.c @@ -514,7 +514,6 @@ static enum hrtimer_restart tick_sched_t { struct tick_sched *ts = container_of(timer, struct tick_sched, sched_timer); - struct hrtimer_cpu_base *base = timer-base-cpu_base; struct pt_regs *regs = get_irq_regs(); ktime_t now = ktime_get(); int cpu = smp_processor_id(); @@ -552,15 +551,8 @@ static enum hrtimer_restart tick_sched_t touch_softlockup_watchdog(); ts-idle_jiffies++; } - /* -* update_process_times() might take tasklist_lock, hence -* drop the base lock. sched-tick hrtimers are per-CPU and -* never accessible by userspace APIs, so this is safe to do. -*/ - spin_unlock(base-lock); update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); - spin_lock(base-lock); } /* Do not restart, when we are in the idle loop */ -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/11] hrtimer: unlock hrtimer_wakeup
hrtimer_wakeup creates a base-lock rq-lock lock dependency. Avoid this by switching to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ which doesn't hold base-lock. This fully untangles hrtimer locks from the scheduler locks, and allows hrtimer usage in the scheduler proper. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/hrtimer.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linux-2.6/kernel/hrtimer.c === --- linux-2.6.orig/kernel/hrtimer.c +++ linux-2.6/kernel/hrtimer.c @@ -1296,7 +1296,7 @@ void hrtimer_init_sleeper(struct hrtimer sl-timer.function = hrtimer_wakeup; sl-task = task; #ifdef CONFIG_HIGH_RES_TIMERS - sl-timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; + sl-timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; #endif } @@ -1307,6 +1307,8 @@ static int __sched do_nanosleep(struct h do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start(t-timer, t-timer.expires, mode); + if (!hrtimer_active(t-timer)) + t-task = NULL; + if (likely(t-task)) schedule(); -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 09/11] sched: rt-group: dynamic period ticks
Disable the period updates for inactive groups. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sched.c| 158 -- kernel/sched_rt.c | 54 ++ 2 files changed, 53 insertions(+), 159 deletions(-) Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -5277,158 +5277,6 @@ static inline void sched_init_granularit sysctl_sched_batch_wakeup_granularity *= factor; } -static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) -{ - struct rt_rq *rt_rq = - container_of(timer, struct rt_rq, rt_period_timer); - struct rq *rq = rq_of_rt_rq(rt_rq); - ktime_t now = ktime_get(); - - WARN_ON(smp_processor_id() != cpu_of(rq)); - WARN_ON(!in_irq()); - - spin_lock(rq-lock); - update_sched_rt_period(rt_rq); - spin_unlock(rq-lock); - - hrtimer_forward(timer, now, sched_rt_period(rt_rq)); - return HRTIMER_RESTART; -} - -static void sched_rt_period_start(struct rt_rq *rt_rq) -{ - ktime_t period = sched_rt_period(rt_rq); - - WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq))); - - for (;;) { - ktime_t now = ktime_get(); - hrtimer_forward(rt_rq-rt_period_timer, now, period); - hrtimer_start(rt_rq-rt_period_timer, - rt_rq-rt_period_timer.expires, - HRTIMER_MODE_ABS); - if (hrtimer_active(rt_rq-rt_period_timer)) - break; - } -} - -#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED -static void sched_rt_period_stop(struct rt_rq *rt_rq) -{ - hrtimer_cancel(rt_rq-rt_period_timer); -} -#endif - -static void sched_rt_period_start_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct rt_rq *rt_rq; - - for_each_leaf_rt_rq(rt_rq, rq) - sched_rt_period_start(rt_rq); -} - -#ifdef CONFIG_SMP -static void sched_rt_period_stop_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - struct rt_rq *rt_rq; - - for_each_leaf_rt_rq(rt_rq, rq) - sched_rt_period_stop(rt_rq); -} - -static int sched_rt_period_hotplug(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - int cpu = (unsigned long)hcpu; - - switch (action) { - 
case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - sched_rt_period_start_cpu(cpu); - return NOTIFY_OK; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - sched_rt_period_stop_cpu(cpu); - return NOTIFY_OK; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - return NOTIFY_OK; - - default: - return NOTIFY_DONE; - } - - return NOTIFY_OK; -} - -static void __init __sched_rt_period_init(void *arg) -{ - int cpu = smp_processor_id(); - sched_rt_period_start_cpu(cpu); -} - -static void __init sched_rt_period_init(void) -{ - on_each_cpu(__sched_rt_period_init, NULL, 0, 1); - hotcpu_notifier(sched_rt_period_hotplug, 0); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void __sched_rt_period_init_tg(void *arg) -{ - struct task_group *tg = arg; - int cpu = smp_processor_id(); - - sched_rt_period_start(tg-rt_rq[cpu]); -} - -static void sched_rt_period_init_tg(struct task_group *tg) -{ - on_each_cpu(__sched_rt_period_init_tg, tg, 0, 1); -} - -static void __sched_rt_period_destroy_tg(void *arg) -{ - struct task_group *tg = arg; - int cpu = smp_processor_id(); - - sched_rt_period_stop(tg-rt_rq[cpu]); -} - -static void sched_rt_period_destroy_tg(struct task_group *tg) -{ - on_each_cpu(__sched_rt_period_destroy_tg, tg, 0, 1); -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#else /* CONFIG_SMP */ -static void __init sched_rt_period_init(void) -{ - sched_rt_period_start_cpu(0); -} - -#ifdef CONFIG_FAIR_GROUP_SCHED -static void sched_rt_period_init_tg(struct task_group *tg) -{ - sched_rt_period_start(tg-rt_rq[0]); -} - -static void sched_rt_period_destroy_tg(struct task_group *tg) -{ - sched_rt_period_stop(tg-rt_rq[0]); -} -#endif /* CONFIG_FAIR_GROUP_SCHED */ -#endif /* CONFIG_SMP */ - #ifdef CONFIG_SMP /* * This is how migration works: @@ -7210,7 +7058,6 @@ void __init sched_init_smp(void) if (set_cpus_allowed(current, non_isolated_cpus) 
0) BUG(); sched_init_granularity(); - sched_rt_period_init(); #ifdef CONFIG_FAIR_GROUP_SCHED if (nr_cpu_ids == 1) @@ -7231,7 +7078,6 @@ void __init sched_init_smp(void) void __init
[PATCH 04/11] hrtimer: fixup the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ fallback
Currently all highres=off timers are run from softirq context, but HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers expect to run from irq context. Fix this up by splitting it similar to the highres=on case. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/hrtimer.h |5 - kernel/hrtimer.c| 232 +--- kernel/timer.c |3 3 files changed, 125 insertions(+), 115 deletions(-) Index: linux-2.6/kernel/hrtimer.c === --- linux-2.6.orig/kernel/hrtimer.c +++ linux-2.6/kernel/hrtimer.c @@ -622,6 +622,11 @@ static inline int hrtimer_cb_pending(str static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } +static inline int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + return 0; +} #endif /* CONFIG_HIGH_RES_TIMERS */ @@ -1030,6 +1035,85 @@ int hrtimer_get_res(const clockid_t whic } EXPORT_SYMBOL_GPL(hrtimer_get_res); +static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) +{ + spin_lock_irq(cpu_base-lock); + + while (!list_empty(cpu_base-cb_pending)) { + enum hrtimer_restart (*fn)(struct hrtimer *); + struct hrtimer *timer; + int restart; + + timer = list_entry(cpu_base-cb_pending.next, + struct hrtimer, cb_entry); + + timer_stats_account_hrtimer(timer); + + fn = timer-function; + __remove_hrtimer(timer, timer-base, HRTIMER_STATE_CALLBACK, 0); + spin_unlock_irq(cpu_base-lock); + + restart = fn(timer); + + spin_lock_irq(cpu_base-lock); + + timer-state = ~HRTIMER_STATE_CALLBACK; + if (restart == HRTIMER_RESTART) { + BUG_ON(hrtimer_active(timer)); + /* +* Enqueue the timer, allow reprogramming of the event +* device +*/ + enqueue_hrtimer(timer, timer-base, 1); + } else if (hrtimer_active(timer)) { + /* +* If the timer was rearmed on another CPU, reprogram +* the event device. 
+*/ + if (timer-base-first == timer-node) + hrtimer_reprogram(timer, timer-base); + } + } + spin_unlock_irq(cpu_base-lock); +} + +static void __run_hrtimer(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base = timer-base; + struct hrtimer_cpu_base *cpu_base = base-cpu_base; + enum hrtimer_restart (*fn)(struct hrtimer *); + int restart; + + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + + fn = timer-function; + if (timer-cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { + /* +* Used for scheduler timers, avoid lock inversion with +* rq-lock and tasklist_lock. +* +* These timers are required to deal with enqueue expiry +* themselves and are not allowed to migrate. +*/ + spin_unlock(cpu_base-lock); + restart = fn(timer); + spin_lock(cpu_base-lock); + } else + restart = fn(timer); + + /* +* Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid +* reprogramming of the event hardware. This happens at the end of this +* function anyway. +*/ + if (restart != HRTIMER_NORESTART) { + BUG_ON(timer-state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base, 0); + } + timer-state = ~HRTIMER_STATE_CALLBACK; +} + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1063,9 +1147,7 @@ void hrtimer_interrupt(struct clock_even basenow = ktime_add(now, base-offset); while ((node = base-first)) { - enum hrtimer_restart (*fn)(struct hrtimer *); struct hrtimer *timer; - int restart; timer = rb_entry(node, struct hrtimer, node); @@ -1089,37 +1171,7 @@ void hrtimer_interrupt(struct clock_even continue; } - __remove_hrtimer(timer, base, -HRTIMER_STATE_CALLBACK, 0); - timer_stats_account_hrtimer(timer); - - fn = timer-function; - if (timer-cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
[PATCH 06/11] sched: rt-group: reduce rescheduling
Only reschedule if the new group has a higher prio task. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sched_rt.c |5 - 1 file changed, 4 insertions(+), 1 deletion(-) Index: linux-2.6/kernel/sched_rt.c === --- linux-2.6.orig/kernel/sched_rt.c +++ linux-2.6/kernel/sched_rt.c @@ -94,8 +94,11 @@ static void sched_rt_ratio_enqueue(struc struct sched_rt_entity *rt_se = rt_rq-rt_se; if (rt_se !on_rt_rq(rt_se) rt_rq-rt_nr_running) { + struct task_struct *curr = rq_of_rt_rq(rt_rq)-curr; + enqueue_rt_entity(rt_se); - resched_task(rq_of_rt_rq(rt_rq)-curr); + if (rt_rq-highest_prio curr-prio) + resched_task(curr); } } -- -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 07/11] sched: rt-group: per group period
Steven asked for per group periods in order to get closer to RMA or EDF scheduling. Use the fancy new hrtimers to provide a per group period Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h|2 kernel/sched.c | 229 ++- kernel/sched_rt.c| 61 ++-- kernel/sysctl.c |2 kernel/time/tick-sched.c |5 - 5 files changed, 237 insertions(+), 62 deletions(-) Index: linux-2.6/kernel/sched.c === --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -177,6 +177,7 @@ struct task_group { struct rt_rq **rt_rq; unsigned int rt_ratio; + ktime_t rt_period; /* * shares assigned to a task group governs how much of cpu bandwidth @@ -372,6 +373,7 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + struct hrtimer rt_period_timer; #ifdef CONFIG_FAIR_GROUP_SCHED struct rq *rq; @@ -441,8 +443,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; - u64 rt_period_expire; - int rt_throttled; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -595,23 +595,6 @@ static void update_rq_clock(struct rq *r #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)-curr) -unsigned long rt_needs_cpu(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - u64 delta; - - if (!rq-rt_throttled) - return 0; - - if (rq-clock rq-rt_period_expire) - return 1; - - delta = rq-rt_period_expire - rq-clock; - do_div(delta, NSEC_PER_SEC / HZ); - - return (unsigned long)delta; -} - /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -652,10 +635,10 @@ const_debug unsigned int sysctl_sched_fe const_debug unsigned int sysctl_sched_nr_migrate = 32; /* - * period over which we measure -rt task cpu usage in ms. + * period over which we measure -rt task cpu usage in us. 
* default: 1s */ -const_debug unsigned int sysctl_sched_rt_period = 1000; +const_debug unsigned int sysctl_sched_rt_period = 100; #define SCHED_RT_FRAC_SHIFT16 #define SCHED_RT_FRAC (1UL SCHED_RT_FRAC_SHIFT) @@ -1245,6 +1228,12 @@ static unsigned long cpu_avg_load_per_ta static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #endif /* CONFIG_SMP */ +static inline ktime_t ns_to_ktime(u64 ns) +{ + static const ktime_t ktime_zero = { .tv64 = 0 }; + return ktime_add_ns(ktime_zero, ns); +} + #include sched_stats.h #include sched_idletask.c #include sched_fair.c @@ -3741,7 +3730,6 @@ void scheduler_tick(void) rq-tick_timestamp = rq-clock; update_cpu_load(rq); curr-sched_class-task_tick(rq, curr, 0); - update_sched_rt_period(rq); spin_unlock(rq-lock); #ifdef CONFIG_SMP @@ -5287,6 +5275,158 @@ static inline void sched_init_granularit sysctl_sched_batch_wakeup_granularity *= factor; } +static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) +{ + struct rt_rq *rt_rq = + container_of(timer, struct rt_rq, rt_period_timer); + struct rq *rq = rq_of_rt_rq(rt_rq); + ktime_t now = ktime_get(); + + WARN_ON(smp_processor_id() != cpu_of(rq)); + WARN_ON(!in_irq()); + + spin_lock(rq-lock); + update_sched_rt_period(rt_rq); + spin_unlock(rq-lock); + + hrtimer_forward(timer, now, sched_rt_period(rt_rq)); + return HRTIMER_RESTART; +} + +static void sched_rt_period_start(struct rt_rq *rt_rq) +{ + ktime_t period = sched_rt_period(rt_rq); + + WARN_ON(smp_processor_id() != cpu_of(rq_of_rt_rq(rt_rq))); + + for (;;) { + ktime_t now = ktime_get(); + hrtimer_forward(rt_rq-rt_period_timer, now, period); + hrtimer_start(rt_rq-rt_period_timer, + rt_rq-rt_period_timer.expires, + HRTIMER_MODE_ABS); + if (hrtimer_active(rt_rq-rt_period_timer)) + break; + } +} + +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED +static void sched_rt_period_stop(struct rt_rq *rt_rq) +{ + hrtimer_cancel(rt_rq-rt_period_timer); +} +#endif + +static void 
sched_rt_period_start_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct rt_rq *rt_rq; + + for_each_leaf_rt_rq(rt_rq, rq) + sched_rt_period_start(rt_rq); +} + +#ifdef CONFIG_SMP +static void sched_rt_period_stop_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct rt_rq *rt_rq; + + for_each_leaf_rt_rq(rt_rq, rq) + sched_rt_period_stop(rt_rq); +} + +static int sched_rt_period_hotplug(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu
Re: [stable] [PATCH] lockdep: fix mismatched lockdep_depth/curr_chain_hash
On Mon, 2007-10-08 at 10:39 -0700, Greg KH wrote: On Mon, Oct 08, 2007 at 07:36:10PM +0200, Peter Zijlstra wrote: On Mon, 2007-10-08 at 10:24 -0700, Greg KH wrote: On Fri, Oct 05, 2007 at 11:31:26AM +0200, Peter Zijlstra wrote: Stable team, please consider this patch for the next 22-stable. I don't see this patch in Linus's upstream tree. We need it there to be able to accept it for -stable. Or is this just a bugfix of other things that are already in his tree? I sent Linus a similar patch, haven't seen him pick it up yet. I'll notify you when and if he picks it up. Great, that would be great for us -stable monkeys... 3aa416b07f0adf01c090baab26fb70c35ec17623 - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: aim7 -30% regression in 2.6.24-rc1
On Fri, 2007-10-26 at 17:43 +0800, Zhang, Yanmin wrote: I tested 2.6.24-rc1 on my x86_64 machine which has 2 quad-core processors. Comparing with 2.6.23, aim7 has about -30% regression. I did a bisect and found patch http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=b5869ce7f68b233ceb81465a7644be0d9a5f3dbb caused the issue. Bit weird that you point to a merge commit, and not an actual patch. Are you sure git bisect pointed at this one? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Wed, 2007-10-03 at 15:35 +0200, Kay Sievers wrote: On Wed, 2007-10-03 at 12:37 +0200, Peter Zijlstra wrote: On Wed, 2007-10-03 at 12:15 +0200, Kay Sievers wrote: On Tue, 2007-10-02 at 22:05 +1000, Nick Piggin wrote: On Tuesday 02 October 2007 21:40, Peter Zijlstra wrote: On Tue, 2007-10-02 at 13:21 +0200, Kay Sievers wrote: How about adding this information to the tree then, instead of creating a new top-level hack, just because something that you think you need doesn't exist. So you suggest adding all the various network filesystems in there (where?), and adding the concept of a BDI, and ensuring all are properly linked together - somehow. Feel free to do so. Would something fit better under /sys/fs/? At least filesystems are already an existing concept to userspace. Sounds at least less messy than an new top-level directory. But again, if it's device releated, like the name suggests, it should be reachable from the device tree. Which userspace tool is supposed to set these values, and at what time? An init-script, something at device discovery/setup? If that is is ever going to be used in a hotplug setup, you really don't want to go look for directories with magic device names in another disconnected tree. Filesystems don't really map to BDIs either. One can have multiple FSs per BDI. 'Normally' a BDI relates to a block device, but networked (and other non-block device) filesystems have to create a BDI too. So these need to be represented some place as well. The typical usage would indeed be init scripts. The typical example would be setting the read-ahead window. Currently that cannot be done for NFS mounts. What kind of context for a non-block based fs will get the bdi controls added? Is there a generic place, or does every non-block based filesystem needs to be adapted individually to use it? 
--- Subject: bdi: debugfs interface Expose the BDI stats (and readahead window) in /debug/bdi/ I'm still thinking it should go into /sys somewhere, however I just noticed not all block devices that have a queue have a /queue directory. Noticeably those that use make_request_fn() as opposed to request_fn(). And then of course there are the non-block/non-queue BDIs. A BDI is basically the object that represents the 'thing' you dirty pages against. For block devices that is related to the block device (and is typically embedded in the queue object), for NFS mounts its the remote server object of the client. For FUSE, yet again something else. I appreciate the sysfs people their opinion that /sys/bdi/ might not be the best from their POV, however I'm not seeing where to hook the BDI object from so that it all makes sense, a few of the things are currently not exposed in sysfs at all, like the NFS and FUSE things. So, for now, I've exposed the thing in debugfs. Please suggest a better alternative. Miklos, Trond: could you suggest a better fmt for the bdi_init_fmt() for your respective filesystems? 
Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] CC: Miklos Szeredi [EMAIL PROTECTED] CC: Trond Myklebust [EMAIL PROTECTED] --- block/genhd.c |2 block/ll_rw_blk.c |1 drivers/block/loop.c|7 ++ drivers/md/dm.c |2 drivers/md/md.c |2 fs/fuse/inode.c |2 fs/nfs/client.c |2 include/linux/backing-dev.h | 15 include/linux/debugfs.h | 11 +++ include/linux/writeback.h |3 mm/backing-dev.c| 153 mm/page-writeback.c |2 12 files changed, 199 insertions(+), 3 deletions(-) Index: linux-2.6-2/fs/fuse/inode.c === --- linux-2.6-2.orig/fs/fuse/inode.c +++ linux-2.6-2/fs/fuse/inode.c @@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void) atomic_set(fc-num_waiting, 0); fc-bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc-bdi.unplug_io_fn = default_unplug_io_fn; - err = bdi_init(fc-bdi); + err = bdi_init_fmt(fc-bdi, fuse-%p, fc); if (err) { kfree(fc); fc = NULL; Index: linux-2.6-2/fs/nfs/client.c === --- linux-2.6-2.orig/fs/nfs/client.c +++ linux-2.6-2/fs/nfs/client.c @@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s goto out_error; nfs_server_set_fsinfo(server, fsinfo); - error = bdi_init(server-backing_dev_info); + error = bdi_init_fmt(server-backing_dev_info, nfs-%s-%p, clp-cl_hostname, server); if (error) goto out_error; Index: linux-2.6-2/include/linux/backing-dev.h
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Fri, 2007-10-26 at 17:33 +0200, Kay Sievers wrote: On Fri, 2007-10-26 at 17:22 +0200, Peter Zijlstra wrote: On Fri, 2007-10-26 at 17:10 +0200, Kay Sievers wrote: On Fri, 2007-10-26 at 16:48 +0200, Peter Zijlstra wrote: I appreciate the sysfs people their opinion that /sys/bdi/ might not be the best from their POV, however I'm not seeing where to hook the BDI object from so that it all makes sense, a few of the things are currently not exposed in sysfs at all, like the NFS and FUSE things. What happended to the idea to create a bdi class, and have the existing devices as parents, and for stuff that is not (not now, or never) in sysfs, no parent is set. Must have forgotten about that, mainly because I'm not sure I fully understand it. So we create a class, Yes. create these objects, Yes, struct device objects, assigned to the bdi class. (Don't use class_device, that will be removed soon.) which are all called bdi Probably not. You can name it how you want, you can inherit the name of the parent, or prefix it with whatever fits, they just need to be unique. Things like the fuse-%llu name would work just fine. I guess you already solved that problem in the debugfs directory. and have children with these attributes in it. The attributes would just be files in the device object. Now, I supposed there is a directory that lists all unparented thingies, how do I locate the one that matches my nfs mount? You look for the name (prefix), try: ls /sys/class/sound/, it's the same model all over the place. Ok, will try that. Is there a 'simple uncluttered' example I could look at to copy from? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.24-rc1: First impressions
On Fri, 2007-10-26 at 17:22 +0200, Ingo Molnar wrote: * Martin Knoblauch [EMAIL PROTECTED] wrote: Hi , just to give some feedback on 2.6.24-rc1. For some time I am tracking IO/writeback problems that hurt system responsiveness big-time. I tested Peters stuff together with Fenguangs additions and it looked promising. Therefore I was very happy to see Peters stuff going into 2.6.24 and waited eagerly for rc1. In short, I am impressed. This really looks good. IO throughput is great and I could not reproduce the responsiveness problems so far. Below are a some numbers of my brute-force I/O tests that I can use to bring responsiveness down. My platform is a HP/DL380g4, dual CPUs, HT-enabled, 8 GB Memory, SmartaArray6i controller with 4x72GB SCSI disks as RAID5 (battery protected writeback cahe enabled) and gigabit networking (tg3). User space is 64-bit RHEL4.3 I am basically doing copies using dd with 1MB blocksize. Local Filesystem ist ext2 (noatime). IO-Scheduler is dealine, as it tends to give best results. NFS3 Server is a Sun/T2000/Solaris10. The tests are: dd1 - copy 16 GB from /dev/zero to local FS dd1-dir - same, but using O_DIRECT for output dd2/dd2-dir - copy 2x7.6 GB in parallel from /dev/zero to local FS dd3/dd3-dir - copy 3x5.2 GB in parallel from /dev/zero lo local FS net1 - copy 5.2 GB from NFS3 share to local FS mix3 - copy 3x5.2 GB from /dev/zero to local disk and two NFS3 shares I did the numbers for 2.6.19.2, 2.6.22.6 and 2.6.24-rc1. All units are MB/sec. test 2.6.19.2 2.6.22.62.6.24.-rc1 dd1 28 50 96 dd1-dir 88 88 86 dd2 2x16.5 2x11 2x44.5 dd2-dir2x44 2x44 2x43 dd3 3x9.83x8.7 3x30 dd3-dir 3x29.5 3x29.5 3x28.5 net1 30-3350-55 37-52 mix3 17/3225/50 96/35 (disk/combined-network) wow, really nice results! 
Peter does know how to make stuff fast :) Now lets pick up some of Peter's other, previously discarded patches as well :-) Such as the rewritten reclaim (clockpro) patches: http://programming.kicks-ass.net/kernel-patches/page-replace/ I think riel is taking over that stuff with his split vm and policies per type. The improve-swap-performance (swap-token) patches: http://programming.kicks-ass.net/kernel-patches/swap_token/ Ashwin's version did get upstreamed. His enable-swap-over-NFS [and other complex IO transports] patches: http://programming.kicks-ass.net/kernel-patches/vm_deadlock/ Will post that one again, soonish Esp. after Linus professed liking to have swap over NFS. I've been working on improving the changelogs and comments in that code. latest code (somewhat raw, as rushed by ingo posting this) in: http://programming.kicks-ass.net/kernel-patches/vm_deadlock/v2.6.23-mm1/ And the concurrent pagecache patches: http://programming.kicks-ass.net/kernel-patches/concurrent-pagecache/ as a starter :-) I think the MM should get out of deep-feature-freeze mode - there's tons of room to improve :-/ Yeah, that one would be cool, but it depends on Nick getting his lockless pagecache upstream. For those who don't know, both are in -rt (and have been for some time) so it's not unproven code. signature.asc Description: This is a digitally signed message part
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Fri, 2007-10-26 at 17:10 +0200, Kay Sievers wrote: On Fri, 2007-10-26 at 16:48 +0200, Peter Zijlstra wrote: I appreciate the sysfs people their opinion that /sys/bdi/ might not be the best from their POV, however I'm not seeing where to hook the BDI object from so that it all makes sense, a few of the things are currently not exposed in sysfs at all, like the NFS and FUSE things. What happened to the idea to create a bdi class, and have the existing devices as parents, and for stuff that is not (not now, or never) in sysfs, no parent is set. Must have forgotten about that, mainly because I'm not sure I fully understand it. So we create a class, create these objects, which are all called bdi and have children with these attributes in it. Now, I suppose there is a directory that lists all unparented thingies, how do I locate the one that matches my nfs mount? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
This crashes and burns on bootup, but I'm too tired to figure out what I did wrong... will give it another try tomorrow.. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- block/genhd.c |2 fs/fuse/inode.c |2 fs/nfs/client.c |2 include/linux/backing-dev.h | 33 include/linux/writeback.h |3 + mm/backing-dev.c| 121 mm/page-writeback.c |2 7 files changed, 162 insertions(+), 3 deletions(-) Index: linux-2.6-2/fs/fuse/inode.c === --- linux-2.6-2.orig/fs/fuse/inode.c +++ linux-2.6-2/fs/fuse/inode.c @@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void) atomic_set(fc-num_waiting, 0); fc-bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc-bdi.unplug_io_fn = default_unplug_io_fn; - err = bdi_init(fc-bdi); + err = bdi_init_fmt(fc-bdi, fuse-%llu, (unsigned long long)fc-id); if (err) { kfree(fc); fc = NULL; Index: linux-2.6-2/fs/nfs/client.c === --- linux-2.6-2.orig/fs/nfs/client.c +++ linux-2.6-2/fs/nfs/client.c @@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s goto out_error; nfs_server_set_fsinfo(server, fsinfo); - error = bdi_init(server-backing_dev_info); + error = bdi_init_fmt(server-backing_dev_info, nfs-%s-%p, clp-cl_hostname, server); if (error) goto out_error; Index: linux-2.6-2/include/linux/backing-dev.h === --- linux-2.6-2.orig/include/linux/backing-dev.h +++ linux-2.6-2/include/linux/backing-dev.h @@ -11,6 +11,8 @@ #include linux/percpu_counter.h #include linux/log2.h #include linux/proportions.h +#include linux/kernel.h +#include linux/device.h #include asm/atomic.h struct page; @@ -48,11 +50,42 @@ struct backing_dev_info { struct prop_local_percpu completions; int dirty_exceeded; + +#ifdef CONFIG_SYSFS + struct device kdev; +#endif }; int bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); +int __bdi_register(struct backing_dev_info *bdi); +void bdi_unregister(struct backing_dev_info *bdi); + +#ifdef CONFIG_SYSFS +#define bdi_init_fmt(bdi, fmt...) 
\ + ({ \ + int ret;\ + kobject_set_name((bdi)-kdev.kobj, ##fmt); \ + ret = bdi_init(bdi);\ + if (!ret) { \ + ret = __bdi_register(bdi); \ + if (ret)\ + bdi_destroy(bdi); \ + } \ + ret;\ + }) + +#define bdi_register(bdi, fmt...) \ + ({ \ + kobject_set_name((bdi)-kdev.kobj, ##fmt); \ + __bdi_register(bdi);\ + }) +#else +#define bdi_init_fmt(bdi, fmt...) bdi_init(bdi) +#define bdi_register(bdi, fmt...) __bdi_register(bdi) +#endif + static inline void __add_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, s64 amount) { Index: linux-2.6-2/include/linux/writeback.h === --- linux-2.6-2.orig/include/linux/writeback.h +++ linux-2.6-2/include/linux/writeback.h @@ -113,6 +113,9 @@ struct file; int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, +struct backing_dev_info *bdi); + void page_writeback_init(void); void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied); Index: linux-2.6-2/mm/backing-dev.c === --- linux-2.6-2.orig/mm/backing-dev.c +++ linux-2.6-2/mm/backing-dev.c @@ -4,12 +4,130 @@ #include linux/fs.h #include linux/sched.h #include linux/module.h +#include linux/writeback.h +#include linux/device.h + +#ifdef CONFIG_SYSFS + +static void bdi_release(struct device *dev) +{ +} + +static int bdi_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + return 0; +} + +static struct class bdi_class = { + .name
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Fri, 2007-10-26 at 22:04 +0200, Peter Zijlstra wrote: This crashes and burns on bootup, but I'm too tired to figure out what I did wrong... will give it another try tomorrow.. Ok, can't sleep.. took a look. I have several problems here. The thing that makes it go *boom* is the __ATTR_NULL. Removing that makes it boot. Albeit it then warns me of multiple duplicate sysfs objects, all named bdi. For some obscure reason this device interface insists on using the bus_id as name (?!), and further reduces usability by limiting that to 20 odd characters. This makes it quite useless. I tried fudging around that limit by using device_rename and kobject_rename, but to no avail. Really, it should not be this hard to use, trying to expose a handful of simple integers to userspace should not take 8h+ and still not work. Peter, who thinks sysfs is a contorted mess beyond his skill. I'll stick to VM and scheduler code, that actually makes sense. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Fri, 2007-10-26 at 19:40 -0700, Greg KH wrote: On Sat, Oct 27, 2007 at 03:18:08AM +0200, Peter Zijlstra wrote: On Fri, 2007-10-26 at 22:04 +0200, Peter Zijlstra wrote: This crashes and burns on bootup, but I'm too tired to figure out what I did wrong... will give it another try tomorrow.. Ok, can't sleep.. took a look. I have several problems here. The thing that makes it go *boom* is the __ATTR_NULL. Removing that makes it boot. Albeit it then warns me of multiple duplicate sysfs objects, all named bdi. For some obscure reason this device interface insists on using the bus_id as name (?!), and further reduces usability by limiting that to 20 odd characters. This makes it quite useless. I tried fudging around that limit by using device_rename and kobject_rename, but to no avail. Really, it should not be this hard to use, trying to expose a handfull of simple integers to userspace should not take 8h+ and still not work. Peter, who thinks sysfs is contorted mess beyond his skill. I'll stick to VM and scheduler code, that actually makes sense. Heh, that's funny :) I'll look at this and see what I can come up with. Would you just like a whole new patch, or one against this one? Sorry for the grumpy note, I get that way at 3.30 am. Maybe I ought not have mailed :-/ This is the code I had at that time. 
Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- block/genhd.c |2 fs/fuse/inode.c |2 fs/nfs/client.c |2 include/linux/backing-dev.h | 21 ++ include/linux/string.h |4 + include/linux/writeback.h |3 mm/backing-dev.c| 144 mm/page-writeback.c |2 mm/util.c | 42 9 files changed, 219 insertions(+), 3 deletions(-) Index: linux-2.6-2/fs/fuse/inode.c === --- linux-2.6-2.orig/fs/fuse/inode.c +++ linux-2.6-2/fs/fuse/inode.c @@ -467,7 +467,7 @@ static struct fuse_conn *new_conn(void) atomic_set(fc-num_waiting, 0); fc-bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc-bdi.unplug_io_fn = default_unplug_io_fn; - err = bdi_init(fc-bdi); + err = bdi_init_fmt(fc-bdi, bdi-fuse-%llu, (unsigned long long)fc-id); if (err) { kfree(fc); fc = NULL; Index: linux-2.6-2/fs/nfs/client.c === --- linux-2.6-2.orig/fs/nfs/client.c +++ linux-2.6-2/fs/nfs/client.c @@ -678,7 +678,7 @@ static int nfs_probe_fsinfo(struct nfs_s goto out_error; nfs_server_set_fsinfo(server, fsinfo); - error = bdi_init(server-backing_dev_info); + error = bdi_init_fmt(server-backing_dev_info, bdi-nfs-%s-%p, clp-cl_hostname, server); if (error) goto out_error; Index: linux-2.6-2/include/linux/backing-dev.h === --- linux-2.6-2.orig/include/linux/backing-dev.h +++ linux-2.6-2/include/linux/backing-dev.h @@ -11,6 +11,8 @@ #include linux/percpu_counter.h #include linux/log2.h #include linux/proportions.h +#include linux/kernel.h +#include linux/device.h #include asm/atomic.h struct page; @@ -48,11 +50,30 @@ struct backing_dev_info { struct prop_local_percpu completions; int dirty_exceeded; + +#ifdef CONFIG_SYSFS + struct device kdev; +#endif }; int bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); +void bdi_unregister(struct backing_dev_info *bdi); + +#define bdi_init_fmt(bdi, fmt...) 
\ + ({ \ + int ret;\ + ret = bdi_init(bdi);\ + if (!ret) { \ + ret = bdi_register(bdi, ##fmt); \ + if (ret)\ + bdi_destroy(bdi); \ + } \ + ret;\ +}) + static inline void __add_bdi_stat(struct backing_dev_info *bdi, enum bdi_stat_item item, s64 amount) { Index: linux-2.6-2/include/linux/writeback.h === --- linux-2.6-2.orig/include/linux/writeback.h +++ linux-2.6-2/include/linux/writeback.h @@ -113,6 +113,9 @@ struct file; int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file
Networked filesystems vs backing_dev_info
Hi, I had me a little look at bdi usage in networked filesystems. NFS, CIFS, (smbfs), AFS, CODA and NCP And of those, NFS is the only one that I could find that creates backing_dev_info structures. The rest seems to fall back to default_backing_dev_info. With my recent per bdi dirty limit patches the bdi has become more important than it has been in the past. While falling back to the default_backing_dev_info isn't wrong per-se, it isn't right either. Could I implore the various maintainers to look into this issue for their respective filesystem. I'll try and come up with some patches to address this, but feel free to beat me to it. peterz - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: BUG: lock held when returning to user space
On Sat, 2007-10-27 at 17:12 +0200, Jiri Kosina wrote: On Sat, 27 Oct 2007, Gabriel C wrote: I found that today in dmesg after booting current git ( ec3b67c11df42362ccda81261d62829042f223f0 ) : ... [ 592.752777] [ 592.752781] [ 592.753478] [ BUG: lock held when returning to user space! ] [ 592.753880] [ 592.754262] hwclock/1452 is leaving the kernel with locks still held! [ 592.754655] 1 lock held by hwclock/1452: [ 592.755007] #0: (rtc-char_lock){--..}, at: [c02a7ebb] rtc_dev_open+0x2e/0x7e Yes, this is because rtc keeps a char_lock mutex locked as long as the device is open, to avoid concurrent accessess. It could be easily substituted by some counting -- setting and clearing bit in struct rtc_device instead of using char_lock, but doing this just to shut the lockdep off is questionable imho. Peter, what is the preferred way to annotate these kinds of locking for lockdep to express that it is intended? Not sure, I'd not thought that anyone would actually want to do this. I'm also not sure how I stand on this, I'd prefer to say: don't do this! I think, in this case, the lock is associated with a kernel object that is properly cleaned up if the holding tasks gets a SIGKILL. But in general I'd like to see this kind of thing go away. Now I could probably come up with an annotation to hide it, but what do other people think, Ingo, Linus, Andrew, do we want to keep kernel locks held over userspace? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Networked filesystems vs backing_dev_info
On Sat, 2007-10-27 at 11:22 -0400, Jan Harkes wrote: On Sat, Oct 27, 2007 at 11:34:26AM +0200, Peter Zijlstra wrote: I had me a little look at bdi usage in networked filesystems. NFS, CIFS, (smbfs), AFS, CODA and NCP And of those, NFS is the only one that I could find that creates backing_dev_info structures. The rest seems to fall back to default_backing_dev_info. While a file is opened in Coda we associate the open file handle with a local cache file. All read and write operations are redirected to this local file and we even redirect inode-i_mapping. Actual reads and writes are completely handled by the underlying file system. We send the new file contents back to the servers only after all local references have been released (last-close semantics). As a result, there is no need for backing_dev_info structures in Coda, if any congestion control is needed it will be handled by the underlying file system where our locally cached copies are stored. Ok, that works. Thanks for this explanation! - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Sat, 2007-10-27 at 09:02 -0700, Greg KH wrote: Ah, I see a few problems. Here, try this version instead. It's compile-tested only, and should be a lot simpler. Note, we still are not setting the parent to the new bdi structure properly, so the devices will show up in /sys/devices/virtual/ instead of in their proper location. To do this, we need the parent of the device, which I'm not so sure what it should be (block device? block device controller?) The problem is that not every bdi has a sysfs represented parent, hence the class suggestion. For block devices it is indeed the block device itself, but for example the NFS client's server descriptor does not have a sysfs representation. Let me know if this works better, I'm off to a kids birthday party for the day, but will be around this evening... Hehe, do enjoy! Thanks. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: BUG: lock held when returning to user space
On Sat, 2007-10-27 at 08:47 -0700, Arjan van de Ven wrote: On Sat, 27 Oct 2007 17:12:41 +0200 (CEST) Jiri Kosina [EMAIL PROTECTED] wrote: On Sat, 27 Oct 2007, Gabriel C wrote: I found that today in dmesg after booting current git ( ec3b67c11df42362ccda81261d62829042f223f0 ) : ... [ 592.752777] [ 592.752781] [ 592.753478] [ BUG: lock held when returning to user space! ] [ 592.753880] [ 592.754262] hwclock/1452 is leaving the kernel with locks still held! [ 592.754655] 1 lock held by hwclock/1452: [ 592.755007] #0: (rtc-char_lock){--..}, at: [c02a7ebb] rtc_dev_open+0x2e/0x7e Yes, this is because rtc keeps a char_lock mutex locked as long as the device is open, to avoid concurrent accessess. It could be easily substituted by some counting -- setting and clearing bit in struct rtc_device instead of using char_lock, but doing this just to shut the lockdep off is questionable imho. it's not about lockdep; what this code doing is not valid use of a mutex: A mutex is required to have a clear process as owner, and in this case it doesn't have that... at all. This is a violation of the kernel mutex semantics.. and should be fixed. Right, the fd could be transferred using unix sockets or fork(). That would indeed seriously break a mutex. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/2] irq_flags_t: intro and core annotations
On Sun, 2007-10-28 at 00:14 +0400, Alexey Dobriyan wrote: On Sat, Oct 27, 2007 at 09:20:43PM +0200, Roman Zippel wrote: On Sun, 21 Oct 2007, Alexey Dobriyan wrote: So far remedies were: a) grep(1) -- obviously fragile. I tried at some point grepping for spin_lock_irqsave(), found quite a few, but it became boring quickly. b) BUILD_BUG_ON(sizeof(flags) != sizeof(unsigned long)) -- was tried, brutally broke some arches, survived one commit before revert :^) Doesn't work on i386 where sizeof(unsigned int) == sizeof(unsigned long). So it would be nice to have something more robust. If it's just about the type checking, something like below should pretty much do the same. It won't catch the following if both variables are unsigned long: spin_lock_irqsave(lock, flags); [stuff] spin_unlock_irqrestore(lock, foo->flags); It won't catch static unsigned long flags;. With sparse, we can eventually mark type as on-stack only. +static __always_inline void __irq_flags_check(unsigned long *flags) +{ + BUILD_BUG_ON(!__builtin_stack_addr(flags)); +} + obviously gcc doesn't (yet) support that __builtin function, but you could make it work for sparse and define a dummy for gcc. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Networked filesystems vs backing_dev_info
On Sat, 2007-10-27 at 16:02 -0500, Steve French wrote: On 10/27/07, Peter Zijlstra [EMAIL PROTECTED] wrote: Hi, I had me a little look at bdi usage in networked filesystems. NFS, CIFS, (smbfs), AFS, CODA and NCP And of those, NFS is the only one that I could find that creates backing_dev_info structures. The rest seems to fall back to default_backing_dev_info. With my recent per bdi dirty limit patches the bdi has become more important than it has been in the past. While falling back to the default_backing_dev_info isn't wrong per-se, it isn't right either. Could I implore the various maintainers to look into this issue for their respective filesystem. I'll try and come up with some patches to address this, but feel free to beat me to it. I would like to understand more about your patches to see what bdi values makes sense for CIFS and how to report possible congestion back to the page manager. So, what my recent patches do is carve up the total writeback cache size, or dirty page limit as we call it, proportionally to a BDIs writeout speed. So a fast device gets more than a slow device, but will not starve it. However, for this to work, each device, or remote backing store in the case of networked filesystems, need to have a BDI. I had been thinking about setting bdi-ra_pages so that we do more sensible readahead and writebehind - better matching what is possible over the network and what the server prefers. Well, you'd first have to create backing_dev_info instances before setting that value :-) SMB/CIFS Servers typically allow a maximum of 50 requests in parallel at one time from one client (although this is adjustable for some). That seems like a perfect point to set congestion. So in short, stick a struct backing_dev_info into whatever represents a client, initialize it using bdi_init(), destroy using bdi_destroy(). Mark it congested once you have 50 (or more) outstanding requests, clear congestion when you drop below 50. and you should be set. 
signature.asc Description: This is a digitally signed message part
Re: per BDI dirty limit (was Re: -mm merge plans for 2.6.24)
On Sat, 2007-10-27 at 23:08 +0200, Kay Sievers wrote: On Sat, 2007-10-27 at 09:02 -0700, Greg KH wrote: Ah, I see a few problems. Here, try this version instead. It's compile-tested only, and should be a lot simpler. Note, we still are not setting the parent to the new bdi structure properly, so the devices will show up in /sys/devices/virtual/ instead of in their proper location. To do this, we need the parent of the device, which I'm not so sure what it should be (block device? block device controller?) Assigning a parent device will only work with the upcoming conversion of the raw kobjects in the block subsystem to struct device. A few comments to the patch: --- a/include/linux/string.h +++ b/include/linux/string.h @@ -8,6 +8,7 @@ #include linux/compiler.h/* for inline */ #include linux/types.h /* for size_t */ #include linux/stddef.h /* for NULL */ +#include stdarg.h #ifdef __cplusplus extern C { @@ -111,6 +112,9 @@ extern void *kmemdup(const void *src, si extern char **argv_split(gfp_t gfp, const char *str, int *argcp); extern void argv_free(char **argv); +char *kvprintf(const char *fmt, va_list args); +char *kprintf(const char *fmt, ...); Why is that here? I don't think we need this when we use the existing: kvasprintf(GFP_KERNEL, fmt, args) Ignorance of the existance of said function. Thanks for pointing it out. (kobject_set_name ought to use it too I guess) --- a/mm/backing-dev.c +++ b/mm/backing-dev.c + +static struct device_attribute bdi_dev_attrs[] = { + __ATTR(readahead, 0644, readahead_show, readahead_store), + __ATTR_RO(reclaimable), + __ATTR_RO(writeback), + __ATTR_RO(dirty), + __ATTR_RO(bdi_dirty), +}; Default attributes will need the NULL termination back (see below). +static __init int bdi_class_init(void) +{ + bdi_class = class_create(THIS_MODULE, bdi); + return 0; +} + +__initcall(bdi_class_init); + +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) 
This function should accept a: struct device *parent and all callers just pass NULL until the block layer conversion gets merged. Yeah, you're right, but I wanted to just get something working before bothering with the parent thing. +{ + char *name; + va_list args; + int ret = -ENOMEM; + int i; + + va_start(args, fmt); + name = kvprintf(fmt, args); kvasprintf(GFP_KERNEL, fmt, args); + va_end(args); + + if (!name) + return -ENOMEM; + + bdi-dev = device_create(bdi_class, NULL, MKDEV(0,0), name); The parent should be passed here. + for (i = 0; i ARRAY_SIZE(bdi_dev_attrs); i++) { + ret = device_create_file(bdi-dev, bdi_dev_attrs[i]); + if (ret) + break; + } + if (ret) { + while (--i = 0) + device_remove_file(bdi-dev, bdi_dev_attrs[i]); + device_unregister(bdi-dev); + bdi-dev = NULL; + } All this open-coded attribute stuff should go away and be replaced by: bdi_class-dev_attrs = bdi_dev_attrs; Otherwise at event time the attributes are not created and stuff hooking into the events will not be able to set values. Also, the core will do proper add/remove and error handling then. ok, that's good to know. someone ought to write a book on how to use all this... really, even the functions are bare of documentation or comments. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Networked filesystems vs backing_dev_info
On Sat, 2007-10-27 at 23:30 +0200, Peter Zijlstra wrote: On Sat, 2007-10-27 at 16:02 -0500, Steve French wrote: On 10/27/07, Peter Zijlstra [EMAIL PROTECTED] wrote: Hi, I had me a little look at bdi usage in networked filesystems. NFS, CIFS, (smbfs), AFS, CODA and NCP And of those, NFS is the only one that I could find that creates backing_dev_info structures. The rest seems to fall back to default_backing_dev_info. With my recent per bdi dirty limit patches the bdi has become more important than it has been in the past. While falling back to the default_backing_dev_info isn't wrong per-se, it isn't right either. Could I implore the various maintainers to look into this issue for their respective filesystem. I'll try and come up with some patches to address this, but feel free to beat me to it. I would like to understand more about your patches to see what bdi values makes sense for CIFS and how to report possible congestion back to the page manager. So, what my recent patches do is carve up the total writeback cache size, or dirty page limit as we call it, proportionally to a BDIs writeout speed. So a fast device gets more than a slow device, but will not starve it. However, for this to work, each device, or remote backing store in the case of networked filesystems, need to have a BDI. I had been thinking about setting bdi-ra_pages so that we do more sensible readahead and writebehind - better matching what is possible over the network and what the server prefers. Well, you'd first have to create backing_dev_info instances before setting that value :-) SMB/CIFS Servers typically allow a maximum of 50 requests in parallel at one time from one client (although this is adjustable for some). That seems like a perfect point to set congestion. So in short, stick a struct backing_dev_info into whatever represents a client, initialize it using bdi_init(), destroy using bdi_destroy(). Oh, and the most important point, make your fresh I_NEW inodes point to this bdi struct. 
Mark it congested once you have 50 (or more) outstanding requests, clear congestion when you drop below 50. and you should be set. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote: On 10/29/07, Andrew Morton [EMAIL PROTECTED] wrote: On Mon, 22 Oct 2007 16:40:57 +0200 Stefani Seibold [EMAIL PROTECTED] wrote: The problem original occurs with the fb_defio driver (driver/video/fb_defio.c). This driver use the vm_ops.page_mkwrite() handler for tracking the modified pages, which will be in an extra thread handled, to perform the IO and clean and write protect all pages with page_clean(). Hi, An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255. I understood from the thread that PeterZ is looking into page_mkclean changes which I guess went into 2.6.23. I'm also happy to help in any way if the way we're doing fb_defio needs to change. Yeah, its the truncate race stuff introduced by Nick in d0217ac04ca6591841e5665f518e38064f4e65bd I'm a bit at a loss on how to go around fixing this. One ugly idea I had was to check page-mapping before going into page_mkwrite() and when that is null, don't bother with the truncate check. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Mon, 2007-10-29 at 11:11 +0100, Peter Zijlstra wrote: On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote: On 10/29/07, Andrew Morton [EMAIL PROTECTED] wrote: On Mon, 22 Oct 2007 16:40:57 +0200 Stefani Seibold [EMAIL PROTECTED] wrote: The problem original occurs with the fb_defio driver (driver/video/fb_defio.c). This driver use the vm_ops.page_mkwrite() handler for tracking the modified pages, which will be in an extra thread handled, to perform the IO and clean and write protect all pages with page_clean(). Hi, An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255. I understood from the thread that PeterZ is looking into page_mkclean changes which I guess went into 2.6.23. I'm also happy to help in any way if the way we're doing fb_defio needs to change. Yeah, its the truncate race stuff introduced by Nick in d0217ac04ca6591841e5665f518e38064f4e65bd I'm a bit at a loss on how to go around fixing this. One ugly idea I had was to check page-mapping before going into page_mkwrite() and when that is null, don't bother with the truncate check. Something like this --- mm/memory.c |4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) Index: linux-2.6/mm/memory.c === --- linux-2.6.orig/mm/memory.c +++ linux-2.6/mm/memory.c @@ -2300,6 +2300,8 @@ static int __do_fault(struct mm_struct * * to become writable */ if (vma-vm_ops-page_mkwrite) { + struct address_space *mapping = page-mapping; + unlock_page(page); if (vma-vm_ops-page_mkwrite(vma, page) 0) { ret = VM_FAULT_SIGBUS; @@ -2314,7 +2316,7 @@ static int __do_fault(struct mm_struct * * reworking page_mkwrite locking API, which * is better done later. */ - if (!page-mapping) { + if (mapping != page-mapping) { ret = 0; anon = 1; /* no anon but release vmf.page */ goto out; - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote: On 10/29/07, Andrew Morton [EMAIL PROTECTED] wrote: On Mon, 22 Oct 2007 16:40:57 +0200 Stefani Seibold [EMAIL PROTECTED] wrote: The problem original occurs with the fb_defio driver (driver/video/fb_defio.c). This driver use the vm_ops.page_mkwrite() handler for tracking the modified pages, which will be in an extra thread handled, to perform the IO and clean and write protect all pages with page_clean(). An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255. I understood from the thread that PeterZ is looking into page_mkclean changes which I guess went into 2.6.23. I'm also happy to help in any way if the way we're doing fb_defio needs to change. OK, seems I can't read. Or at least, I missed a large part of the problem. page_mkclean() hasn't changed, it was -page_mkwrite() that changed. And looking at the fb_defio code, I'm not sure I understand how its page_mkclean() use could ever have worked. The proposed patch [1] only fixes the issue of -page_mkwrite() on vmalloc()'ed memory. Not page_mkclean(), and that has never worked from what I can make of it. Jaya, could you shed some light on this? I presume you had your display working. [1] which I will clean up and resend after this issue is cleared up - and preferably tested by someone who has this hardware. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Mon, 2007-10-29 at 13:51 -0400, Jaya Kumar wrote: On 10/29/07, Peter Zijlstra [EMAIL PROTECTED] wrote: On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote: An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255. I understood from the thread that PeterZ is looking into page_mkclean changes which I guess went into 2.6.23. I'm also happy to help in any way if the way we're doing fb_defio needs to change. OK, seems I can't read. Or at least, I missed a large part of the problem. page_mkclean() hasn't changed, it was -page_mkwrite() that changed. And looking at the fb_defio code, I'm not sure I understand how its page_mkclean() use could ever have worked. The proposed patch [1] only fixes the issue of -page_mkwrite() on vmalloc()'ed memory. Not page_mkclean(), and that has never worked from what I can make of it. Jaya, could you shed some light on this? I presume you had your display working. I thought I had it working. I saw the display update after each mmap/write sequence to the framebuffer. I need to check if there's an munmap or anything else going on in between write sequences that would cause it to behave like page_mkclean was working. Is it correct to assume that page_mkclean should mark the pages read-only so that the next write would again trigger mkwrite? Well, yes, that is the intended behaviour. Even if the page was from a vmalloc_to_page()? That is the crux, I only ever implemented it for file pages. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [stable] 2.6.23 regression: top displaying 9999% CPU usage
On Mon, 2007-10-29 at 21:41 +0100, Ingo Molnar wrote: * Christian Borntraeger [EMAIL PROTECTED] wrote: - return clock_t_to_cputime(utime); + p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); + return p->prev_utime; } [...] I don't think it will work. It will make utime monotonic, but stime can still decrease. For example let sum_exec_runtime increase by a tiny little bit while utime will get a full additional tick. stime is sum-utime. So stime can still go backwards. So I think that we need this kind of logic for stime as well, no? yeah, probably. Peter? /me dons the brown paper bag while mumbling an agreement of sorts. I'll not attempt to come up with a patch as I fear I'll just make a bigger mess in my current state, hope to feel better tomorrow.. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Mon, 2007-10-29 at 19:17 +0100, Peter Zijlstra wrote: On Mon, 2007-10-29 at 13:51 -0400, Jaya Kumar wrote: On 10/29/07, Peter Zijlstra [EMAIL PROTECTED] wrote: On Mon, 2007-10-29 at 01:17 -0700, Jaya Kumar wrote: An aside, I just tested that deferred IO works fine on 2.6.22.10/pxa255. I understood from the thread that PeterZ is looking into page_mkclean changes which I guess went into 2.6.23. I'm also happy to help in any way if the way we're doing fb_defio needs to change. OK, seems I can't read. Or at least, I missed a large part of the problem. page_mkclean() hasn't changed, it was -page_mkwrite() that changed. And looking at the fb_defio code, I'm not sure I understand how its page_mkclean() use could ever have worked. The proposed patch [1] only fixes the issue of -page_mkwrite() on vmalloc()'ed memory. Not page_mkclean(), and that has never worked from what I can make of it. Jaya, could you shed some light on this? I presume you had your display working. I thought I had it working. I saw the display update after each mmap/write sequence to the framebuffer. I need to check if there's an munmap or anything else going on in between write sequences that would cause it to behave like page_mkclean was working. Is it correct to assume that page_mkclean should mark the pages read-only so that the next write would again trigger mkwrite? Well, yes, that is the intended behaviour. Even if the page was from a vmalloc_to_page()? That is the crux, I only ever implemented it for file pages. Hmm, so these vmalloc pages are mapped into user-space with remap_pfn_range(), which doesn't have any form of rmap. That is, given a pfn there is no way to obtain all ptes for it. So the interface to page_mkclean() could never work for these (as it only provides a struct page *). 
[ also, remap_vmalloc_range() suffers similar issues, only file and anon have proper rmap ] I'm not sure we want full rmap for remap_pfn/vmalloc_range, but perhaps we could assist drivers in maintaining and using vma lists. I think page_mkclean_one() would work if you'd manually set page-index and iterate the vmas yourself. Although atm I'm not sure of anything so don't pin me on it. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Mon, 2007-10-29 at 21:22 -0400, Jaya Kumar wrote: On 10/29/07, Peter Zijlstra [EMAIL PROTECTED] wrote: [ also, remap_vmalloc_range() suffers similar issues, only file and anon have proper rmap ] I'm not sure we want full rmap for remap_pfn/vmalloc_range, but perhaps we could assist drivers in maintaining and using vma lists. I think page_mkclean_one() would work if you'd manually set page->index and iterate the vmas yourself. Although atm I'm not sure of anything so don't pin me on it. :-) If it's anybody's fault, it's mine for not testing properly. My bad. In the case of defio, I think it's no trouble to build a list of vmas at mmap time and then to iterate through them when it's ready for mkclean time as you suggested. I don't fully understand page->index yet. I had thought it was only used by swap cache or file map. On an unrelated note, I was looking for somewhere to stuff a 16 bit offset (so that I have a cheap way to know which struct page corresponds to which framebuffer block or offset) for another driver. I had thought page->index was it but I think I am wrong now. Yeah, page->index is used along with vma->vm_pgoff and vma->vm_start to determine the address of the page in the given vma: address = vma->vm_start + ((page->index - vma->vm_pgoff) << PAGE_SHIFT); and from that address the pte can be found by walking the vma->vm_mm page tables. So page->index does what you want it to, identify which part of the framebuffer this particular page belongs to. signature.asc Description: This is a digitally signed message part
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Tue, 2007-10-30 at 12:39 +, Hugh Dickins wrote: On Tue, 30 Oct 2007, Stefani Seibold wrote: the question is how can i get all pte's from a vmalloc'ed memory. Due to the zeroed mapping pointer i dont see how to do this? The mapping pointer is zeroed because you've done nothing to set it. Below is how I answered you a week ago. But this is new territory (extending page_mkclean to work on more than just pagecache pages), I'm still unsure what would be the safest way to do it. Quite, I think manual usage of page_mkclean_one() on the vma gotten from mmap() along with properly setting page-index is the simplest solution to make work. Making page_mkclean(struct page *) work for remap_pfn/vmalloc_range() style mmaps would require extending rmap to work with those, which includes setting page-mapping to point to a anon_vma like object. But that sounds like a lot of work, and I'm not sure its worth the overhead, because so far all users of remap_pfn/vmalloc_range() have survived without. signature.asc Description: This is a digitally signed message part
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Tue, 2007-10-30 at 09:16 -0400, Jaya Kumar wrote: On 10/30/07, Peter Zijlstra [EMAIL PROTECTED] wrote: So page-index does what you want it to, identify which part of the framebuffer this particular page belongs to. Ok. I'm attempting to walk the code sequence. Here's what I think: - driver loads - driver vmalloc()s its fb - this creates the necessary pte entries well, one set thereof, the kernel mappings, which for this purpose are the least interesting. then... - app mmap(/dev/fb0) - vma is created - defio mmap adds this vma to private list (equivalent of address_space or anon_vma) - app touches base + pixel(128,128) = base + 16k - page fault - defio nopage gets called - defio nopage does vmalloc_to_page(base+16k) this installs a user space page table entry for your page; this is the interesting one as it carries the user-dirty state. - that finds the correct struct page corresponding to that vaddr. page-index has not been set by anyone so far, right? * ah... i see, you are suggesting that this is where I could set the index since i know the offset i want it to represent. right? Not quite, you would set that right after vmallocing, just set an increasing page-index starting with 0 for the first page. Then ensure your vma-vm_pgoff is 0 (which should be the case since userspace will most likely mmap the whole thing, and if not it still gets what it expects). - defio mkwrite get called. defio adds page to its list. schedules delayed work - app keeps writing the page - delayed work occurs - foreach vma { foreach page { page_mkclean_one(page, vma) } Yeah, page_mkclean_one(page, vma) will use vma_address() to obtain an user-space address for the page in this vma using page-index and the formula from the last email, this address is then used to walk the page tables and obtain a pte. This will be the user-space pte installed by your nopfn handler. Not the kernel vmap pte resulting from the vmalloc() call. - cycle repeats... 
signature.asc Description: This is a digitally signed message part
Re: vm_ops.page_mkwrite() fails with vmalloc on 2.6.23
On Tue, 2007-10-30 at 15:47 +, Hugh Dickins wrote: On Tue, 30 Oct 2007, Peter Zijlstra wrote: On Tue, 2007-10-30 at 09:16 -0400, Jaya Kumar wrote: - defio mmap adds this vma to private list (equivalent of address_space or anon_vma) - foreach vma { foreach page { page_mkclean_one(page, vma) } Yeah, page_mkclean_one(page, vma) will use vma_address() to obtain an user-space address for the page in this vma using page-index and the formula from the last email, this address is then used to walk the page tables and obtain a pte. I don't understand why you suggested an anon_vma, nor why Jaya is suggesting a private list. All vmas mapping /dev/fb0 will be kept in the prio_tree rooted in its struct address_space (__vma_link_file in mm/mmap.c). And page_mkclean gets page_mkclean_file to walk that very tree. The missing part is just the setting of page-mapping to point to that struct address_space (and clearing it before finally freeing the pages), and the setting of page-index as you described. Isn't it? Hmm, there is a thought. I had not considered that mapping a chardev would have that effect. I'd have to have a look at the actual code, but yeah, that might very well work out. How silly of me. Thanks! signature.asc Description: This is a digitally signed message part
[PATCH 32/33] nfs: fix various memory recursions possible with swap over NFS.
GFP_NOFS is not enough, since swap traffic is IO, hence fall back to GFP_NOIO. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/nfs/pagelist.c |2 +- fs/nfs/write.c|6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) Index: linux-2.6/fs/nfs/write.c === --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -44,7 +44,7 @@ static struct kmem_cache *nfs_wdata_cach struct nfs_write_data *nfs_commit_alloc(void) { - struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS); + struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOIO); if (p) { memset(p, 0, sizeof(*p)); @@ -68,7 +68,7 @@ void nfs_commit_free(struct nfs_write_da struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { - struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS); + struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOIO); if (p) { memset(p, 0, sizeof(*p)); @@ -77,7 +77,7 @@ struct nfs_write_data *nfs_writedata_all if (pagecount = ARRAY_SIZE(p-page_array)) p-pagevec = p-page_array; else { - p-pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); + p-pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOIO); if (!p-pagevec) { kmem_cache_free(nfs_wdata_cachep, p); p = NULL; Index: linux-2.6/fs/nfs/pagelist.c === --- linux-2.6.orig/fs/nfs/pagelist.c +++ linux-2.6/fs/nfs/pagelist.c @@ -27,7 +27,7 @@ static inline struct nfs_page * nfs_page_alloc(void) { struct nfs_page *p; - p = kmem_cache_alloc(nfs_page_cachep, GFP_KERNEL); + p = kmem_cache_alloc(nfs_page_cachep, GFP_NOIO); if (p) { memset(p, 0, sizeof(*p)); INIT_LIST_HEAD(p-wb_list); -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/33] mm: kmem_estimate_pages()
Provide a method to get the upper bound on the pages needed to allocate a given number of objects from a given kmem_cache. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/slab.h |3 + mm/slub.c| 82 +++ 2 files changed, 85 insertions(+) Index: linux-2.6/include/linux/slab.h === --- linux-2.6.orig/include/linux/slab.h +++ linux-2.6/include/linux/slab.h @@ -60,6 +60,7 @@ void kmem_cache_free(struct kmem_cache * unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); +unsigned kmem_estimate_pages(struct kmem_cache *cachep, gfp_t flags, int objects); /* * Please use this macro to create slab caches. Simply specify the @@ -94,6 +95,8 @@ int kmem_ptr_validate(struct kmem_cache void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); size_t ksize(const void *); +unsigned kestimate_single(size_t, gfp_t, int); +unsigned kestimate(gfp_t, size_t); /* * Allocator specific definitions. These are mainly used to establish optimized Index: linux-2.6/mm/slub.c === --- linux-2.6.orig/mm/slub.c +++ linux-2.6/mm/slub.c @@ -2293,6 +2293,37 @@ const char *kmem_cache_name(struct kmem_ EXPORT_SYMBOL(kmem_cache_name); /* + * return the max number of pages required to allocated count + * objects from the given cache + */ +unsigned kmem_estimate_pages(struct kmem_cache *s, gfp_t flags, int objects) +{ + unsigned long slabs; + + if (WARN_ON(!s) || WARN_ON(!s-objects)) + return 0; + + slabs = DIV_ROUND_UP(objects, s-objects); + + /* +* Account the possible additional overhead if the slab holds more that +* one object. +*/ + if (s-objects 1) { + /* +* Account the possible additional overhead if per cpu slabs +* are currently empty and have to be allocated. This is very +* unlikely but a possible scenario immediately after +* kmem_cache_shrink. 
+*/ + slabs += num_online_cpus(); + } + + return slabs s-order; +} +EXPORT_SYMBOL_GPL(kmem_estimate_pages); + +/* * Attempt to free all slabs on a node. Return the number of slabs we * were unable to free. */ @@ -2630,6 +2661,57 @@ void kfree(const void *x) EXPORT_SYMBOL(kfree); /* + * return the max number of pages required to allocate @count objects + * of @size bytes from kmalloc given @flags. + */ +unsigned kestimate_single(size_t size, gfp_t flags, int count) +{ + struct kmem_cache *s = get_slab(size, flags); + if (!s) + return 0; + + return kmem_estimate_pages(s, flags, count); + +} +EXPORT_SYMBOL_GPL(kestimate_single); + +/* + * return the max number of pages required to allocate @bytes from kmalloc + * in an unspecified number of allocation of heterogeneous size. + */ +unsigned kestimate(gfp_t flags, size_t bytes) +{ + int i; + unsigned long pages; + + /* +* multiply by two, in order to account the worst case slack space +* due to the power-of-two allocation sizes. +*/ + pages = DIV_ROUND_UP(2 * bytes, PAGE_SIZE); + + /* +* add the kmem_cache overhead of each possible kmalloc cache +*/ + for (i = 1; i PAGE_SHIFT; i++) { + struct kmem_cache *s; + +#ifdef CONFIG_ZONE_DMA + if (unlikely(flags SLUB_DMA)) + s = dma_kmalloc_cache(i, flags); + else +#endif + s = kmalloc_caches[i]; + + if (s) + pages += kmem_estimate_pages(s, flags, 0); + } + + return pages; +} +EXPORT_SYMBOL_GPL(kestimate); + +/* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the * most items in use come first. New allocations will then fill those up -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 24/33] mm: prepare swap entry methods for use in page methods
Move around the swap entry methods in preparation for use from page methods. Also provide a function to obtain the swap_info_struct backing a swap cache page. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/mm.h |8 include/linux/swap.h| 48 include/linux/swapops.h | 44 mm/swapfile.c |1 + 4 files changed, 57 insertions(+), 44 deletions(-) Index: linux-2.6/include/linux/mm.h === --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -12,6 +12,7 @@ #include linux/prio_tree.h #include linux/debug_locks.h #include linux/mm_types.h +#include linux/swap.h struct mempolicy; struct anon_vma; @@ -573,6 +574,13 @@ static inline struct address_space *page return mapping; } +static inline struct swap_info_struct *page_swap_info(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + BUG_ON(!PageSwapCache(page)); + return get_swap_info_struct(swp_type(swap)); +} + static inline int PageAnon(struct page *page) { return ((unsigned long)page-mapping PAGE_MAPPING_ANON) != 0; Index: linux-2.6/include/linux/swap.h === --- linux-2.6.orig/include/linux/swap.h +++ linux-2.6/include/linux/swap.h @@ -80,6 +80,50 @@ typedef struct { } swp_entry_t; /* + * swapcache pages are stored in the swapper_space radix tree. We want to + * get good packing density in that tree, so the index should be dense in + * the low-order bits. + * + * We arrange the `type' and `offset' fields so that `type' is at the five + * high-order bits of the swp_entry_t and `offset' is right-aligned in the + * remaining bits. + * + * swp_entry_t's are *never* stored anywhere in their arch-dependent format. 
+ */ +#define SWP_TYPE_SHIFT(e) (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT) +#define SWP_OFFSET_MASK(e) ((1UL SWP_TYPE_SHIFT(e)) - 1) + +/* + * Store a type+offset into a swp_entry_t in an arch-independent format + */ +static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) +{ + swp_entry_t ret; + + ret.val = (type SWP_TYPE_SHIFT(ret)) | + (offset SWP_OFFSET_MASK(ret)); + return ret; +} + +/* + * Extract the `type' field from a swp_entry_t. The swp_entry_t is in + * arch-independent format + */ +static inline unsigned swp_type(swp_entry_t entry) +{ + return (entry.val SWP_TYPE_SHIFT(entry)); +} + +/* + * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in + * arch-independent format + */ +static inline pgoff_t swp_offset(swp_entry_t entry) +{ + return entry.val SWP_OFFSET_MASK(entry); +} + +/* * current-reclaim_state points to one of these when a task is running * memory reclaim */ @@ -326,6 +370,10 @@ static inline int valid_swaphandles(swp_ return 0; } +static inline struct swap_info_struct *get_swap_info_struct(unsigned type) +{ + return NULL; +} #define can_share_swap_page(p) (page_mapcount(p) == 1) static inline int move_to_swap_cache(struct page *page, swp_entry_t entry) Index: linux-2.6/include/linux/swapops.h === --- linux-2.6.orig/include/linux/swapops.h +++ linux-2.6/include/linux/swapops.h @@ -1,48 +1,4 @@ /* - * swapcache pages are stored in the swapper_space radix tree. We want to - * get good packing density in that tree, so the index should be dense in - * the low-order bits. - * - * We arrange the `type' and `offset' fields so that `type' is at the five - * high-order bits of the swp_entry_t and `offset' is right-aligned in the - * remaining bits. - * - * swp_entry_t's are *never* stored anywhere in their arch-dependent format. 
- */ -#define SWP_TYPE_SHIFT(e) (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT) -#define SWP_OFFSET_MASK(e) ((1UL SWP_TYPE_SHIFT(e)) - 1) - -/* - * Store a type+offset into a swp_entry_t in an arch-independent format - */ -static inline swp_entry_t swp_entry(unsigned long type, pgoff_t offset) -{ - swp_entry_t ret; - - ret.val = (type SWP_TYPE_SHIFT(ret)) | - (offset SWP_OFFSET_MASK(ret)); - return ret; -} - -/* - * Extract the `type' field from a swp_entry_t. The swp_entry_t is in - * arch-independent format - */ -static inline unsigned swp_type(swp_entry_t entry) -{ - return (entry.val SWP_TYPE_SHIFT(entry)); -} - -/* - * Extract the `offset' field from a swp_entry_t. The swp_entry_t is in - * arch-independent format - */ -static inline pgoff_t swp_offset(swp_entry_t entry) -{ - return entry.val SWP_OFFSET_MASK(entry); -} - -/* * Convert the arch-dependent pte representation of a swp_entry_t into an * arch-independent swp_entry_t. */ Index: linux-2.6/mm/swapfile.c
[PATCH 03/33] mm: slub: add knowledge of reserve pages
Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation contexts that are entitled to it. Care is taken to only touch the SLUB slow path. This is done to ensure reserve pages don't leak out and get consumed. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/slub_def.h |1 + mm/slub.c| 31 +++ 2 files changed, 24 insertions(+), 8 deletions(-) Index: linux-2.6/mm/slub.c === --- linux-2.6.orig/mm/slub.c +++ linux-2.6/mm/slub.c @@ -20,11 +20,12 @@ #include linux/mempolicy.h #include linux/ctype.h #include linux/kallsyms.h +#include internal.h /* * Lock order: * 1. slab_lock(page) - * 2. slab-list_lock + * 2. node-list_lock * * The slab_lock protects operations on the object of a particular * slab and its metadata in the page struct. If the slab lock @@ -1074,7 +1075,7 @@ static void setup_object(struct kmem_cac s-ctor(s, object); } -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve) { struct page *page; struct kmem_cache_node *n; @@ -1090,6 +1091,7 @@ static struct page *new_slab(struct kmem if (!page) goto out; + *reserve = page-reserve; n = get_node(s, page_to_nid(page)); if (n) atomic_long_inc(n-nr_slabs); @@ -1468,10 +1470,22 @@ static void *__slab_alloc(struct kmem_ca { void **object; struct page *new; + int reserve = 0; if (!c-page) goto new_slab; + if (unlikely(c-reserve)) { + /* +* If the current slab is a reserve slab and the current +* allocation context does not allow access to the reserves +* we must force an allocation to test the current levels. 
+*/ + if (!(gfp_to_alloc_flags(gfpflags) ALLOC_NO_WATERMARKS)) + goto alloc_slab; + reserve = 1; + } + slab_lock(c-page); if (unlikely(!node_match(c, node))) goto another_slab; @@ -1479,10 +1493,9 @@ load_freelist: object = c-page-freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(c-page))) + if (unlikely(SlabDebug(c-page) || reserve)) goto debug; - object = c-page-freelist; c-freelist = object[c-offset]; c-page-inuse = s-objects; c-page-freelist = NULL; @@ -1500,16 +1513,18 @@ new_slab: goto load_freelist; } +alloc_slab: if (gfpflags __GFP_WAIT) local_irq_enable(); - new = new_slab(s, gfpflags, node); + new = new_slab(s, gfpflags, node, reserve); if (gfpflags __GFP_WAIT) local_irq_disable(); if (new) { c = get_cpu_slab(s, smp_processor_id()); + c-reserve = reserve; if (c-page) { /* * Someone else populated the cpu_slab while we @@ -1537,8 +1552,7 @@ new_slab: } return NULL; debug: - object = c-page-freelist; - if (!alloc_debug_processing(s, c-page, object, addr)) + if (SlabDebug(c-page) !alloc_debug_processing(s, c-page, object, addr)) goto another_slab; c-page-inuse++; @@ -2010,10 +2024,11 @@ static struct kmem_cache_node *early_kme { struct page *page; struct kmem_cache_node *n; + int reserve; BUG_ON(kmalloc_caches-size sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags, node); + page = new_slab(kmalloc_caches, gfpflags, node, reserve); BUG_ON(!page); if (page_to_nid(page) != node) { Index: linux-2.6/include/linux/slub_def.h === --- linux-2.6.orig/include/linux/slub_def.h +++ linux-2.6/include/linux/slub_def.h @@ -17,6 +17,7 @@ struct kmem_cache_cpu { int node; unsigned int offset; unsigned int objsize; + int reserve; }; struct kmem_cache_node { -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 16/33] netvm: network reserve infrastructure
Provide the basic infrastructure to reserve and charge/account network memory. We provide the following reserve tree: 1) total network reserve 2)network TX reserve 3) protocol TX pages 4)network RX reserve 5) SKB data reserve [1] is used to make all the network reserves a single subtree, for easy manipulation. [2] and [4] are merely for eastetic reasons. The TX pages reserve [3] is assumed bounded by it being the upper bound of memory that can be used for sending pages (not quite true, but good enough) The SKB reserve [5] is an aggregate reserve, which is used to charge SKB data against in the fallback path. The consumers for these reserves are sockets marked with: SOCK_MEMALLOC Such sockets are to be used to service the VM (iow. to swap over). They must be handled kernel side, exposing such a socket to user-space is a BUG. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h | 35 +++- net/Kconfig|3 + net/core/sock.c| 113 + 3 files changed, 150 insertions(+), 1 deletion(-) Index: linux-2.6/include/net/sock.h === --- linux-2.6.orig/include/net/sock.h +++ linux-2.6/include/net/sock.h @@ -50,6 +50,7 @@ #include linux/skbuff.h /* struct sk_buff */ #include linux/mm.h #include linux/security.h +#include linux/reserve.h #include linux/filter.h @@ -397,6 +398,7 @@ enum sock_flags { SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ + SOCK_MEMALLOC, /* the VM depends on us - make sure we're serviced */ }; static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) @@ -419,9 +421,40 @@ static inline int sock_flag(struct sock return test_bit(flag, sk-sk_flags); } +static inline int sk_has_memalloc(struct sock *sk) +{ + return sock_flag(sk, SOCK_MEMALLOC); +} + +/* + * Guestimate the per request queue TX upper bound. + * + * Max packet size is 64k, and we need to reserve that much since the data + * might need to bounce it. 
Double it to be on the safe side. + */ +#define TX_RESERVE_PAGES DIV_ROUND_UP(2*65536, PAGE_SIZE) + +extern atomic_t memalloc_socks; + +extern struct mem_reserve net_rx_reserve; +extern struct mem_reserve net_skb_reserve; + +static inline int sk_memalloc_socks(void) +{ + return atomic_read(memalloc_socks); +} + +extern int rx_emergency_get(int bytes); +extern int rx_emergency_get_overcommit(int bytes); +extern void rx_emergency_put(int bytes); + +extern int sk_adjust_memalloc(int socks, long tx_reserve_pages); +extern int sk_set_memalloc(struct sock *sk); +extern int sk_clear_memalloc(struct sock *sk); + static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask) { - return gfp_mask; + return gfp_mask | (sk-sk_allocation __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) Index: linux-2.6/net/core/sock.c === --- linux-2.6.orig/net/core/sock.c +++ linux-2.6/net/core/sock.c @@ -112,6 +112,7 @@ #include linux/tcp.h #include linux/init.h #include linux/highmem.h +#include linux/reserve.h #include asm/uaccess.h #include asm/system.h @@ -213,6 +214,111 @@ __u32 sysctl_rmem_default __read_mostly /* Maximal space eaten by iovec or ancilliary data plus some space */ int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); +atomic_t memalloc_socks; + +static struct mem_reserve net_reserve; +struct mem_reserve net_rx_reserve; +struct mem_reserve net_skb_reserve; +static struct mem_reserve net_tx_reserve; +static struct mem_reserve net_tx_pages; + +EXPORT_SYMBOL_GPL(net_rx_reserve); /* modular ipv6 only */ +EXPORT_SYMBOL_GPL(net_skb_reserve); /* modular ipv6 only */ + +/* + * is there room for another emergency packet? 
+ */ +static int __rx_emergency_get(int bytes, bool overcommit) +{ + return mem_reserve_kmalloc_charge(net_skb_reserve, bytes, overcommit); +} + +int rx_emergency_get(int bytes) +{ + return __rx_emergency_get(bytes, false); +} + +int rx_emergency_get_overcommit(int bytes) +{ + return __rx_emergency_get(bytes, true); +} + +void rx_emergency_put(int bytes) +{ + mem_reserve_kmalloc_charge(net_skb_reserve, -bytes, 0); +} + +/** + * sk_adjust_memalloc - adjust the global memalloc reserve for critical RX + * @socks: number of new %SOCK_MEMALLOC sockets + * @tx_resserve_pages: number of pages to (un)reserve for TX + * + * This function adjusts the memalloc reserve based on system demand. + * The RX reserve is a limit, and only added once, not for each socket. + * + * NOTE: + *@tx_reserve_pages is an upper-bound of memory used for TX hence + *we need not account the pages like we do for RX pages
[PATCH 08/33] mm: emergency pool
Provide means to reserve a specific amount of pages. The emergency pool is separated from the min watermark because ALLOC_HARDER and ALLOC_HIGH modify the watermark in a relative way and thus do not ensure a strict minimum. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/mmzone.h |3 + mm/page_alloc.c| 82 +++-- mm/vmstat.c|6 +-- 3 files changed, 78 insertions(+), 13 deletions(-) Index: linux-2.6/include/linux/mmzone.h === --- linux-2.6.orig/include/linux/mmzone.h +++ linux-2.6/include/linux/mmzone.h @@ -213,7 +213,7 @@ enum zone_type { struct zone { /* Fields commonly accessed by the page allocator */ - unsigned long pages_min, pages_low, pages_high; + unsigned long pages_emerg, pages_min, pages_low, pages_high; /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several @@ -682,6 +682,7 @@ int sysctl_min_unmapped_ratio_sysctl_han struct file *, void __user *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int adjust_memalloc_reserve(int pages); extern int numa_zonelist_order_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -118,6 +118,8 @@ static char * const zone_names[MAX_NR_ZO static DEFINE_SPINLOCK(min_free_lock); int min_free_kbytes = 1024; +static DEFINE_MUTEX(var_free_mutex); +int var_free_kbytes; unsigned long __meminitdata nr_kernel_pages; unsigned long __meminitdata nr_all_pages; @@ -1252,7 +1254,7 @@ int zone_watermark_ok(struct zone *z, in if (alloc_flags ALLOC_HARDER) min -= min / 4; - if (free_pages = min + z-lowmem_reserve[classzone_idx]) + if (free_pages = min + z-lowmem_reserve[classzone_idx] + z-pages_emerg) return 0; for (o = 0; o order; o++) { /* At the next order, this order's pages become unavailable */ 
@@ -1733,8 +1735,8 @@ nofail_alloc: nopage: if (!(gfp_mask __GFP_NOWARN) printk_ratelimit()) { printk(KERN_WARNING %s: page allocation failure. -order:%d, mode:0x%x\n, - p-comm, order, gfp_mask); +order:%d, mode:0x%x, alloc_flags:0x%x, pflags:0x%x\n, + p-comm, order, gfp_mask, alloc_flags, p-flags); dump_stack(); show_mem(); } @@ -1952,9 +1954,9 @@ void show_free_areas(void) \n, zone-name, K(zone_page_state(zone, NR_FREE_PAGES)), - K(zone-pages_min), - K(zone-pages_low), - K(zone-pages_high), + K(zone-pages_emerg + zone-pages_min), + K(zone-pages_emerg + zone-pages_low), + K(zone-pages_emerg + zone-pages_high), K(zone_page_state(zone, NR_ACTIVE)), K(zone_page_state(zone, NR_INACTIVE)), K(zone-present_pages), @@ -4113,7 +4115,7 @@ static void calculate_totalreserve_pages } /* we treat pages_high as reserved pages. */ - max += zone-pages_high; + max += zone-pages_high + zone-pages_emerg; if (max zone-present_pages) max = zone-present_pages; @@ -4170,7 +4172,8 @@ static void setup_per_zone_lowmem_reserv */ static void __setup_per_zone_pages_min(void) { - unsigned long pages_min = min_free_kbytes (PAGE_SHIFT - 10); + unsigned pages_min = min_free_kbytes (PAGE_SHIFT - 10); + unsigned pages_emerg = var_free_kbytes (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; struct zone *zone; unsigned long flags; @@ -4182,11 +4185,13 @@ static void __setup_per_zone_pages_min(v } for_each_zone(zone) { - u64 tmp; + u64 tmp, tmp_emerg; spin_lock_irqsave(zone-lru_lock, flags); tmp = (u64)pages_min * zone-present_pages; do_div(tmp, lowmem_pages); + tmp_emerg = (u64)pages_emerg * zone-present_pages; + do_div(tmp_emerg, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't
[PATCH 30/33] nfs: swap vs nfs_writepage
For now just use the -writepage() path for swap traffic. Trond would like to see -swap_page() or some such additional a_op. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/nfs/write.c | 23 +++ 1 file changed, 23 insertions(+) Index: linux-2.6/fs/nfs/write.c === --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -336,6 +336,29 @@ static int nfs_do_writepage(struct page nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); + if (unlikely(IS_SWAPFILE(inode))) { + struct rpc_cred *cred; + struct nfs_open_context *ctx; + int status; + + cred = rpcauth_lookupcred(NFS_CLIENT(inode)-cl_auth, 0); + if (IS_ERR(cred)) + return PTR_ERR(cred); + + ctx = nfs_find_open_context(inode, cred, FMODE_WRITE); + if (!ctx) + return -EBADF; + + status = nfs_writepage_setup(ctx, page, 0, nfs_page_length(page)); + + put_nfs_open_context(ctx); + + if (status 0) { + nfs_set_pageerror(page); + return status; + } + } + nfs_pageio_cond_complete(pgio, page-index); return nfs_page_async_flush(pgio, page); } -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 26/33] mm: methods for teaching filesystems about PG_swapcache pages
In order to teach filesystems to handle swap cache pages, two new page functions are introduced: pgoff_t page_file_index(struct page *); struct address_space *page_file_mapping(struct page *); page_file_index - gives the offset of this page in the file in PAGE_CACHE_SIZE blocks. Like page-index is for mapped pages, this function also gives the correct index for PG_swapcache pages. page_file_mapping - gives the mapping backing the actual page; that is for swap cache pages it will give swap_file-f_mapping. page_offset() is modified to use page_file_index(), so that it will give the expected result, even for PG_swapcache pages. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/mm.h | 26 ++ include/linux/pagemap.h |2 +- 2 files changed, 27 insertions(+), 1 deletion(-) Index: linux-2.6/include/linux/mm.h === --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -13,6 +13,7 @@ #include linux/debug_locks.h #include linux/mm_types.h #include linux/swap.h +#include linux/fs.h struct mempolicy; struct anon_vma; @@ -581,6 +582,16 @@ static inline struct swap_info_struct *p return get_swap_info_struct(swp_type(swap)); } +static inline +struct address_space *page_file_mapping(struct page *page) +{ +#ifdef CONFIG_SWAP_FILE + if (unlikely(PageSwapCache(page))) + return page_swap_info(page)-swap_file-f_mapping; +#endif + return page-mapping; +} + static inline int PageAnon(struct page *page) { return ((unsigned long)page-mapping PAGE_MAPPING_ANON) != 0; @@ -598,6 +609,21 @@ static inline pgoff_t page_index(struct } /* + * Return the file index of the page. 
Regular pagecache pages use -index + * whereas swapcache pages use swp_offset(-private) + */ +static inline pgoff_t page_file_index(struct page *page) +{ +#ifdef CONFIG_SWAP_FILE + if (unlikely(PageSwapCache(page))) { + swp_entry_t swap = { .val = page_private(page) }; + return swp_offset(swap); + } +#endif + return page-index; +} + +/* * The atomic page-_mapcount, like _count, starts from -1: * so that transitions both from it and to it can be tracked, * using atomic_inc_and_test and atomic_add_negative(-1). Index: linux-2.6/include/linux/pagemap.h === --- linux-2.6.orig/include/linux/pagemap.h +++ linux-2.6/include/linux/pagemap.h @@ -145,7 +145,7 @@ extern void __remove_from_page_cache(str */ static inline loff_t page_offset(struct page *page) { - return ((loff_t)page-index) PAGE_CACHE_SHIFT; + return ((loff_t)page_file_index(page)) PAGE_CACHE_SHIFT; } static inline pgoff_t linear_page_index(struct vm_area_struct *vma, -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 02/33] mm: tag reserve pages
Tag pages allocated from the reserves with a non-zero page-reserve. This allows us to distinguish and account reserve pages. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/mm_types.h |1 + mm/page_alloc.c |4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) Index: linux-2.6/include/linux/mm_types.h === --- linux-2.6.orig/include/linux/mm_types.h +++ linux-2.6/include/linux/mm_types.h @@ -70,6 +70,7 @@ struct page { union { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* SLUB: freelist req. slab lock */ + int reserve;/* page_alloc: page is a reserve page */ }; struct list_head lru; /* Pageout list, eg. active_list * protected by zone-lru_lock ! Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -1448,8 +1448,10 @@ zonelist_scan: } page = buffered_rmqueue(zonelist, zone, order, gfp_mask); - if (page) + if (page) { + page-reserve = !!(alloc_flags ALLOC_NO_WATERMARKS); break; + } this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 28/33] nfs: teach the NFS client how to treat PG_swapcache pages
Replace all relevant occurences of page-index and page-mapping in the NFS client with the new page_file_index() and page_file_mapping() functions. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/nfs/file.c |8 fs/nfs/internal.h |7 --- fs/nfs/pagelist.c |6 +++--- fs/nfs/read.c |6 +++--- fs/nfs/write.c| 49 + 5 files changed, 39 insertions(+), 37 deletions(-) Index: linux-2.6/fs/nfs/file.c === --- linux-2.6.orig/fs/nfs/file.c +++ linux-2.6/fs/nfs/file.c @@ -357,7 +357,7 @@ static void nfs_invalidate_page(struct p if (offset != 0) return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_cancel(page-mapping-host, page); + nfs_wb_page_cancel(page_file_mapping(page)-host, page); } static int nfs_release_page(struct page *page, gfp_t gfp) @@ -368,7 +368,7 @@ static int nfs_release_page(struct page static int nfs_launder_page(struct page *page) { - return nfs_wb_page(page-mapping-host, page); + return nfs_wb_page(page_file_mapping(page)-host, page); } const struct address_space_operations nfs_file_aops = { @@ -397,13 +397,13 @@ static int nfs_vm_page_mkwrite(struct vm loff_t offset; lock_page(page); - mapping = page-mapping; + mapping = page_file_mapping(page); if (mapping != vma-vm_file-f_path.dentry-d_inode-i_mapping) { unlock_page(page); return -EINVAL; } pagelen = nfs_page_length(page); - offset = (loff_t)page-index PAGE_CACHE_SHIFT; + offset = (loff_t)page_file_index(page) PAGE_CACHE_SHIFT; unlock_page(page); /* Index: linux-2.6/fs/nfs/pagelist.c === --- linux-2.6.orig/fs/nfs/pagelist.c +++ linux-2.6/fs/nfs/pagelist.c @@ -77,11 +77,11 @@ nfs_create_request(struct nfs_open_conte * update_nfs_request below if the region is not locked. 
*/ req-wb_page= page; atomic_set(req-wb_complete, 0); - req-wb_index = page-index; + req-wb_index = page_file_index(page); page_cache_get(page); BUG_ON(PagePrivate(page)); BUG_ON(!PageLocked(page)); - BUG_ON(page-mapping-host != inode); + BUG_ON(page_file_mapping(page)-host != inode); req-wb_offset = offset; req-wb_pgbase = offset; req-wb_bytes = count; @@ -383,7 +383,7 @@ void nfs_pageio_cond_complete(struct nfs * nfs_scan_list - Scan a list for matching requests * @nfsi: NFS inode * @dst: Destination list - * @idx_start: lower bound of page-index to scan + * @idx_start: lower bound of page_file_index(page) to scan * @npages: idx_start + npages sets the upper bound to scan. * @tag: tag to scan for * Index: linux-2.6/fs/nfs/read.c === --- linux-2.6.orig/fs/nfs/read.c +++ linux-2.6/fs/nfs/read.c @@ -460,11 +460,11 @@ static const struct rpc_call_ops nfs_rea int nfs_readpage(struct file *file, struct page *page) { struct nfs_open_context *ctx; - struct inode *inode = page-mapping-host; + struct inode *inode = page_file_mapping(page)-host; int error; dprintk(NFS: nfs_readpage (%p [EMAIL PROTECTED])\n, - page, PAGE_CACHE_SIZE, page-index); + page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); @@ -511,7 +511,7 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page-mapping-host; + struct inode *inode = page_file_mapping(page)-host; struct nfs_page *new; unsigned int len; int error; Index: linux-2.6/fs/nfs/write.c === --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -126,7 +126,7 @@ static struct nfs_page *nfs_page_find_re static struct nfs_page *nfs_page_find_request(struct page *page) { - struct inode *inode = page-mapping-host; + struct inode *inode = page_file_mapping(page)-host; struct nfs_page *req = NULL; spin_lock(inode-i_lock); @@ -138,13 +138,13 @@ static struct 
nfs_page *nfs_page_find_re /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { - struct inode *inode = page-mapping-host; + struct inode *inode = page_file_mapping(page)-host; loff_t end, i_size = i_size_read(inode); pgoff_t end_index = (i_size - 1) PAGE_CACHE_SHIFT; - if (i_size 0 page-index end_index) + if (i_size 0 page_file_index(page
[PATCH 22/33] netfilter: NF_QUEUE vs emergency skbs
Avoid memory getting stuck waiting for userspace, drop all emergency packets. This of course requires the regular storage route to not include an NF_QUEUE target ;-) Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- net/netfilter/core.c |3 +++ 1 file changed, 3 insertions(+) Index: linux-2.6/net/netfilter/core.c === --- linux-2.6.orig/net/netfilter/core.c +++ linux-2.6/net/netfilter/core.c @@ -181,9 +181,12 @@ next_hook: ret = 1; goto unlock; } else if (verdict == NF_DROP) { +drop: kfree_skb(*pskb); ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { + if (skb_emergency(*pskb)) + goto drop; NFDEBUG("nf_hook: Verdict = QUEUE.\n"); if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn, verdict >> NF_VERDICT_BITS)) -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 19/33] netvm: hook skb allocation to reserves
Change the skb allocation api to indicate RX usage and use this to fall back to the reserve when needed. SKBs allocated from the reserve are tagged in skb-emergency. Teach all other skb ops about emergency skbs and the reserve accounting. Use the (new) packet split API to allocate and track fragment pages from the emergency reserve. Do this using an atomic counter in page-index. This is needed because the fragments have a different sharing semantic than that indicated by skb_shinfo()-dataref. Note that the decision to distinguish between regular and emergency SKBs allows the accounting overhead to be limited to the later kind. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/mm_types.h |1 include/linux/skbuff.h | 25 +- net/core/skbuff.c| 173 +-- 3 files changed, 173 insertions(+), 26 deletions(-) Index: linux-2.6/include/linux/skbuff.h === --- linux-2.6.orig/include/linux/skbuff.h +++ linux-2.6/include/linux/skbuff.h @@ -289,7 +289,8 @@ struct sk_buff { __u8pkt_type:3, fclone:2, ipvs_property:1, - nf_trace:1; + nf_trace:1, + emergency:1; __be16 protocol; void(*destructor)(struct sk_buff *skb); @@ -341,10 +342,22 @@ struct sk_buff { #include asm/system.h +#define SKB_ALLOC_FCLONE 0x01 +#define SKB_ALLOC_RX 0x02 + +static inline bool skb_emergency(const struct sk_buff *skb) +{ +#ifdef CONFIG_NETVM + return unlikely(skb-emergency); +#else + return false; +#endif +} + extern void kfree_skb(struct sk_buff *skb); extern void __kfree_skb(struct sk_buff *skb); extern struct sk_buff *__alloc_skb(unsigned int size, - gfp_t priority, int fclone, int node); + gfp_t priority, int flags, int node); static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) { @@ -354,7 +367,7 @@ static inline struct sk_buff *alloc_skb( static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { - return __alloc_skb(size, priority, 1, -1); + return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, -1); } extern void kfree_skbmem(struct sk_buff 
*skb); @@ -1297,7 +1310,8 @@ static inline void __skb_queue_purge(str static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) { - struct sk_buff *skb = alloc_skb(length + NET_SKB_PAD, gfp_mask); + struct sk_buff *skb = + __alloc_skb(length + NET_SKB_PAD, gfp_mask, SKB_ALLOC_RX, -1); if (likely(skb)) skb_reserve(skb, NET_SKB_PAD); return skb; @@ -1343,6 +1357,7 @@ static inline struct sk_buff *netdev_all } extern struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask); +extern void __netdev_free_page(struct net_device *dev, struct page *page); /** * netdev_alloc_page - allocate a page for ps-rx on a specific device @@ -1359,7 +1374,7 @@ static inline struct page *netdev_alloc_ static inline void netdev_free_page(struct net_device *dev, struct page *page) { - __free_page(page); + __netdev_free_page(dev, page); } /** Index: linux-2.6/net/core/skbuff.c === --- linux-2.6.orig/net/core/skbuff.c +++ linux-2.6/net/core/skbuff.c @@ -179,21 +179,28 @@ EXPORT_SYMBOL(skb_truesize_bug); * %GFP_ATOMIC. */ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int fclone, int node) + int flags, int node) { struct kmem_cache *cache; struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; + int emergency = 0, memalloc = sk_memalloc_socks(); - cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + size = SKB_DATA_ALIGN(size); + cache = (flags SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; +#ifdef CONFIG_NETVM + if (memalloc (flags SKB_ALLOC_RX)) + gfp_mask |= __GFP_NOMEMALLOC|__GFP_NOWARN; +retry_alloc: +#endif /* Get the HEAD */ skb = kmem_cache_alloc_node(cache, gfp_mask ~__GFP_DMA, node); if (!skb) - goto out; + goto noskb; - size = SKB_DATA_ALIGN(size); data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info), gfp_mask, node); if (!data) @@ -203,6 +210,7 @@ struct sk_buff *__alloc_skb(unsigned int * See
[PATCH 13/33] net: wrap sk-sk_backlog_rcv()
Wrap calling sk-sk_backlog_rcv() in a function. This will allow extending the generic sk_backlog_rcv behaviour. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h |5 + net/core/sock.c |4 ++-- net/ipv4/tcp.c |2 +- net/ipv4/tcp_timer.c |2 +- 4 files changed, 9 insertions(+), 4 deletions(-) Index: linux-2.6/include/net/sock.h === --- linux-2.6.orig/include/net/sock.h +++ linux-2.6/include/net/sock.h @@ -485,6 +485,11 @@ static inline void sk_add_backlog(struct skb-next = NULL; } +static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + return sk-sk_backlog_rcv(sk, skb); +} + #define sk_wait_event(__sk, __timeo, __condition) \ ({ int __rc; \ release_sock(__sk); \ Index: linux-2.6/net/core/sock.c === --- linux-2.6.orig/net/core/sock.c +++ linux-2.6/net/core/sock.c @@ -320,7 +320,7 @@ int sk_receive_skb(struct sock *sk, stru */ mutex_acquire(sk-sk_lock.dep_map, 0, 1, _RET_IP_); - rc = sk-sk_backlog_rcv(sk, skb); + rc = sk_backlog_rcv(sk, skb); mutex_release(sk-sk_lock.dep_map, 1, _RET_IP_); } else @@ -1312,7 +1312,7 @@ static void __release_sock(struct sock * struct sk_buff *next = skb-next; skb-next = NULL; - sk-sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); /* * We are in process context here with softirqs Index: linux-2.6/net/ipv4/tcp.c === --- linux-2.6.orig/net/ipv4/tcp.c +++ linux-2.6/net/ipv4/tcp.c @@ -1134,7 +1134,7 @@ static void tcp_prequeue_process(struct * necessary */ local_bh_disable(); while ((skb = __skb_dequeue(tp-ucopy.prequeue)) != NULL) - sk-sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); local_bh_enable(); /* Clear memory counter. 
*/ Index: linux-2.6/net/ipv4/tcp_timer.c === --- linux-2.6.orig/net/ipv4/tcp_timer.c +++ linux-2.6/net/ipv4/tcp_timer.c @@ -196,7 +196,7 @@ static void tcp_delack_timer(unsigned lo NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED); while ((skb = __skb_dequeue(tp-ucopy.prequeue)) != NULL) - sk-sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); tp-ucopy.memory = 0; } -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 23/33] netvm: skb processing
In order to make sure emergency packets receive all memory needed to proceed ensure processing of emergency SKBs happens under PF_MEMALLOC. Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing. Skip taps, since those are user-space again. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h |5 + net/core/dev.c | 44 ++-- net/core/sock.c| 18 ++ 3 files changed, 61 insertions(+), 6 deletions(-) Index: linux-2.6/net/core/dev.c === --- linux-2.6.orig/net/core/dev.c +++ linux-2.6/net/core/dev.c @@ -1976,10 +1976,23 @@ int netif_receive_skb(struct sk_buff *sk struct net_device *orig_dev; int ret = NET_RX_DROP; __be16 type; + unsigned long pflags = current-flags; + + /* Emergency skb are special, they should +* - be delivered to SOCK_MEMALLOC sockets only +* - stay away from userspace +* - have bounded memory usage +* +* Use PF_MEMALLOC as a poor mans memory pool - the grouping kind. +* This saves us from propagating the allocation context down to all +* allocation sites. 
+*/ + if (skb_emergency(skb)) + current-flags |= PF_MEMALLOC; /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) - return NET_RX_DROP; + goto out; if (!skb-tstamp.tv64) net_timestamp(skb); @@ -1990,7 +2003,7 @@ int netif_receive_skb(struct sk_buff *sk orig_dev = skb_bond(skb); if (!orig_dev) - return NET_RX_DROP; + goto out; __get_cpu_var(netdev_rx_stat).total++; @@ -2009,6 +2022,9 @@ int netif_receive_skb(struct sk_buff *sk } #endif + if (skb_emergency(skb)) + goto skip_taps; + list_for_each_entry_rcu(ptype, ptype_all, list) { if (!ptype-dev || ptype-dev == skb-dev) { if (pt_prev) @@ -2017,6 +2033,7 @@ int netif_receive_skb(struct sk_buff *sk } } +skip_taps: #ifdef CONFIG_NET_CLS_ACT if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -2029,19 +2046,31 @@ int netif_receive_skb(struct sk_buff *sk if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { kfree_skb(skb); - goto out; + goto unlock; } skb-tc_verd = 0; ncls: #endif + if (skb_emergency(skb)) + switch(skb-protocol) { + case __constant_htons(ETH_P_ARP): + case __constant_htons(ETH_P_IP): + case __constant_htons(ETH_P_IPV6): + case __constant_htons(ETH_P_8021Q): + break; + + default: + goto drop; + } + skb = handle_bridge(skb, pt_prev, ret, orig_dev); if (!skb) - goto out; + goto unlock; skb = handle_macvlan(skb, pt_prev, ret, orig_dev); if (!skb) - goto out; + goto unlock; type = skb-protocol; list_for_each_entry_rcu(ptype, ptype_base[ntohs(type)15], list) { @@ -2056,6 +2085,7 @@ ncls: if (pt_prev) { ret = pt_prev-func(skb, skb-dev, pt_prev, orig_dev); } else { +drop: kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. 
:-) @@ -2063,8 +2093,10 @@ ncls: ret = NET_RX_DROP; } -out: +unlock: rcu_read_unlock(); +out: + tsk_restore_flags(current, pflags, PF_MEMALLOC); return ret; } Index: linux-2.6/include/net/sock.h === --- linux-2.6.orig/include/net/sock.h +++ linux-2.6/include/net/sock.h @@ -523,8 +523,13 @@ static inline void sk_add_backlog(struct skb-next = NULL; } +extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); + static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { + if (skb_emergency(skb)) + return __sk_backlog_rcv(sk, skb); + return sk-sk_backlog_rcv(sk, skb); } Index: linux-2.6/net/core/sock.c === --- linux-2.6.orig/net/core/sock.c +++ linux-2.6/net/core/sock.c @@ -319,6 +319,24 @@ int sk_clear_memalloc(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_clear_memalloc); +#ifdef CONFIG_NETVM +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int ret; + unsigned long pflags = current-flags; + + /* these should have been dropped before queueing */ + BUG_ON(!sk_has_memalloc(sk
[PATCH 25/33] mm: add support for non block device backed swap files
A new addres_space_operations method is added: int swapfile(struct address_space *, int) When during sys_swapon() this method is found and returns no error the swapper_space.a_ops will proxy to sis-swap_file-f_mapping-a_ops. The swapfile method will be used to communicate to the address_space that the VM relies on it, and the address_space should take adequate measures (like reserving memory for mempools or the like). Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- Documentation/filesystems/Locking |9 + include/linux/buffer_head.h |2 - include/linux/fs.h|1 include/linux/swap.h |3 + mm/Kconfig|3 + mm/page_io.c | 58 ++ mm/swap_state.c |5 +++ mm/swapfile.c | 22 +- 8 files changed, 101 insertions(+), 2 deletions(-) Index: linux-2.6/include/linux/swap.h === --- linux-2.6.orig/include/linux/swap.h +++ linux-2.6/include/linux/swap.h @@ -164,6 +164,7 @@ enum { SWP_USED= (1 0), /* is slot in swap_info[] used? */ SWP_WRITEOK = (1 1), /* ok to write to this swap?*/ SWP_ACTIVE = (SWP_USED | SWP_WRITEOK), + SWP_FILE= (1 2), /* file swap area */ /* add others here before... 
*/ SWP_SCANNING= (1 8), /* refcount in scan_swap_map */ }; @@ -264,6 +265,8 @@ extern void swap_unplug_io_fn(struct bac /* linux/mm/page_io.c */ extern int swap_readpage(struct file *, struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); +extern void swap_sync_page(struct page *page); +extern int swap_set_page_dirty(struct page *page); extern void end_swap_bio_read(struct bio *bio, int err); /* linux/mm/swap_state.c */ Index: linux-2.6/mm/page_io.c === --- linux-2.6.orig/mm/page_io.c +++ linux-2.6/mm/page_io.c @@ -17,6 +17,7 @@ #include linux/bio.h #include linux/swapops.h #include linux/writeback.h +#include linux/buffer_head.h #include asm/pgtable.h static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, @@ -102,6 +103,18 @@ int swap_writepage(struct page *page, st unlock_page(page); goto out; } +#ifdef CONFIG_SWAP_FILE + { + struct swap_info_struct *sis = page_swap_info(page); + if (sis-flags SWP_FILE) { + ret = sis-swap_file-f_mapping- + a_ops-writepage(page, wbc); + if (!ret) + count_vm_event(PSWPOUT); + return ret; + } + } +#endif bio = get_swap_bio(GFP_NOIO, page_private(page), page, end_swap_bio_write); if (bio == NULL) { @@ -120,6 +133,39 @@ out: return ret; } +#ifdef CONFIG_SWAP_FILE +void swap_sync_page(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis-flags SWP_FILE) { + const struct address_space_operations * a_ops = + sis-swap_file-f_mapping-a_ops; + if (a_ops-sync_page) + a_ops-sync_page(page); + } else + block_sync_page(page); +} + +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis-flags SWP_FILE) { + const struct address_space_operations * a_ops = + sis-swap_file-f_mapping-a_ops; + int (*spd)(struct page *) = a_ops-set_page_dirty; +#ifdef CONFIG_BLOCK + if (!spd) + spd = __set_page_dirty_buffers; +#endif + return (*spd)(page); + } + + return __set_page_dirty_nobuffers(page); +} +#endif + int 
swap_readpage(struct file *file, struct page *page) { struct bio *bio; @@ -127,6 +173,18 @@ int swap_readpage(struct file *file, str BUG_ON(!PageLocked(page)); ClearPageUptodate(page); +#ifdef CONFIG_SWAP_FILE + { + struct swap_info_struct *sis = page_swap_info(page); + if (sis-flags SWP_FILE) { + ret = sis-swap_file-f_mapping- + a_ops-readpage(sis-swap_file, page); + if (!ret) + count_vm_event(PSWPIN); + return ret; + } + } +#endif bio = get_swap_bio(GFP_KERNEL, page_private(page), page, end_swap_bio_read); if (bio == NULL) { Index: linux-2.6/mm/swap_state.c === --- linux-2.6.orig/mm/swap_state.c
[PATCH 20/33] netvm: filter emergency skbs.
Toss all emergency packets not for a SOCK_MEMALLOC socket. This ensures our precious memory reserve doesn't get stuck waiting for user-space. The correctness of this approach relies on the fact that networks must be assumed lossy. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h |3 +++ 1 file changed, 3 insertions(+) Index: linux-2.6/include/net/sock.h === --- linux-2.6.orig/include/net/sock.h +++ linux-2.6/include/net/sock.h @@ -930,6 +930,9 @@ static inline int sk_filter(struct sock { int err; struct sk_filter *filter; + + if (skb_emergency(skb) && !sk_has_memalloc(sk)) + return -ENOMEM; err = security_sock_rcv_skb(sk, skb); if (err) -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 01/33] mm: gfp_to_alloc_flags()
Factor out the gfp to alloc_flags mapping so it can be used in other places. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- mm/internal.h | 11 ++ mm/page_alloc.c | 98 2 files changed, 67 insertions(+), 42 deletions(-) Index: linux-2.6/mm/internal.h === --- linux-2.6.orig/mm/internal.h +++ linux-2.6/mm/internal.h @@ -47,4 +47,15 @@ static inline unsigned long page_order(s VM_BUG_ON(!PageBuddy(page)); return page_private(page); } + +#define ALLOC_HARDER 0x01 /* try to alloc harder */ +#define ALLOC_HIGH 0x02 /* __GFP_HIGH set */ +#define ALLOC_WMARK_MIN0x04 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW0x08 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x10 /* use pages_high watermark */ +#define ALLOC_NO_WATERMARKS0x20 /* don't check watermarks at all */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + +int gfp_to_alloc_flags(gfp_t gfp_mask); + #endif Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -1139,14 +1139,6 @@ failed: return NULL; } -#define ALLOC_NO_WATERMARKS0x01 /* don't check watermarks at all */ -#define ALLOC_WMARK_MIN0x02 /* use pages_min watermark */ -#define ALLOC_WMARK_LOW0x04 /* use pages_low watermark */ -#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ - #ifdef CONFIG_FAIL_PAGE_ALLOC static struct fail_page_alloc_attr { @@ -1535,6 +1527,44 @@ static void set_page_owner(struct page * #endif /* CONFIG_PAGE_OWNER */ /* + * get the deepest reaching allocation flags for the given gfp_mask + */ +int gfp_to_alloc_flags(gfp_t gfp_mask) +{ + struct task_struct *p = current; + int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + const gfp_t wait = gfp_mask __GFP_WAIT; + + /* +* The caller may dip into page reserves a bit more if the caller +* cannot run direct reclaim, or if the caller has realtime 
scheduling +* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will +* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). +*/ + if (gfp_mask __GFP_HIGH) + alloc_flags |= ALLOC_HIGH; + + if (!wait) { + alloc_flags |= ALLOC_HARDER; + /* +* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. +* See also cpuset_zone_allowed() comment in kernel/cpuset.c. +*/ + alloc_flags = ~ALLOC_CPUSET; + } else if (unlikely(rt_task(p)) !in_interrupt()) + alloc_flags |= ALLOC_HARDER; + + if (likely(!(gfp_mask __GFP_NOMEMALLOC))) { + if (!in_interrupt() + ((p-flags PF_MEMALLOC) || +unlikely(test_thread_flag(TIF_MEMDIE + alloc_flags |= ALLOC_NO_WATERMARKS; + } + + return alloc_flags; +} + +/* * This is the 'heart' of the zoned buddy allocator. */ struct page * fastcall @@ -1589,48 +1619,28 @@ restart: * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according * to how we want to proceed. -* -* The caller may dip into page reserves a bit more if the caller -* cannot run direct reclaim, or if the caller has realtime scheduling -* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will -* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags = ALLOC_WMARK_MIN; - if ((unlikely(rt_task(p)) !in_interrupt()) || !wait) - alloc_flags |= ALLOC_HARDER; - if (gfp_mask __GFP_HIGH) - alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* -* Go through the zonelist again. Let __GFP_HIGH and allocations -* coming from realtime tasks go deeper into reserves. -* -* This is the last chance, in general, before the goto nopage. -* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. -* See also cpuset_zone_allowed() comment in kernel/cpuset.c. -*/ - page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); + /* This is the last chance, in general, before the goto nopage. 
*/ + page = get_page_from_freelist(gfp_mask, order, zonelist
[PATCH 10/33] mm: __GFP_MEMALLOC
__GFP_MEMALLOC will allow the allocation to disregard the watermarks, much like PF_MEMALLOC. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/gfp.h |3 ++- mm/page_alloc.c |4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) Index: linux-2.6/include/linux/gfp.h === --- linux-2.6.orig/include/linux/gfp.h +++ linux-2.6/include/linux/gfp.h @@ -43,6 +43,7 @@ struct vm_area_struct; #define __GFP_REPEAT ((__force gfp_t)0x400u) /* Retry the allocation. Might fail */ #define __GFP_NOFAIL ((__force gfp_t)0x800u) /* Retry for ever. Cannot fail */ #define __GFP_NORETRY ((__force gfp_t)0x1000u)/* Do not retry. Might fail */ +#define __GFP_MEMALLOC ((__force gfp_t)0x2000u)/* Use emergency reserves */ #define __GFP_COMP ((__force gfp_t)0x4000u)/* Add compound page metadata */ #define __GFP_ZERO ((__force gfp_t)0x8000u)/* Return zeroed page on success */ #define __GFP_NOMEMALLOC ((__force gfp_t)0x1u) /* Don't use emergency reserves */ @@ -88,7 +89,7 @@ struct vm_area_struct; /* Control page allocator reclaim behavior */ #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_NOMEMALLOC) + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) /* Control allocation constraints */ #define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE) Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -1560,7 +1560,9 @@ int gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask __GFP_NOMEMALLOC))) { - if (!in_irq() (p-flags PF_MEMALLOC)) + if (gfp_mask __GFP_MEMALLOC) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (!in_irq() (p-flags PF_MEMALLOC)) alloc_flags |= ALLOC_NO_WATERMARKS; else if (!in_interrupt() unlikely(test_thread_flag(TIF_MEMDIE))) -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 17/33] sysctl: propagate conv errors
Currently conv routines will only generate -EINVAL, allow for other errors to be propagetd. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sysctl.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) Index: linux-2.6/kernel/sysctl.c === --- linux-2.6.orig/kernel/sysctl.c +++ linux-2.6/kernel/sysctl.c @@ -1732,6 +1732,7 @@ static int __do_proc_dointvec(void *tbl_ int *i, vleft, first=1, neg, val; unsigned long lval; size_t left, len; + int ret = 0; char buf[TMPBUFLEN], *p; char __user *s = buffer; @@ -1787,14 +1788,16 @@ static int __do_proc_dointvec(void *tbl_ s += len; left -= len; - if (conv(neg, lval, i, 1, data)) + ret = conv(neg, lval, i, 1, data); + if (ret) break; } else { p = buf; if (!first) *p++ = '\t'; - if (conv(neg, lval, i, 0, data)) + ret = conv(neg, lval, i, 0, data); + if (ret) break; sprintf(p, %s%lu, neg ? - : , lval); @@ -1823,11 +1826,9 @@ static int __do_proc_dointvec(void *tbl_ left--; } } - if (write first) - return -EINVAL; *lenp -= left; *ppos += *lenp; - return 0; + return ret; #undef TMPBUFLEN } -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 14/33] net: packet split receive api
Add some packet-split receive hooks. For one this allows to do NUMA node affine page allocs. Later on these hooks will be extended to do emergency reserve allocations for fragments. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- drivers/net/e1000/e1000_main.c |8 ++-- drivers/net/sky2.c | 16 ++-- include/linux/skbuff.h | 23 +++ net/core/skbuff.c | 20 4 files changed, 51 insertions(+), 16 deletions(-) Index: linux-2.6/drivers/net/e1000/e1000_main.c === --- linux-2.6.orig/drivers/net/e1000/e1000_main.c +++ linux-2.6/drivers/net/e1000/e1000_main.c @@ -4407,12 +4407,8 @@ e1000_clean_rx_irq_ps(struct e1000_adapt pci_unmap_page(pdev, ps_page_dma-ps_page_dma[j], PAGE_SIZE, PCI_DMA_FROMDEVICE); ps_page_dma-ps_page_dma[j] = 0; - skb_fill_page_desc(skb, j, ps_page-ps_page[j], 0, - length); + skb_add_rx_frag(skb, j, ps_page-ps_page[j], 0, length); ps_page-ps_page[j] = NULL; - skb-len += length; - skb-data_len += length; - skb-truesize += length; } /* strip the ethernet crc, problem is we're using pages now so @@ -4618,7 +4614,7 @@ e1000_alloc_rx_buffers_ps(struct e1000_a if (j adapter-rx_ps_pages) { if (likely(!ps_page-ps_page[j])) { ps_page-ps_page[j] = - alloc_page(GFP_ATOMIC); + netdev_alloc_page(netdev); if (unlikely(!ps_page-ps_page[j])) { adapter-alloc_rx_buff_failed++; goto no_buffers; Index: linux-2.6/include/linux/skbuff.h === --- linux-2.6.orig/include/linux/skbuff.h +++ linux-2.6/include/linux/skbuff.h @@ -846,6 +846,9 @@ static inline void skb_fill_page_desc(st skb_shinfo(skb)-nr_frags = i + 1; } +extern void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, + int off, int size); + #define SKB_PAGE_ASSERT(skb) BUG_ON(skb_shinfo(skb)-nr_frags) #define SKB_FRAG_ASSERT(skb) BUG_ON(skb_shinfo(skb)-frag_list) #define SKB_LINEAR_ASSERT(skb) BUG_ON(skb_is_nonlinear(skb)) @@ -1339,6 +1342,26 @@ static inline struct sk_buff *netdev_all return __netdev_alloc_skb(dev, length, GFP_ATOMIC); } +extern struct page *__netdev_alloc_page(struct net_device *dev, 
gfp_t gfp_mask); + +/** + * netdev_alloc_page - allocate a page for ps-rx on a specific device + * @dev: network device to receive on + * + * Allocate a new page node local to the specified device. + * + * %NULL is returned if there is no free memory. + */ +static inline struct page *netdev_alloc_page(struct net_device *dev) +{ + return __netdev_alloc_page(dev, GFP_ATOMIC); +} + +static inline void netdev_free_page(struct net_device *dev, struct page *page) +{ + __free_page(page); +} + /** * skb_clone_writable - is the header of a clone writable * @skb: buffer to check Index: linux-2.6/net/core/skbuff.c === --- linux-2.6.orig/net/core/skbuff.c +++ linux-2.6/net/core/skbuff.c @@ -263,6 +263,24 @@ struct sk_buff *__netdev_alloc_skb(struc return skb; } +struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) +{ + int node = dev-dev.parent ? dev_to_node(dev-dev.parent) : -1; + struct page *page; + + page = alloc_pages_node(node, gfp_mask, 0); + return page; +} + +void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, + int size) +{ + skb_fill_page_desc(skb, i, page, off, size); + skb-len += size; + skb-data_len += size; + skb-truesize += size; +} + static void skb_drop_list(struct sk_buff **listp) { struct sk_buff *list = *listp; @@ -2464,6 +2482,8 @@ EXPORT_SYMBOL(kfree_skb); EXPORT_SYMBOL(__pskb_pull_tail); EXPORT_SYMBOL(__alloc_skb); EXPORT_SYMBOL(__netdev_alloc_skb); +EXPORT_SYMBOL(__netdev_alloc_page); +EXPORT_SYMBOL(skb_add_rx_frag); EXPORT_SYMBOL(pskb_copy); EXPORT_SYMBOL(pskb_expand_head); EXPORT_SYMBOL(skb_checksum); Index: linux-2.6/drivers/net/sky2.c === --- linux-2.6.orig/drivers/net/sky2.c +++ linux-2.6/drivers/net/sky2.c @@ -1173,7 +1173,7 @@ static struct sk_buff *sky2_rx_alloc(str skb_reserve(skb, ALIGN(p, RX_SKB_ALIGN) - p
[PATCH 15/33] net: sk_allocation() - concentrate socket related allocations
Introduce sk_allocation(), this function allows to inject sock specific flags to each sock related allocation. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h|7 ++- net/ipv4/tcp_output.c | 11 ++- net/ipv6/tcp_ipv6.c | 14 +- 3 files changed, 21 insertions(+), 11 deletions(-) Index: linux-2.6/net/ipv4/tcp_output.c === --- linux-2.6.orig/net/ipv4/tcp_output.c +++ linux-2.6/net/ipv4/tcp_output.c @@ -2081,7 +2081,7 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); + skb = alloc_skb_fclone(MAX_TCP_HEADER, sk-sk_allocation); if (skb) break; yield(); @@ -2114,7 +2114,7 @@ void tcp_send_active_reset(struct sock * struct sk_buff *skb; /* NOTE: No TCP options attached and we never retransmit this. */ - skb = alloc_skb(MAX_TCP_HEADER, priority); + skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, priority)); if (!skb) { NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); return; @@ -2187,7 +2187,8 @@ struct sk_buff * tcp_make_synack(struct __u8 *md5_hash_location; #endif - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, + sk_allocation(sk, GFP_ATOMIC)); if (skb == NULL) return NULL; @@ -2446,7 +2447,7 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC)); if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)-icsk_ack.ato = TCP_ATO_MIN; @@ -2488,7 +2489,7 @@ static int tcp_xmit_probe_skb(struct soc struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. 
*/ - skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, sk_allocation(sk, GFP_ATOMIC)); if (skb == NULL) return -1; Index: linux-2.6/include/net/sock.h === --- linux-2.6.orig/include/net/sock.h +++ linux-2.6/include/net/sock.h @@ -419,6 +419,11 @@ static inline int sock_flag(struct sock return test_bit(flag, sk-sk_flags); } +static inline gfp_t sk_allocation(struct sock *sk, gfp_t gfp_mask) +{ + return gfp_mask; +} + static inline void sk_acceptq_removed(struct sock *sk) { sk-sk_ack_backlog--; @@ -1212,7 +1217,7 @@ static inline struct sk_buff *sk_stream_ int hdr_len; hdr_len = SKB_DATA_ALIGN(sk-sk_prot-max_header); - skb = alloc_skb_fclone(size + hdr_len, gfp); + skb = alloc_skb_fclone(size + hdr_len, sk_allocation(sk, gfp)); if (skb) { skb-truesize += mem; if (sk_stream_wmem_schedule(sk, skb-truesize)) { Index: linux-2.6/net/ipv6/tcp_ipv6.c === --- linux-2.6.orig/net/ipv6/tcp_ipv6.c +++ linux-2.6/net/ipv6/tcp_ipv6.c @@ -573,7 +573,8 @@ static int tcp_v6_md5_do_add(struct sock } else { /* reallocate new list if current one is full. */ if (!tp-md5sig_info) { - tp-md5sig_info = kzalloc(sizeof(*tp-md5sig_info), GFP_ATOMIC); + tp-md5sig_info = kzalloc(sizeof(*tp-md5sig_info), + sk_allocation(sk, GFP_ATOMIC)); if (!tp-md5sig_info) { kfree(newkey); return -ENOMEM; @@ -583,7 +584,8 @@ static int tcp_v6_md5_do_add(struct sock tcp_alloc_md5sig_pool(); if (tp-md5sig_info-alloced6 == tp-md5sig_info-entries6) { keys = kmalloc((sizeof (tp-md5sig_info-keys6[0]) * - (tp-md5sig_info-entries6 + 1)), GFP_ATOMIC); + (tp-md5sig_info-entries6 + 1)), + sk_allocation(sk, GFP_ATOMIC)); if (!keys) { tcp_free_md5sig_pool(); @@ -709,7 +711,7 @@ static int tcp_v6_parse_md5_keys (struct struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *p; - p = kzalloc(sizeof(struct tcp_md5sig_info), GFP_KERNEL); + p = kzalloc(sizeof(struct tcp_md5sig_info), sk-sk_allocation
[PATCH 07/33] mm: serialize access to min_free_kbytes
There is a small race between the procfs caller and the memory hotplug caller of setup_per_zone_pages_min(). Not a big deal, but the next patch will add yet another caller. Time to close the gap. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- mm/page_alloc.c | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -116,6 +116,7 @@ static char * const zone_names[MAX_NR_ZO Movable, }; +static DEFINE_SPINLOCK(min_free_lock); int min_free_kbytes = 1024; unsigned long __meminitdata nr_kernel_pages; @@ -4162,12 +4163,12 @@ static void setup_per_zone_lowmem_reserv } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * __setup_per_zone_pages_min - called when min_free_kbytes changes. * * Ensures that the pages_{min,low,high} values for each zone are set correctly * with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +static void __setup_per_zone_pages_min(void) { unsigned long pages_min = min_free_kbytes (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4222,6 +4223,15 @@ void setup_per_zone_pages_min(void) calculate_totalreserve_pages(); } +void setup_per_zone_pages_min(void) +{ + unsigned long flags; + + spin_lock_irqsave(min_free_lock, flags); + __setup_per_zone_pages_min(); + spin_unlock_irqrestore(min_free_lock, flags); +} + /* * Initialise min_free_kbytes. * @@ -4257,7 +4267,7 @@ static int __init init_per_zone_pages_mi min_free_kbytes = 128; if (min_free_kbytes 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + __setup_per_zone_pages_min(); setup_per_zone_lowmem_reserve(); return 0; } -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 11/33] mm: memory reserve management
Generic reserve management code. It provides methods to reserve and charge. Upon this, generic alloc/free style reserve pools could be built, which could fully replace mempool_t functionality. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/reserve.h | 54 + mm/Makefile |2 mm/reserve.c| 436 3 files changed, 491 insertions(+), 1 deletion(-) Index: linux-2.6/include/linux/reserve.h === --- /dev/null +++ linux-2.6/include/linux/reserve.h @@ -0,0 +1,54 @@ +/* + * Memory reserve management. + * + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra [EMAIL PROTECTED] + * + * This file contains the public data structure and API definitions. + */ + +#ifndef _LINUX_RESERVE_H +#define _LINUX_RESERVE_H + +#include linux/list.h +#include linux/spinlock.h + +struct mem_reserve { + struct mem_reserve *parent; + struct list_head children; + struct list_head siblings; + + const char *name; + + long pages; + long limit; + long usage; + spinlock_t lock;/* protects limit and usage */ +}; + +extern struct mem_reserve mem_reserve_root; + +void mem_reserve_init(struct mem_reserve *res, const char *name, + struct mem_reserve *parent); +int mem_reserve_connect(struct mem_reserve *new_child, + struct mem_reserve *node); +int mem_reserve_disconnect(struct mem_reserve *node); + +int mem_reserve_pages_set(struct mem_reserve *res, long pages); +int mem_reserve_pages_add(struct mem_reserve *res, long pages); +int mem_reserve_pages_charge(struct mem_reserve *res, long pages, +int overcommit); + +int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes); +int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes, + int overcommit); + +struct kmem_cache; + +int mem_reserve_kmem_cache_set(struct mem_reserve *res, + struct kmem_cache *s, + int objects); +int mem_reserve_kmem_cache_charge(struct mem_reserve *res, + long objs, + int overcommit); + +#endif /* _LINUX_RESERVE_H */ Index: linux-2.6/mm/Makefile === --- linux-2.6.orig/mm/Makefile +++ linux-2.6/mm/Makefile @@ 
-11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o $(mmu-y) + page_isolation.o reserve.o $(mmu-y) obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o Index: linux-2.6/mm/reserve.c === --- /dev/null +++ linux-2.6/mm/reserve.c @@ -0,0 +1,436 @@ +/* + * Memory reserve management. + * + * Copyright (C) 2007, Red Hat, Inc., Peter Zijlstra [EMAIL PROTECTED] + * + * Description: + * + * Manage a set of memory reserves. + * + * A memory reserve is a reserve for a specified number of object of specified + * size. Since memory is managed in pages, this reserve demand is then + * translated into a page unit. + * + * So each reserve has a specified object limit, an object usage count and a + * number of pages required to back these objects. + * + * Usage is charged against a reserve, if the charge fails, the resource must + * not be allocated/used. + * + * The reserves are managed in a tree, and the resource demands (pages and + * limit) are propagated up the tree. Obviously the object limit will be + * meaningless as soon as the unit starts mixing, but the required page reserve + * (being of one unit) is still valid at the root. + * + * It is the page demand of the root node that is used to set the global + * reserve (adjust_memalloc_reserve() which sets zone-pages_emerg). + * + * As long as a subtree has the same usage unit, an aggregate node can be used + * to charge against, instead of the leaf nodes. However, do be consistent with + * who is charged, resource usage is not propagated up the tree (for + * performance reasons). 
+ */ + +#include linux/reserve.h +#include linux/mutex.h +#include linux/mmzone.h +#include linux/log2.h +#include linux/proc_fs.h +#include linux/seq_file.h +#include linux/module.h +#include linux/slab.h + +static DEFINE_MUTEX(mem_reserve_mutex); + +/** + * @mem_reserve_root - the global reserve root + * + * The global reserve is empty, and has no limit unit, it merely + * acts as an aggregation point for reserves and an interface to + * adjust_memalloc_reserve(). + */ +struct mem_reserve mem_reserve_root
[PATCH 21/33] netvm: prevent a TCP specific deadlock
It could happen that all !SOCK_MEMALLOC sockets have buffered so much data that we're over the global rmem limit. This will prevent SOCK_MEMALLOC buffers from receiving data, which will prevent userspace from running, which is needed to reduce the buffered data. Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h |7 --- net/core/stream.c |5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) Index: linux-2.6/include/net/sock.h === --- linux-2.6.orig/include/net/sock.h +++ linux-2.6/include/net/sock.h @@ -743,7 +743,8 @@ static inline struct inode *SOCK_INODE(s } extern void __sk_stream_mem_reclaim(struct sock *sk); -extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind); +extern int sk_stream_mem_schedule(struct sock *sk, struct sk_buff *skb, + int size, int kind); #define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE) @@ -761,13 +762,13 @@ static inline void sk_stream_mem_reclaim static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) { return (int)skb-truesize = sk-sk_forward_alloc || - sk_stream_mem_schedule(sk, skb-truesize, 1); + sk_stream_mem_schedule(sk, skb, skb-truesize, 1); } static inline int sk_stream_wmem_schedule(struct sock *sk, int size) { return size = sk-sk_forward_alloc || - sk_stream_mem_schedule(sk, size, 0); + sk_stream_mem_schedule(sk, NULL, size, 0); } /* Used by processes to lock a socket state, so that Index: linux-2.6/net/core/stream.c === --- linux-2.6.orig/net/core/stream.c +++ linux-2.6/net/core/stream.c @@ -207,7 +207,7 @@ void __sk_stream_mem_reclaim(struct sock EXPORT_SYMBOL(__sk_stream_mem_reclaim); -int sk_stream_mem_schedule(struct sock *sk, int size, int kind) +int sk_stream_mem_schedule(struct sock *sk, struct sk_buff *skb, int size, int kind) { int amt = sk_stream_pages(size); @@ -224,7 +224,8 @@ int sk_stream_mem_schedule(struct sock * /* Over hard limit. 
*/ if (atomic_read(sk-sk_prot-memory_allocated) sk-sk_prot-sysctl_mem[2]) { sk-sk_prot-enter_memory_pressure(); - goto suppress_allocation; + if (!skb || (skb !skb_emergency(skb))) + goto suppress_allocation; } /* Under pressure. */ -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 29/33] nfs: disable data cache revalidation for swapfiles
Do as Trond suggested: http://lkml.org/lkml/2006/8/25/348 Disable NFS data cache revalidation on swap files since it doesn't really make sense to have other clients change the file while you are using it. Thereby we can stop setting PG_private on swap pages, since there ought to be no further races with invalidate_inode_pages2() to deal with. And since we cannot set PG_private we cannot use page-private (which is already used by PG_swapcache pages anyway) to store the nfs_page. Thus augment the new nfs_page_find_request logic. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/nfs/inode.c |6 fs/nfs/write.c | 73 ++--- 2 files changed, 65 insertions(+), 14 deletions(-) Index: linux-2.6/fs/nfs/inode.c === --- linux-2.6.orig/fs/nfs/inode.c +++ linux-2.6/fs/nfs/inode.c @@ -744,6 +744,12 @@ int nfs_revalidate_mapping_nolock(struct struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; + /* +* swapfiles are not supposed to be shared. +*/ + if (IS_SWAPFILE(inode)) + goto out; + if ((nfsi-cache_validity NFS_INO_REVAL_PAGECACHE) || nfs_attribute_timeout(inode) || NFS_STALE(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); Index: linux-2.6/fs/nfs/write.c === --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -112,25 +112,62 @@ static void nfs_context_set_write_error( set_bit(NFS_CONTEXT_ERROR_WRITE, ctx-flags); } -static struct nfs_page *nfs_page_find_request_locked(struct page *page) +static struct nfs_page * +__nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page, int get) { struct nfs_page *req = NULL; - if (PagePrivate(page)) { + if (PagePrivate(page)) req = (struct nfs_page *)page_private(page); - if (req != NULL) - kref_get(req-wb_kref); - } + else if (unlikely(PageSwapCache(page))) + req = radix_tree_lookup(nfsi-nfs_page_tree, page_file_index(page)); + + if (get req) + kref_get(req-wb_kref); + return req; } +static inline struct nfs_page * +nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) +{ + 
return __nfs_page_find_request_locked(nfsi, page, 1); +} + +static int __nfs_page_has_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)-host; + struct nfs_page *req = NULL; + + spin_lock(inode-i_lock); + req = __nfs_page_find_request_locked(NFS_I(inode), page, 0); + spin_unlock(inode-i_lock); + + /* +* hole here plugged by the caller holding onto PG_locked +*/ + + return req != NULL; +} + +static inline int nfs_page_has_request(struct page *page) +{ + if (PagePrivate(page)) + return 1; + + if (unlikely(PageSwapCache(page))) + return __nfs_page_has_request(page); + + return 0; +} + static struct nfs_page *nfs_page_find_request(struct page *page) { struct inode *inode = page_file_mapping(page)-host; struct nfs_page *req = NULL; spin_lock(inode-i_lock); - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); spin_unlock(inode-i_lock); return req; } @@ -255,7 +292,7 @@ static int nfs_page_async_flush(struct n spin_lock(inode-i_lock); for(;;) { - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(nfsi, page); if (req == NULL) { spin_unlock(inode-i_lock); return 0; @@ -374,8 +411,14 @@ static int nfs_inode_add_request(struct if (nfs_have_delegation(inode, FMODE_WRITE)) nfsi-change_attr++; } - SetPagePrivate(req-wb_page); - set_page_private(req-wb_page, (unsigned long)req); + /* +* Swap-space should not get truncated. Hence no need to plug the race +* with invalidate/truncate. 
+*/ + if (likely(!PageSwapCache(req-wb_page))) { + SetPagePrivate(req-wb_page); + set_page_private(req-wb_page, (unsigned long)req); + } nfsi-npages++; kref_get(req-wb_kref); return 0; @@ -392,8 +435,10 @@ static void nfs_inode_remove_request(str BUG_ON (!NFS_WBACK_BUSY(req)); spin_lock(inode-i_lock); - set_page_private(req-wb_page, 0); - ClearPagePrivate(req-wb_page); + if (likely(!PageSwapCache(req-wb_page))) { + set_page_private(req-wb_page, 0); + ClearPagePrivate(req-wb_page); + } radix_tree_delete(nfsi-nfs_page_tree, req-wb_index
[PATCH 00/33] Swap over NFS -v14
Hi, Another posting of the full swap over NFS series. [ I tried just posting the first part last time around, but that just gets more confusion by lack of a general picture ] [ patches against 2.6.23-mm1, also to be found online at: http://programming.kicks-ass.net/kernel-patches/vm_deadlock/v2.6.23-mm1/ ] The patch-set can be split in roughly 5 parts, for each of which I shall give a description. Part 1, patches 1-12 The problem with swap over network is the generic swap problem: needing memory to free memory. Normally this is solved using mempools, as can be seen in the BIO layer. Swap over network has the problem that the network subsystem does not use fixed sized allocations, but heavily relies on kmalloc(). This makes mempools unusable. This first part provides a generic reserve framework. Care is taken to only affect the slow paths - when we're low on memory. Caveats: it is currently SLUB only. 1 - mm: gfp_to_alloc_flags() 2 - mm: tag reserve pages 3 - mm: slub: add knowledge of reserve pages 4 - mm: allow mempool to fall back to memalloc reserves 5 - mm: kmem_estimate_pages() 6 - mm: allow PF_MEMALLOC from softirq context 7 - mm: serialize access to min_free_kbytes 8 - mm: emergency pool 9 - mm: system wide ALLOC_NO_WATERMARK 10 - mm: __GFP_MEMALLOC 11 - mm: memory reserve management 12 - selinux: tag avc cache alloc as non-critical Part 2, patches 13-15 Provide some generic network infrastructure needed later on. 13 - net: wrap sk-sk_backlog_rcv() 14 - net: packet split receive api 15 - net: sk_allocation() - concentrate socket related allocations Part 3, patches 16-23 Now that we have a generic memory reserve system, use it on the network stack. The thing that makes this interesting is that, contrary to BIO, both the transmit and receive path require memory allocations. That is, in the BIO layer write back completion is usually just an ISR flipping a bit and waking stuff up. 
A network write back completion involves receiving packets, which when there is no memory, is rather hard. And even when there is memory there is no guarantee that the required packet comes in in the window that that memory buys us. The solution to this problem is found in the fact that network is to be assumed lossy. Even now, when there is no memory to receive packets the network card will have to discard packets. What we do is move this into the network stack. So we reserve a little pool to act as a receive buffer; this allows us to inspect packets before tossing them. This way, we can filter out those packets that ensure progress (writeback completion) and disregard the others (as would have happened anyway). [ NOTE: this is a stable mode of operation with limited memory usage, exactly the kind of thing we need ] Again, care is taken to keep much of the overhead of this to only affect the slow path. Only packets allocated from the reserves will suffer the extra atomic overhead needed for accounting. 16 - netvm: network reserve infrastructure 17 - sysctl: propagate conv errors 18 - netvm: INET reserves. 19 - netvm: hook skb allocation to reserves 20 - netvm: filter emergency skbs. 21 - netvm: prevent a TCP specific deadlock 22 - netfilter: NF_QUEUE vs emergency skbs 23 - netvm: skb processing Part 4, patches 24-26 Generic vm infrastructure to handle swapping to a filesystem instead of a block device. The approach here has been questioned, people would like to see a less invasive approach. One suggestion is to create and use a_ops-swap_{in,out}(). 24 - mm: prepare swap entry methods for use in page methods 25 - mm: add support for non block device backed swap files 26 - mm: methods for teaching filesystems about PG_swapcache pages Part 5, patches 27-33 Finally, convert NFS to make use of the new network and vm infrastructure to provide swap over NFS. 
27 - nfs: remove mempools 28 - nfs: teach the NFS client how to treat PG_swapcache pages 29 - nfs: disable data cache revalidation for swapfiles 30 - nfs: swap vs nfs_writepage 31 - nfs: enable swap on NFS 32 - nfs: fix various memory recursions possible with swap over NFS. 33 - nfs: do not warn on radix tree node allocation failures - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 27/33] nfs: remove mempools
With the introduction of the shared dirty page accounting in .19, NFS should not be able to surprise the VM with all dirty pages. Thus it should always be able to free some memory. Hence no more need for mempools. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/nfs/read.c | 15 +++ fs/nfs/write.c | 27 +-- 2 files changed, 8 insertions(+), 34 deletions(-) Index: linux-2.6/fs/nfs/read.c === --- linux-2.6.orig/fs/nfs/read.c +++ linux-2.6/fs/nfs/read.c @@ -33,13 +33,10 @@ static const struct rpc_call_ops nfs_rea static const struct rpc_call_ops nfs_read_full_ops; static struct kmem_cache *nfs_rdata_cachep; -static mempool_t *nfs_rdata_mempool; - -#define MIN_POOL_READ (32) struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) { - struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, GFP_NOFS); + struct nfs_read_data *p = kmem_cache_alloc(nfs_rdata_cachep, GFP_NOFS); if (p) { memset(p, 0, sizeof(*p)); @@ -50,7 +47,7 @@ struct nfs_read_data *nfs_readdata_alloc else { p-pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); if (!p-pagevec) { - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); p = NULL; } } @@ -63,7 +60,7 @@ static void nfs_readdata_rcu_free(struct struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu); if (p (p-pagevec != p-page_array[0])) kfree(p-pagevec); - mempool_free(p, nfs_rdata_mempool); + kmem_cache_free(nfs_rdata_cachep, p); } static void nfs_readdata_free(struct nfs_read_data *rdata) @@ -597,16 +594,10 @@ int __init nfs_init_readpagecache(void) if (nfs_rdata_cachep == NULL) return -ENOMEM; - nfs_rdata_mempool = mempool_create_slab_pool(MIN_POOL_READ, -nfs_rdata_cachep); - if (nfs_rdata_mempool == NULL) - return -ENOMEM; - return 0; } void nfs_destroy_readpagecache(void) { - mempool_destroy(nfs_rdata_mempool); kmem_cache_destroy(nfs_rdata_cachep); } Index: linux-2.6/fs/nfs/write.c === --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -28,9 +28,6 @@ 
#define NFSDBG_FACILITYNFSDBG_PAGECACHE -#define MIN_POOL_WRITE (32) -#define MIN_POOL_COMMIT(4) - /* * Local function declarations */ @@ -44,12 +41,10 @@ static const struct rpc_call_ops nfs_wri static const struct rpc_call_ops nfs_commit_ops; static struct kmem_cache *nfs_wdata_cachep; -static mempool_t *nfs_wdata_mempool; -static mempool_t *nfs_commit_mempool; struct nfs_write_data *nfs_commit_alloc(void) { - struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS); if (p) { memset(p, 0, sizeof(*p)); @@ -63,7 +58,7 @@ static void nfs_commit_rcu_free(struct r struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p (p-pagevec != p-page_array[0])) kfree(p-pagevec); - mempool_free(p, nfs_commit_mempool); + kmem_cache_free(nfs_wdata_cachep, p); } void nfs_commit_free(struct nfs_write_data *wdata) @@ -73,7 +68,7 @@ void nfs_commit_free(struct nfs_write_da struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { - struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + struct nfs_write_data *p = kmem_cache_alloc(nfs_wdata_cachep, GFP_NOFS); if (p) { memset(p, 0, sizeof(*p)); @@ -84,7 +79,7 @@ struct nfs_write_data *nfs_writedata_all else { p-pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_NOFS); if (!p-pagevec) { - mempool_free(p, nfs_wdata_mempool); + kmem_cache_free(nfs_wdata_cachep, p); p = NULL; } } @@ -97,7 +92,7 @@ static void nfs_writedata_rcu_free(struc struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p (p-pagevec != p-page_array[0])) kfree(p-pagevec); - mempool_free(p, nfs_wdata_mempool); + kmem_cache_free(nfs_wdata_cachep, p); } static void nfs_writedata_free(struct nfs_write_data *wdata) @@ -1474,16 +1469,6 @@ int __init nfs_init_writepagecache(void) if (nfs_wdata_cachep == NULL) return -ENOMEM
[PATCH 31/33] nfs: enable swap on NFS
Provide an a_ops-swapfile() implementation for NFS. This will set the NFS socket to SOCK_MEMALLOC and run socket reconnect under PF_MEMALLOC as well as reset SOCK_MEMALLOC before engaging the protocol -connect() method. PF_MEMALLOC should allow the allocation of struct socket and related objects and the early (re)setting of SOCK_MEMALLOC should allow us to receive the packets required for the TCP connection buildup. (swapping continues over a server reset during heavy network traffic) Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/Kconfig | 18 fs/nfs/file.c | 10 ++ include/linux/sunrpc/xprt.h |5 ++- net/sunrpc/sched.c |9 -- net/sunrpc/xprtsock.c | 63 5 files changed, 102 insertions(+), 3 deletions(-) Index: linux-2.6/fs/nfs/file.c === --- linux-2.6.orig/fs/nfs/file.c +++ linux-2.6/fs/nfs/file.c @@ -371,6 +371,13 @@ static int nfs_launder_page(struct page return nfs_wb_page(page_file_mapping(page)-host, page); } +#ifdef CONFIG_NFS_SWAP +static int nfs_swapfile(struct address_space *mapping, int enable) +{ + return xs_swapper(NFS_CLIENT(mapping-host)-cl_xprt, enable); +} +#endif + const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, @@ -385,6 +392,9 @@ const struct address_space_operations nf .direct_IO = nfs_direct_IO, #endif .launder_page = nfs_launder_page, +#ifdef CONFIG_NFS_SWAP + .swapfile = nfs_swapfile, +#endif }; static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) Index: linux-2.6/include/linux/sunrpc/xprt.h === --- linux-2.6.orig/include/linux/sunrpc/xprt.h +++ linux-2.6/include/linux/sunrpc/xprt.h @@ -143,7 +143,9 @@ struct rpc_xprt { unsigned intmax_reqs; /* total slots */ unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ - resvport : 1; /* use a reserved port */ + resvport : 1, /* use a reserved port */ + swapper: 1; /* we're swapping over this + transport */ unsigned intbind_index; /* bind function index */ /* @@ -246,6 
+248,7 @@ struct rpc_rqst * xprt_lookup_rqst(struc void xprt_complete_rqst(struct rpc_task *task, int copied); void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect(struct rpc_xprt *xprt); +intxs_swapper(struct rpc_xprt *xprt, int enable); /* * Reserved bit positions in xprt-state Index: linux-2.6/net/sunrpc/sched.c === --- linux-2.6.orig/net/sunrpc/sched.c +++ linux-2.6/net/sunrpc/sched.c @@ -761,7 +761,10 @@ struct rpc_buffer { void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; + gfp_t gfp = GFP_NOWAIT; + + if (RPC_IS_SWAPPER(task)) + gfp |= __GFP_MEMALLOC; size += sizeof(struct rpc_buffer); if (size = RPC_BUFFER_MAXSIZE) @@ -817,6 +820,8 @@ void rpc_init_task(struct rpc_task *task atomic_set(task-tk_count, 1); task-tk_client = clnt; task-tk_flags = flags; + if (clnt-cl_xprt-swapper) + task-tk_flags |= RPC_TASK_SWAPPER; task-tk_ops = tk_ops; if (tk_ops-rpc_call_prepare != NULL) task-tk_action = rpc_prepare_task; @@ -853,7 +858,7 @@ void rpc_init_task(struct rpc_task *task static struct rpc_task * rpc_alloc_task(void) { - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); } static void rpc_free_task(struct rcu_head *rcu) Index: linux-2.6/net/sunrpc/xprtsock.c === --- linux-2.6.orig/net/sunrpc/xprtsock.c +++ linux-2.6/net/sunrpc/xprtsock.c @@ -1397,6 +1397,9 @@ static void xs_udp_finish_connecting(str transport-sock = sock; transport-inet = sk; + if (xprt-swapper) + sk_set_memalloc(sk); + write_unlock_bh(sk-sk_callback_lock); } xs_udp_do_set_buffer_size(xprt); @@ -1414,11 +1417,15 @@ static void xs_udp_connect_worker4(struc container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = transport-xprt; struct socket *sock = transport-sock; + unsigned long pflags
[PATCH 04/33] mm: allow mempool to fall back to memalloc reserves
Allow the mempool to use the memalloc reserves when all else fails and the allocation context would otherwise allow it. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- mm/mempool.c | 12 +++- 1 file changed, 11 insertions(+), 1 deletion(-) Index: linux-2.6/mm/mempool.c === --- linux-2.6.orig/mm/mempool.c +++ linux-2.6/mm/mempool.c @@ -14,6 +14,7 @@ #include linux/mempool.h #include linux/blkdev.h #include linux/writeback.h +#include internal.h static void add_element(mempool_t *pool, void *element) { @@ -204,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, gf void *element; unsigned long flags; wait_queue_t wait; - gfp_t gfp_temp; + gfp_t gfp_temp, gfp_orig = gfp_mask; might_sleep_if(gfp_mask __GFP_WAIT); @@ -228,6 +229,15 @@ repeat_alloc: } spin_unlock_irqrestore(pool-lock, flags); + /* if we really had right to the emergency reserves try those */ + if (gfp_to_alloc_flags(gfp_orig) ALLOC_NO_WATERMARKS) { + if (gfp_temp __GFP_NOMEMALLOC) { + gfp_temp = ~(__GFP_NOMEMALLOC|__GFP_NOWARN); + goto repeat_alloc; + } else + gfp_temp |= __GFP_NOMEMALLOC|__GFP_NOWARN; + } + /* We must not sleep in the GFP_ATOMIC case */ if (!(gfp_mask __GFP_WAIT)) return NULL; -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 06/33] mm: allow PF_MEMALLOC from softirq context
Allow PF_MEMALLOC to be set in softirq context. When running softirqs from a borrowed context save current-flags, ksoftirqd will have its own task_struct. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sched.h |4 kernel/softirq.c |3 +++ mm/page_alloc.c |7 --- 3 files changed, 11 insertions(+), 3 deletions(-) Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -1557,9 +1557,10 @@ int gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask __GFP_NOMEMALLOC))) { - if (!in_interrupt() - ((p-flags PF_MEMALLOC) || -unlikely(test_thread_flag(TIF_MEMDIE + if (!in_irq() (p-flags PF_MEMALLOC)) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (!in_interrupt() + unlikely(test_thread_flag(TIF_MEMDIE))) alloc_flags |= ALLOC_NO_WATERMARKS; } Index: linux-2.6/kernel/softirq.c === --- linux-2.6.orig/kernel/softirq.c +++ linux-2.6/kernel/softirq.c @@ -211,6 +211,8 @@ asmlinkage void __do_softirq(void) __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + unsigned long pflags = current-flags; + current-flags = ~PF_MEMALLOC; pending = local_softirq_pending(); account_system_vtime(current); @@ -249,6 +251,7 @@ restart: account_system_vtime(current); _local_bh_enable(); + tsk_restore_flags(current, pflags, PF_MEMALLOC); } #ifndef __ARCH_HAS_DO_SOFTIRQ Index: linux-2.6/include/linux/sched.h === --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -1389,6 +1389,10 @@ static inline void put_task_struct(struc #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) +#define tsk_restore_flags(p, pflags, mask) \ + do {(p)-flags = ~(mask); \ + (p)-flags |= ((pflags) (mask)); } while (0) + #ifdef CONFIG_SMP extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask); #else -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info 
at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 12/33] selinux: tag avc cache alloc as non-critical
Failing to allocate a cache entry will only harm performance not correctness. Do not consume valuable reserve pages for something like that. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] Acked-by: James Morris [EMAIL PROTECTED] --- security/selinux/avc.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) Index: linux-2.6-2/security/selinux/avc.c === --- linux-2.6-2.orig/security/selinux/avc.c +++ linux-2.6-2/security/selinux/avc.c @@ -334,7 +334,7 @@ static struct avc_node *avc_alloc_node(v { struct avc_node *node; - node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC); + node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC); if (!node) goto out; -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 18/33] netvm: INET reserves.
Add reserves for INET. The two big users seem to be the route cache and ip-fragment cache. Reserve the route cache under generic RX reserve, its usage is bounded by the high reclaim watermark, and thus does not need further accounting. Reserve the ip-fragment caches under SKB data reserve, these add to the SKB RX limit. By ensuring we can at least receive as much data as fits in the reassembly line we avoid fragment attack deadlocks. Use proc conv() routines to update these limits and return -ENOMEM to user space. Adds to the reserve tree: total network reserve network TX reserve protocol TX pages network RX reserve + IPv6 route cache + IPv4 route cache SKB data reserve + IPv6 fragment cache + IPv4 fragment cache Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/linux/sysctl.h | 11 +++ kernel/sysctl.c|8 ++-- net/ipv4/ip_fragment.c |7 +++ net/ipv4/route.c | 30 +- net/ipv4/sysctl_net_ipv4.c | 24 +++- net/ipv6/reassembly.c |7 +++ net/ipv6/route.c | 31 ++- net/ipv6/sysctl_net_ipv6.c | 24 +++- 8 files changed, 136 insertions(+), 6 deletions(-) Index: linux-2.6/net/ipv4/sysctl_net_ipv4.c === --- linux-2.6.orig/net/ipv4/sysctl_net_ipv4.c +++ linux-2.6/net/ipv4/sysctl_net_ipv4.c @@ -18,6 +18,7 @@ #include net/route.h #include net/tcp.h #include net/cipso_ipv4.h +#include linux/reserve.h /* From af_inet.c */ extern int sysctl_ip_nonlocal_bind; @@ -186,6 +187,27 @@ static int strategy_allowed_congestion_c } +extern struct mem_reserve ipv4_frag_reserve; + +static int do_proc_dointvec_fragment_conv(int *negp, unsigned long *lvalp, +int *valp, int write, void *data) + { + if (write) { + long value = *negp ? 
-*lvalp : *lvalp; + int err = mem_reserve_kmalloc_set(ipv4_frag_reserve, value); + if (err) + return err; + } + return do_proc_dointvec_conv(negp, lvalp, valp, write, data); +} + +static int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp, +void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + do_proc_dointvec_fragment_conv, NULL); +} + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -291,7 +313,7 @@ ctl_table ipv4_table[] = { .data = sysctl_ipfrag_high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_fragment }, { .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, Index: linux-2.6/net/ipv6/sysctl_net_ipv6.c === --- linux-2.6.orig/net/ipv6/sysctl_net_ipv6.c +++ linux-2.6/net/ipv6/sysctl_net_ipv6.c @@ -12,9 +12,31 @@ #include net/ndisc.h #include net/ipv6.h #include net/addrconf.h +#include linux/reserve.h #ifdef CONFIG_SYSCTL +extern struct mem_reserve ipv6_frag_reserve; + +static int do_proc_dointvec_fragment_conv(int *negp, unsigned long *lvalp, +int *valp, int write, void *data) +{ + if (write) { + long value = *negp ? 
-*lvalp : *lvalp; + int err = mem_reserve_kmalloc_set(ipv6_frag_reserve, value); + if (err) + return err; + } + return do_proc_dointvec_conv(negp, lvalp, valp, write, data); +} + +static int proc_dointvec_fragment(ctl_table *table, int write, struct file *filp, +void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + do_proc_dointvec_fragment_conv, NULL); +} + static ctl_table ipv6_table[] = { { .ctl_name = NET_IPV6_ROUTE, @@ -44,7 +66,7 @@ static ctl_table ipv6_table[] = { .data = sysctl_ip6frag_high_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_fragment }, { .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH, Index: linux-2.6/net/ipv4/ip_fragment.c === --- linux-2.6.orig/net/ipv4/ip_fragment.c +++ linux-2.6/net/ipv4/ip_fragment.c @@ -43,6 +43,7 @@ #include linux/udp.h #include linux/inet.h #include linux/netfilter_ipv4.h +#include linux/reserve.h
[PATCH 33/33] nfs: do not warn on radix tree node allocation failures
GFP_ATOMIC failures are rather common, so do not warn about them. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- fs/nfs/inode.c |2 +- fs/nfs/write.c | 10 ++ 2 files changed, 11 insertions(+), 1 deletion(-) Index: linux-2.6/fs/nfs/inode.c === --- linux-2.6.orig/fs/nfs/inode.c +++ linux-2.6/fs/nfs/inode.c @@ -1172,7 +1172,7 @@ static void init_once(struct kmem_cache INIT_LIST_HEAD(nfsi-open_files); INIT_LIST_HEAD(nfsi-access_cache_entry_lru); INIT_LIST_HEAD(nfsi-access_cache_inode_lru); - INIT_RADIX_TREE(nfsi-nfs_page_tree, GFP_ATOMIC); + INIT_RADIX_TREE(nfsi-nfs_page_tree, GFP_ATOMIC|__GFP_NOWARN); nfsi-ncommit = 0; nfsi-npages = 0; nfs4_init_once(nfsi); Index: linux-2.6/fs/nfs/write.c === --- linux-2.6.orig/fs/nfs/write.c +++ linux-2.6/fs/nfs/write.c @@ -652,6 +652,7 @@ static struct nfs_page * nfs_update_requ struct inode *inode = mapping-host; struct nfs_page *req, *new = NULL; pgoff_t rqend, end; + int error; end = offset + bytes; @@ -659,6 +660,10 @@ static struct nfs_page * nfs_update_requ /* Loop over all inode entries and see if we find * A request for the page we wish to update */ + error = radix_tree_preload(GFP_NOIO); + if (error) + return ERR_PTR(error); + spin_lock(inode-i_lock); req = nfs_page_find_request_locked(NFS_I(inode), page); if (req) { @@ -666,6 +671,7 @@ static struct nfs_page * nfs_update_requ int error; spin_unlock(inode-i_lock); + radix_tree_preload_end(); error = nfs_wait_on_request(req); nfs_release_request(req); if (error 0) { @@ -676,6 +682,7 @@ static struct nfs_page * nfs_update_requ continue; } spin_unlock(inode-i_lock); + radix_tree_preload_end(); if (new) nfs_release_request(new); break; @@ -687,13 +694,16 @@ static struct nfs_page * nfs_update_requ error = nfs_inode_add_request(inode, new); if (error) { spin_unlock(inode-i_lock); + radix_tree_preload_end(); nfs_unlock_request(new); return ERR_PTR(error); } spin_unlock(inode-i_lock); + radix_tree_preload_end(); return new; } spin_unlock(inode-i_lock); + radix_tree_preload_end(); 
new = nfs_create_request(ctx, inode, page, offset, bytes); if (IS_ERR(new)) -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 09/33] mm: system wide ALLOC_NO_WATERMARK
Change ALLOC_NO_WATERMARK page allocation such that the reserves are system wide - which they are per setup_per_zone_pages_min(), when we scrape the barrel, do it properly. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- mm/page_alloc.c |6 ++ 1 file changed, 6 insertions(+) Index: linux-2.6/mm/page_alloc.c === --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -1638,6 +1638,12 @@ restart: rebalance: if (alloc_flags ALLOC_NO_WATERMARKS) { nofail_alloc: + /* +* break out of mempolicy boundaries +*/ + zonelist = NODE_DATA(numa_node_id())-node_zonelists + + gfp_zone(gfp_mask); + /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, zonelist, ALLOC_NO_WATERMARKS); -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] Create kinst/ or ki/ directory ?
On Tue, 2007-10-30 at 13:24 -0400, Mathieu Desnoyers wrote: * Jeff Garzik ([EMAIL PROTECTED]) wrote: ... Pick a shorter word like probes or profile or what... or better yet... just leave most things in their current directories. ... How about something along the kinst or ki lines ? (for kernel instrumentation) I think I'm with jgarzik on this, lets not do this until its clear where the generalized instrumentation goes to. That is, i386/x86_64 - x86 was part of a full integration plan, one that was immediately followed up by a series of integration patches. With this, I see no such plan. Please draft this generic instrumentation you talk about, if after that we all like it, we can go moving files together with the immediate purpose of integrating them. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 23/33] netvm: skb processing
On Tue, 2007-10-30 at 14:26 -0700, Stephen Hemminger wrote: On Tue, 30 Oct 2007 17:04:24 +0100 Peter Zijlstra [EMAIL PROTECTED] wrote: In order to make sure emergency packets receive all memory needed to proceed ensure processing of emergency SKBs happens under PF_MEMALLOC. Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing. Skip taps, since those are user-space again. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- include/net/sock.h |5 + net/core/dev.c | 44 ++-- net/core/sock.c| 18 ++ 3 files changed, 61 insertions(+), 6 deletions(-) Index: linux-2.6/net/core/dev.c === --- linux-2.6.orig/net/core/dev.c +++ linux-2.6/net/core/dev.c @@ -1976,10 +1976,23 @@ int netif_receive_skb(struct sk_buff *sk struct net_device *orig_dev; int ret = NET_RX_DROP; __be16 type; + unsigned long pflags = current-flags; + + /* Emergency skb are special, they should +* - be delivered to SOCK_MEMALLOC sockets only +* - stay away from userspace +* - have bounded memory usage +* +* Use PF_MEMALLOC as a poor mans memory pool - the grouping kind. +* This saves us from propagating the allocation context down to all +* allocation sites. +*/ + if (skb_emergency(skb)) + current-flags |= PF_MEMALLOC; /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) - return NET_RX_DROP; + goto out; Why the change? doesn't gcc optimize the common exit case anyway? It needs to unset PF_MEMALLOC at the exit. @@ -2029,19 +2046,31 @@ int netif_receive_skb(struct sk_buff *sk if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) { kfree_skb(skb); - goto out; + goto unlock; } skb-tc_verd = 0; ncls: #endif + if (skb_emergency(skb)) + switch(skb-protocol) { + case __constant_htons(ETH_P_ARP): + case __constant_htons(ETH_P_IP): + case __constant_htons(ETH_P_IPV6): + case __constant_htons(ETH_P_8021Q): + break; Indentation is wrong, and hard coding protocol values as spcial case seems bad here. What about vlan's, etc? 
The other protocols needs analysis on what memory allocations occur during packet processing, if anything is done that is not yet accounted for (skb, route cache) then that needs to be added to a reserve, if there are any paths that could touch user-space, those need to be handled. I've started looking at a few others, but its hard and difficult work if one is not familiar with the protocols. @@ -2063,8 +2093,10 @@ ncls: ret = NET_RX_DROP; } -out: +unlock: rcu_read_unlock(); +out: + tsk_restore_flags(current, pflags, PF_MEMALLOC); return ret; } Its that tsk_restore_flags() there what requires the s/return/goto/ stuff you noted earlier. I am still not convinced that this solves the problem well enough to be useful. Can you really survive a heavy memory overcommit? On a machine with mem=128M, I've ran 4 processes of 64M, 2 file backed with the files on NFS, 2 anonymous. The processes just cycle through the memory using writes. This is a 100% overcommit. During these tests I've ran various network loads. I've shut down the NFS server, waited for say 15 minutes, and restarted the NFS server, and the machine came back up and continued. In other words, can you prove that the added complexity causes the system to survive a real test where otherwise it would not? I've put some statistics in the skb reserve allocations, those are most definately used. I'm quite certain the machine would lock up solid without it. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 00/33] Swap over NFS -v14
On Tue, 2007-10-30 at 21:37 -0700, David Miller wrote: From: Nick Piggin [EMAIL PROTECTED] Date: Wed, 31 Oct 2007 14:26:32 +1100 Is it really worth all the added complexity of making swap over NFS files work, given that you could use a network block device instead? Don't be misled. Swapping over NFS is just a scarecrow for the seemingly real impetus behind these changes which is network storage stuff like iSCSI. Not quite, yes, iSCSI is also on the 'want' list of quite a few people, but swap over NFS on its own is also a feature of great demand. signature.asc Description: This is a digitally signed message part
Re: aim7 -30% regression in 2.6.24-rc1
On Wed, 2007-10-31 at 17:57 +0800, Zhang, Yanmin wrote: On Tue, 2007-10-30 at 16:36 +0800, Zhang, Yanmin wrote: On Tue, 2007-10-30 at 08:26 +0100, Ingo Molnar wrote: * Zhang, Yanmin [EMAIL PROTECTED] wrote: sub-bisecting captured patch 38ad464d410dadceda1563f36bdb0be7fe4c8938(sched: uniform tunings) caused 20% regression of aim7. The last 10% should be also related to sched parameters, such like sysctl_sched_min_granularity. ah, interesting. Since you have CONFIG_SCHED_DEBUG enabled, could you please try to figure out what the best value for /proc/sys/kernel_sched_latency, /proc/sys/kernel_sched_nr_latency and /proc/sys/kernel_sched_min_granularity is? there's a tuning constraint for kernel_sched_nr_latency: - kernel_sched_nr_latency should always be set to kernel_sched_latency/kernel_sched_min_granularity. (it's not a free tunable) i suspect a good approach would be to double the value of kernel_sched_latency and kernel_sched_nr_latency in each tuning iteration, while keeping kernel_sched_min_granularity unchanged. That will excercise the tuning values of the 2.6.23 kernel as well. I followed your idea to test 2.6.24-rc1. The improvement is slow. When sched_nr_latency=2560 and sched_latency_ns=64000, the performance is still about 15% less than 2.6.23. I got the aim7 30% regression on my new upgraded stoakley machine. I found this mahcine is slower than the old one. Maybe BIOS has issues, or memeory(Might not be dual-channel?) is slow. So I retested it on the old machine and found on the old stoakley machine, the regression is about 6%, quite similiar to the regression on tigerton machine. By sched_nr_latency=640 and sched_latency_ns=64000 on the old stoakley machine, the regression becomes about 2%. Other latency has more regression. On my tulsa machine, by sched_nr_latency=640 and sched_latency_ns=64000, the regression becomes less than 1% (The original regression is about 20%). 
When I ran a bad script to change the values of sched_nr_latency and sched_latency_ns, I hit OOPS on my tulsa machine. Below is the log. It looks like sched_nr_latency becomes 0. Oops, yeah I think I overlooked that case :-/ I think limiting the sysctl parameters make most sense, as a 0 value really doesn't. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b4efbe..0f34c91 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -94,6 +94,7 @@ static int two = 2; static int zero; static int one_hundred = 100; +static int int_max = INT_MAX; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -239,7 +240,10 @@ static struct ctl_table kern_table[] = { .data = sysctl_sched_nr_latency, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = one, + .extra2 = int_max, }, { .ctl_name = CTL_UNNUMBERED, signature.asc Description: This is a digitally signed message part
Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages
On Wed, 2007-10-31 at 14:37 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation contexts that are entitled to it. Care is taken to only touch the SLUB slow path. This is done to ensure reserve pages don't leak out and get consumed. I think this is generally a good idea (to prevent slab allocators from stealing reserve). However I naively think the implementation is a bit overengineered and thus has a few holes. Humour me, what was the problem with failing the slab allocation (actually, not fail but just call into the page allocator to do correct waiting / reclaim) in the slowpath if the process fails the watermark checks? Ah, we actually need slabs below the watermarks. Its just that once I allocated those slabs using __GFP_MEMALLOC/PF_MEMALLOC I don't want allocation contexts that do not have rights to those pages to walk off with objects. So, this generic reserve framework still uses the slab allocator to provide certain kind of objects (kmalloc, kmem_cache_alloc), it just separates those that are and are not entitled to the reserves. signature.asc Description: This is a digitally signed message part
Re: [PATCH 05/33] mm: kmem_estimate_pages()
On Wed, 2007-10-31 at 14:43 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Provide a method to get the upper bound on the pages needed to allocate a given number of objects from a given kmem_cache. Fair enough, but just to make it a bit easier, can you provide a little reason of why in this patch (or reference the patch number where you use it, or put it together with the patch where you use it, etc.). A generic reserve framework, as seen in patch 11/23, needs to be able convert from a object demand (kmalloc() bytes, kmem_cache_alloc() objects) to a page reserve. signature.asc Description: This is a digitally signed message part
Re: [PATCH 06/33] mm: allow PF_MEMALLOC from softirq context
On Wed, 2007-10-31 at 14:51 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Allow PF_MEMALLOC to be set in softirq context. When running softirqs from a borrowed context save current-flags, ksoftirqd will have its own task_struct. What's this for? Why would ksoftirqd pick up PF_MEMALLOC? (I guess that some networking thing must be picking it up in a subsequent patch, but I'm too lazy to look!)... Again, can you have more of a rationale in your patch headers, or ref the patch that uses it... thanks Right, I knew I was forgetting something in these changelogs. The network stack does quite a bit of packet processing from softirq context. Once you start swapping over network, some of the packets want to be processed under PF_MEMALLOC. See patch 23/33. signature.asc Description: This is a digitally signed message part
Re: [PATCH 09/33] mm: system wide ALLOC_NO_WATERMARK
On Wed, 2007-10-31 at 14:52 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Change ALLOC_NO_WATERMARK page allocation such that the reserves are system wide - which they are per setup_per_zone_pages_min(), when we scrape the barrel, do it properly. IIRC it's actually not too uncommon to have allocations coming here via page reclaim. It's not exactly clear that you want to break mempolicies at this point. Hmm, the way I see it is that mempolicies are mainly for user-space allocations, reserve allocations are always kernel allocations. These already break mempolicies - for example hardirq context allocations. Also, as it stands, the reserve is spread out evenly over all zones/nodes (excluding highmem), so by restricting ourselves to a subset, we don't have access to the full reserve. signature.asc Description: This is a digitally signed message part
Re: [PATCH 00/33] Swap over NFS -v14
On Wed, 2007-10-31 at 08:50 +, Christoph Hellwig wrote: On Tue, Oct 30, 2007 at 09:37:53PM -0700, David Miller wrote: Don't be misled. Swapping over NFS is just a scarecrow for the seemingly real impetus behind these changes which is network storage stuff like iSCSI. So can we please do swap over network storage only first? All these VM bits look conceptually sane to me, while the changes to the swap code to support nfs are real crackpipe material. Yeah, I know how you stand on that. I just wanted to post all this before going off into the woods reworking it all. Then again doing that part properly by adding address_space methods for swap I/O without the abuse might be a really good idea, especially as the way we do swapfiles on block-based filesystems is an horrible hack already. Is planned. What do you think of the proposed a_ops extension to accomplish this? That is, -swapfile() - is this address space willing to back swap -swapout() - write out a page -swapin() - read in a page So please get the VM bits for swap over network blockdevices in first, Trouble with that part is that we don't have any sane network block devices atm, NBD is utter crap, and iSCSI is too complex to be called sane. Maybe Evgeniy's Distributed storage thingy would work, will have a look at that. and then we can look into a complete revamp of the swapfile support that cleans up the current mess and adds support for nfs instead of making the mess even worse. Sure, concrete suggestions are always welcome. Just being told something is utter crap only goes so far. signature.asc Description: This is a digitally signed message part
Re: NBD was Re: [PATCH 00/33] Swap over NFS -v14
On Wed, 2007-10-31 at 12:18 +0100, Pavel Machek wrote: Hi! So please get the VM bits for swap over network blockdevices in first, Trouble with that part is that we don't have any sane network block devices atm, NBD is utter crap, and iSCSI is too complex to be called sane. Hey, NBD was designed to be _simple_. And I think it works okay in that area.. so can you elaborate on utter crap? [Ok, performance is not great.] Yeah, sorry, perhaps I was overly strong. It doesn't work for me, because: - it does connection management in user-space, which makes it impossible to reconnect. I'd want a full kernel based client. - it had some plugging issues, and after talking to Jens about it he suggested a rewrite using -make_request() ala AoE. [ sorry if I'm short on details here, it was a long time ago, and I forgot, maybe Jens remembers ] Plus, I'd suggest you to look at ata-over-ethernet. It is in tree today, quite simple, but should have better performance than nbd. Ah, right, I keep forgetting about that one. The only draw-back to that on is, is that its raw ethernet, and not some IP protocol. signature.asc Description: This is a digitally signed message part
Re: [PATCH 00/33] Swap over NFS -v14
On Wed, 2007-10-31 at 14:26 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Hi, Another posting of the full swap over NFS series. Hi, Is it really worth all the added complexity of making swap over NFS files work, given that you could use a network block device instead? As it stands, we don't have a usable network block device IMHO. NFS is by far the most used and usable network storage solution out there, anybody with half a brain knows how to set it up and use it. Also, have you ensured that page_file_index, page_file_mapping and page_offset are only ever used on anonymous pages when the page is locked? (otherwise PageSwapCache could change) Good point, I hope so, both -readpage() and -writepage() take a locked page, I'd have to look if it remains locked throughout the NFS call chain. Then again, it might become obsolete with the extended swap a_ops. signature.asc Description: This is a digitally signed message part
Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages
On Wed, 2007-10-31 at 21:46 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 21:42, Peter Zijlstra wrote: On Wed, 2007-10-31 at 14:37 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Restrict objects from reserve slabs (ALLOC_NO_WATERMARKS) to allocation contexts that are entitled to it. Care is taken to only touch the SLUB slow path. This is done to ensure reserve pages don't leak out and get consumed. I think this is generally a good idea (to prevent slab allocators from stealing reserve). However I naively think the implementation is a bit overengineered and thus has a few holes. Humour me, what was the problem with failing the slab allocation (actually, not fail but just call into the page allocator to do correct waiting / reclaim) in the slowpath if the process fails the watermark checks? Ah, we actually need slabs below the watermarks. Right, I'd still allow those guys to allocate slabs. Provided they have the right allocation context, right? Its just that once I allocated those slabs using __GFP_MEMALLOC/PF_MEMALLOC I don't want allocation contexts that do not have rights to those pages to walk off with objects. And I'd prevent these ones from doing so. Without keeping track of reserve pages, which doesn't feel too clean. The problem with that is that once a slab was allocated with the right allocation context, anybody can get objects from these slabs. low memory, and empty slab: task Atask B kmem_cache_alloc() = NULL current-flags |= PF_MEMALLOC kmem_cache_alloc() = obj (slab != NULL) kmem_cache_alloc() = obj kmem_cache_alloc() = obj kmem_cache_alloc() = obj And now task A, who doesn't have the right permissions walks away with all our reserve memory. So we either reserve a page per object, which for 32 byte objects is a large waste, or we stop anybody who doesn't have the right permissions from obtaining objects. I took the latter approach. signature.asc Description: This is a digitally signed message part
Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages
On Wed, 2007-10-31 at 22:25 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 23:17, Peter Zijlstra wrote: On Wed, 2007-10-31 at 21:46 +1100, Nick Piggin wrote: And I'd prevent these ones from doing so. Without keeping track of reserve pages, which doesn't feel too clean. The problem with that is that once a slab was allocated with the right allocation context, anybody can get objects from these slabs. [snip] I understand that. So we either reserve a page per object, which for 32 byte objects is a large waste, or we stop anybody who doesn't have the right permissions from obtaining objects. I took the latter approach. What I'm saying is that the slab allocator slowpath should always just check watermarks against the current task. Instead of this -reserve stuff. So what you say is to allocate a slab every time we take the slow path, even when we already have one? That sounds rather sub-optimal. signature.asc Description: This is a digitally signed message part
Re: [PATCH 00/33] Swap over NFS -v14
On Wed, 2007-10-31 at 08:16 -0400, Jeff Garzik wrote: Thoughts: 1) I absolutely agree that NFS is far more prominent and useful than any network block device, at the present time. 2) Nonetheless, swap over NFS is a pretty rare case. I view this work as interesting, but I really don't see a huge need, for swapping over NBD or swapping over NFS. I tend to think swapping to a remote resource starts to approach migration rather than merely swapping. Yes, we can do it... but given the lack of burning need one must examine the price. There is a large corporate demand for this, which is why I'm doing this. The typical usage scenarios are: - cluster/blades, where having local disks is a cost issue (maintenance of failures, heat, etc) - virtualisation, where dumping the storage on a networked storage unit makes for trivial migration and what not.. But please, people who want this (I'm sure some of you are reading) do speak up. I'm just the motivated corporate drone implementing the feature :-) 3) You note Swap over network has the problem that the network subsystem does not use fixed sized allocations, but heavily relies on kmalloc(). This makes mempools unusable. True, but IMO there are mitigating factors that should be researched and taken into account: a) To give you some net driver background/history, most mainstream net drivers were coded to allocate RX skbs of size 1538, under the theory that they would all be allocating out of the same underlying slab cache. It would not be difficult to update a great many of the [non-jumbo] cases to create a fixed size allocation pattern. One issue that comes to mind is how to ensure we'd still overflow the IP-reassembly buffers. Currently those are managed on the number of bytes present, not the number of fragments. One of the goals of my approach was to not rewrite the network subsystem to accommodate this feature (and I hope I succeeded). 
b) Spare-time experiments and anecdotal evidence points to RX and TX skb recycling as a potentially valuable area of research. If you are able to do something like that, then memory suddenly becomes a lot more bounded and predictable. So my gut feeling is that taking a hard look at how net drivers function in the field should give you a lot of good ideas that approach the shared goal of making network memory allocations more predictable and bounded. Note that being bounded only comes from dropping most packets before tying them to a socket. That is the crucial part of the RX path, to receive all packets from the NIC (regardless of their size) but to not pass them on to the network stack - unless they belong to a 'special' socket that promises undelayed processing. Thanks for these ideas, I'll look into them. signature.asc Description: This is a digitally signed message part
Re: [PATCH 06/33] mm: allow PF_MEMALLOC from softirq context
On Wed, 2007-10-31 at 21:49 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 21:42, Peter Zijlstra wrote: On Wed, 2007-10-31 at 14:51 +1100, Nick Piggin wrote: On Wednesday 31 October 2007 03:04, Peter Zijlstra wrote: Allow PF_MEMALLOC to be set in softirq context. When running softirqs from a borrowed context save current-flags, ksoftirqd will have its own task_struct. What's this for? Why would ksoftirqd pick up PF_MEMALLOC? (I guess that some networking thing must be picking it up in a subsequent patch, but I'm too lazy to look!)... Again, can you have more of a rationale in your patch headers, or ref the patch that uses it... thanks Right, I knew I was forgetting something in these changelogs. The network stack does quite a bit of packet processing from softirq context. Once you start swapping over network, some of the packets want to be processed under PF_MEMALLOC. Hmm... what about processing from interrupt context? From what I could tell that is not done, ISR just fills the skb and sticks it on an RX queue to be further processed by the softirq. signature.asc Description: This is a digitally signed message part
Re: [PATCH 03/33] mm: slub: add knowledge of reserve pages
On Wed, 2007-10-31 at 13:54 +0100, Peter Zijlstra wrote: On Wed, 2007-10-31 at 22:25 +1100, Nick Piggin wrote: What I'm saying is that the slab allocator slowpath should always just check watermarks against the current task. Instead of this -reserve stuff. So what you say is to allocate a slab every time we take the slow path, even when we already have one? BTW, a task that does not have reserve permissions will already attempt to allocate a new slab - this is done to probe the current watermarks. If this succeeds the reserve status is lifted. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: sched: fix new task startup crash
Hi, Commit: b9dca1e0fcb696716840a3bc8f20a6941b484dbf - it seems to me that calling enqueue_fair_task() from task_new_fair() is wrong. The wakeup=1 in enqueue_fair_task() will cause all non-top sched_entities to be re-positioned by place_entity(). Although the current implementation thereof seems to avoid doing something horrible. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] lockdep: fix mismatched lockdep_depth/curr_chain_hash
On Wed, 2007-10-31 at 11:44 -0400, Gregory Haskins wrote: Hi Greg, Here is the backported version of the patch. I applied it on top of 2.6.22.10. Let me know if you have any issues. -Greg Thanks Gregory! - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 6/6] sched: place_entity() comments
Add a few comments to place_entity(). Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] --- kernel/sched_fair.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) Index: linux-2.6/kernel/sched_fair.c === --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -582,19 +582,26 @@ place_entity(struct cfs_rq *cfs_rq, stru } else if (sched_feat(APPROX_AVG) cfs_rq-nr_running) vruntime += sched_vslice(cfs_rq)/2; + /* +* The 'current' period is already promised to the current tasks, +* however the extra weight of the new task will slow them down a +* little, place the new task so that it fits in the slot that +* stays open at the end. +*/ if (initial sched_feat(START_DEBIT)) vruntime += sched_vslice_add(cfs_rq, se); if (!initial) { + /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS) entity_is_task(se) task_of(se)-policy != SCHED_BATCH) vruntime -= sysctl_sched_latency; - vruntime = max_t(s64, vruntime, se-vruntime); + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se-vruntime, vruntime); } se-vruntime = vruntime; - } static void -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/6] sched: make sched_slice() group scheduling savvy
Currently the ideal slice length does not take group scheduling into account. Change it so that it properly takes all the runnable tasks on this cpu into account and calculates the weight according to the grouping hierarchy. Also fixes a bug in vslice which missed a factor NICE_0_LOAD. Signed-off-by: Peter Zijlstra [EMAIL PROTECTED] CC: Srivatsa Vaddagiri [EMAIL PROTECTED] --- kernel/sched_fair.c | 42 +++--- 1 file changed, 31 insertions(+), 11 deletions(-) Index: linux-2.6/kernel/sched_fair.c === --- linux-2.6.orig/kernel/sched_fair.c +++ linux-2.6/kernel/sched_fair.c @@ -331,10 +331,15 @@ static u64 __sched_period(unsigned long */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq-nr_running); + unsigned long nr_running = rq_of(cfs_rq)-nr_running; + u64 slice = __sched_period(nr_running); - slice *= se-load.weight; - do_div(slice, cfs_rq-load.weight); + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + slice *= se-load.weight; + do_div(slice, cfs_rq-load.weight); + } return slice; } @@ -344,24 +349,39 @@ static u64 sched_slice(struct cfs_rq *cf * * vs = s/w = p/rw */ -static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) +static u64 __sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *new) { - u64 vslice = __sched_period(nr_running); + struct sched_entity *se = cfs_rq-curr; + unsigned long nr_running = rq_of(cfs_rq)-nr_running; + unsigned long weight = 0; + u64 vslice; + + if (new) { + nr_running++; + weight = new-load.weight; + } - do_div(vslice, rq_weight); + vslice = __sched_period(nr_running); + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + + vslice *= NICE_0_LOAD; + do_div(vslice, cfs_rq-load.weight + weight); + weight = 0; + } return vslice; } -static u64 sched_vslice(struct cfs_rq *cfs_rq) +static inline u64 sched_vslice(struct cfs_rq *cfs_rq) { - return __sched_vslice(cfs_rq-load.weight, cfs_rq-nr_running); + return __sched_vslice(cfs_rq, NULL); } 
-static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *new) { - return __sched_vslice(cfs_rq-load.weight + se-load.weight, - cfs_rq-nr_running + 1); + return __sched_vslice(cfs_rq, new); } /* -- - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/