[PATCH] RCU: Remove have_rcu_nocb_mask from tree_plugin.h

2017-11-17 Thread Rakib Mullick
Currently, have_rcu_nocb_mask is used to avoid double allocation of
rcu_nocb_mask during boot up. It was needed because cpumask_var_t is
represented differently depending on the kernel config (CONFIG_CPUMASK_OFFSTACK=y
or =n). But now we have the helper cpumask_available(), which can be used
to check whether rcu_nocb_mask has been allocated or not, without using
a separate variable.
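
For reference, a minimal sketch of why cpumask_available() suffices here,
assuming the two cpumask_var_t representations in <linux/cpumask.h>
(paraphrased, not the exact upstream text):

#ifdef CONFIG_CPUMASK_OFFSTACK
typedef struct cpumask *cpumask_var_t;          /* needs an explicit allocation */

static inline bool cpumask_available(cpumask_var_t mask)
{
        return mask != NULL;                    /* true only once allocated */
}
#else
typedef struct cpumask cpumask_var_t[1];        /* storage is always present */

static inline bool cpumask_available(cpumask_var_t mask)
{
        return true;
}
#endif

So on CPUMASK_OFFSTACK=y kernels the check tests the pointer set by the
allocation, and on =n kernels it is a compile-time constant, which is exactly
what have_rcu_nocb_mask was emulating.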

Removing the variable also reduces vmlinux size.

Unpatched version:
    text    data      bss      dec     hex filename
13050393 7852470 14543408 35446271 21cddff vmlinux

Patched version:
    text    data      bss      dec     hex filename
13050390 7852438 14543408 35446236 21cdddc vmlinux

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
Cc: "Paul E. McKenney" <paul...@linux.vnet.ibm.com>
Cc: Josh Triplett <j...@joshtriplett.org>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoy...@efficios.com>
Cc: Lai Jiangshan <jiangshan...@gmail.com>
---
Patch applied on top of Linus's tree (commit cf9b0772f2e41).

 kernel/rcu/tree_plugin.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index db85ca3..13a8e08 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
 
 #ifdef CONFIG_RCU_NOCB_CPU
 static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool have_rcu_nocb_mask;/* Was rcu_nocb_mask allocated? */
 static bool __read_mostly rcu_nocb_poll;/* Offload kthread are to poll. */
 #endif /* #ifdef CONFIG_RCU_NOCB_CPU */
 
@@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void)
 static int __init rcu_nocb_setup(char *str)
 {
alloc_bootmem_cpumask_var(&rcu_nocb_mask);
-   have_rcu_nocb_mask = true;
cpulist_parse(str, rcu_nocb_mask);
return 1;
 }
@@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
 /* Is the specified CPU a no-CBs CPU? */
 bool rcu_is_nocb_cpu(int cpu)
 {
-   if (have_rcu_nocb_mask)
+   if (cpumask_available(rcu_nocb_mask))
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
 }
@@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void)
need_rcu_nocb_mask = true;
 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
 
-   if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
+   if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
return;
}
-   have_rcu_nocb_mask = true;
}
-   if (!have_rcu_nocb_mask)
+   if (!cpumask_available(rcu_nocb_mask))
return;
 
 #if defined(CONFIG_NO_HZ_FULL)
@@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
struct rcu_data *rdp_leader = NULL;  /* Suppress misguided gcc warn. */
struct rcu_data *rdp_prev = NULL;
 
-   if (!have_rcu_nocb_mask)
+   if (!cpumask_available(rcu_nocb_mask))
return;
if (ls == -1) {
ls = int_sqrt(nr_cpu_ids);
-- 
2.9.3



Re: [PATCH] lib: Avoid redundant sizeof checking in __bitmap_weight() calculation.

2017-11-14 Thread Rakib Mullick
On Tue, Nov 14, 2017 at 1:23 PM, Rasmus Villemoes wrote:
>
> hint: sizeof() very rarely evaluates to zero... So this is the same as
> "is32 = 1". So the patch as-is is broken (and may explain the 1-byte
> delta in vmlinux). But even if this condition is fixed, the patch
> doesn't change anything, since the sizeof() evaluation is done at
> compile-time, regardless of whether it happens inside the inlined
> hweight_long or outside. So it is certainly not worth it to duplicate
> the loop.
>
Right, no need to duplicate the loop. Checking the objdump output, it
didn't change anything at all. Fixing the condition also nullifies the
vmlinux size advantage. Please drop it.
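
For the record, hweight_long() is roughly the following static inline (as in
<linux/bitops.h>), so the 32/64 choice is folded away at compile time and the
per-iteration "check" costs nothing at runtime:

static inline unsigned long hweight_long(unsigned long w)
{
        return sizeof(w) == 4 ? hweight32(w) : hweight64(w);
}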

Thanks,
Rakib.


[PATCH] lib: Avoid redundant sizeof checking in __bitmap_weight() calculation.

2017-11-13 Thread Rakib Mullick
Currently, __bitmap_weight() uses hweight_long(), which internally checks
whether the hweight32() or hweight64() variant should be used.

However, it's unnecessary to make that check on every iteration of the
__bitmap_weight() loop. We can decide upfront whether to use hweight32() or
hweight64() and call the respective function directly. It also reduces the
vmlinux size.
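
As a quick usage sketch of the function being touched (hypothetical snippet,
assuming <linux/bitmap.h>):

        DECLARE_BITMAP(mask, 128);              /* two longs on a 64-bit kernel */

        bitmap_zero(mask, 128);
        bitmap_set(mask, 0, 5);                 /* set bits 0..4 */
        bitmap_set(mask, 100, 2);               /* set bits 100..101 */

        /* bitmap_weight()/__bitmap_weight() count the set bits: 7 here. */
        pr_info("weight = %d\n", bitmap_weight(mask, 128));

__bitmap_weight() runs hweight_long() over the full words and masks the
trailing partial word with BITMAP_LAST_WORD_MASK().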

Before the patch:
    text    data      bss      dec     hex filename
12901332 7798930 14541816 35242078 219c05e vmlinux

After the patch:
    text    data      bss      dec     hex filename
12901331 7798930 14541816 35242077 219c05d vmlinux

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Rasmus Villemoes <li...@rasmusvillemoes.dk>
Cc: Matthew Wilcox <mawil...@microsoft.com>
Cc: Yury Norov <yno...@caviumnetworks.com>
Cc: Mauro Carvalho Chehab <mche...@kernel.org>
 
---
Patch was created against Torvalds' tree (commit 43ff2f4db9d0f764).

 lib/bitmap.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/lib/bitmap.c b/lib/bitmap.c
index d8f0c09..552096f 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -241,10 +241,15 @@ EXPORT_SYMBOL(__bitmap_subset);
 int __bitmap_weight(const unsigned long *bitmap, unsigned int bits)
 {
unsigned int k, lim = bits/BITS_PER_LONG;
-   int w = 0;
-
-   for (k = 0; k < lim; k++)
-   w += hweight_long(bitmap[k]);
+   int w = 0, is32 = sizeof(bitmap[0]) ? 1 : 0;
+
+   if (is32) {
+   for (k = 0; k < lim; k++)
+   w += hweight32(bitmap[k]);
+   } else {
+   for (k = 0; k < lim; k++)
+   w += hweight64(bitmap[k]);
+   }
 
if (bits % BITS_PER_LONG)
w += hweight_long(bitmap[k] & BITMAP_LAST_WORD_MASK(bits));
-- 
2.9.3



Re: [PATCH] Documentation, fix module-signing file reference.

2017-11-05 Thread Rakib Mullick
Ping! Anyone care to take this trivial fix?

On Tue, Oct 31, 2017 at 8:39 PM, Rakib Mullick <rakib.mull...@gmail.com> wrote:
> Kconfig reference for module-signing.txt file needs to
> be replaced with admin-guide/module-signing.rst.
>
> Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
> ---
>  init/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 3c1faaa..1b5e786 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1752,7 +1752,7 @@ config MODULE_SIG
> help
>   Check modules for valid signatures upon load: the signature
>   is simply appended to the module. For more information see
> - Documentation/module-signing.txt.
> + Documentation/admin-guide/module-signing.rst.
>
>   Note that this option adds the OpenSSL development packages as a
>   kernel build dependency so that the signing tool can use its crypto
> --
> 2.9.3
>


[tip:irq/core] irq/core: Fix boot crash when the irqaffinity= boot parameter is passed on CPUMASK_OFFSTACK=y kernels(v1)

2017-11-01 Thread tip-bot for Rakib Mullick
Commit-ID:  10d94ff4d558b96bfc4f55bb0051ae4d938246fe
Gitweb: https://git.kernel.org/tip/10d94ff4d558b96bfc4f55bb0051ae4d938246fe
Author: Rakib Mullick <rakib.mull...@gmail.com>
AuthorDate: Wed, 1 Nov 2017 10:14:51 +0600
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Wed, 1 Nov 2017 09:56:39 +0100

irq/core: Fix boot crash when the irqaffinity= boot parameter is passed on 
CPUMASK_OFFSTACK=y kernels(v1)

When the irqaffinity= kernel parameter is passed in a CPUMASK_OFFSTACK=y
kernel, it fails to boot, because zalloc_cpumask_var() cannot be used before
initializing the slab allocator to allocate a cpumask.

So, use alloc_bootmem_cpumask_var() instead.

Also do some cleanups while at it: in init_irq_default_affinity() remove
an #ifdef by using cpumask_available().
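
A minimal sketch of the early-boot pattern this relies on (hypothetical mask
and parameter names, for illustration only): __setup() handlers run before the
slab allocator is up, so on CPUMASK_OFFSTACK=y kernels the cpumask must come
from the bootmem/memblock variant.

static cpumask_var_t example_mask;              /* hypothetical */

static int __init example_mask_setup(char *str)
{
        alloc_bootmem_cpumask_var(&example_mask);       /* safe this early */
        if (cpulist_parse(str, example_mask))
                pr_warn("example_mask=: invalid CPU list '%s'\n", str);
        return 1;
}
__setup("example_mask=", example_mask_setup);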

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Link: http://lkml.kernel.org/r/20171026045800.27087-1-rakib.mull...@gmail.com
Link: http://lkml.kernel.org/r/20171101041451.12581-1-rakib.mull...@gmail.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/irq/irqdesc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 982a357..f2edcf8 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
 #if defined(CONFIG_SMP)
 static int __init irq_affinity_setup(char *str)
 {
-   zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+   alloc_bootmem_cpumask_var(&irq_default_affinity);
cpulist_parse(str, irq_default_affinity);
/*
 * Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
 
 static void __init init_irq_default_affinity(void)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-   if (!irq_default_affinity)
+   if (!cpumask_available(irq_default_affinity))
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
 }


Re: [PATCH] [irq] Fix boot failure when irqaffinity is passed.

2017-10-31 Thread Rakib Mullick
On Tue, Oct 31, 2017 at 5:29 PM, Ingo Molnar  wrote:
>
>
> Not applied, because this patch causes the following build warning:
>
>   kernel/irq/irqdesc.c:43:6: warning: the address of ‘irq_default_affinity’ 
> will always evaluate as ‘true’ [-Waddress]
>
Ah, sorry, I didn't look into the build log. It happened due to the removal
of the #ifdefs. It's now been fixed by using cpumask_available().
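
A small sketch of why the warning fired (assumption: CONFIG_CPUMASK_OFFSTACK=n,
where cpumask_var_t is an array rather than a pointer, and the mask name is
illustrative):

static cpumask_var_t some_mask;         /* i.e. struct cpumask some_mask[1]; */

static void __init some_init(void)
{
        if (!some_mask)                 /* -Waddress: an array's address is never NULL */
                zalloc_cpumask_var(&some_mask, GFP_NOWAIT);

        if (!cpumask_available(some_mask))      /* constant on =n, real pointer test on =y */
                zalloc_cpumask_var(&some_mask, GFP_NOWAIT);
}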

> Also, please pick up the improved changelog below for the next version of the
> patch.
>
Thanks for the improved changelog, I have sent a new version here:
https://lkml.org/lkml/2017/11/1/6.

Thanks,
Rakib.


[PATCH] irq/core: Fix boot crash when the irqaffinity= boot parameter is passed on CPUMASK_OFFSTACK=y kernels(v1)

2017-10-31 Thread Rakib Mullick
When the irqaffinity= kernel parameter is passed in a CPUMASK_OFFSTACK=y
kernel, it fails to boot, because zalloc_cpumask_var() cannot be used before
initializing the slab allocator to allocate a cpumask.

So, use alloc_bootmem_cpumask_var() instead.

Also do some cleanups while at it: in init_irq_default_affinity() remove
an unnecessary #ifdef.

Change since v0:
* Fix build warning.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Link: http://lkml.kernel.org/r/20171026045800.27087-1-rakib.mull...@gmail.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
Patch created against -rc7 (commit 0b07194bb55ed836c2). I found tip had a merge
conflict, so used -rc7 instead.

 kernel/irq/irqdesc.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 82afb7e..e97bbae 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
 #if defined(CONFIG_SMP)
 static int __init irq_affinity_setup(char *str)
 {
-   zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+   alloc_bootmem_cpumask_var(&irq_default_affinity);
cpulist_parse(str, irq_default_affinity);
/*
 * Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
 
 static void __init init_irq_default_affinity(void)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-   if (!irq_default_affinity)
+   if (!cpumask_available(irq_default_affinity))
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
 }
-- 
2.9.3



[PATCH] Documentation, fix module-signing file reference.

2017-10-31 Thread Rakib Mullick
Kconfig reference for module-signing.txt file needs to
be replaced with admin-guide/module-signing.rst.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---
 init/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index 3c1faaa..1b5e786 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1752,7 +1752,7 @@ config MODULE_SIG
help
  Check modules for valid signatures upon load: the signature
  is simply appended to the module. For more information see
- Documentation/module-signing.txt.
+ Documentation/admin-guide/module-signing.rst.
 
  Note that this option adds the OpenSSL development packages as a
  kernel build dependency so that the signing tool can use its crypto
-- 
2.9.3



[PATCH] [irq] Fix boot failure when irqaffinity is passed.

2017-10-25 Thread Rakib Mullick
When the irqaffinity= kernel parameter is passed in a CPUMASK_OFFSTACK=y
kernel, it fails to boot. zalloc_cpumask_var() cannot be used before the mm
machinery (the slab allocator) is initialized to allocate a cpumask. So, use
alloc_bootmem_cpumask_var() instead. Also, in init_irq_default_affinity(),
remove the unneeded #ifdef; those ifdef conditions are handled at the
definition site.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---
 kernel/irq/irqdesc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 982a357..db6380d 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -27,7 +27,7 @@ static struct lock_class_key irq_desc_lock_class;
 #if defined(CONFIG_SMP)
 static int __init irq_affinity_setup(char *str)
 {
-   zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+   alloc_bootmem_cpumask_var(&irq_default_affinity);
cpulist_parse(str, irq_default_affinity);
/*
 * Set at least the boot cpu. We don't want to end up with
@@ -40,10 +40,8 @@ __setup("irqaffinity=", irq_affinity_setup);
 
 static void __init init_irq_default_affinity(void)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
if (!irq_default_affinity)
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
-#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
 }
-- 
2.9.3



[tip:sched/core] sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK

2017-10-24 Thread tip-bot for Rakib Mullick
Commit-ID:  e22cdc3fc5991956146b9856d36b4971fe54dcd6
Gitweb: https://git.kernel.org/tip/e22cdc3fc5991956146b9856d36b4971fe54dcd6
Author: Rakib Mullick <rakib.mull...@gmail.com>
AuthorDate: Mon, 23 Oct 2017 19:01:54 +0600
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Tue, 24 Oct 2017 11:47:25 +0200

sched/isolcpus: Fix "isolcpus=" boot parameter handling when 
!CONFIG_CPUMASK_OFFSTACK

cpulist_parse() uses nr_cpumask_bits as a limit to parse the
passed buffer from kernel commandline. What nr_cpumask_bits
represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option:

 - If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
   NR_CPUS, which might not represent the # of CPUs that really exist
   (default 64). So, there's a chance of a gap between nr_cpu_ids
   and NR_CPUS, which ultimately leads to an invalid cpulist_parse()
   operation. For example, if isolcpus=9 is passed on an 8-CPU
   system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
   that it's supposed to.

This patch fixes this bug by finding the last CPU of the passed
isolcpus= list and checking it against nr_cpu_ids.

It also fixes the error message where the nr_cpu_ids should be
nr_cpu_ids-1, since CPU numbering starts from 0.
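
A compact sketch of the added check (hypothetical standalone helper mirroring
the isolated_cpu_setup() change below): parsing is bounded by nr_cpumask_bits,
so an explicit range check against nr_cpu_ids is needed afterwards.

static int __init parse_isol_list(char *str, struct cpumask *mask)
{
        int ret = cpulist_parse(str, mask);

        /* e.g. "isolcpus=9" on an 8-CPU, NR_CPUS=64 kernel parses fine,
         * but cpumask_last(mask) == 9 >= nr_cpu_ids == 8, so reject it. */
        if (ret || cpumask_last(mask) >= nr_cpu_ids)
                return -EINVAL;

        return 0;
}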

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: adobri...@gmail.com
Cc: a...@linux-foundation.org
Cc: long...@redhat.com
Cc: m...@chromium.org
Cc: t...@kernel.org
Link: http://lkml.kernel.org/r/20171023130154.9050-1-rakib.mull...@gmail.com
[ Enhanced the changelog and the kernel message. ]
Signed-off-by: Ingo Molnar <mi...@kernel.org>

 include/linux/cpumask.h |   16 ++++++++++++++++
 kernel/sched/topology.c |    4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 kernel/sched/topology.c |  4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cd415b7..63661de 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
 }
 
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return 0;
+}
+
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
@@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_last - get the last CPU in a cpumask
+ * @srcp:  - the cpumask pointer
+ *
+ * Returns >= nr_cpumask_bits if no CPUs set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
 unsigned int cpumask_next(int n, const struct cpumask *srcp);
 
 /**
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e50450c..e3d31b0 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -476,8 +476,8 @@ static int __init isolated_cpu_setup(char *str)
 
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
-   if (ret) {
-   pr_err("sched: Error, all isolcpus= values must be between 0 
and %u\n", nr_cpu_ids);
+   if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) {
+   pr_err("sched: Error, all isolcpus= values must be between 0 
and %u - ignoring them.\n", nr_cpu_ids-1);
return 0;
}
return 1;


[PATCH] Fix isocpus's param handling when CPUMASK_OFFSTACK=n.

2017-10-23 Thread Rakib Mullick

> On Mon, Oct 23, 2017 at 5:50 PM, Ingo Molnar <mi...@kernel.org> wrote:
>
>> * Rakib Mullick <rakib.mull...@gmail.com> wrote:
>> +/**
>> + * cpumask_last - get the last cpu in a cpumask
>
> Please capitalize 'CPU' properly in documentation.
>
OK.
>> + int ret, lastcpu;
>>
>>   alloc_bootmem_cpumask_var(&cpu_isolated_map);
>>   ret = cpulist_parse(str, cpu_isolated_map);
>> - if (ret) {
>> - pr_err("sched: Error, all isolcpus= values must be between 0 
>> and %u\n", nr_cpu_ids);
>> + lastcpu = cpumask_last(cpu_isolated_map);
>> + if (ret || lastcpu >= nr_cpu_ids) {
>
> Any reason why 'lastcpu' has to be introduced - why not just use 
> cpumask_last()
> directly in the condition? It looks obvious enough of a pattern.
Thought it reflects what we're doing here, but yes, actually it's redundant.

>> + pr_err("sched: Error, all isolcpus= values must be between 0 
>> and %u\n",
>> + nr_cpu_ids-1);
>
> Please don't break the line mindlessly just due to checkpatch complaining - it
> makes the code less readable.
>
Yes, right, I mindlessly followed what checkpatch was complaining about.

Please see the following patch, with changes made based on your review and
patched up against -rc6.

Thanks,
Rakib
---

[PATCH](v1) Fix isocpus's param handling when CPUMASK_OFFSTACK=n.

 cpulist_parse() uses nr_cpumask_bits as a limit to parse the
passed buffer from the kernel command line. What nr_cpumask_bits
represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option.
If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
NR_CPUS, which might not represent the # of CPUs that really exist
(default 64). So, there's a chance of a gap between nr_cpu_ids
and NR_CPUS, which ultimately leads to an invalid cpulist_parse()
operation. For example, if isolcpus=9 is passed on an 8-CPU
system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
that it's supposed to.

This patch fixes this issue by finding the last
CPU of the passed isolcpus= list and checking it against nr_cpu_ids.
It also fixes the error message: the printed limit should be
nr_cpu_ids-1, since CPU numbering starts from 0.

Changes since v0 (Ingo):
* use cpumask_last() directly in the condition.
* use CPU rather than cpu in the documentation
* undo the line break

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 kernel/sched/topology.c |  4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cd415b7..63661de 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
 }
 
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return 0;
+}
+
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
@@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_last - get the last CPU in a cpumask
+ * @srcp:  - the cpumask pointer
+ *
+ * Returns >= nr_cpumask_bits if no CPUs set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
 unsigned int cpumask_next(int n, const struct cpumask *srcp);
 
 /**
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f3..060cee5 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -470,8 +470,8 @@ static int __init isolated_cpu_setup(char *str)
 
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
-   if (ret) {
-   pr_err("sched: Error, all isolcpus= values must be between 0 
and %u\n", nr_cpu_ids);
+   if (ret || cpumask_last(cpu_isolated_map) >= nr_cpu_ids) {
+   pr_err("sched: Error, all isolcpus= values must be between 0 
and %u\n", nr_cpu_ids-1);
return 0;
}
return 1;
-- 
2.9.3



[PATCH] Fix isocpus's param handling when CPUMASK_OFFSTACK=n.

2017-10-21 Thread Rakib Mullick
> On Fri, Oct 20, 2017 at 2:49 PM, Ingo Molnar <mi...@kernel.org> wrote:
>
>> Rakib Mullick <rakib.mull...@gmail.com> wrote:
>>  include/linux/cpumask.h | 16 
>>  kernel/sched/topology.c |  8 +---
>>  2 files changed, 21 insertions(+), 3 deletions(-)
>
> What kernel is this against? It does not apply to the latest kernels.

It was against 4.14-rc4, prepared before -rc5 release. Please, consider
the below one, against -rc5.

 cpulist_parse() uses nr_cpumask_bits as a limit to parse the
passed buffer from the kernel command line. What nr_cpumask_bits
represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option.
If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
NR_CPUS, which might not represent the # of CPUs that really exist
(default 64). So, there's a chance of a gap between nr_cpu_ids
and NR_CPUS, which ultimately leads to an invalid cpulist_parse()
operation. For example, if isolcpus=9 is passed on an 8-CPU
system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
that it's supposed to.

This patch fixes this issue by finding the last
CPU of the passed isolcpus= list and checking it against nr_cpu_ids.
It also fixes the error message: the printed limit should be
nr_cpu_ids-1, since CPU numbering starts from 0.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 kernel/sched/topology.c |  8 +++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cd415b7..5631725 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
 }
 
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return 0;
+}
+
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
@@ -178,6 +183,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
+/**
+ * cpumask_last - get the last cpu in a cpumask
+ * @srcp:  - the cpumask pointer
+ *
+ * Returns >= nr_cpumask_bits if no cpus set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
 unsigned int cpumask_next(int n, const struct cpumask *srcp);
 
 /**
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f3..b9265c8 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -466,12 +466,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 /* Setup the mask of CPUs configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-   int ret;
+   int ret, lastcpu;
 
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
-   if (ret) {
-   pr_err("sched: Error, all isolcpus= values must be between 0 
and %u\n", nr_cpu_ids);
+   lastcpu = cpumask_last(cpu_isolated_map);
+   if (ret || lastcpu >= nr_cpu_ids) {
+   pr_err("sched: Error, all isolcpus= values must be between 0 
and %u\n",
+   nr_cpu_ids-1);
return 0;
}
return 1;
-- 
2.9.3



[PATCH] Fix isocpus's param handling when CPUMASK_OFFSTACK=n.

2017-10-15 Thread Rakib Mullick
 cpulist_parse() uses nr_cpumask_bits as a limit to parse the
passed buffer from the kernel command line. What nr_cpumask_bits
represents varies depending upon the CONFIG_CPUMASK_OFFSTACK option.
If CONFIG_CPUMASK_OFFSTACK=n, then nr_cpumask_bits is the same as
NR_CPUS, which might not represent the # of CPUs that really exist
(default 64). So, there's a chance of a gap between nr_cpu_ids
and NR_CPUS, which ultimately leads to an invalid cpulist_parse()
operation. For example, if isolcpus=9 is passed on an 8-CPU
system (CONFIG_CPUMASK_OFFSTACK=n) it doesn't show the error
that it's supposed to.

This patch fixes this issue by finding the last
CPU of the passed isolcpus= list and checking it against nr_cpu_ids.
It also fixes the error message: the printed limit should be
nr_cpu_ids-1, since CPU numbering starts from 0.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---
 include/linux/cpumask.h | 16 ++++++++++++++++
 kernel/sched/topology.c |  8 +++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 2404ad2..f9a7c9b 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -130,6 +130,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
 }
 
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return 0;
+}
+
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
@@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
 }
 
 /**
+ * cpumask_last - get the last cpu in a cpumask
+ * @srcp:  - the cpumask pointer
+ *
+ * Returns >= nr_cpumask_bits if no cpus set.
+ */
+static inline unsigned int cpumask_last(const struct cpumask *srcp)
+{
+   return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
+}
+
+/**
  * cpumask_next - get the next cpu in a cpumask
  * @n: the cpu prior to the place to search (ie. return will be > @n)
  * @srcp: the cpumask pointer
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1b0b4fb..55b5e09 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -452,12 +452,14 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 /* Setup the mask of CPUs configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
 {
-   int ret;
+   int ret, lastcpu;
 
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
-   if (ret) {
-   pr_err("sched: Error, all isolcpus= values must be between 0 
and %d\n", nr_cpu_ids);
+   lastcpu = cpumask_last(cpu_isolated_map);
+   if (ret || lastcpu >= nr_cpu_ids) {
+   pr_err("sched: Error, all isolcpus= values must be between 0 
and %d\n",
+   nr_cpu_ids-1);
return 0;
}
return 1;
-- 
2.9.3



[PATCH] Remove cpuset_update_active_cpus()'s parameter.

2017-04-08 Thread Rakib Mullick
In cpuset_update_active_cpus(), cpu_online isn't used anymore. Remove
it.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---
 include/linux/cpuset.h | 4 ++--
 kernel/cgroup/cpuset.c | 2 +-
 kernel/sched/core.c    | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 611fce5..119a3f9 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -42,7 +42,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_update_active_cpus(bool cpu_online);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -155,7 +155,7 @@ static inline bool cpusets_enabled(void) { return false; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_update_active_cpus(bool cpu_online)
+static inline void cpuset_update_active_cpus(void)
 {
partition_sched_domains(1, NULL, NULL);
 }
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 0f41292..d6e8ded 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2354,7 +2354,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
rebuild_sched_domains();
 }
 
-void cpuset_update_active_cpus(bool cpu_online)
+void cpuset_update_active_cpus(void)
 {
/*
 * We're inside cpu hotplug critical region which usually nests
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b31fc0..430b046 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5732,7 +5732,7 @@ static void cpuset_cpu_active(void)
 * cpuset configurations.
 */
}
-   cpuset_update_active_cpus(true);
+   cpuset_update_active_cpus();
 }
 
 static int cpuset_cpu_inactive(unsigned int cpu)
@@ -5755,7 +5755,7 @@ static int cpuset_cpu_inactive(unsigned int cpu)
 
if (overflow)
return -EBUSY;
-   cpuset_update_active_cpus(false);
+   cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
-- 
2.9.3



[ANNOUNCE] BLD-3.18 release.

2014-12-16 Thread Rakib Mullick
 if (p->nr_cpus_allowed == 1)
+   return task_cpu(p);
+
+   if (sd_flags & SD_BALANCE_WAKE) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   want_affine = 1;
+   }
+   }
+
+   if (want_affine)
+   tmpmask = tsk_cpus_allowed(p);
+   else
+   tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd);
+
+   if (rt_task(p))
+   cpu = select_cpu_for_wakeup(0, tmpmask);
+   else
+   cpu = select_cpu_for_wakeup(1, tmpmask);
+
+   return cpu;
+}
+
+static void track_load_rt(struct rq *rq, struct task_struct *p)
+{
+   unsigned long flag;
+   int firstbit;
+   struct rt_rq *first;
+   struct rt_prio_array *array = &rq->rt.active;
+
+   first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list);
+   firstbit = sched_find_first_bit(array->bitmap);
+
+   /* Maintaining rt.lowbit */
+   if (firstbit > 0 && firstbit <= rq->rt.lowbit)
+   rq->rt.lowbit = firstbit;
+
+   if (rq->rt.lowbit < first->lowbit) {
+   write_lock_irqsave(&rt_list_lock, flag);
+   list_del(&rq->rt.bld_rt_list);
+   list_add_tail(&rq->rt.bld_rt_list, &rt_rq_head);
+   write_unlock_irqrestore(&rt_list_lock, flag);
+   }
+}
+
+static int bld_get_cpu(struct task_struct *p, int sd_flags, int wake_flags)
+{
+   unsigned int cpu;
+
+   if (sd_flags == SD_BALANCE_WAKE || (sd_flags == SD_BALANCE_EXEC && 
(get_nr_threads(p) > 1)))
+   cpu = bld_pick_cpu_domain(p, sd_flags, wake_flags);
+   else {
+   if (rt_task(p))
+   cpu = bld_pick_cpu_rt(p, sd_flags, wake_flags);
+   else
+   cpu = bld_pick_cpu_cfs(p, sd_flags, wake_flags);
+   }
+
+   return cpu;
+}
+
+static void bld_track_load_activate(struct rq *rq, struct task_struct *p)
+{
+   unsigned long flag;
+   if (rt_task(p)) {
+   track_load_rt(rq, p);
+   } else {
+   if (rq->cfs.pos != 2) {
+   struct cfs_rq *last;
+   last = list_entry(cfs_rq_head.prev, struct cfs_rq, 
bld_cfs_list);
+   if (rq->cfs.load.weight >= last->load.weight) {
+   write_lock_irqsave(&cfs_list_lock, flag);
+   list_del(&rq->cfs.bld_cfs_list);
+   list_add_tail(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+   rq->cfs.pos = 2; last->pos = 1;
+   write_unlock_irqrestore(&cfs_list_lock, flag);
+   }
+   }
+   }
+}
+
+static void bld_track_load_deactivate(struct rq *rq, struct task_struct *p)
+{
+   unsigned long flag;
+   if (rt_task(p)) {
+   track_load_rt(rq, p);
+   } else {
+   if (rq->cfs.pos != 0) {
+   struct cfs_rq *first;
+   first = list_entry(cfs_rq_head.next, struct cfs_rq, 
bld_cfs_list);
+   if (rq->cfs.load.weight <= first->load.weight) {
+   write_lock_irqsave(&cfs_list_lock, flag);
+   list_del(&rq->cfs.bld_cfs_list);
+   list_add(&rq->cfs.bld_cfs_list, &cfs_rq_head);
+   rq->cfs.pos = 0; first->pos = 1;
+   write_unlock_irqrestore(&cfs_list_lock, flag);
+   }
+   }
+   }
+}
+#else
+static inline void bld_track_load_activate(struct rq *rq, struct task_struct 
*p)
+{
+}
+
+static inline void bld_track_load_deactivate(struct rq *rq, struct task_struct 
*p)
+{
+}
+#endif /* CONFIG_BLD */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 89e7283..bd702c6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -24,6 +24,8 @@
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
  *  Thomas Gleixner, Mike Kravetz
+ *  2012-Feb   The Barbershop Load Distribution (BLD) algorithm - an alternate
+ * CPU load distribution technique for kernel scheduler by Rakib 
Mullick.
  */
 
 #include <linux/mm.h>
@@ -86,6 +88,7 @@
 #include "sched.h"
 #include "../workqueue_internal.h"
 #include "../smpboot.h"
+#include "bld.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -840,6 +843,8 @@ static void enqueue_task(struct rq *rq, struct task_struct 
*p, int flags)
update_rq_clock(rq);
sched_info_queued(rq, p);
p->sched_class->enqueue_task(rq, p, flags);
+   if (!dl_task(p))
+   bld_track_load_activate(rq, p);
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -847,6 +852,8 @@ static void dequeue_task(struc
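
As a rough illustration of what the bld_track_load_activate() path shown above is doing, here is a simplified standalone C model (hypothetical types, no locking, and deliberately ignoring the pos bookkeeping of the real patch): run queues live on a doubly linked list, and an rq whose load has grown to at least the current tail's is moved to the tail, so the head of the list stays the lightest candidate for wakeup placement.

#include <stdio.h>
#include <stddef.h>

/* Minimal doubly linked list, standing in for the kernel's list.h. */
struct node { struct node *prev, *next; };

struct rq {
	unsigned long weight;	/* stand-in for rq->cfs.load.weight */
	struct node link;	/* stand-in for bld_cfs_list */
};

static struct node head = { &head, &head };

static void list_del_node(struct node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
}

static void list_add_tail_node(struct node *n)
{
	n->prev = head.prev;
	n->next = &head;
	head.prev->next = n;
	head.prev = n;
}

#define rq_of(n) ((struct rq *)((char *)(n) - offsetof(struct rq, link)))

/* Mirrors the activate path: if this rq now carries at least as much load as
 * the current tail, move it to the tail so the head stays the lightest rq. */
static void track_load_activate(struct rq *rq)
{
	struct rq *last = rq_of(head.prev);

	if (rq != last && rq->weight >= last->weight) {
		list_del_node(&rq->link);
		list_add_tail_node(&rq->link);
	}
}

int main(void)
{
	struct rq cpu[3] = { { 10 }, { 20 }, { 30 } };

	for (int i = 0; i < 3; i++)
		list_add_tail_node(&cpu[i].link);

	cpu[0].weight = 50;		/* cpu0 just enqueued a heavy task */
	track_load_activate(&cpu[0]);

	/* Head of the list is now the least-loaded rq: cpu1 (weight 20). */
	printf("lightest rq weight = %lu\n", rq_of(head.next)->weight);
	return 0;
}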

Re: [ANNOUNCE] BLD-3.17 release.

2014-10-14 Thread Rakib Mullick
On 10/14/14, Mike Galbraith <umgwanakikb...@gmail.com> wrote:
> On Mon, 2014-10-13 at 21:14 +0600, Rakib Mullick wrote:
>
>> Okay. From the numbers above it's apparent that BLD isn't doing good,
>> atleast for the
>> kind of system that you have been using. I didn't had a chance to ran
>> it on any kind of
>> NUMA systems, for that reason on Kconfig, I've marked it as "Not
>> suitable for NUMA", yet.
>
> (yeah, a NUMA box would rip itself to shreds)
>
>> Part of the reason is, I didn't manage to try it out myself and other
>> reason is, it's easy to
>> get things wrong if schedule domains are build improperly. I'm not
>> sure what was the
>> sched configuration in your case. BLD assumes (or kindof bliendly
>> believes systems
>> default sched domain topology) on wakeup tasks are cache hot and so
>> don't put those
>> task's on other sched domains, but if that isn't the case then perhaps
>> it'll miss out on
>> balancing oppourtunity, in that case CPU utilization will be improper.
>
> Even when you have only one socket with a big L3, you don't really want
> to bounce fast/light tasks around too frequently, L2 misses still hurt.
>
>> Can you please share the perf stat of netperf runs? So, far I have
>> seen reduced context
>> switch numbers with -BLD with a drawback of huge increase of CPU
>> migration numbers.
>
> No need, it's L2 misses.  Q6600 has no L3 to mitigate the miss pain.
>
I see.

>> But, the kind of systems I ran so far, it deemed too much CPU movement
>> didn't cost much.
>> But, it could be wrong for NUMA systems.
>
> You can most definitely move even very fast/light tasks too much within
> an L3, L2 misses can demolish throughput.  We had that problem.
>
So, L2 misses are the key here. So far I haven't tried to utilize
various sched domain flags, like SD_SHARE_PKG_RESOURCE/_CPUPOWER etc.
Probably integrating those might help? I'll take a look at them and
see what happens, although testing will be problematic.

Thanks,
Rakib.


Re: [ANNOUNCE] BLD-3.17 release.

2014-10-13 Thread Rakib Mullick
On 10/13/14, Mike Galbraith <umgwanakikb...@gmail.com> wrote:
> On Sat, 2014-10-11 at 12:20 +0600, Rakib Mullick wrote:
>> BLD (The Barbershop Load Distribution Algorithm) patch for Linux 3.17
>
> I had a curiosity attack, played with it a little.
>
Thanks for showing your interest!

> My little Q6600 box could be describes as being "micro-numa", with two
> pathetic little "nodes" connected by the worst interconnect this side of
> tin cans and string.  Communicating tasks sorely missed sharing cache.
>
> tbench
> 3.18.0-master
> Throughput 287.411 MB/sec  1 clients  1 procs  max_latency=1.614 ms
> 1.000
> Throughput 568.631 MB/sec  2 clients  2 procs  max_latency=1.942 ms
> 1.000
> Throughput 1069.75 MB/sec  4 clients  4 procs  max_latency=18.494 ms
> 1.000
> Throughput 1040.99 MB/sec  8 clients  8 procs  max_latency=17.364 ms
> 1.000
>
> 3.18.0-master-BLDvs
> master
> Throughput 261.986 MB/sec  1 clients  1 procs  max_latency=11.943 ms
> .911
> Throughput 264.461 MB/sec  2 clients  2 procs  max_latency=11.884 ms
> .465
> Throughput 476.191 MB/sec  4 clients  4 procs  max_latency=11.497 ms
> .445
> Throughput 558.236 MB/sec  8 clients  8 procs  max_latency=9.008 ms
> .536
>
>
> TCP_RR 4 unbound clients
> 3.18.0-master
> TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.1
> (127.0.0.1) port 0 AF_INET
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size SizeTime Rate
> bytes  Bytes  bytesbytes   secs.per sec
>
> 16384  87380  11   30.0072436.65
> 16384  87380  11   30.0072438.55
> 16384  87380  11   30.0072213.18
> 16384  87380  11   30.0072493.48
>sum 289581.86 1.000
>
> 3.18.0-master-BLD
> TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.1
> (127.0.0.1) port 0 AF_INET
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size SizeTime Rate
> bytes  Bytes  bytesbytes   secs.per sec
>
> 16384  87380  11   30.0029014.09
> 16384  87380  11   30.0028804.53
> 16384  87380  11   30.0028999.40
> 16384  87380  11   30.0028901.84
>sum 115719.86  .399 vs master
>
>
Okay. From the numbers above it's apparent that BLD isn't doing well,
at least for the kind of system that you have been using. I haven't
had a chance to run it on any kind of NUMA system; for that reason
I've marked it as "Not suitable for NUMA" in Kconfig, yet. Part of
the reason is that I didn't manage to try it out myself, and the other
reason is that it's easy to get things wrong if sched domains are
built improperly. I'm not sure what the sched configuration was in
your case. BLD assumes (or kind of blindly trusts the system's default
sched domain topology) that tasks being woken up are cache hot, and so
doesn't put those tasks on other sched domains; but if that isn't the
case then it may miss balancing opportunities, and CPU utilization
will suffer.

Can you please share the perf stat of the netperf runs? So far I have
seen reduced context switch numbers with BLD, with the drawback of a
huge increase in CPU migration numbers. But on the kinds of systems
I have run so far, the extra CPU movement didn't seem to cost much.
That could be wrong for NUMA systems.

Thanks,
Rakib


[ANNOUNCE] BLD-3.17 release.

2014-10-11 Thread Rakib Mullick
BLD (The Barbershop Load Distribution Algorithm) patch for Linux 3.17.
This version contains a build fix for CONFIG_BLD=n. I have recently
done some netperf benchmarking (actually I did previously too, but
never published the results). Below are the stats of a default netperf
run, with client and server on the local system (Core i3, 2 GB RAM;
higher is better).

            tcp_stream   tcp_rr     udp_stream   udp_rr

mainline    9343.54      20812.03   18231.74     24396.074
                                    18210.71

bld         14738.35     29224.54   26475.75     34910.08
                                    26462.53

These are averages of 5 runs of each test. BLD performs better,
showing a ~(35-40)% improvement. The config used for this benchmark
can be found at the following link:

https://raw.githubusercontent.com/rmullick/bld-patches/master/config.benchmark-3.17

Also, Luis Cruz recently backported BLD's previous release, BLD-3.16,
to Android and experimentally ran it on his Galaxy SIII; his patches
can be found at the following link:

https://github.com/SyNtheticNightmar3/bld-patches

If you are interested in running it on Android, take a look at the
above link.

Thanks,
Rakib

 Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---

diff --git a/init/Kconfig b/init/Kconfig
index 80a6907..65319c6 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -36,6 +36,15 @@ config BROKEN_ON_SMP
depends on BROKEN || !SMP
default y
 
+config BLD
+   bool "An alternate CPU load distribution technique for task scheduler"
+   depends on SMP
+   default y
+   help
+ This is an alternate CPU load distribution technique based for task
+ scheduler based on The Barbershop Load Distribution algorithm. Not
+ suitable for NUMA, should work well on SMP.
+
 config INIT_ENV_ARG_LIMIT
int
default 32 if !UML
diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h
new file mode 100644
index 000..097dd23
--- /dev/null
+++ b/kernel/sched/bld.h
@@ -0,0 +1,207 @@
+#ifdef CONFIG_BLD
+
+static DEFINE_RWLOCK(rt_list_lock);
+static LIST_HEAD(rt_rq_head);
+static LIST_HEAD(cfs_rq_head);
+static DEFINE_RWLOCK(cfs_list_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+   return cfs_rq->rq;
+}
+#else
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+   return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+   return rt_rq->rq;
+}
+#else
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+   return container_of(rt_rq, struct rq, rt);
+}
+#endif
+
+static int select_cpu_for_wakeup(int task_type, struct cpumask *mask)
+{
+   int cpu = smp_processor_id(), i;
+   unsigned long load, min_load = ULONG_MAX;
+   struct rq *rq;
+
+   if (task_type) {
+   for_each_cpu(i, mask) {
+   rq = cpu_rq(i);
+   load = rq->cfs.load.weight;
+   if (load < min_load) {
+   min_load = load;
+   cpu = i;
+   }
+   }
+   } else {
+   min_load = -1;
+
+   for_each_cpu(i, mask) {
+   rq = cpu_rq(i);
+   load = rq->rt.lowbit;
+   if (load > min_load) {
+   min_load = load;
+   cpu = i;
+   }
+   }
+   }
+
+   return cpu;
+}
+
+static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int 
wake_flags)
+{
+   struct cfs_rq *cfs;
+   unsigned long flags;
+   unsigned int cpu = smp_processor_id();
+
+   read_lock_irqsave(&cfs_list_lock, flags);
+   list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) {
+   cpu = cpu_of(rq_of_cfs(cfs));
+   if (cpu_online(cpu))
+   break;
+   }
+   read_unlock_irqrestore(&cfs_list_lock, flags);
+   return cpu;
+}
+
+static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags)
+{
+   struct rt_rq *rt;
+   unsigned long flags;
+   unsigned int cpu = smp_processor_id();
+
+   read_lock_irqsave(&rt_list_lock, flags);
+   list_for_each_entry(rt, &rt_rq_head, bld_rt_list) {
+   cpu = cpu_of(rq_of_rt(rt));
+   if (cpu_online(cpu))
+   break;
+   }
+   read_unlock_irqrestore(&rt_list_lock, flags);
+   return cpu;
+}
+
+static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int 
wake_flags)
+{
+   unsigned int cpu = smp_processor_id(), want_affine = 0;
+   struct cpumask *tmpmask;
+
+   if (p->nr_cpus_allowed == 1)
+   return task_cpu(p);
+
+   if (sd_flags & SD_BALANCE_WAKE) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   

[PATCH] cpuidle: Minor optimization in cpuidle governor finding.

2014-09-26 Thread Rakib Mullick
 Currently in __cpuidle_find_governor(), strnicmp() is used to find
which governor (menu or ladder) to register, with CPUIDLE_NAME_LEN
passed as strnicmp()'s maximum comparison length. But the strings
"menu" and "ladder" are much shorter than CPUIDLE_NAME_LEN. To avoid
this, the patch introduces a len field in struct cpuidle_governor,
statically initialized to the length of the governor name, and later
passes it as strnicmp()'s length parameter. Another advantage this
patch introduces: by checking the governor name length before calling
strnicmp(), we can determine whether strnicmp() needs to be called at
all, i.e. a shortcut.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---

diff --git a/drivers/cpuidle/governor.c b/drivers/cpuidle/governor.c
index ca89412..f601cc5 100644
--- a/drivers/cpuidle/governor.c
+++ b/drivers/cpuidle/governor.c
@@ -20,16 +20,21 @@ struct cpuidle_governor *cpuidle_curr_governor;
 /**
  * __cpuidle_find_governor - finds a governor of the specified name
  * @str: the name
+ * @size: length of str
  *
  * Must be called with cpuidle_lock acquired.
  */
-static struct cpuidle_governor * __cpuidle_find_governor(const char *str)
+static struct cpuidle_governor *__cpuidle_find_governor(const char *str,
+   size_t size)
 {
struct cpuidle_governor *gov;
 
-   list_for_each_entry(gov, &cpuidle_governors, governor_list)
-   if (!strnicmp(str, gov->name, CPUIDLE_NAME_LEN))
+   list_for_each_entry(gov, &cpuidle_governors, governor_list) {
+   if (size != gov->len)
+   continue;
+   if (!strnicmp(str, gov->name, gov->len))
return gov;
+   }
 
return NULL;
 }
@@ -85,7 +90,7 @@ int cpuidle_register_governor(struct cpuidle_governor *gov)
return -ENODEV;
 
	mutex_lock(&cpuidle_lock);
-   if (__cpuidle_find_governor(gov->name) == NULL) {
+   if (__cpuidle_find_governor(gov->name, gov->len) == NULL) {
ret = 0;
		list_add_tail(&gov->governor_list, &cpuidle_governors);
if (!cpuidle_curr_governor ||
diff --git a/drivers/cpuidle/governors/ladder.c 
b/drivers/cpuidle/governors/ladder.c
index 044ee0d..91aff9c 100644
--- a/drivers/cpuidle/governors/ladder.c
+++ b/drivers/cpuidle/governors/ladder.c
@@ -177,6 +177,7 @@ static void ladder_reflect(struct cpuidle_device *dev, int 
index)
 
 static struct cpuidle_governor ladder_governor = {
.name = "ladder",
+   .len =  6,
.rating =   10,
.enable =   ladder_enable_device,
.select =   ladder_select_state,
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index 34db2fb..23de182 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -477,6 +477,7 @@ static int menu_enable_device(struct cpuidle_driver *drv,
 
 static struct cpuidle_governor menu_governor = {
.name = "menu",
+   .len =  4,
.rating =   20,
.enable =   menu_enable_device,
.select =   menu_select,
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 25e0df6..bb9b5bd 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -198,6 +198,7 @@ struct cpuidle_governor {
charname[CPUIDLE_NAME_LEN];
struct list_headgovernor_list;
unsigned intrating;
+   size_t  len;
 
int  (*enable)  (struct cpuidle_driver *drv,
struct cpuidle_device *dev);
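
A standalone sketch of the length-check shortcut this patch describes (user-space C; strncasecmp() stands in for the kernel's strnicmp(), and the names below are hypothetical): the string comparison is skipped entirely whenever the cached name length differs from the length of the string being looked up.

#include <stdio.h>
#include <string.h>
#include <strings.h>	/* strncasecmp(), standing in for strnicmp() */

struct governor {
	const char *name;
	size_t len;		/* cached strlen(name), like the patch's ->len field */
};

static struct governor governors[] = {
	{ "ladder", 6 },
	{ "menu",   4 },
};

static struct governor *find_governor(const char *str, size_t size)
{
	for (size_t i = 0; i < sizeof(governors) / sizeof(governors[0]); i++) {
		if (size != governors[i].len)
			continue;	/* lengths differ: skip strncasecmp() entirely */
		if (!strncasecmp(str, governors[i].name, governors[i].len))
			return &governors[i];
	}
	return NULL;
}

int main(void)
{
	struct governor *gov = find_governor("MENU", strlen("MENU"));

	printf("found: %s\n", gov ? gov->name : "(none)");
	return 0;
}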




[PATCH] x86: Avoid showing repetitive message from intel_init_thermal().

2014-09-18 Thread Rakib Mullick
 intel_init_thermal() is called a) at system initialization time and
b) at system resume time, to initialize thermal monitoring. When
thermal monitoring is handled by SMI, we learn about it via printk().
Currently the message is printed in both cases, but it is enough to
see it only once; there is no need to repeat the same message every
time the system resumes. So, limit this message to system boot time
only, avoiding it on resume and reducing kernel log buffer noise.

Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>
---

diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c 
b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 36a1bb6..c8a8e45 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -498,8 +498,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
 

if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
-   printk(KERN_DEBUG
-  "CPU%d: Thermal monitoring handled by SMI\n", cpu);
+   if (system_state == SYSTEM_BOOTING)
+   printk(KERN_DEBUG
+   "CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
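
A toy user-space rendering of the guard this hunk adds (the enum and the system_state variable below are hypothetical stand-ins, not the kernel's definitions): the message is emitted only while the system is booting, so later resume paths stay silent.

#include <stdio.h>

enum system_state { SYSTEM_BOOTING, SYSTEM_RUNNING };
static enum system_state system_state = SYSTEM_BOOTING;

static void thermal_init(int cpu, int handled_by_smi)
{
	if (!handled_by_smi)
		return;
	/* Print only during boot; stay quiet on every later resume. */
	if (system_state == SYSTEM_BOOTING)
		printf("CPU%d: Thermal monitoring handled by SMI\n", cpu);
}

int main(void)
{
	thermal_init(0, 1);		/* boot: message printed once */
	system_state = SYSTEM_RUNNING;
	thermal_init(0, 1);		/* resume: silent */
	return 0;
}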
 





ANNOUNCE: BLD-3.16 release.

2014-08-07 Thread Rakib Mullick
Hello everyone,

It's been quite a long time since the announcement of the Barbershop
Load Distribution (BLD) algorithm. Quite a few changes have been made
since then, and now it better reflects what it really should be ;-).
It's a simplistic approach to load balancing; typical x86 SMP boxes
should run okay (tested personally), but yes, it can break your boxes
too. I'm looking forward to getting some feedback, to keep further
development going.

(This patch is made for kernel version 3.16.)

Thanks,
Rakib
---

 Signed-off-by: Rakib Mullick <rakib.mull...@gmail.com>

diff --git a/init/Kconfig b/init/Kconfig
index 9d76b99..847f34d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -36,6 +36,15 @@ config BROKEN_ON_SMP
depends on BROKEN || !SMP
default y
 
+config BLD
+   bool "An alternate CPU load distribution technique for task scheduler"
+   depends on SMP
+   default y
+   help
+ This is an alternate CPU load distribution technique based for task
+ scheduler based on The Barbershop Load Distribution algorithm. Not
+ suitable for NUMA, should work well on SMP.
+
 config INIT_ENV_ARG_LIMIT
int
default 32 if !UML
diff --git a/kernel/sched/bld.h b/kernel/sched/bld.h
new file mode 100644
index 000..5a067c1
--- /dev/null
+++ b/kernel/sched/bld.h
@@ -0,0 +1,207 @@
+#ifdef CONFIG_BLD
+
+static DEFINE_RWLOCK(rt_list_lock);
+static LIST_HEAD(rt_rq_head);
+static LIST_HEAD(cfs_rq_head);
+static DEFINE_RWLOCK(cfs_list_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+   return cfs_rq->rq;
+}
+#else
+static inline struct rq *rq_of_cfs(struct cfs_rq *cfs_rq)
+{
+   return container_of(cfs_rq, struct rq, cfs);
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+   return rt_rq->rq;
+}
+#else
+static inline struct rq *rq_of_rt(struct rt_rq *rt_rq)
+{
+   return container_of(rt_rq, struct rq, rt);
+}
+#endif
+
+static int select_cpu_for_wakeup(int task_type, struct cpumask *mask)
+{
+   int cpu = smp_processor_id(), i;
+   unsigned long load, min_load = ULONG_MAX;
+   struct rq *rq;
+
+   if (task_type) {
+   for_each_cpu(i, mask) {
+   rq = cpu_rq(i);
+   load = rq->cfs.load.weight;
+   if (load < min_load) {
+   min_load = load;
+   cpu = i;
+   }
+   }
+   } else {
+   min_load = -1;
+
+   for_each_cpu(i, mask) {
+   rq = cpu_rq(i);
+   load = rq->rt.lowbit;
+   if (load > min_load) {
+   min_load = load;
+   cpu = i;
+   }
+   }
+   }
+
+   return cpu;
+}
+
+static int bld_pick_cpu_cfs(struct task_struct *p, int sd_flags, int 
wake_flags)
+{
+   struct cfs_rq *cfs;
+   unsigned long flags;
+   unsigned int cpu = smp_processor_id();
+
+   read_lock_irqsave(&cfs_list_lock, flags);
+   list_for_each_entry(cfs, &cfs_rq_head, bld_cfs_list) {
+   cpu = cpu_of(rq_of_cfs(cfs));
+   if (cpu_online(cpu))
+   break;
+   }
+   read_unlock_irqrestore(&cfs_list_lock, flags);
+   return cpu;
+}
+
+static int bld_pick_cpu_rt(struct task_struct *p, int sd_flags, int wake_flags)
+{
+   struct rt_rq *rt;
+   unsigned long flags;
+   unsigned int cpu = smp_processor_id();
+
+   read_lock_irqsave(&rt_list_lock, flags);
+   list_for_each_entry(rt, &rt_rq_head, bld_rt_list) {
+   cpu = cpu_of(rq_of_rt(rt));
+   if (cpu_online(cpu))
+   break;
+   }
+   read_unlock_irqrestore(&rt_list_lock, flags);
+   return cpu;
+}
+
+static int bld_pick_cpu_domain(struct task_struct *p, int sd_flags, int 
wake_flags)
+{
+   unsigned int cpu = smp_processor_id(), want_affine = 0;
+   struct cpumask *tmpmask;
+
+   if (p->nr_cpus_allowed == 1)
+   return task_cpu(p);
+
+   if (sd_flags & SD_BALANCE_WAKE) {
+   if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+   want_affine = 1;
+   }
+   }
+
+   if (want_affine)
+   tmpmask = tsk_cpus_allowed(p);
+   else
+   tmpmask = sched_domain_span(cpu_rq(task_cpu(p))->sd);
+
+   if (rt_task(p))
+   cpu = select_cpu_for_wakeup(0, tmpmask);
+   else
+   cpu = select_cpu_for_wakeup(1, tmpmask);
+
+   return cpu;
+}
+
+static void track_load_rt(struct rq *rq, struct task_struct *p)
+{
+   unsigned long flag;
+   int firstbit;
+   struct rt_rq *first;
+   struct rt_prio_array *array = &rq->rt.active;
+
+   first = list_entry(rt_rq_head.next, struct rt_rq, bld_rt_list)

Re: Do we really need curr_target in signal_struct ?

2014-02-03 Thread Rakib Mullick
On Mon, Feb 3, 2014 at 10:39 PM, Oleg Nesterov <o...@redhat.com> wrote:
> On 02/02, Rakib Mullick wrote:
>
>> > As I already said it caches the last wants_signal(t) thread?
>> Yes, you said. But, gets messed up at exit path, not useful everytime.
>
> Yes.
>
>> If fails, p gets checked twice.
>
> Yes, but this is minor, I think.
>
Right.

>> >> I took a look and found that using while_each_thread()
>> >> can make things better than current.
>> >
>> > Why?
>> >
>> using while_each_thread() we can start thread traversing from p, which
>> is a likely
>> pick and also gets away from redundant checking of p.
>
> Heh. We always check "p" first. And (in general) we do not want to start
> from "p" if we want to find a wants_signal() thread, and ->curr_target can
> help to avoid this.
>
>> >> What do you think?
>> >
>> > The patch is technically wrong, a group-wide signal doesn't check all
>> > threads after this change.
>> If group is empty, we don't need to check other than t.
>
> I didn't meant the thread_group_empty() case. Please look at your code:
>
>
> if (!group || thread_group_empty(p)) {
> if (wants_signal(sig, t))
> goto found;
> } else {
> while_each_thread(p, t) {
> if (wants_signal(sig, t))
> goto found;
> }
> }
>
> Suppose that group == T, thread_group_empty(p) == F. Suppose that all
> sub-threads except "p" blocked this signal. With this change "p" (and
> thus the whole thread group) won't be notified. IOW, with your change
> we do not check "p" at all. This is wrong.
>
Oh, sorry, my bad. That was wrong.

> The only user of ->curr_target is complete_signal(), you have found it.
>
Indeed.

>
> I can only read the current code. I do not know the original intent.
>
This is where things are confusing.


> Really?
>
Sometimes, 100% correct (!group case) ;-).

>
> Yes (except a thread can't be killed), so what? Obviously, if ->curr_targer
> exits we should update this pointer. We could even nullify it.
>
That makes ->curr_target less useful, that's what I meant.

>
> Yes, "p" can be checked twice. I don't think this is that bad, and I
> do not think this particular "problem" should be fixed.
>
Yes, it's minor.

>
> I simply can't understand. Why? I do not think so.
>
Because the wants_signal() logic checks these thread attributes to
find whether it's eligible or not.

>> We can acheive the same without ->curr_signal
>> by traversing thread group from the lastly created thread.
>
> We certainly can't "achieve the same" this way, although I am not sure
> what this "the same" actually means.
>
>> So, this is what I think. Let me know if these reason's looks reasonable to 
>> you,
>
> No. Contrary, whatever I personally think about ->curr_signal, I feel
> that you do not understand the code you are trying to change. Sorry,
> I can be wrong. But I still do not see any argument.
>
Yes, right. I do not fully understand this code, or exactly how it
impacts the signaling subsystem. Therefore, I think I should not make
any changes in this code.

>> cause before Ingo or Andrew taking it, it requires your ack.
>
> Not really. And of course I'll review the patch correctness-wise, and
> I already sent the change in complete_signal() which looks right to me.
>
> But I am not going to ack the behaviour change, simply because I have
> no idea how this can impact the existing applications. Perhaps nobody
> will notice this change, but we can't know this.
>
Yes, I'm also not sure about the behavior change and its impact on
existing applications, so I'm skipping it.

I usually try to make small fixes and cleanups, because they're less
error-prone and require less follow-up. Since things here are becoming
a sort of "don't know" area, I think I should stop. But thank you for
helping and replying in this thread.

Again thanks,
Rakib.


Re: Do we really need curr_target in signal_struct ?

2014-02-02 Thread Rakib Mullick
On Sat, Feb 1, 2014 at 10:51 PM, Oleg Nesterov <o...@redhat.com> wrote:
> On 02/01, Rakib Mullick wrote:
>>
>> On Thu, Jan 30, 2014 at 1:02 PM, Rakib Mullick  
>> wrote:
>> > On Thu, Jan 30, 2014 at 12:32 AM, Oleg Nesterov  wrote:
>> >> On 01/29, Rakib Mullick wrote:
>> >
>> >>> Are you thinking that , since things are not broken, then we shouldn't
>> >>> try to do anything?
>> >>
>> >> Hmm. No.
>> >>
>> >> I am thinking that, since you misunderstood the purpose of ->curr_target,
>> >> I should probably try to argue with your patch which blindly removes this
>> >> optimization ?
>> >>
>> > Since the optimization (usages of ->curr_target) isn't perfect, so there's
>> > chance of being misunderstood. This optimization is misleading too (I 
>> > think),
>> > cause curr_target don't have anything to do with wants_signal()
>
> can't understand... curr_target is obviously connected to wants_signal() ?
No. I meant that curr_target doesn't make sure the thread really wants
the signal; it's a likely choice.

> As I already said it caches the last wants_signal(t) thread?
Yes, you said. But it gets messed up in the exit path, so it's not
useful every time. If it fails, p gets checked twice.

>> > and
>> >  ->curr_target is used only for this optimization and to get this 
>> > optimization
>> > needs to maintain it properly, and this maintenance does have cost and if
>> > we don't get benefited too much, then it doesn't worth it (my pov).
>
> but you need to prove this somehow.
>
Right, I'll try, I'll do it as per my understanding and everyone might
not agree.

>> I took a look and found that using while_each_thread()
>> can make things better than current.
>
> Why?
>
Using while_each_thread() we can start the thread traversal from p, which is
a likely pick, and it also gets away from redundantly checking p.
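
Roughly what I have in mind (an illustrative fragment, not the actual patch;
it assumes while_each_thread(g, t) advances t via next_thread() and stops once
it wraps back around to g):

	t = p;
	while_each_thread(p, t) {		/* visits every thread except p */
		if (wants_signal(sig, t))
			goto found;		/* first eligible sibling */
	}
	/* no other thread wants it; nothing to wake */
	return;
found:
	/* wake 't' and continue with the fatal-signal handling */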

>> What do you think?
>
> The patch is technically wrong, a group-wide signal doesn't check all
> threads after this change.
If the group is empty, we don't need to check anything other than t.

> And I do not understand why complete_signal()
> still updates ->curr_target.
Didn't want to remove it blindly :-/

> Plus thread_group_empty() doesn't buy too
> much if we use while_each_thread(). But this doesn't really matter, easy
> to fix.
>
> Rakib. It is not that I like ->curr_target very much. But let me repeat,
> if you want to remove this optimization you need to explain why it doesn't
> make sense. You claim that this "can make things better" without any
> argument.
>
Well, I had another way of looking at it; I didn't find any real usage
of ->curr_target and got confused by the code comment above curr_target.

> As for me - I simply do not know. This logic is very old, I am not even
> sure that the current usage of ->curr_signal matches the original intent.
> But it can obviously help in some cases, and you need to explain why we
> do not care.
>
Actually, this is exactly what I wanted to know. What is the purpose
of ->curr_target, *do we really need it?* If I knew or had seen any reason
I'd never have asked this question. You guys know this domain better than
me, that's why I asked. The git log didn't help much, and neither did its
documentation. As a result, I asked and thought about cleaning it up (as I
rarely do, if it meets my capability of course), so I appended a sort of rough patch.

> So I won't argue if you submit the technically correct patch, but you
> need to convince Andrew or Ingo to take it. I think the right change in
> complete_signal() is something like below. It can be even simpler and use
> the single do/while loop, but then we need to check "group" inside that
> loop. With the change below ->curr_target can be simply removed.
>
I'll try to make points to convince Andrew or Ingo; I'll lay out my
points, and the rest will be up to them.

And here's what I think about ->curr_target removal (as per my understanding):

 a) ->curr_target might only be useful in the group-wide case.
 b) Usually signals are sent to a particular thread, so ->curr_target
doesn't make sense there.
 c) If the thread ->curr_target points to is killed, curr_target is moved to
the next thread, but nothing makes sure that it's the right pick.
 d) Also, in the current implementation p is checked twice.
 e) One use case of ->curr_target is that it always points to the most
recently created thread, which is a better candidate for wants_signal()
because a newly created thread doesn't usually have any signal pending. We
can achieve the same without ->curr_target by traversing the thread group
from the most recently created thread (a rough sketch follows below).
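
A rough sketch of what I mean in (e) -- hypothetical, not a tested patch. It
assumes threads sit on signal->thread_head in creation order (copy_process()
adds them with list_add_tail_rcu()), so walking that list backwards starts at
the most recently created thread; newest_eligible_thread() is just an
illustrative name, and locking is whatever complete_signal() already holds:

static struct task_struct *newest_eligible_thread(int sig, struct task_struct *p)
{
	struct task_struct *t;

	/* newest thread first: reverse walk of the thread-group list */
	list_for_each_entry_reverse(t, &p->signal->thread_head, thread_node)
		if (wants_signal(sig, t))
			return t;

	return NULL;	/* nobody wants this signal right now */
}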

So, this is what I think. Let me know if these reasons look reasonable to you,
because before Ingo or Andrew take it, it requires your ack.

Thanks,
Rakib.


Re: Do we really need curr_target in signal_struct ?

2014-01-31 Thread Rakib Mullick
On Thu, Jan 30, 2014 at 1:02 PM, Rakib Mullick  wrote:
> On Thu, Jan 30, 2014 at 12:32 AM, Oleg Nesterov  wrote:
>> On 01/29, Rakib Mullick wrote:
>
>>> Are you thinking that , since things are not broken, then we shouldn't
>>> try to do anything?
>>
>> Hmm. No.
>>
>> I am thinking that, since you misunderstood the purpose of ->curr_target,
>> I should probably try to argue with your patch which blindly removes this
>> optimization ?
>>
> Since the optimization (usages of ->curr_target) isn't perfect, so there's
> chance of being misunderstood. This optimization is misleading too (I think),
> cause curr_target don't have anything to do with wants_signal() and
>  ->curr_target is used only for this optimization and to get this optimization
> needs to maintain it properly, and this maintenance does have cost and if
> we don't get benefited too much, then it doesn't worth it (my pov).
>
>> I also think that this logic doesn't look perfect, but this is another
>> story.
>
> Yes, this logic seems need to improve.
>
As you've made a few points about the current thread-picking logic in
complete_signal(), I took a look and found that using while_each_thread()
can make things better than the current code. Although my initial intent in
this thread wasn't related to the complete_signal() logic, since you've pointed
out that things could be better, I prepared the attached patch (addressing only
complete_signal()'s thread-finding logic); it doesn't use ->curr_target, but the
field has been kept. What do you think?


Thanks,
Rakib
diff --git a/kernel/signal.c b/kernel/signal.c
index 52f881d..064f81f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -944,7 +944,7 @@ static inline int wants_signal(int sig, struct task_struct *p)
 static void complete_signal(int sig, struct task_struct *p, int group)
 {
 	struct signal_struct *signal = p->signal;
-	struct task_struct *t;
+	struct task_struct *t = p;
 
 	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
@@ -952,33 +952,26 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	 * If the main thread wants the signal, it gets first crack.
 	 * Probably the least surprising to the average bear.
 	 */
-	if (wants_signal(sig, p))
-		t = p;
-	else if (!group || thread_group_empty(p))
-		/*
-		 * There is just one thread and it does not need to be woken.
-		 * It will dequeue unblocked signals before it runs again.
-		 */
-		return;
-	else {
-		/*
-		 * Otherwise try to find a suitable thread.
-		 */
-		t = signal->curr_target;
-		while (!wants_signal(sig, t)) {
-			t = next_thread(t);
-			if (t == signal->curr_target)
-				/*
-				 * No thread needs to be woken.
-				 * Any eligible threads will see
-				 * the signal in the queue soon.
-				 */
-				return;
+	if (!group || thread_group_empty(p)) {
+		if (wants_signal(sig, t))
+			goto found;
+	} else {
+		while_each_thread(p, t) {
+			if (wants_signal(sig, t))
+				goto found;
 		}
-		signal->curr_target = t;
 	}
 
 	/*
+	 * No thread needs to be woken.
+	 * Any eligible threads will see
+	 * the signal in the queue soon.
+	 */
+	return;
+found:
+	signal->curr_target = t;
+
+	/*
 	 * Found a killable thread.  If the signal will be fatal,
 	 * then start taking the whole group down immediately.
 	 */


Re: Do we really need curr_target in signal_struct ?

2014-01-29 Thread Rakib Mullick
On Thu, Jan 30, 2014 at 12:32 AM, Oleg Nesterov  wrote:
> On 01/29, Rakib Mullick wrote:

>> Are you thinking that , since things are not broken, then we shouldn't
>> try to do anything?
>
> Hmm. No.
>
> I am thinking that, since you misunderstood the purpose of ->curr_target,
> I should probably try to argue with your patch which blindly removes this
> optimization ?
>
Since the optimization (the usage of ->curr_target) isn't perfect, there's a
chance of it being misunderstood. This optimization is misleading too (I think),
because curr_target doesn't have anything to do with wants_signal(), and
->curr_target is used only for this optimization; to get this optimization we
need to maintain it properly, and that maintenance does have a cost. If we
don't benefit much from it, then it isn't worth it (my pov).
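
(The maintenance in question is small but real -- roughly these two spots,
the same ones touched by the rough patch earlier in this thread:)

	/* kernel/fork.c: copy_signal() */
	sig->curr_target = tsk;

	/* kernel/exit.c: __exit_signal() */
	if (tsk == sig->curr_target)
		sig->curr_target = next_thread(tsk);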

> I also think that this logic doesn't look perfect, but this is another
> story.

Yes, this logic seems to need improvement.

Thanks,
Rakib


Re: Do we really need curr_target in signal_struct ?

2014-01-29 Thread Rakib Mullick
On Wed, Jan 29, 2014 at 8:55 PM, Oleg Nesterov  wrote:
> On 01/29, Rakib Mullick wrote:
>>
>> On Tue, Jan 28, 2014 at 10:43 PM, Oleg Nesterov  wrote:
>>
>
>> AFAIU, ->current_target is only a loop breaker to avoid infinite loop,
>
> No. It caches the last result of "find a thread which can handle this
> group-wide signal".
>
The reason behind my understanding is the following comment:

/*
 * No thread needs to be woken.
 * Any eligible threads will see
 * the signal in the queue soon.
 */

What if there's no thread in the group that wants_signal()? Or can that not
practically happen?

>> but - by using while_each_thread() we can remove it completely, thus
>> helps to get rid from maintaining it too.
>
> ... and remove the optimization above.
>
>> I'll prepare a proper patch with you suggestions for reviewing.
>
> I am not sure we want this patch. Once again, I do not know how much
> ->curr_target helps, and certainaly it can't help always. But you
> should not blindly remove it just because yes, sure, it is not strictly
> needed to find a wants_signal() thread.
>
Are you thinking that, since things are not broken, we shouldn't
try to do anything?

Thanks,
Rakib


Re: Do we really need curr_target in signal_struct ?

2014-01-29 Thread Rakib Mullick
On Wed, Jan 29, 2014 at 8:55 PM, Oleg Nesterov o...@redhat.com wrote:
 On 01/29, Rakib Mullick wrote:

 On Tue, Jan 28, 2014 at 10:43 PM, Oleg Nesterov o...@redhat.com wrote:


 AFAIU, -current_target is only a loop breaker to avoid infinite loop,

 No. It caches the last result of find a thread which can handle this
 group-wide signal.

The reason behind of my understanding is the following comments:

/*
 * No thread needs to be woken.
 * Any eligible threads will see
 * the signal in the queue soon.
 */

What if, there's no thread in a group wants_signal()? Or it can't
practically happen?

 but - by using while_each_thread() we can remove it completely, thus
 helps to get rid from maintaining it too.

 ... and remove the optimization above.

 I'll prepare a proper patch with you suggestions for reviewing.

 I am not sure we want this patch. Once again, I do not know how much
 -curr_target helps, and certainaly it can't help always. But you
 should not blindly remove it just because yes, sure, it is not strictly
 needed to find a wants_signal() thread.

Are you thinking that , since things are not broken, then we shouldn't
try to do anything?

Thanks,
Rakib
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Do we really need curr_target in signal_struct ?

2014-01-29 Thread Rakib Mullick
On Thu, Jan 30, 2014 at 12:32 AM, Oleg Nesterov o...@redhat.com wrote:
 On 01/29, Rakib Mullick wrote:

 Are you thinking that , since things are not broken, then we shouldn't
 try to do anything?

 Hmm. No.

 I am thinking that, since you misunderstood the purpose of -curr_target,
 I should probably try to argue with your patch which blindly removes this
 optimization ?

Since the optimization (usages of -curr_target) isn't perfect, so there's
chance of being misunderstood. This optimization is misleading too (I think),
cause curr_target don't have anything to do with wants_signal() and
 -curr_target is used only for this optimization and to get this optimization
needs to maintain it properly, and this maintenance does have cost and if
we don't get benefited too much, then it doesn't worth it (my pov).

 I also think that this logic doesn't look perfect, but this is another
 story.

Yes, this logic seems need to improve.

Thanks,
Rakib
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Do we really need curr_target in signal_struct ?

2014-01-28 Thread Rakib Mullick
On Wed, Jan 29, 2014 at 10:09 AM, Rakib Mullick  wrote:
> On Tue, Jan 28, 2014 at 10:43 PM, Oleg Nesterov  wrote:
>> On 01/28, Rakib Mullick wrote:
>>
>> You could simply do while_each_thread(p, t) to find a thread which
>> wants_signal(..).
>>
> Yes, while_each_thread() is much nicer than get_nr_thread(), thanks for
> the pointer.
>
Or should we use for_each_thread()? I just noticed that you plan to remove
while_each_thread().

Thanks,
Rakib


Re: Do we really need curr_target in signal_struct ?

2014-01-28 Thread Rakib Mullick
On Tue, Jan 28, 2014 at 10:43 PM, Oleg Nesterov  wrote:
> On 01/28, Rakib Mullick wrote:
>
> You could simply do while_each_thread(p, t) to find a thread which
> wants_signal(..).
>
Yes, while_each_thread() is much nicer than get_nr_threads(), thanks for
the pointer.

> But I guess ->curr_target was added exactly to avoid this loop if
> possible, assuming that wants_signal(->current_targer) should be
> likely true. Although perhaps this optimization is too simple.
>
Well, this code block will only be hit when the first check of wants_signal()
misses, which means we need to find some other thread in the group.
AFAIU, ->curr_target is only a loop breaker to avoid an infinite loop,
but by using while_each_thread() we can remove it completely, which also
gets rid of the need to maintain it.
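
For reference, the macro is just a circular walk over the thread group
(roughly as defined in include/linux/sched.h at the time), so the thread you
start from doubles as the loop terminator:

#define while_each_thread(g, t) \
	while ((t = next_thread(t)) != g)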

I'll prepare a proper patch with your suggestions for review.

Thanks,
Rakib


Do we really need curr_target in signal_struct ?

2014-01-28 Thread Rakib Mullick
I was wondering what might be the possible use of curr_target in signal_struct
(at least, it's not doing what its comment says). Also, I'm not seeing any real
use of it except in kernel/signal.c::complete_signal(), where it's used as the
loop-breaking condition in the thread-list traversal. As an alternative to
curr_target we can use the get_nr_threads() count to get the number of threads
in a group and remove curr_target completely. This will also help us get rid of
the overhead of maintaining ->curr_target in the fork()/exit() paths. Below is a
rough patch; I can prepare a proper one if everyone agrees.

Or do we really need it?

---
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 485234d..1fd65b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -554,7 +554,7 @@ struct signal_struct {
wait_queue_head_t   wait_chldexit;  /* for wait4() */
 
/* current thread group signal load-balancing target: */
-   struct task_struct  *curr_target;
+   //struct task_struct*curr_target;
 
/* shared signal handling: */
struct sigpending   shared_pending;
diff --git a/kernel/exit.c b/kernel/exit.c
index 1e77fc6..4a2cf37 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -113,8 +113,8 @@ static void __exit_signal(struct task_struct *tsk)
if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);
 
-   if (tsk == sig->curr_target)
-   sig->curr_target = next_thread(tsk);
+   //if (tsk == sig->curr_target)
+   //  sig->curr_target = next_thread(tsk);
/*
 * Accumulate here the counters for all threads but the
 * group leader as they die, so they can be added into
diff --git a/kernel/fork.c b/kernel/fork.c
index 2f11bbe..8de4928 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1041,7 +1041,7 @@ static int copy_signal(unsigned long clone_flags, struct 
task_struct *tsk)
tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head);
 
init_waitqueue_head(&sig->wait_chldexit);
-   sig->curr_target = tsk;
+   //sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 940b30e..1a1280a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -945,6 +945,7 @@ static void complete_signal(int sig, struct task_struct *p, 
int group)
 {
struct signal_struct *signal = p->signal;
struct task_struct *t;
+   int i;
 
/*
 * Now find a thread we can wake up to take the signal off the queue.
@@ -961,21 +962,16 @@ static void complete_signal(int sig, struct task_struct 
*p, int group)
 */
return;
else {
-   /*
-* Otherwise try to find a suitable thread.
-*/
-   t = signal->curr_target;
-   while (!wants_signal(sig, t)) {
+   i = get_nr_threads(p);
+   t = p;
+   do {
+   --i;
t = next_thread(t);
-   if (t == signal->curr_target)
-   /*
-* No thread needs to be woken.
-* Any eligible threads will see
-* the signal in the queue soon.
-*/
+   if (!i)
return;
-   }
-   signal->curr_target = t;
+   } while (!wants_signal(sig, t));
+   
+   //signal->curr_target = t;
}
 
/*




Re: [PATCH v2] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-24 Thread Rakib Mullick
On Wed, Jul 24, 2013 at 2:34 PM, Michael Wang
 wrote:
> On 07/24/2013 04:01 PM, Rakib Mullick wrote:
>> On Wed, Jul 24, 2013 at 9:26 AM, Michael Wang
>>  wrote:
>>> Hi, Rakib
>>>
>>> On 07/24/2013 01:42 AM, Rakib Mullick wrote:
>>>> Currently, update_top_cache_domain() is called whenever schedule domain is 
>>>> built or destroyed. But, the following
>>>> callpath shows that they're at the same callpath and can be avoided 
>>>> update_top_cache_domain() while destroying schedule
>>>> domain and update only at the times of building schedule domains.
>>>>
>>>>   partition_sched_domains()
>>>>   detach_destroy_domain()
>>>>   cpu_attach_domain()
>>>>   update_top_cache_domain()
>>>
>>> IMHO, cpu_attach_domain() and update_top_cache_domain() should be
>>> paired, below patch will open a window which 'rq->sd == NULL' while
>>> 'sd_llc != NULL', isn't it?
>>>
>>> I don't think we have the promise that before we rebuild the stuff
>>> correctly, no one will utilize 'sd_llc'...
>>>
>> I never said it. My point is different.  partition_sched_domain works as -
>>
>>   - destroying existing schedule domain (if previous domain and new
>> domain aren't same)
>>   - building new partition
>>
>> while doing the first it needs to detach all the cpus on that domain.
>> By detaching what it does,
>> it fall backs to it's root default domain. In this context (which i've
>> proposed to skip), by means
>> of updating top cache domain it takes the highest flag domain to setup
>> it's sd_llc_id or cpu itself.
>>
>> Whatever done above gets overwritten (updating top cache domain),
>> while building new partition.
>> Then, why we did that before? Hope you understand my point.
>
> I think you missed this in PeterZ's suggestion:
>
> -   cpu_attach_domain(NULL, &def_root_domain, i);
>
> With this change, it will be safe since you will still get an un-freed
> sd, although it's an old one.
>
I never meant it and clearly I missed it. If you remove
cpu_attach_domain(), then detach_destroy_domain() becomes meaningless.
And I don't have any intent to remove cpu_attach_domain from
detach_destroy_domain() at all.

> But your patch will run the risk to get a freed sd, since you make
> 'sd_llc' wrong for a period of time (between destroy and rebuild) IMO.
>
Building 'sd_llc' depends on the sched domain. If you don't have an sd, sd_llc
will point to NULL and sd_llc_id is the CPU itself. Since we're trying to
re-construct it, for the time being it doesn't matter, because we're building
it again. Now, please just note what you're saying; in the last mail you said:

   "I don't think we have the promise that before we rebuild the stuff
correctly, no one will utilize 'sd_llc'..."

If that is the case, then we shouldn't worry about it at all. And the two
comments above (the one from the previous mail I've quoted and the one in this
mail I'm replying to) are just self-contradictory.
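
For reference, the function in question is tiny; roughly (quoted from memory
from kernel/sched/core.c of that era, so a sketch rather than the exact
source) it does just what I describe above -- with a NULL sd, sd_llc becomes
NULL and sd_llc_id falls back to the CPU number:

static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	int id = cpu;

	/* highest domain still sharing the package (LLC); NULL when detached */
	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd)
		id = cpumask_first(sched_domain_span(sd));

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_id, cpu) = id;
}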

> I guess I get you point, you are trying to save one time update since
> you think this will be done twice, but actually the result of this two
> time update was different, it's not redo and it's in order to sync
> 'sd_llc' with 'rq->sd'.
>
Yes, you got my point now, but I don't understand your points. Anyway, I'm not
going to argue about this anymore; this stuff isn't much of an issue, but
removing this sort of thing is typical in kernel development.

Thanks,
Rakib.


Re: [PATCH v2] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-24 Thread Rakib Mullick
On Wed, Jul 24, 2013 at 9:26 AM, Michael Wang
 wrote:
> Hi, Rakib
>
> On 07/24/2013 01:42 AM, Rakib Mullick wrote:
>> Currently, update_top_cache_domain() is called whenever schedule domain is 
>> built or destroyed. But, the following
>> callpath shows that they're at the same callpath and can be avoided 
>> update_top_cache_domain() while destroying schedule
>> domain and update only at the times of building schedule domains.
>>
>>   partition_sched_domains()
>>   detach_destroy_domain()
>>   cpu_attach_domain()
>>   update_top_cache_domain()
>
> IMHO, cpu_attach_domain() and update_top_cache_domain() should be
> paired, below patch will open a window which 'rq->sd == NULL' while
> 'sd_llc != NULL', isn't it?
>
> I don't think we have the promise that before we rebuild the stuff
> correctly, no one will utilize 'sd_llc'...
>
I never said that. My point is different. partition_sched_domains() works as:

- destroying the existing sched domain (if the previous domain and the new
domain aren't the same)
- building the new partition

While doing the first, it needs to detach all the CPUs in that domain. By
detaching, each CPU falls back to the default root domain. In this context
(which I've proposed to skip), updating the top cache domain picks the
highest-flag domain to set up sd_llc_id, or the CPU itself.

Whatever is done there gets overwritten (the top cache domain is updated again)
while building the new partition. Then why did we do it before? Hope you
understand my point.

Thanks,
Rakib.


[PATCH v2] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
Currently, update_top_cache_domain() is called whenever a sched domain is
built or destroyed. But the following callpath shows that both happen on the
same callpath, so the update_top_cache_domain() call can be skipped while
destroying a sched domain and done only when building sched domains.

partition_sched_domains()
detach_destroy_domain()
cpu_attach_domain()
update_top_cache_domain()
build_sched_domains()
cpu_attach_domain()
update_top_cache_domain()

Changes since v1: use sd to determine when to skip, courtesy PeterZ

Signed-off-by: Rakib Mullick 
---

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb..387fb66 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5138,7 +5138,8 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
 
-   update_top_cache_domain(cpu);
+   if (sd)
+   update_top_cache_domain(cpu);
 }
 
 /* cpus with isolated domains */





Re: [PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
On Tue, Jul 23, 2013 at 10:53 PM, Peter Zijlstra  wrote:
> On Tue, Jul 23, 2013 at 10:20:20PM +0600, Rakib Mullick wrote:
>
>> You mean using sd == NULL rather than using update_cache_domain variable ?
>
> Yes, note how:
>
> @@ -6109,7 +6110,7 @@ static void detach_destroy_domains(const struct cpumask 
> *cpu_map)
>
> rcu_read_lock();
> for_each_cpu(i, cpu_map)
> -   cpu_attach_domain(NULL, &def_root_domain, i);
> rcu_read_unlock();
>  }
>
> Always has NULL for sd? Which means you can do:
>
> @@ -5138,7 +5138,8 @@ cpu_attach_domain(struct sched_domain *sd, struct 
> root_domain *rd, int cpu)
> rcu_assign_pointer(rq->sd, sd);
> destroy_sched_domains(tmp, cpu);
>
> -   update_top_cache_domain(cpu);
> +   if (sd)
> +   update_top_cache_domain(cpu);
>  }
>
>  /* cpus with isolated domains */

Okay, got it. Will submit an updated patch!

Thanks
Rakib.


Re: [PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
On Tue, Jul 23, 2013 at 10:09 PM, Rakib Mullick  wrote:
> On Tue, Jul 23, 2013 at 9:47 PM, Peter Zijlstra  wrote:
>> On Tue, Jul 23, 2013 at 09:44:17PM +0600, Rakib Mullick wrote:
>>>  Currently, update_top_cache_domain() is called whenever schedule domain is 
>>> built or destroyed. But, the following
>>> callpath shows that they're at the same callpath and can be avoided 
>>> update_top_cache_domain() while destroying schedule
>>> domain and update only at the times of building schedule domains.
>>>
>>>   partition_sched_domains()
>>>   detach_destroy_domain()
>>>   cpu_attach_domain()
>>>   update_top_cache_domain()
>>>   build_sched_domains()
>>>   cpu_attach_domain()
>>>   update_top_cache_domain()
>>>
>>
>> Does it really matter?
>
> Why should we do it twice? More importantly at the time of destroying
> even though we know it'll be built again just few moment later.
>
>>
>>
>> This just makes the code uglier for no gain afaict.
>>
>> If you really need to do this, key off @sd == NULL or something.
>
> Sorry, would you please a bit more clearer? I can't pick what you're 
> suggesting.

You mean using sd == NULL rather than using the update_cache_domain variable?


Re: [PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
On Tue, Jul 23, 2013 at 9:47 PM, Peter Zijlstra  wrote:
> On Tue, Jul 23, 2013 at 09:44:17PM +0600, Rakib Mullick wrote:
>>  Currently, update_top_cache_domain() is called whenever schedule domain is 
>> built or destroyed. But, the following
>> callpath shows that they're at the same callpath and can be avoided 
>> update_top_cache_domain() while destroying schedule
>> domain and update only at the times of building schedule domains.
>>
>>   partition_sched_domains()
>>   detach_destroy_domain()
>>   cpu_attach_domain()
>>   update_top_cache_domain()
>>   build_sched_domains()
>>   cpu_attach_domain()
>>   update_top_cache_domain()
>>
>
> Does it really matter?

Why should we do it twice? More importantly, why do it at destroy time, even
though we know it'll be built again just a few moments later?

>
>
> This just makes the code uglier for no gain afaict.
>
> If you really need to do this, key off @sd == NULL or something.

Sorry, would you please be a bit clearer? I can't pick up what you're suggesting.


[PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
 Currently, update_top_cache_domain() is called whenever a sched domain is
built or destroyed. But the following callpath shows that both happen on the
same callpath, so the update_top_cache_domain() call can be skipped while
destroying a sched domain and done only when building sched domains.

partition_sched_domains()
detach_destroy_domain()
cpu_attach_domain()
update_top_cache_domain()
build_sched_domains()
cpu_attach_domain()
update_top_cache_domain()



Signed-off-by: Rakib Mullick 
---

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb..8c6fee4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5102,8 +5102,8 @@ static void update_top_cache_domain(int cpu)
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd,
+   int cpu, bool update_cache_domain)
 {
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -5138,7 +5138,8 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
rcu_assign_pointer(rq->sd, sd);
destroy_sched_domains(tmp, cpu);
 
-   update_top_cache_domain(cpu);
+   if (update_cache_domain)
+   update_top_cache_domain(cpu);
 }
 
 /* cpus with isolated domains */
@@ -6021,7 +6022,7 @@ static int build_sched_domains(const struct cpumask 
*cpu_map,
rcu_read_lock();
for_each_cpu(i, cpu_map) {
sd = *per_cpu_ptr(d.sd, i);
-   cpu_attach_domain(sd, d.rd, i);
+   cpu_attach_domain(sd, d.rd, i, 1);
}
rcu_read_unlock();
 
@@ -6109,7 +6110,7 @@ static void detach_destroy_domains(const struct cpumask 
*cpu_map)
 
rcu_read_lock();
for_each_cpu(i, cpu_map)
-   cpu_attach_domain(NULL, &def_root_domain, i);
+   cpu_attach_domain(NULL, &def_root_domain, i, 0);
rcu_read_unlock();
 }
 




[PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
 Currently, update_top_cache_domain() is called whenever schedule domain is 
built or destroyed. But, the following
callpath shows that they're at the same callpath and can be avoided 
update_top_cache_domain() while destroying schedule
domain and update only at the times of building schedule domains.

partition_sched_domains()
detach_destroy_domain()
cpu_attach_domain()
update_top_cache_domain()
build_sched_domains()
cpu_attach_domain()
update_top_cache_domain()



Signed-off-by: Rakib Mullick rakib.mull...@gmail.com
---

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb..8c6fee4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5102,8 +5102,8 @@ static void update_top_cache_domain(int cpu)
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  */
-static void
-cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd,
+   int cpu, bool update_cache_domain)
 {
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -5138,7 +5138,8 @@ cpu_attach_domain(struct sched_domain *sd, struct 
root_domain *rd, int cpu)
rcu_assign_pointer(rq-sd, sd);
destroy_sched_domains(tmp, cpu);
 
-   update_top_cache_domain(cpu);
+   if (update_cache_domain)
+   update_top_cache_domain(cpu);
 }
 
 /* cpus with isolated domains */
@@ -6021,7 +6022,7 @@ static int build_sched_domains(const struct cpumask 
*cpu_map,
rcu_read_lock();
for_each_cpu(i, cpu_map) {
sd = *per_cpu_ptr(d.sd, i);
-   cpu_attach_domain(sd, d.rd, i);
+   cpu_attach_domain(sd, d.rd, i, 1);
}
rcu_read_unlock();
 
@@ -6109,7 +6110,7 @@ static void detach_destroy_domains(const struct cpumask 
*cpu_map)
 
rcu_read_lock();
for_each_cpu(i, cpu_map)
-   cpu_attach_domain(NULL, def_root_domain, i);
+   cpu_attach_domain(NULL, def_root_domain, i, 0);
rcu_read_unlock();
 }
 


--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
On Tue, Jul 23, 2013 at 9:47 PM, Peter Zijlstra pet...@infradead.org wrote:
 On Tue, Jul 23, 2013 at 09:44:17PM +0600, Rakib Mullick wrote:
  Currently, update_top_cache_domain() is called whenever schedule domain is 
 built or destroyed. But, the following
 callpath shows that they're at the same callpath and can be avoided 
 update_top_cache_domain() while destroying schedule
 domain and update only at the times of building schedule domains.

   partition_sched_domains()
   detach_destroy_domain()
   cpu_attach_domain()
   update_top_cache_domain()
   build_sched_domains()
   cpu_attach_domain()
   update_top_cache_domain()


 Does it really matter?

Why should we do it twice? More importantly at the time of destroying
even though we know it'll be built again just few moment later.



 This just makes the code uglier for no gain afaict.

 If you really need to do this, key off @sd == NULL or something.

Sorry, would you please a bit more clearer? I can't pick what you're suggesting.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
On Tue, Jul 23, 2013 at 10:09 PM, Rakib Mullick rakib.mull...@gmail.com wrote:
 On Tue, Jul 23, 2013 at 9:47 PM, Peter Zijlstra pet...@infradead.org wrote:
 On Tue, Jul 23, 2013 at 09:44:17PM +0600, Rakib Mullick wrote:
  Currently, update_top_cache_domain() is called whenever schedule domain is 
 built or destroyed. But, the following
 callpath shows that they're at the same callpath and can be avoided 
 update_top_cache_domain() while destroying schedule
 domain and update only at the times of building schedule domains.

   partition_sched_domains()
   detach_destroy_domain()
   cpu_attach_domain()
   update_top_cache_domain()
   build_sched_domains()
   cpu_attach_domain()
   update_top_cache_domain()


 Does it really matter?

 Why should we do it twice? More importantly at the time of destroying
 even though we know it'll be built again just few moment later.



 This just makes the code uglier for no gain afaict.

 If you really need to do this, key off @sd == NULL or something.

 Sorry, would you please a bit more clearer? I can't pick what you're 
 suggesting.

You mean using sd == NULL rather than using update_cache_domain variable ?
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH, RFC] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
On Tue, Jul 23, 2013 at 10:53 PM, Peter Zijlstra pet...@infradead.org wrote:
 On Tue, Jul 23, 2013 at 10:20:20PM +0600, Rakib Mullick wrote:

 You mean using sd == NULL rather than using update_cache_domain variable ?

 Yes, note how:

 @@ -6109,7 +6110,7 @@ static void detach_destroy_domains(const struct cpumask 
 *cpu_map)

 rcu_read_lock();
 for_each_cpu(i, cpu_map)
 -   cpu_attach_domain(NULL, def_root_domain, i);
 rcu_read_unlock();
  }

 Always has NULL for sd? Which means you can do:

 @@ -5138,7 +5138,8 @@ cpu_attach_domain(struct sched_domain *sd, struct 
 root_domain *rd, int cpu)
 rcu_assign_pointer(rq-sd, sd);
 destroy_sched_domains(tmp, cpu);

 -   update_top_cache_domain(cpu);
 +   if (sd)
 +   update_top_cache_domain(cpu);
  }

  /* cpus with isolated domains */

Okay, got it. Will submit an updated patch!

Thanks
Rakib.
--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2] sched: update_top_cache_domain only at the times of building sched domain.

2013-07-23 Thread Rakib Mullick
Currently, update_top_cache_domain() is called whenever a sched domain is
built or destroyed. But the following callpath shows that both happen on
the same path, so the update_top_cache_domain() call while destroying
sched domains can be skipped and done only when sched domains are built.

partition_sched_domains()
  detach_destroy_domains()
    cpu_attach_domain()
      update_top_cache_domain()
  build_sched_domains()
    cpu_attach_domain()
      update_top_cache_domain()

Changes since v1: use sd to determine when to skip, courtesy PeterZ

Signed-off-by: Rakib Mullick rakib.mull...@gmail.com
---

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b7c32cb..387fb66 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5138,7 +5138,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 	rcu_assign_pointer(rq->sd, sd);
 	destroy_sched_domains(tmp, cpu);
 
-	update_top_cache_domain(cpu);
+	if (sd)
+		update_top_cache_domain(cpu);
 }
 
 /* cpus with isolated domains */



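For illustration, here is a minimal userspace stand-in (not the kernel code; the struct and prototypes are simplified) showing the effect of the v2 change above: the detach path passes sd == NULL and so skips the top-cache update, while the build path triggers it once.

#include <stdio.h>

struct sched_domain { int level; };

static void update_top_cache_domain(int cpu)
{
	printf("cpu%d: top cache domain updated\n", cpu);
}

static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
	/* ... attach/destroy work elided ... */
	if (sd)				/* skip the redundant update on detach */
		update_top_cache_domain(cpu);
}

int main(void)
{
	struct sched_domain dom = { .level = 0 };

	cpu_attach_domain(NULL, 0);	/* detach_destroy_domains() path: no update */
	cpu_attach_domain(&dom, 0);	/* build_sched_domains() path: update once  */
	return 0;
}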


Re: [RFC PATCH 0/2] sched: move content out of core files for load average

2013-04-19 Thread Rakib Mullick
On Fri, Apr 19, 2013 at 2:25 PM, Ingo Molnar  wrote:
>
> * Paul Gortmaker  wrote:
>
>> On 13-04-18 07:14 AM, Peter Zijlstra wrote:
>> > On Mon, 2013-04-15 at 11:33 +0200, Ingo Molnar wrote:
>> >> * Paul Gortmaker  wrote:
>> >>
>> >>> Recent activity has had a focus on moving functionally related blocks of 
>> >>> stuff
>> >>> out of sched/core.c into stand-alone files.  The code relating to load 
>> >>> average
>> >>> calculations has grown significantly enough recently to warrant placing 
>> >>> it in a
>> >>> separate file.
>> >>>
>> >>> Here we do that, and in doing so, we shed ~20k of code from sched/core.c 
>> >>> (~10%).
>> >>>
>> >>> A couple small static functions in the core sched.h header were also 
>> >>> localized
>> >>> to their singular user in sched/fair.c at the same time, with the goal 
>> >>> to also
>> >>> reduce the amount of "broadcast" content in that sched.h file.
>> >>
>> >> Nice!
>> >>
>> >> Peter, is this (and the naming of the new file) fine with you too?
>> >
>> > Yes and no.. that is I do like the change, but I don't like the
>> > filename. We have _wy_ too many different things we call load_avg.
>> >
>> > That said, I'm having a somewhat hard time coming up with a coherent
>> > alternative :/
>>
>> Several of the relocated functions start their name with "calc_load..."
>> Does "calc_load.c" sound any better?
>
> Peter has a point about load_avg being somewhat of a misnomer: that's not your
> fault in any way, we created overlapping naming within the scheduler and are 
> now
> hurting from it.
>
> Here are the main scheduler 'load' concepts we have right now:
>
>  - The externally visible 'average load' value extracted by tools like 'top' 
> via
>/proc/loadavg and handled by fs/proc/loadavg.c. Internally the naming is 
> all
>over the map: the fields that are updated are named 'avenrun[]', most other
>variables and methods are named calc_load_*(), and a few callbacks are 
> named
>*_cpu_load_*().
>
>  - rq->cpu_load, a weighted, vectored scheduler-internal notion of task load
>average with multiple run length averages. Only exposed by debug 
> interfaces but
>otherwise relied on by the scheduler for SMP load balancing.
>
>  - se->avg - per entity (per task) load average. This is integrated 
> differently
>from the cpu_load - but work is ongoing to possibly integrate it with the
>rq->cpu_load metric. This metric is used for CPU internal execution time
>allocation and timeslicing, based on nice value priorities and cgroup
>weights and constraints.
>
> Work is ongoing to integrate rq->cpu_load and se->avg - eventually they will
> become one metric.
>
> It might eventually make sense to integrate the 'average load' calculation as 
> well
> with all this - as they really have a similar purpose, the avenload[] vector 
> of
> averages is conceptually similar to the rq->cpu_load[] vector of averages.
>
> So I'd suggest to side-step all that existing confusion and simply name the 
> new
> file kernel/sched/proc.c - our external /proc scheduler ABI towards userspace.
> This is similar to the already existing kernel/irq/proc.c pattern.
>
Well, kernel/sched/stats.c also exposes scheduler ABI to userspace.
Aren't these things going to introduce confusion (stats.c and proc.c
under the same sched directory)?

Thanks,
Rakib
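As a side note on the 'average load' naming Ingo describes above, the /proc/loadavg style of average is a fixed-point exponential decay. Below is a minimal userspace sketch of that idea; the shift and decay constant mirror the kernel's CALC_LOAD() convention, but treat the exact values here as illustrative.

#include <stdio.h>

#define FSHIFT	11			/* bits of fractional precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* ~ FIXED_1 * exp(-5s/60s), 1-minute decay */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;
	int tick;

	/* Pretend 3 tasks stay runnable: the 1-minute average climbs toward 3.0. */
	for (tick = 0; tick < 50; tick++)
		avenrun = calc_load(avenrun, EXP_1, 3 * FIXED_1);

	printf("load ~ %lu.%02lu\n", avenrun >> FSHIFT,
	       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}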


Re: [RFC PATCH 0/2] sched: move content out of core files for load average

2013-04-19 Thread Rakib Mullick
On Fri, Apr 19, 2013 at 5:13 AM, Paul Gortmaker
 wrote:
> [Re: [RFC PATCH 0/2] sched: move content out of core files for load average] 
> On 18/04/2013 (Thu 23:06) Rakib Mullick wrote:
>
>> On Thu, Apr 18, 2013 at 9:54 PM, Paul Gortmaker
>>  wrote:
>> > On 13-04-18 07:14 AM, Peter Zijlstra wrote:
>> >> On Mon, 2013-04-15 at 11:33 +0200, Ingo Molnar wrote:
>> >>> * Paul Gortmaker  wrote:
>> >>>
>> >>>> Recent activity has had a focus on moving functionally related blocks 
>> >>>> of stuff
>> >>>> out of sched/core.c into stand-alone files.  The code relating to load 
>> >>>> average
>> >>>> calculations has grown significantly enough recently to warrant placing 
>> >>>> it in a
>> >>>> separate file.
>> >>>>
>> >>>> Here we do that, and in doing so, we shed ~20k of code from 
>> >>>> sched/core.c (~10%).
>> >>>>
>> >>>> A couple small static functions in the core sched.h header were also 
>> >>>> localized
>> >>>> to their singular user in sched/fair.c at the same time, with the goal 
>> >>>> to also
>> >>>> reduce the amount of "broadcast" content in that sched.h file.
>> >>>
>> >>> Nice!
>> >>>
>> >>> Peter, is this (and the naming of the new file) fine with you too?
>> >>
>> >> Yes and no.. that is I do like the change, but I don't like the
>> >> filename. We have _wy_ too many different things we call load_avg.
>> >>
>> >> That said, I'm having a somewhat hard time coming up with a coherent
>> >> alternative :/
>> >
>> > Several of the relocated functions start their name with "calc_load..."
>> > Does "calc_load.c" sound any better?
>> >
>> How about sched_load.c ?
>
> No, that doesn't work since it duplicates the path info in the file
> name -- something that none of the other kernel/sched/*.c files do.
> Do a "ls -1 kernel/sched" to see what I mean if it is not clear.
>
I understand your point, so just take sched out of it. I was just
trying to give a hint, if it could help.

> I honestly didn't spend a lot of time thinking about the file name.
> I chose load_avg.c since it had a parallel to the /proc/loadavg that
> linux has had since the early 1990s.  I have no real attachment
> to that name, but at the same time I'd like to avoid having name
> choice become a bikeshedding event...
>
I also didn't spend a lot of time thinking about the file name. What I
said came from intuition and what I've seen so far. A name should be
simple, clear, and suggest what the file deals with. But I also think
the name doesn't make much difference; how it works is what matters.

Thanks,
Rakib


Re: [RFC PATCH 0/2] sched: move content out of core files for load average

2013-04-18 Thread Rakib Mullick
On Thu, Apr 18, 2013 at 9:54 PM, Paul Gortmaker
 wrote:
> On 13-04-18 07:14 AM, Peter Zijlstra wrote:
>> On Mon, 2013-04-15 at 11:33 +0200, Ingo Molnar wrote:
>>> * Paul Gortmaker  wrote:
>>>
 Recent activity has had a focus on moving functionally related blocks of 
 stuff
 out of sched/core.c into stand-alone files.  The code relating to load 
 average
 calculations has grown significantly enough recently to warrant placing it 
 in a
 separate file.

 Here we do that, and in doing so, we shed ~20k of code from sched/core.c 
 (~10%).

 A couple small static functions in the core sched.h header were also 
 localized
 to their singular user in sched/fair.c at the same time, with the goal to 
 also
 reduce the amount of "broadcast" content in that sched.h file.
>>>
>>> Nice!
>>>
>>> Peter, is this (and the naming of the new file) fine with you too?
>>
>> Yes and no.. that is I do like the change, but I don't like the
>> filename. We have _wy_ too many different things we call load_avg.
>>
>> That said, I'm having a somewhat hard time coming up with a coherent
>> alternative :/
>
> Several of the relocated functions start their name with "calc_load..."
> Does "calc_load.c" sound any better?
>
How about sched_load.c ?


Re: [RFC PATCH 0/2] sched: move content out of core files for load average

2013-04-12 Thread Rakib Mullick
On Sat, Apr 13, 2013 at 6:04 AM, Paul Gortmaker
 wrote:
> Recent activity has had a focus on moving functionally related blocks of stuff
> out of sched/core.c into stand-alone files.  The code relating to load average
> calculations has grown significantly enough recently to warrant placing it in
> a separate file.
>
> Here we do that, and in doing so, we shed ~20k of code from sched/core.c 
> (~10%).
>
> A couple small static functions in the core sched.h header were also localized
> to their singular user in sched/fair.c at the same time, with the goal to also
> reduce the amount of "broadcast" content in that sched.h file.
>
> Paul.
> ---
>
> [ Patches sent here are tested on tip's sched/core, i.e. v3.9-rc1-38-gb329fd5
>
>   Assuming that this change is OK with folks, the timing can be whatever is 
> most
>   convenient -- i.e. I can update/respin it close to the end of the merge 
> window
>   for what will be v3.10-rc1, if that is what minimizes the inconvenience to 
> folks
>   who might be changing the code that is relocated here. ]
>
> Paul Gortmaker (2):
>   sched: fork load calculation code from sched/core --> sched/load_avg
>   sched: move update_load_[add/sub/set] from sched.h to fair.c
>
>  kernel/sched/Makefile   |   2 +-
>  kernel/sched/core.c | 569 ---
>  kernel/sched/fair.c |  18 ++
>  kernel/sched/load_avg.c | 577 
> 
>  kernel/sched/sched.h|  26 +--
>  5 files changed, 604 insertions(+), 588 deletions(-)
>  create mode 100644 kernel/sched/load_avg.c
>

Is there any positive impact on vmlinuz size after these changes?

Thanks,
Rakib


Re: [PATCH] auditsc: Use kzalloc instead of kmalloc+memset.

2013-04-08 Thread Rakib Mullick
On Tue, Apr 9, 2013 at 3:43 AM, Andrew Morton  wrote:
>
> Fair enough.  I'd go futher...
>
> From: Andrew Morton 
> Subject: auditsc-use-kzalloc-instead-of-kmallocmemset-fix
>
> remove audit_set_context() altogether - fold it into its caller
>
> Cc: Al Viro 
> Cc: Eric Paris 
> Cc: Rakib Mullick 
> Signed-off-by: Andrew Morton 
> ---
>
>  kernel/auditsc.c |   10 ++
>  1 file changed, 2 insertions(+), 8 deletions(-)
>
> diff -puN kernel/auditsc.c~auditsc-use-kzalloc-instead-of-kmallocmemset-fix 
> kernel/auditsc.c
> --- a/kernel/auditsc.c~auditsc-use-kzalloc-instead-of-kmallocmemset-fix
> +++ a/kernel/auditsc.c
> @@ -1034,13 +1034,6 @@ static inline void audit_free_aux(struct
> }
>  }
>
> -static inline void audit_set_context(struct audit_context *context,
> - enum audit_state state)
> -{
> -   context->state  = state;
> -   context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
> -}
> -
>  static inline struct audit_context *audit_alloc_context(enum audit_state 
> state)
>  {
> struct audit_context *context;
> @@ -1048,7 +1041,8 @@ static inline struct audit_context *audi
> context = kzalloc(sizeof(*context), GFP_KERNEL);
> if (!context)
> return NULL;
> -   audit_set_context(context, state);
> +   context->state = state;
> +   context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
> 	INIT_LIST_HEAD(&context->killed_trees);
> 	INIT_LIST_HEAD(&context->names_list);
> return context;
> _
>
Yes, this one is better than my patch, as its diffstat shows.

Thanks,
Rakib.


[QUESTION] loops_per_jiffy calculation from smp_callin().

2013-04-08 Thread Rakib Mullick
Hello Ingo and all,

In function arch/x86/kernel/smpboot.c::smp_callin(), we do a
calibrate_delay(), the reason behind doing this is commented as
follows:

 * Get our bogomips.
 * Update loops_per_jiffy in cpu_data. Previous call to
 * smp_store_cpu_info() stored a value that is close but not as
 * accurate as the value just calculated.

Now, if we look at init/calibrate.c::calibrate_delay(), a percpu
variable cpu_loops_per_jiffy is used to cache the loops_per_jiffy
value. If the cpu_loops_per_jiffy value is already set (which it should
be after the first run), it prevents calculating a new loops_per_jiffy,
so the accuracy expected at the callsite isn't attainable (I'm thinking
about hotplugging a CPU a few times). Isn't that something we should
care about? Is the lpj value that important to recalculate (when we
mostly read from the TSC)? And, if it is important, how does
cpu_loops_per_jiffy help?


Thanks,
Rakib.
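To make the caching behaviour described above concrete, here is a simplified userspace stand-in (not the kernel code) of how a per-CPU loops_per_jiffy cache short-circuits recalibration: once a CPU's slot is non-zero, later calls reuse the old value, so a CPU that is hot-unplugged and re-plugged never recalibrates.

#include <stdio.h>

#define NR_CPUS 8

static unsigned long cpu_lpj_cache[NR_CPUS];	/* stands in for the per-CPU variable */
static unsigned long loops_per_jiffy;

static unsigned long measure_lpj(int cpu)
{
	/* placeholder for the real timer/TSC based calibration loop */
	return 4000000 + cpu;
}

static void calibrate_delay(int cpu)
{
	unsigned long lpj = cpu_lpj_cache[cpu];

	if (!lpj) {				/* only the first call measures */
		lpj = measure_lpj(cpu);
		cpu_lpj_cache[cpu] = lpj;
	}
	loops_per_jiffy = lpj;
	printf("cpu%d: lpj=%lu\n", cpu, loops_per_jiffy);
}

int main(void)
{
	calibrate_delay(1);	/* boot: actually measures */
	calibrate_delay(1);	/* simulated re-hotplug: reuses the cached value */
	return 0;
}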


[PATCH] auditsc: Use kzalloc instead of kmalloc+memset.

2013-04-07 Thread Rakib Mullick
  In function audit_alloc_context(), use kzalloc() instead of kmalloc()+memset().
The patch also renames audit_zero_context() to
audit_set_context(), to represent its inner workings properly.

Signed-off-by: Rakib Mullick 
---

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f85..f5b6dc5 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1034,10 +1034,9 @@ static inline void audit_free_aux(struct audit_context 
*context)
}
 }
 
-static inline void audit_zero_context(struct audit_context *context,
+static inline void audit_set_context(struct audit_context *context,
  enum audit_state state)
 {
-   memset(context, 0, sizeof(*context));
context->state  = state;
context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
 }
@@ -1046,9 +1045,10 @@ static inline struct audit_context 
*audit_alloc_context(enum audit_state state)
 {
struct audit_context *context;
 
-   if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+   context = kzalloc(sizeof(*context), GFP_KERNEL);
+   if (!context)
return NULL;
-   audit_zero_context(context, state);
+   audit_set_context(context, state);
	INIT_LIST_HEAD(&context->killed_trees);
	INIT_LIST_HEAD(&context->names_list);
return context;


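For reference, a userspace stand-in for the allocation pattern the patch switches to: zeroing at allocation time rather than kmalloc()+memset(); calloc() plays the role of kzalloc() here, and the struct is purely illustrative.

#include <stdlib.h>
#include <string.h>

struct ctx_like { int state; unsigned long long prio; };

/* before: allocate, then zero explicitly */
static struct ctx_like *alloc_old(void)
{
	struct ctx_like *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return NULL;
	memset(ctx, 0, sizeof(*ctx));
	return ctx;
}

/* after: one call that returns zeroed memory, as kzalloc() does */
static struct ctx_like *alloc_new(void)
{
	return calloc(1, sizeof(struct ctx_like));
}

int main(void)
{
	struct ctx_like *a = alloc_old();
	struct ctx_like *b = alloc_new();

	free(a);
	free(b);
	return 0;
}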


Re: old->umask copying without spin_lock, in copy_fs_struct()

2013-04-07 Thread Rakib Mullick
On Sun, Apr 7, 2013 at 1:56 PM, Al Viro  wrote:
> On Sun, Apr 07, 2013 at 11:37:27AM +0600, Rakib Mullick wrote:
>> Hello,
>>
>> In copy_fs_struct(), old->umask is assigned to fs->umask outside of
>> spin_lock(&old->lock). Shouldn't it be inside spin_lock()? Since we're
>> dealing with  fs_struct *old ? Isn't it unsafe? Following lines -
>>
>>   fs->umask = old->umask;
>>
>> spin_lock(&old->lock);
>
> What would moving it down buy us?  Root, pwd and umask are all modified
> independently; the *only* reason why we hold old->lock for root and
> pwd (and we might drop and regain it between copying those - it would
> be pointless, so we don't bother, but it wouldn't have affected correctness)
> is that we want the values of root.mnt and root.dentry taken at the same
> time and we want to grab extra references on those while they are still
> valid.  The same goes for pwd, of course.  That's what old->lock
> protects - we want the damn thing atomic wrt set_fs_root() and set_fs_pwd().
> umask is an integer; its updates are atomic anyway, so it's not as if we
> could see a half-updated value or needed to do anything with refcounts.

Thanks for your explanation! The ->umask operation is trivial and, as
you've explained (I was also looking at the code), the code's execution
order makes sure that nothing goes wrong. fs_struct's data are protected
by ->lock, which is why I was thinking along those lines; I just wanted
to make sure it hadn't been missed accidentally.

Thanks
Rakib.
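A userspace analogue of the point Al makes above (illustrative only; a pthread spinlock stands in for the kernel spinlock): the lock exists so the root and pwd path pairs are copied, and their references taken, atomically with respect to set_fs_root()/set_fs_pwd(), while a plain integer like umask needs no such protection.

#include <pthread.h>
#include <stdio.h>

struct path_like { int mnt, dentry; };
struct fs_struct_like {
	pthread_spinlock_t lock;
	struct path_like root, pwd;
	int umask;
};

static void copy_fs(struct fs_struct_like *fs, struct fs_struct_like *old)
{
	fs->umask = old->umask;		/* plain int read: safe unlocked */

	pthread_spin_lock(&old->lock);
	fs->root = old->root;		/* the pair must be read consistently */
	fs->pwd  = old->pwd;		/* (and refs taken) under the lock    */
	pthread_spin_unlock(&old->lock);
}

int main(void)
{
	struct fs_struct_like old = { .root = {1, 2}, .pwd = {1, 3}, .umask = 022 };
	struct fs_struct_like new;

	pthread_spin_init(&old.lock, PTHREAD_PROCESS_PRIVATE);
	copy_fs(&new, &old);
	printf("umask=%o root=(%d,%d)\n", new.umask, new.root.mnt, new.root.dentry);
	return 0;
}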


Re: [nsproxy] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024

2013-03-11 Thread Rakib Mullick
On Sat, Mar 9, 2013 at 10:48 PM, Rakib Mullick  wrote:
> On Sat, Mar 9, 2013 at 2:33 PM, Eric W. Biederman  
> wrote:
>> Rakib Mullick  writes:
>>
>>> On Fri, Mar 8, 2013 at 10:01 PM, Eric W. Biederman
>>>  wrote:
>>>>
>>>> When a new task is created one of two things needs to happen.
>>>> A) A reference count needs to be added to the current nsproxy.
>>>> B) B a new nsproxy needs to be created.
>>>>
>>>> The way that code works today is far from a shiny example of totally
>>>> clear code but it is not incorrect.
>>>>
>>>> By moving get_nsproxy down below the first return 0, you removed taking
>>>> the reference count in the one case it is important.
>>>>
>>>> Arguably we should apply the patch below for clarity, and I just might
>>>> queue it up for 3.10.
>>>>
>>> This one is much more cleaner. One thing regarding this patch, can we
>>> check the namespace related flags at copy_namespace() call time at
>>> copy_process(), also get_nsproxy()? I think this will reduce some
>>> extra function call overhead and as you've mentioned get_nsproxy() is
>>> needed at every process creation.
>>
>> If you can write a benchmark that can tell the difference, and the code
>> continues to be readable.  It would be worth making the change.
>>
>> My gut says you are proposing an optimization that won't have
>> a measurable impact.
>>
> Yes, it'll be hard to measure these sorts of optimization, though I'll
> try and will tell you if the impact is measurable :).
>
Well, it's quite certain that the changes I've proposed don't have much
noticeable impact; from such a tiny change we can't expect too much.
The most likely impact would be on the task creation path, by reducing
some latency. So, to find out, I took the help of ftrace - I used its
function_graph tracer with set_graph_function set to do_fork.

So currently, at every task creation copy_namespaces() gets called, and
it costs something like the following (a few snippets from the ftrace log):

 3)   0.025 us|try_module_get();
 3) ! 887.805 us  |  } /* dup_mm */
 3)   0.278 us|  copy_namespaces(); <--
 3)   0.094 us|  copy_thread();

 3)   0.264 us|  copy_namespaces(); <-
 3)   0.091 us|  copy_thread();

 3)   0.306 us|  copy_namespaces();  <-
 3)   0.086 us|  copy_thread();

etc. copy_namespaces() just returns here.

On the other hand, when the namespace-related flag check is moved to
kernel/fork.c::copy_process(), copy_namespaces() never gets called
unless those flags are set.

So, this is what I managed to get. If you're not convinced, you can
drop it. But we can't expect more than this from such a simple change.

Thanks,
Rakib.


Re: [nsproxy] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024

2013-03-09 Thread Rakib Mullick
On Sat, Mar 9, 2013 at 2:33 PM, Eric W. Biederman  wrote:
> Rakib Mullick  writes:
>
>> On Fri, Mar 8, 2013 at 10:01 PM, Eric W. Biederman
>>  wrote:
>>>
>>> When a new task is created one of two things needs to happen.
>>> A) A reference count needs to be added to the current nsproxy.
>>> B) B a new nsproxy needs to be created.
>>>
>>> The way that code works today is far from a shiny example of totally
>>> clear code but it is not incorrect.
>>>
>>> By moving get_nsproxy down below the first return 0, you removed taking
>>> the reference count in the one case it is important.
>>>
>>> Arguably we should apply the patch below for clarity, and I just might
>>> queue it up for 3.10.
>>>
>> This one is much more cleaner. One thing regarding this patch, can we
>> check the namespace related flags at copy_namespace() call time at
>> copy_process(), also get_nsproxy()? I think this will reduce some
>> extra function call overhead and as you've mentioned get_nsproxy() is
>> needed at every process creation.
>
> If you can write a benchmark that can tell the difference, and the code
> continues to be readable.  It would be worth making the change.
>
> My gut says you are proposing an optimization that won't have
> a measurable impact.
>
Yes, it'll be hard to measure these sorts of optimization, though I'll
try and will tell you if the impact is measurable :).

Thanks,
Rakib.


Re: [nsproxy] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024

2013-03-08 Thread Rakib Mullick
On Fri, Mar 8, 2013 at 10:01 PM, Eric W. Biederman
 wrote:
>
> When a new task is created one of two things needs to happen.
> A) A reference count needs to be added to the current nsproxy.
> B) B a new nsproxy needs to be created.
>
> The way that code works today is far from a shiny example of totally
> clear code but it is not incorrect.
>
> By moving get_nsproxy down below the first return 0, you removed taking
> the reference count in the one case it is important.
>
> Arguably we should apply the patch below for clarity, and I just might
> queue it up for 3.10.
>
This one is much cleaner. One thing regarding this patch: can we check
the namespace-related flags at the copy_namespaces() call site in
copy_process(), and likewise for get_nsproxy()? I think this would
reduce some extra function call overhead, and as you've mentioned,
get_nsproxy() is needed at every process creation.

Thanks,
Rakib


Re: [nsproxy] BUG: unable to handle kernel NULL pointer dereference at 0000000000000024

2013-03-08 Thread Rakib Mullick
On 3/7/13, Eric W. Biederman  wrote:
> Fengguang Wu  writes:
>
>> Greetings,
>>
>> I got the below oops and the first bad commit is
>
> Doh!  On a second look that change is totally wrong.  Of course we need
> to up the ref-count every time we create a new process.  Especially if
> we don't do anything with namespaces.
>
> I was looking at it from the wrong angle last night.  I should have
> known better.
>
> Patch dropped.
>

Sad to know :( . From the debug messages, it's a kmemcheck report. I
can't relate the reported problem to the patch I've proposed.

It seems that on the task exit path, in switch_task_namespaces(), after
my patch atomic_dec_and_test(&ns->count) becomes true (the count ends up
one lower than it should be), so free_nsproxy() gets called. But
free_nsproxy() shouldn't get called here.

Am I right? Or is there something else?

Thanks,
Rakib
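To make the failure mode above concrete, here is a toy userspace model (not kernel code) of the refcount imbalance: if the fork path skips the get when no namespace flags are set, the matching put at task exit frees the nsproxy while the parent is still using it.

#include <stdio.h>

struct nsproxy_like { int count; };

static void get_nsproxy(struct nsproxy_like *ns)
{
	ns->count++;
}

static void put_nsproxy(struct nsproxy_like *ns)
{
	if (--ns->count == 0)
		printf("free_nsproxy() called: count hit 0\n");
}

int main(void)
{
	struct nsproxy_like init_ns = { .count = 1 };	/* held by the parent */

	/* Correct fork path: the child shares init_ns and takes a reference,
	 * so its exit-time put just drops back to the parent's reference. */
	get_nsproxy(&init_ns);
	put_nsproxy(&init_ns);
	printf("after correct fork+exit: count=%d\n", init_ns.count);

	/* Broken path (the skipped get): the child's exit-time put now frees
	 * the nsproxy even though the parent still uses it. */
	put_nsproxy(&init_ns);
	printf("after broken fork+exit: count=%d\n", init_ns.count);
	return 0;
}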


[PATCH] nsproxy: Fix ->nsproxy counting problem in copy_namespace.

2013-03-07 Thread Rakib Mullick
From cd41495e25cf2641ffe9e01a40d3d221a46b08be Mon Sep 17 00:00:00 2001
From: Rakib Mullick 
Date: Thu, 7 Mar 2013 14:52:20 +0600
Subject: [PATCH] nsproxy: Fix ->nsproxy counting problem in copy_namespace.

In copy_namespaces(), get_nsproxy() (which increments nsproxy->count) is called
before checking the namespace-related flags. Therefore, the task's
nsproxy->count could end up with an improper value, which could lead to calling
free_nsproxy() unnecessarily. Also, incrementing nsproxy->count is an atomic
operation (which is more expensive than a normal increment), so before doing it
it's better to make sure the namespace-related flags are set.

Signed-off-by: Rakib Mullick 
---
 kernel/nsproxy.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index afc0456..de36209 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -130,12 +130,12 @@ int copy_namespaces(unsigned long flags, struct 
task_struct *tsk)
if (!old_ns)
return 0;
 
-   get_nsproxy(old_ns);
-
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET)))
return 0;
 
+   get_nsproxy(old_ns);
+
if (!ns_capable(user_ns, CAP_SYS_ADMIN)) {
err = -EPERM;
goto out;
-- 
1.7.11.7





Re: [RFC] sched: The removal of idle_balance()

2013-02-18 Thread Rakib Mullick
On Mon, Feb 18, 2013 at 9:25 PM, Steven Rostedt  wrote:
> On Mon, 2013-02-18 at 13:43 +0530, Srikar Dronamraju wrote:
>> > The cache misses dropped by ~23% and migrations dropped by ~28%. I
>> > really believe that the idle_balance() hurts performance, and not just
>> > for something like hackbench, but the aggressive nature for migration
>> > that idle_balance() causes takes a large hit on a process' cache.
>> >
>> > Think about it some more, just because we go idle isn't enough reason to
>> > pull a runable task over. CPUs go idle all the time, and tasks are woken
>> > up all the time. There's no reason that we can't just wait for the sched
>> > tick to decide its time to do a bit of balancing. Sure, it would be nice
>> > if the idle CPU did the work. But I think that frame of mind was an
>> > incorrect notion from back in the early 2000s and does not apply to
>> > today's hardware, or perhaps it doesn't apply to the (relatively) new
>> > CFS scheduler. If you want aggressive scheduling, make the task rt, and
>> > it will do aggressive scheduling.
>> >
>>
>> How is it that the normal tick based load balancing gets it correctly while
>> the idle_balance gets is wrong?  Can it because of the different
>> cpu_idle_type?
>>
>
> Currently looks to be a fluke in my box, as this performance increase
> can't be duplicated elsewhere (yet). But from looking at my traces, it
> seems that my box does the idle balance at just the wrong time, and
> causes these issues.
>
A default hackbench run creates 400 tasks (10 groups * 40 tasks). On an
i7 system (4 cores, HT), idle_balance() shouldn't come into play,
because on an 8-CPU system we're assigning 400 tasks. If idle_balance()
does come in, that means we've done something wrong while distributing
tasks among the CPUs - doesn't that indicate a problem during
fork/exec/wake balancing?

Thanks,
Rakib.


Re: [RFC] sched: nohz_idle_balance

2012-09-13 Thread Rakib Mullick
On 9/13/12, Peter Zijlstra  wrote:
> On Thu, 2012-09-13 at 08:49 +0200, Mike Galbraith wrote:
>> On Thu, 2012-09-13 at 06:11 +0200, Vincent Guittot wrote:
>
> Well, updating the load statistics on the cpu you're going to balance
> seems like a good end to me.. ;-) No point updating the local statistics
> N times and leaving the ones you're going to balance stale for god knows
> how long.

Don't you think this patch has a very poor subject line?

Thanks,
Rakib

