[RFC PATCH 2/8] x86/cpu: Load Key Locker internal key at boot-time

2020-12-16 Thread Chang S. Bae
The Internal (Wrapping) Key is a new entity of the Intel Key Locker feature.
This internal key is loaded into a software-inaccessible CPU state and is used
to encode a data encryption key.

The kernel generates random data and loads it as the internal key on each CPU.
The data needs to be invalidated as soon as the load is done.
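
For illustration, a minimal sketch of the make/load/invalidate flow described
above. The structure layout and helper bodies here are assumptions for this
sketch only, not the patch's actual keylocker.c contents (which are truncated
in this archive); get_random_bytes() and memzero_explicit() are the usual
kernel helpers for generating and wiping such material:

	/* Sketch only -- the layout of the key material is an assumption. */
	static struct keylocker_setup_data {
		u8 payload[48];	/* 384-bit internal wrapping key material */
	} kl_setup;

	void make_keylocker_data(void)
	{
		/* Generate the random key material once, on the boot CPU. */
		get_random_bytes(&kl_setup, sizeof(kl_setup));
	}

	void invalidate_keylocker_data(void)
	{
		/* Wipe the in-memory copy once every CPU has loaded it. */
		memzero_explicit(&kl_setup, sizeof(kl_setup));
	}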

The BIOS may disable the feature. Check the dynamic CPUID bit
(KEYLOCKER_CPUID_EBX_AESKLE) first.

Add the byte code for LOADIWKEY -- the instruction that loads the internal
key -- to the 'x86-opcode-map.txt' file to avoid objtool's misinterpretation.
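
As a hedged sketch (not the patch's final form), a byte-coded opcode like this
is typically wrapped for use from C roughly as follows; the operand convention
(key material staged in xmm0-xmm2, EAX carrying the KeySource/no-backup flags)
follows the public ISA description and is an assumption here:

	static inline void __loadiwkey(unsigned int keysrc_flags)
	{
		/* LOADIWKEY expands to the .byte sequence added by this patch. */
		asm volatile(LOADIWKEY : : "a" (keysrc_flags));
	}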

Signed-off-by: Chang S. Bae 
Cc: x...@kernel.org
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/include/asm/keylocker.h  | 11 +
 arch/x86/kernel/Makefile  |  1 +
 arch/x86/kernel/cpu/common.c  | 38 +-
 arch/x86/kernel/keylocker.c   | 71 +++
 arch/x86/kernel/smpboot.c |  2 +
 arch/x86/lib/x86-opcode-map.txt   |  2 +-
 tools/arch/x86/lib/x86-opcode-map.txt |  2 +-
 7 files changed, 124 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/keylocker.c

diff --git a/arch/x86/include/asm/keylocker.h b/arch/x86/include/asm/keylocker.h
index 2fe13c21c63f..daf0734a4095 100644
--- a/arch/x86/include/asm/keylocker.h
+++ b/arch/x86/include/asm/keylocker.h
@@ -14,5 +14,16 @@
 #define KEYLOCKER_CPUID_EBX_BACKUP BIT(4)
 #define KEYLOCKER_CPUID_ECX_RAND   BIT(1)
 
+bool check_keylocker_readiness(void);
+
+bool load_keylocker(void);
+
+void make_keylocker_data(void);
+#ifdef CONFIG_X86_KEYLOCKER
+void invalidate_keylocker_data(void);
+#else
+#define invalidate_keylocker_data() do { } while (0)
+#endif
+
 #endif /*__ASSEMBLY__ */
 #endif /* _ASM_KEYLOCKER_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 68608bd892c0..085dbf49b3b9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -145,6 +145,7 @@ obj-$(CONFIG_PERF_EVENTS)   += perf_regs.o
 obj-$(CONFIG_TRACING)  += tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)+= itmt.o
 obj-$(CONFIG_X86_UMIP) += umip.o
+obj-$(CONFIG_X86_KEYLOCKER)+= keylocker.o
 
 obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
 obj-$(CONFIG_UNWINDER_FRAME_POINTER)   += unwind_frame.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..d675075848bb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -57,6 +57,8 @@
 #include 
 #include 
 #include 
+#include 
+
 #include 
 
 #include "cpu.h"
@@ -459,6 +461,39 @@ static __init int x86_nofsgsbase_setup(char *arg)
 }
 __setup("nofsgsbase", x86_nofsgsbase_setup);
 
+static __always_inline void setup_keylocker(struct cpuinfo_x86 *c)
+{
+   bool keyloaded;
+
+   if (!cpu_feature_enabled(X86_FEATURE_KEYLOCKER))
+   goto out;
+
+   cr4_set_bits(X86_CR4_KEYLOCKER);
+
+   if (c == &boot_cpu_data) {
+   if (!check_keylocker_readiness())
+   goto disable_keylocker;
+
+   make_keylocker_data();
+   }
+
+   keyloaded = load_keylocker();
+   if (!keyloaded) {
+   pr_err_once("x86/keylocker: Failed to load internal key\n");
+   goto disable_keylocker;
+   }
+
+   pr_info_once("x86/keylocker: Activated\n");
+   return;
+
+disable_keylocker:
+   clear_cpu_cap(c, X86_FEATURE_KEYLOCKER);
+   pr_info_once("x86/keylocker: Disabled\n");
+out:
+   /* Make sure the feature disabled for kexec-reboot. */
+   cr4_clear_bits(X86_CR4_KEYLOCKER);
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1554,10 +1589,11 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
 
-   /* Set up SMEP/SMAP/UMIP */
+   /* Setup various Intel-specific CPU security features */
setup_smep(c);
setup_smap(c);
setup_umip(c);
+   setup_keylocker(c);
 
/* Enable FSGSBASE instructions if available. */
if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
diff --git a/arch/x86/kernel/keylocker.c b/arch/x86/kernel/keylocker.c
new file mode 100644
index ..e455d806b80c
--- /dev/null
+++ b/arch/x86/kernel/keylocker.c
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Key Locker feature check and support the internal key
+ */
+
+#include 
+
+#include 
+#include 
+#include 
+
+bool check_keylocker_readiness(void)
+{
+   u32 eax, ebx, ecx, edx;
+
+   cpuid_count(KEYLOCKER_CPUID, 0, &eax, &ebx, &ecx, &edx);
+   /* BIOS may not enable it on some systems. */
+   if (!(ebx & KEYLOCKER_CPUID_EBX_AESKLE)) {
+   pr_debug("x86/keylocker: not fully enabled\n");
+   return false;
+   }
+
+   return true;
+}
+
+/* Load Internal (Wrapping) Key */
+#define LOADIWKEY  ".byte 0xf3,0x0f,0x38,0xdc,0xd1"
+#define LOADIWKEY_NUM_OPERANDS 3
+
+static struct key {

[PATCH v1 0/2] Input: ads7846: reduce SPI related CPU load

2020-11-10 Thread Oleksij Rempel
changes v2:
- add back settle_delay_usecs support
- execute power down at the end of the main transfer.
- make it work with 2.5MHz SPI clock

This series is optimizing SPI transfer related CPU load.

Oleksij Rempel (2):
  Input: ads7846: convert to full duplex
  Input: ads7846: convert to one message

 drivers/input/touchscreen/ads7846.c | 456 
 1 file changed, 199 insertions(+), 257 deletions(-)
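
For context, a hedged sketch of the "one message" idea (illustrative only, not
taken from the patch): instead of issuing one spi_sync() round trip per
register access, all transfers of a sample are queued into a single
spi_message, so the controller (or its DMA) runs them back to back and the CPU
services a single completion. The function name, command bytes and buffers
below are invented for the example; a real driver would also use DMA-safe
(heap-allocated) buffers rather than the stack:

	/* Sketch: batch all per-sample reads into one SPI message. */
	static int ads7846_read_sample_once(struct spi_device *spi)
	{
		u8 tx_x[3] = { 0xd1, 0, 0 }, rx_x[3];	/* command bytes are placeholders */
		u8 tx_y[3] = { 0x91, 0, 0 }, rx_y[3];
		u8 tx_p[3] = { 0xd0, 0, 0 }, rx_p[3];	/* last command powers the ADC down */
		struct spi_transfer xfers[] = {
			{ .tx_buf = tx_x, .rx_buf = rx_x, .len = 3 },
			{ .tx_buf = tx_y, .rx_buf = rx_y, .len = 3 },
			{ .tx_buf = tx_p, .rx_buf = rx_p, .len = 3 },
		};
		struct spi_message msg;
		int i;

		spi_message_init(&msg);
		for (i = 0; i < ARRAY_SIZE(xfers); i++)
			spi_message_add_tail(&xfers[i], &msg);

		/* One submission, one completion, instead of three round trips. */
		return spi_sync(spi, &msg);
	}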

-- 
2.28.0



[PATCH v1 0/2] Input: ads7846: reduce SPI related CPU load

2020-11-04 Thread Oleksij Rempel
This series is optimizing SPI transfer related CPU load.

Oleksij Rempel (2):
  Input: ads7846: convert to full duplex
  Input: ads7846: convert to one message

 drivers/input/touchscreen/ads7846.c | 395 +++-
 1 file changed, 151 insertions(+), 244 deletions(-)

-- 
2.28.0



Re: [PATCH] docs/cpu-load: format the example code.

2020-10-21 Thread Jonathan Corbet
On Mon, 19 Oct 2020 01:05:57 +0800
Hui Su  wrote:

> format the example code.
> 
> Signed-off-by: Hui Su 
> ---
>  Documentation/admin-guide/cpu-load.rst | 63 ++
>  1 file changed, 33 insertions(+), 30 deletions(-)

Hmm...this document wasn't always that way; it looks like maybe the code
got mangled during the RST conversion.

I question the value of this document in general...but as long as we have
it the code might as well look right, so I've applied this, thanks.

jon


[PATCH] docs/cpu-load: format the example code.

2020-10-18 Thread Hui Su
format the example code.

Signed-off-by: Hui Su 
---
 Documentation/admin-guide/cpu-load.rst | 63 ++
 1 file changed, 33 insertions(+), 30 deletions(-)

diff --git a/Documentation/admin-guide/cpu-load.rst 
b/Documentation/admin-guide/cpu-load.rst
index ebdecf864080..f3ada90e9ca8 100644
--- a/Documentation/admin-guide/cpu-load.rst
+++ b/Documentation/admin-guide/cpu-load.rst
@@ -61,43 +61,46 @@ will lead to quite erratic information inside 
``/proc/stat``::
 
static volatile sig_atomic_t stop;
 
-   static void sighandler (int signr)
+   static void sighandler(int signr)
{
-   (void) signr;
-   stop = 1;
+   (void) signr;
+   stop = 1;
}
+
static unsigned long hog (unsigned long niters)
{
-   stop = 0;
-   while (!stop && --niters);
-   return niters;
+   stop = 0;
+   while (!stop && --niters);
+   return niters;
}
+
int main (void)
{
-   int i;
-   struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
-   .it_value = { .tv_sec = 0, .tv_usec = 1 } };
-   sigset_t set;
-   unsigned long v[HIST];
-   double tmp = 0.0;
-   unsigned long n;
-   signal (SIGALRM, &sighandler);
-   setitimer (ITIMER_REAL, &it, NULL);
-
-   hog (ULONG_MAX);
-   for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
-   for (i = 0; i < HIST; ++i) tmp += v[i];
-   tmp /= HIST;
-   n = tmp - (tmp / 3.0);
-
-   sigemptyset (&set);
-   sigaddset (&set, SIGALRM);
-
-   for (;;) {
-   hog (n);
-   sigwait (&set, &i);
-   }
-   return 0;
+   int i;
+   struct itimerval it = {
+   .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+   .it_value= { .tv_sec = 0, .tv_usec = 1 } };
+   sigset_t set;
+   unsigned long v[HIST];
+   double tmp = 0.0;
+   unsigned long n;
+   signal(SIGALRM, &sighandler);
+   setitimer(ITIMER_REAL, &it, NULL);
+
+   hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog(ULONG_MAX);
+   for (i = 0; i < HIST; ++i) tmp += v[i];
+   tmp /= HIST;
+   n = tmp - (tmp / 3.0);
+
+   sigemptyset(&set);
+   sigaddset(&set, SIGALRM);
+
+   for (;;) {
+   hog(n);
+   sigwait(&set, &i);
+   }
+   return 0;
}
 
 
-- 
2.25.1




[PATCH] doc/zh_CN: fix title heading markup in admin-guide cpu-load

2020-08-02 Thread Lukas Bulwahn
Documentation generation warns:

  Documentation/translations/zh_CN/admin-guide/cpu-load.rst:1:
  WARNING: Title overline too short.

Extend title heading markup by one. It was just off by one.

Fixes: e210c66d567c ("doc/zh_CN: add cpu-load Chinese version")
Signed-off-by: Lukas Bulwahn 
---
Alex, Tao, please ack.

Jonathan, please pick this quick minor warning fix.

applies on your docs-next and next-20200731

 Documentation/translations/zh_CN/admin-guide/cpu-load.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/translations/zh_CN/admin-guide/cpu-load.rst 
b/Documentation/translations/zh_CN/admin-guide/cpu-load.rst
index 0116d0477799..c972731c0e57 100644
--- a/Documentation/translations/zh_CN/admin-guide/cpu-load.rst
+++ b/Documentation/translations/zh_CN/admin-guide/cpu-load.rst
@@ -1,6 +1,6 @@
-===
+
 CPU 负载
-===
+
 
 Linux通过``/proc/stat``和``/proc/uptime``导出各种信息,用户空间工具
 如top(1)使用这些信息计算系统花费在某个特定状态的平均时间。
-- 
2.17.1



[RFC 3/3] media: stm32-dcmi: Inform cpufreq governors about cpu load needs

2020-05-26 Thread Benjamin Gaignard
When streaming starts, the CPU load can remain very low because almost
all of the capture pipeline is done in hardware (i.e. without using the CPU),
which leads the cpufreq governor to believe that it can use lower frequencies.
If the governor decides to use too low a frequency, that becomes a problem
when we need to acknowledge the interrupt during the blanking time.

To avoid this problem, the DCMI driver informs the cpufreq governors by adding
a cpufreq minimum-load QoS request.

Signed-off-by: Benjamin Gaignard 
---
 drivers/media/platform/stm32/stm32-dcmi.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/media/platform/stm32/stm32-dcmi.c 
b/drivers/media/platform/stm32/stm32-dcmi.c
index b8931490b83b..774f2506b2f1 100644
--- a/drivers/media/platform/stm32/stm32-dcmi.c
+++ b/drivers/media/platform/stm32/stm32-dcmi.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -173,6 +174,8 @@ struct stm32_dcmi {
struct media_device mdev;
struct media_pad   vid_cap_pad;
struct media_pipeline   pipeline;
+
+   struct pm_qos_request   qos_request;
 };
 
 static inline struct stm32_dcmi *notifier_to_dcmi(struct v4l2_async_notifier 
*n)
@@ -827,6 +830,9 @@ static int dcmi_start_streaming(struct vb2_queue *vq, 
unsigned int count)
else
reg_set(dcmi->regs, DCMI_IER, IT_OVR | IT_ERR);
 
+   cpufreq_minload_qos_add_request(&dcmi->qos_request,
+   CPUFREQ_GOV_QOS_MIN_LOAD_MAX_VALUE);
+
return 0;
 
 err_pipeline_stop:
@@ -859,6 +865,8 @@ static void dcmi_stop_streaming(struct vb2_queue *vq)
struct stm32_dcmi *dcmi = vb2_get_drv_priv(vq);
struct dcmi_buf *buf, *node;
 
+   cpufreq_minload_qos_remove_request(&dcmi->qos_request);
+
dcmi_pipeline_stop(dcmi);
 
media_pipeline_stop(>vdev->entity);
-- 
2.15.0



Re: [PATCH] thermal: cpu_cooling: Actually trace CPU load in thermal_power_cpu_get_power

2019-05-10 Thread Javi Merino
On Thu, May 02, 2019 at 11:32:38AM -0700, Matthias Kaehlcke wrote:
> The CPU load values passed to the thermal_power_cpu_get_power
> tracepoint are zero for all CPUs, unless the
> thermal_power_cpu_limit tracepoint is enabled too:
> 
>   irq/41-rockchip-98[000]    290.972410: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x0,0x0,0x0,0x0}} dynamic_power=4815
> 
> vs
> 
>   irq/41-rockchip-96[000] 95.773585: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x56,0x64,0x64,0x5e}} dynamic_power=4959
>   irq/41-rockchip-96[000] 95.773596: thermal_power_cpu_limit:
>   cpus=000f freq=408000 cdev_state=10 power=416
> 
> There seems to be no good reason for omitting the CPU load information
> depending on another tracepoint. My guess is that the intention was to
> check whether thermal_power_cpu_get_power is (still) enabled, however
> 'load_cpu != NULL' already indicates that it was at least enabled when
> cpufreq_get_requested_power() was entered, there seems little gain
> from omitting the assignment if the tracepoint was just disabled, so
> just remove the check.
> 
> Fixes: 6828a4711f99 ("thermal: add trace events to the power allocator 
> governor")
> Signed-off-by: Matthias Kaehlcke 

Yep, looks good to me.

Acked-by: Javi Merino 

> ---
>  drivers/thermal/cpu_cooling.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> index f7c1f49ec87f..b437804e099b 100644
> --- a/drivers/thermal/cpu_cooling.c
> +++ b/drivers/thermal/cpu_cooling.c
> @@ -458,7 +458,7 @@ static int cpufreq_get_requested_power(struct 
> thermal_cooling_device *cdev,
>   load = 0;
>  
>   total_load += load;
> - if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
> + if (load_cpu)
>   load_cpu[i] = load;
>  
>   i++;
> -- 
> 2.21.0.593.g511ec345e18-goog
> 


Re: [PATCH] thermal: cpu_cooling: Actually trace CPU load in thermal_power_cpu_get_power

2019-05-10 Thread Viresh Kumar
On 02-05-19, 11:32, Matthias Kaehlcke wrote:
> The CPU load values passed to the thermal_power_cpu_get_power
> tracepoint are zero for all CPUs, unless, unless the
> thermal_power_cpu_limit tracepoint is enabled too:
> 
>   irq/41-rockchip-98[000]    290.972410: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x0,0x0,0x0,0x0}} dynamic_power=4815
> 
> vs
> 
>   irq/41-rockchip-96[000] 95.773585: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x56,0x64,0x64,0x5e}} dynamic_power=4959
>   irq/41-rockchip-96[000] 95.773596: thermal_power_cpu_limit:
>   cpus=000f freq=408000 cdev_state=10 power=416
> 
> There seems to be no good reason for omitting the CPU load information
> depending on another tracepoint. My guess is that the intention was to
> check whether thermal_power_cpu_get_power is (still) enabled, however
> 'load_cpu != NULL' already indicates that it was at least enabled when
> cpufreq_get_requested_power() was entered, there seems little gain
> from omitting the assignment if the tracepoint was just disabled, so
> just remove the check.
> 
> Fixes: 6828a4711f99 ("thermal: add trace events to the power allocator 
> governor")
> Signed-off-by: Matthias Kaehlcke 
> ---
>  drivers/thermal/cpu_cooling.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> index f7c1f49ec87f..b437804e099b 100644
> --- a/drivers/thermal/cpu_cooling.c
> +++ b/drivers/thermal/cpu_cooling.c
> @@ -458,7 +458,7 @@ static int cpufreq_get_requested_power(struct 
> thermal_cooling_device *cdev,
>   load = 0;
>  
>   total_load += load;
> - if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
> + if (load_cpu)
>   load_cpu[i] = load;
>  
>   i++;

Acked-by: Viresh Kumar 

-- 
viresh


Re: [PATCH] thermal: cpu_cooling: Actually trace CPU load in thermal_power_cpu_get_power

2019-05-03 Thread Daniel Lezcano
On 02/05/2019 20:32, Matthias Kaehlcke wrote:
> The CPU load values passed to the thermal_power_cpu_get_power
> tracepoint are zero for all CPUs, unless, unless the
> thermal_power_cpu_limit tracepoint is enabled too:
> 
>   irq/41-rockchip-98[000]    290.972410: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x0,0x0,0x0,0x0}} dynamic_power=4815
> 
> vs
> 
>   irq/41-rockchip-96[000] 95.773585: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x56,0x64,0x64,0x5e}} dynamic_power=4959
>   irq/41-rockchip-96[000] 95.773596: thermal_power_cpu_limit:
>   cpus=000f freq=408000 cdev_state=10 power=416
> 
> There seems to be no good reason for omitting the CPU load information
> depending on another tracepoint. My guess is that the intention was to
> check whether thermal_power_cpu_get_power is (still) enabled, however
> 'load_cpu != NULL' already indicates that it was at least enabled when
> cpufreq_get_requested_power() was entered, there seems little gain
> from omitting the assignment if the tracepoint was just disabled, so
> just remove the check.
> 
> Fixes: 6828a4711f99 ("thermal: add trace events to the power allocator 
> governor")
> Signed-off-by: Matthias Kaehlcke 
> ---

Yes, load_cpu is needed in any case for both traces. The change makes sense.

Reviewed-by: Daniel Lezcano 


-- 
 <http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs

Follow Linaro:  <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog



Re: [PATCH] thermal: cpu_cooling: Actually trace CPU load in thermal_power_cpu_get_power

2019-05-03 Thread Viresh Kumar
On 02-05-19, 11:32, Matthias Kaehlcke wrote:
> The CPU load values passed to the thermal_power_cpu_get_power
> tracepoint are zero for all CPUs, unless, unless the
> thermal_power_cpu_limit tracepoint is enabled too:
> 
>   irq/41-rockchip-98[000]    290.972410: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x0,0x0,0x0,0x0}} dynamic_power=4815
> 
> vs
> 
>   irq/41-rockchip-96[000] 95.773585: thermal_power_cpu_get_power:
>   cpus=000f freq=180 load={{0x56,0x64,0x64,0x5e}} dynamic_power=4959
>   irq/41-rockchip-96[000] 95.773596: thermal_power_cpu_limit:
>   cpus=000f freq=408000 cdev_state=10 power=416
> 
> There seems to be no good reason for omitting the CPU load information
> depending on another tracepoint. My guess is that the intention was to
> check whether thermal_power_cpu_get_power is (still) enabled, however
> 'load_cpu != NULL' already indicates that it was at least enabled when
> cpufreq_get_requested_power() was entered, there seems little gain
> from omitting the assignment if the tracepoint was just disabled, so
> just remove the check.
> 
> Fixes: 6828a4711f99 ("thermal: add trace events to the power allocator 
> governor")
> Signed-off-by: Matthias Kaehlcke 
> ---
>  drivers/thermal/cpu_cooling.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> index f7c1f49ec87f..b437804e099b 100644
> --- a/drivers/thermal/cpu_cooling.c
> +++ b/drivers/thermal/cpu_cooling.c
> @@ -458,7 +458,7 @@ static int cpufreq_get_requested_power(struct 
> thermal_cooling_device *cdev,
>   load = 0;
>  
>   total_load += load;
> - if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
> + if (load_cpu)
>   load_cpu[i] = load;
>  
>   i++;

@Javi: I find this change to be fine, do you have any comments on this ?

-- 
viresh


[PATCH] thermal: cpu_cooling: Actually trace CPU load in thermal_power_cpu_get_power

2019-05-02 Thread Matthias Kaehlcke
The CPU load values passed to the thermal_power_cpu_get_power
tracepoint are zero for all CPUs, unless the
thermal_power_cpu_limit tracepoint is enabled too:

  irq/41-rockchip-98[000]    290.972410: thermal_power_cpu_get_power:
  cpus=000f freq=180 load={{0x0,0x0,0x0,0x0}} dynamic_power=4815

vs

  irq/41-rockchip-96[000] 95.773585: thermal_power_cpu_get_power:
  cpus=000f freq=180 load={{0x56,0x64,0x64,0x5e}} dynamic_power=4959
  irq/41-rockchip-96[000] 95.773596: thermal_power_cpu_limit:
  cpus=000f freq=408000 cdev_state=10 power=416

There seems to be no good reason for omitting the CPU load information
depending on another tracepoint. My guess is that the intention was to
check whether thermal_power_cpu_get_power is (still) enabled, however
'load_cpu != NULL' already indicates that it was at least enabled when
cpufreq_get_requested_power() was entered, there seems little gain
from omitting the assignment if the tracepoint was just disabled, so
just remove the check.

Fixes: 6828a4711f99 ("thermal: add trace events to the power allocator 
governor")
Signed-off-by: Matthias Kaehlcke 
---
 drivers/thermal/cpu_cooling.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index f7c1f49ec87f..b437804e099b 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -458,7 +458,7 @@ static int cpufreq_get_requested_power(struct 
thermal_cooling_device *cdev,
load = 0;
 
total_load += load;
-   if (trace_thermal_power_cpu_limit_enabled() && load_cpu)
+   if (load_cpu)
load_cpu[i] = load;
 
i++;
-- 
2.21.0.593.g511ec345e18-goog



[PATCH 4/7] sched/fair: don't include effective load of process in the old cpu load

2017-07-14 Thread Josef Bacik
From: Josef Bacik <jba...@fb.com>

We have no idea how long ago the process went to sleep.  It could have just gone
to sleep, in which case we would essentially be counting the load of the process
twice in the previous effective load.  Since the history presumably already
exists in the previous cpu load, don't bother adding its effective load again.
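
As an illustration with invented numbers: if prev_cpu's load is 1024 and the
waking task's effective load there would be 512, the old code scaled
prev_eff_load by 1024 + 512 = 1536 even though the 512 may already be folded
into the 1024 when the task slept only briefly; after this change it scales by
1024 alone.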

Signed-off-by: Josef Bacik <jba...@fb.com>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d958634..ee8dced 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5689,7 +5689,7 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p,
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
 
-   prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+   prev_eff_load *= load;
}
 
balanced = this_eff_load <= prev_eff_load;
-- 
2.9.3



[PATCH 4/7] sched/fair: don't include effective load of process in the old cpu load

2017-07-14 Thread Josef Bacik
From: Josef Bacik 

We have no idea how long ago the process went to sleep.  It could have just gone
to sleep, in which case we would essentially be counting the load of the process
twice in the previous effective load.  Since the history presumably already
exists in the previous cpu load, don't bother adding its effective load again.

Signed-off-by: Josef Bacik 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d958634..ee8dced 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5689,7 +5689,7 @@ static int wake_affine(struct sched_domain *sd, struct 
task_struct *p,
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
 
-   prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+   prev_eff_load *= load;
}
 
balanced = this_eff_load <= prev_eff_load;
-- 
2.9.3



[PATCH v2 08/31] cpu-load: standardize document format

2017-06-17 Thread Mauro Carvalho Chehab
Each text file under Documentation follows a different
format. Some don't even have titles!

Change its representation to follow the adopted standard,
using ReST markups for it to be parseable by Sphinx:

- mark literals;
- Adjust document title;
- Use a list for references.

Signed-off-by: Mauro Carvalho Chehab <mche...@s-opensource.com>
---
 Documentation/cpu-load.txt | 117 +++--
 1 file changed, 59 insertions(+), 58 deletions(-)

diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
index 287224e57cfc..2d01ce43d2a2 100644
--- a/Documentation/cpu-load.txt
+++ b/Documentation/cpu-load.txt
@@ -1,9 +1,10 @@
+========
 CPU load
---------
+========
 
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
+Linux exports various bits of information via ``/proc/stat`` and
+``/proc/uptime`` that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example::
 
 $ iostat
 Linux 2.6.18.3-exp (linmac) 02/20/2007
@@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period 
the
 system spent 10.01% of the time doing work in user space, 2.92% in the
 kernel, and was overall 81.63% of the time idle.
 
-In most cases the `/proc/stat' information reflects the reality quite
+In most cases the ``/proc/stat``information reflects the reality quite
 closely, however due to the nature of how/when the kernel collects
 this data sometimes it can not be trusted at all.
 
@@ -33,78 +34,78 @@ Example
 ---
 
 If we imagine the system with one task that periodically burns cycles
-in the following manner:
+in the following manner::
 
- time line between two timer interrupts
-|--|
- ^^
- |_ something begins working  |
-  |_ something goes to sleep
- (only to be awaken quite soon)
+ time line between two timer interrupts
+|--|
+ ^^
+ |_ something begins working  |
+  |_ something goes to sleep
+ (only to be awaken quite soon)
 
 In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
+``/proc/stat`` (since the timer interrupt will always happen when the
 system is executing the idle handler), but in reality the load is
 closer to 99%.
 
 One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
+will lead to quite erratic information inside ``/proc/stat``::
 
 
-/* gcc -o hog smallhog.c */
-#include 
-#include 
-#include 
-#include 
-#define HIST 10
+   /* gcc -o hog smallhog.c */
+   #include 
+   #include 
+   #include 
+   #include 
+   #define HIST 10
 
-static volatile sig_atomic_t stop;
+   static volatile sig_atomic_t stop;
 
-static void sighandler (int signr)
-{
- (void) signr;
- stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
- stop = 0;
- while (!stop && --niters);
- return niters;
-}
-int main (void)
-{
- int i;
- struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
- .it_value = { .tv_sec = 0, .tv_usec = 1 } };
- sigset_t set;
- unsigned long v[HIST];
- double tmp = 0.0;
- unsigned long n;
- signal (SIGALRM, &sighandler);
- setitimer (ITIMER_REAL, &it, NULL);
+   static void sighandler (int signr)
+   {
+   (void) signr;
+   stop = 1;
+   }
+   static unsigned long hog (unsigned long niters)
+   {
+   stop = 0;
+   while (!stop && --niters);
+   return niters;
+   }
+   int main (void)
+   {
+   int i;
+   struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+   .it_value = { .tv_sec = 0, .tv_usec = 1 } };
+   sigset_t set;
+   unsigned long v[HIST];
+   double tmp = 0.0;
+   unsigned long n;
+   signal (SIGALRM, &sighandler);
+   setitimer (ITIMER_REAL, &it, NULL);
 
- hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) tmp += v[i];
- tmp /= HIST;
- n = tmp - (tmp / 3.0);
+   hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) tmp += v[i];
+   tmp /= HIST;
+   n = tmp - (tmp / 3.0);
 
- sigemptyset (&set);
- sigaddset (&set, SIGALRM);
+   sigemptyset (&set);
+   sigaddset (&set, SIGALRM);
 
- for (;;) {
- 

[PATCH v2 08/31] cpu-load: standardize document format

2017-06-17 Thread Mauro Carvalho Chehab
Each text file under Documentation follows a different
format. Some don't even have titles!

Change its representation to follow the adopted standard,
using ReST markups for it to be parseable by Sphinx:

- mark literals;
- Adjust document title;
- Use a list for references.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/cpu-load.txt | 117 +++--
 1 file changed, 59 insertions(+), 58 deletions(-)

diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
index 287224e57cfc..2d01ce43d2a2 100644
--- a/Documentation/cpu-load.txt
+++ b/Documentation/cpu-load.txt
@@ -1,9 +1,10 @@
+========
 CPU load
---------
+========
 
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
+Linux exports various bits of information via ``/proc/stat`` and
+``/proc/uptime`` that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example::
 
 $ iostat
 Linux 2.6.18.3-exp (linmac) 02/20/2007
@@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period 
the
 system spent 10.01% of the time doing work in user space, 2.92% in the
 kernel, and was overall 81.63% of the time idle.
 
-In most cases the `/proc/stat' information reflects the reality quite
+In most cases the ``/proc/stat``information reflects the reality quite
 closely, however due to the nature of how/when the kernel collects
 this data sometimes it can not be trusted at all.
 
@@ -33,78 +34,78 @@ Example
 ---
 
 If we imagine the system with one task that periodically burns cycles
-in the following manner:
+in the following manner::
 
- time line between two timer interrupts
-|--|
- ^^
- |_ something begins working  |
-  |_ something goes to sleep
- (only to be awaken quite soon)
+ time line between two timer interrupts
+|--|
+ ^^
+ |_ something begins working  |
+  |_ something goes to sleep
+ (only to be awaken quite soon)
 
 In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
+``/proc/stat`` (since the timer interrupt will always happen when the
 system is executing the idle handler), but in reality the load is
 closer to 99%.
 
 One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
+will lead to quite erratic information inside ``/proc/stat``::
 
 
-/* gcc -o hog smallhog.c */
-#include 
-#include 
-#include 
-#include 
-#define HIST 10
+   /* gcc -o hog smallhog.c */
+   #include 
+   #include 
+   #include 
+   #include 
+   #define HIST 10
 
-static volatile sig_atomic_t stop;
+   static volatile sig_atomic_t stop;
 
-static void sighandler (int signr)
-{
- (void) signr;
- stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
- stop = 0;
- while (!stop && --niters);
- return niters;
-}
-int main (void)
-{
- int i;
- struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
- .it_value = { .tv_sec = 0, .tv_usec = 1 } };
- sigset_t set;
- unsigned long v[HIST];
- double tmp = 0.0;
- unsigned long n;
- signal (SIGALRM, &sighandler);
- setitimer (ITIMER_REAL, &it, NULL);
+   static void sighandler (int signr)
+   {
+   (void) signr;
+   stop = 1;
+   }
+   static unsigned long hog (unsigned long niters)
+   {
+   stop = 0;
+   while (!stop && --niters);
+   return niters;
+   }
+   int main (void)
+   {
+   int i;
+   struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+   .it_value = { .tv_sec = 0, .tv_usec = 1 } };
+   sigset_t set;
+   unsigned long v[HIST];
+   double tmp = 0.0;
+   unsigned long n;
+   signal (SIGALRM, &sighandler);
+   setitimer (ITIMER_REAL, &it, NULL);
 
- hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) tmp += v[i];
- tmp /= HIST;
- n = tmp - (tmp / 3.0);
+   hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) tmp += v[i];
+   tmp /= HIST;
+   n = tmp - (tmp / 3.0);
 
- sigemptyset (&set);
- sigaddset (&set, SIGALRM);
+   sigemptyset (&set);
+   sigaddset (&set, SIGALRM);
 
- for (;;) {
- hog (n);
- sigwait 

[PATCH 09/31] cpu-load: standardize document format

2017-05-18 Thread Mauro Carvalho Chehab
Each text file under Documentation follows a different
format. Some don't even have titles!

Change its representation to follow the adopted standard,
using ReST markups for it to be parseable by Sphinx:

- mark literals;
- Adjust document title;
- Use a list for references.

Signed-off-by: Mauro Carvalho Chehab <mche...@s-opensource.com>
---
 Documentation/cpu-load.txt | 117 +++--
 1 file changed, 59 insertions(+), 58 deletions(-)

diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
index 287224e57cfc..2d01ce43d2a2 100644
--- a/Documentation/cpu-load.txt
+++ b/Documentation/cpu-load.txt
@@ -1,9 +1,10 @@
+========
 CPU load
---------
+========
 
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
+Linux exports various bits of information via ``/proc/stat`` and
+``/proc/uptime`` that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example::
 
 $ iostat
 Linux 2.6.18.3-exp (linmac) 02/20/2007
@@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period 
the
 system spent 10.01% of the time doing work in user space, 2.92% in the
 kernel, and was overall 81.63% of the time idle.
 
-In most cases the `/proc/stat' information reflects the reality quite
+In most cases the ``/proc/stat``information reflects the reality quite
 closely, however due to the nature of how/when the kernel collects
 this data sometimes it can not be trusted at all.
 
@@ -33,78 +34,78 @@ Example
 ---
 
 If we imagine the system with one task that periodically burns cycles
-in the following manner:
+in the following manner::
 
- time line between two timer interrupts
-|--|
- ^^
- |_ something begins working  |
-  |_ something goes to sleep
- (only to be awaken quite soon)
+ time line between two timer interrupts
+|--|
+ ^^
+ |_ something begins working  |
+  |_ something goes to sleep
+ (only to be awaken quite soon)
 
 In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
+``/proc/stat`` (since the timer interrupt will always happen when the
 system is executing the idle handler), but in reality the load is
 closer to 99%.
 
 One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
+will lead to quite erratic information inside ``/proc/stat``::
 
 
-/* gcc -o hog smallhog.c */
-#include 
-#include 
-#include 
-#include 
-#define HIST 10
+   /* gcc -o hog smallhog.c */
+   #include 
+   #include 
+   #include 
+   #include 
+   #define HIST 10
 
-static volatile sig_atomic_t stop;
+   static volatile sig_atomic_t stop;
 
-static void sighandler (int signr)
-{
- (void) signr;
- stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
- stop = 0;
- while (!stop && --niters);
- return niters;
-}
-int main (void)
-{
- int i;
- struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
- .it_value = { .tv_sec = 0, .tv_usec = 1 } };
- sigset_t set;
- unsigned long v[HIST];
- double tmp = 0.0;
- unsigned long n;
- signal (SIGALRM, &sighandler);
- setitimer (ITIMER_REAL, &it, NULL);
+   static void sighandler (int signr)
+   {
+   (void) signr;
+   stop = 1;
+   }
+   static unsigned long hog (unsigned long niters)
+   {
+   stop = 0;
+   while (!stop && --niters);
+   return niters;
+   }
+   int main (void)
+   {
+   int i;
+   struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+   .it_value = { .tv_sec = 0, .tv_usec = 1 } };
+   sigset_t set;
+   unsigned long v[HIST];
+   double tmp = 0.0;
+   unsigned long n;
+   signal (SIGALRM, &sighandler);
+   setitimer (ITIMER_REAL, &it, NULL);
 
- hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) tmp += v[i];
- tmp /= HIST;
- n = tmp - (tmp / 3.0);
+   hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) tmp += v[i];
+   tmp /= HIST;
+   n = tmp - (tmp / 3.0);
 
- sigemptyset (&set);
- sigaddset (&set, SIGALRM);
+   sigemptyset (&set);
+   sigaddset (&set, SIGALRM);
 
- for (;;) {
- 

[PATCH 09/31] cpu-load: standardize document format

2017-05-18 Thread Mauro Carvalho Chehab
Each text file under Documentation follows a different
format. Some don't even have titles!

Change its representation to follow the adopted standard,
using ReST markups for it to be parseable by Sphinx:

- mark literals;
- Adjust document title;
- Use a list for references.

Signed-off-by: Mauro Carvalho Chehab 
---
 Documentation/cpu-load.txt | 117 +++--
 1 file changed, 59 insertions(+), 58 deletions(-)

diff --git a/Documentation/cpu-load.txt b/Documentation/cpu-load.txt
index 287224e57cfc..2d01ce43d2a2 100644
--- a/Documentation/cpu-load.txt
+++ b/Documentation/cpu-load.txt
@@ -1,9 +1,10 @@
+========
 CPU load
---------
+========
 
-Linux exports various bits of information via `/proc/stat' and
-`/proc/uptime' that userland tools, such as top(1), use to calculate
-the average time system spent in a particular state, for example:
+Linux exports various bits of information via ``/proc/stat`` and
+``/proc/uptime`` that userland tools, such as top(1), use to calculate
+the average time system spent in a particular state, for example::
 
 $ iostat
 Linux 2.6.18.3-exp (linmac) 02/20/2007
@@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period 
the
 system spent 10.01% of the time doing work in user space, 2.92% in the
 kernel, and was overall 81.63% of the time idle.
 
-In most cases the `/proc/stat' information reflects the reality quite
+In most cases the ``/proc/stat``information reflects the reality quite
 closely, however due to the nature of how/when the kernel collects
 this data sometimes it can not be trusted at all.
 
@@ -33,78 +34,78 @@ Example
 ---
 
 If we imagine the system with one task that periodically burns cycles
-in the following manner:
+in the following manner::
 
- time line between two timer interrupts
-|--|
- ^^
- |_ something begins working  |
-  |_ something goes to sleep
- (only to be awaken quite soon)
+ time line between two timer interrupts
+|--|
+ ^^
+ |_ something begins working  |
+  |_ something goes to sleep
+ (only to be awaken quite soon)
 
 In the above situation the system will be 0% loaded according to the
-`/proc/stat' (since the timer interrupt will always happen when the
+``/proc/stat`` (since the timer interrupt will always happen when the
 system is executing the idle handler), but in reality the load is
 closer to 99%.
 
 One can imagine many more situations where this behavior of the kernel
-will lead to quite erratic information inside `/proc/stat'.
+will lead to quite erratic information inside ``/proc/stat``::
 
 
-/* gcc -o hog smallhog.c */
-#include 
-#include 
-#include 
-#include 
-#define HIST 10
+   /* gcc -o hog smallhog.c */
+   #include 
+   #include 
+   #include 
+   #include 
+   #define HIST 10
 
-static volatile sig_atomic_t stop;
+   static volatile sig_atomic_t stop;
 
-static void sighandler (int signr)
-{
- (void) signr;
- stop = 1;
-}
-static unsigned long hog (unsigned long niters)
-{
- stop = 0;
- while (!stop && --niters);
- return niters;
-}
-int main (void)
-{
- int i;
- struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
- .it_value = { .tv_sec = 0, .tv_usec = 1 } };
- sigset_t set;
- unsigned long v[HIST];
- double tmp = 0.0;
- unsigned long n;
- signal (SIGALRM, &sighandler);
- setitimer (ITIMER_REAL, &it, NULL);
+   static void sighandler (int signr)
+   {
+   (void) signr;
+   stop = 1;
+   }
+   static unsigned long hog (unsigned long niters)
+   {
+   stop = 0;
+   while (!stop && --niters);
+   return niters;
+   }
+   int main (void)
+   {
+   int i;
+   struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
+   .it_value = { .tv_sec = 0, .tv_usec = 1 } };
+   sigset_t set;
+   unsigned long v[HIST];
+   double tmp = 0.0;
+   unsigned long n;
+   signal (SIGALRM, &sighandler);
+   setitimer (ITIMER_REAL, &it, NULL);
 
- hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
- for (i = 0; i < HIST; ++i) tmp += v[i];
- tmp /= HIST;
- n = tmp - (tmp / 3.0);
+   hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
+   for (i = 0; i < HIST; ++i) tmp += v[i];
+   tmp /= HIST;
+   n = tmp - (tmp / 3.0);
 
- sigemptyset (&set);
- sigaddset (&set, SIGALRM);
+   sigemptyset (&set);
+   sigaddset (&set, SIGALRM);
 
- for (;;) {
- hog (n);
- sigwait 

Re: [PATCH v2] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-11-14 Thread Rafael J. Wysocki
On Monday, November 14, 2016 10:31:11 AM Srinivas Pandruvada wrote:
> Use get_target_pstate_use_cpu_load() to calculate target P-State for
> devices, which uses preferred power management profile as PM_MOBILE
> in ACPI FADT.
> This may help in resolving some thermal issues caused by low sustained
> cpu bound workloads. The current algorithm tends to over-provision in this
> case as it doesn't look at the CPU busyness.
> 
> Also included the fix from Arnd Bergmann  to solve compile
> issue, when CONFIG_ACPI is not defined.
> 
> Signed-off-by: Srinivas Pandruvada 
> ---
> v2:
> Include fix from Arnd Bergmann for fixing compile issue
> 
>  drivers/cpufreq/intel_pstate.c | 14 ++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index eb76073..9be0720 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1828,6 +1828,19 @@ static void __init copy_pid_params(struct 
> pstate_adjust_policy *policy)
>   pid_params.setpoint = policy->setpoint;
>  }
>  
> +#ifdef CONFIG_ACPI
> +static void intel_pstate_use_acpi_profile(void)
> +{
> + if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
> + pstate_funcs.get_target_pstate =
> + get_target_pstate_use_cpu_load;
> +}
> +#else
> +static void intel_pstate_use_acpi_profile(void)
> +{
> +}
> +#endif
> +
>  static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
>  {
>   pstate_funcs.get_max   = funcs->get_max;
> @@ -1839,6 +1852,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
> *funcs)
>   pstate_funcs.get_vid   = funcs->get_vid;
>   pstate_funcs.get_target_pstate = funcs->get_target_pstate;
>  
> + intel_pstate_use_acpi_profile();
>  }
>  
>  #ifdef CONFIG_ACPI
> 

I've replaced the original commit with this patch.

Thanks,
Rafael



Re: [PATCH v2] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-11-14 Thread Rafael J. Wysocki
On Monday, November 14, 2016 10:31:11 AM Srinivas Pandruvada wrote:
> Use get_target_pstate_use_cpu_load() to calculate target P-State for
> devices, which uses preferred power management profile as PM_MOBILE
> in ACPI FADT.
> This may help in resolving some thermal issues caused by low sustained
> cpu bound workloads. The current algorithm tends to over-provision in this
> case as it doesn't look at the CPU busyness.
> 
> Also included the fix from Arnd Bergmann  to solve compile
> issue, when CONFIG_ACPI is not defined.
> 
> Signed-off-by: Srinivas Pandruvada 
> ---
> v2:
> Include fix from Arnd Bergmann for fixing compile issue
> 
>  drivers/cpufreq/intel_pstate.c | 14 ++
>  1 file changed, 14 insertions(+)
> 
> diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
> index eb76073..9be0720 100644
> --- a/drivers/cpufreq/intel_pstate.c
> +++ b/drivers/cpufreq/intel_pstate.c
> @@ -1828,6 +1828,19 @@ static void __init copy_pid_params(struct 
> pstate_adjust_policy *policy)
>   pid_params.setpoint = policy->setpoint;
>  }
>  
> +#ifdef CONFIG_ACPI
> +static void intel_pstate_use_acpi_profile(void)
> +{
> + if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
> + pstate_funcs.get_target_pstate =
> + get_target_pstate_use_cpu_load;
> +}
> +#else
> +static void intel_pstate_use_acpi_profile(void)
> +{
> +}
> +#endif
> +
>  static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
>  {
>   pstate_funcs.get_max   = funcs->get_max;
> @@ -1839,6 +1852,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
> *funcs)
>   pstate_funcs.get_vid   = funcs->get_vid;
>   pstate_funcs.get_target_pstate = funcs->get_target_pstate;
>  
> + intel_pstate_use_acpi_profile();
>  }
>  
>  #ifdef CONFIG_ACPI
> 

I've replaced the original commit with this patch.

Thanks,
Rafael



[PATCH v2] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-11-14 Thread Srinivas Pandruvada
Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, which uses preferred power management profile as PM_MOBILE
in ACPI FADT.
This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tends to over-provision in this
case as it doesn't look at the CPU busyness.

Also included the fix from Arnd Bergmann  to solve compile
issue, when CONFIG_ACPI is not defined.

Signed-off-by: Srinivas Pandruvada 
---
v2:
Include fix from Arnd Bergmann for fixing compile issue

 drivers/cpufreq/intel_pstate.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index eb76073..9be0720 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1828,6 +1828,19 @@ static void __init copy_pid_params(struct 
pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+   if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+   pstate_funcs.get_target_pstate =
+   get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(void)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
pstate_funcs.get_max   = funcs->get_max;
@@ -1839,6 +1852,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
*funcs)
pstate_funcs.get_vid   = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+   intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
2.7.4



[PATCH v2] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-11-14 Thread Srinivas Pandruvada
Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, which uses preferred power management profile as PM_MOBILE
in ACPI FADT.
This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tends to over-provision in this
case as it doesn't look at the CPU busyness.

Also included the fix from Arnd Bergmann  to solve compile
issue, when CONFIG_ACPI is not defined.

Signed-off-by: Srinivas Pandruvada 
---
v2:
Include fix from Arnd Bergmann for fixing compile issue

 drivers/cpufreq/intel_pstate.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index eb76073..9be0720 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1828,6 +1828,19 @@ static void __init copy_pid_params(struct 
pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+   if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+   pstate_funcs.get_target_pstate =
+   get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(void)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
pstate_funcs.get_max   = funcs->get_max;
@@ -1839,6 +1852,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
*funcs)
pstate_funcs.get_vid   = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+   intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
2.7.4



[RFC][PATCH] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-10-21 Thread Srinivas Pandruvada
Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, which uses preferred power management profile as PM_MOBILE
in ACPI FADT.
This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tends to over-provision in this
case as it doesn't look at the CPU busyness.

Signed-off-by: Srinivas Pandruvada 
---
If some wants to look at test results, email me. I can't attach
pdf document here.

 drivers/cpufreq/intel_pstate.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d9a0196..637536d8 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1746,6 +1746,19 @@ static void __init copy_pid_params(struct 
pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+   if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+   pstate_funcs.get_target_pstate =
+   get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(struct pstate_funcs *funcs)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
pstate_funcs.get_max   = funcs->get_max;
@@ -1757,6 +1770,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
*funcs)
pstate_funcs.get_vid   = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+   intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
2.5.5



[RFC][PATCH] cpufreq: intel_pstate: Use cpu load based algorithm for PM_MOBILE

2016-10-21 Thread Srinivas Pandruvada
Use get_target_pstate_use_cpu_load() to calculate target P-State for
devices, which uses preferred power management profile as PM_MOBILE
in ACPI FADT.
This may help in resolving some thermal issues caused by low sustained
cpu bound workloads. The current algorithm tends to over-provision in this
case as it doesn't look at the CPU busyness.

Signed-off-by: Srinivas Pandruvada 
---
If some wants to look at test results, email me. I can't attach
pdf document here.

 drivers/cpufreq/intel_pstate.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index d9a0196..637536d8 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -1746,6 +1746,19 @@ static void __init copy_pid_params(struct 
pstate_adjust_policy *policy)
pid_params.setpoint = policy->setpoint;
 }
 
+#ifdef CONFIG_ACPI
+static void intel_pstate_use_acpi_profile(void)
+{
+   if (acpi_gbl_FADT.preferred_profile == PM_MOBILE)
+   pstate_funcs.get_target_pstate =
+   get_target_pstate_use_cpu_load;
+}
+#else
+static void intel_pstate_use_acpi_profile(struct pstate_funcs *funcs)
+{
+}
+#endif
+
 static void __init copy_cpu_funcs(struct pstate_funcs *funcs)
 {
pstate_funcs.get_max   = funcs->get_max;
@@ -1757,6 +1770,7 @@ static void __init copy_cpu_funcs(struct pstate_funcs 
*funcs)
pstate_funcs.get_vid   = funcs->get_vid;
pstate_funcs.get_target_pstate = funcs->get_target_pstate;
 
+   intel_pstate_use_acpi_profile();
 }
 
 #ifdef CONFIG_ACPI
-- 
2.5.5



[tip:sched/core] sched/fair: Update rq clock before updating nohz CPU load

2016-05-05 Thread tip-bot for Matt Fleming
Commit-ID:  b52fad2db5d792d89975cebf2fe1646a7af28ed0
Gitweb: http://git.kernel.org/tip/b52fad2db5d792d89975cebf2fe1646a7af28ed0
Author: Matt Fleming <m...@codeblueprint.co.uk>
AuthorDate: Tue, 3 May 2016 20:46:54 +0100
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Thu, 5 May 2016 09:41:09 +0200

sched/fair: Update rq clock before updating nohz CPU load

If we're accessing rq_clock() (e.g. in sched_avg_update()) we should
update the rq clock before calling cpu_load_update(), otherwise any
time calculations will be stale.

All other paths currently call update_rq_clock().

Signed-off-by: Matt Fleming <m...@codeblueprint.co.uk>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Reviewed-by: Wanpeng Li <wanpeng...@hotmail.com>
Cc: Frederic Weisbecker <fweis...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Mel Gorman <mgor...@techsingularity.net>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Mike Galbraith <umgwanakikb...@gmail.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Rik van Riel <r...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Link: 
http://lkml.kernel.org/r/1462304814-11715-1-git-send-email-m...@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c381a6..7a00c7c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4724,6 +4724,7 @@ void cpu_load_update_nohz_stop(void)
 
load = weighted_cpuload(cpu_of(this_rq));
raw_spin_lock(&this_rq->lock);
+   update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
raw_spin_unlock(&this_rq->lock);
 }


[tip:sched/core] sched/fair: Update rq clock before updating nohz CPU load

2016-05-05 Thread tip-bot for Matt Fleming
Commit-ID:  b52fad2db5d792d89975cebf2fe1646a7af28ed0
Gitweb: http://git.kernel.org/tip/b52fad2db5d792d89975cebf2fe1646a7af28ed0
Author: Matt Fleming 
AuthorDate: Tue, 3 May 2016 20:46:54 +0100
Committer:  Ingo Molnar 
CommitDate: Thu, 5 May 2016 09:41:09 +0200

sched/fair: Update rq clock before updating nohz CPU load

If we're accessing rq_clock() (e.g. in sched_avg_update()) we should
update the rq clock before calling cpu_load_update(), otherwise any
time calculations will be stale.

All other paths currently call update_rq_clock().

Signed-off-by: Matt Fleming 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Wanpeng Li 
Cc: Frederic Weisbecker 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Mike Galbraith 
Cc: Mike Galbraith 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: 
http://lkml.kernel.org/r/1462304814-11715-1-git-send-email-m...@codeblueprint.co.uk
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8c381a6..7a00c7c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4724,6 +4724,7 @@ void cpu_load_update_nohz_stop(void)
 
load = weighted_cpuload(cpu_of(this_rq));
raw_spin_lock(&this_rq->lock);
+   update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
raw_spin_unlock(&this_rq->lock);
 }


Re: [PATCH] sched/fair: Update rq clock before updating nohz cpu load

2016-05-03 Thread Wanpeng Li
2016-05-04 3:46 GMT+08:00 Matt Fleming :
> If we're accessing rq_clock() (e.g. in sched_avg_update()) we should
> update the rq clock before calling cpu_load_update(), otherwise any
> time calculations will be stale.
>
> All other paths currently call update_rq_clock().
>
> Cc: Peter Zijlstra 
> Cc: Ingo Molnar 
> Cc: Mike Galbraith 
> Cc: Mel Gorman 
> Cc: Thomas Gleixner 
> Cc: Frederic Weisbecker 
> Cc: Rik van Riel 
> Signed-off-by: Matt Fleming 

Reviewed-by: Wanpeng Li 

> ---
>  kernel/sched/fair.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b8a33abce650..aa9ba82f0d7c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4723,6 +4723,7 @@ void cpu_load_update_nohz_stop(void)
>
> load = weighted_cpuload(cpu_of(this_rq));
> raw_spin_lock(&this_rq->lock);
> +   update_rq_clock(this_rq);
> cpu_load_update_nohz(this_rq, curr_jiffies, load);
> raw_spin_unlock(&this_rq->lock);
>  }
> --
> 2.7.3
>



-- 
Regards,
Wanpeng Li


Re: [PATCH] sched/fair: Update rq clock before updating nohz cpu load

2016-05-03 Thread Wanpeng Li
2016-05-04 3:46 GMT+08:00 Matt Fleming :
> If we're accessing rq_clock() (e.g. in sched_avg_update()) we should
> update the rq clock before calling cpu_load_update(), otherwise any
> time calculations will be stale.
>
> All other paths currently call update_rq_clock().
>
> Cc: Peter Zijlstra 
> Cc: Ingo Molnar 
> Cc: Mike Galbraith 
> Cc: Mel Gorman 
> Cc: Thomas Gleixner 
> Cc: Frederic Weisbecker 
> Cc: Rik van Riel 
> Signed-off-by: Matt Fleming 

Reviewed-by: Wanpeng Li 

> ---
>  kernel/sched/fair.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b8a33abce650..aa9ba82f0d7c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4723,6 +4723,7 @@ void cpu_load_update_nohz_stop(void)
>
> load = weighted_cpuload(cpu_of(this_rq));
> raw_spin_lock(&this_rq->lock);
> +   update_rq_clock(this_rq);
> cpu_load_update_nohz(this_rq, curr_jiffies, load);
> raw_spin_unlock(&this_rq->lock);
>  }
> --
> 2.7.3
>



-- 
Regards,
Wanpeng Li


[PATCH] sched/fair: Update rq clock before updating nohz cpu load

2016-05-03 Thread Matt Fleming
If we're accessing rq_clock() (e.g. in sched_avg_update()) we should
update the rq clock before calling cpu_load_update(), otherwise any
time calculations will be stale.

All other paths currently call update_rq_clock().

Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Mike Galbraith 
Cc: Mel Gorman 
Cc: Thomas Gleixner 
Cc: Frederic Weisbecker 
Cc: Rik van Riel 
Signed-off-by: Matt Fleming 
---
 kernel/sched/fair.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33abce650..aa9ba82f0d7c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4723,6 +4723,7 @@ void cpu_load_update_nohz_stop(void)
 
load = weighted_cpuload(cpu_of(this_rq));
raw_spin_lock(_rq->lock);
+   update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
raw_spin_unlock(&this_rq->lock);
 }
-- 
2.7.3
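
To make the staleness argument concrete, here is a minimal userspace model
(an illustration only, not kernel code) of the cached clock: rq_clock()
just returns whatever the last update_rq_clock() stored, so any math done
under the lock before refreshing it lags by however long the CPU sat
tickless.

/* toy model: rq->clock is a cached snapshot, not a live clock */
#include <stdio.h>
#include <time.h>

struct rq { unsigned long long clock; };

static unsigned long long sched_clock_ns(void)        /* stand-in time source */
{
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

static void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_ns(); }
static unsigned long long rq_clock(struct rq *rq) { return rq->clock; }

int main(void)
{
        struct rq rq = { 0 };
        struct timespec idle = { 0, 50 * 1000 * 1000 };  /* 50ms "tickless" gap */

        update_rq_clock(&rq);                      /* last refresh before idling */
        nanosleep(&idle, NULL);

        unsigned long long stale = rq_clock(&rq);  /* what the old path computed with */
        update_rq_clock(&rq);                      /* what the patch adds under the lock */
        unsigned long long fresh = rq_clock(&rq);

        printf("stale clock lagged by %llu ns\n", fresh - stale);
        return 0;
}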



[tip:sched/core] sched/fair: Optimize !CONFIG_NO_HZ_COMMON CPU load updates

2016-04-23 Thread tip-bot for Frederic Weisbecker
Commit-ID:  9fd81dd5ce0b12341c9f83346f8d32ac68bd3841
Gitweb: http://git.kernel.org/tip/9fd81dd5ce0b12341c9f83346f8d32ac68bd3841
Author: Frederic Weisbecker 
AuthorDate: Tue, 19 Apr 2016 17:36:51 +0200
Committer:  Ingo Molnar 
CommitDate: Sat, 23 Apr 2016 14:20:42 +0200

sched/fair: Optimize !CONFIG_NO_HZ_COMMON CPU load updates

Some code in the CPU load update path only concerns NO_HZ configs but is
built on all configurations. When NO_HZ isn't built, that code is harmless
but wastes some CPU and memory resources:

1) one useless field in struct rq
2) a jiffies record on every tick that is never used (cpu_load_update_periodic)
3) decay_load_missed() is called twice on every tick only to
   return immediately with no action taken; that function is dead
   code.

For pure optimization purposes, let's conditionally build the NO_HZ
related code.

Signed-off-by: Frederic Weisbecker 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E . McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: 
http://lkml.kernel.org/r/1461080211-16271-1-git-send-email-fweis...@gmail.com
Signed-off-by: Ingo Molnar 
---
 kernel/sched/core.c  | 5 ++---
 kernel/sched/fair.c  | 9 +++--
 kernel/sched/sched.h | 6 --
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c98a268..71dffbb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7381,8 +7381,6 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
 
-   rq->last_load_update_tick = jiffies;
-
 #ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -7401,12 +7399,13 @@ void __init sched_init(void)
 
rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
+   rq->last_load_update_tick = jiffies;
rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
 #endif
-#endif
+#endif /* CONFIG_SMP */
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b70367a..b8a33ab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4491,7 +4491,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
  */
@@ -4557,6 +4557,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
}
return load;
 }
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /**
  * __cpu_load_update - update the rq->cpu_load[] statistics
@@ -4596,7 +4597,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
unsigned long pending_updates)
 {
-   unsigned long tickless_load = this_rq->cpu_load[0];
+   unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4609,6 +4610,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
 
old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
old_load = decay_load_missed(old_load, pending_updates - 1, i);
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, 
pending_updates - 1, i);
@@ -4619,6 +4621,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
 */
old_load += tickless_load;
}
+#endif
new_load = this_load;
/*
 * Round up the averaging division if load is increasing. This
@@ -4731,8 +4734,10 @@ static inline void cpu_load_update_nohz(struct rq 
*this_rq,
 
 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
 {
+#ifdef CONFIG_NO_HZ_COMMON
/* See the mess around cpu_load_update_nohz(). */
this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
cpu_load_update(this_rq, load, 1);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 32d9e22..69da6fc 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,11 +585,13 @@ struct rq {
 #endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-   unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
+   unsigned long last_load_update_tick;
+#endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigne

[tip:sched/core] sched/fair: Correctly handle nohz ticks CPU load accounting

2016-04-23 Thread tip-bot for Frederic Weisbecker
Commit-ID:  1f41906a6fda1114debd3898668bd7ab6470ee41
Gitweb: http://git.kernel.org/tip/1f41906a6fda1114debd3898668bd7ab6470ee41
Author: Frederic Weisbecker 
AuthorDate: Wed, 13 Apr 2016 15:56:51 +0200
Committer:  Ingo Molnar 
CommitDate: Sat, 23 Apr 2016 14:20:42 +0200

sched/fair: Correctly handle nohz ticks CPU load accounting

Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask
mode. In fact "nohz" or "dynticks" only mean that we exit the periodic
mode and we try to minimize the ticks as much as possible. The nohz
subsystem uses a confusing terminology with the internal state
"ts->tick_stopped" which is also available through its public interface
with tick_nohz_tick_stopped(). This is a misnomer, as the tick is instead
reduced on a best-effort basis rather than stopped. In the best case the
tick can indeed be stopped, but there is no guarantee of that.
If a timer needs to fire one second later, a tick will fire while the
CPU is in nohz mode and this is a very common scenario.

Now this confusion happens to be a problem with CPU load updates:
cpu_load_update_active() doesn't handle nohz ticks correctly because it
assumes that ticks are completely stopped in nohz mode and that
cpu_load_update_active() can't be called in dynticks mode. When that
happens, the whole previous tickless load is ignored and the function
just records the load for the current tick, ignoring the potentially long
idle period behind it.

In order to solve this, we could account the current load for the
previous nohz time but there is a risk that we account the load of a
task that got freshly enqueued for the whole nohz period.

So instead, let's record the dynticks load on nohz frame entry so we know
what to record in case of nohz ticks, then use this record to account
the tickless load on nohz ticks and nohz frame end.

Signed-off-by: Frederic Weisbecker 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E . McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: 
http://lkml.kernel.org/r/1460555812-25375-3-git-send-email-fweis...@gmail.com
Signed-off-by: Ingo Molnar 
---
 include/linux/sched.h|  6 ++-
 kernel/sched/fair.c  | 97 +++-
 kernel/time/tick-sched.c |  9 +++--
 3 files changed, 72 insertions(+), 40 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 0b7f602..d894f2d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void cpu_load_update_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
 #else
-static inline void cpu_load_update_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ecd81c4..b70367a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4563,7 +4563,6 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
  *
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -4592,12 +4591,12 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
  *
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
  */
-static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+   unsigned long pending_updates)
 {
-   unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+   unsigned long tickless_load = this_rq->cpu_load[0];
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4642,10 +4641,23 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __cpu_load_update_nohz(struct rq *this_rq,
-  unsigned long curr_jiffies,
-  unsigned long load,
-  int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by
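
As a rough sketch of the scheme described in the commit message above
(record the load at nohz entry, then account the tickless span on any tick
that still fires and again at nohz exit), here is a self-contained toy in
plain C. The hook names mirror the kernel ones but the decay is deliberately
crude; it is not the kernel's decay_load_missed() math.

/* toy: remember the load at dynticks entry and fold the missed ticks
 * into the average whenever we get a chance to update again */
#include <stdio.h>

static unsigned long cpu_load;          /* stands in for rq->cpu_load[0] */
static unsigned long last_update_tick;  /* stands in for last_load_update_tick */

static void catch_up(unsigned long now_jiffies, unsigned long tickless_load)
{
        unsigned long missed = now_jiffies - last_update_tick;

        while (missed--)                /* crude decay toward the recorded load */
                cpu_load = (cpu_load + tickless_load) / 2;
        last_update_tick = now_jiffies;
}

static void nohz_start(unsigned long now, unsigned long load)
{
        cpu_load = load;                /* record the dynticks-entry load */
        last_update_tick = now;
}

static void nohz_tick(unsigned long now)
{
        catch_up(now, cpu_load);        /* a tick fired while "tickless" */
}

static void nohz_stop(unsigned long now, unsigned long load)
{
        catch_up(now, cpu_load);        /* account the whole tickless span */
        cpu_load = load;                /* then take the fresh load */
}

int main(void)
{
        nohz_start(100, 1024);
        nohz_tick(160);                 /* e.g. a one-second timer fired */
        nohz_stop(220, 2048);
        printf("modelled cpu_load[0] = %lu\n", cpu_load);
        return 0;
}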

[tip:sched/core] sched/fair: Gather CPU load functions under a more conventional namespace

2016-04-23 Thread tip-bot for Frederic Weisbecker
Commit-ID:  cee1afce3053e7aa0793fbd5f2e845fa2cef9e33
Gitweb: http://git.kernel.org/tip/cee1afce3053e7aa0793fbd5f2e845fa2cef9e33
Author: Frederic Weisbecker 
AuthorDate: Wed, 13 Apr 2016 15:56:50 +0200
Committer:  Ingo Molnar 
CommitDate: Sat, 23 Apr 2016 14:20:41 +0200

sched/fair: Gather CPU load functions under a more conventional namespace

The CPU load update related functions currently have a weak naming
convention, starting with update_cpu_load_*(), which isn't ideal as
"update" is a very generic concept.

Since two of these functions are public already (and a third is to come)
that's enough to introduce a more conventional naming scheme. So let's
do the following rename instead:

update_cpu_load_*() -> cpu_load_update_*()

Signed-off-by: Frederic Weisbecker 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E . McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: 
http://lkml.kernel.org/r/1460555812-25375-2-git-send-email-fweis...@gmail.com
Signed-off-by: Ingo Molnar 
---
 Documentation/trace/ftrace.txt | 10 +-
 include/linux/sched.h  |  4 ++--
 kernel/sched/core.c|  2 +-
 kernel/sched/fair.c| 24 
 kernel/sched/sched.h   |  4 ++--
 kernel/time/tick-sched.c   |  2 +-
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index f52f297..9857606 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set.
   <idle>-0   3dN.1   12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
   <idle>-0   3dN.1   12us : ktime_get <-tick_nohz_idle_exit
   <idle>-0   3dN.1   12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
-  <idle>-0   3dN.1   13us : update_cpu_load_nohz <-tick_nohz_idle_exit
-  <idle>-0   3dN.1   13us : _raw_spin_lock <-update_cpu_load_nohz
+  <idle>-0   3dN.1   13us : cpu_load_update_nohz <-tick_nohz_idle_exit
+  <idle>-0   3dN.1   13us : _raw_spin_lock <-cpu_load_update_nohz
   <idle>-0   3dN.1   13us : add_preempt_count <-_raw_spin_lock
-  <idle>-0   3dN.2   13us : __update_cpu_load <-update_cpu_load_nohz
-  <idle>-0   3dN.2   14us : sched_avg_update <-__update_cpu_load
-  <idle>-0   3dN.2   14us : _raw_spin_unlock <-update_cpu_load_nohz
+  <idle>-0   3dN.2   13us : __cpu_load_update <-cpu_load_update_nohz
+  <idle>-0   3dN.2   14us : sched_avg_update <-__cpu_load_update
+  <idle>-0   3dN.2   14us : _raw_spin_unlock <-cpu_load_update_nohz
   <idle>-0   3dN.2   14us : sub_preempt_count <-_raw_spin_unlock
   <idle>-0   3dN.1   15us : calc_load_exit_idle <-tick_nohz_idle_exit
   <idle>-0   3dN.1   15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 13c1c1d..0b7f602 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz(int active);
 #else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz(int active) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 06efbb9..c98a268 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2917,7 +2917,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
-   update_cpu_load_active(rq);
+   cpu_load_update_active(rq);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c328bd7..ecd81c4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4559,7 +4559,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 }
 
 /**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * __cpu_load_update - update the rq->cpu_load[] statistics
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
@@ -4594,7 +4594,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
  * term. See the @active paramter.
  */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
  unsigned long pending_updates, int active)
 {
unsigned long tickless_load = active ? t

Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-20 Thread Wanpeng Li
Hi Frederic,
2016-04-13 21:56 GMT+08:00 Frederic Weisbecker <fweis...@gmail.com>:
> Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask
> mode. In fact "nohz" or "dynticks" only mean that we exit the periodic
> mode and we try to minimize the ticks as much as possible. The nohz
> subsystem uses a confusing terminology with the internal state
> "ts->tick_stopped" which is also available through its public interface
> with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead
> reduced with the best effort rather than stopped. In the best case the
> tick can indeed be actually stopped but there is no guarantee about that.
> If a timer needs to fire one second later, a tick will fire while the
> CPU is in nohz mode and this is a very common scenario.
>
> Now this confusion happens to be a problem with cpu load updates:
> cpu_load_update_active() doesn't handle nohz ticks correctly because it
> assumes that ticks are completely stopped in nohz mode and that
> cpu_load_update_active() can't be called in dynticks mode. When that
> happens, the whole previous tickless load is ignored and the function
> just records the load for the current tick, ignoring potentially long
> idle periods behind.

This is for the one-timer-interrupt-per-second scenario, right?

Regards,
Wanpeng Li


[PATCH v2] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-19 Thread Frederic Weisbecker
Some code in the CPU load update path only concerns NO_HZ configs but is
built on all configurations. When NO_HZ isn't built, that code is harmless
but wastes some CPU and memory resources:

1) one useless field in struct rq
2) a jiffies record on every tick that is never used (cpu_load_update_periodic)
3) decay_load_missed() is called twice on every tick only to
   return immediately with no action taken; that function is dead
   code.

For pure optimization purposes, let's conditionally build the NO_HZ
related code.

Cc: Byungchul Park <byungchul.p...@lge.com>
Cc: Chris Metcalf <cmetc...@ezchip.com>
Cc: Christoph Lameter <c...@linux.com>
Cc: Ingo Molnar <mi...@elte.hu>
Cc: Luiz Capitulino <lcapitul...@redhat.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Rik van Riel <r...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Frederic Weisbecker <fweis...@gmail.com>
---
 kernel/sched/core.c  | 5 ++---
 kernel/sched/fair.c  | 9 +++--
 kernel/sched/sched.h | 6 --
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4c522a7..06321d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7328,8 +7328,6 @@ void __init sched_init(void)
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
 
-   rq->last_load_update_tick = jiffies;
-
 #ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
@@ -7348,12 +7346,13 @@ void __init sched_init(void)
 
rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
+   rq->last_load_update_tick = jiffies;
rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
rq->last_sched_tick = 0;
 #endif
-#endif
+#endif /* CONFIG_SMP */
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a8b79f9..93eea4f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4423,7 +4423,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
  */
@@ -4489,6 +4489,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
}
return load;
 }
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /**
  * __cpu_load_update - update the rq->cpu_load[] statistics
@@ -4528,7 +4529,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
unsigned long pending_updates)
 {
-   unsigned long tickless_load = this_rq->cpu_load[0];
+   unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4541,6 +4542,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
 
old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
old_load = decay_load_missed(old_load, pending_updates - 1, i);
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, 
pending_updates - 1, i);
@@ -4551,6 +4553,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
 */
old_load += tickless_load;
}
+#endif
new_load = this_load;
/*
 * Round up the averaging division if load is increasing. This
@@ -4663,8 +4666,10 @@ static inline void cpu_load_update_nohz(struct rq 
*this_rq,
 
 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
 {
+#ifdef CONFIG_NO_HZ_COMMON
/* See the mess around cpu_load_update_nohz(). */
this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
cpu_load_update(this_rq, load, 1);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1802013..2302bb6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,11 +585,13 @@ struct rq {
 #endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+#ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ_COMMON
+#endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
 #endif
-- 
2.7.0
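
Regarding point 3 in the description above (decay_load_missed() returning
immediately on the periodic path), here is a small self-contained sketch of
that early-return shape; the placeholder decay only stands in for the
kernel's precomputed (1 - 1/2^idx)^n lookup table and is not the real math.

#include <stdio.h>

static unsigned long
decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
{
        if (!missed_updates)
                return load;             /* the only path the periodic tick ever takes */

        (void)idx;
        return load >> missed_updates;   /* placeholder decay, not the real table */
}

int main(void)
{
        unsigned long pending_updates = 1;   /* periodic tick: nothing was missed */
        unsigned long load = 1000;

        /* without NO_HZ this is the only call pattern, so the helper is dead weight */
        load = decay_load_missed(load, pending_updates - 1, 2);
        printf("load after a periodic tick: %lu (unchanged)\n", load);
        return 0;
}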



Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-19 Thread Frederic Weisbecker
On Tue, Apr 19, 2016 at 09:01:00AM +0900, Byungchul Park wrote:
> On Mon, Apr 18, 2016 at 03:35:06PM +0200, Frederic Weisbecker wrote:
> > On Mon, Apr 18, 2016 at 06:17:21PM +0900, Byungchul Park wrote:
> > > On Wed, Apr 13, 2016 at 03:56:51PM +0200, Frederic Weisbecker wrote:
> > > > @@ -4645,11 +4674,11 @@ void cpu_load_update_nohz(int active)
> > > >  void cpu_load_update_active(struct rq *this_rq)
> > > >  {
> > > > unsigned long load = weighted_cpuload(cpu_of(this_rq));
> > > > -   /*
> > > > -* See the mess around cpu_load_update_idle() / 
> > > > cpu_load_update_nohz().
> > > > -*/
> > > > -   this_rq->last_load_update_tick = jiffies;
> > > > -   __cpu_load_update(this_rq, load, 1, 1);
> > > > +
> > > > +   if (tick_nohz_tick_stopped())
> > > > +   cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
> > > > +   else
> > > > +   cpu_load_update_periodic(this_rq, load);
> > > 
> > > Considering it further, I wonder if needing it.
> > > (Sorry if I missed something.)
> > > 
> > > Case 1. tickless -> (scheduler_tick) -> tickless
> > > 
> > >   I am not sure for this case if the rq's load can be changed or not,
> > >   especially, if the rq's load can be changed *at this point*.
> > >   Please remind that the load[0] is set here.
> > 
> > load[0] won't change because it's set by cpu_load_update_nohz_start().
> > But all the other load[idx] need to be decayed further.
> 
> Ah. Right. Sched tick will be handled even in the case 1...
> 
> I like your patches. But I am still wondering if the sched tick handling is
> necessary even in the case 1. Of course it's another problem though.

Right, we could indeed ignore those ticks happening in dynticks/idle and just
wait for the end of the dynticks frame that calls cpu_load_update_stop(). In fact
the first version of this patchset did that but Thomas didn't seem to like it.

That said, more local updates mean less need for remote updates through
cpu_load_update_idle().
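
For readers following the load[idx] point in this exchange: cpu_load[0]
tracks the instantaneous load while higher indexes are progressively slower
exponential averages, which is why only index 0 can simply be pinned at nohz
entry and the others still have to be decayed for the ticks that never ran.
A toy of that shape is below (illustrative only; the kernel additionally
rounds up when load increases and uses a precomputed table for missed ticks).

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

int main(void)
{
        unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
        unsigned long samples[] = { 2048, 2048, 0, 0, 0, 0 };  /* per-tick load */

        for (unsigned int t = 0; t < sizeof(samples) / sizeof(samples[0]); t++) {
                for (int i = 0; i < CPU_LOAD_IDX_MAX; i++) {
                        unsigned long scale = 1UL << i;

                        /* same shape as the kernel's per-index update:
                         * load[i] = (old * (scale - 1) + new) / scale */
                        cpu_load[i] = (cpu_load[i] * (scale - 1) + samples[t]) >> i;
                }
                printf("tick %u: load[0]=%4lu load[2]=%4lu load[4]=%4lu\n",
                       t, cpu_load[0], cpu_load[2], cpu_load[4]);
        }
        return 0;
}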



Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-18 Thread Byungchul Park
On Mon, Apr 18, 2016 at 03:35:06PM +0200, Frederic Weisbecker wrote:
> On Mon, Apr 18, 2016 at 06:17:21PM +0900, Byungchul Park wrote:
> > On Wed, Apr 13, 2016 at 03:56:51PM +0200, Frederic Weisbecker wrote:
> > > @@ -4645,11 +4674,11 @@ void cpu_load_update_nohz(int active)
> > >  void cpu_load_update_active(struct rq *this_rq)
> > >  {
> > >   unsigned long load = weighted_cpuload(cpu_of(this_rq));
> > > - /*
> > > -  * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
> > > -  */
> > > - this_rq->last_load_update_tick = jiffies;
> > > - __cpu_load_update(this_rq, load, 1, 1);
> > > +
> > > + if (tick_nohz_tick_stopped())
> > > + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
> > > + else
> > > + cpu_load_update_periodic(this_rq, load);
> > 
> > Considering it further, I wonder if needing it.
> > (Sorry if I missed something.)
> > 
> > Case 1. tickless -> (scheduler_tick) -> tickless
> > 
> > I am not sure for this case if the rq's load can be changed or not,
> > especially, if the rq's load can be changed *at this point*.
> > Please remind that the load[0] is set here.
> 
> load[0] won't change because it's set by cpu_load_update_nohz_start().
> But all the other load[idx] need to be decayed further.

Ah. Right. Sched tick will be handled even in the case 1...

I like your patches. But I am still wondering if the sched tick handling is
necessary even in the case 1. Of course it's another problem though.

Thanks anyway,
Byungchul

> 
> > 
> > Case 2. tickless -> (scheduler_tick) -> restart tick
> > 
> > Will be done by the tick restart routine when exiting irq.
> > -> no problem.
> > 
> > Case 3. tick -> (scheduler_tick) -> tickless
> > 
> > Same as before.
> > -> no problem.
> > 
> > Case 4. tick -> (scheduler_tick) -> tick
> > 
> > We can rely on regular schedule_tick().
> > -> no problem.
> > 
> 
> Thanks for your review!


Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-18 Thread Frederic Weisbecker
On Wed, Apr 13, 2016 at 03:56:52PM +0200, Frederic Weisbecker wrote:
> Some code in cpu load update only concern NO_HZ configs but it is
> built on all configurations. When NO_HZ isn't built, that code is harmless
> but just happens to take some useless ressources in CPU and memory:
> 
> 1) one useless field in struct rq
> 2) jiffies record on every tick that is never used (cpu_load_update_periodic)
> 3) decay_load_missed is called two times on every tick to eventually
>return immediately with no action taken. And that function is dead
>code.
> 
> For pure optimization purposes, lets conditionally build the NO_HZ
> related code.
> 
> Cc: Byungchul Park <byungchul.p...@lge.com>
> Cc: Chris Metcalf <cmetc...@ezchip.com>
> Cc: Christoph Lameter <c...@linux.com>
> Cc: Ingo Molnar <mi...@elte.hu>
> Cc: Luiz Capitulino <lcapitul...@redhat.com>
> Cc: Mike Galbraith <efa...@gmx.de>
> Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> Cc: Peter Zijlstra <pet...@infradead.org>
> Cc: Rik van Riel <r...@redhat.com>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Signed-off-by: Frederic Weisbecker <fweis...@gmail.com>
> ---
>  kernel/sched/core.c  | 3 ++-
>  kernel/sched/fair.c  | 9 +++--
>  kernel/sched/sched.h | 6 --
>  3 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4c522a7..59a2821 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7327,8 +7327,9 @@ void __init sched_init(void)
>  
>   for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
>   rq->cpu_load[j] = 0;
> -
> +#ifdef CONFIG_NO_HZ_COMMON
>   rq->last_load_update_tick = jiffies;
> +#endif

I forgot to also conditionally initialize it in sched_init(), I need to resend
the patchset with that fixed.


Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-18 Thread Frederic Weisbecker
On Mon, Apr 18, 2016 at 06:17:21PM +0900, Byungchul Park wrote:
> On Wed, Apr 13, 2016 at 03:56:51PM +0200, Frederic Weisbecker wrote:
> > @@ -4645,11 +4674,11 @@ void cpu_load_update_nohz(int active)
> >  void cpu_load_update_active(struct rq *this_rq)
> >  {
> > unsigned long load = weighted_cpuload(cpu_of(this_rq));
> > -   /*
> > -* See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
> > -*/
> > -   this_rq->last_load_update_tick = jiffies;
> > -   __cpu_load_update(this_rq, load, 1, 1);
> > +
> > +   if (tick_nohz_tick_stopped())
> > +   cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
> > +   else
> > +   cpu_load_update_periodic(this_rq, load);
> 
> Considering it further, I wonder if needing it.
> (Sorry if I missed something.)
> 
> Case 1. tickless -> (scheduler_tick) -> tickless
> 
>   I am not sure for this case if the rq's load can be changed or not,
>   especially, if the rq's load can be changed *at this point*.
>   Please remind that the load[0] is set here.

load[0] won't change because it's set by cpu_load_update_nohz_start().
But all the other load[idx] need to be decayed further.

> 
> Case 2. tickless -> (scheduler_tick) -> restart tick
> 
>   Will be done by the tick restart routine when exiting irq.
>   -> no problem.
> 
> Case 3. tick -> (scheduler_tick) -> tickless
> 
>   Same as before.
>   -> no problem.
> 
> Case 4. tick -> (scheduler_tick) -> tick
> 
>   We can rely on regular schedule_tick().
>   -> no problem.
> 

Thanks for your review!


Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-18 Thread Byungchul Park
On Wed, Apr 13, 2016 at 03:56:51PM +0200, Frederic Weisbecker wrote:
> @@ -4645,11 +4674,11 @@ void cpu_load_update_nohz(int active)
>  void cpu_load_update_active(struct rq *this_rq)
>  {
>   unsigned long load = weighted_cpuload(cpu_of(this_rq));
> - /*
> -  * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
> -  */
> - this_rq->last_load_update_tick = jiffies;
> - __cpu_load_update(this_rq, load, 1, 1);
> +
> + if (tick_nohz_tick_stopped())
> + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
> + else
> + cpu_load_update_periodic(this_rq, load);

Considering it further, I wonder if needing it.
(Sorry if I missed something.)

Case 1. tickless -> (scheduler_tick) -> tickless

I am not sure for this case if the rq's load can be changed or not,
especially, if the rq's load can be changed *at this point*.
Please remind that the load[0] is set here.

Case 2. tickless -> (scheduler_tick) -> restart tick

Will be done by the tick restart routine when exiting irq.
-> no problem.

Case 3. tick -> (scheduler_tick) -> tickless

Same as before.
-> no problem.

Case 4. tick -> (scheduler_tick) -> tick

We can rely on regular schedule_tick().
-> no problem.



Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-18 Thread Byungchul Park
On Wed, Apr 13, 2016 at 03:56:51PM +0200, Frederic Weisbecker wrote:
> @@ -4524,12 +4523,12 @@ decay_load_missed(unsigned long load, unsigned long 
> missed_updates, int idx)
>   *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
>   *
>   * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the 
> extra
> - * term. See the @active paramter.
> + * term.
>   */
> -static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
> -   unsigned long pending_updates, int active)
> +static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
> + unsigned long pending_updates)
>  {
> - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
> + unsigned long tickless_load = this_rq->cpu_load[0];

Hello,

Good for my humble code to be fixed so we can write it like this here.

> @@ -4618,26 +4617,56 @@ static void cpu_load_update_idle(struct rq *this_rq)
>   if (weighted_cpuload(cpu_of(this_rq)))
>   return;
>  
> - __cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
> + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
>  }
>  
>  /*
> - * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
> + * Record CPU load on nohz entry so we know the tickless load to account
> + * on nohz exit. cpu_load[0] happens then to be updated more frequently
> + * than other cpu_load[idx] but it should be fine as cpu_load readers
> + * shouldn't rely into synchronized cpu_load[*] updates.
>   */
> -void cpu_load_update_nohz(int active)
> +void cpu_load_update_nohz_start(void)
>  {
>   struct rq *this_rq = this_rq();
> +
> + /*
> +  * This is all lockless but should be fine. If weighted_cpuload changes
> +  * concurrently we'll exit nohz. And cpu_load write can race with
> +  * cpu_load_update_idle() but both updater would be writing the same.
> +  */
> + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));

Like it.

> +/*
> + * Account the tickless load in the end of a nohz frame.
> + */
> +void cpu_load_update_nohz_stop(void)
> +{
>   unsigned long curr_jiffies = READ_ONCE(jiffies);
> - unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
> + struct rq *this_rq = this_rq();
> + unsigned long load;
>  
>   if (curr_jiffies == this_rq->last_load_update_tick)
>   return;
>  
> + load = weighted_cpuload(cpu_of(this_rq));

Like it.

> @@ -4645,11 +4674,11 @@ void cpu_load_update_nohz(int active)
>  void cpu_load_update_active(struct rq *this_rq)
>  {
>   unsigned long load = weighted_cpuload(cpu_of(this_rq));
> - /*
> -  * See the mess around cpu_load_update_idle() / cpu_load_update_nohz().
> -  */
> - this_rq->last_load_update_tick = jiffies;
> - __cpu_load_update(this_rq, load, 1, 1);
> +
> + if (tick_nohz_tick_stopped())
> + cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
> + else
> + cpu_load_update_periodic(this_rq, load);

Oh! We have missed it until now. Terrible.. :-(

Thank you,
Byungchul


[PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-13 Thread Frederic Weisbecker
Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask
mode. In fact "nohz" or "dynticks" only mean that we exit the periodic
mode and we try to minimize the ticks as much as possible. The nohz
subsystem uses a confusing terminology with the internal state
"ts->tick_stopped" which is also available through its public interface
with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead
reduced with the best effort rather than stopped. In the best case the
tick can indeed be actually stopped but there is no guarantee about that.
If a timer needs to fire one second later, a tick will fire while the
CPU is in nohz mode and this is a very common scenario.

Now this confusion happens to be a problem with cpu load updates:
cpu_load_update_active() doesn't handle nohz ticks correctly because it
assumes that ticks are completely stopped in nohz mode and that
cpu_load_update_active() can't be called in dynticks mode. When that
happens, the whole previous tickless load is ignored and the function
just records the load for the current tick, ignoring potentially long
idle periods behind.

In order to solve this, we could account the current load for the
previous nohz time but there is a risk that we account the load of a
task that got freshly enqueued for the whole nohz period.

So instead, let's record the dynticks load on nohz frame entry so we know
what to record in case of nohz ticks, then use this record to account
the tickless load on nohz ticks and nohz frame end.
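
As an illustrative user-space sketch only (names and numbers below are made
up; the real entry points added by this patch are
cpu_load_update_nohz_start()/_stop() and the nohz-aware
cpu_load_update_active() shown in the diff):

#include <stdio.h>

static unsigned long tickless_load;	/* load snapshot taken at nohz entry */
static unsigned long last_update;	/* jiffies of the last accounting */

static void nohz_start(unsigned long now, unsigned long load)
{
	tickless_load = load;
	last_update = now;
}

static void nohz_account(unsigned long now, unsigned long load)
{
	unsigned long missed = now - last_update;

	/*
	 * The jiffies spent tickless are charged with the load recorded at
	 * nohz entry; only the current jiffy gets the freshly observed load.
	 */
	printf("%lu tickless jiffies at load %lu, current jiffy at load %lu\n",
	       missed ? missed - 1 : 0, tickless_load, load);
	last_update = now;
}

int main(void)
{
	nohz_start(1000, 50);		/* enter nohz with a light load */
	nohz_account(1010, 120);	/* nohz tick: something got enqueued */
	nohz_account(1012, 0);		/* nohz exit: idle again */
	return 0;
}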

Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Ingo Molnar 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Signed-off-by: Frederic Weisbecker 
---
 include/linux/sched.h|  6 ++-
 kernel/sched/fair.c  | 97 +++-
 kernel/time/tick-sched.c |  9 +++--
 3 files changed, 72 insertions(+), 40 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa5ee0d..07b1b1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void cpu_load_update_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
 #else
-static inline void cpu_load_update_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f33764d..a8b79f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4495,7 +4495,6 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
  *
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -4524,12 +4523,12 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
  *
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
  */
-static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+   unsigned long pending_updates)
 {
-   unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+   unsigned long tickless_load = this_rq->cpu_load[0];
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4574,10 +4573,23 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __cpu_load_update_nohz(struct rq *this_rq,
-  unsigned long curr_jiffies,
-  unsigned long load,
-  int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still 

[PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-13 Thread Frederic Weisbecker
Some code in cpu load update only concerns NO_HZ configs but it is
built on all configurations. When NO_HZ isn't built, that code is harmless
but just happens to take some useless resources in CPU and memory:

1) one useless field in struct rq
2) jiffies record on every tick that is never used (cpu_load_update_periodic)
3) decay_load_missed is called two times on every tick to eventually
   return immediately with no action taken. And that function is dead
   code.

For pure optimization purposes, let's conditionally build the NO_HZ
related code.

Cc: Byungchul Park <byungchul.p...@lge.com>
Cc: Chris Metcalf <cmetc...@ezchip.com>
Cc: Christoph Lameter <c...@linux.com>
Cc: Ingo Molnar <mi...@elte.hu>
Cc: Luiz Capitulino <lcapitul...@redhat.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Rik van Riel <r...@redhat.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Frederic Weisbecker <fweis...@gmail.com>
---
 kernel/sched/core.c  | 3 ++-
 kernel/sched/fair.c  | 9 +++--
 kernel/sched/sched.h | 6 --
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4c522a7..59a2821 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7327,8 +7327,9 @@ void __init sched_init(void)
 
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
-
+#ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies;
+#endif
 
 #ifdef CONFIG_SMP
rq->sd = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a8b79f9..93eea4f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4423,7 +4423,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
-
+#ifdef CONFIG_NO_HZ_COMMON
 /*
  * per rq 'load' arrray crap; XXX kill this.
  */
@@ -4489,6 +4489,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
}
return load;
 }
+#endif /* CONFIG_NO_HZ_COMMON */
 
 /**
  * __cpu_load_update - update the rq->cpu_load[] statistics
@@ -4528,7 +4529,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
unsigned long pending_updates)
 {
-   unsigned long tickless_load = this_rq->cpu_load[0];
+   unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4541,6 +4542,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
 
old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
old_load = decay_load_missed(old_load, pending_updates - 1, i);
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, 
pending_updates - 1, i);
@@ -4551,6 +4553,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
 */
old_load += tickless_load;
}
+#endif
new_load = this_load;
/*
 * Round up the averaging division if load is increasing. This
@@ -4663,8 +4666,10 @@ static inline void cpu_load_update_nohz(struct rq 
*this_rq,
 
 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
 {
+#ifdef CONFIG_NO_HZ_COMMON
/* See the mess around cpu_load_update_nohz(). */
this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
cpu_load_update(this_rq, load, 1);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1802013..2302bb6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,11 +585,13 @@ struct rq {
 #endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+#ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ_COMMON
+#endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
 #endif
-- 
2.7.0



[PATCH 0/3] sched: Fix/improve nohz cpu load updates v3

2016-04-13 Thread Frederic Weisbecker
Thanks Peterz and Chris, here is the v3 that addresses your reviews:

* Add comment about cpu_load[0] being updated more frequently than
  other cpu_load[idx]. See comment above cpu_load_update_start().

* Simplify ifdeffery for decay_load_missed() calls. Use __maybe_unused
  on the related variable.

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
sched/nohz-v3

HEAD: 101db6e825806a08e14c3e06b5242c10608db2df

Thanks,
Frederic
---

Frederic Weisbecker (3):
  sched: Gather cpu load functions under a more conventional namespace
  sched: Correctly handle nohz ticks cpu load accounting
  sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates


 Documentation/trace/ftrace.txt |  10 ++--
 include/linux/sched.h  |   6 ++-
 kernel/sched/core.c|   5 +-
 kernel/sched/fair.c| 112 +++--
 kernel/sched/sched.h   |  10 ++--
 kernel/time/tick-sched.c   |   9 ++--
 6 files changed, 96 insertions(+), 56 deletions(-)


[PATCH 1/3] sched: Gather cpu load functions under a more conventional namespace

2016-04-13 Thread Frederic Weisbecker
The cpu load update related functions have a weak naming convention
currently, starting with update_cpu_load_*() which isn't ideal as
"update" is a very generic concept.

Since two of these functions are public already (and a third is to come)
that's enough to introduce a more conventional naming scheme. So let's
do the following rename instead:

update_cpu_load_*() -> cpu_load_update_*()

Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Signed-off-by: Frederic Weisbecker 
---
 Documentation/trace/ftrace.txt | 10 +-
 include/linux/sched.h  |  4 ++--
 kernel/sched/core.c|  2 +-
 kernel/sched/fair.c| 24 
 kernel/sched/sched.h   |  4 ++--
 kernel/time/tick-sched.c   |  2 +-
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index f52f297..9857606 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set.
   -0   3dN.1   12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
   -0   3dN.1   12us : ktime_get <-tick_nohz_idle_exit
   -0   3dN.1   12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
-  -0   3dN.1   13us : update_cpu_load_nohz <-tick_nohz_idle_exit
-  -0   3dN.1   13us : _raw_spin_lock <-update_cpu_load_nohz
+  -0   3dN.1   13us : cpu_load_update_nohz <-tick_nohz_idle_exit
+  -0   3dN.1   13us : _raw_spin_lock <-cpu_load_update_nohz
   -0   3dN.1   13us : add_preempt_count <-_raw_spin_lock
-  -0   3dN.2   13us : __update_cpu_load <-update_cpu_load_nohz
-  -0   3dN.2   14us : sched_avg_update <-__update_cpu_load
-  -0   3dN.2   14us : _raw_spin_unlock <-update_cpu_load_nohz
+  -0   3dN.2   13us : __cpu_load_update <-cpu_load_update_nohz
+  -0   3dN.2   14us : sched_avg_update <-__cpu_load_update
+  -0   3dN.2   14us : _raw_spin_unlock <-cpu_load_update_nohz
   -0   3dN.2   14us : sub_preempt_count <-_raw_spin_unlock
   -0   3dN.1   15us : calc_load_exit_idle <-tick_nohz_idle_exit
   -0   3dN.1   15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847..aa5ee0d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz(int active);
 #else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz(int active) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fc..4c522a7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2915,7 +2915,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
-   update_cpu_load_active(rq);
+   cpu_load_update_active(rq);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fe30e66..f33764d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4491,7 +4491,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 }
 
 /**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * __cpu_load_update - update the rq->cpu_load[] statistics
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
@@ -4526,7 +4526,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
  * term. See the @active paramter.
  */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
  unsigned long pending_updates, int active)
 {
unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
@@ -4574,7 +4574,7 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __update_cpu_load_nohz(struct rq *this_rq,
+static void __cpu_load_update_nohz(struct rq *this_rq,
   unsigned long curr_jiffies,
   unsigned long load,
   int active)
@@ -4589,7 +4589,7 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
 

Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-12 Thread Peter Zijlstra
On Mon, Apr 11, 2016 at 08:21:31PM +0200, Frederic Weisbecker wrote:
> On Mon, Apr 11, 2016 at 10:53:01AM -0400, Chris Metcalf wrote:
> > On 4/11/2016 9:18 AM, Frederic Weisbecker wrote:
> > >So I tried and it warns about the unused variable tickless_load, so I
> > >would need two scattered ifdeffery in the function:
> > >
> > >@@ -4528,7 +4529,9 @@ decay_load_missed(unsigned long load, unsigned long 
> > >missed_updates, int idx)
> > >  static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
> > >   unsigned long pending_updates)
> > >  {
> > >+#ifdef CONFIG_NO_HZ_COMMON
> > >   unsigned long tickless_load = this_rq->cpu_load[0];
> > >+#endif
> > 
> > Just move the initialization down to the first use, as a regular
> > assignment, and add __maybe_unused to the declaration, and the compiler
> > will then keep quiet (see Documentation/CodingStyle).
> > 
> > I have no comment on which of the approaches looks better overall,
> > but I think using __maybe_unused definitely improves this approach.
> 
> I thought about it yeah. I usually avoid __maybe_unused because it's often
> a bad sign concerning the code layout.
> 
> Now in this precise case I wouldn't mind though. Peter what's your opinion?

Sure, go with __maybe_unused.


Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-11 Thread Frederic Weisbecker
On Mon, Apr 11, 2016 at 10:53:01AM -0400, Chris Metcalf wrote:
> On 4/11/2016 9:18 AM, Frederic Weisbecker wrote:
> >So I tried and it warns about the unused variable tickless_load, so I
> >would need two scattered ifdeffery in the function:
> >
> >@@ -4528,7 +4529,9 @@ decay_load_missed(unsigned long load, unsigned long 
> >missed_updates, int idx)
> >  static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
> > unsigned long pending_updates)
> >  {
> >+#ifdef CONFIG_NO_HZ_COMMON
> > unsigned long tickless_load = this_rq->cpu_load[0];
> >+#endif
> 
> Just move the initialization down to the first use, as a regular
> assignment, and add __maybe_unused to the declaration, and the compiler
> will then keep quiet (see Documentation/CodingStyle).
> 
> I have no comment on which of the approaches looks better overall,
> but I think using __maybe_unused definitely improves this approach.

I thought about it yeah. I usually avoid __maybe_unused because it's often
a bad sign concerning the code layout.

Now in this precise case I wouldn't mind though. Peter what's your opinion?

Thanks.


Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-11 Thread Chris Metcalf

On 4/11/2016 9:18 AM, Frederic Weisbecker wrote:

So I tried and it warns about the unused variable tickless_load, so I
would need two scattered ifdeffery in the function:

@@ -4528,7 +4529,9 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
unsigned long pending_updates)
  {
+#ifdef CONFIG_NO_HZ_COMMON
unsigned long tickless_load = this_rq->cpu_load[0];
+#endif


Just move the initialization down to the first use, as a regular
assignment, and add __maybe_unused to the declaration, and the compiler
will then keep quiet (see Documentation/CodingStyle).

I have no comment on which of the approaches looks better overall,
but I think using __maybe_unused definitely improves this approach.
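
A minimal standalone illustration of that suggestion (not the actual fair.c
hunk; it assumes __maybe_unused expands to __attribute__((unused)) as in the
kernel's compiler headers):

#include <stdio.h>

#define __maybe_unused __attribute__((unused))	/* stand-in for the kernel macro */

static unsigned long cpu_load0 = 100;	/* stand-in for this_rq->cpu_load[0] */

static void cpu_load_update_demo(void)
{
	unsigned long __maybe_unused tickless_load;

#ifdef CONFIG_NO_HZ_COMMON
	tickless_load = cpu_load0;	/* first use: a regular assignment */
	printf("tickless_load = %lu\n", tickless_load);
#endif
}

int main(void)
{
	cpu_load_update_demo();		/* builds warning-free with or without NO_HZ */
	return 0;
}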
--
Chris Metcalf, Mellanox Technologies
http://www.mellanox.com



Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-11 Thread Frederic Weisbecker
On Fri, Apr 08, 2016 at 07:44:14PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 08, 2016 at 02:55:22PM +0200, Frederic Weisbecker wrote:
> > > > @@ -4540,17 +4568,8 @@ static void cpu_load_update(struct rq *this_rq, 
> > > > unsigned long this_load,
> > > >  
> > > > /* scale is effectively 1 << i now, and >> i divides by 
> > > > scale */
> > > >  
> > > > -   old_load = this_rq->cpu_load[i];
> > > #ifdef CONFIG_NO_HZ_COMMON
> > > > -   old_load = decay_load_missed(old_load, pending_updates 
> > > > - 1, i);
> > > > -   if (tickless_load) {
> > > > -   old_load -= decay_load_missed(tickless_load, 
> > > > pending_updates - 1, i);
> > > > -   /*
> > > > -* old_load can never be a negative value 
> > > > because a
> > > > -* decayed tickless_load cannot be greater than 
> > > > the
> > > > -* original tickless_load.
> > > > -*/
> > > > -   old_load += tickless_load;
> > > > -   }
> > > #endif
> > 
> > Ah sure, if you prefer it that way, I can do that.
> 
> Yes please. I normally favour the thing you did, but here it makes
> tricky code that much harder to read.

So I tried and it warns about the unused variable tickless_load, so I
would need two scattered ifdeffery in the function:

@@ -4528,7 +4529,9 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
unsigned long pending_updates)
 {
+#ifdef CONFIG_NO_HZ_COMMON
unsigned long tickless_load = this_rq->cpu_load[0];
+#endif
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4541,6 +4544,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
 
old_load = this_rq->cpu_load[i];
+#ifdef CONFIG_NO_HZ_COMMON
old_load = decay_load_missed(old_load, pending_updates - 1, i);
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, 
pending_updates - 1, i);
@@ -4551,6 +4555,7 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
 */
old_load += tickless_load;
}
+#endif
new_load = this_load;
/*
 * Round up the averaging division if load is increasing. This


Are you still sure you don't want the ifdeffed inline function?

Thanks.


Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-08 Thread Peter Zijlstra
On Fri, Apr 08, 2016 at 02:55:22PM +0200, Frederic Weisbecker wrote:
> > > @@ -4540,17 +4568,8 @@ static void cpu_load_update(struct rq *this_rq, 
> > > unsigned long this_load,
> > >  
> > >   /* scale is effectively 1 << i now, and >> i divides by scale */
> > >  
> > > - old_load = this_rq->cpu_load[i];
> > #ifdef CONFIG_NO_HZ_COMMON
> > > - old_load = decay_load_missed(old_load, pending_updates - 1, i);
> > > - if (tickless_load) {
> > > - old_load -= decay_load_missed(tickless_load, 
> > > pending_updates - 1, i);
> > > - /*
> > > -  * old_load can never be a negative value because a
> > > -  * decayed tickless_load cannot be greater than the
> > > -  * original tickless_load.
> > > -  */
> > > - old_load += tickless_load;
> > > - }
> > #endif
> 
> Ah sure, if you prefer it that way, I can do that.

Yes please. I normally favour the thing you did, but here it makes
tricky code that much harder to read.


Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-08 Thread Peter Zijlstra
On Fri, Apr 08, 2016 at 02:53:21PM +0200, Frederic Weisbecker wrote:
> On Fri, Apr 08, 2016 at 11:41:54AM +0200, Peter Zijlstra wrote:
> > On Fri, Apr 08, 2016 at 03:07:12AM +0200, Frederic Weisbecker wrote:
> > > +void cpu_load_update_nohz_start(void)
> > >  {
> > >   struct rq *this_rq = this_rq();
> > > +
> > > + /*
> > > +  * This is all lockless but should be fine. If weighted_cpuload changes
> > > +  * concurrently we'll exit nohz. And cpu_load write can race with
> > > +  * cpu_load_update_idle() but both updater would be writing the same.
> > > +  */
> > > + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
> > > +}
> > 
> > There is more to this; this also updates ->cpu_load[0] at possibly much
> > higher frequency than we've done before, while not updating the other
> > ->cpu_load[] members.
> > 
> > Now, I'm not sure we care, but it is a bit odd.
> 
> This is right. cpu_load[0] is aimed at showing the most recent load,
> without knowing when was the last update (the last tick/update could
> have been recent or not, the readers shouldn't care). Now we can
> indeed worry about it if this field is read altogether with the other
> indexes and those are put into some relation. But it seems that each
> of the rq->cpu_load[i] are read independently without relation or
> comparison. Now really I'm saying that without much assurance as I
> don't know the details of the load balancing code.
> 
> If in doubt I can create a field in struct rq to record the tickless
> load instead of storing it in cpu_load[0]. That was in fact the first
> direction I took in my drafts.

Yeah, no I think this is fine, just maybe wants a comment.

> > > +/*
> > > + * Account the tickless load in the end of a nohz frame.
> > > + */
> > > +void cpu_load_update_nohz_stop(void)
> > > +{
> > >   unsigned long curr_jiffies = READ_ONCE(jiffies);
> > > + struct rq *this_rq = this_rq();
> > > + unsigned long load;
> > >  
> > >   if (curr_jiffies == this_rq->last_load_update_tick)
> > >   return;
> > >  
> > > + load = weighted_cpuload(cpu_of(this_rq));
> > >   raw_spin_lock(&this_rq->lock);
> > > + cpu_load_update_nohz(this_rq, curr_jiffies, load);
> > >   raw_spin_unlock(&this_rq->lock);
> > >  }
> > 
> > And this makes us take rq->lock when waking from nohz; a bit
> > unfortunate. Do we really need this though? 
> 
> Note it was already the case before this patchset, the function was called
> cpu_load_update_nohz() instead.

Ah, ok. I got lost in the whole rename + nohz maze (again). OK no
problem then.


Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-08 Thread Frederic Weisbecker
On Fri, Apr 08, 2016 at 12:48:21PM +0200, Peter Zijlstra wrote:
> On Fri, Apr 08, 2016 at 03:07:13AM +0200, Frederic Weisbecker wrote:
> > index 4c522a7..59a2821 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -7327,8 +7327,9 @@ void __init sched_init(void)
> >  
> > for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
> > rq->cpu_load[j] = 0;
> > -
> > +#ifdef CONFIG_NO_HZ_COMMON
> > rq->last_load_update_tick = jiffies;
> > +#endif
> >  
> >  #ifdef CONFIG_SMP
> > rq->sd = NULL;
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 1dd864d..4618e5b 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4661,8 +4680,10 @@ static inline void cpu_load_update_nohz(struct rq 
> > *this_rq,
> >  
> >  static void cpu_load_update_periodic(struct rq *this_rq, unsigned long 
> > load)
> >  {
> > +#ifdef CONFIG_NO_HZ_COMMON
> > /* See the mess around cpu_load_update_nohz(). */
> > this_rq->last_load_update_tick = READ_ONCE(jiffies);
> > +#endif
> > cpu_load_update(this_rq, load, 1);
> >  }
> >  
> 
> Here you do the simple #ifdef, while here you make a giant mess instead
> of the relatively straight forward:
> 
> > @@ -4540,17 +4568,8 @@ static void cpu_load_update(struct rq *this_rq, 
> > unsigned long this_load,
> >  
> > /* scale is effectively 1 << i now, and >> i divides by scale */
> >  
> > -   old_load = this_rq->cpu_load[i];
> #ifdef CONFIG_NO_HZ_COMMON
> > -   old_load = decay_load_missed(old_load, pending_updates - 1, i);
> > -   if (tickless_load) {
> > -   old_load -= decay_load_missed(tickless_load, 
> > pending_updates - 1, i);
> > -   /*
> > -* old_load can never be a negative value because a
> > -* decayed tickless_load cannot be greater than the
> > -* original tickless_load.
> > -*/
> > -   old_load += tickless_load;
> > -   }
> #endif

Ah sure, if you prefer it that way, I can do that.

Thanks.


Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-08 Thread Frederic Weisbecker
On Fri, Apr 08, 2016 at 11:41:54AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 08, 2016 at 03:07:12AM +0200, Frederic Weisbecker wrote:
> > +void cpu_load_update_nohz_start(void)
> >  {
> > struct rq *this_rq = this_rq();
> > +
> > +   /*
> > +* This is all lockless but should be fine. If weighted_cpuload changes
> > +* concurrently we'll exit nohz. And cpu_load write can race with
> > +* cpu_load_update_idle() but both updater would be writing the same.
> > +*/
> > +   this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
> > +}
> 
> There is more to this; this also updates ->cpu_load[0] at possibly much
> higher frequency than we've done before, while not updating the other
> ->cpu_load[] members.
> 
> Now, I'm not sure we care, but it is a bit odd.

This is right. cpu_load[0] is aimed at showing the most recent load,
without knowing when was the last update (the last tick/update could have
been recent or not, the readers shouldn't care). Now we can indeed worry
about it if this field is read altogether with the other indexes and those
are put into some relation. But it seems that each of the rq->cpu_load[i]
are read independently without relation or comparison. Now really I'm
saying that without much assurance as I don't know the details of the load
balancing code.

If in doubt I can create a field in struct rq to record the tickless load
instead of storing it in cpu_load[0]. That was in fact the first direction
I took in my drafts.

> 
> > +/*
> > + * Account the tickless load in the end of a nohz frame.
> > + */
> > +void cpu_load_update_nohz_stop(void)
> > +{
> > unsigned long curr_jiffies = READ_ONCE(jiffies);
> > +   struct rq *this_rq = this_rq();
> > +   unsigned long load;
> >  
> > if (curr_jiffies == this_rq->last_load_update_tick)
> > return;
> >  
> > +   load = weighted_cpuload(cpu_of(this_rq));
> > raw_spin_lock(&this_rq->lock);
> > +   cpu_load_update_nohz(this_rq, curr_jiffies, load);
> > raw_spin_unlock(&this_rq->lock);
> >  }
> 
> And this makes us take rq->lock when waking from nohz; a bit
> unfortunate. Do we really need this though? 

Note it was already the case before this patchset, the function was called
cpu_load_update_nohz() instead.

I think we need it, at least due to remote updates performed by
cpu_load_update_idle() (which only updates when load is 0 though).

> Will not a tick be forthcoming real-soon-now?

No guarantee about that. If the next task only runs for less than 10ms
(in HZ=100), there might be no tick until the next idle period.

Besides in nohz_full, the next task may be running tickless as well, in
which case we really need to update now.
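
For a reader skimming this digest, the cpu_load[0] vs higher-index
distinction above can be seen in a toy user-space model of the per-index
averaging (illustrative only: the array name and CPU_LOAD_IDX_MAX mirror
the kernel, the sample values are made up, and the real __cpu_load_update()
additionally folds in missed updates via decay_load_missed() and rounds the
division up when load increases):

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 0 };
	unsigned long samples[] = { 1024, 1024, 0, 0, 0, 2048, 2048, 2048 };
	unsigned int t, i;

	for (t = 0; t < sizeof(samples) / sizeof(samples[0]); t++) {
		for (i = 0; i < CPU_LOAD_IDX_MAX; i++) {
			unsigned long scale = 1UL << i;

			/* index 0 tracks the raw sample, index i is an EWMA with weight 1/2^i */
			cpu_load[i] = (cpu_load[i] * (scale - 1) + samples[t]) >> i;
		}
		printf("tick %u: [0]=%4lu [1]=%4lu [2]=%4lu [3]=%4lu [4]=%4lu\n",
		       t, cpu_load[0], cpu_load[1], cpu_load[2],
		       cpu_load[3], cpu_load[4]);
	}
	return 0;
}

Index 0 jumps instantly between 0 and 2048 while index 4 barely moves,
which is why a more frequent lockless write to cpu_load[0] alone changes
little for readers that treat each index independently.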


Re: [PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-08 Thread Peter Zijlstra
On Fri, Apr 08, 2016 at 03:07:13AM +0200, Frederic Weisbecker wrote:
> index 4c522a7..59a2821 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7327,8 +7327,9 @@ void __init sched_init(void)
>  
>   for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
>   rq->cpu_load[j] = 0;
> -
> +#ifdef CONFIG_NO_HZ_COMMON
>   rq->last_load_update_tick = jiffies;
> +#endif
>  
>  #ifdef CONFIG_SMP
>   rq->sd = NULL;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1dd864d..4618e5b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4661,8 +4680,10 @@ static inline void cpu_load_update_nohz(struct rq 
> *this_rq,
>  
>  static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
>  {
> +#ifdef CONFIG_NO_HZ_COMMON
>   /* See the mess around cpu_load_update_nohz(). */
>   this_rq->last_load_update_tick = READ_ONCE(jiffies);
> +#endif
>   cpu_load_update(this_rq, load, 1);
>  }
>  

Here you do the simple #ifdef, while here you make a giant mess instead
of the relatively straight forward:

> @@ -4540,17 +4568,8 @@ static void cpu_load_update(struct rq *this_rq, 
> unsigned long this_load,
>  
>   /* scale is effectively 1 << i now, and >> i divides by scale */
>  
> - old_load = this_rq->cpu_load[i];
#ifdef CONFIG_NO_HZ_COMMON
> - old_load = decay_load_missed(old_load, pending_updates - 1, i);
> - if (tickless_load) {
> - old_load -= decay_load_missed(tickless_load, 
> pending_updates - 1, i);
> - /*
> -  * old_load can never be a negative value because a
> -  * decayed tickless_load cannot be greater than the
> -  * original tickless_load.
> -  */
> - old_load += tickless_load;
> - }
#endif
>   new_load = this_load;
>   /*
>* Round up the averaging division if load is increasing. This
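
The block Peter suggests wrapping in #ifdef implements the tickless_load
correction, which relies on the decay being linear in the load: decaying
old_load and the recorded tickless_load separately and re-adding the
non-decayed tickless_load equals decaying only the part of the load that
predates the nohz frame. A floating-point sketch of that identity
(illustration only; the kernel's decay_load_missed() uses precomputed
fixed-point tables rather than pow(), and the numbers below are made up):

#include <stdio.h>
#include <math.h>

/* load[i]_n = (1 - 1/2^i)^n * load[i]_0, cf. the quoted kernel doc comment */
static double decay(double load, unsigned int missed, int idx)
{
	return load * pow(1.0 - 1.0 / (double)(1 << idx), missed);
}

int main(void)
{
	double old_load = 1024.0, tickless_load = 256.0;
	unsigned int pending_updates = 4;
	int idx = 2;

	/* the form quoted above */
	double a = decay(old_load, pending_updates - 1, idx)
		 - decay(tickless_load, pending_updates - 1, idx)
		 + tickless_load;

	/* equivalent: only the load present before the nohz frame decays */
	double b = decay(old_load - tickless_load, pending_updates - 1, idx)
		 + tickless_load;

	printf("%.1f %.1f\n", a, b);	/* both print 580.0 */
	return 0;
}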




Re: [PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-08 Thread Peter Zijlstra
On Fri, Apr 08, 2016 at 03:07:12AM +0200, Frederic Weisbecker wrote:
> +void cpu_load_update_nohz_start(void)
>  {
>   struct rq *this_rq = this_rq();
> +
> + /*
> +  * This is all lockless but should be fine. If weighted_cpuload changes
> +  * concurrently we'll exit nohz. And cpu_load write can race with
> +  * cpu_load_update_idle() but both updater would be writing the same.
> +  */
> + this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
> +}

There is more to this; this also updates ->cpu_load[0] at possibly much
higher frequency than we've done before, while not updating the other
->cpu_load[] members.

Now, I'm not sure we care, but it is a bit odd.

> +/*
> + * Account the tickless load in the end of a nohz frame.
> + */
> +void cpu_load_update_nohz_stop(void)
> +{
>   unsigned long curr_jiffies = READ_ONCE(jiffies);
> + struct rq *this_rq = this_rq();
> + unsigned long load;
>  
>   if (curr_jiffies == this_rq->last_load_update_tick)
>   return;
>  
> + load = weighted_cpuload(cpu_of(this_rq));
>   raw_spin_lock(&this_rq->lock);
> + cpu_load_update_nohz(this_rq, curr_jiffies, load);
>   raw_spin_unlock(&this_rq->lock);
>  }

And this makes us take rq->lock when waking from nohz; a bit
unfortunate. Do we really need this though? Will not a tick be
forthcoming real-soon-now?


[PATCH 1/3] sched: Gather cpu load functions under a more conventional namespace

2016-04-07 Thread Frederic Weisbecker
The cpu load update related functions have a weak naming convention
currently, starting with update_cpu_load_*() which isn't ideal as
"update" is a very generic concept.

Since two of these functions are public already (and a third is to come)
that's enough to introduce a more conventional naming scheme. So let's
do the following rename instead:

update_cpu_load_*() -> cpu_load_update_*()

Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Signed-off-by: Frederic Weisbecker 
---
 Documentation/trace/ftrace.txt | 10 +-
 include/linux/sched.h  |  4 ++--
 kernel/sched/core.c|  2 +-
 kernel/sched/fair.c| 24 
 kernel/sched/sched.h   |  4 ++--
 kernel/time/tick-sched.c   |  2 +-
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index f52f297..9857606 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set.
   -0   3dN.1   12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
   -0   3dN.1   12us : ktime_get <-tick_nohz_idle_exit
   -0   3dN.1   12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
-  -0   3dN.1   13us : update_cpu_load_nohz <-tick_nohz_idle_exit
-  -0   3dN.1   13us : _raw_spin_lock <-update_cpu_load_nohz
+  -0   3dN.1   13us : cpu_load_update_nohz <-tick_nohz_idle_exit
+  -0   3dN.1   13us : _raw_spin_lock <-cpu_load_update_nohz
   -0   3dN.1   13us : add_preempt_count <-_raw_spin_lock
-  -0   3dN.2   13us : __update_cpu_load <-update_cpu_load_nohz
-  -0   3dN.2   14us : sched_avg_update <-__update_cpu_load
-  -0   3dN.2   14us : _raw_spin_unlock <-update_cpu_load_nohz
+  -0   3dN.2   13us : __cpu_load_update <-cpu_load_update_nohz
+  -0   3dN.2   14us : sched_avg_update <-__cpu_load_update
+  -0   3dN.2   14us : _raw_spin_unlock <-cpu_load_update_nohz
   -0   3dN.2   14us : sub_preempt_count <-_raw_spin_unlock
   -0   3dN.1   15us : calc_load_exit_idle <-tick_nohz_idle_exit
   -0   3dN.1   15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52c4847..aa5ee0d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,9 @@ extern void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz(int active);
 #else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz(int active) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b489fc..4c522a7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2915,7 +2915,7 @@ void scheduler_tick(void)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
-   update_cpu_load_active(rq);
+   cpu_load_update_active(rq);
calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0fe30e66..f33764d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4491,7 +4491,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
 }
 
 /**
- * __update_cpu_load - update the rq->cpu_load[] statistics
+ * __cpu_load_update - update the rq->cpu_load[] statistics
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
@@ -4526,7 +4526,7 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
  * term. See the @active paramter.
  */
-static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
+static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
  unsigned long pending_updates, int active)
 {
unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
@@ -4574,7 +4574,7 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __update_cpu_load_nohz(struct rq *this_rq,
+static void __cpu_load_update_nohz(struct rq *this_rq,
   unsigned long curr_jiffies,
   unsigned long load,
   int active)
@@ -4589,7 +4589,7 @@ static void __update_cpu_load_nohz(struct rq *this_rq,
 

[PATCH 3/3] sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates

2016-04-07 Thread Frederic Weisbecker
Some code in cpu load update only concerns NO_HZ configs but it is
built on all configurations. When NO_HZ isn't built, that code is harmless
but just happens to take some useless resources in CPU and memory:

1) one useless field in struct rq
2) jiffies record on every tick that is never used (cpu_load_update_periodic)
3) decay_load_missed is called two times on every tick to eventually
   return immediately with no action taken. And that function is dead
   code.

For pure optimization purposes, let's conditionally build the NO_HZ
related code.

Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Ingo Molnar 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Signed-off-by: Frederic Weisbecker 
---
 kernel/sched/core.c  |  3 ++-
 kernel/sched/fair.c  | 43 ---
 kernel/sched/sched.h |  6 --
 3 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4c522a7..59a2821 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7327,8 +7327,9 @@ void __init sched_init(void)
 
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
-
+#ifdef CONFIG_NO_HZ_COMMON
rq->last_load_update_tick = jiffies;
+#endif
 
 #ifdef CONFIG_SMP
rq->sd = NULL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1dd864d..4618e5b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4423,6 +4423,7 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
 }
 
 #ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
 
 /*
  * per rq 'load' arrray crap; XXX kill this.
@@ -4490,6 +4491,33 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
return load;
 }
 
+static unsigned long
+cpu_load_update_missed(unsigned long old_load, unsigned long tickless_load,
+  unsigned long pending_updates, int idx)
+{
+   old_load = decay_load_missed(old_load, pending_updates - 1, idx);
+   if (tickless_load) {
+   old_load -= decay_load_missed(tickless_load, pending_updates - 
1, idx);
+   /*
+* old_load can never be a negative value because a
+* decayed tickless_load cannot be greater than the
+* original tickless_load.
+*/
+   old_load += tickless_load;
+   }
+   return old_load;
+}
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline unsigned long
+cpu_load_update_missed(unsigned long old_load, unsigned long tickless_load,
+  unsigned long pending_updates, int idx)
+{
+   return old_load;
+}
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
 /**
  * __cpu_load_update - update the rq->cpu_load[] statistics
  * @this_rq: The rq to update statistics for
@@ -4540,17 +4568,8 @@ static void cpu_load_update(struct rq *this_rq, unsigned 
long this_load,
 
/* scale is effectively 1 << i now, and >> i divides by scale */
 
-   old_load = this_rq->cpu_load[i];
-   old_load = decay_load_missed(old_load, pending_updates - 1, i);
-   if (tickless_load) {
-   old_load -= decay_load_missed(tickless_load, 
pending_updates - 1, i);
-   /*
-* old_load can never be a negative value because a
-* decayed tickless_load cannot be greater than the
-* original tickless_load.
-*/
-   old_load += tickless_load;
-   }
+   old_load = cpu_load_update_missed(this_rq->cpu_load[i],
+ tickless_load, 
pending_updates, i);
new_load = this_load;
/*
 * Round up the averaging division if load is increasing. This
@@ -4661,8 +4680,10 @@ static inline void cpu_load_update_nohz(struct rq 
*this_rq,
 
 static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
 {
+#ifdef CONFIG_NO_HZ_COMMON
/* See the mess around cpu_load_update_nohz(). */
this_rq->last_load_update_tick = READ_ONCE(jiffies);
+#endif
cpu_load_update(this_rq, load, 1);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1802013..2302bb6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -585,11 +585,13 @@ struct rq {
 #endif
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+#ifdef CONFIG_NO_HZ_COMMON
+#ifdef CONFIG_SMP
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ_COMMON
+#endif /* CONFIG_SMP */
u64 nohz_stamp;
unsigned long nohz_flags;
-#endif
+#endif /* CONFIG_NO_HZ_COMMON */
 #ifdef CONFIG_NO_HZ_FULL
unsigned long last_sched_tick;
 #endif
-- 
2.7.0
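
The helper-plus-stub shape used in this patch compiles standalone as well,
which is a quick way to see why the !CONFIG_NO_HZ_COMMON build loses the
dead decay path without sprinkling #ifdefs over the caller. A toy version
(only the cpu_load_update_missed() name and the CONFIG_NO_HZ_COMMON switch
mirror the patch; the decay arithmetic here is made up for illustration):

#include <stdio.h>

#ifdef CONFIG_NO_HZ_COMMON
static unsigned long
cpu_load_update_missed(unsigned long old_load, unsigned long tickless_load,
		       unsigned long pending_updates, int idx)
{
	unsigned long i;

	(void)idx;	/* the real code keys its decay table on idx; this toy does not */

	/* stand-in decay: halve the pre-nohz part once per missed update */
	for (i = 0; i + 1 < pending_updates; i++)
		old_load = tickless_load + ((old_load - tickless_load) >> 1);
	return old_load;
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline unsigned long
cpu_load_update_missed(unsigned long old_load, unsigned long tickless_load,
		       unsigned long pending_updates, int idx)
{
	/* periodic kernel: there are never missed updates to decay */
	(void)tickless_load;
	(void)pending_updates;
	(void)idx;
	return old_load;
}
#endif

int main(void)
{
	/* assumes old_load >= tickless_load, as on the nohz path */
	printf("%lu\n", cpu_load_update_missed(1024, 256, 4, 1));
	return 0;
}

Built with -DCONFIG_NO_HZ_COMMON this prints the decayed value (352); built
without it the call collapses to the identity and prints 1024.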



[PATCH 0/3] sched: Fix/improve nohz cpu load updates v2

2016-04-07 Thread Frederic Weisbecker
This series tries to fix issues with CFS cpu load accounting in
nohz configs (both idle and full nohz). Some optimizations come along.

Following Peterz's reviews, I've tried to improve the changelogs. Comments
have been updated as well and some changes have been refactored.

git://git.kernel.org/pub/scm/linux/kernel/git/frederic/linux-dynticks.git
sched/nohz-v2

HEAD: 55cb7c1a25acb140253ca4e55f8d43cd31404b55

Thanks,
Frederic
---

Frederic Weisbecker (3):
  sched: Gather cpu load functions under a more conventional namespace
  sched: Correctly handle nohz ticks cpu load accounting
  sched: Optimize !CONFIG_NO_HZ_COMMON cpu load updates


 Documentation/trace/ftrace.txt |  10 +--
 include/linux/sched.h  |   6 +-
 kernel/sched/core.c|   5 +-
 kernel/sched/fair.c| 146 +++--
 kernel/sched/sched.h   |  10 +--
 kernel/time/tick-sched.c   |   9 +--
 6 files changed, 120 insertions(+), 66 deletions(-)


[PATCH 2/3] sched: Correctly handle nohz ticks cpu load accounting

2016-04-07 Thread Frederic Weisbecker
Ticks can happen while the CPU is in dynticks-idle or dynticks-singletask
mode. In fact "nohz" or "dynticks" only mean that we exit the periodic
mode and we try to minimize the ticks as much as possible. The nohz
subsystem uses a confusing terminology with the internal state
"ts->tick_stopped" which is also available through its public interface
with tick_nohz_tick_stopped(). This is a misnomer as the tick is instead
reduced with the best effort rather than stopped. In the best case the
tick can indeed be actually stopped but there is no guarantee about that.
If a timer needs to fire one second later, a tick will fire while the
CPU is in nohz mode and this is a very common scenario.

Now this confusion happens to be a problem with cpu load updates:
cpu_load_update_active() doesn't handle nohz ticks correctly because it
assumes that ticks are completely stopped in nohz mode and that
cpu_load_update_active() can't be called in dynticks mode. When that
happens, the whole previous tickless load is ignored and the function
just records the load for the current tick, ignoring potentially long
idle periods behind.

In order to solve this, we could account the current load for the
previous nohz time but there is a risk that we account the load of a
task that got freshly enqueued for the whole nohz period.

So instead, let's record the dynticks load on nohz frame entry so we know
what to record in case of nohz ticks, then use this record to account
the tickless load on nohz ticks and nohz frame end.

Cc: Byungchul Park 
Cc: Chris Metcalf 
Cc: Christoph Lameter 
Cc: Ingo Molnar 
Cc: Luiz Capitulino 
Cc: Mike Galbraith 
Cc: Paul E. McKenney 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Signed-off-by: Frederic Weisbecker 
---
 include/linux/sched.h|  6 ++-
 kernel/sched/fair.c  | 95 +++-
 kernel/time/tick-sched.c |  9 +++--
 3 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index aa5ee0d..07b1b1e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -178,9 +178,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, 
unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void cpu_load_update_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
 #else
-static inline void cpu_load_update_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
 #endif
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f33764d..1dd864d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4495,7 +4495,6 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  * @this_rq: The rq to update statistics for
  * @this_load: The current load
  * @pending_updates: The number of missed updates
- * @active: !0 for NOHZ_FULL
  *
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
@@ -4524,12 +4523,12 @@ decay_load_missed(unsigned long load, unsigned long 
missed_updates, int idx)
  *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
  *
  * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
- * term. See the @active paramter.
+ * term.
  */
-static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
- unsigned long pending_updates, int active)
+static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
+   unsigned long pending_updates)
 {
-   unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
+   unsigned long tickless_load = this_rq->cpu_load[0];
int i, scale;
 
this_rq->nr_load_updates++;
@@ -4574,10 +4573,23 @@ static unsigned long weighted_cpuload(const int cpu)
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
-static void __cpu_load_update_nohz(struct rq *this_rq,
-  unsigned long curr_jiffies,
-  unsigned long load,
-  int active)
+/*
+ * There is no sane way to deal with nohz on smp when using jiffies because the
+ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
+ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
+ *
+ * Therefore we need to avoid the delta approach from the regular tick when
+ * possible since that would seriously skew the load calculation. This is why 
we
+ * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
+ * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
+ * loop exit, nohz_idle_balance, nohz full exit...)
+ *
+ * This means we might still 
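
To put numbers on the {0,2}-instead-of-{1,1} comment above (an
illustration made up for this digest, not part of the patch): if another
CPU advances jiffies every 10ms while the local samples drift slightly
against it, two successive local samples can read the same jiffies value
and the next pair can see a jump of two, even though one jiffy really
passed between each pair of samples:

#include <stdio.h>

int main(void)
{
	/* remote CPU advances jiffies at t = 10, 20, 30, ... (ms) */
	double read_at[] = { 10.1, 19.9, 30.1 };	/* local samples, drifting */
	long prev = -1;
	unsigned int i;

	for (i = 0; i < 3; i++) {
		long jiffies = (long)(read_at[i] / 10.0);	/* remote updates seen so far */

		if (prev >= 0)
			printf("delta = %ld\n", jiffies - prev);	/* prints 0, then 2 */
		prev = jiffies;
	}
	return 0;
}

This is the skew the comment refers to, and why the delta-based path is
reserved for updates that happen while in nohz mode.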


Re: [PATCH 3/4] sched: Optimize tick periodic cpu load updates

2016-04-02 Thread Frederic Weisbecker
On Sat, Apr 02, 2016 at 09:23:37AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 03:23:06PM +0200, Frederic Weisbecker wrote:
> > Don't bother with the whole pending tickless cpu load machinery if
> > we run a tick periodic kernel. That's less job for the CPU on ticks.
> 
> Again, the changelog really could use help. Is this a pure optimization
> patch? If so, do you have numbers?

Well, until now we have always tried to keep the nohz code under ifdef,
for optimization and kernel size. I haven't measured it though; I guess
the gain is hardly visible.

> 
> > +++ b/kernel/sched/sched.h
> > @@ -585,8 +585,10 @@ struct rq {
> >  #endif
> > #define CPU_LOAD_IDX_MAX 5
> > unsigned long cpu_load[CPU_LOAD_IDX_MAX];
> > +#ifdef CONFIG_NO_HZ_COMMON
> > +# ifdef CONFIG_SMP
> 
> I'm not a fan of this #ifdef indenting and nothing near there uses this
> style, so please don't introduce it here.

Ok.

Thanks.

> 
> > unsigned long last_load_update_tick;
> > -#ifdef CONFIG_NO_HZ_COMMON
> > +# endif
> > u64 nohz_stamp;
> > unsigned long nohz_flags;
> >  #endif
> > -- 
> > 2.7.0
> > 


Re: [PATCH 2/4] sched: Correctly handle nohz ticks cpu load accounting

2016-04-02 Thread Frederic Weisbecker
On Sat, Apr 02, 2016 at 09:15:20AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 03:23:05PM +0200, Frederic Weisbecker wrote:
> > Ticks can happen in the middle of a nohz frame and
> 
> I'm still miffed with that.. And this changelog doesn't even explain why
> and how.

Indeed, it looks like I've cooked sloppy changelogs in this series, I'll
do another pass on all of them.

> 
> > cpu_load_update_active() doesn't handle these correctly. It forgets the
> > whole previous tickless load and just records the current tick, ignoring
> > potentially long idle periods.
> > 
> > In order to solve this, record the load on nohz frame entry so we know
> > what to record in case of nohz interruptions, then use this recorded load
> > to account the tickless load on nohz ticks and nohz frame end.
> 
> 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index f33764d..394f008 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4527,9 +4527,9 @@ decay_load_missed(unsigned long load, unsigned long 
> > missed_updates, int idx)
> >   * term. See the @active paramter.
> 
> ^^
> 
> What active parameter... you need to update that comment.

Yeah, forgot that.

Thanks.


Re: [PATCH 1/4] sched: Gather cpu load functions under a common namespace

2016-04-02 Thread Frederic Weisbecker
On Sat, Apr 02, 2016 at 09:09:08AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 01, 2016 at 03:23:04PM +0200, Frederic Weisbecker wrote:
> > This way they are easily grep'able and recognized.
> 
> Please mention the actual renames done and the new naming scheme.

Right, I'll add more details.

Thanks.


Re: [PATCH 4/4] sched: Conditionally build cpu load decay code for nohz

2016-04-02 Thread Peter Zijlstra
On Fri, Apr 01, 2016 at 03:23:07PM +0200, Frederic Weisbecker wrote:
> To complete the tick periodic kernel optimizations.

-ENOCHANGELOG


Re: [PATCH 3/4] sched: Optimize tick periodic cpu load updates

2016-04-02 Thread Peter Zijlstra
On Fri, Apr 01, 2016 at 03:23:06PM +0200, Frederic Weisbecker wrote:
> Don't bother with the whole pending tickless cpu load machinery if
> we run a tick periodic kernel. That's less job for the CPU on ticks.

Again, the changelog really could use help. Is this a pure optimization
patch? If so, do you have numbers?

> +++ b/kernel/sched/sched.h
> @@ -585,8 +585,10 @@ struct rq {
>  #endif
>   #define CPU_LOAD_IDX_MAX 5
>   unsigned long cpu_load[CPU_LOAD_IDX_MAX];
> +#ifdef CONFIG_NO_HZ_COMMON
> +# ifdef CONFIG_SMP

I'm not a fan of this #ifdef indenting and nothing near there uses this
style, so please don't introduce it here.

>   unsigned long last_load_update_tick;
> -#ifdef CONFIG_NO_HZ_COMMON
> +# endif
>   u64 nohz_stamp;
>   unsigned long nohz_flags;
>  #endif
> -- 
> 2.7.0
> 


Re: [PATCH 2/4] sched: Correctly handle nohz ticks cpu load accounting

2016-04-02 Thread Peter Zijlstra
On Fri, Apr 01, 2016 at 03:23:05PM +0200, Frederic Weisbecker wrote:
> Ticks can happen in the middle of a nohz frame and

I'm still miffed with that.. And this changelog doesn't even explain why
and how.

> cpu_load_update_active() doesn't handle these correctly. It forgets the
> whole previous tickless load and just records the current tick, ignoring
> potentially long idle periods.
> 
> In order to solve this, record the load on nohz frame entry so we know
> what to record in case of nohz interruptions, then use this recorded load
> to account the tickless load on nohz ticks and nohz frame end.


> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f33764d..394f008 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4527,9 +4527,9 @@ decay_load_missed(unsigned long load, unsigned long 
> missed_updates, int idx)
>   * term. See the @active paramter.

^^

What active parameter... you need to update that comment.

>   */
>  static void __cpu_load_update(struct rq *this_rq, unsigned long this_load,
> -   unsigned long pending_updates, int active)
> +   unsigned long pending_updates)
>  {
> - unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
> + unsigned long tickless_load = this_rq->cpu_load[0];
>   int i, scale;
>  
>   this_rq->nr_load_updates++;


Re: [PATCH 1/4] sched: Gather cpu load functions under a common namespace

2016-04-02 Thread Peter Zijlstra
On Fri, Apr 01, 2016 at 03:23:04PM +0200, Frederic Weisbecker wrote:
> This way they are easily grep'able and recognized.

Please mention the actual renames done and the new naming scheme.


