[PATCH v2 5/5] sched/pelt: Remove shift of thermal clock

2023-12-21 Thread Vincent Guittot
The optional shift of the clock used by the thermal/HW load average was
introduced to handle cases where the signal was not always a high-frequency
HW signal. Now that cpufreq provides a separate signal for firmware and
SW pressure, we can remove this exception and always keep this PELT signal
aligned with the other signals.
Mark the sched_thermal_decay_shift boot parameter as deprecated.
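
As a rough illustration (not part of the patch), the removed shift only
divided the clock fed to the thermal/HW PELT signal, stretching its
effective decay period in multiples of the default 32ms half-life:

	/* illustrative sketch only; the shift was clamped to [0, 10] */
	static inline u64 old_hw_clock(u64 task_clock, int shift)
	{
		/* effective decay period ~= 32ms << shift */
		return task_clock >> shift;
	}

With this patch the HW pressure signal uses rq_clock_task() directly,
i.e. the same 32ms decay period as the other PELT signals.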

Signed-off-by: Vincent Guittot 
---
 .../admin-guide/kernel-parameters.txt  |  1 +
 kernel/sched/core.c|  2 +-
 kernel/sched/fair.c| 10 ++
 kernel/sched/sched.h   | 18 --
 4 files changed, 4 insertions(+), 27 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..2ee15522b15d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5722,6 +5722,7 @@
but is useful for debugging and performance tuning.
 
sched_thermal_decay_shift=
+   [Deprecated]
[KNL, SMP] Set a decay shift for scheduler thermal
pressure signal. Thermal pressure signal follows the
default decay period of other scheduler pelt
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6f084bdf1c5..c68e47bfd5ae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5670,7 +5670,7 @@ void scheduler_tick(void)
 
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
-   update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
+   update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ce0d32f441a8..16d71e764131 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice   
= 75ULL;
 
 const_debug unsigned int sysctl_sched_migration_cost   = 50UL;
 
-int sched_hw_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
-   int _shift = 0;
-
-   if (kstrtoint(str, 0, &_shift))
-   pr_warn("Unable to set scheduler thermal pressure decay shift 
parameter\n");
-
-   sched_hw_decay_shift = clamp(_shift, 0, 10);
+   pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
 }
 __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -9271,7 +9265,7 @@ static bool __update_blocked_others(struct rq *rq, bool 
*done)
 
	decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
		  update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
+ update_hw_load_avg(now, rq, hw_pressure) |
  update_irq_load_avg(rq, 0);
 
if (others_have_blocked(rq))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 677d24202eec..6fc6718a1060 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1520,24 +1520,6 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
 }
 
-/**
- * By default the decay is the default pelt decay period.
- * The decay shift can change the decay period in
- * multiples of 32.
- *  Decay shiftDecay period(ms)
- * 0   32
- * 1   64
- * 2   128
- * 3   256
- * 4   512
- */
-extern int sched_hw_decay_shift;
-
-static inline u64 rq_clock_hw(struct rq *rq)
-{
-   return rq_clock_task(rq) >> sched_hw_decay_shift;
-}
-
 static inline void rq_clock_skip_update(struct rq *rq)
 {
lockdep_assert_rq_held(rq);
-- 
2.34.1




[PATCH v2 4/5] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure

2023-12-21 Thread Vincent Guittot
Now that cpufreq provides a pressure value to the scheduler, rename
arch_update_thermal_pressure into arch_update_hw_pressure to reflect that it
returns a pressure applied by HW (i.e. one that changes at high frequency)
and is not always related to thermal mitigation; it can also be generated by
a max current limitation, for example. Such a high-frequency signal needs
filtering to be smoothed into a value that reflects the average available
capacity at the scheduler's time scale.
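
For illustration only, a toy exponential moving average shows the kind
of smoothing meant by "filtering" here (this is NOT the kernel's PELT
code, which accumulates 1024us segments with a ~32ms half-life):

	/* toy low-pass filter, illustrative only */
	static unsigned long toy_filter(unsigned long avg, unsigned long sample)
	{
		/* new avg = 7/8 * old avg + 1/8 * new sample */
		return (avg * 7 + sample) / 8;
	}

Feeding the raw, fast-changing HW pressure samples through such a filter
yields a slowly varying average that the scheduler can use at its own
time scale.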

Signed-off-by: Vincent Guittot 
---
 arch/arm/include/asm/topology.h   |  6 ++---
 arch/arm64/include/asm/topology.h |  6 ++---
 drivers/base/arch_topology.c  | 26 +--
 drivers/cpufreq/qcom-cpufreq-hw.c |  4 +--
 include/linux/arch_topology.h |  8 +++---
 include/linux/sched/topology.h|  8 +++---
 .../{thermal_pressure.h => hw_pressure.h} | 14 +-
 include/trace/events/sched.h  |  2 +-
 init/Kconfig  | 12 -
 kernel/sched/core.c   |  8 +++---
 kernel/sched/fair.c   | 16 ++--
 kernel/sched/pelt.c   | 18 ++---
 kernel/sched/pelt.h   | 16 ++--
 kernel/sched/sched.h  | 10 +++
 14 files changed, 77 insertions(+), 77 deletions(-)
 rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 853c4f81ba4a..ad36b6570067 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -22,9 +22,9 @@
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure   topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressuretopology_update_hw_pressure
 
 #else
 
diff --git a/arch/arm64/include/asm/topology.h 
b/arch/arm64/include/asm/topology.h
index a323b109b9c4..0f6ef432fb84 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure   topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressuretopology_update_hw_pressure
 
 #include 
 
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 0906114963ff..405af7a87008 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,7 +22,7 @@
 #include 
 
 #define CREATE_TRACE_POINTS
-#include 
+#include 
 
 static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
 static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+DEFINE_PER_CPU(unsigned long, hw_pressure);
 
 /**
- * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * topology_update_hw_pressure() - Update HW pressure for CPUs
  * @cpus: The related CPUs for which capacity has been reduced
  * @capped_freq : The maximum allowed frequency that CPUs can run at
  *
- * Update the value of thermal pressure for all @cpus in the mask. The
+ * Update the value of HW pressure for all @cpus in the mask. The
  * cpumask should include all (online+offline) affected CPUs, to avoid
  * operating on stale data when hot-plug is used for some CPUs. The
  * @capped_freq reflects the currently allowed max CPUs frequency due to
- * thermal capping. It might be also a boost frequency value, which is bigger
+ * HW capping. It might be also a boost frequency value, which is bigger
  * than the internal 'capacity_freq_ref' max frequency. In such case the
  * pressure value should simply be removed, since this is an indication that
- * there is no thermal throttling. The @capped_freq must be provided in kHz.
+ * there is no HW throttling. The @capped_freq must be provided in kHz.
  */
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
  unsigned long capped_freq)
 {
-   unsigned long max_capacity, capacity, th_pressure;
+   unsigned long max_capacity, capacity, hw_pressure;

[PATCH v2 2/5] sched: Take cpufreq feedback into account

2023-12-21 Thread Vincent Guittot
Aggregate the different pressures applied to the capacity of CPUs and
create a new function that returns the actual capacity of the CPU:
  get_actual_cpu_capacity()
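
A minimal standalone sketch (illustrative numbers, not kernel code) of
what get_actual_cpu_capacity() aggregates: the original capacity minus
the larger of the two pressures:

	/*
	 * sketch only: hw_pressure_avg is the PELT-filtered HW pressure,
	 * cpufreq_pressure the raw value provided by cpufreq
	 */
	static unsigned long sketch_actual_capacity(unsigned long arch_cap,
						    unsigned long hw_pressure_avg,
						    unsigned long cpufreq_pressure)
	{
		unsigned long pressure = hw_pressure_avg > cpufreq_pressure ?
					 hw_pressure_avg : cpufreq_pressure;

		return arch_cap - pressure;
	}

For example, with arch_cap = 1024, a filtered HW pressure of 100 and a
cpufreq pressure of 256, the actual capacity is 1024 - 256 = 768.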

Signed-off-by: Vincent Guittot 
Reviewed-by: Lukasz Luba 
---
 kernel/sched/fair.c | 45 +
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..0235081defa5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4932,13 +4932,22 @@ static inline void util_est_update(struct cfs_rq 
*cfs_rq,
	trace_sched_util_est_se_tp(&p->se);
 }
 
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+   unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+   capacity -= max(thermal_load_avg(cpu_rq(cpu)), 
cpufreq_get_pressure(cpu));
+
+   return capacity;
+}
+
 static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
 {
-   unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
+   unsigned long capacity_orig;
bool fits, uclamp_max_fits;
 
/*
@@ -4970,7 +4979,6 @@ static inline int util_fits_cpu(unsigned long util,
 * goal is to cap the task. So it's okay if it's getting less.
 */
capacity_orig = arch_scale_cpu_capacity(cpu);
-   capacity_orig_thermal = capacity_orig - 
arch_scale_thermal_pressure(cpu);
 
/*
 * We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5045,7 +5053,8 @@ static inline int util_fits_cpu(unsigned long util,
 * handle the case uclamp_min > uclamp_max.
 */
uclamp_min = min(uclamp_min, uclamp_max);
-   if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+   if (fits && (util < uclamp_min) &&
+   (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
 
return fits;
@@ -7426,7 +7435,7 @@ select_idle_capacity(struct task_struct *p, struct 
sched_domain *sd, int target)
 * Look for the CPU with best capacity.
 */
else if (fits < 0)
-   cpu_cap = arch_scale_cpu_capacity(cpu) - 
thermal_load_avg(cpu_rq(cpu));
+   cpu_cap = get_actual_cpu_capacity(cpu);
 
/*
 * First, select CPU which fits better (-1 being better than 0).
@@ -7919,8 +7928,8 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
-   unsigned long best_thermal_cap = 0;
-   unsigned long prev_thermal_cap = 0;
+   unsigned long best_actual_cap = 0;
+   unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7950,7 +7959,7 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
 
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
-   unsigned long cpu_cap, cpu_thermal_cap, util;
+   unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -7962,18 +7971,17 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
 
-   /* Account thermal pressure for the energy estimation */
+   /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
-   cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
-   cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+   cpu_actual_cap = get_actual_cpu_capacity(cpu);
 
-   eenv.cpu_cap = cpu_thermal_cap;
+   eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
 
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
 
-   eenv.pd_cap += cpu_thermal_cap;
+   eenv.pd_cap += cpu_actual_cap;
 
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8044,7 +8052,7 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
-   prev_thermal_cap = cpu_thermal_cap;
+   prev_actual_cap = cpu_actual_cap;

[PATCH v2 3/5] thermal/cpufreq: Remove arch_update_thermal_pressure()

2023-12-21 Thread Vincent Guittot
arch_update_thermal_pressure() aims to update a fast-changing signal which
must be averaged using PELT filtering before being provided to the
scheduler, which can't make smart use of a fast-changing signal.
cpufreq now provides the maximum freq_qos pressure on the capacity to the
scheduler, which includes the cpufreq cooling device. Remove the call to
arch_update_thermal_pressure() in the cpufreq cooling device as this is
now handled by cpufreq_get_pressure().
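
As a toy model (simplified names, not kernel code) of the flow this
relies on: the cooling device only updates its freq_qos max request,
freq_qos aggregates the max requests by keeping the lowest one, and
cpufreq_set_policy() then recomputes both policy->max and the cpufreq
pressure introduced in patch 1/5:

	/* toy model: two FREQ_QOS_MAX requests, in kHz */
	static unsigned long toy_resolve_policy_max(unsigned long cooling_req,
						    unsigned long userspace_req)
	{
		/* the effective max is the most restrictive request */
		return cooling_req < userspace_req ? cooling_req : userspace_req;
	}

The scheduler then reads the result back through cpufreq_get_pressure(),
so no separate arch_update_thermal_pressure() call is needed.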

Signed-off-by: Vincent Guittot 
Reviewed-by: Lukasz Luba 
---
 drivers/thermal/cpufreq_cooling.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c 
b/drivers/thermal/cpufreq_cooling.c
index e2cc7bd30862..e77d3b44903e 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -448,7 +448,6 @@ static int cpufreq_set_cur_state(struct 
thermal_cooling_device *cdev,
 unsigned long state)
 {
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
-   struct cpumask *cpus;
unsigned int frequency;
int ret;
 
@@ -465,8 +464,6 @@ static int cpufreq_set_cur_state(struct 
thermal_cooling_device *cdev,
	ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
if (ret >= 0) {
cpufreq_cdev->cpufreq_state = state;
-   cpus = cpufreq_cdev->policy->related_cpus;
-   arch_update_thermal_pressure(cpus, frequency);
ret = 0;
}
 
-- 
2.34.1




[PATCH v2 1/5] cpufreq: Add a cpufreq pressure feedback for the scheduler

2023-12-21 Thread Vincent Guittot
Provide the scheduler with feedback about the temporary max available
capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
filtered as the pressure will last for dozens of ms or more.
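
A standalone sketch of the computation done by cpufreq_update_pressure()
below, with made-up numbers: capacity 1024, reference max frequency
2000000 kHz, policy->max capped to 1500000 kHz:

	/* sketch only; mirrors the patch with plain integer arithmetic */
	static unsigned long sketch_cpufreq_pressure(unsigned long max_capacity,
						     unsigned long capped_freq,
						     unsigned long max_freq)
	{
		if (max_freq <= capped_freq)	/* boost frequency: no capping */
			return 0;

		/* naive version of mult_frac(max_capacity, capped_freq, max_freq) */
		return max_capacity - max_capacity * capped_freq / max_freq;
	}

	/* sketch_cpufreq_pressure(1024, 1500000, 2000000) == 1024 - 768 == 256 */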

Signed-off-by: Vincent Guittot 
---
 drivers/cpufreq/cpufreq.c | 34 ++
 include/linux/cpufreq.h   | 10 ++
 2 files changed, 44 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44db4f59c4cc..15bd41f9bb5e 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2563,6 +2563,38 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, 
unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
+
+/**
+ * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
+ * @policy: cpufreq policy of the CPUs.
+ *
+ * Update the value of cpufreq pressure for all @cpus in the policy.
+ */
+static void cpufreq_update_pressure(struct cpufreq_policy *policy)
+{
+   unsigned long max_capacity, capped_freq, pressure;
+   u32 max_freq;
+   int cpu;
+
+   cpu = cpumask_first(policy->related_cpus);
+   pressure = max_capacity = arch_scale_cpu_capacity(cpu);
+   capped_freq = policy->max;
+   max_freq = arch_scale_freq_ref(cpu);
+
+   /*
+* Handle properly the boost frequencies, which should simply clean
+* the thermal pressure value.
+*/
+   if (max_freq <= capped_freq)
+   pressure -= max_capacity;
+   else
+   pressure -= mult_frac(max_capacity, capped_freq, max_freq);
+
+   for_each_cpu(cpu, policy->related_cpus)
+   WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
+}
+
 /**
  * cpufreq_set_policy - Modify cpufreq policy parameters.
  * @policy: Policy object to modify.
@@ -2618,6 +2650,8 @@ static int cpufreq_set_policy(struct cpufreq_policy 
*policy,
policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
trace_cpu_frequency_limits(policy);
 
+   cpufreq_update_pressure(policy);
+
policy->cached_target_freq = UINT_MAX;
 
pr_debug("new min and max freqs are %u - %u kHz\n",
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afda5f24d3dd..b1d97edd3253 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct 
cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
 void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
 bool has_target_index(void);
+
+DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+   return per_cpu(cpufreq_pressure, cpu);
+}
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
@@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
return false;
 }
 static inline void disable_cpufreq(void) { }
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_CPU_FREQ_STAT
-- 
2.34.1




[PATCH v2 0/5] Rework system pressure interface to the scheduler

2023-12-21 Thread Vincent Guittot
Following the consolidation and cleanup of CPU capacity in [1], this series
reworks how the scheduler gets the pressures on CPUs. We need to take into
account all pressures applied by cpufreq on the compute capacity of a CPU
for dozens of ms or more, not only the cpufreq cooling device or HW
mitigations. We split the pressure applied on a CPU's capacity in 2 parts:
- one from cpufreq and freq_qos
- one from HW high-frequency mitigation.

The next step will be to add a dedicated interface for long-standing
capping of the CPU capacity (i.e. for seconds or more), like the
scaling_max_freq of cpufreq sysfs. The latter is already taken into
account by this series, but as a temporary pressure, which is not always the
best choice when we know that it will happen for seconds or more.

[1] 
https://lore.kernel.org/lkml/20231211104855.558096-1-vincent.guit...@linaro.org/

Changes since v1:
- Use struct cpufreq_policy as parameter of cpufreq_update_pressure()
- Fix typos and comments
- Mark the sched_thermal_decay_shift boot param as deprecated

Vincent Guittot (5):
  cpufreq: Add a cpufreq pressure feedback for the scheduler
  sched: Take cpufreq feedback into account
  thermal/cpufreq: Remove arch_update_thermal_pressure()
  sched: Rename arch_update_thermal_pressure into
arch_update_hw_pressure
  sched/pelt: Remove shift of thermal clock

 .../admin-guide/kernel-parameters.txt |  1 +
 arch/arm/include/asm/topology.h   |  6 +-
 arch/arm64/include/asm/topology.h |  6 +-
 drivers/base/arch_topology.c  | 26 
 drivers/cpufreq/cpufreq.c | 34 ++
 drivers/cpufreq/qcom-cpufreq-hw.c |  4 +-
 drivers/thermal/cpufreq_cooling.c |  3 -
 include/linux/arch_topology.h |  8 +--
 include/linux/cpufreq.h   | 10 +++
 include/linux/sched/topology.h|  8 +--
 .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
 include/trace/events/sched.h  |  2 +-
 init/Kconfig  | 12 ++--
 kernel/sched/core.c   |  8 +--
 kernel/sched/fair.c   | 63 +--
 kernel/sched/pelt.c   | 18 +++---
 kernel/sched/pelt.h   | 16 ++---
 kernel/sched/sched.h  | 22 +--
 18 files changed, 142 insertions(+), 119 deletions(-)
 rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)

-- 
2.34.1




Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

2023-12-14 Thread Vincent Guittot
On Thu, 14 Dec 2023 at 10:20, Lukasz Luba  wrote:
>
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Provide to the scheduler a feedback about the temporary max available
> > capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> > filtered as the pressure will happen for dozens ms or more.
> >
> > Signed-off-by: Vincent Guittot 
> > ---
> >   drivers/cpufreq/cpufreq.c | 48 +++
> >   include/linux/cpufreq.h   | 10 
> >   2 files changed, 58 insertions(+)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 44db4f59c4cc..7d5f71be8d29 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy 
> > *policy, unsigned int cpu)
> >   }
> >   EXPORT_SYMBOL(cpufreq_get_policy);
> >
> > +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> > +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
>
> Why do we export this variable when we have get/update functions?
> Do we expect modules would manipulate those per-cpu variables
> independently and not like we do per-cpumask in the update func.?

No, I will remove the EXPORT_PER_CPU_SYMBOL_GPL



Re: [PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure

2023-12-14 Thread Vincent Guittot
On Thu, 14 Dec 2023 at 09:53, Lukasz Luba  wrote:
>
>
>
> On 12/14/23 08:36, Vincent Guittot wrote:
> > On Thu, 14 Dec 2023 at 09:30, Lukasz Luba  wrote:
> >>
> >>
> >> On 12/12/23 14:27, Vincent Guittot wrote:
>
> [snip]
>
> >>>update_rq_clock(rq);
> >>> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
> >>> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
> >>> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> >>> + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> >>
> >> We switch to task clock here, could you tell me why?
> >> Don't we have to maintain the boot command parameter for the shift?
> >
> > This should have been part of the patch5 that I finally removed. IMO,
> > the additional time shift with rq_clock_thermal is no more needed now
> > that we have 2 separates signals
> >
>
> I didn't like the left-shift which causes the signal to converge slowly.
> I rather wanted right-shift to converge (react faster), so you have my
> vote for this change. Also, I agree that with the two-signal approach
> this shift trick can go away now. I just worry about the dropped boot
> parameter.
>
> So, are you going to send that patch 5 which removes the
> 'sched_thermal_decay_shift' and documentation bit?

Yes, I will add it back for the next version



Re: [PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure

2023-12-14 Thread Vincent Guittot
On Thu, 14 Dec 2023 at 09:30, Lukasz Luba  wrote:
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Now that cpufreq provides a pressure value to the scheduler, rename
> > arch_update_thermal_pressure into hw pressure to reflect that it returns
> > a pressure applied by HW with a high frequency and which needs filtering.
>
> I would elaborate the meaning of 'filtering' here. Something like:
> '... high frequency and which needs filtering to smooth the singal and
> get an average value. That reflects available capacity of the CPU in
> longer period'

Ok I will update the commit message to provide more details

>
> > This pressure is not always related to thermal mitigation but can also be
> > generated by max current limitation as an example.
> >
> > Signed-off-by: Vincent Guittot 
> > ---
> >   arch/arm/include/asm/topology.h   |  6 ++---
> >   arch/arm64/include/asm/topology.h |  6 ++---
> >   drivers/base/arch_topology.c  | 26 +--
> >   drivers/cpufreq/qcom-cpufreq-hw.c |  4 +--
> >   include/linux/arch_topology.h |  8 +++---
> >   include/linux/sched/topology.h|  8 +++---
> >   .../{thermal_pressure.h => hw_pressure.h} | 14 +-
> >   include/trace/events/sched.h  |  2 +-
> >   init/Kconfig  | 12 -
> >   kernel/sched/core.c   |  8 +++---
> >   kernel/sched/fair.c   | 12 -
> >   kernel/sched/pelt.c   | 18 ++---
> >   kernel/sched/pelt.h   | 16 ++--
> >   kernel/sched/sched.h  |  4 +--
> >   14 files changed, 72 insertions(+), 72 deletions(-)
> >   rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
> >
> > diff --git a/arch/arm/include/asm/topology.h 
> > b/arch/arm/include/asm/topology.h
> > index 853c4f81ba4a..e175e8596b5d 100644
> > --- a/arch/arm/include/asm/topology.h
> > +++ b/arch/arm/include/asm/topology.h
> > @@ -22,9 +22,9 @@
> >   /* Enable topology flag updates */
> >   #define arch_update_cpu_topology topology_update_cpu_topology
> >
> > -/* Replace task scheduler's default thermal pressure API */
> > -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> > -#define arch_update_thermal_pressure topology_update_thermal_pressure
> > +/* Replace task scheduler's default hw pressure API */
> > +#define arch_scale_hw_pressure topology_get_hw_pressure
> > +#define arch_update_hw_pressure  topology_update_hw_pressure
> >
> >   #else
> >
> > diff --git a/arch/arm64/include/asm/topology.h 
> > b/arch/arm64/include/asm/topology.h
> > index a323b109b9c4..a427650bdfba 100644
> > --- a/arch/arm64/include/asm/topology.h
> > +++ b/arch/arm64/include/asm/topology.h
> > @@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
> >   /* Enable topology flag updates */
> >   #define arch_update_cpu_topology topology_update_cpu_topology
> >
> > -/* Replace task scheduler's default thermal pressure API */
> > -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> > -#define arch_update_thermal_pressure topology_update_thermal_pressure
> > +/* Replace task scheduler's default hw pressure API */
>
> s/hw/HW/ ?
>
> > +#define arch_scale_hw_pressure topology_get_hw_pressure
> > +#define arch_update_hw_pressure  topology_update_hw_pressure
> >
> >   #include 
> >
> > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> > index 0906114963ff..3d8dc9d5c3ad 100644
> > --- a/drivers/base/arch_topology.c
> > +++ b/drivers/base/arch_topology.c
> > @@ -22,7 +22,7 @@
> >   #include 
> >
> >   #define CREATE_TRACE_POINTS
> > -#include 
> > +#include 
> >
> >   static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
> >   static struct cpumask scale_freq_counters_mask;
> > @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, 
> > unsigned long capacity)
> >   per_cpu(cpu_scale, cpu) = capacity;
> >   }
> >
> > -DEFINE_PER_CPU(unsigned long, thermal_pressure);
> > +DEFINE_PER_CPU(unsigned long, hw_pressure);
> >
> >   /**
> > - * topology_update_thermal_pressure() - Update thermal pressure for CPUs
> > + * topology_update_hw_pressure() - Update hw pressure for CPUs
>
> same here: HW?
>
> >* @cpus: The related CPUs for which capacity has been reduced

Re: [PATCH 0/5] Rework system pressure interface to the scheduler

2023-12-14 Thread Vincent Guittot
On Thu, 14 Dec 2023 at 09:21, Lukasz Luba  wrote:
>
> Hi Vincent,
>
> I've been waiting for this feature, thanks!
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Following the consolidation and cleanup of CPU capacity in [1], this serie
> > reworks how the scheduler gets the pressures on CPUs. We need to take into
> > account all pressures applied by cpufreq on the compute capacity of a CPU
> > for dozens of ms or more and not only cpufreq cooling device or HW
> > mitigiations. we split the pressure applied on CPU's capacity in 2 parts:
> > - one from cpufreq and freq_qos
> > - one from HW high freq mitigiation.
> >
> > The next step will be to add a dedicated interface for long standing
> > capping of the CPU capacity (i.e. for seconds or more) like the
> > scaling_max_freq of cpufreq sysfs. The latter is already taken into
> > account by this serie but as a temporary pressure which is not always the
> > best choice when we know that it will happen for seconds or more.
> >
> > [1] 
> > https://lore.kernel.org/lkml/20231211104855.558096-1-vincent.guit...@linaro.org/
> >
> > Vincent Guittot (4):
> >cpufreq: Add a cpufreq pressure feedback for the scheduler
> >sched: Take cpufreq feedback into account
> >thermal/cpufreq: Remove arch_update_thermal_pressure()
> >sched: Rename arch_update_thermal_pressure into
> >  arch_update_hw_pressure
> >
> >   arch/arm/include/asm/topology.h   |  6 +--
> >   arch/arm64/include/asm/topology.h |  6 +--
> >   drivers/base/arch_topology.c  | 26 -
> >   drivers/cpufreq/cpufreq.c | 48 +
> >   drivers/cpufreq/qcom-cpufreq-hw.c |  4 +-
> >   drivers/thermal/cpufreq_cooling.c |  3 --
> >   include/linux/arch_topology.h |  8 +--
> >   include/linux/cpufreq.h   | 10 
> >   include/linux/sched/topology.h|  8 +--
> >   .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
> >   include/trace/events/sched.h  |  2 +-
> >   init/Kconfig  | 12 ++---
> >   kernel/sched/core.c   |  8 +--
> >   kernel/sched/fair.c   | 53 ++-
> >   kernel/sched/pelt.c   | 18 +++
> >   kernel/sched/pelt.h   | 16 +++---
> >   kernel/sched/sched.h  |  4 +-
> >   17 files changed, 152 insertions(+), 94 deletions(-)
> >   rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
> >
>
> I would like to test it, but something worries me. Why there is 0/5 in
> this subject and only 4 patches?

I removed a patch from the series but copied/pasted the cover letter
subject without noticing the /5 instead of /4

>
> Could you tell me your base branch that I can apply this, please?

It applies on top of tip/sched/core + [1]
and you can find it here:
https://git.linaro.org/people/vincent.guittot/kernel.git/log/?h=sched/system-pressure

>
> Regards,
> Lukasz



Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

2023-12-13 Thread Vincent Guittot
On Thu, 14 Dec 2023 at 06:43, Viresh Kumar  wrote:
>
> On 12-12-23, 15:27, Vincent Guittot wrote:
> > @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy 
> > *policy,
> >   policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> >   trace_cpu_frequency_limits(policy);
> >
> > + cpus = policy->related_cpus;
> > + cpufreq_update_pressure(cpus, policy->max);
> > +
> >   policy->cached_target_freq = UINT_MAX;
>
> One more question, why are you doing this from cpufreq_set_policy ? If
> due to cpufreq cooling or from userspace, we end up limiting the
> maximum possible frequency, will this routine always get called ?

Yes, any update of a FREQ_QOS_MAX ends up calling cpufreq_set_policy()
to update the policy->max


>
> --
> viresh



Re: [PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

2023-12-13 Thread Vincent Guittot
On Wed, 13 Dec 2023 at 08:17, Viresh Kumar  wrote:
>
> On 12-12-23, 15:27, Vincent Guittot wrote:
> > Provide to the scheduler a feedback about the temporary max available
> > capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> > filtered as the pressure will happen for dozens ms or more.
> >
> > Signed-off-by: Vincent Guittot 
> > ---
> >  drivers/cpufreq/cpufreq.c | 48 +++
> >  include/linux/cpufreq.h   | 10 
> >  2 files changed, 58 insertions(+)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 44db4f59c4cc..7d5f71be8d29 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy 
> > *policy, unsigned int cpu)
> >  }
> >  EXPORT_SYMBOL(cpufreq_get_policy);
> >
> > +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> > +EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
> > +
> > +/**
> > + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> > + * @cpus: The related CPUs for which max capacity has been reduced
> > + * @capped_freq : The maximum allowed frequency that CPUs can run at
> > + *
> > + * Update the value of cpufreq pressure for all @cpus in the mask. The
> > + * cpumask should include all (online+offline) affected CPUs, to avoid
> > + * operating on stale data when hot-plug is used for some CPUs. The
> > + * @capped_freq reflects the currently allowed max CPUs frequency due to
> > + * freq_qos capping. It might be also a boost frequency value, which is 
> > bigger
> > + * than the internal 'capacity_freq_ref' max frequency. In such case the
> > + * pressure value should simply be removed, since this is an indication 
> > that
> > + * there is no capping. The @capped_freq must be provided in kHz.
> > + */
> > +static void cpufreq_update_pressure(const struct cpumask *cpus,
>
> Since this is defined as 'static', why not just pass policy here ?

Mainly because we only need the cpumask, and also because this follows
the same pattern as other places like arch_topology.c.

>
> > +   unsigned long capped_freq)
> > +{
> > + unsigned long max_capacity, capacity, pressure;
> > + u32 max_freq;
> > + int cpu;
> > +
> > + cpu = cpumask_first(cpus);
> > + max_capacity = arch_scale_cpu_capacity(cpu);
>
> This anyway expects all of them to be from the same policy ..
>
> > + max_freq = arch_scale_freq_ref(cpu);
> > +
> > + /*
> > +  * Handle properly the boost frequencies, which should simply clean
> > +  * the thermal pressure value.
> > +  */
> > + if (max_freq <= capped_freq)
> > + capacity = max_capacity;
> > + else
> > + capacity = mult_frac(max_capacity, capped_freq, max_freq);
> > +
> > + pressure = max_capacity - capacity;
> > +
>
> Extra blank line here.
>
> > +
> > + for_each_cpu(cpu, cpus)
> > + WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
> > +}
> > +
> >  /**
> >   * cpufreq_set_policy - Modify cpufreq policy parameters.
> >   * @policy: Policy object to modify.
> > @@ -2584,6 +2628,7 @@ static int cpufreq_set_policy(struct cpufreq_policy 
> > *policy,
> >  {
> >   struct cpufreq_policy_data new_data;
> >   struct cpufreq_governor *old_gov;
> > + struct cpumask *cpus;
> >   int ret;
> >
> >   memcpy(&new_data.cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
> > @@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy 
> > *policy,
> >   policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> >   trace_cpu_frequency_limits(policy);
> >
> > + cpus = policy->related_cpus;
>
> You don't need the extra variable anyway, but lets just pass policy
> instead to the routine.

In fact I have followed what was done in cpufreq_cooling.c with
arch_update_thermal_pressure().

Will remove it

>
> > + cpufreq_update_pressure(cpus, policy->max);
> > +
> >   policy->cached_target_freq = UINT_MAX;
> >
> >   pr_debug("new min and max freqs are %u - %u kHz\n",
> > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> > index afda5f24d3dd..b1d97edd3253 100644
> > --- a/include/linux/cpufreq.h
> > +++ b/include/linux/cpufreq.h
> > @@ -241,6 +241,12 @@ struct kobject *get_gov

[PATCH 4/4] sched: Rename arch_update_thermal_pressure into arch_update_hw_pressure

2023-12-12 Thread Vincent Guittot
Now that cpufreq provides a pressure value to the scheduler, rename
arch_update_thermal_pressure into arch_update_hw_pressure to reflect that it
returns a pressure applied by HW at a high frequency and which needs
filtering. This pressure is not always related to thermal mitigation but can
also be generated by a max current limitation, for example.

Signed-off-by: Vincent Guittot 
---
 arch/arm/include/asm/topology.h   |  6 ++---
 arch/arm64/include/asm/topology.h |  6 ++---
 drivers/base/arch_topology.c  | 26 +--
 drivers/cpufreq/qcom-cpufreq-hw.c |  4 +--
 include/linux/arch_topology.h |  8 +++---
 include/linux/sched/topology.h|  8 +++---
 .../{thermal_pressure.h => hw_pressure.h} | 14 +-
 include/trace/events/sched.h  |  2 +-
 init/Kconfig  | 12 -
 kernel/sched/core.c   |  8 +++---
 kernel/sched/fair.c   | 12 -
 kernel/sched/pelt.c   | 18 ++---
 kernel/sched/pelt.h   | 16 ++--
 kernel/sched/sched.h  |  4 +--
 14 files changed, 72 insertions(+), 72 deletions(-)
 rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)

diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 853c4f81ba4a..e175e8596b5d 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -22,9 +22,9 @@
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure   topology_update_thermal_pressure
+/* Replace task scheduler's default hw pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressuretopology_update_hw_pressure
 
 #else
 
diff --git a/arch/arm64/include/asm/topology.h 
b/arch/arm64/include/asm/topology.h
index a323b109b9c4..a427650bdfba 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
 /* Enable topology flag updates */
 #define arch_update_cpu_topology topology_update_cpu_topology
 
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure   topology_update_thermal_pressure
+/* Replace task scheduler's default hw pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressuretopology_update_hw_pressure
 
 #include 
 
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 0906114963ff..3d8dc9d5c3ad 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,7 +22,7 @@
 #include 
 
 #define CREATE_TRACE_POINTS
-#include 
+#include 
 
 static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
 static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+DEFINE_PER_CPU(unsigned long, hw_pressure);
 
 /**
- * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * topology_update_hw_pressure() - Update hw pressure for CPUs
  * @cpus: The related CPUs for which capacity has been reduced
  * @capped_freq : The maximum allowed frequency that CPUs can run at
  *
- * Update the value of thermal pressure for all @cpus in the mask. The
+ * Update the value of hw pressure for all @cpus in the mask. The
  * cpumask should include all (online+offline) affected CPUs, to avoid
  * operating on stale data when hot-plug is used for some CPUs. The
  * @capped_freq reflects the currently allowed max CPUs frequency due to
- * thermal capping. It might be also a boost frequency value, which is bigger
+ * hw capping. It might be also a boost frequency value, which is bigger
  * than the internal 'capacity_freq_ref' max frequency. In such case the
  * pressure value should simply be removed, since this is an indication that
- * there is no thermal throttling. The @capped_freq must be provided in kHz.
+ * there is no hw throttling. The @capped_freq must be provided in kHz.
  */
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
  unsigned long capped_freq)
 {
-   unsigned long max_capacity, capacity, th_pressure;
+   unsigned long max_capacity, capacity, hw_pressure;
u32 max_freq;
int cpu;
 
@@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct 
cpumask

[PATCH 1/4] cpufreq: Add a cpufreq pressure feedback for the scheduler

2023-12-12 Thread Vincent Guittot
Provide the scheduler with feedback about the temporary max available
capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
filtered as the pressure will last for dozens of ms or more.

Signed-off-by: Vincent Guittot 
---
 drivers/cpufreq/cpufreq.c | 48 +++
 include/linux/cpufreq.h   | 10 
 2 files changed, 58 insertions(+)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44db4f59c4cc..7d5f71be8d29 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2563,6 +2563,50 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, 
unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_get_policy);
 
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
+EXPORT_PER_CPU_SYMBOL_GPL(cpufreq_pressure);
+
+/**
+ * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
+ * @cpus: The related CPUs for which max capacity has been reduced
+ * @capped_freq : The maximum allowed frequency that CPUs can run at
+ *
+ * Update the value of cpufreq pressure for all @cpus in the mask. The
+ * cpumask should include all (online+offline) affected CPUs, to avoid
+ * operating on stale data when hot-plug is used for some CPUs. The
+ * @capped_freq reflects the currently allowed max CPUs frequency due to
+ * freq_qos capping. It might be also a boost frequency value, which is bigger
+ * than the internal 'capacity_freq_ref' max frequency. In such case the
+ * pressure value should simply be removed, since this is an indication that
+ * there is no capping. The @capped_freq must be provided in kHz.
+ */
+static void cpufreq_update_pressure(const struct cpumask *cpus,
+ unsigned long capped_freq)
+{
+   unsigned long max_capacity, capacity, pressure;
+   u32 max_freq;
+   int cpu;
+
+   cpu = cpumask_first(cpus);
+   max_capacity = arch_scale_cpu_capacity(cpu);
+   max_freq = arch_scale_freq_ref(cpu);
+
+   /*
+* Handle properly the boost frequencies, which should simply clean
+* the thermal pressure value.
+*/
+   if (max_freq <= capped_freq)
+   capacity = max_capacity;
+   else
+   capacity = mult_frac(max_capacity, capped_freq, max_freq);
+
+   pressure = max_capacity - capacity;
+
+
+   for_each_cpu(cpu, cpus)
+   WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
+}
+
 /**
  * cpufreq_set_policy - Modify cpufreq policy parameters.
  * @policy: Policy object to modify.
@@ -2584,6 +2628,7 @@ static int cpufreq_set_policy(struct cpufreq_policy 
*policy,
 {
struct cpufreq_policy_data new_data;
struct cpufreq_governor *old_gov;
+   struct cpumask *cpus;
int ret;
 
	memcpy(&new_data.cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
@@ -2618,6 +2663,9 @@ static int cpufreq_set_policy(struct cpufreq_policy 
*policy,
policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
trace_cpu_frequency_limits(policy);
 
+   cpus = policy->related_cpus;
+   cpufreq_update_pressure(cpus, policy->max);
+
policy->cached_target_freq = UINT_MAX;
 
pr_debug("new min and max freqs are %u - %u kHz\n",
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afda5f24d3dd..b1d97edd3253 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct 
cpufreq_policy *policy);
 void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
 void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
 bool has_target_index(void);
+
+DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+   return per_cpu(cpufreq_pressure, cpu);
+}
 #else
 static inline unsigned int cpufreq_get(unsigned int cpu)
 {
@@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
return false;
 }
 static inline void disable_cpufreq(void) { }
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+   return 0;
+}
 #endif
 
 #ifdef CONFIG_CPU_FREQ_STAT
-- 
2.34.1




[PATCH 3/4] thermal/cpufreq: Remove arch_update_thermal_pressure()

2023-12-12 Thread Vincent Guittot
arch_update_thermal_pressure() aims to update a fast-changing signal which
must be averaged using PELT filtering before being provided to the
scheduler, which can't make smart use of a fast-changing signal.
cpufreq now provides the maximum freq_qos pressure on the capacity to the
scheduler, which includes the cpufreq cooling device. Remove the call to
arch_update_thermal_pressure() in the cpufreq cooling device as this is
now handled by cpufreq_get_pressure().

Signed-off-by: Vincent Guittot 
---
 drivers/thermal/cpufreq_cooling.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/thermal/cpufreq_cooling.c 
b/drivers/thermal/cpufreq_cooling.c
index e2cc7bd30862..e77d3b44903e 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -448,7 +448,6 @@ static int cpufreq_set_cur_state(struct 
thermal_cooling_device *cdev,
 unsigned long state)
 {
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
-   struct cpumask *cpus;
unsigned int frequency;
int ret;
 
@@ -465,8 +464,6 @@ static int cpufreq_set_cur_state(struct 
thermal_cooling_device *cdev,
	ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
if (ret >= 0) {
cpufreq_cdev->cpufreq_state = state;
-   cpus = cpufreq_cdev->policy->related_cpus;
-   arch_update_thermal_pressure(cpus, frequency);
ret = 0;
}
 
-- 
2.34.1




[PATCH 2/4] sched: Take cpufreq feedback into account

2023-12-12 Thread Vincent Guittot
Aggregate the different pressures applied to the capacity of CPUs and
create a new function that returns the actual capacity of the CPU:
  get_actual_cpu_capacity()

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 43 +++
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..11d3be829302 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4932,12 +4932,20 @@ static inline void util_est_update(struct cfs_rq 
*cfs_rq,
	trace_sched_util_est_se_tp(&p->se);
 }
 
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+   unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+   capacity -= max(thermal_load_avg(cpu_rq(cpu)), 
cpufreq_get_pressure(cpu));
+
+   return capacity;
+}
 static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
 {
-   unsigned long capacity_orig, capacity_orig_thermal;
+   unsigned long capacity_orig;
unsigned long capacity = capacity_of(cpu);
bool fits, uclamp_max_fits;
 
@@ -4970,7 +4978,6 @@ static inline int util_fits_cpu(unsigned long util,
 * goal is to cap the task. So it's okay if it's getting less.
 */
capacity_orig = arch_scale_cpu_capacity(cpu);
-   capacity_orig_thermal = capacity_orig - 
arch_scale_thermal_pressure(cpu);
 
/*
 * We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5045,7 +5052,7 @@ static inline int util_fits_cpu(unsigned long util,
 * handle the case uclamp_min > uclamp_max.
 */
uclamp_min = min(uclamp_min, uclamp_max);
-   if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+   if (fits && (util < uclamp_min) && (uclamp_min > 
get_actual_cpu_capacity(cpu)))
return -1;
 
return fits;
@@ -7426,7 +7433,7 @@ select_idle_capacity(struct task_struct *p, struct 
sched_domain *sd, int target)
 * Look for the CPU with best capacity.
 */
else if (fits < 0)
-   cpu_cap = arch_scale_cpu_capacity(cpu) - 
thermal_load_avg(cpu_rq(cpu));
+   cpu_cap = get_actual_cpu_capacity(cpu);
 
/*
 * First, select CPU which fits better (-1 being better than 0).
@@ -7919,8 +7926,8 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
-   unsigned long best_thermal_cap = 0;
-   unsigned long prev_thermal_cap = 0;
+   unsigned long best_actual_cap = 0;
+   unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7950,7 +7957,7 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
 
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
-   unsigned long cpu_cap, cpu_thermal_cap, util;
+   unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -7962,18 +7969,17 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
 
-   /* Account thermal pressure for the energy estimation */
+   /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
-   cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
-   cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+   cpu_actual_cap = get_actual_cpu_capacity(cpu);
 
-   eenv.cpu_cap = cpu_thermal_cap;
+   eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
 
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
 
-   eenv.pd_cap += cpu_thermal_cap;
+   eenv.pd_cap += cpu_actual_cap;
 
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8044,7 +8050,7 @@ static int find_energy_efficient_cpu(struct task_struct 
*p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
-   prev_thermal_cap = cpu_thermal_cap;
+   prev_actual_cap = cpu_actual_cap;
 

[PATCH 0/5] Rework system pressure interface to the scheduler

2023-12-12 Thread Vincent Guittot
Following the consolidation and cleanup of CPU capacity in [1], this series
reworks how the scheduler gets the pressures on CPUs. We need to take into
account all pressures applied by cpufreq on the compute capacity of a CPU
for dozens of ms or more, not only the cpufreq cooling device or HW
mitigations. We split the pressure applied on a CPU's capacity in 2 parts:
- one from cpufreq and freq_qos
- one from HW high-frequency mitigation.

The next step will be to add a dedicated interface for long-standing
capping of the CPU capacity (i.e. for seconds or more), like the
scaling_max_freq of cpufreq sysfs. The latter is already taken into
account by this series, but as a temporary pressure, which is not always the
best choice when we know that it will happen for seconds or more.

[1] 
https://lore.kernel.org/lkml/20231211104855.558096-1-vincent.guit...@linaro.org/

Vincent Guittot (4):
  cpufreq: Add a cpufreq pressure feedback for the scheduler
  sched: Take cpufreq feedback into account
  thermal/cpufreq: Remove arch_update_thermal_pressure()
  sched: Rename arch_update_thermal_pressure into
arch_update_hw_pressure

 arch/arm/include/asm/topology.h   |  6 +--
 arch/arm64/include/asm/topology.h |  6 +--
 drivers/base/arch_topology.c  | 26 -
 drivers/cpufreq/cpufreq.c | 48 +
 drivers/cpufreq/qcom-cpufreq-hw.c |  4 +-
 drivers/thermal/cpufreq_cooling.c |  3 --
 include/linux/arch_topology.h |  8 +--
 include/linux/cpufreq.h   | 10 
 include/linux/sched/topology.h|  8 +--
 .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
 include/trace/events/sched.h  |  2 +-
 init/Kconfig  | 12 ++---
 kernel/sched/core.c   |  8 +--
 kernel/sched/fair.c   | 53 ++-
 kernel/sched/pelt.c   | 18 +++
 kernel/sched/pelt.h   | 16 +++---
 kernel/sched/sched.h  |  4 +-
 17 files changed, 152 insertions(+), 94 deletions(-)
 rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)

-- 
2.34.1
 



Re: [PATCH v2] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-20 Thread Vincent Guittot
On Tue, 20 Apr 2021 at 17:20, Rik van Riel  wrote:
>
> On Tue, 2021-04-20 at 11:04 +0200, Vincent Guittot wrote:
> > On Mon, 19 Apr 2021 at 18:51, Rik van Riel  wrote:
> > >
> > > @@ -10688,7 +10697,7 @@ static int newidle_balance(struct rq
> > > *this_rq, struct rq_flags *rf)
> > > if (this_rq->nr_running != this_rq->cfs.h_nr_running)
> > > pulled_task = -1;
> > >
> > > -   if (pulled_task)
> > > +   if (pulled_task || this_rq->ttwu_pending)
> >
> > This needs at least a comment to explain why we must clear
> > this_rq->idle_stamp when this_rq->ttwu_pending is set whereas it is
> > also done during sched_ttwu_pending()
> >
> > > this_rq->idle_stamp = 0;
>
> I spent some time staring at sched_ttwu_pending and
> the functions it calls, but I can't seem to spot
> where it clears rq->idle_stamp, except inside
> ttwu_do_wakeup where it will end up adding a
> non-idle period into the rq->avg_idle, which seems
> wrong.

Not sure that this is really wrong, because it ends up scheduling the
idle task, which is immediately preempted. But the preemption happened
in the idle task, didn't it?

>
> If we are actually idle, and get woken up with a
> ttwu_queue task, we do not come through newidle_balance,
> and we end up counting the idle time into the avg_idle
> number.
>
> However, if a task is woken up while the CPU is
> in newidle_balance, because prev != idle, we should
> not count that period towards rq->avg_idle, for
> the same reason we do so when we pulled a task.

As mentioned above, we have effectively scheduled the idle task in your
case, whereas we don't in the other cases.

IIUC, your problem comes from rq->avg_idle decreasing a lot in such
cases. And because rq->avg_idle is used to decide if you have time to
run newidle_balance, you skip it more often.
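
For reference, a rough sketch of the mechanism discussed here
(simplified from kernel/sched/core.c, details may differ): when an idle
CPU is woken up, the time elapsed since rq->idle_stamp is folded into
rq->avg_idle with a 1/8 weight, and newidle_balance() is skipped when
avg_idle drops below the estimated cost of balancing:

	/* sketch of update_avg(); sample is the measured idle period */
	static void sketch_update_avg(u64 *avg, u64 sample)
	{
		s64 diff = sample - *avg;

		*avg += diff / 8;
	}

Folding the very short periods seen in this case into avg_idle pulls it
down, which is why newidle_balance() then gets skipped more often.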

>
> I'll add a comment in v3 explaining why idle_stamp
> needs to be 0.

Yes please.

>
> --
> All Rights Reversed.


Re: [PATCH 2/2] sched/fair: Relax task_hot() for misfit tasks

2021-04-20 Thread Vincent Guittot
On Mon, 19 Apr 2021 at 19:13, Valentin Schneider
 wrote:
>
> On 16/04/21 15:51, Vincent Guittot wrote:
> > Le jeudi 15 avril 2021 � 18:58:46 (+0100), Valentin Schneider a �crit :
> >> +
> >> +/*
> >> + * What does migrating this task do to our capacity-aware scheduling 
> >> criterion?
> >> + *
> >> + * Returns 1, if the task needs more capacity than the dst CPU can 
> >> provide.
> >> + * Returns 0, if the task needs the extra capacity provided by the dst CPU
> >> + * Returns -1, if the task isn't impacted by the migration wrt capacity.
> >> + */
> >> +static int migrate_degrades_capacity(struct task_struct *p, struct lb_env 
> >> *env)
> >> +{
> >> +if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
> >> +return -1;
> >> +
> >> +if (!task_fits_capacity(p, capacity_of(env->src_cpu))) {
> >> +if (cpu_capacity_greater(env->dst_cpu, env->src_cpu))
> >> +return 0;
> >> +else if (cpu_capacity_greater(env->src_cpu, env->dst_cpu))
> >> +return 1;
> >> +else
> >> +return -1;
> >> +}
> >
> > Being there means that task fits src_cpu capacity so why testing p against 
> > dst_cpu ?
> >
>
> Because if p fits on src_cpu, we don't want to move it to a dst_cpu on
> which it *doesn't* fit.

OK. I was confused because I thought that this was only to force
migration in the group_misfit_task case, but you tried to extend it to
other cases... I'm not convinced that you succeeded in covering all cases.

Also, I find this function returning 3 values a bit disturbing.
IIUC you tried to align with migrate_degrades_locality, but you would
have been better off aligning with task_hot and returning only 0 or 1;
-1 is not used.

>
> >> +
> >> +return task_fits_capacity(p, capacity_of(env->dst_cpu)) ? -1 : 1;
> >> +}
> >
> > I prefer the below which easier to read because the same var is use 
> > everywhere and you can remove cpu_capacity_greater.
> >
> > static int migrate_degrades_capacity(struct task_struct *p, struct lb_env 
> > *env)
> > {
> > unsigned long src_capacity, dst_capacity;
> >
> > if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
> > return -1;
> >
> > src_capacity = capacity_of(env->src_cpu);
> > dst_capacity = capacity_of(env->dst_cpu);
> >
> > if (!task_fits_capacity(p, src_capacity)) {
> > if (capacity_greater(dst_capacity, src_capacity))
> > return 0;
> > else if (capacity_greater(src_capacity, dst_capacity))
> > return 1;
> > else
> > return -1;
> > }
> >
> > return task_fits_capacity(p, dst_capacity) ? -1 : 1;
> > }
> >
>
> I'll take it, thanks!
>
> >
> >> +
> >>  #ifdef CONFIG_NUMA_BALANCING
> >>  /*
> >>   * Returns 1, if task migration degrades locality
> >> @@ -7672,6 +7698,15 @@ int can_migrate_task(struct task_struct *p, struct 
> >> lb_env *env)
> >>  if (tsk_cache_hot == -1)
> >>  tsk_cache_hot = task_hot(p, env);
> >>
> >> +/*
> >> + * On a (sane) asymmetric CPU capacity system, the increase in compute
> >> + * capacity should offset any potential performance hit caused by a
> >> + * migration.
> >> + */
> >> +if ((env->dst_grp_type == group_has_spare) &&
> >
> > Shouldn't it be env->src_grp_type == group_misfit_task to only care of 
> > misfit task case as
> > stated in $subject
> >
>
> Previously this was env->idle != CPU_NOT_IDLE, but I figured dst_grp_type
> could give us a better picture. Staring at this some more, this isn't so
> true when the group size goes up - there's no guarantees the dst_cpu is the
> one that has spare cycles, and the other CPUs might not be able to grant
> the capacity uplift dst_cpu can.

yeah you have to keep checking for env->idle != CPU_NOT_IDLE

>
> As for not using src_grp_type == group_misfit_task, this is pretty much the
> same as [1]. CPU-bound (misfit) task + some other task on the same rq
> implies group_overloaded classification when balancing at MC level (no SMT,
> so one group per CPU).

Is it something that happens often or just a sporadic/transient state?
I mean, is it really worth the extra complexity, and do you see a
performance improvement?

You would be better off focusing on fixing the simple group_misfit_task
case. These other cases look far more complex, with lots of corner cases.

>
> [1]: http://lore.kernel.org/r/jhjblcuv2mo.mog...@arm.com


Re: [PATCH v2] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-20 Thread Vincent Guittot
On Mon, 19 Apr 2021 at 18:51, Rik van Riel  wrote:
>
> The try_to_wake_up function has an optimization where it can queue
> a task for wakeup on its previous CPU, if the task is still in the
> middle of going to sleep inside schedule().
>
> Once schedule() re-enables IRQs, the task will be woken up with an
> IPI, and placed back on the runqueue.
>
> If we have such a wakeup pending, there is no need to search other
> CPUs for runnable tasks. Just skip (or bail out early from) newidle
> balancing, and run the just woken up task.
>
> For a memcache like workload test, this reduces total CPU use by
> about 2%, proportionally split between user and system time,
> and p99 and p95 application response time by 2-3% on average.
> The schedstats run_delay number shows a similar improvement.
>
> Signed-off-by: Rik van Riel 
> ---
> v2:
>  - fix !SMP build error and prev-not-CFS case by moving check into 
> newidle_balance
>  - fix formatting of if condition
>  - audit newidle_balance return value use to make sure we get that right
>  - reset idle_stamp when breaking out of the loop due to ->ttwu_pending
>
>  kernel/sched/fair.c | 13 +++--
>  1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 69680158963f..5e26f013e182 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10594,6 +10594,14 @@ static int newidle_balance(struct rq *this_rq, 
> struct rq_flags *rf)
> u64 curr_cost = 0;
>
> update_misfit_status(NULL, this_rq);
> +
> +   /*
> +* There is a task waiting to run. No need to search for one.
> +* Return 0; the task will be enqueued when switching to idle.
> +*/
> +   if (this_rq->ttwu_pending)
> +   return 0;
> +
> /*
>  * We must set idle_stamp _before_ calling idle_balance(), such that 
> we
>  * measure the duration of idle_balance() as idle time.
> @@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct 
> rq_flags *rf)
>  * Stop searching for tasks to pull if there are
>  * now runnable tasks on this rq.
>  */
> -   if (pulled_task || this_rq->nr_running > 0)
> +   if (pulled_task || this_rq->nr_running > 0 ||
> +   this_rq->ttwu_pending)
> break;
> }
> rcu_read_unlock();
> @@ -10688,7 +10697,7 @@ static int newidle_balance(struct rq *this_rq, struct 
> rq_flags *rf)
> if (this_rq->nr_running != this_rq->cfs.h_nr_running)
> pulled_task = -1;
>
> -   if (pulled_task)
> +   if (pulled_task || this_rq->ttwu_pending)

This needs at least a comment to explain why we must clear
this_rq->idle_stamp when this_rq->ttwu_pending is set whereas it is
also done during sched_ttwu_pending()

> this_rq->idle_stamp = 0;
>
> rq_repin_lock(this_rq, rf);
> --
> 2.25.4
>
>
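
For the comment requested above on why idle_stamp is cleared when a wakeup is
pending, something along these lines could work; this is only a sketch, not
the wording that was merged:

	/*
	 * A pending wakeup means this rq will get a runnable task as soon as
	 * IRQs are re-enabled, so treat it like a successful pull and do not
	 * account the time spent here as idle time, instead of relying on
	 * sched_ttwu_pending() to clear idle_stamp later.
	 */
	if (pulled_task || this_rq->ttwu_pending)
		this_rq->idle_stamp = 0;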


Re: [PATCH] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-19 Thread Vincent Guittot
On Mon, 19 Apr 2021 at 04:18, Rik van Riel  wrote:
>
> The try_to_wake_up function has an optimization where it can queue
> a task for wakeup on its previous CPU, if the task is still in the
> middle of going to sleep inside schedule().
>
> Once schedule() re-enables IRQs, the task will be woken up with an
> IPI, and placed back on the runqueue.
>
> If we have such a wakeup pending, there is no need to search other
> CPUs for runnable tasks. Just skip (or bail out early from) newidle
> balancing, and run the just woken up task.
>
> For a memcache like workload test, this reduces total CPU use by
> about 2%, proportionally split between user and system time,
> and p99 and p95 application response time by 2-3% on average.
> The schedstats run_delay number shows a similar improvement.
>
> Signed-off-by: Rik van Riel 
> ---
>  kernel/sched/fair.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 69680158963f..19a92c48939f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7163,6 +7163,14 @@ done: __maybe_unused;
> if (!rf)
> return NULL;
>
> +   /*
> +* We have a woken up task pending here. No need to search for ones
> +* elsewhere. This task will be enqueued the moment we unblock irqs
> +* upon exiting the scheduler.
> +*/
> +   if (rq->ttwu_pending)
> +   return NULL;

Would it be better to put this check at the beginning of newidle_balance()?
If prev is not a CFS task, we never reach this point but instead use the path:
class->balance => balance_fair => newidle_balance

and we will never check rq->ttwu_pending

> +
> new_tasks = newidle_balance(rq, rf);
>
> /*
> @@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct 
> rq_flags *rf)
>  * Stop searching for tasks to pull if there are
>  * now runnable tasks on this rq.
>  */
> -   if (pulled_task || this_rq->nr_running > 0)
> +   if (pulled_task || this_rq->nr_running > 0 ||
> +   this_rq->ttwu_pending)
> break;
> }
> rcu_read_unlock();
> --
> 2.25.4
>
>


Re: [PATCH 2/2] sched/fair: Relax task_hot() for misfit tasks

2021-04-16 Thread Vincent Guittot
On Thursday 15 April 2021 at 18:58:46 (+0100), Valentin Schneider wrote:
> Consider the following topology:
> 
>   DIE [  ]
>   MC  [][]
>0  1  2  3
> 
>   capacity_orig_of(x \in {0-1}) < capacity_orig_of(x \in {2-3})
> 
> w/ CPUs 2-3 idle and CPUs 0-1 running CPU hogs (util_avg=1024).
> 
> When CPU2 goes through load_balance() (via periodic / NOHZ balance), it
> should pull one CPU hog from either CPU0 or CPU1 (this is misfit task
> upmigration). However, should a e.g. pcpu kworker awake on CPU0 just before
> this load_balance() happens and preempt the CPU hog running there, we would
> have, for the [0-1] group at CPU2's DIE level:
> 
> o sgs->sum_nr_running > sgs->group_weight
> o sgs->group_capacity * 100 < sgs->group_util * imbalance_pct
> 
> IOW, this group is group_overloaded.
> 
> Considering CPU0 is picked by find_busiest_queue(), we would then visit the
> preempted CPU hog in detach_tasks(). However, given it has just been
> preempted by this pcpu kworker, task_hot() will prevent it from being
> detached. We then leave load_balance() without having done anything.
> 
> Long story short, preempted misfit tasks are affected by task_hot(), while
> currently running misfit tasks are intentionally preempted by the stopper
> task to migrate them over to a higher-capacity CPU.
> 
> Align detach_tasks() with the active-balance logic and let it pick a
> cache-hot misfit task when the destination CPU can provide a capacity
> uplift.
> 
> Signed-off-by: Valentin Schneider 
> ---
>  kernel/sched/fair.c | 36 
>  1 file changed, 36 insertions(+)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d2d1a69d7aa7..43fc98d34276 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7493,6 +7493,7 @@ struct lb_env {
>   enum fbq_type   fbq_type;
>   enum migration_type migration_type;
>   enum group_type src_grp_type;
> + enum group_type dst_grp_type;
>   struct list_headtasks;
>  };
>  
> @@ -7533,6 +7534,31 @@ static int task_hot(struct task_struct *p, struct 
> lb_env *env)
>   return delta < (s64)sysctl_sched_migration_cost;
>  }
>  
> +
> +/*
> + * What does migrating this task do to our capacity-aware scheduling 
> criterion?
> + *
> + * Returns 1, if the task needs more capacity than the dst CPU can provide.
> + * Returns 0, if the task needs the extra capacity provided by the dst CPU
> + * Returns -1, if the task isn't impacted by the migration wrt capacity.
> + */
> +static int migrate_degrades_capacity(struct task_struct *p, struct lb_env 
> *env)
> +{
> + if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
> + return -1;
> +
> + if (!task_fits_capacity(p, capacity_of(env->src_cpu))) {
> + if (cpu_capacity_greater(env->dst_cpu, env->src_cpu))
> + return 0;
> + else if (cpu_capacity_greater(env->src_cpu, env->dst_cpu))
> + return 1;
> + else
> + return -1;
> + }

Being here means that the task fits src_cpu's capacity, so why test p against
dst_cpu?

> +
> + return task_fits_capacity(p, capacity_of(env->dst_cpu)) ? -1 : 1;
> +}

I prefer the below, which is easier to read because the same variable is used
everywhere, and you can remove cpu_capacity_greater().

static int migrate_degrades_capacity(struct task_struct *p, struct lb_env *env)
{
unsigned long src_capacity, dst_capacity;

if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
return -1;

src_capacity = capacity_of(env->src_cpu);
dst_capacity = capacity_of(env->dst_cpu);

if (!task_fits_capacity(p, src_capacity)) {
if (capacity_greater(dst_capacity, src_capacity))
return 0;
else if (capacity_greater(src_capacity, dst_capacity))
return 1;
else
return -1;
}

return task_fits_capacity(p, dst_capacity) ? -1 : 1;
}


> +
>  #ifdef CONFIG_NUMA_BALANCING
>  /*
>   * Returns 1, if task migration degrades locality
> @@ -7672,6 +7698,15 @@ int can_migrate_task(struct task_struct *p, struct 
> lb_env *env)
>   if (tsk_cache_hot == -1)
>   tsk_cache_hot = task_hot(p, env);
>  
> + /*
> +  * On a (sane) asymmetric CPU capacity system, the increase in compute
> +  * capacity should offset any potential performance hit caused by a
> +  * migration.
> +  */
> + if ((env->dst_grp_type == group_has_spare) &&

Shouldn't it be env->src_grp_type == group_misfit_task, so that we only care
about the misfit task case, as stated in $subject?


> + !migrate_degrades_capacity(p, env))
> + tsk_cache_hot = 0;
> +
>   if (tsk_cache_hot <= 0 ||
>   env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
>   if (tsk_cache_hot == 1) {
> @@ -9310,6 +9345,7 @@ static struct sched_group *find_busiest_group(struct 
> lb_env *env)
>   if (!sds.busiest)
>   goto 

Re: [PATCH 1/2] sched/fair: Filter out locally-unsolvable misfit imbalances

2021-04-16 Thread Vincent Guittot
On Thu, 15 Apr 2021 at 19:58, Valentin Schneider
 wrote:
>
> Consider the following (hypothetical) asymmetric CPU capacity topology,
> with some amount of capacity pressure (RT | DL | IRQ | thermal):
>
>   DIE [  ]
>   MC  [][]
>0  1  2  3
>
>   | CPU | capacity_orig | capacity |
>   |-+---+--|
>   |   0 |   870 |  860 |
>   |   1 |   870 |  600 |
>   |   2 |  1024 |  850 |
>   |   3 |  1024 |  860 |
>
> If CPU1 has a misfit task, then CPU0, CPU2 and CPU3 are valid candidates to
> grant the task an uplift in CPU capacity. Consider CPU0 and CPU3 as
> sufficiently busy, i.e. don't have enough spare capacity to accommodate
> CPU1's misfit task. This would then fall on CPU2 to pull the task.
>
> This currently won't happen, because CPU2 will fail
>
>   capacity_greater(capacity_of(CPU2), sg->sgc->max_capacity)
>
> in update_sd_pick_busiest(), where 'sg' is the [0, 1] group at DIE
> level. In this case, the max_capacity is that of CPU0's, which is at this
> point in time greater than that of CPU2's. This comparison doesn't make
> much sense, given that the only CPUs we should care about in this scenario
> are CPU1 (the CPU with the misfit task) and CPU2 (the load-balance
> destination CPU).
>
> Aggregate a misfit task's load into sgs->group_misfit_task_load only if
> env->dst_cpu would grant it a capacity uplift.
>
> Note that the aforementioned capacity vs sgc->max_capacity comparison was
> meant to prevent misfit task downmigration: candidate groups classified as
> group_misfit_task but with a higher (max) CPU capacity than the destination 
> CPU
> would be discarded. This change makes it so said group_misfit_task
> classification can't happen anymore, which may cause some undesired
> downmigrations.
>
> Further tweak find_busiest_queue() to ensure this doesn't happen. Also note
> find_busiest_queue() can now iterate over CPUs with a higher capacity than
> the local CPU's, so add a capacity check there.
>
> Signed-off-by: Valentin Schneider 
> ---
>  kernel/sched/fair.c | 63 -
>  1 file changed, 45 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9b8ae02f1994..d2d1a69d7aa7 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5759,6 +5759,12 @@ static unsigned long capacity_of(int cpu)
> return cpu_rq(cpu)->cpu_capacity;
>  }
>
> +/* Is CPU a's capacity noticeably greater than CPU b's? */
> +static inline bool cpu_capacity_greater(int a, int b)
> +{
> +   return capacity_greater(capacity_of(a), capacity_of(b));
> +}
> +
>  static void record_wakee(struct task_struct *p)
>  {
> /*
> @@ -7486,6 +7492,7 @@ struct lb_env {
>
> enum fbq_type   fbq_type;
> enum migration_type migration_type;
> +   enum group_type src_grp_type;
> struct list_headtasks;
>  };
>
> @@ -8447,6 +8454,32 @@ static bool update_nohz_stats(struct rq *rq)
>  #endif
>  }
>
> +static inline void update_sg_lb_misfit_stats(struct lb_env *env,
> +struct sched_group *group,
> +struct sg_lb_stats *sgs,
> +int *sg_status,
> +int cpu)
> +{
> +   struct rq *rq = cpu_rq(cpu);
> +
> +   if (!(env->sd->flags & SD_ASYM_CPUCAPACITY) ||
> +   !rq->misfit_task_load)
> +   return;
> +
> +   *sg_status |= SG_OVERLOAD;
> +
> +   /*
> +* Don't attempt to maximize load for misfit tasks that can't be
> +* granted a CPU capacity uplift.
> +*/
> +   if (cpu_capacity_greater(env->dst_cpu, cpu)) {
> +   sgs->group_misfit_task_load = max(
> +   sgs->group_misfit_task_load,
> +   rq->misfit_task_load);
> +   }
> +
> +}
> +
>  /**
>   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
>   * @env: The load balancing environment.
> @@ -8498,12 +8531,7 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
> if (local_group)
> continue;
>
> -   /* Check for a misfit task on the cpu */
> -   if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> -   sgs->group_misfit_task_load < rq->misfit_task_load) {
> -   sgs->group_misfit_task_load = rq->misfit_task_load;
> -   *sg_status |= SG_OVERLOAD;
> -   }
> +   update_sg_lb_misfit_stats(env, group, sgs, sg_status, i);
> }
>
> /* Check if dst CPU is idle and preferred to this group */
> @@ -8550,15 +8578,9 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> if (!sgs->sum_h_nr_running)
> return false;
>
> -   /*
> -* Don't try to pull misfit 

Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-13 Thread Vincent Guittot
On Mon, 12 Apr 2021 at 17:24, Mel Gorman  wrote:
>
> On Mon, Apr 12, 2021 at 02:21:47PM +0200, Vincent Guittot wrote:
> > > > Peter, Valentin, Vincent, Mel, etal
> > > >
> > > > On architectures where we have multiple levels of cache access latencies
> > > > within a DIE, (For example: one within the current LLC or SMT core and 
> > > > the
> > > > other at MC or Hemisphere, and finally across hemispheres), do you have 
> > > > any
> > > > suggestions on how we could handle the same in the core scheduler?
> >
> > I would say that SD_SHARE_PKG_RESOURCES is there for that and doesn't
> > only rely on cache
> >
>
> From topology.c
>
> SD_SHARE_PKG_RESOURCES - describes shared caches
>
> I'm guessing here because I am not familiar with power10 but the central
> problem appears to be when to prefer selecting a CPU sharing L2 or L3
> cache and the core assumes the last-level-cache is the only relevant one.
>
> For this patch, I wondered if setting SD_SHARE_PKG_RESOURCES would have
> unintended consequences for load balancing because load within a die may
> not be spread between SMT4 domains if SD_SHARE_PKG_RESOURCES was set at
> the MC level.

But the SMT4 level is still present here, with select_idle_core() taking
care of the spreading.

>
> > >
> > > Minimally I think it would be worth detecting when there are multiple
> > > LLCs per node and detecting that in generic code as a static branch. In
> > > select_idle_cpu, consider taking two passes -- first on the LLC domain
> > > and if no idle CPU is found then taking a second pass if the search depth
> >
> > We have made a lot of changes to reduce and optimize the fast path, and
> > I don't think re-adding another layer in the fast path makes sense, as
> > you will end up unrolling the for_each_domain loop behind some
> > static_branches.
> >
>
> Searching the node would only happen if a) there was enough search depth
> left and b) there were no idle CPUs at the LLC level. As no new domain
> is added, it's not clear to me why for_each_domain would change.

What I mean is that you should directly do a for_each_sched_domain in
the fast path, because that is what you are proposing in the end. It no
longer looks like a fast path but like a traditional load balance.

>
> But still, your comment reminded me that different architectures have
> different requirements
>
> Power 10 appears to prefer CPU selection sharing L2 cache but desires
> spillover to L3 when selecting and idle CPU.
>
> X86 varies, it might want the Power10 approach for some families and prefer
> L3 spilling over to a CPU on the same node in others.
>
> S390 cares about something called books and drawers, although I've no
> idea what it means as such or whether it has any preferences on
> search order.
>
> ARM has similar requirements again according to "scheduler: expose the
> topology of clusters and add cluster scheduler" and that one *does*
> add another domain.
>
> I had forgotten about the ARM patches but remembered that they were
> interesting because they potentially help the Zen situation but I didn't
> get the chance to review them before they fell off my radar again. About
> all I recall is that I thought the "cluster" terminology was vague.
>
> The only commonality I thought might exist is that architectures may
> like to define what the first domain to search for an idle CPU and a
> second domain. Alternatively, architectures could specify a domain to
> search primarily but also search the next domain in the hierarchy if
> search depth permits. The default would be the existing behaviour --
> search CPUs sharing a last-level-cache.
>
> > SD_SHARE_PKG_RESOURCES should be set to the last level where we can
> > efficiently move task between CPUs at wakeup
> >
>
> The definition of "efficiently" varies. Moving tasks between CPUs sharing
> a cache is most efficient but moving the task to a CPU that at least has
> local memory channels is a reasonable option if there are no idle CPUs
> sharing cache and preferable to stacking.

That's why setting SD_SHARE_PKG_RESOURCES for P10 looks fine to me.
This last level of SD_SHARE_PKG_RESOURCES should define the cpumask to
be considered in the fast path.
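
For reference, that cpumask comes from the highest domain with
SD_SHARE_PKG_RESOURCES; below is a trimmed sketch of update_top_cache_domain()
from kernel/sched/topology.c (the real function also sets up sd_llc_shared
and the asym pointers):

static void update_top_cache_domain(int cpu)
{
	struct sched_domain *sd;
	int id = cpu, size = 1;

	/* Highest domain with SD_SHARE_PKG_RESOURCES defines the LLC span. */
	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd) {
		id = cpumask_first(sched_domain_span(sd));
		size = cpumask_weight(sched_domain_span(sd));
	}

	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
	per_cpu(sd_llc_size, cpu) = size;
	per_cpu(sd_llc_id, cpu) = id;
}

The wakeup fast path then walks sched_domain_span() of that sd_llc, so the
level that gets the flag directly bounds the CPUs scanned at wakeup.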

>
> > > allows within the node with the LLC CPUs masked out. While there would be
> > > a latency hit because cache is not shared, it would still be a CPU local
> > > to memory that is idle. That would potentially be beneficial on Zen*
> > > as well without having to introduce new domains in the topology hierarchy.
> >
> >

Re: [RFC/PATCH] powerpc/smp: Add SD_SHARE_PKG_RESOURCES flag to MC sched-domain

2021-04-12 Thread Vincent Guittot
On Mon, 12 Apr 2021 at 11:37, Mel Gorman  wrote:
>
> On Mon, Apr 12, 2021 at 11:54:36AM +0530, Srikar Dronamraju wrote:
> > * Gautham R. Shenoy  [2021-04-02 11:07:54]:
> >
> > >
> > > To remedy this, this patch proposes that the LLC be moved to the MC
> > > level which is a group of cores in one half of the chip.
> > >
> > >   SMT (SMT4) --> MC (Hemisphere)[LLC] --> DIE
> > >
> >
> > I think marking Hemisphere as a LLC in a P10 scenario is a good idea.
> >
> > > While there is no cache being shared at this level, this is still the
> > > level where some amount of cache-snooping takes place and it is
> > > relatively faster to access the data from the caches of the cores
> > > within this domain. With this change, we no longer see regressions on
> > > P10 for applications which require single threaded performance.
> >
> > Peter, Valentin, Vincent, Mel, etal
> >
> > On architectures where we have multiple levels of cache access latencies
> > within a DIE, (For example: one within the current LLC or SMT core and the
> > other at MC or Hemisphere, and finally across hemispheres), do you have any
> > suggestions on how we could handle the same in the core scheduler?

I would say that SD_SHARE_PKG_RESOURCES is there for that and doesn't
only rely on cache

> >
>
> Minimally I think it would be worth detecting when there are multiple
> LLCs per node and detecting that in generic code as a static branch. In
> select_idle_cpu, consider taking two passes -- first on the LLC domain
> and if no idle CPU is found then taking a second pass if the search depth

We have made a lot of changes to reduce and optimize the fast path, and
I don't think re-adding another layer in the fast path makes sense, as
you will end up unrolling the for_each_domain loop behind some
static_branches.

SD_SHARE_PKG_RESOURCES should be set to the last level where we can
efficiently move task between CPUs at wakeup

> allows within the node with the LLC CPUs masked out. While there would be
> a latency hit because cache is not shared, it would still be a CPU local
> to memory that is idle. That would potentially be beneficial on Zen*
> as well without having to introduce new domains in the topology hierarchy.

What is the current sched_domain topology description for zen ?

>
> --
> Mel Gorman
> SUSE Labs


Re: [PATCH] sched/fair: Rate limit calls to update_blocked_averages() for NOHZ

2021-04-09 Thread Vincent Guittot
On Fri, 9 Apr 2021 at 01:05, Tim Chen  wrote:
>
>
>
>
> On 4/8/21 7:51 AM, Vincent Guittot wrote:
>
> >> I was surprised to find the overall cpu% consumption of 
> >> update_blocked_averages
> >> and throughput of the benchmark still didn't change much.  So I took a
> >> peek into the profile and found the update_blocked_averages calls shifted 
> >> to the idle load balancer.
> >> The call to update_blocked_averages was reduced in newidle_balance so the 
> >> patch did
> >> what we intended.  But the overall rate of calls to
> >
> > At least, we have removed the useless call to update_blocked_averages
> > in newidle_balance when we will not perform any newly idle load
> > balance
> >
> >> update_blocked_averages remain roughly the same, shifting from
> >> newidle_balance to run_rebalance_domains.
> >>
> >>100.00%  (810cf070)
> >> |
> >> ---update_blocked_averages
> >>|
> >>|--95.47%--run_rebalance_domains
> >>|  __do_softirq
> >>|  |
> >>|  |--94.27%--asm_call_irq_on_stack
> >>|  |  do_softirq_own_stack
> >
> > The calls to update_blocked_averages mainly come from SCHED_SOFTIRQ and,
> > as a result, not from the new path
> > do_idle()->nohz_run_idle_balance(), which has been added by this patch
> > to defer the call to update_nohz_stats() until after newidle_balance() and
> > before entering idle.
> >
> >>|  |  |
> >>|  |  |--93.74%--irq_exit_rcu
> >>|  |  |  |
> >>|  |  |  
> >> |--88.20%--sysvec_apic_timer_interrupt
> >>|  |  |  |  
> >> asm_sysvec_apic_timer_interrupt
> >>|  |  |  |  |
> >>...
> >>|
> >>|
> >> --4.53%--newidle_balance
> >>   pick_next_task_fair
> >>
> >> I was expecting idle load balancer to be rate limited to 60 Hz, which
> >
> > Why 60Hz ?
> >
>
> My thinking is we will trigger load balance only after rq->next_balance.
>
> void trigger_load_balance(struct rq *rq)
> {
> /* Don't need to rebalance while attached to NULL domain */
> if (unlikely(on_null_domain(rq)))
> return;
>
> if (time_after_eq(jiffies, rq->next_balance))
> raise_softirq(SCHED_SOFTIRQ);
>
> nohz_balancer_kick(rq);
> }
>
> And it seems like next_balance is set to be 60 Hz
>
> static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
> {
> int continue_balancing = 1;
> int cpu = rq->cpu;
> int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
> unsigned long interval;
> struct sched_domain *sd;
> /* Earliest time when we have to do rebalance again */
> unsigned long next_balance = jiffies + 60*HZ;

This doesn't mean a 60 Hz period but 60*HZ, with HZ being the number of
jiffies per second. We init next_balance to now + 60 seconds to make
sure it is far later than the next balance of any of the sched_domains.

Then update_next_balance() keeps track of the earliest balance to happen next.
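
For reference, update_next_balance() only pulls the rq-level next_balance
earlier when a sched_domain is due sooner; a sketch close to the helper in
kernel/sched/fair.c:

static void
update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
{
	unsigned long interval, next;

	/* Used by idle balance, so cpu_busy = 0. */
	interval = get_sd_balance_interval(sd, 0);
	next = sd->last_balance + interval;

	/* Keep the earliest deadline among the domains we visited. */
	if (time_after(*next_balance, next))
		*next_balance = next;
}

So the 60*HZ init is only an upper bound that the first visited domain
immediately shrinks.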

>
>
> >> should be 15 jiffies apart on the test system with CONFIG_HZ_250.
> >> When I did a trace on a single CPU, I see that update_blocked_averages
> >> are often called between 1 to 4 jiffies apart, which is at a much higher
> >> rate than I expected.  I haven't taken a closer look yet.  But you may
> >
> > 2 things can trigger a SCHED_SOFTIRQ/run_rebalance_domains:
> > - the need for an update of blocked load, which should not happen more
> > than once every 32ms, i.e. a rate of around 30 Hz
> > - the need for a load balance of a sched_domain. The min interval for
> > a sched_domain is its weight when the CPU is idle, which is usually a few
> > jiffies
> >
> > The only idea that I have for now is that we spend less time in
> > newidle_balance which changes the dynamic of your system.
> >
> > In your trace, could you check if update_blocked_averages is called
> > during the tick? And is the current task the idle task?
>
> Here's a snapshot of the trace. However I didn't have the current task in my 
>

Re: [PATCH] sched/fair: Rate limit calls to update_blocked_averages() for NOHZ

2021-04-08 Thread Vincent Guittot
On Wed, 7 Apr 2021 at 19:19, Tim Chen  wrote:
>
>
>
> On 4/7/21 7:02 AM, Vincent Guittot wrote:
> > Hi Tim,
> >
> > On Wed, 24 Mar 2021 at 17:05, Tim Chen  wrote:
> >>
> >>
> >>
> >> On 3/24/21 6:44 AM, Vincent Guittot wrote:
> >>> Hi Tim,
> >>
> >>>
> >>> IIUC your problem, we call update_blocked_averages() but because of:
> >>>
> >>>   if (this_rq->avg_idle < curr_cost + 
> >>> sd->max_newidle_lb_cost) {
> >>>   update_next_balance(sd, &next_balance);
> >>>   break;
> >>>   }
> >>>
> >>> the for_each_domain loop stops even before running load_balance on the 1st
> >>> sched domain level which means that update_blocked_averages() was called
> >>> unnecessarily.
> >>>
> >>
> >> That's right
> >>
> >>> And this is even more true with a small sysctl_sched_migration_cost which 
> >>> allows newly
> >>> idle LB for very small this_rq->avg_idle. We could wonder why you set 
> >>> such a low value
> >>> for sysctl_sched_migration_cost which is lower than the 
> >>> max_newidle_lb_cost of the
> >>> smallest domain but that's probably because of task_hot().
> >>>
> >>> if avg_idle is lower than the sd->max_newidle_lb_cost of the 1st 
> >>> sched_domain, we should
> >>> skip spin_unlock/lock and for_each_domain() loop entirely
> >>>
> >>> Maybe something like below:
> >>>
> >>
> >> The patch makes sense.  I'll ask our benchmark team to queue this patch 
> >> for testing.
> >
> > Do you have feedback from your benchmark team ?
> >
>
> Vincent,
>
> Thanks for following up. I just got some data back from the benchmark team.
> The performance didn't change with your patch.  And the overall cpu% of 
> update_blocked_averages
> also remain at about the same level.  My first thought was perhaps this update
> still didn't catch all the calls to update_blocked_averages
>
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> -   !READ_ONCE(this_rq->rd->overload)) {
> +   !READ_ONCE(this_rq->rd->overload) ||
> +   (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
>
> To experiment, I added one more check on the next_balance to further limit
> the path to actually do idle load balance with the next_balance time.
>
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> -   !READ_ONCE(this_rq->rd->overload)) {
> +   time_before(jiffies, this_rq->next_balance) ||
> +   !READ_ONCE(this_rq->rd->overload) ||
> +   (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
>
> I was surprised to find the overall cpu% consumption of update_blocked_averages
> and throughput of the benchmark still didn't change much.  So I took a
> peek into the profile and found the update_blocked_averages calls shifted to 
> the idle load balancer.
> The call to update_blocked_averages was reduced in newidle_balance so the 
> patch did
> what we intended.  But the overall rate of calls to

At least, we have removed the useless call to update_blocked_averages
in newidle_balance when we will not perform any newly idle load
balance

> update_blocked_averages remain roughly the same, shifting from
> newidle_balance to run_rebalance_domains.
>
>100.00%  (810cf070)
> |
> ---update_blocked_averages
>|
>|--95.47%--run_rebalance_domains
>|  __do_softirq
>|  |
>|  |--94.27%--asm_call_irq_on_stack
>|  |  do_softirq_own_stack

The calls to update_blocked_averages mainly come from SCHED_SOFTIRQ and,
as a result, not from the new path
do_idle()->nohz_run_idle_balance(), which has been added by this patch
to defer the call to update_nohz_stats() until after newidle_balance() and
before entering idle.
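
Roughly, that deferred path looks like the sketch below (close to the helper
added by the series under discussion; the exact flag handling may differ):

/*
 * Called from do_idle() before entering idle: update the blocked load only
 * if newidle_balance() flagged the need for it and no SCHED_SOFTIRQ is
 * about to do the same work anyway.
 */
void nohz_run_idle_balance(int cpu)
{
	unsigned int flags;

	flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));

	if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
		_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
}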

>|  |  |
>|  |  |--93.74%--irq_exit_rcu
>|  |  |  |
>|  |  |  
> |--88.20%--sysvec_apic_timer_interrupt
>|  |  |  |  
> asm_sysvec_apic_timer_interrupt
>|  |  |  |  |
>...
>|
>   

Re: [PATCH] sched/fair: Rate limit calls to update_blocked_averages() for NOHZ

2021-04-07 Thread Vincent Guittot
Hi Tim,

On Wed, 24 Mar 2021 at 17:05, Tim Chen  wrote:
>
>
>
> On 3/24/21 6:44 AM, Vincent Guittot wrote:
> > Hi Tim,
>
> >
> > IIUC your problem, we call update_blocked_averages() but because of:
> >
> >   if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
> >   update_next_balance(sd, &next_balance);
> >   break;
> >   }
> >
> > the for_each_domain loop stops even before running load_balance on the 1st
> > sched domain level which means that update_blocked_averages() was called
> > unnecessarily.
> >
>
> That's right
>
> > And this is even more true with a small sysctl_sched_migration_cost which 
> > allows newly
> > idle LB for very small this_rq->avg_idle. We could wonder why you set such 
> > a low value
> > for sysctl_sched_migration_cost which is lower than the max_newidle_lb_cost 
> > of the
> > smallest domain but that's probably because of task_hot().
> >
> > if avg_idle is lower than the sd->max_newidle_lb_cost of the 1st 
> > sched_domain, we should
> > skip spin_unlock/lock and for_each_domain() loop entirely
> >
> > Maybe something like below:
> >
>
> The patch makes sense.  I'll ask our benchmark team to queue this patch for 
> testing.

Do you have feedback from your benchmark team ?

Regards,
Vincent
>
> Tim
>
>


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-07 Thread Vincent Guittot
On Wed, 7 Apr 2021 at 12:19, Peter Zijlstra  wrote:
>
> On Wed, Apr 07, 2021 at 11:54:37AM +0200, Peter Zijlstra wrote:
>
> > Let me have another poke at it.
>
> Pretty much what you did, except I also did s/smt/has_idle_core/ and
> fixed that @sd thing.
>
> Like so then?

Yes. Looks good to me

>
> ---
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int c
>  {
> struct sched_domain_shared *sds;
>
> -   if (static_branch_likely(&sched_smt_present)) {
> -   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> -   if (sds)
> -   return READ_ONCE(sds->has_idle_cores);
> -   }
> +   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +   if (sds)
> +   return READ_ONCE(sds->has_idle_cores);
>
> return def;
>  }
> @@ -6112,6 +6110,24 @@ static int select_idle_core(struct task_
> return -1;
>  }
>
> +/*
> + * Scan the local SMT mask for idle CPUs.
> + */
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +{
> +   int cpu;
> +
> +   for_each_cpu(cpu, cpu_smt_mask(target)) {
> +   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> +   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> +   continue;
> +   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +   return cpu;
> +   }
> +
> +   return -1;
> +}
> +
>  #else /* CONFIG_SCHED_SMT */
>
>  static inline void set_idle_cores(int cpu, int val)
> @@ -6128,6 +6144,11 @@ static inline int select_idle_core(struc
> return __select_idle_cpu(core);
>  }
>
> +static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
> *sd, int target)
> +{
> +   return -1;
> +}
> +
>  #endif /* CONFIG_SCHED_SMT */
>
>  /*
> @@ -6135,11 +6156,10 @@ static inline int select_idle_core(struc
>   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
>   * average idle time for this rq (as found in rq->avg_idle).
>   */
> -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> bool has_idle_core, int target)
>  {
> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> int i, cpu, idle_cpu = -1, nr = INT_MAX;
> -   bool smt = test_idle_cores(target, false);
> int this = smp_processor_id();
> struct sched_domain *this_sd;
> u64 time;
> @@ -6150,7 +6170,7 @@ static int select_idle_cpu(struct task_s
>
> cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
>
> -   if (sched_feat(SIS_PROP) && !smt) {
> +   if (sched_feat(SIS_PROP) && !has_idle_core) {
> u64 avg_cost, avg_idle, span_avg;
>
> /*
> @@ -6170,7 +6190,7 @@ static int select_idle_cpu(struct task_s
> }
>
> for_each_cpu_wrap(cpu, cpus, target) {
> -   if (smt) {
> +   if (has_idle_core) {
> i = select_idle_core(p, cpu, cpus, &idle_cpu);
> if ((unsigned int)i < nr_cpumask_bits)
> return i;
> @@ -6184,10 +6204,10 @@ static int select_idle_cpu(struct task_s
> }
> }
>
> -   if (smt)
> +   if (has_idle_core)
> set_idle_cores(this, false);
>
> -   if (sched_feat(SIS_PROP) && !smt) {
> +   if (sched_feat(SIS_PROP) && !has_idle_core) {
> time = cpu_clock(this) - time;
> update_avg(&this_sd->avg_scan_cost, time);
> }
> @@ -6242,6 +6262,7 @@ static inline bool asym_fits_capacity(in
>   */
>  static int select_idle_sibling(struct task_struct *p, int prev, int target)
>  {
> +   bool has_idle_core = false;
> struct sched_domain *sd;
> unsigned long task_util;
> int i, recent_used_cpu;
> @@ -6321,7 +6342,17 @@ static int select_idle_sibling(struct ta
> if (!sd)
> return target;
>
> -   i = select_idle_cpu(p, sd, target);
> +   if (static_branch_likely(&sched_smt_present)) {
> +   has_idle_core = test_idle_cores(target, false);
> +
> +   if (!has_idle_core && cpus_share_cache(prev, target)) {
> +   i = select_idle_smt(p, sd, prev);
> +   if ((unsigned int)i < nr_cpumask_bits)
> +   return i;
> +   }
> +   }
> +
> +   i = select_idle_cpu(p, sd, has_idle_core, target);
> if ((unsigned)i < nr_cpumask_bits)
> return i;
>


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-07 Thread Vincent Guittot
On Wed, 7 Apr 2021 at 11:55, Peter Zijlstra  wrote:
>
> On Wed, Apr 07, 2021 at 11:42:17AM +0200, Vincent Guittot wrote:
> > I would really prefer to keep that out of select_idle_cpu which aims to 
> > merge in one
> > single loop the walk through sd_llc. In the case of select_idle_smt, this 
> > is done outside
> > the loop:
>
> Fair enough.
>
> > @@ -6317,11 +6339,21 @@ static int select_idle_sibling(struct task_struct 
> > *p, int prev, int target)
> >   }
> >   }
> >
> > + if (static_branch_likely(&sched_smt_present)) {
> > + smt = test_idle_cores(target, false);
> > + if (!smt && cpus_share_cache(prev, target)) {
> > + /* No idle core. Check if prev has an idle sibling. */
> > + i = select_idle_smt(p, sd, prev);
> > + if ((unsigned int)i < nr_cpumask_bits)
> > + return i;
> > + }
> > + }
> > +
> >   sd = rcu_dereference(per_cpu(sd_llc, target));
> >   if (!sd)
> >   return target;
>
> It needs to be here, otherwise you're using @sd uninitialized.

argh yes...

>
> > - i = select_idle_cpu(p, sd, target);
> > + i = select_idle_cpu(p, sd, smt, target);
> >   if ((unsigned)i < nr_cpumask_bits)
> >   return i;
>
> Let me have another poke at it.


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-07 Thread Vincent Guittot
On Wednesday 07 April 2021 at 09:17:18 (+0200), Peter Zijlstra wrote:
> On Tue, Apr 06, 2021 at 11:26:37AM -0400, Rik van Riel wrote:
> > I would be happy to pull the static branch out of select_idle_smt()
> > and place it into this if condition, though. You are right that
> > would save some overhead on non-smt systems.
> > 
> > Peter, would you prefer a follow-up patch for that or a version 4
> > of the patch?
> 
> Sorry, I was side-tracked with that core scheduling crap.. Something
> like the below then?
> 
> (Also fixed that stray line-wrap)
> 
> ---
> Subject: sched/fair: Bring back select_idle_smt(), but differently
> From: Rik van Riel 
> Date: Fri, 26 Mar 2021 15:19:32 -0400
> 
> From: Rik van Riel 
> 
> Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge
> select_idle_core/cpu()"), resulting in the kernel being more efficient
> at finding an idle CPU, and in tasks spending less time waiting to be
> run, both according to the schedstats run_delay numbers, and according
> to measured application latencies. Yay.
> 
> The flip side of this is that we see more task migrations (about 30%
> more), higher cache misses, higher memory bandwidth utilization, and
> higher CPU use, for the same number of requests/second.
> 
> This is most pronounced on a memcache type workload, which saw a
> consistent 1-3% increase in total CPU use on the system, due to those
> increased task migrations leading to higher L2 cache miss numbers, and
> higher memory utilization. The exclusive L3 cache on Skylake does us
> no favors there.
> 
> On our web serving workload, that effect is usually negligible.
> 
> It appears that the increased number of CPU migrations is generally a
> good thing, since it leads to lower cpu_delay numbers, reflecting the
> fact that tasks get to run faster. However, the reduced locality and
> the corresponding increase in L2 cache misses hurts a little.
> 
> The patch below appears to fix the regression, while keeping the
> benefit of the lower cpu_delay numbers, by reintroducing
> select_idle_smt with a twist: when a socket has no idle cores, check
> to see if the sibling of "prev" is idle, before searching all the
> other CPUs.
> 
> This fixes both the occasional 9% regression on the web serving
> workload, and the continuous 2% CPU use regression on the memcache
> type workload.
> 
> With Mel's patches and this patch together, task migrations are still
> high, but L2 cache misses, memory bandwidth, and CPU time used are
> back down to what they were before. The p95 and p99 response times for
> the memcache type application improve by about 10% over what they were
> before Mel's patches got merged.
> 
> Signed-off-by: Rik van Riel 
> Signed-off-by: Peter Zijlstra (Intel) 
> Link: https://lkml.kernel.org/r/20210326151932.2c187...@imladris.surriel.com
> ---
>  kernel/sched/fair.c |   39 +--
>  1 file changed, 37 insertions(+), 2 deletions(-)
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6112,6 +6112,27 @@ static int select_idle_core(struct task_
>   return -1;
>  }
>  
> +/*
> + * Scan the local SMT mask for idle CPUs.
> + */
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +{
> + int cpu;
> +
> + if (!static_branch_likely(&sched_smt_present))
> + return -1;
> +
> + for_each_cpu(cpu, cpu_smt_mask(target)) {
> + if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> + !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> + continue;
> + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> + return cpu;
> + }
> +
> + return -1;
> +}
> +
>  #else /* CONFIG_SCHED_SMT */
>  
>  static inline void set_idle_cores(int cpu, int val)
> @@ -6128,6 +6149,11 @@ static inline int select_idle_core(struc
>   return __select_idle_cpu(core);
>  }
>  
> +static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
> *sd, int target)
> +{
> + return -1;
> +}
> +
>  #endif /* CONFIG_SCHED_SMT */
>  
>  /*
> @@ -6135,7 +6161,7 @@ static inline int select_idle_core(struc
>   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
>   * average idle time for this rq (as found in rq->avg_idle).
>   */
> -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int prev, int target)
>  {
>   struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
>   int i, cpu, idle_cpu = -1, nr = INT_MAX;
> @@ -6148,6 +6174,15 @@ static int select_idle_cpu(struct task_s
>   if (!this_sd)
>   return -1;
>  
> + /* If we have SMT but there are no idle cores */
> > + if (static_branch_likely(&sched_smt_present) && !smt) {
> + if (cpus_share_cache(prev, target)) {
> + i = select_idle_smt(p, sd, prev);
> +   

Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-06 Thread Vincent Guittot
On Tue, 6 Apr 2021 at 17:55, Rik van Riel  wrote:
>
> On Tue, 2021-04-06 at 17:31 +0200, Vincent Guittot wrote:
> > On Tue, 6 Apr 2021 at 17:26, Rik van Riel  wrote:
> > > On Tue, 2021-04-06 at 17:10 +0200, Vincent Guittot wrote:
> > > > On Fri, 26 Mar 2021 at 20:19, Rik van Riel 
> > > > wrote:
> > > >
> > > > > -static int select_idle_cpu(struct task_struct *p, struct
> > > > > sched_domain *sd, int target)
> > > > > +static int select_idle_cpu(struct task_struct *p, struct
> > > > > sched_domain *sd, int prev, int target)
> > > > >  {
> > > > > struct cpumask *cpus =
> > > > > this_cpu_cpumask_var_ptr(select_idle_mask);
> > > > > int i, cpu, idle_cpu = -1, nr = INT_MAX;
> > > > > @@ -6136,23 +6163,32 @@ static int select_idle_cpu(struct
> > > > > task_struct *p, struct sched_domain *sd, int t
> > > > >
> > > > > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > > > >
> > > > > -   if (sched_feat(SIS_PROP) && !smt) {
> > > > > -   u64 avg_cost, avg_idle, span_avg;
> > > > > +   if (!smt) {
> > > > > +   if (cpus_share_cache(prev, target)) {
> > > >
> > > > Have you checked the impact on a non-SMT system? It would be worth
> > > > a static branch.
> > > >
> > > > Also, this doesn't need to be in select_idle_cpu(), which aims to
> > > > loop over the sched_domain, because you only compare target and
> > > > prev. So you can move this call to select_idle_smt() into
> > > > select_idle_sibling()
> > >
> > > After Mel's rewrite, there no longer are calls to
> > > select_idle_core() or select_idle_smt() in select_idle_sibling().
> >
> > select_idle_smt() had even disappeared; that's why it was not in
> > select_idle_sibling()
> >
> > > Everything got folded into one single loop in select_idle_cpu()
> >
> > but this is done completely out of the loop so we don't need to
> > complify the function with unrelated stuff
>
> Not entirely. The call to select_idle_smt() is still
> conditional on test_idle_cores() returning false.
>
> We only look for the
> other sibling if there is no idle
> core in the LLC. If there is an idle core, we prefer
> that.
>
> Pulling the select_idle_smt() call out of select_idle_cpu()
> would mean having to test_idle_cores() twice.

In this case, pass the result of test_idle_cores() as a parameter instead of prev.

>
> --
> All Rights Reversed.


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-06 Thread Vincent Guittot
On Tue, 6 Apr 2021 at 17:31, Vincent Guittot  wrote:
>
> On Tue, 6 Apr 2021 at 17:26, Rik van Riel  wrote:
> >
> > On Tue, 2021-04-06 at 17:10 +0200, Vincent Guittot wrote:
> > > On Fri, 26 Mar 2021 at 20:19, Rik van Riel  wrote:
> > >
> > > > -static int select_idle_cpu(struct task_struct *p, struct
> > > > sched_domain *sd, int target)
> > > > +static int select_idle_cpu(struct task_struct *p, struct
> > > > sched_domain *sd, int prev, int target)
> > > >  {
> > > > struct cpumask *cpus =
> > > > this_cpu_cpumask_var_ptr(select_idle_mask);
> > > > int i, cpu, idle_cpu = -1, nr = INT_MAX;
> > > > @@ -6136,23 +6163,32 @@ static int select_idle_cpu(struct
> > > > task_struct *p, struct sched_domain *sd, int t
> > > >
> > > > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > > >
> > > > -   if (sched_feat(SIS_PROP) && !smt) {
> > > > -   u64 avg_cost, avg_idle, span_avg;
> > > > +   if (!smt) {
> > > > +   if (cpus_share_cache(prev, target)) {
> > >
> > > Have you checked the impact on a non-SMT system? It would be worth a
> > > static branch.
> > >
> > > Also, this doesn't need to be in select_idle_cpu(), which aims to loop
> > > over the sched_domain, because you only compare target and prev. So you
> > > can move this call to select_idle_smt() into select_idle_sibling()
> >
> > After Mel's rewrite, there no longer are calls to
> > select_idle_core() or select_idle_smt() in select_idle_sibling().
>
> select_idle_smt() had even disappeared; that's why it was not in
> select_idle_sibling()
>
> >
> > Everything got folded into one single loop in select_idle_cpu()
>
> but this is done completely out of the loop so we don't need to
> complify the function with unrelated stuff


s/complify/complexify/

>
> >
> > I would be happy to pull the static branch out of select_idle_smt()
> > and place it into this if condition, though. You are right that
> > would save some overhead on non-smt systems.
> >
> > Peter, would you prefer a follow-up patch for that or a version 4
> > of the patch?
> >
> > --
> > All Rights Reversed.


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-06 Thread Vincent Guittot
On Tue, 6 Apr 2021 at 17:26, Rik van Riel  wrote:
>
> On Tue, 2021-04-06 at 17:10 +0200, Vincent Guittot wrote:
> > On Fri, 26 Mar 2021 at 20:19, Rik van Riel  wrote:
> >
> > > -static int select_idle_cpu(struct task_struct *p, struct
> > > sched_domain *sd, int target)
> > > +static int select_idle_cpu(struct task_struct *p, struct
> > > sched_domain *sd, int prev, int target)
> > >  {
> > > struct cpumask *cpus =
> > > this_cpu_cpumask_var_ptr(select_idle_mask);
> > > int i, cpu, idle_cpu = -1, nr = INT_MAX;
> > > @@ -6136,23 +6163,32 @@ static int select_idle_cpu(struct
> > > task_struct *p, struct sched_domain *sd, int t
> > >
> > > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > >
> > > -   if (sched_feat(SIS_PROP) && !smt) {
> > > -   u64 avg_cost, avg_idle, span_avg;
> > > +   if (!smt) {
> > > +   if (cpus_share_cache(prev, target)) {
> >
> > Have you checked the impact on a non-SMT system? It would be worth a
> > static branch.
> >
> > Also, this doesn't need to be in select_idle_cpu(), which aims to loop
> > over the sched_domain, because you only compare target and prev. So you
> > can move this call to select_idle_smt() into select_idle_sibling()
>
> After Mel's rewrite, there no longer are calls to
> select_idle_core() or select_idle_smt() in select_idle_sibling().

select_idle_smt() had even disappeared; that's why it was not in
select_idle_sibling()

>
> Everything got folded into one single loop in select_idle_cpu()

but this is done completely out of the loop so we don't need to
complify the function with unrelated stuff

>
> I would be happy to pull the static branch out of select_idle_smt()
> and place it into this if condition, though. You are right that
> would save some overhead on non-smt systems.
>
> Peter, would you prefer a follow-up patch for that or a version 4
> of the patch?
>
> --
> All Rights Reversed.


Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-06 Thread Vincent Guittot
On Fri, 26 Mar 2021 at 20:19, Rik van Riel  wrote:
>
> On Mon, 22 Mar 2021 11:03:06 +
> Mel Gorman  wrote:
>
>
> > Second, select_idle_smt() does not use the cpus mask so consider moving
> > the cpus initialisation after select_idle_smt() has been called.
> > Specifically this initialisation
> >
> >   cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> >
> > Alternatively, clear the bits in the SMT sibling scan to avoid checking
> > the siblings twice. It's a tradeoff because initialising and clearing
> > bits is not free and the cost is wasted if a sibling is free.
>
> I tried a number of different variations on moving the CPU mask
> initialization, and clearing CPUs from the mask, and failed to
> get any clear results from those in testing, even in workloads
> with lots of context switches.
>
> Below is a simple version that seems to perform identically to
> more complicated versions :)
>
> ---8<---
> sched,fair: bring back select_idle_smt, but differently
>
> Mel Gorman did some nice work in 9fe1f127b913
> ("sched/fair: Merge select_idle_core/cpu()"), resulting in the kernel
> being more efficient at finding an idle CPU, and in tasks spending less
> time waiting to be run, both according to the schedstats run_delay
> numbers, and according to measured application latencies. Yay.
>
> The flip side of this is that we see more task migrations (about
> 30% more), higher cache misses, higher memory bandwidth utilization,
> and higher CPU use, for the same number of requests/second.
>
> This is most pronounced on a memcache type workload, which saw
> a consistent 1-3% increase in total CPU use on the system, due
> to those increased task migrations leading to higher L2 cache
> miss numbers, and higher memory utilization. The exclusive L3
> cache on Skylake does us no favors there.
>
> On our web serving workload, that effect is usually negligible.
>
> It appears that the increased number of CPU migrations is generally
> a good thing, since it leads to lower cpu_delay numbers, reflecting
> the fact that tasks get to run faster. However, the reduced locality
> and the corresponding increase in L2 cache misses hurts a little.
>
> The patch below appears to fix the regression, while keeping the
> benefit of the lower cpu_delay numbers, by reintroducing select_idle_smt
> with a twist: when a socket has no idle cores, check to see if the
> sibling of "prev" is idle, before searching all the other CPUs.
>
> This fixes both the occasional 9% regression on the web serving
> workload, and the continuous 2% CPU use regression on the memcache
> type workload.
>
> With Mel's patches and this patch together, task migrations are still
> high, but L2 cache misses, memory bandwidth, and CPU time used are back
> down to what they were before. The p95 and p99 response times for the
> memcache type application improve by about 10% over what they were
> before Mel's patches got merged.
>
> Signed-off-by: Rik van Riel 
> ---
>  kernel/sched/fair.c | 68 ++---
>  1 file changed, 52 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 794c2cb945f8..69680158963f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6098,6 +6098,28 @@ static int select_idle_core(struct task_struct *p, int 
> core, struct cpumask *cpu
> return -1;
>  }
>
> +/*
> + * Scan the local SMT mask for idle CPUs.
> + */
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, 
> int
> +target)
> +{
> +   int cpu;
> +
> +   if (!static_branch_likely(&sched_smt_present))
> +   return -1;
> +
> +   for_each_cpu(cpu, cpu_smt_mask(target)) {
> +   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> +   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> +   continue;
> +   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
> +   return cpu;
> +   }
> +
> +   return -1;
> +}
> +
>  #else /* CONFIG_SCHED_SMT */
>
>  static inline void set_idle_cores(int cpu, int val)
> @@ -6114,6 +6136,11 @@ static inline int select_idle_core(struct task_struct 
> *p, int core, struct cpuma
> return __select_idle_cpu(core);
>  }
>
> +static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
> *sd, int target)
> +{
> +   return -1;
> +}
> +
>  #endif /* CONFIG_SCHED_SMT */
>
>  /*
> @@ -6121,7 +6148,7 @@ static inline int select_idle_core(struct task_struct 
> *p, int core, struct cpuma
>   * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
>   * average idle time for this rq (as found in rq->avg_idle).
>   */
> -static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int target)
> +static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
> int prev, int target)
>  {
> struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
> 

Re: [PATCH 2/4] sched/fair: Introduce arch_sched_asym_prefer_early()

2021-04-06 Thread Vincent Guittot
On Tue, 6 Apr 2021 at 06:11, Ricardo Neri
 wrote:
>
> Introduce arch_sched_asym_prefer_early() so that architectures with SMT
> can delay the decision to label a candidate busiest group as
> group_asym_packing.
>
> When using asymmetric packing, high priority idle CPUs pull tasks from
> scheduling groups with low priority CPUs. The decision on using asymmetric
> packing for load balancing is done after collecting the statistics of a
> candidate busiest group. However, this decision needs to consider the
> state of SMT siblings of dst_cpu.
>
> Cc: Aubrey Li 
> Cc: Ben Segall 
> Cc: Daniel Bristot de Oliveira 
> Cc: Dietmar Eggemann 
> Cc: Joel Fernandes (Google) 
> Cc: Mel Gorman 
> Cc: Quentin Perret 
> Cc: Srinivas Pandruvada 
> Cc: Steven Rostedt 
> Cc: Tim Chen 
> Reviewed-by: Len Brown 
> Signed-off-by: Ricardo Neri 
> ---
>  include/linux/sched/topology.h |  1 +
>  kernel/sched/fair.c| 11 ++-
>  2 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 8f0f778b7c91..663b98959305 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -57,6 +57,7 @@ static inline int cpu_numa_flags(void)
>  #endif
>
>  extern int arch_asym_cpu_priority(int cpu);
> +extern bool arch_sched_asym_prefer_early(int a, int b);
>
>  struct sched_domain_attr {
> int relax_domain_level;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4ef3fa0d5e8d..e74da853b046 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -106,6 +106,15 @@ int __weak arch_asym_cpu_priority(int cpu)
> return -cpu;
>  }
>
> +/*
> + * For asym packing, early check if CPUs with higher priority should be
> + * preferred. On some architectures, more data is needed to make a decision.
> + */
> +bool __weak arch_sched_asym_prefer_early(int a, int b)
> +{
> +   return sched_asym_prefer(a, b);
> +}
> +
>  /*
>   * The margin used when comparing utilization with CPU capacity.
>   *
> @@ -8458,7 +8467,7 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
> if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
> env->idle != CPU_NOT_IDLE &&
> sgs->sum_h_nr_running &&
> -   sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
> +   arch_sched_asym_prefer_early(env->dst_cpu, 
> group->asym_prefer_cpu)) {

If ITMT makes arch_sched_asym_prefer_early() return true, all groups will be
set as group_asym_packing unconditionally, which is wrong. The state
has to be set only when we want an asym packing migration.

> sgs->group_asym_packing = 1;
> }
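
To make the concern concrete, a hypothetical arch override (not part of the
series) that always defers the decision would make the quoted hunk tag every
eligible group:

/* Hypothetical override: always claim "prefer", deferring the real check. */
bool arch_sched_asym_prefer_early(int a, int b)
{
	return true;
}

With that, every non-local group with running tasks on an SD_ASYM_PACKING
domain gets sgs->group_asym_packing = 1 while the balancing CPU is idle,
i.e. it is classified group_asym_packing whether or not a migration is
actually wanted.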
>
> --
> 2.17.1
>


Re: [PATCH v4 3/3] sched/fair: Introduce a CPU capacity comparison helper

2021-04-02 Thread Vincent Guittot
On Thu, 1 Apr 2021 at 21:30, Valentin Schneider
 wrote:
>
> During load-balance, groups classified as group_misfit_task are filtered
> out if they do not pass
>
>   group_smaller_max_cpu_capacity(<candidate group>, <local group>);
>
> which itself employs fits_capacity() to compare the sgc->max_capacity of
> both groups.
>
> Due to the underlying margin, fits_capacity(X, 1024) will return false for
> any X > 819. Tough luck, the capacity_orig's on e.g. the Pixel 4 are
> {261, 871, 1024}. If a CPU-bound task ends up on one of those "medium"
> CPUs, misfit migration will never intentionally upmigrate it to a CPU of
> higher capacity due to the aforementioned margin.
>
> One may argue the 20% margin of fits_capacity() is excessive in the advent
> of counter-enhanced load tracking (APERF/MPERF, AMUs), but one point here
> is that fits_capacity() is meant to compare a utilization value to a
> capacity value, whereas here it is being used to compare two capacity
> values. As CPU capacity and task utilization have different dynamics, a
> sensible approach here would be to add a new helper dedicated to comparing
> CPU capacities.
>
> While at it, replace group_smaller_{min, max}_cpu_capacity() with
> comparisons of the source group's min/max capacity and the destination
> CPU's capacity.
>
> Reviewed-by: Qais Yousef 
> Tested-by: Lingutla Chandrasekhar 
> Signed-off-by: Valentin Schneider 

Reviewed-by: Vincent Guittot 
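
For reference, plugging the Pixel 4 numbers from the changelog into the two
margins quoted in the patch below (a check of the arithmetic, not code from
the series):

/*
 * fits_capacity(cap, max):      cap * 1280 < max * 1024        (20% margin)
 *   fits_capacity(871, 1024):   1114880 < 1048576  -> false
 *
 * capacity_greater(cap1, cap2): cap1 * 1024 > cap2 * 1078      (~5% margin)
 *   capacity_greater(1024, 871): 1048576 > 938938  -> true
 *
 * So a misfit task stuck on a "medium" (871) CPU can now be considered for
 * upmigration to a "big" (1024) CPU, which the old comparison rejected.
 */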

> ---
>  kernel/sched/fair.c | 33 ++---
>  1 file changed, 10 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d8077f82a380..c9c5c2697998 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -113,6 +113,13 @@ int __weak arch_asym_cpu_priority(int cpu)
>   */
>  #define fits_capacity(cap, max)((cap) * 1280 < (max) * 1024)
>
> +/*
> + * The margin used when comparing CPU capacities.
> + * is 'cap1' noticeably greater than 'cap2'
> + *
> + * (default: ~5%)
> + */
> +#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
>  #endif
>
>  #ifdef CONFIG_CFS_BANDWIDTH
> @@ -8364,26 +8371,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct 
> sg_lb_stats *sgs)
> return false;
>  }
>
> -/*
> - * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
> - * per-CPU capacity than sched_group ref.
> - */
> -static inline bool
> -group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group 
> *ref)
> -{
> -   return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
> -}
> -
> -/*
> - * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
> - * per-CPU capacity_orig than sched_group ref.
> - */
> -static inline bool
> -group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group 
> *ref)
> -{
> -   return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
> -}
> -
>  static inline enum
>  group_type group_classify(unsigned int imbalance_pct,
>   struct sched_group *group,
> @@ -8539,7 +8526,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  * internally or be covered by avg_load imbalance (eventually).
>  */
> if (sgs->group_type == group_misfit_task &&
> -   (!group_smaller_max_cpu_capacity(sg, sds->local) ||
> +   (!capacity_greater(capacity_of(env->dst_cpu), 
> sg->sgc->max_capacity) ||
>  sds->local_stat.group_type != group_has_spare))
> return false;
>
> @@ -8623,7 +8610,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  */
> if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
> (sgs->group_type <= group_fully_busy) &&
> -   (group_smaller_min_cpu_capacity(sds->local, sg)))
> +   (capacity_greater(sg->sgc->min_capacity, 
> capacity_of(env->dst_cpu
> return false;
>
> return true;
> @@ -9423,7 +9410,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
>  * average load.
>  */
> if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> -   capacity_of(env->dst_cpu) < capacity &&
> +   !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
> nr_running == 1)
> continue;
>
> --
> 2.25.1
>
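
As a quick sanity check of the two margins discussed above, a standalone
userspace sketch using the two macros quoted in the patch, with the Pixel 4
capacities from the changelog as example inputs (illustrative only):

	#include <stdio.h>

	/* Copied from the quoted hunks: ~20% margin vs ~5% margin. */
	#define fits_capacity(cap, max)      ((cap) * 1280 < (max) * 1024)
	#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)

	int main(void)
	{
		/* A "medium" CPU (capacity_orig 871) against a big CPU (1024). */
		printf("%d\n", fits_capacity(871, 1024));    /* 0: 871 > 819, fails the 20% margin */
		printf("%d\n", capacity_greater(1024, 871)); /* 1: 1024 is more than ~5% above 871 */
		printf("%d\n", capacity_greater(871, 860));  /* 0: within ~5%, not "noticeably" greater */
		return 0;
	}

In other words, the ~5% margin lets a big CPU (1024) be seen as noticeably more
capable than a medium one (871), which the 20% margin of fits_capacity() does not.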


Re: [PATCH v4 2/3] sched/fair: Clean up active balance nr_balance_failed trickery

2021-04-02 Thread Vincent Guittot
On Thu, 1 Apr 2021 at 21:30, Valentin Schneider
 wrote:
>
> When triggering an active load balance, sd->nr_balance_failed is set to
> such a value that any further can_migrate_task() using said sd will ignore
> the output of task_hot().
>
> This behaviour makes sense, as active load balance intentionally preempts a
> rq's running task to migrate it right away, but this asynchronous write is
> a bit shoddy, as the stopper thread might run active_load_balance_cpu_stop
> before the sd->nr_balance_failed write either becomes visible to the
> stopper's CPU or even happens on the CPU that appended the stopper work.
>
> Add a struct lb_env flag to denote active balancing, and use it in
> can_migrate_task(). Remove the sd->nr_balance_failed write that served the
> same purpose. Cleanup the LBF_DST_PINNED active balance special case.
>
> Signed-off-by: Valentin Schneider 

Reviewed-by: Vincent Guittot 

> ---
>  kernel/sched/fair.c | 31 +++
>  1 file changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04d5e14fa261..d8077f82a380 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7422,6 +7422,7 @@ enum migration_type {
>  #define LBF_NEED_BREAK 0x02
>  #define LBF_DST_PINNED  0x04
>  #define LBF_SOME_PINNED0x08
> +#define LBF_ACTIVE_LB  0x10
>
>  struct lb_env {
> struct sched_domain *sd;
> @@ -7583,10 +7584,13 @@ int can_migrate_task(struct task_struct *p, struct 
> lb_env *env)
>  * our sched_group. We may want to revisit it if we couldn't
>  * meet load balance goals by pulling other tasks on src_cpu.
>  *
> -* Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
> -* already computed one in current iteration.
> +* Avoid computing new_dst_cpu
> +* - for NEWLY_IDLE
> +* - if we have already computed one in current iteration
> +* - if it's an active balance
>  */
> -   if (env->idle == CPU_NEWLY_IDLE || (env->flags & 
> LBF_DST_PINNED))
> +   if (env->idle == CPU_NEWLY_IDLE ||
> +   env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
> return 0;
>
> /* Prevent to re-select dst_cpu via env's CPUs: */
> @@ -7611,10 +7615,14 @@ int can_migrate_task(struct task_struct *p, struct 
> lb_env *env)
>
> /*
>  * Aggressive migration if:
> -* 1) destination numa is preferred
> -* 2) task is cache cold, or
> -* 3) too many balance attempts have failed.
> +* 1) active balance
> +* 2) destination numa is preferred
> +* 3) task is cache cold, or
> +* 4) too many balance attempts have failed.
>  */
> +   if (env->flags & LBF_ACTIVE_LB)
> +   return 1;
> +
> tsk_cache_hot = migrate_degrades_locality(p, env);
> if (tsk_cache_hot == -1)
> tsk_cache_hot = task_hot(p, env);
> @@ -9805,9 +9813,6 @@ static int load_balance(int this_cpu, struct rq 
> *this_rq,
> active_load_balance_cpu_stop, busiest,
> &busiest->active_balance_work);
> }
> -
> -   /* We've kicked active balancing, force task 
> migration. */
> -   sd->nr_balance_failed = sd->cache_nice_tries+1;
> }
> } else {
> sd->nr_balance_failed = 0;
> @@ -9957,13 +9962,7 @@ static int active_load_balance_cpu_stop(void *data)
> .src_cpu= busiest_rq->cpu,
> .src_rq = busiest_rq,
> .idle   = CPU_IDLE,
> -   /*
> -* can_migrate_task() doesn't need to compute 
> new_dst_cpu
> -* for active balancing. Since we have CPU_IDLE, but 
> no
> -* @dst_grpmask we need to make that test go away 
> with lying
> -* about DST_PINNED.
> -*/
> -   .flags  = LBF_DST_PINNED,
> +   .flags  = LBF_ACTIVE_LB,
> };
>
> schedstat_inc(sd->alb_count);
> --
> 2.25.1
>


Re: [PATCH v4 1/3] sched/fair: Ignore percpu threads for imbalance pulls

2021-04-02 Thread Vincent Guittot
On Thu, 1 Apr 2021 at 21:30, Valentin Schneider
 wrote:
>
> From: Lingutla Chandrasekhar 
>
> During load balance, LBF_SOME_PINNED will be set if any candidate task
> cannot be detached due to CPU affinity constraints. This can result in
> setting env->sd->parent->sgc->group_imbalance, which can lead to a group
> being classified as group_imbalanced (rather than any of the other, lower
> group_type) when balancing at a higher level.
>
> In workloads involving a single task per CPU, LBF_SOME_PINNED can often be
> set due to per-CPU kthreads being the only other runnable tasks on any
> given rq. This results in changing the group classification during
> load-balance at higher levels when in reality there is nothing that can be
> done for this affinity constraint: per-CPU kthreads, as the name implies,
> don't get to move around (modulo hotplug shenanigans).
>
> It's not as clear for userspace tasks - a task could be in an N-CPU cpuset
> with N-1 offline CPUs, making it an "accidental" per-CPU task rather than
> an intended one. KTHREAD_IS_PER_CPU gives us an indisputable signal which
> we can leverage here to not set LBF_SOME_PINNED.
>
> Note that the aforementioned classification to group_imbalance (when
> nothing can be done) is especially problematic on big.LITTLE systems, which
> have a topology the likes of:
>
>   DIE [  ]
>   MC  [][]
>0  1  2  3
>L  L  B  B
>
>   arch_scale_cpu_capacity(L) < arch_scale_cpu_capacity(B)
>
> Here, setting LBF_SOME_PINNED due to a per-CPU kthread when balancing at MC
> level on CPUs [0-1] will subsequently prevent CPUs [2-3] from classifying
> the [0-1] group as group_misfit_task when balancing at DIE level. Thus, if
> CPUs [0-1] are running CPU-bound (misfit) tasks, ill-timed per-CPU kthreads
> can significantly delay the upmigration of said misfit tasks. Systems
> relying on ASYM_PACKING are likely to face similar issues.
>
> Signed-off-by: Lingutla Chandrasekhar 
> [Use kthread_is_per_cpu() rather than p->nr_cpus_allowed]
> [Reword changelog]
> Signed-off-by: Valentin Schneider 

Reviewed-by: Vincent Guittot 

> ---
>  kernel/sched/fair.c | 4 
>  1 file changed, 4 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6d73bdbb2d40..04d5e14fa261 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7567,6 +7567,10 @@ int can_migrate_task(struct task_struct *p, struct 
> lb_env *env)
> if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
> return 0;
>
> +   /* Disregard pcpu kthreads; they are where they need to be. */
> +   if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p))
> +   return 0;
> +
> if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
> int cpu;
>
> --
> 2.25.1
>


Re: [PATCH -next] sched/fair: Move update_nohz_stats() under CONFIG_NO_HZ_COMMON

2021-03-30 Thread Vincent Guittot
On Tue, 30 Mar 2021 at 14:06, Kefeng Wang  wrote:
>
> update_nohz_stats() is only called by _nohz_idle_balance(), which is under CONFIG_NO_HZ_COMMON.

A similar patch has already been sent and reviewed:
20210329144029.29200-1-yuehaib...@huawei.com

>
> Signed-off-by: Kefeng Wang 
> ---
>  kernel/sched/fair.c | 40 ++--
>  1 file changed, 18 insertions(+), 22 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6d73bdbb2d40..2a20ada83cbb 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8395,28 +8395,6 @@ group_type group_classify(unsigned int imbalance_pct,
> return group_has_spare;
>  }
>
> -static bool update_nohz_stats(struct rq *rq)
> -{
> -#ifdef CONFIG_NO_HZ_COMMON
> -   unsigned int cpu = rq->cpu;
> -
> -   if (!rq->has_blocked_load)
> -   return false;
> -
> -   if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> -   return false;
> -
> -   if (!time_after(jiffies, 
> READ_ONCE(rq->last_blocked_load_update_tick)))
> -   return true;
> -
> -   update_blocked_averages(cpu);
> -
> -   return rq->has_blocked_load;
> -#else
> -   return false;
> -#endif
> -}
> -
>  /**
>   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
>   * @env: The load balancing environment.
> @@ -10380,6 +10358,24 @@ void nohz_balance_enter_idle(int cpu)
> WRITE_ONCE(nohz.has_blocked, 1);
>  }
>
> +static bool update_nohz_stats(struct rq *rq)
> +{
> +   unsigned int cpu = rq->cpu;
> +
> +   if (!rq->has_blocked_load)
> +   return false;
> +
> +   if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> +   return false;
> +
> +   if (!time_after(jiffies, 
> READ_ONCE(rq->last_blocked_load_update_tick)))
> +   return true;
> +
> +   update_blocked_averages(cpu);
> +
> +   return rq->has_blocked_load;
> +}
> +
>  /*
>   * Internal function that runs load balance for all idle cpus. The load 
> balance
>   * can be a simple update of blocked load or a complete load balance with
> --
> 2.26.2
>


Re: [PATCH -next] sched/fair: Fix unused function warning

2021-03-29 Thread Vincent Guittot
Hi,

On Sat, 27 Mar 2021 at 14:59, YueHaibing  wrote:
>
> while CONFIG_NO_HZ_COMMON is not enabled, gcc warn this:
>
> kernel/sched/fair.c:8398:13: warning: ‘update_nohz_stats’ defined but not 
> used [-Wunused-function]
>  static bool update_nohz_stats(struct rq *rq)
>  ^
>
> Move update_nohz_stats() into the #ifdef block to fix this.

Could you add a Fixes tag?
Fixes: 0826530de3cb ("sched/fair: Remove update of blocked load from newidle_balance")

Also, could you move update_nohz_stats() closer to its only caller,
_nohz_idle_balance()?

With these small nits above:

Reviewed-by: Vincent Guittot 

>
> Signed-off-by: YueHaibing 
> ---
>  kernel/sched/fair.c | 40 ++--
>  1 file changed, 18 insertions(+), 22 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6d73bdbb2d40..c7a7ef97d167 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8395,28 +8395,6 @@ group_type group_classify(unsigned int imbalance_pct,
> return group_has_spare;
>  }
>
> -static bool update_nohz_stats(struct rq *rq)
> -{
> -#ifdef CONFIG_NO_HZ_COMMON
> -   unsigned int cpu = rq->cpu;
> -
> -   if (!rq->has_blocked_load)
> -   return false;
> -
> -   if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> -   return false;
> -
> -   if (!time_after(jiffies, 
> READ_ONCE(rq->last_blocked_load_update_tick)))
> -   return true;
> -
> -   update_blocked_averages(cpu);
> -
> -   return rq->has_blocked_load;
> -#else
> -   return false;
> -#endif
> -}
> -
>  /**
>   * update_sg_lb_stats - Update sched_group's statistics for load balancing.
>   * @env: The load balancing environment.
> @@ -10097,6 +10075,24 @@ static inline int on_null_domain(struct rq *rq)
>  }
>
>  #ifdef CONFIG_NO_HZ_COMMON
> +static bool update_nohz_stats(struct rq *rq)
> +{
> +   unsigned int cpu = rq->cpu;
> +
> +   if (!rq->has_blocked_load)
> +   return false;
> +
> +   if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
> +   return false;
> +
> +   if (!time_after(jiffies, 
> READ_ONCE(rq->last_blocked_load_update_tick)))
> +   return true;
> +
> +   update_blocked_averages(cpu);
> +
> +   return rq->has_blocked_load;
> +}
> +
>  /*
>   * idle load balancing details
>   * - When one of the busy CPUs notice that there may be an idle rebalancing
> --
> 2.17.1
>


Re: [PATCH 9/9] sched,fair: Alternative sched_slice()

2021-03-26 Thread Vincent Guittot
On Fri, 26 Mar 2021 at 11:43, Peter Zijlstra  wrote:
>
> The current sched_slice() seems to have issues; there's two possible
> things that could be improved:
>
>  - the 'nr_running' used for __sched_period() is daft when cgroups are
>considered. Using the RQ wide h_nr_running seems like a much more
>consistent number.
>
>  - (esp) cgroups can slice it real fine, which makes for easy
>over-scheduling, ensure min_gran is what the name says.
>
> Signed-off-by: Peter Zijlstra (Intel) 
> ---
>  kernel/sched/fair.c |   15 ++-
>  kernel/sched/features.h |3 +++
>  2 files changed, 17 insertions(+), 1 deletion(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -680,7 +680,16 @@ static u64 __sched_period(unsigned long
>   */
>  static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -   u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
> +   unsigned int nr_running = cfs_rq->nr_running;
> +   u64 slice;
> +
> +   if (sched_feat(ALT_PERIOD))
> +   nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
> +
> +   slice = __sched_period(nr_running + !se->on_rq);
> +
> +   if (sched_feat(BASE_SLICE))
> +   slice -= sysctl_sched_min_granularity;
>
> for_each_sched_entity(se) {
> struct load_weight *load;
> @@ -697,6 +706,10 @@ static u64 sched_slice(struct cfs_rq *cf
> }
> slice = __calc_delta(slice, se->load.weight, load);
> }
> +
> +   if (sched_feat(BASE_SLICE))
> +   slice += sysctl_sched_min_granularity;

Why not simply take the max of the slice and sysctl_sched_min_granularity,
instead of scaling only the part above sysctl_sched_min_granularity?

With your change, cases where the slice would already have been in a good
range will be modified as well.
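
A minimal sketch of the alternative being suggested (illustrative only, not a
tested patch; the loop body stays as in the quoted hunk): apply a simple floor
after the weight scaling, instead of subtracting and re-adding the minimum
granularity:

	slice = __sched_period(nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load = &cfs_rq_of(se)->load;

		/* (the !se->on_rq dummy-load handling is unchanged) */
		slice = __calc_delta(slice, se->load.weight, load);
	}

	/* Floor the final slice rather than scaling only the excess. */
	if (sched_feat(BASE_SLICE))
		slice = max_t(u64, slice, sysctl_sched_min_granularity);

	return slice;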

> +
> return slice;
>  }
>
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -90,3 +90,6 @@ SCHED_FEAT(WA_BIAS, true)
>   */
>  SCHED_FEAT(UTIL_EST, true)
>  SCHED_FEAT(UTIL_EST_FASTUP, true)
> +
> +SCHED_FEAT(ALT_PERIOD, true)
> +SCHED_FEAT(BASE_SLICE, true)
>
>


Re: [PATCH] sched/fair: Rate limit calls to update_blocked_averages() for NOHZ

2021-03-24 Thread Vincent Guittot
Hi Tim,

On Tuesday 23 March 2021 at 14:37:59 (-0700), Tim Chen wrote:
> 
> 
> On 1/29/21 9:27 AM, Vincent Guittot wrote:
> > 
> > The patch below moves the update of the blocked load of CPUs outside 
> > newidle_balance().
> 
> On a well-known database workload, we also saw a lot of overhead from doing
> update_blocked_averages
> in newidle_balance().  So changes to reduce this overhead are much welcomed.
> 
> Turning on cgroup induces 9% throughput degradation on a 2 socket 40 cores 
> per socket Icelake system.  
> 
> A big part of the overhead in our database workload comes from updating
> blocked averages in newidle_balance, caused by I/O threads making
> some CPUs go in and out of idle frequently in the following code path:
> 
> __blkdev_direct_IO_simple
>   io_schedule_timeout
>     schedule_timeout
>       schedule
>         __schedule
>           pick_next_task_fair
>             newidle_balance
>               update_blocked_averages
> 
> We found update_blocked_averages() now consumed most CPU time, eating up 2% 
> of the CPU cycles once cgroup
> gets turned on.
> 
> I hacked up Joe's original patch to rate limit the update of blocked
> averages called from newidle_balance().  The 9% throughput degradation 
> reduced to
> 5.4%.  We'll be testing Vincent's change to see if it can give
> similar performance improvement.
> 
> Though in our test environment, sysctl_sched_migration_cost was kept
> much lower (25000) compared to the default (500000), to encourage migrations 
> to idle cpu
> and reduce latency.  We got quite a lot of calls to update_blocked_averages 
> directly 
> and then try to load_balance in newidle_balance instead of relegating
> the responsibility to idle load balancer.  (See code snippet in 
> newidle_balance below)  
> 
> 
> if (this_rq->avg_idle < sysctl_sched_migration_cost ||   
> <-sched_migration_cost check
> !READ_ONCE(this_rq->rd->overload)) {
> 
> rcu_read_lock();
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> if (sd)
> update_next_balance(sd, &next_balance);
> rcu_read_unlock();
> 
> goto out;  <--- invoke idle load balancer
> }
> 
> raw_spin_unlock(&this_rq->lock);
> 
> update_blocked_averages(this_cpu);
> 
>    followed by load balance code ---
> 
 
> So the update_blocked_averages offload to idle_load_balancer in Vincent's 
> patch is less 
> effective in this case with small sched_migration_cost.
> 
> Looking at the code a bit more, we don't actually load balance every time in 
> this code path
> unless our avg_idle time exceeds some threshold.  Doing 
> update_blocked_averages immediately 

IIUC your problem: we call update_blocked_averages(), but because of:

if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
break;
}

the for_each_domain() loop stops even before running load_balance() at the
first sched_domain level, which means that update_blocked_averages() was
called unnecessarily.

And this is even more true with a small sysctl_sched_migration_cost, which
allows newly idle load balancing for a very small this_rq->avg_idle. One could
wonder why you set such a low value for sysctl_sched_migration_cost (lower
than the max_newidle_lb_cost of the smallest domain), but that's probably
because of task_hot().

If avg_idle is lower than the sd->max_newidle_lb_cost of the first
sched_domain, we should skip the spin_unlock/lock and the for_each_domain()
loop entirely.

Maybe something like below:


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 76e33a70d575..08933e0d87ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10723,17 +10723,21 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
 */
rq_unpin_lock(this_rq, rf);

+   rcu_read_lock();
+   sd = rcu_deref
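
A rough sketch of the bail-out described above, placed before the rq lock is
dropped in newidle_balance() (illustrative only; names follow the snippet
quoted earlier in this mail, this is not a tested patch):

	/* Bail out before update_blocked_averages() when even the cheapest
	 * newly idle balance would not be attempted anyway. */
	rcu_read_lock();
	sd = rcu_dereference_check_sched_domain(this_rq->sd);
	if (!READ_ONCE(this_rq->rd->overload) ||
	    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
		if (sd)
			update_next_balance(sd, &next_balance);
		rcu_read_unlock();
		goto out;
	}
	rcu_read_unlock();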

Re: [PATCH v2] sched/fair: reduce long-tail newly idle balance cost

2021-03-23 Thread Vincent Guittot
Hi Aubrey,

On Tue, 16 Mar 2021 at 05:27, Li, Aubrey  wrote:
>
> On 2021/2/24 16:15, Aubrey Li wrote:
> > A long-tail load balance cost is observed on the newly idle path,
> > this is caused by a race window between the first nr_running check
> > of the busiest runqueue and its nr_running recheck in detach_tasks.
> >
> > Before the busiest runqueue is locked, the tasks on the busiest
> > runqueue could be pulled by other CPUs and nr_running of the busiest
> > runqueue becomes 1 or even 0 if the running task becomes idle, this
> > causes detach_tasks breaks with LBF_ALL_PINNED flag set, and triggers
> > load_balance redo at the same sched_domain level.
> >
> > In order to find the new busiest sched_group and CPU, load balance will
> > recompute and update the various load statistics, which eventually leads
> > to the long-tail load balance cost.
> >
> > This patch clears LBF_ALL_PINNED flag for this race condition, and hence
> > reduces the long-tail cost of newly idle balance.
>
> Ping...

Reviewed-by: Vincent Guittot 

>
> >
> > Cc: Vincent Guittot 
> > Cc: Mel Gorman 
> > Cc: Andi Kleen 
> > Cc: Tim Chen 
> > Cc: Srinivas Pandruvada 
> > Cc: Rafael J. Wysocki 
> > Signed-off-by: Aubrey Li 
> > ---
> >  kernel/sched/fair.c | 9 +
> >  1 file changed, 9 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 04a3ce2..5c67804 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -7675,6 +7675,15 @@ static int detach_tasks(struct lb_env *env)
> >
> >   lockdep_assert_held(&env->src_rq->lock);
> >
> > + /*
> > +  * Source run queue has been emptied by another CPU, clear
> > +  * LBF_ALL_PINNED flag as we will not test any task.
> > +  */
> > + if (env->src_rq->nr_running <= 1) {
> > + env->flags &= ~LBF_ALL_PINNED;
> > + return 0;
> > + }
> > +
> >   if (env->imbalance <= 0)
> >   return 0;
> >
> >
>


Re: [PATCH] sched/fair: remove redundant test_idle_cores for non-smt

2021-03-22 Thread Vincent Guittot
On Sat, 20 Mar 2021 at 23:21, Barry Song  wrote:
>
> update_idle_core() is only done for the case of sched_smt_present.
> but test_idle_cores() is done for all machines even those without
> smt.
> this could contribute to an up to 8%+ hackbench performance loss on a
> machine like kunpeng 920 which has no smt. this patch removes the
> redundant test_idle_cores() for non-smt machines.
>
> we run the below hackbench with different -g parameter from 2 to
> 14, for each different g, we run the command 10 times and get the
> average time:
> $ numactl -N 0 hackbench -p -T -l 20000 -g $1
>
> hackbench will report the time which is needed to complete a certain
> number of messages transmissions between a certain number of tasks,
> for example:
> $ numactl -N 0 hackbench -p -T -l 20000 -g 10
> Running in threaded mode with 10 groups using 40 file descriptors each
> (== 400 tasks)
> Each sender will pass 20000 messages of 100 bytes
>
> The below is the result of hackbench w/ and w/o this patch:
> g=       2       4       6       8       10      12      14
> w/o:     1.8151  3.8499  5.5142  7.2491  9.0340  10.7345 12.0929
> w/ :     1.8428  3.7436  5.4501  6.9522  8.2882   9.9535 11.3367
>                                  +4.1%   +8.3%   +7.3%   +6.3%
>
> Signed-off-by: Barry Song 

Reviewed-by: Vincent Guittot 

> ---
>  kernel/sched/fair.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 2e2ab1e..de42a32 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6038,9 +6038,11 @@ static inline bool test_idle_cores(int cpu, bool def)
>  {
> struct sched_domain_shared *sds;
>
> -   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> -   if (sds)
> -   return READ_ONCE(sds->has_idle_cores);
> +   if (static_branch_likely(_smt_present)) {
> +   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
> +   if (sds)
> +   return READ_ONCE(sds->has_idle_cores);
> +   }
>
> return def;
>  }
> --
> 1.8.3.1
>


Re: [PATCH v3 6/7] sched/fair: Filter out locally-unsolvable misfit imbalances

2021-03-19 Thread Vincent Guittot
On Mon, 15 Mar 2021 at 20:18, Valentin Schneider
 wrote:
>
> On 15/03/21 16:13, Vincent Guittot wrote:
> > On Thu, 11 Mar 2021 at 13:05, Valentin Schneider
> >  wrote:
> >>
> >> Consider the following (hypothetical) asymmetric CPU capacity topology,
> >> with some amount of capacity pressure (RT | DL | IRQ | thermal):
> >>
> >>   DIE [  ]
> >>   MC  [][]
> >>0  1  2  3
> >>
> >>   | CPU | capacity_orig | capacity |
> >>   |-+---+--|
> >>   |   0 |   870 |  860 |
> >>   |   1 |   870 |  600 |
> >>   |   2 |  1024 |  850 |
> >>   |   3 |  1024 |  860 |
> >>
> >> If CPU1 has a misfit task, then CPU0, CPU2 and CPU3 are valid candidates to
> >> grant the task an uplift in CPU capacity. Consider CPU0 and CPU3 as
> >> sufficiently busy, i.e. don't have enough spare capacity to accommodate
> >> CPU1's misfit task. This would then fall on CPU2 to pull the task.
> >>
> >> This currently won't happen, because CPU2 will fail
> >>
> >>   capacity_greater(capacity_of(CPU2), sg->sgc->max_capacity)
> >
> > which has been introduced by the previous patch: patch5
> >
> >>
> >> in update_sd_pick_busiest(), where 'sg' is the [0, 1] group at DIE
> >> level. In this case, the max_capacity is that of CPU0's, which is at this
> >> point in time greater than that of CPU2's. This comparison doesn't make
> >> much sense, given that the only CPUs we should care about in this scenario
> >> are CPU1 (the CPU with the misfit task) and CPU2 (the load-balance
> >> destination CPU).
> >>
> >> Aggregate a misfit task's load into sgs->group_misfit_task_load only if
> >> env->dst_cpu would grant it a capacity uplift. Separately track whether a
> >> sched_group contains a misfit task to still classify it as
> >> group_misfit_task and not pick it as busiest group when pulling from a
> >
> > Could you give more details about why we should keep tracking the
> > group as misfit? Do you have a use case in mind?
> >
>
> As stated the current behaviour is to classify groups as group_misfit_task
> regardless of the dst_cpu's capacity. When we see a group_misfit_task
> candidate group misfit task with higher per-CPU capacity than the local
> group, we don't pick it as busiest.
>
> I initially thought not marking those as group_misfit_task was the right
> thing to do, as they could then be classified as group_fully_busy or
> group_has_spare. Consider:
>
>   DIE [  ]
>   MC  [][]
>0  1  2  3
>L  L  B  B
>
>   arch_scale_capacity(L) < arch_scale_capacity(B)
>
>   CPUs 0-1 are idle / lightly loaded
>   CPU2 has a misfit task and a few very small tasks
>   CPU3 has a few very small tasks
>
> When CPU0 is running load_balance() at DIE level, right now we'll classify
> the [2-3] group as group_misfit_task and not pick it as busiest because the
> local group has a lower CPU capacity.
>
> If we didn't do that, we could leave the misfit task alone and pull some
> small task(s) from CPU2 or CPU3, which would be a good thing to

Are you sure? The last check in update_sd_pick_busiest() should already
filter this, so it should be enough to let the group be classified correctly.

A group should be classified as group_misfit_task when it has a task to
migrate in priority compared to some other groups. In your case, you tag it
as group_misfit_task but with the opposite goal, i.e. to make sure it is not
selected. As mentioned above, this will be filtered by the last check in
update_sd_pick_busiest().
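
For context, the "last check" referred to here appears to be the capacity test
near the end of update_sd_pick_busiest(), as quoted elsewhere in these threads:

	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
	    (sgs->group_type <= group_fully_busy) &&
	    (group_smaller_min_cpu_capacity(sds->local, sg)))
		return false;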

> do. However, by allowing a group containing a misfit task to be picked as
> the busiest group when a CPU of lower capacity is pulling, we run the risk
> of the misfit task itself being downmigrated - e.g. if we repeatedly
> increment the sd->nr_balance_failed counter and do an active balance (maybe
> because the small tasks were unfortunately cache_hot()).
>
> It's less than ideal, but I considered not downmigrating misfit tasks was
> the thing to prioritize (and FWIW it also maintains current behaviour).
>
>
> Another approach would be to add task utilization vs CPU capacity checks in
> detach_tasks() and need_active_balance() to prevent downmigration when
> env->imbalance_type < group_misfit_task. This may go against the busiest
> group selection heuristics however (misfit tasks could be the main
> contributors to the imbalance, but we end up not moving them).
>
>
> >> lower-capacity CPU (which is the current behaviour an

Re: [PATCH v3 6/7] sched/fair: Filter out locally-unsolvable misfit imbalances

2021-03-15 Thread Vincent Guittot
On Thu, 11 Mar 2021 at 13:05, Valentin Schneider
 wrote:
>
> Consider the following (hypothetical) asymmetric CPU capacity topology,
> with some amount of capacity pressure (RT | DL | IRQ | thermal):
>
>   DIE [  ]
>   MC  [][]
>0  1  2  3
>
>   | CPU | capacity_orig | capacity |
>   |-+---+--|
>   |   0 |   870 |  860 |
>   |   1 |   870 |  600 |
>   |   2 |  1024 |  850 |
>   |   3 |  1024 |  860 |
>
> If CPU1 has a misfit task, then CPU0, CPU2 and CPU3 are valid candidates to
> grant the task an uplift in CPU capacity. Consider CPU0 and CPU3 as
> sufficiently busy, i.e. don't have enough spare capacity to accommodate
> CPU1's misfit task. This would then fall on CPU2 to pull the task.
>
> This currently won't happen, because CPU2 will fail
>
>   capacity_greater(capacity_of(CPU2), sg->sgc->max_capacity)

which has been introduced by the previous patch: patch5

>
> in update_sd_pick_busiest(), where 'sg' is the [0, 1] group at DIE
> level. In this case, the max_capacity is that of CPU0's, which is at this
> point in time greater than that of CPU2's. This comparison doesn't make
> much sense, given that the only CPUs we should care about in this scenario
> are CPU1 (the CPU with the misfit task) and CPU2 (the load-balance
> destination CPU).
>
> Aggregate a misfit task's load into sgs->group_misfit_task_load only if
> env->dst_cpu would grant it a capacity uplift. Separately track whether a
> sched_group contains a misfit task to still classify it as
> group_misfit_task and not pick it as busiest group when pulling from a

Could you give more details about why we should keep tracking the
group as misfit? Do you have a use case in mind?

> lower-capacity CPU (which is the current behaviour and prevents
> down-migration).
>
> Since find_busiest_queue() can now iterate over CPUs with a higher capacity
> than the local CPU's, add a capacity check there.
>
> Reviewed-by: Qais Yousef 
> Signed-off-by: Valentin Schneider 
> ---
>  kernel/sched/fair.c | 39 ++-
>  1 file changed, 30 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1e8a242cd1f7..41cdda7a8ea6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5759,6 +5759,12 @@ static unsigned long capacity_of(int cpu)
> return cpu_rq(cpu)->cpu_capacity;
>  }
>
> +/* Is CPU a's capacity noticeably greater than CPU b's? */
> +static inline bool cpu_capacity_greater(int a, int b)
> +{
> +   return capacity_greater(capacity_of(a), capacity_of(b));
> +}
> +
>  static void record_wakee(struct task_struct *p)
>  {
> /*
> @@ -8091,7 +8097,8 @@ struct sg_lb_stats {
> unsigned int group_weight;
> enum group_type group_type;
> unsigned int group_asym_packing; /* Tasks should be moved to 
> preferred CPU */
> -   unsigned long group_misfit_task_load; /* A CPU has a task too big for 
> its capacity */
> +   unsigned long group_misfit_task_load; /* Task load that can be 
> uplifted */
> +   int   group_has_misfit_task; /* A CPU has a task too big for 
> its capacity */
>  #ifdef CONFIG_NUMA_BALANCING
> unsigned int nr_numa_running;
> unsigned int nr_preferred_running;
> @@ -8364,7 +8371,7 @@ group_type group_classify(unsigned int imbalance_pct,
> if (sgs->group_asym_packing)
> return group_asym_packing;
>
> -   if (sgs->group_misfit_task_load)
> +   if (sgs->group_has_misfit_task)
> return group_misfit_task;
>
> if (!group_has_capacity(imbalance_pct, sgs))
> @@ -8447,10 +8454,21 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
> continue;
>
> /* Check for a misfit task on the cpu */
> -   if (sd_has_asym_cpucapacity(env->sd) &&
> -   sgs->group_misfit_task_load < rq->misfit_task_load) {
> -   sgs->group_misfit_task_load = rq->misfit_task_load;
> -   *sg_status |= SG_OVERLOAD;
> +   if (!sd_has_asym_cpucapacity(env->sd) ||
> +   !rq->misfit_task_load)
> +   continue;
> +
> +   *sg_status |= SG_OVERLOAD;
> +   sgs->group_has_misfit_task = true;
> +
> +   /*
> +* Don't attempt to maximize load for misfit tasks that can't 
> be
> +* granted a CPU capacity uplift.
> +*/
> +   if (cpu_capacity_greater(env->dst_cpu, i)) {
> +   sgs->group_misfit_task_load = max(
> +   sgs->group_misfit_task_load,
> +   rq->misfit_task_load);

Please encapsulate all this misfit-specific code in a dedicated function
that will be called from update_sg_lb_stats().
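
A minimal sketch of the kind of helper being asked for, using the names this
series introduces (sd_has_asym_cpucapacity(), cpu_capacity_greater(),
group_has_misfit_task); illustrative only, not part of the patch:

	static inline void update_sg_misfit_stats(struct lb_env *env, struct rq *rq,
						  int cpu, struct sg_lb_stats *sgs,
						  int *sg_status)
	{
		if (!sd_has_asym_cpucapacity(env->sd) || !rq->misfit_task_load)
			return;

		*sg_status |= SG_OVERLOAD;
		sgs->group_has_misfit_task = true;

		/* Only account load that env->dst_cpu could actually uplift. */
		if (cpu_capacity_greater(env->dst_cpu, cpu))
			sgs->group_misfit_task_load = max(sgs->group_misfit_task_load,
							  rq->misfit_task_load);
	}

update_sg_lb_stats() would then simply call update_sg_misfit_stats(env, rq, i,
sgs, sg_status) from its per-CPU loop.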

> }
> }
>
> @@ -8501,7 +8519,7 @@ static bool 

Re: [PATCH v3 4/7] sched/fair: Introduce a CPU capacity comparison helper

2021-03-15 Thread Vincent Guittot
On Thu, 11 Mar 2021 at 13:05, Valentin Schneider
 wrote:
>
> During load-balance, groups classified as group_misfit_task are filtered
> out if they do not pass
>
>   group_smaller_max_cpu_capacity(<candidate group>, <local group>);
>
> which itself employs fits_capacity() to compare the sgc->max_capacity of
> both groups.
>
> Due to the underlying margin, fits_capacity(X, 1024) will return false for
> any X > 819. Tough luck, the capacity_orig's on e.g. the Pixel 4 are
> {261, 871, 1024}. If a CPU-bound task ends up on one of those "medium"
> CPUs, misfit migration will never intentionally upmigrate it to a CPU of
> higher capacity due to the aforementioned margin.
>
> One may argue the 20% margin of fits_capacity() is excessive in the advent
> of counter-enhanced load tracking (APERF/MPERF, AMUs), but one point here
> is that fits_capacity() is meant to compare a utilization value to a
> capacity value, whereas here it is being used to compare two capacity
> values. As CPU capacity and task utilization have different dynamics, a
> sensible approach here would be to add a new helper dedicated to comparing
> CPU capacities.
>
> Reviewed-by: Qais Yousef 
> Signed-off-by: Valentin Schneider 
> ---
>  kernel/sched/fair.c | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index db892f6e222f..ddb2ab3edf6d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -113,6 +113,13 @@ int __weak arch_asym_cpu_priority(int cpu)
>   */
>  #define fits_capacity(cap, max)((cap) * 1280 < (max) * 1024)
>
> +/*
> + * The margin used when comparing CPU capacities.
> + * is 'cap1' noticeably greater than 'cap2'
> + *
> + * (default: ~5%)
> + */
> +#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)

Defined but not used.

It should be merged with the next patch, which starts to use it.

>  #endif
>
>  #ifdef CONFIG_CFS_BANDWIDTH
> --
> 2.25.1
>


Re: [PATCH v3 3/7] sched/fair: Add more sched_asym_cpucapacity static branch checks

2021-03-15 Thread Vincent Guittot
On Thu, 11 Mar 2021 at 13:05, Valentin Schneider
 wrote:
>
> Rik noted a while back that a handful of
>
>   sd->flags & SD_ASYM_CPUCAPACITY
>
> & family in the CFS load-balancer code aren't guarded by the
> sched_asym_cpucapacity static branch.

Guarding asym capacity with a static branch in the fast path makes sense,
but I see no benefit in this slow path other than hiding and complexifying
the code. Also, if you start this way, then you have to add a nop in all the
other places where the flag or a group_type might be unused.

>
> Turning those checks into NOPs for those who don't need it is fairly
> straightforward, and hiding it in a helper doesn't change code size in all
> but one spot. It also gives us a place to document the differences between
> checking the static key and checking the SD flag.
>
> Suggested-by: Rik van Riel 
> Reviewed-by: Qais Yousef 
> Signed-off-by: Valentin Schneider 
> ---
>  kernel/sched/fair.c  | 21 -
>  kernel/sched/sched.h | 33 +
>  2 files changed, 41 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f50a902bdf24..db892f6e222f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6300,15 +6300,8 @@ static int select_idle_sibling(struct task_struct *p, 
> int prev, int target)
>  * sd_asym_cpucapacity rather than sd_llc.
>  */
> if (static_branch_unlikely(&sched_asym_cpucapacity)) {
> +   /* See sd_has_asym_cpucapacity() */
> sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
> -   /*
> -* On an asymmetric CPU capacity system where an exclusive
> -* cpuset defines a symmetric island (i.e. one unique
> -* capacity_orig value through the cpuset), the key will be 
> set
> -* but the CPUs within that cpuset will not have a domain with
> -* SD_ASYM_CPUCAPACITY. These should follow the usual 
> symmetric
> -* capacity path.
> -*/
> if (sd) {
> i = select_idle_capacity(p, sd, target);
> return ((unsigned)i < nr_cpumask_bits) ? i : target;
> @@ -8467,7 +8460,7 @@ static inline void update_sg_lb_stats(struct lb_env 
> *env,
> continue;
>
> /* Check for a misfit task on the cpu */
> -   if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> +   if (sd_has_asym_cpucapacity(env->sd) &&
> sgs->group_misfit_task_load < rq->misfit_task_load) {
> sgs->group_misfit_task_load = rq->misfit_task_load;
> *sg_status |= SG_OVERLOAD;
> @@ -8524,7 +8517,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  * CPUs in the group should either be possible to resolve
>  * internally or be covered by avg_load imbalance (eventually).
>  */
> -   if (sgs->group_type == group_misfit_task &&
> +   if (static_branch_unlikely(&sched_asym_cpucapacity) &&
> +   sgs->group_type == group_misfit_task &&
> (!group_smaller_max_cpu_capacity(sg, sds->local) ||
>  sds->local_stat.group_type != group_has_spare))
> return false;
> @@ -8607,7 +8601,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
>  * throughput. Maximize throughput, power/energy consequences are not
>  * considered.
>  */
> -   if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
> +   if (sd_has_asym_cpucapacity(env->sd) &&
> (sgs->group_type <= group_fully_busy) &&
> (group_smaller_min_cpu_capacity(sds->local, sg)))
> return false;
> @@ -8730,7 +8724,7 @@ static inline void update_sg_wakeup_stats(struct 
> sched_domain *sd,
> }
>
> /* Check if task fits in the group */
> -   if (sd->flags & SD_ASYM_CPUCAPACITY &&
> +   if (sd_has_asym_cpucapacity(sd) &&
> !task_fits_capacity(p, group->sgc->max_capacity)) {
> sgs->group_misfit_task_load = 1;
> }
> @@ -9408,7 +9402,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
>  * Higher per-CPU capacity is considered better than balancing
>  * average load.
>  */
> -   if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> +   if (sd_has_asym_cpucapacity(env->sd) &&
> capacity_of(env->dst_cpu) < capacity &&
> nr_running == 1)
> continue;
> @@ -10225,6 +10219,7 @@ static void nohz_balancer_kick(struct rq *rq)
> }
> }
>
> +/* See sd_has_asym_cpucapacity(). */
> sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
> if (sd) {
> /*
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index d2e09a647c4f..27bf70bc86c7 100644
> --- 

Re: [PATCH] sched/fair: Prefer idle CPU to cache affinity

2021-03-10 Thread Vincent Guittot
On Wed, 10 Mar 2021 at 06:53, Srikar Dronamraju
 wrote:
>
> * Vincent Guittot  [2021-03-08 14:52:39]:
>
> > On Fri, 26 Feb 2021 at 17:41, Srikar Dronamraju
> >  wrote:
> > >
>
> Thanks Vincent for your review comments.
>
> > > +static int prefer_idler_llc(int this_cpu, int prev_cpu, int sync)
> > > +{
> > > +   struct sched_domain_shared *tsds, *psds;
> > > +   int pnr_busy, pllc_size, tnr_busy, tllc_size, diff;
> > > +
> > > +   tsds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
> > > +   tnr_busy = atomic_read(&tsds->nr_busy_cpus);
> > > +   tllc_size = per_cpu(sd_llc_size, this_cpu);
> > > +
> > > +   psds = rcu_dereference(per_cpu(sd_llc_shared, prev_cpu));
> > > +   pnr_busy = atomic_read(&psds->nr_busy_cpus);
> > > +   pllc_size = per_cpu(sd_llc_size, prev_cpu);
> > > +
> > > +   /* No need to compare, if both LLCs are fully loaded */
> > > +   if (pnr_busy == pllc_size && tnr_busy == pllc_size)
> > > +   return nr_cpumask_bits;
> > > +
> > > +   if (sched_feat(WA_WAKER) && tnr_busy < tllc_size)
> > > +   return this_cpu;
> >
> > Why have you chosen to favor this_cpu instead of prev_cpu, unlike for
> > wake_affine_idle()?
>
> At this point, we know the waker running on this_cpu and wakee which was
> running on prev_cpu are affine to each other and this_cpu and prev_cpu dont
> share cache. I chose to move them close to each other to benefit from the
> cache sharing. Based on feedback from Peter and Rik, I made the check more
> conservative i.e tnr_busy <= tllc_size/smt_weight (where smt_weight is the
> cpumask weight of smt domain for this_cpu) i.e if we have a free core in

Yeah, that makes sense.

> this llc domain, chose this_cpu.  select_idle_sibling() should pick an idle
> cpu/core/smt within the llc domain for this_cpu.
>
> Do you feel, this may not be the correct option?

I was worried that we would end up pulling tasks into the same LLC, but the
condition above and wake_wide() should prevent such behavior.

>
> We are also experimenting with another option, were we call prefer_idler_cpu
> after wa_weight. I.e
> 1. if wake_affine_weight choses this_cpu but llc in prev_cpu has an idle
> smt/CPU but there are no idle smt/CPU in this_cpu, then chose idle smt/CPU
> in prev_cpu
> 2. if wake_affine_weight choses nr_cpumask(aka prev_cpu) but llc in this_cpu
> has an idle smt/CPU but there are no idle smt/CPU in prev_cpu, then chose
> idle smt/CPU in this_cpu
>
>
> > > +
> > > +   /* For better wakeup latency, prefer idler LLC to cache affinity 
> > > */
> > > +   diff = tnr_busy * pllc_size - sync - pnr_busy * tllc_size;
> > > +   if (!diff)
> > > +   return nr_cpumask_bits;
> > > +   if (diff < 0)
> > > +   return this_cpu;
> > > +
> > > +   return prev_cpu;
> > > +}
> > > +
> > >  static int wake_affine(struct sched_domain *sd, struct task_struct *p,
> > >int this_cpu, int prev_cpu, int sync)
> > >  {
> > > @@ -5877,6 +5907,10 @@ static int wake_affine(struct sched_domain *sd, 
> > > struct task_struct *p,
> > > if (sched_feat(WA_IDLE))
> > > target = wake_affine_idle(this_cpu, prev_cpu, sync);
> > >
> > > +   if (sched_feat(WA_IDLER_LLC) && target == nr_cpumask_bits &&
> > > +   !cpus_share_cache(this_cpu, prev_cpu))
> > > +   target = prefer_idler_llc(this_cpu, prev_cpu, sync);
> >
> > Could you use the same naming convention as the other functions?
> > wake_affine_llc, as an example
>
> I guess you meant s/prefer_idler_llc/wake_affine_llc/

yes

> Sure. I can modify.
>
> >
> > > +
> > > if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
> > > target = wake_affine_weight(sd, p, this_cpu, prev_cpu, 
> > > sync);
> > >
> > > @@ -5884,8 +5918,11 @@ static int wake_affine(struct sched_domain *sd, 
> > > struct task_struct *p,
> > > if (target == nr_cpumask_bits)
> > > return prev_cpu;
> > >
> > > -   schedstat_inc(sd->ttwu_move_affine);
> > > -   schedstat_inc(p->se.statistics.nr_wakeups_affine);
> > > +   if (target == this_cpu) {
> >
> > How is this condition related to $subject ?
>
> Before this change, wake_affine_weight and wake_affine_idle would either
> return this_cpu or nr_cpumask_bits. Just before this check, we check if
> target is nr_cpumask_bits and return prev_cpu. So the stats were only
> incremented when target was this_cpu.
>
> However with prefer_idler_llc, we may return this_cpu, prev_cpu or
> nr_cpumask_bits. Now we only to update stats when we have chosen to migrate
> the task to this_cpu. Hence I had this check.

OK, got it.

Maybe return earlier in this case, like for the if (target == nr_cpumask_bits)
check above.
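
A sketch of the early return being suggested, reshuffling the tail of
wake_affine() from the quoted hunk (illustrative only):

	if (target == nr_cpumask_bits)
		return prev_cpu;

	/* prefer_idler_llc() may also pick prev_cpu: return early so the
	 * "affine move" stats below stay specific to the this_cpu case. */
	if (target == prev_cpu)
		return prev_cpu;

	schedstat_inc(sd->ttwu_move_affine);
	schedstat_inc(p->se.statistics.nr_wakeups_affine);
	return target;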

>
> If we use the slightly lazier approach which is check for wa_weight first
> before wa_idler_llc, then we may not need this change at all.
>
> --
> Thanks and Regards
> Srikar Dronamraju


Re: [PATCH V5 2/2] cpufreq: CPPC: Add support for frequency invariance

2021-03-09 Thread Vincent Guittot
if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
> > + return;
> > +
> > + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, 
> > cpu_present_mask);
> > +
> > + for_each_possible_cpu(i) {
> > + cppc_fi = &per_cpu(cppc_freq_inv, i);
> > + irq_work_sync(&cppc_fi->irq_work);
> > + }
> > +
> > + kthread_destroy_worker(kworker_fie);
> > + kworker_fie = NULL;
> > +}
> > +
> > +static void __init cppc_freq_invariance_init(void)
> > +{
> > + struct cppc_perf_fb_ctrs fb_ctrs = {0};
> > + struct cppc_freq_invariance *cppc_fi;
> > + struct sched_attr attr = {
> > + .size   = sizeof(struct sched_attr),
> > + .sched_policy   = SCHED_DEADLINE,
> > + .sched_nice = 0,
> > + .sched_priority = 0,
> > + /*
> > +  * Fake (unused) bandwidth; workaround to "fix"
> > +  * priority inheritance.
> > +  */
> > + .sched_runtime  = 1000000,
> > + .sched_deadline = 10000000,
> > + .sched_period   = 10000000,
> > + };
> > + int i, ret;
> > +
> > + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
> > + return;
> > +
> > + kworker_fie = kthread_create_worker(0, "cppc_fie");
> > + if (IS_ERR(kworker_fie))
> > + return;
> > +
> > + ret = sched_setattr_nocheck(kworker_fie->task, &attr);
> > + if (ret) {
> > + pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__,
> > + ret);
> > + kthread_destroy_worker(kworker_fie);
> > + return;
> > + }
> > +
>
> Nit: to me it makes more sense to move the code below to
> cppc_freq_invariance_policy_init(). It seems a bit strange to do part of
> the initialization of the per-cpu information there, and part here. But
> I do understand the reasons for it. Moving the code below would also
> save some cycles going through the CPUs again and will mimic the
> frequency invariance setup process in the arm64 topology, where we do
> amu_fie_setup() at policy creation time.
>
> It's not a big deal so I'll leave it up to you.
>
> > + for_each_possible_cpu(i) {
> > + cppc_fi = &per_cpu(cppc_freq_inv, i);
> > +
> > + /* A policy failed to initialize, abort */
> > + if (unlikely(!cppc_fi->cpu_data))
> > + return cppc_freq_invariance_exit();
> > +
> > + ret = cppc_get_perf_ctrs(i, &fb_ctrs);
> > + if (ret) {
> > + pr_warn("%s: failed to read perf counters: %d\n",
> > + __func__, ret);
> > + return cppc_freq_invariance_exit();
> > + }
> > +
> > + cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
> > + }
> > +
> > + /* Register for freq-invariance */
> > + topology_set_scale_freq_source(&cppc_sftd, cpu_present_mask);
> > +}
>
> After another very quick round of testing:
>
> Reviewed-by: Ionela Voinescu 
> Tested-by: Ionela Voinescu 
>
> I did not get the chance to test on ThunderX2 yet, but if you are happy
> with your testing on it, I won't delay this any further.

I have just run some functional tests on ThunderX2 with rt-app: I ran a
periodic task (6ms running / 30ms period) at different frequencies (2.5GHz,
2GHz, 1.5GHz, 1.333GHz, 1GHz) and the PELT signals stay the same at all
frequencies.

Tested-by: Vincent Guittot 


>
> Thanks,
> Ionela.


Re: [PATCH] sched/fair: Prefer idle CPU to cache affinity

2021-03-08 Thread Vincent Guittot
On Fri, 26 Feb 2021 at 17:41, Srikar Dronamraju
 wrote:
>
> On POWER8 and POWER9, the last level cache (L2) has been at the level of
> a group of 8 threads (SMT8 on POWER8, a big-core comprising of a pair of
> SMT4 cores on POWER9). However, on POWER10, the LLC domain is at the
> level of a group of SMT4 threads within the SMT8 core. Due to the
> shrinking in the size of the LLC domain, the probability of finding an
> idle CPU in the LLC domain of the target is lesser on POWER10 compared
> to the previous generation processors.
>
> With commit 9538abee18cc ("powerpc/smp: Add support detecting
> thread-groups sharing L2 cache") benchmarks such as Daytrader
> (https://github.com/WASdev/sample.daytrader7) show a drop in throughput
> in a configuration consisting of 1 JVM spanning across 6-8 Bigcores on
> POWER10.  Analysis showed that this was because more number of wakeups
> were happening on busy CPUs when the utilization was 60-70%. This drop
> in throughput also shows up as a drop in CPU utilization. However most
> other benchmarks benefit with detecting the thread-groups that share L2
> cache.
>
> Current order of preference to pick a LLC while waking a wake-affine
> task:
> 1. Between the waker CPU and previous CPU, prefer the LLC of the CPU
>that is idle.
>
> 2. Between the waker CPU and previous CPU, prefer the LLC of the CPU
>that is less lightly loaded.
>
> In the current situation where waker and previous CPUs are busy, but
> only one of its LLC has an idle CPU, Scheduler may end up picking a LLC
> with no idle CPUs. To mitigate this, add a new step between 1 and 2
> where Scheduler compares idle CPUs in waker and previous LLCs and picks
> the appropriate one.
>
> The other alternative is to search for an idle CPU in the other LLC, if
> the current select_idle_sibling is unable to find an idle CPU in the
> preferred LLC. But that may increase the time to select a CPU.
>
>
>                                     5.11-rc6      5.11-rc6+revert    5.11-rc6+patch
> 8CORE/1JVM  80USERS   throughput    6651.6        6716.3 (0.97%)     6940 (4.34%)
>                       sys/user:time 59.75/23.86   61.77/24.55        60/24
>
> 8CORE/2JVM  80USERS   throughput    6425.4        6446.8 (0.33%)     6473.2 (0.74%)
>                       sys/user:time 70.59/24.25   72.28/23.77        70/24
>
> 8CORE/4JVM  80USERS   throughput    5355.3        5551.2 (3.66%)     5586.6 (4.32%)
>                       sys/user:time 76.74/21.79   76.54/22.73        76/22
>
> 8CORE/8JVM  80USERS   throughput    4420.6        4553.3 (3.00%)     4405.8 (-0.33%)
>                       sys/user:time 79.13/20.32   78.76/21.01        79/20
>
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Michael Neuling 
> Cc: Gautham R Shenoy 
> Cc: Parth Shah 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Dietmar Eggemann 
> Cc: Mel Gorman 
> Cc: Vincent Guittot 
> Co-developed-by: Gautham R Shenoy 
> Signed-off-by: Gautham R Shenoy 
> Co-developed-by: Parth Shah 
> Signed-off-by: Parth Shah 
> Signed-off-by: Srikar Dronamraju 
> ---
>  kernel/sched/fair.c | 41 +++--
>  kernel/sched/features.h |  2 ++
>  2 files changed, 41 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8a8bd7b13634..d49bfcdc4a19 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5869,6 +5869,36 @@ wake_affine_weight(struct sched_domain *sd, struct 
> task_struct *p,
> return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
>  }
>
> +static int prefer_idler_llc(int this_cpu, int prev_cpu, int sync)
> +{
> +   struct sched_domain_shared *tsds, *psds;
> +   int pnr_busy, pllc_size, tnr_busy, tllc_size, diff;
> +
> +   tsds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
> > +   tnr_busy = atomic_read(&tsds->nr_busy_cpus);
> +   tllc_size = per_cpu(sd_llc_size, this_cpu);
> +
> +   psds = rcu_dereference(per_cpu(sd_llc_shared, prev_cpu));
> > +   pnr_busy = atomic_read(&psds->nr_busy_cpus);
> +   pllc_size = per_cpu(sd_llc_size, prev_cpu);
> +
> +   /* No need to compare, if both LLCs are fully loaded */
> +   if (pnr_busy == pllc_size && tnr_busy == pllc_size)
> +   return nr_cpumask_bits;
> +
> +   if (sched_feat(WA_WAKER) && tnr_busy < tllc_size)
> +   return this_cpu;

Why have you chosen to favor this_cpu instead of prev_cpu, unlike for wake_affine_idle()?

> +
> +   /* For better wakeup latency, prefer idler LLC to cache affinity */
> +   diff = tnr_busy * pllc_size - sync - pnr_busy * tllc_si
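
To make the busy-fraction comparison above concrete, a hypothetical example
(numbers invented purely for illustration):

	/* This LLC: 2 of 4 CPUs busy.  Previous LLC: 3 of 4 CPUs busy. */
	int tnr_busy = 2, tllc_size = 4, pnr_busy = 3, pllc_size = 4, sync = 0;
	int diff = tnr_busy * pllc_size - sync - pnr_busy * tllc_size; /* 8 - 0 - 12 = -4 */
	/* diff < 0: the waker's LLC is proportionally idler, so this_cpu is returned. */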

Re: [RFC PATCH v8] sched/fair: select idle cpu from idle cpumask for task wakeup

2021-03-08 Thread Vincent Guittot
Hi Aubrey,

On Thu, 4 Mar 2021 at 14:51, Li, Aubrey  wrote:
>
> Hi Peter,
>
> On 2020/12/11 23:07, Vincent Guittot wrote:
> > On Thu, 10 Dec 2020 at 02:44, Aubrey Li  wrote:
> >>
> >> Add idle cpumask to track idle cpus in sched domain. Every time
> >> a CPU enters idle, the CPU is set in idle cpumask to be a wakeup
> >> target. And if the CPU is not in idle, the CPU is cleared in idle
> >> cpumask during scheduler tick to ratelimit idle cpumask update.
> >>
> >> When a task wakes up to select an idle cpu, scanning idle cpumask
> >> has lower cost than scanning all the cpus in last level cache domain,
> >> especially when the system is heavily loaded.
> >>
> >> Benchmarks including hackbench, schbench, uperf, sysbench mysql and
> >> kbuild have been tested on a x86 4 socket system with 24 cores per
> >> socket and 2 hyperthreads per core, total 192 CPUs, no regression
> >> found.
> >>
> snip
> >>
> >> Cc: Peter Zijlstra 
> >> Cc: Mel Gorman 
> >> Cc: Vincent Guittot 
> >> Cc: Qais Yousef 
> >> Cc: Valentin Schneider 
> >> Cc: Jiang Biao 
> >> Cc: Tim Chen 
> >> Signed-off-by: Aubrey Li 
> >
> > This version looks good to me. I don't see regressions of v5 anymore
> > and see some improvements on heavy cases
> >
> > Reviewed-by: Vincent Guittot 
>
> May I know your thoughts about this patch?
> Is it cpumask operation potentially too expensive to be here?

Could you rebase your patch? It doesn't apply anymore on tip/sched/core
because of recent changes.

>
> Thanks,
> -Aubrey
> >
> >> ---
> >>  include/linux/sched/topology.h | 13 ++
> >>  kernel/sched/core.c|  2 ++
> >>  kernel/sched/fair.c| 45 +-
> >>  kernel/sched/idle.c|  5 
> >>  kernel/sched/sched.h   |  4 +++
> >>  kernel/sched/topology.c|  3 ++-
> >>  6 files changed, 70 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/linux/sched/topology.h 
> >> b/include/linux/sched/topology.h
> >> index 820511289857..b47b85163607 100644
> >> --- a/include/linux/sched/topology.h
> >> +++ b/include/linux/sched/topology.h
> >> @@ -65,8 +65,21 @@ struct sched_domain_shared {
> >> atomic_tref;
> >> atomic_tnr_busy_cpus;
> >> int has_idle_cores;
> >> +   /*
> >> +* Span of all idle CPUs in this domain.
> >> +*
> >> +* NOTE: this field is variable length. (Allocated dynamically
> >> +* by attaching extra space to the end of the structure,
> >> +* depending on how many CPUs the kernel has booted up with)
> >> +*/
> >> +   unsigned long   idle_cpus_span[];
> >>  };
> >>
> >> +static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared 
> >> *sds)
> >> +{
> >> +   return to_cpumask(sds->idle_cpus_span);
> >> +}
> >> +
> >>  struct sched_domain {
> >> /* These fields must be setup */
> >> struct sched_domain __rcu *parent;  /* top domain must be null 
> >> terminated */
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index c4da7e17b906..b136e2440ea4 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -4011,6 +4011,7 @@ void scheduler_tick(void)
> >>
> >>  #ifdef CONFIG_SMP
> >> rq->idle_balance = idle_cpu(cpu);
> >> +   update_idle_cpumask(cpu, rq->idle_balance);
> >> trigger_load_balance(rq);
> >>  #endif
> >>  }
> >> @@ -7186,6 +7187,7 @@ void __init sched_init(void)
> >> rq->idle_stamp = 0;
> >> rq->avg_idle = 2*sysctl_sched_migration_cost;
> >> rq->max_idle_balance_cost = sysctl_sched_migration_cost;
> >> +   rq->last_idle_state = 1;
> >>
> >> INIT_LIST_HEAD(>cfs_tasks);
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index c0c4d9ad7da8..25f36ecfee54 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -6146,7 +6146,12 @@ static int select_idle_cpu(struct task_struct *p, 
> >> struct sched_domain *sd, int t
> >>
> >> time = cpu_

Re: [RFC PATCH v4 2/3] scheduler: add scheduler level for clusters

2021-03-08 Thread Vincent Guittot
On Tue, 2 Mar 2021 at 00:08, Barry Song  wrote:
>
> ARM64 chip Kunpeng 920 has 6 or 8 clusters in each NUMA node, and each
> cluster has 4 cpus. All clusters share L3 cache data, but each cluster
> has local L3 tag. On the other hand, each clusters will share some
> internal system bus. This means cache coherence overhead inside one
> cluster is much less than the overhead across clusters.
>
> This patch adds the sched_domain for clusters. On kunpeng 920, without
> this patch, domain0 of cpu0 would be MC with cpu0~cpu23 with ; with this
> patch, MC becomes domain1, a new domain0 "CLS" including cpu0-cpu3.
>
> This will help spread unrelated tasks among clusters, thus decrease the
> contention and improve the throughput, for example, stream benchmark can
> improve around 4.3%~6.3% by this patch:
>
> w/o patch:
> numactl -N 0 /usr/lib/lmbench/bin/stream -P 12 -M 1024M -N 5
> STREAM copy latency: 3.36 nanoseconds
> STREAM copy bandwidth: 57072.50 MB/sec
> STREAM scale latency: 3.40 nanoseconds
> STREAM scale bandwidth: 56542.52 MB/sec
> STREAM add latency: 5.10 nanoseconds
> STREAM add bandwidth: 56482.83 MB/sec
> STREAM triad latency: 5.14 nanoseconds
> STREAM triad bandwidth: 56069.52 MB/sec
>
> w/ patch:
> $ numactl -N 0 /usr/lib/lmbench/bin/stream -P 12 -M 1024M -N 5
> STREAM copy latency: 3.22 nanoseconds
> STREAM copy bandwidth: 59660.96 MB/sec->  +4.5%
> STREAM scale latency: 3.25 nanoseconds
> STREAM scale bandwidth: 59002.29 MB/sec   ->  +4.3%
> STREAM add latency: 4.80 nanoseconds
> STREAM add bandwidth: 60036.62 MB/sec ->  +6.3%
> STREAM triad latency: 4.86 nanoseconds
> STREAM triad bandwidth: 59228.30 MB/sec   ->  +5.6%
>
> On the other hand, while doing WAKE_AFFINE, this patch will try to find
> a core in the target cluster before scanning the whole llc domain. So it
> helps gather related tasks within one cluster.

Could you split this patch into 2 patches? One for adding a cluster sched
domain level and one for modifying the wake-up path?

This would ease the review, and I would be curious about the impact of each
feature on performance. In particular, I'm still not convinced that the
modification of the wakeup path is the root of the hackbench improvement,
especially with g=14 where there should not be many idle CPUs with 14*40
tasks on at most 32 CPUs. IIRC, there was no obvious improvement with the
changes in select_idle_cpu unless you hacked the behavior to not fall back
to the LLC domain.

> we run the below hackbench with different -g parameter from 2 to 14, for
> each different g, we run the command 10 times and get the average time
> $ numactl -N 0 hackbench -p -T -l 20000 -g $1
>
> hackbench will report the time which is needed to complete a certain number
> of messages transmissions between a certain number of tasks, for example:
> $ numactl -N 0 hackbench -p -T -l 20000 -g 10
> Running in threaded mode with 10 groups using 40 file descriptors each
> (== 400 tasks)
> Each sender will pass 20000 messages of 100 bytes
> Time: 8.874
>
> The below is the result of hackbench w/ and w/o the patch:
> g=       2       4       6       8       10      12      14
> w/o:     1.9596  4.0506  5.9654  8.0068  9.8147  11.4900 13.1163
> w/ :     1.9362  3.9197  5.6570  7.1376  8.5263  10.0512 11.3256
>                  +3.3%   +5.2%   +10.9%  +13.2%  +12.8%  +13.7%
>
> Signed-off-by: Barry Song 
> ---
> -v4:
>   * rebased to tip/sched/core with the latest unified code of select_idle_cpu
>   * also added benchmark data of spreading unrelated tasks
>   * avoided the iteration of sched_domain by moving to static_key(addressing
> Vincent's comment
>
>  arch/arm64/Kconfig |  7 +
>  include/linux/sched/cluster.h  | 19 
>  include/linux/sched/sd_flags.h |  9 ++
>  include/linux/sched/topology.h |  7 +
>  include/linux/topology.h   |  7 +
>  kernel/sched/core.c| 18 
>  kernel/sched/fair.c| 66 
> +-
>  kernel/sched/sched.h   |  1 +
>  kernel/sched/topology.c|  6 
>  9 files changed, 126 insertions(+), 14 deletions(-)
>  create mode 100644 include/linux/sched/cluster.h
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index f39568b..158b0fa 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -971,6 +971,13 @@ config SCHED_MC
>   making when dealing with multi-core CPU chips at a cost of slightly
>   increased overhead in some places. If unsure say N here.
>
> +config SCHED_CLUSTER
> +   bool "Cluster scheduler support"
> +   help
> + Cluster scheduler support improves the CPU scheduler's decision
> + making when dealing with machines that have clusters (sharing
> + internal bus or sharing LLC cache tag). If unsure say N here.
> +
>  config SCHED_SMT
> bool "SMT scheduler support"
> help
> diff --git a/include/linux/sched/cluster.h b/include/linux/sched/cluster.h
> new file mode 100644
> index 000..ea6c475

[tip: sched/core] sched/fair: Remove update of blocked load from newidle_balance

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 0826530de3cbdc89e60a89e86def94a5f0fc81ca
Gitweb:
https://git.kernel.org/tip/0826530de3cbdc89e60a89e86def94a5f0fc81ca
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:01 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:21 +01:00

sched/fair: Remove update of blocked load from newidle_balance

newidle_balance() runs with both preemption and irqs disabled, which
prevents local irqs from running during this period. The time needed to
update the blocked load of CPUs varies with the number of CPU cgroups
that have non-decayed load, and it extends this critical period to an
uncontrolled level.

Remove the update from newidle_balance and trigger a normal ILB that
will take care of the update instead.

This reduces the IRQ latency from O(nr_cgroups * nr_nohz_cpus) to
O(nr_cgroups).

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-2-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 33 +
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb..806e16f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,8 +7392,6 @@ enum migration_type {
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
 
 struct lb_env {
struct sched_domain *sd;
@@ -8397,9 +8395,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
 
-   if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, 
false))
-   env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8940,11 +8935,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
-   env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +8971,6 @@ next_group:
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if ((env->flags & LBF_NOHZ_AGAIN) &&
-   cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
-   WRITE_ONCE(nohz.next_blocked,
-  jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
-   }
-#endif
 
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
 
-   raw_spin_unlock(&this_rq->lock);
/*
-* This CPU is going to be idle and blocked load of idle CPUs
-* need to be updated. Run the ilb locally as it is a good
-* candidate for ilb instead of waking up another idle CPU.
-* Kick an normal ilb if we failed to do the update.
+* Blocked load of idle CPUs need to be updated.
+* Kick an ILB to update statistics.
 */
-   if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
-   kick_ilb(NOHZ_STATS_KICK);
-   raw_spin_lock(&this_rq->lock);
+   kick_ilb(NOHZ_STATS_KICK);
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10564,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
 
-   nohz_newidle_balance(this_rq);
-
goto out;
}
 
@@ -10654,6 +10629,8 @@ out:
 
if (pulled_task)
this_rq->idle_stamp = 0;
+   else
+   nohz_newidle_balance(this_rq);
 
rq_repin_lock(this_rq, rf);
 


[tip: sched/core] sched/fair: Remove unused parameter of update_nohz_stats

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 64f84f273592d17dcdca20244168ad9f525a39c3
Gitweb:
https://git.kernel.org/tip/64f84f273592d17dcdca20244168ad9f525a39c3
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:03 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:21 +01:00

sched/fair: Remove unused parameter of update_nohz_stats

idle load balance is the only user of update_nohz_stats and doesn't use
force parameter. Remove it

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-4-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a458e9..1b91030 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8352,7 +8352,7 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
-static bool update_nohz_stats(struct rq *rq, bool force)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
@@ -8363,7 +8363,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
 
update_blocked_averages(cpu);
@@ -10401,7 +10401,7 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 
rq = cpu_rq(balance_cpu);
 
-   has_blocked_load |= update_nohz_stats(rq, true);
+   has_blocked_load |= update_nohz_stats(rq);
 
/*
 * If time for next balance is due,


[tip: sched/core] sched/fair: Remove unused return of _nohz_idle_balance

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: ab2dde5e98db23387147fb4e7a52b6cf8141cdb3
Gitweb:
https://git.kernel.org/tip/ab2dde5e98db23387147fb4e7a52b6cf8141cdb3
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:02 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:21 +01:00

sched/fair: Remove unused return of _nohz_idle_balance

The return of _nohz_idle_balance() is not used anymore so we can remove
it

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-3-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 806e16f..6a458e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10354,10 +10354,8 @@ out:
  * Internal function that runs load balance for all idle cpus. The load balance
  * can be a simple update of blocked load or a complete load balance with
  * tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
  */
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
   enum cpu_idle_type idle)
 {
/* Earliest time when we have to do rebalance again */
@@ -10367,7 +10365,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
-   int ret = false;
struct rq *rq;
 
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10447,15 +10444,10 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
-   /* The full idle balance loop has been done */
-   ret = true;
-
 abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
-
-   return ret;
 }
 
 /*


[tip: sched/core] sched/fair: Reorder newidle_balance pulled_task tests

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 6553fc18179113a11835d5fde1735259f8943a55
Gitweb:
https://git.kernel.org/tip/6553fc18179113a11835d5fde1735259f8943a55
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:05 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:21 +01:00

sched/fair: Reorder newidle_balance pulled_task tests

Reorder the tests and skip useless ones when no load balance has been
performed and rq lock has not been released.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-6-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c00918..356a245 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10584,7 +10584,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
 
-out:
/*
 * While browsing the domains, we released the rq lock, a task could
 * have been enqueued in the meantime. Since we're not going idle,
@@ -10593,14 +10592,15 @@ out:
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
 
-   /* Move the next balance forward */
-   if (time_after(this_rq->next_balance, next_balance))
-   this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
+out:
+   /* Move the next balance forward */
+   if (time_after(this_rq->next_balance, next_balance))
+   this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
else


[tip: sched/core] sched/fair: Trigger the update of blocked load on newly idle cpu

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: c6f886546cb8a38617cdbe755fe50d3acd2463e4
Gitweb:
https://git.kernel.org/tip/c6f886546cb8a38617cdbe755fe50d3acd2463e4
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:06 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:22 +01:00

sched/fair: Trigger the update of blocked load on newly idle cpu

Instead of waking up a random and already idle CPU, we can take advantage
of this_cpu being about to enter idle to run the ILB and update the
blocked load.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-7-vincent.guit...@linaro.org
---
 kernel/sched/core.c  |  2 +-
 kernel/sched/fair.c  | 24 +---
 kernel/sched/idle.c  |  6 ++
 kernel/sched/sched.h |  7 +++
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9dfb34..361974e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -737,7 +737,7 @@ static void nohz_csd_func(void *info)
/*
 * Release the rq::nohz_csd.
 */
-   flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+   flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, 
nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
 
rq->idle_balance = idle_cpu(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 356a245..e87e1b3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10453,6 +10453,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
return true;
 }
 
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+   unsigned int flags;
+
+   flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+   /*
+* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+* (ie NOHZ_STATS_KICK set) and will do the same.
+*/
+   if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+   _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+}
+
 static void nohz_newidle_balance(struct rq *this_rq)
 {
int this_cpu = this_rq->cpu;
@@ -10474,10 +10492,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
 
/*
-* Blocked load of idle CPUs need to be updated.
-* Kick an ILB to update statistics.
+* Set the need to trigger ILB in order to update blocked load
+* before entering idle state.
 */
-   kick_ilb(NOHZ_STATS_KICK);
+   atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7199e6f..7a92d60 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -261,6 +261,12 @@ exit_idle:
 static void do_idle(void)
 {
int cpu = smp_processor_id();
+
+   /*
+* Check if we need to update blocked load
+*/
+   nohz_run_idle_balance(cpu);
+
/*
 * If the arch has a polling bit, we maintain an invariant:
 *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522..0ddc9a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2385,9 +2385,11 @@ extern void cfs_bandwidth_usage_dec(void);
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_BALANCE_KICK_BIT  0
 #define NOHZ_STATS_KICK_BIT1
+#define NOHZ_NEWILB_KICK_BIT   2
 
 #define NOHZ_BALANCE_KICK  BIT(NOHZ_BALANCE_KICK_BIT)
 #define NOHZ_STATS_KICKBIT(NOHZ_STATS_KICK_BIT)
+#define NOHZ_NEWILB_KICK   BIT(NOHZ_NEWILB_KICK_BIT)
 
 #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
@@ -2398,6 +2400,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
 
 #ifdef CONFIG_SMP
 static inline


[tip: sched/core] sched/fair: Merge for each idle cpu loop of ILB

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 7a82e5f52a3506bc35a4dc04d53ad2c9daf82e7f
Gitweb:
https://git.kernel.org/tip/7a82e5f52a3506bc35a4dc04d53ad2c9daf82e7f
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:04 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:21 +01:00

sched/fair: Merge for each idle cpu loop of ILB

Remove the specific case for handling this_cpu outside the for_each_cpu()
loop when running the ILB. Instead, use for_each_cpu_wrap() and start with
the next CPU after this_cpu so that the walk still finishes with this_cpu.

update_nohz_stats() is now used for this_cpu too and prevents unnecessary
updates. We no longer need a special case for handling the update of
nohz.next_balance for this_cpu because it is now handled by the loop like
the others.
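
To illustrate the resulting visit order, here is a toy user-space sketch
(plain C, not kernel code) of a wrap-around walk over 8 CPUs with
this_cpu = 3; for_each_cpu_wrap() does the equivalent walk over
nohz.idle_cpus_mask:

#include <stdio.h>

int main(void)
{
	const int nr_cpus = 8, this_cpu = 3;
	int i;

	/* Start at this_cpu + 1 and wrap, so this_cpu is visited last. */
	for (i = 0; i < nr_cpus; i++)
		printf("%d ", (this_cpu + 1 + i) % nr_cpus);
	printf("\n");	/* prints: 4 5 6 7 0 1 2 3 */

	return 0;
}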

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-5-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 32 +++-
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b91030..3c00918 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10043,22 +10043,9 @@ out:
 * When the cpu is attached to null domain for ex, it will not be
 * updated.
 */
-   if (likely(update_next_balance)) {
+   if (likely(update_next_balance))
rq->next_balance = next_balance;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   /*
-* If this CPU has been elected to perform the nohz idle
-* balance. Other idle CPUs have already rebalanced with
-* nohz_idle_balance() and nohz.next_balance has been
-* updated accordingly. This CPU is now running the idle load
-* balance for itself and we need to update the
-* nohz.next_balance accordingly.
-*/
-   if ((idle == CPU_IDLE) && time_after(nohz.next_balance, 
rq->next_balance))
-   nohz.next_balance = rq->next_balance;
-#endif
-   }
 }
 
 static inline int on_null_domain(struct rq *rq)
@@ -10385,8 +10372,12 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 */
smp_mb();
 
-   for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-   if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+   /*
+* Start with the next CPU after this_cpu so we will end with this_cpu 
and let a
+* chance for other idle cpu to pull load.
+*/
+   for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+   if (!idle_cpu(balance_cpu))
continue;
 
/*
@@ -10432,15 +10423,6 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
 
-   /* Newly idle CPU doesn't need an update */
-   if (idle != CPU_NEWLY_IDLE) {
-   update_blocked_averages(this_cpu);
-   has_blocked_load |= this_rq->has_blocked_load;
-   }
-
-   if (flags & NOHZ_BALANCE_KICK)
-   rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 


[tip: sched/core] sched/fair: Reduce the window for duplicated update

2021-03-06 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 39b6a429c30482c349f1bb3746470fe473cbdb0f
Gitweb:
https://git.kernel.org/tip/39b6a429c30482c349f1bb3746470fe473cbdb0f
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:07 +01:00
Committer: Ingo Molnar 
CommitterDate: Sat, 06 Mar 2021 12:40:22 +01:00

sched/fair: Reduce the window for duplicated update

Start to update last_blocked_load_update_tick to reduce the possibility
of another cpu starting the update one more time

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Ingo Molnar 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-8-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e87e1b3..f1b55f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7852,16 +7852,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
 }
 
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
 {
-   rq->last_blocked_load_update_tick = jiffies;
+   WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
 
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
 }
 #else
 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
 static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 
{}
 #endif
 
@@ -8022,6 +8026,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
 
rq_lock_irqsave(rq, &rf);
+   update_blocked_load_tick(rq);
update_rq_clock(rq);

decayed |= __update_blocked_others(rq, &done);
@@ -8363,7 +8368,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
 
update_blocked_averages(cpu);


[tip: sched/core] sched/fair: Trigger the update of blocked load on newly idle cpu

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 63dbe695827f0f612a0cdbc82a43a974bcd536cd
Gitweb:
https://git.kernel.org/tip/63dbe695827f0f612a0cdbc82a43a974bcd536cd
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:06 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Trigger the update of blocked load on newly idle cpu

Instead of waking up a random and already idle CPU, we can take advantage
of this_cpu being about to enter idle to run the ILB and update the
blocked load.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-7-vincent.guit...@linaro.org
---
 kernel/sched/core.c  |  2 +-
 kernel/sched/fair.c  | 24 +---
 kernel/sched/idle.c  |  6 ++
 kernel/sched/sched.h |  7 +++
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9dfb34..361974e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -737,7 +737,7 @@ static void nohz_csd_func(void *info)
/*
 * Release the rq::nohz_csd.
 */
-   flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+   flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, 
nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
 
rq->idle_balance = idle_cpu(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 356a245..e87e1b3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10453,6 +10453,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
return true;
 }
 
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+   unsigned int flags;
+
+   flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+   /*
+* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+* (ie NOHZ_STATS_KICK set) and will do the same.
+*/
+   if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+   _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+}
+
 static void nohz_newidle_balance(struct rq *this_rq)
 {
int this_cpu = this_rq->cpu;
@@ -10474,10 +10492,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
 
/*
-* Blocked load of idle CPUs need to be updated.
-* Kick an ILB to update statistics.
+* Set the need to trigger ILB in order to update blocked load
+* before entering idle state.
 */
-   kick_ilb(NOHZ_STATS_KICK);
+   atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7199e6f..7a92d60 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -261,6 +261,12 @@ exit_idle:
 static void do_idle(void)
 {
int cpu = smp_processor_id();
+
+   /*
+* Check if we need to update blocked load
+*/
+   nohz_run_idle_balance(cpu);
+
/*
 * If the arch has a polling bit, we maintain an invariant:
 *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522..0ddc9a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2385,9 +2385,11 @@ extern void cfs_bandwidth_usage_dec(void);
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_BALANCE_KICK_BIT  0
 #define NOHZ_STATS_KICK_BIT1
+#define NOHZ_NEWILB_KICK_BIT   2
 
 #define NOHZ_BALANCE_KICK  BIT(NOHZ_BALANCE_KICK_BIT)
 #define NOHZ_STATS_KICKBIT(NOHZ_STATS_KICK_BIT)
+#define NOHZ_NEWILB_KICK   BIT(NOHZ_NEWILB_KICK_BIT)
 
 #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
@@ -2398,6 +2400,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
 
 #ifdef CONFIG_SMP
 static inline


[tip: sched/core] sched/fair: Remove unused parameter of update_nohz_stats

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 21c5d27a4c5d9fddb2c35ccdd5cddc11b75f753d
Gitweb:
https://git.kernel.org/tip/21c5d27a4c5d9fddb2c35ccdd5cddc11b75f753d
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:03 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Remove unused parameter of update_nohz_stats

idle load balance is the only user of update_nohz_stats and doesn't use
force parameter. Remove it

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-4-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a458e9..1b91030 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8352,7 +8352,7 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
-static bool update_nohz_stats(struct rq *rq, bool force)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
@@ -8363,7 +8363,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
 
update_blocked_averages(cpu);
@@ -10401,7 +10401,7 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 
rq = cpu_rq(balance_cpu);
 
-   has_blocked_load |= update_nohz_stats(rq, true);
+   has_blocked_load |= update_nohz_stats(rq);
 
/*
 * If time for next balance is due,


[tip: sched/core] sched/fair: Reduce the window for duplicated update

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 780eec5b50930b34e2f096b4dce5368d90497b55
Gitweb:
https://git.kernel.org/tip/780eec5b50930b34e2f096b4dce5368d90497b55
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:07 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Reduce the window for duplicated update

Start to update last_blocked_load_update_tick to reduce the possibility
of another cpu starting the update one more time

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-8-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e87e1b3..f1b55f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7852,16 +7852,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
 }
 
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
 {
-   rq->last_blocked_load_update_tick = jiffies;
+   WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
 
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
 }
 #else
 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
 static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 
{}
 #endif
 
@@ -8022,6 +8026,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
 
rq_lock_irqsave(rq, &rf);
+   update_blocked_load_tick(rq);
update_rq_clock(rq);

decayed |= __update_blocked_others(rq, &done);
@@ -8363,7 +8368,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
 
update_blocked_averages(cpu);


[tip: sched/core] sched/fair: Remove update of blocked load from newidle_balance

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 1690607f4232c120a2d6ff1f9d0766551d9609f1
Gitweb:
https://git.kernel.org/tip/1690607f4232c120a2d6ff1f9d0766551d9609f1
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:01 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Remove update of blocked load from newidle_balance

newidle_balance() runs with both preemption and irqs disabled, which
prevents local irqs from running during this period. The time needed to
update the blocked load of CPUs varies with the number of CPU cgroups
that have non-decayed load, and it extends this critical period to an
uncontrolled level.

Remove the update from newidle_balance and trigger a normal ILB that
will take care of the update instead.

This reduces the IRQ latency from O(nr_cgroups * nr_nohz_cpus) to
O(nr_cgroups).

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-2-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 33 +
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb..806e16f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,8 +7392,6 @@ enum migration_type {
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
 
 struct lb_env {
struct sched_domain *sd;
@@ -8397,9 +8395,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
 
-   if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, 
false))
-   env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8940,11 +8935,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
-   env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +8971,6 @@ next_group:
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if ((env->flags & LBF_NOHZ_AGAIN) &&
-   cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
-   WRITE_ONCE(nohz.next_blocked,
-  jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
-   }
-#endif
 
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
 
-   raw_spin_unlock(&this_rq->lock);
/*
-* This CPU is going to be idle and blocked load of idle CPUs
-* need to be updated. Run the ilb locally as it is a good
-* candidate for ilb instead of waking up another idle CPU.
-* Kick an normal ilb if we failed to do the update.
+* Blocked load of idle CPUs need to be updated.
+* Kick an ILB to update statistics.
 */
-   if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
-   kick_ilb(NOHZ_STATS_KICK);
-   raw_spin_lock(&this_rq->lock);
+   kick_ilb(NOHZ_STATS_KICK);
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10564,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
 
-   nohz_newidle_balance(this_rq);
-
goto out;
}
 
@@ -10654,6 +10629,8 @@ out:
 
if (pulled_task)
this_rq->idle_stamp = 0;
+   else
+   nohz_newidle_balance(this_rq);
 
rq_repin_lock(this_rq, rf);
 


[tip: sched/core] sched/fair: Merge for each idle cpu loop of ILB

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 2aa7f2f6d1e4308b81bef079091561445b9cb949
Gitweb:
https://git.kernel.org/tip/2aa7f2f6d1e4308b81bef079091561445b9cb949
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:04 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Merge for each idle cpu loop of ILB

Remove the specific case for handling this_cpu outside the for_each_cpu()
loop when running the ILB. Instead, use for_each_cpu_wrap() and start with
the next CPU after this_cpu so that the walk still finishes with this_cpu.

update_nohz_stats() is now used for this_cpu too and prevents unnecessary
updates. We no longer need a special case for handling the update of
nohz.next_balance for this_cpu because it is now handled by the loop like
the others.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-5-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 32 +++-
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b91030..3c00918 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10043,22 +10043,9 @@ out:
 * When the cpu is attached to null domain for ex, it will not be
 * updated.
 */
-   if (likely(update_next_balance)) {
+   if (likely(update_next_balance))
rq->next_balance = next_balance;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   /*
-* If this CPU has been elected to perform the nohz idle
-* balance. Other idle CPUs have already rebalanced with
-* nohz_idle_balance() and nohz.next_balance has been
-* updated accordingly. This CPU is now running the idle load
-* balance for itself and we need to update the
-* nohz.next_balance accordingly.
-*/
-   if ((idle == CPU_IDLE) && time_after(nohz.next_balance, 
rq->next_balance))
-   nohz.next_balance = rq->next_balance;
-#endif
-   }
 }
 
 static inline int on_null_domain(struct rq *rq)
@@ -10385,8 +10372,12 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 */
smp_mb();
 
-   for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-   if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+   /*
+* Start with the next CPU after this_cpu so we will end with this_cpu 
and let a
+* chance for other idle cpu to pull load.
+*/
+   for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+   if (!idle_cpu(balance_cpu))
continue;
 
/*
@@ -10432,15 +10423,6 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
 
-   /* Newly idle CPU doesn't need an update */
-   if (idle != CPU_NEWLY_IDLE) {
-   update_blocked_averages(this_cpu);
-   has_blocked_load |= this_rq->has_blocked_load;
-   }
-
-   if (flags & NOHZ_BALANCE_KICK)
-   rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 


[tip: sched/core] sched/fair: Reorder newidle_balance pulled_task tests

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 053192dea58da994fb3dd7ad235440accf292a08
Gitweb:
https://git.kernel.org/tip/053192dea58da994fb3dd7ad235440accf292a08
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:05 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Reorder newidle_balance pulled_task tests

Reorder the tests and skip useless ones when no load balance has been
performed and rq lock has not been released.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-6-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c00918..356a245 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10584,7 +10584,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
 
-out:
/*
 * While browsing the domains, we released the rq lock, a task could
 * have been enqueued in the meantime. Since we're not going idle,
@@ -10593,14 +10592,15 @@ out:
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
 
-   /* Move the next balance forward */
-   if (time_after(this_rq->next_balance, next_balance))
-   this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
+out:
+   /* Move the next balance forward */
+   if (time_after(this_rq->next_balance, next_balance))
+   this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
else


[tip: sched/core] sched/fair: Remove unused return of _nohz_idle_balance

2021-03-03 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: f2c0af1dabdae4674fb7ddba0ac88ca78d0fe675
Gitweb:
https://git.kernel.org/tip/f2c0af1dabdae4674fb7ddba0ac88ca78d0fe675
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:02 +01:00
Committer: Peter Zijlstra 
CommitterDate: Wed, 03 Mar 2021 10:32:59 +01:00

sched/fair: Remove unused return of _nohz_idle_balance

The return of _nohz_idle_balance() is not used anymore so we can remove
it

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-3-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 806e16f..6a458e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10354,10 +10354,8 @@ out:
  * Internal function that runs load balance for all idle cpus. The load balance
  * can be a simple update of blocked load or a complete load balance with
  * tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
  */
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
   enum cpu_idle_type idle)
 {
/* Earliest time when we have to do rebalance again */
@@ -10367,7 +10365,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
-   int ret = false;
struct rq *rq;
 
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10447,15 +10444,10 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
-   /* The full idle balance loop has been done */
-   ret = true;
-
 abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
-
-   return ret;
 }
 
 /*


[tip: sched/core] sched/fair: Remove unused parameter of update_nohz_stats

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 8af3f0fbfbaa3b78bb1fc577ee42c3228f3cc822
Gitweb:
https://git.kernel.org/tip/8af3f0fbfbaa3b78bb1fc577ee42c3228f3cc822
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:03 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:23 +01:00

sched/fair: Remove unused parameter of update_nohz_stats

idle load balance is the only user of update_nohz_stats and doesn't use
force parameter. Remove it

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-4-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a458e9..1b91030 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8352,7 +8352,7 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
-static bool update_nohz_stats(struct rq *rq, bool force)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
@@ -8363,7 +8363,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
 
update_blocked_averages(cpu);
@@ -10401,7 +10401,7 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 
rq = cpu_rq(balance_cpu);
 
-   has_blocked_load |= update_nohz_stats(rq, true);
+   has_blocked_load |= update_nohz_stats(rq);
 
/*
 * If time for next balance is due,


[tip: sched/core] sched/fair: Remove unused return of _nohz_idle_balance

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 81df323258719a0194fadbf4aa93e213a552e460
Gitweb:
https://git.kernel.org/tip/81df323258719a0194fadbf4aa93e213a552e460
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:02 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:23 +01:00

sched/fair: Remove unused return of _nohz_idle_balance

The return of _nohz_idle_balance() is not used anymore so we can remove
it

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-3-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 806e16f..6a458e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10354,10 +10354,8 @@ out:
  * Internal function that runs load balance for all idle cpus. The load balance
  * can be a simple update of blocked load or a complete load balance with
  * tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
  */
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
   enum cpu_idle_type idle)
 {
/* Earliest time when we have to do rebalance again */
@@ -10367,7 +10365,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
-   int ret = false;
struct rq *rq;
 
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10447,15 +10444,10 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
-   /* The full idle balance loop has been done */
-   ret = true;
-
 abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
-
-   return ret;
 }
 
 /*


[tip: sched/core] sched/fair: Reduce the window for duplicated update

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 839ffb99d94f930fecbdee2fdfb883b10c30326b
Gitweb:
https://git.kernel.org/tip/839ffb99d94f930fecbdee2fdfb883b10c30326b
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:07 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:25 +01:00

sched/fair: Reduce the window for duplicated update

Start to update last_blocked_load_update_tick to reduce the possibility
of another cpu starting the update one more time

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-8-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e87e1b3..f1b55f9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7852,16 +7852,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
 }
 
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
 {
-   rq->last_blocked_load_update_tick = jiffies;
+   WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
 
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
 }
 #else
 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
 static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 
{}
 #endif
 
@@ -8022,6 +8026,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
 
rq_lock_irqsave(rq, &rf);
+   update_blocked_load_tick(rq);
update_rq_clock(rq);

decayed |= __update_blocked_others(rq, &done);
@@ -8363,7 +8368,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
 
update_blocked_averages(cpu);


[tip: sched/core] sched/fair: Merge for each idle cpu loop of ILB

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: c0d2f3b54ed88ce1079f8ffb094205d3f578a9bb
Gitweb:
https://git.kernel.org/tip/c0d2f3b54ed88ce1079f8ffb094205d3f578a9bb
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:04 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:24 +01:00

sched/fair: Merge for each idle cpu loop of ILB

Remove the specific case for handling this_cpu outside the for_each_cpu()
loop when running the ILB. Instead, use for_each_cpu_wrap() and start with
the next CPU after this_cpu so that the walk still finishes with this_cpu.

update_nohz_stats() is now used for this_cpu too and prevents unnecessary
updates. We no longer need a special case for handling the update of
nohz.next_balance for this_cpu because it is now handled by the loop like
the others.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-5-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 32 +++-
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1b91030..3c00918 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10043,22 +10043,9 @@ out:
 * When the cpu is attached to null domain for ex, it will not be
 * updated.
 */
-   if (likely(update_next_balance)) {
+   if (likely(update_next_balance))
rq->next_balance = next_balance;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   /*
-* If this CPU has been elected to perform the nohz idle
-* balance. Other idle CPUs have already rebalanced with
-* nohz_idle_balance() and nohz.next_balance has been
-* updated accordingly. This CPU is now running the idle load
-* balance for itself and we need to update the
-* nohz.next_balance accordingly.
-*/
-   if ((idle == CPU_IDLE) && time_after(nohz.next_balance, 
rq->next_balance))
-   nohz.next_balance = rq->next_balance;
-#endif
-   }
 }
 
 static inline int on_null_domain(struct rq *rq)
@@ -10385,8 +10372,12 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 */
smp_mb();
 
-   for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-   if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+   /*
+* Start with the next CPU after this_cpu so we will end with this_cpu 
and let a
+* chance for other idle cpu to pull load.
+*/
+   for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+   if (!idle_cpu(balance_cpu))
continue;
 
/*
@@ -10432,15 +10423,6 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
 
-   /* Newly idle CPU doesn't need an update */
-   if (idle != CPU_NEWLY_IDLE) {
-   update_blocked_averages(this_cpu);
-   has_blocked_load |= this_rq->has_blocked_load;
-   }
-
-   if (flags & NOHZ_BALANCE_KICK)
-   rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 


[tip: sched/core] sched/fair: Remove update of blocked load from newidle_balance

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 06a35afe89800789fc47ca5c41fbe435cc77d8e0
Gitweb:
https://git.kernel.org/tip/06a35afe89800789fc47ca5c41fbe435cc77d8e0
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:01 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:23 +01:00

sched/fair: Remove update of blocked load from newidle_balance

newidle_balance() runs with both preemption and irqs disabled, which
prevents local irqs from running during this period. The time needed to
update the blocked load of CPUs varies with the number of CPU cgroups
that have non-decayed load, and it extends this critical period to an
uncontrolled level.

Remove the update from newidle_balance and trigger a normal ILB that
will take care of the update instead.

This reduces the IRQ latency from O(nr_cgroups * nr_nohz_cpus) to
O(nr_cgroups).

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-2-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 33 +
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb..806e16f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,8 +7392,6 @@ enum migration_type {
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
 
 struct lb_env {
struct sched_domain *sd;
@@ -8397,9 +8395,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
 
-   if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, 
false))
-   env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8940,11 +8935,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
-   env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +8971,6 @@ next_group:
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if ((env->flags & LBF_NOHZ_AGAIN) &&
-   cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
-   WRITE_ONCE(nohz.next_blocked,
-  jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
-   }
-#endif
 
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
 
-   raw_spin_unlock(&this_rq->lock);
/*
-* This CPU is going to be idle and blocked load of idle CPUs
-* need to be updated. Run the ilb locally as it is a good
-* candidate for ilb instead of waking up another idle CPU.
-* Kick an normal ilb if we failed to do the update.
+* Blocked load of idle CPUs need to be updated.
+* Kick an ILB to update statistics.
 */
-   if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
-   kick_ilb(NOHZ_STATS_KICK);
-   raw_spin_lock(&this_rq->lock);
+   kick_ilb(NOHZ_STATS_KICK);
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10564,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
 
-   nohz_newidle_balance(this_rq);
-
goto out;
}
 
@@ -10654,6 +10629,8 @@ out:
 
if (pulled_task)
this_rq->idle_stamp = 0;
+   else
+   nohz_newidle_balance(this_rq);
 
rq_repin_lock(this_rq, rf);
 


[tip: sched/core] sched/fair: Trigger the update of blocked load on newly idle cpu

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 1705e3b449f62d957e897239ef6c67ca574acfc6
Gitweb:
https://git.kernel.org/tip/1705e3b449f62d957e897239ef6c67ca574acfc6
Author:Vincent Guittot 
AuthorDate:Wed, 24 Feb 2021 14:30:06 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:24 +01:00

sched/fair: Trigger the update of blocked load on newly idle cpu

Instead of waking up a random and already idle CPU, we can take advantage
of this_cpu being about to enter idle to run the ILB and update the
blocked load.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-7-vincent.guit...@linaro.org
---
 kernel/sched/core.c  |  2 +-
 kernel/sched/fair.c  | 24 +---
 kernel/sched/idle.c  |  6 ++
 kernel/sched/sched.h |  7 +++
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9dfb34..361974e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -737,7 +737,7 @@ static void nohz_csd_func(void *info)
/*
 * Release the rq::nohz_csd.
 */
-   flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+   flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, 
nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
 
rq->idle_balance = idle_cpu(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 356a245..e87e1b3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10453,6 +10453,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
return true;
 }
 
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+   unsigned int flags;
+
+   flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+   /*
+* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+* (ie NOHZ_STATS_KICK set) and will do the same.
+*/
+   if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+   _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+}
+
 static void nohz_newidle_balance(struct rq *this_rq)
 {
int this_cpu = this_rq->cpu;
@@ -10474,10 +10492,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
 
/*
-* Blocked load of idle CPUs need to be updated.
-* Kick an ILB to update statistics.
+* Set the need to trigger ILB in order to update blocked load
+* before entering idle state.
 */
-   kick_ilb(NOHZ_STATS_KICK);
+   atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7199e6f..7a92d60 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -261,6 +261,12 @@ exit_idle:
 static void do_idle(void)
 {
int cpu = smp_processor_id();
+
+   /*
+* Check if we need to update blocked load
+*/
+   nohz_run_idle_balance(cpu);
+
/*
 * If the arch has a polling bit, we maintain an invariant:
 *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522..0ddc9a6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2385,9 +2385,11 @@ extern void cfs_bandwidth_usage_dec(void);
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_BALANCE_KICK_BIT  0
 #define NOHZ_STATS_KICK_BIT    1
+#define NOHZ_NEWILB_KICK_BIT   2
 
 #define NOHZ_BALANCE_KICK  BIT(NOHZ_BALANCE_KICK_BIT)
 #define NOHZ_STATS_KICK        BIT(NOHZ_STATS_KICK_BIT)
+#define NOHZ_NEWILB_KICK   BIT(NOHZ_NEWILB_KICK_BIT)
 
 #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
@@ -2398,6 +2400,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
 
 #ifdef CONFIG_SMP
 static inline


[tip: sched/core] sched/fair: Reorder newidle_balance pulled_task tests

2021-03-02 Thread tip-bot2 for Vincent Guittot
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 0c067f38e1b9640e9121fb7e0bb38fa8f867a248
Gitweb:        https://git.kernel.org/tip/0c067f38e1b9640e9121fb7e0bb38fa8f867a248
Author:        Vincent Guittot 
AuthorDate:    Wed, 24 Feb 2021 14:30:05 +01:00
Committer: Peter Zijlstra 
CommitterDate: Mon, 01 Mar 2021 18:17:24 +01:00

sched/fair: Reorder newidle_balance pulled_task tests

Reorder the tests and skip useless ones when no load balance has been
performed and rq lock has not been released.

Signed-off-by: Vincent Guittot 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: 
https://lkml.kernel.org/r/20210224133007.28644-6-vincent.guit...@linaro.org
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3c00918..356a245 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10584,7 +10584,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
 
-out:
/*
 * While browsing the domains, we released the rq lock, a task could
 * have been enqueued in the meantime. Since we're not going idle,
@@ -10593,14 +10592,15 @@ out:
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
 
-   /* Move the next balance forward */
-   if (time_after(this_rq->next_balance, next_balance))
-   this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
+out:
+   /* Move the next balance forward */
+   if (time_after(this_rq->next_balance, next_balance))
+   this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
else


Re: [PATCH v2] sched/pelt: Fix task util_est update filtering

2021-02-25 Thread Vincent Guittot
On Thu, 25 Feb 2021 at 17:58, Vincent Donnefort
 wrote:
>
> Being called for each dequeue, util_est reduces the number of its updates
> by filtering out when the EWMA signal is different from the task util_avg
> by less than 1%. It is a problem for a sudden util_avg ramp-up. Due to the
> decay from a previous high util_avg, EWMA might now be close enough to
> the new util_avg. No update would then happen while it would leave
> ue.enqueued with an out-of-date value.
>
> Taking into consideration the two util_est members, EWMA and enqueued for
> the filtering, ensures, for both, an up-to-date value.
>
> This is for now an issue only for the trace probe that might return the
> stale value. Functional-wise, it isn't a problem, as the value is always
> accessed through max(enqueued, ewma).
>
> This problem has been observed using LISA's UtilConvergence:test_means on
> the sd845c board.
>
> No regression observed with Hackbench on sd845c and Perf-bench sched pipe
> on hikey/hikey960.
>
> Signed-off-by: Vincent Donnefort 
> Reviewed-by: Dietmar Eggemann 

Reviewed-by: Vincent Guittot 

>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9e4104ae39ae..214e02862994 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3966,24 +3966,27 @@ static inline void util_est_dequeue(struct cfs_rq 
> *cfs_rq,
> trace_sched_util_est_cfs_tp(cfs_rq);
>  }
>
> +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
> +
>  /*
> - * Check if a (signed) value is within a specified (unsigned) margin,
> + * Check if a (signed) value is within the (unsigned) util_est margin,
>   * based on the observation that:
>   *
>   * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
>   *
> - * NOTE: this only works when value + maring < INT_MAX.
> + * NOTE: this only works when value + UTIL_EST_MARGIN < INT_MAX.
>   */
> -static inline bool within_margin(int value, int margin)
> +static inline bool util_est_within_margin(int value)
>  {
> -   return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
> +   return ((unsigned int)(value + UTIL_EST_MARGIN - 1) <
> +   (2 * UTIL_EST_MARGIN - 1));
>  }
>
>  static inline void util_est_update(struct cfs_rq *cfs_rq,
>struct task_struct *p,
>bool task_sleep)
>  {
> -   long last_ewma_diff;
> +   long last_ewma_diff, last_enqueued_diff;
> struct util_est ue;
>
> if (!sched_feat(UTIL_EST))
> @@ -4004,6 +4007,8 @@ static inline void util_est_update(struct cfs_rq 
> *cfs_rq,
> if (ue.enqueued & UTIL_AVG_UNCHANGED)
> return;
>
> +   last_enqueued_diff = ue.enqueued;
> +
> /*
>  * Reset EWMA on utilization increases, the moving average is used 
> only
>  * to smooth utilization decreases.
> @@ -4017,12 +4022,17 @@ static inline void util_est_update(struct cfs_rq 
> *cfs_rq,
> }
>
> /*
> -* Skip update of task's estimated utilization when its EWMA is
> +* Skip update of task's estimated utilization when its members are
>  * already ~1% close to its last activation value.
>  */
> last_ewma_diff = ue.enqueued - ue.ewma;
> -   if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
> +   last_enqueued_diff -= ue.enqueued;
> +   if (util_est_within_margin(last_ewma_diff)) {
> +   if (!util_est_within_margin(last_enqueued_diff))
> +   goto done;
> +
> return;
> +   }
>
> /*
>  * To avoid overestimation of actual task utilization, skip updates if
> --
> 2.25.1
>


Re: [PATCH V2] sched: pull tasks when CPU is about to run SCHED_IDLE tasks

2021-02-25 Thread Vincent Guittot
On Mon, 22 Feb 2021 at 08:35,  wrote:
>
> From: Chen Xiaoguang 
>
> In order to use the computer efficiently we usually deploy online
> tasks and offline tasks in the same computer.
>
> The online tasks are more important than the offline tasks and are
> latency sensitive, so we should make sure the online tasks preempt the
> offline tasks as soon as possible while there are online tasks
> waiting to run.
>
> Online tasks use the SCHED_NORMAL policy and offline tasks use
> the SCHED_IDLE policy. This patch decreases the latency of online
> tasks by doing a load balance before an offline task runs.
>
> Signed-off-by: Chen Xiaoguang 
> Signed-off-by: Chen He 

Reviewed-by: Vincent Guittot 

Thanks

> ---
> v1 -> v2:
>  - Add checking in balance_fair
>  - Remove task state checking in pick_next_task_fair
>  - Add comment about the change
> ---
>  kernel/sched/fair.c | 16 +++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8a8bd7b..80b69a2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6833,7 +6833,13 @@ static void task_dead_fair(struct task_struct *p)
>  static int
>  balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  {
> -   if (rq->nr_running)
> +   /*
> +* Return if SCHED_NORMAL tasks exist.
> +* Else if only SCHED_IDLE tasks in rq then do a load balance trying
> +* to pull SCHED_NORMAL tasks to run so as to reduce the latency of
> +* SCHED_NORMAL task.
> +*/
> +   if (rq->nr_running && !sched_idle_rq(rq))
> return 1;
>
> return newidle_balance(rq, rf) != 0;
> @@ -7013,6 +7019,14 @@ struct task_struct *
> struct task_struct *p;
> int new_tasks;
>
> +   /*
> +* Before a CPU switches from running SCHED_NORMAL task to SCHED_IDLE
> +* task, do a load balance trying to pull SCHED_NORMAL tasks to run
> +* so as to reduce the latency of SCHED_NORMAL task.
> +*/
> +   if (sched_idle_rq(rq) && prev && prev->policy != SCHED_IDLE)
> +   goto idle;
> +
>  again:
> if (!sched_fair_runnable(rq))
> goto idle;
> --
> 1.8.3.1
>


Re: [PATCH] sched/pelt: Fix task util_est update filtering

2021-02-25 Thread Vincent Guittot
On Mon, 22 Feb 2021 at 10:24, Vincent Donnefort
 wrote:
>
> On Fri, Feb 19, 2021 at 11:48:28AM +0100, Vincent Guittot wrote:
> > On Tue, 16 Feb 2021 at 17:39,  wrote:
> > >
> > > From: Vincent Donnefort 
> > >
> > > Being called for each dequeue, util_est reduces the number of its updates
> > > by filtering out when the EWMA signal is different from the task util_avg
> > > by less than 1%. It is a problem for a sudden util_avg ramp-up. Due to the
> > > decay from a previous high util_avg, EWMA might now be close enough to
> > > the new util_avg. No update would then happen while it would leave
> > > ue.enqueued with an out-of-date value.
> > >
> > > Taking into consideration the two util_est members, EWMA and enqueued for
> > > the filtering, ensures, for both, an up-to-date value.
> > >
> > > This is for now an issue only for the trace probe that might return the
> > > stale value. Functional-wise, it isn't (yet) a problem, as the value is
> >
> > What do you mean by "it isn't (yet) a problem" ? How could this become
> > a problem ?
>
> I wrote "yet" as nothing prevents anyone from using the ue.enqueued signal.

Hmm... you are not supposed to use it outside the helper functions, so
this is irrelevant IMO, which means that only the trace probe is
impacted.
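
To make that concrete: consumers are expected to read the signal through a
helper that returns the max of both members (roughly what _task_util_est()
does, modulo the UTIL_AVG_UNCHANGED flag bit), so a stale ue.enqueued only
ever shows up via the trace probe. A minimal user-space model of that read
path, with invented names and values, is below; it is only a sketch, not the
kernel implementation.

/* Illustrative user-space model only, not the kernel implementation. */
#include <stdio.h>

struct util_est {
	unsigned int enqueued;
	unsigned int ewma;
};

/* Models the read helper consumers are expected to go through. */
static unsigned int util_est_read(const struct util_est *ue)
{
	return ue->enqueued > ue->ewma ? ue->enqueued : ue->ewma;
}

int main(void)
{
	/* A stale 'enqueued' is masked as long as 'ewma' is up to date. */
	struct util_est ue = { .enqueued = 120, .ewma = 260 };

	printf("consumers see %u\n", util_est_read(&ue));	/* prints 260 */
	return 0;
}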

>
> >
> > > always accessed through max(enqueued, ewma).
> > >
> >
> > This adds more tests and or update of  struct avg.util_est. It would
> > be good to have an idea of the perf impact. Especially because this
> > only fixes a tracing problem
>
> I ran hackbench on the big cores of a SD845C board. After 100 iterations of
> 100-loop runs, the geometric mean of the hackbench test is 0.1% lower
> with this patch applied (2.0833s vs 2.0858s). The p-value, computed with
> ks_2samp [1], is 0.37. We can't conclude that the two distributions are
> different. This patch, in this scenario, seems completely harmless.

For this kind of change, perf bench sched pipe is better at highlighting
any perf regression. I have done a quick test and I haven't seen a
noticeable difference.

>
> Shall I include those results in the commit message?
>
> [1] 
> https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html
>
> >
> >
> > > This problem has been observed using LISA's UtilConvergence:test_means on
> > > the sd845c board.
> > >
> > > Signed-off-by: Vincent Donnefort 
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 794c2cb945f8..9008e0c42def 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -3941,24 +3941,27 @@ static inline void util_est_dequeue(struct cfs_rq 
> > > *cfs_rq,
> > > trace_sched_util_est_cfs_tp(cfs_rq);
> > >  }
> > >
> > > +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
> > > +
> > >  /*
> > > - * Check if a (signed) value is within a specified (unsigned) margin,
> > > + * Check if a (signed) value is within the (unsigned) util_est margin,
> > >   * based on the observation that:
> > >   *
> > >   * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
> > >   *
> > > - * NOTE: this only works when value + maring < INT_MAX.
> > > + * NOTE: this only works when value + UTIL_EST_MARGIN < INT_MAX.
> > >   */
> > > -static inline bool within_margin(int value, int margin)
> > > +static inline bool util_est_within_margin(int value)
> > >  {
> > > -   return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
> > > +   return ((unsigned int)(value + UTIL_EST_MARGIN - 1) <
> > > +   (2 * UTIL_EST_MARGIN - 1));
> > >  }
> > >
> > >  static inline void util_est_update(struct cfs_rq *cfs_rq,
> > >struct task_struct *p,
> > >bool task_sleep)
> > >  {
> > > -   long last_ewma_diff;
> > > +   long last_ewma_diff, last_enqueued_diff;
> > > struct util_est ue;
> > >
> > > if (!sched_feat(UTIL_EST))
> > > @@ -3979,6 +3982,8 @@ static inline void util_est_update(struct cfs_rq 
> > > *cfs_rq,
> > > if (ue.enqueued & UTIL_AVG_UNCHANGED)
> > > return;
> > >
> > > +   last_enqueued_diff = ue.enqueued;
> > > +
> > > /*
> > >  * Reset EWMA on utiliza

Re: [PATCH 0/7 v4] move update blocked load outside newidle_balance

2021-02-25 Thread Vincent Guittot
Hi Valentin,

On Wed, 24 Feb 2021 at 19:46, Valentin Schneider
 wrote:
>
> On 24/02/21 14:30, Vincent Guittot wrote:
> > Joel reported long preempt and irq off sequence in newidle_balance because
> > of a large number of CPU cgroups in use and having to be updated. This
> > patchset moves the update outside newidle_imblance. This enables to early
> > abort during the updates in case of pending irq as an example.
> >
> > Instead of kicking a normal ILB that will wakes up CPU which is already
> > idle, patch 6 triggers the update of statistics in the idle thread of
> > the CPU before selecting and entering an idle state.
> >
> > Changes on v4:
> > - Add a dedicated bit for updating blocked load when entering idle.
> >   This simplifies the management of concurrency with kick_ilb.
> >
>
> I believe that solves the issues vs nohz balance.
>
> One last thing for patch 7: mayhaps we could do a tad better to avoid
> duplicate updates going through a heapful of leaf cfs rqs, see
>
>   http://lore.kernel.org/r/jhj4kiht7oh.mog...@arm.com

rq->last_blocked_load_update_tick is there only to filter duplicate
updates during _nohz_idle_balance() but not for other normal load balancing.
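
For readers following along, here is a toy user-space model of that dedup
filter; the names are invented and the real code uses jiffies, time_after()
and rq->last_blocked_load_update_tick, so treat this as a sketch only.

/* Toy model of the dedup filter, not the kernel code. */
#include <stdbool.h>
#include <stdio.h>

static unsigned long fake_jiffies;		/* stands in for jiffies */
static unsigned long last_update_tick;		/* per-rq timestamp */

/* Skip the expensive update if it already ran during this tick. */
static bool maybe_update_blocked_load(void)
{
	if (fake_jiffies <= last_update_tick)
		return false;			/* duplicate: filtered out */

	last_update_tick = fake_jiffies;	/* record that this tick is covered */
	/* the expensive per-cgroup update would run here */
	return true;
}

int main(void)
{
	fake_jiffies = 1000;
	printf("first caller updates: %d\n", maybe_update_blocked_load());
	printf("second caller skips:  %d\n", maybe_update_blocked_load());
	return 0;
}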

>
>
> Otherwise, feel free to add to the lot:
>
>   Reviewed-by: Valentin Schneider 
>


Re: [PATCH 0/7 v4] move update blocked load outside newidle_balance

2021-02-24 Thread Vincent Guittot
On Wed, 24 Feb 2021 at 18:41, Peter Zijlstra  wrote:
>
> On Wed, Feb 24, 2021 at 04:57:15PM +0100, Vincent Guittot wrote:
> > On Wed, 24 Feb 2021 at 16:54, Peter Zijlstra  wrote:
> > >
> > > On Wed, Feb 24, 2021 at 02:30:00PM +0100, Vincent Guittot wrote:
> > > > Joel reported long preempt and irq off sequence in newidle_balance 
> > > > because
> > > > of a large number of CPU cgroups in use and having to be updated. This
> > > > patchset moves the update outside newidle_imblance. This enables to 
> > > > early
> > > > abort during the updates in case of pending irq as an example.
> > > >
> > > > Instead of kicking a normal ILB that will wakes up CPU which is already
> > > > idle, patch 6 triggers the update of statistics in the idle thread of
> > > > the CPU before selecting and entering an idle state.
> > >
> > > I'm confused... update_blocked_averages(), which calls
> > > __update_blocked_fair(), which is the one doing the cgroup iteration
> > > thing, runs with rq->lock held, and thus will have IRQs disabled any
> > > which way around we turn this thing.
> > >
> > > Or is the problem that we called nohz_idle_balance(), which does
> > > update_nohz_stats() -> update_blocked_averages() for evey NOHZ cpu from
> > > newidle balance, such that we get NR_NOHZ_CPUS * NR_CGROUPS IRQ latency?
> > > Which is now reduced to just NR_CGROUPS ?
> >
> > Yes we can now abort between each cpu update
>
> OK, shall I add something like:
>
> This reduces the IRQ latency from O(nr_cgroups * nr_nohz_cpus) to
> O(nr_cgroups).
>
> To the changelog of patch #1 ?

Yes, good point. This will clarify the range of improvement


Re: [PATCH 0/7 v4] move update blocked load outside newidle_balance

2021-02-24 Thread Vincent Guittot
On Wed, 24 Feb 2021 at 16:54, Peter Zijlstra  wrote:
>
> On Wed, Feb 24, 2021 at 02:30:00PM +0100, Vincent Guittot wrote:
> > Joel reported long preempt and irq off sequence in newidle_balance because
> > of a large number of CPU cgroups in use and having to be updated. This
> > patchset moves the update outside newidle_imblance. This enables to early
> > abort during the updates in case of pending irq as an example.
> >
> > Instead of kicking a normal ILB that will wakes up CPU which is already
> > idle, patch 6 triggers the update of statistics in the idle thread of
> > the CPU before selecting and entering an idle state.
>
> I'm confused... update_blocked_averages(), which calls
> __update_blocked_fair(), which is the one doing the cgroup iteration
> thing, runs with rq->lock held, and thus will have IRQs disabled any
> which way around we turn this thing.
>
> Or is the problem that we called nohz_idle_balance(), which does
> update_nohz_stats() -> update_blocked_averages() for evey NOHZ cpu from
> newidle balance, such that we get NR_NOHZ_CPUS * NR_CGROUPS IRQ latency?
> Which is now reduced to just NR_CGROUPS ?

Yes, we can now abort between each CPU update.
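
To illustrate the abort granularity: the walk over the idle CPUs can bail
out between two per-CPU updates, so the non-interruptible chunk is one CPU's
cgroup updates instead of all of them. Below is a toy user-space model of
that loop shape; all names are invented, and in the kernel the abort
condition is a need_resched() check.

/* Toy model only, not the kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static bool cpu_is_idle[NR_CPUS];
static bool local_work_pending;	/* stands in for need_resched() */

static void update_blocked_load_of(int cpu)
{
	/* In the kernel this walks the CPU cgroups of that CPU's rq. */
	printf("updated blocked load of CPU%d\n", cpu);
}

/* Returns false when the walk was aborted before covering all idle CPUs. */
static bool run_idle_load_balance(int this_cpu)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == this_cpu || !cpu_is_idle[cpu])
			continue;

		/* The abort point sits between two per-CPU updates ... */
		if (local_work_pending)
			return false;

		/* ... so the uninterruptible work is one CPU's update. */
		update_blocked_load_of(cpu);
	}
	return true;
}

int main(void)
{
	for (int cpu = 1; cpu < NR_CPUS; cpu++)
		cpu_is_idle[cpu] = true;

	printf("completed: %d\n", run_idle_load_balance(0));

	local_work_pending = true;	/* e.g. a task became runnable */
	printf("completed: %d\n", run_idle_load_balance(0));
	return 0;
}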


[PATCH 6/7 v4] sched/fair: trigger the update of blocked load on newly idle cpu

2021-02-24 Thread Vincent Guittot
Instead of waking up a random and already idle CPU, we can take advantage
of this_cpu being about to enter idle to run the ILB and update the
blocked load.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/core.c  |  2 +-
 kernel/sched/fair.c  | 24 +---
 kernel/sched/idle.c  |  6 ++
 kernel/sched/sched.h |  7 +++
 4 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 88a2e2bdbabe..61ec83e52a08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -737,7 +737,7 @@ static void nohz_csd_func(void *info)
/*
 * Release the rq::nohz_csd.
 */
-   flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+   flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, 
nohz_flags(cpu));
WARN_ON(!(flags & NOHZ_KICK_MASK));
 
rq->idle_balance = idle_cpu(cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 586f6ce0d302..46c220a4f7ed 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10453,6 +10453,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
return true;
 }
 
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+   unsigned int flags;
+
+   flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
+
+   /*
+* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
+* (ie NOHZ_STATS_KICK set) and will do the same.
+*/
+   if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
+   _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
+}
+
 static void nohz_newidle_balance(struct rq *this_rq)
 {
int this_cpu = this_rq->cpu;
@@ -10474,10 +10492,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
 
/*
-* Blocked load of idle CPUs need to be updated.
-* Kick an ILB to update statistics.
+* Set the need to trigger ILB in order to update blocked load
+* before entering idle state.
 */
-   kick_ilb(NOHZ_STATS_KICK);
+   atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7199e6f23789..7a92d6054aba 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
 static void do_idle(void)
 {
int cpu = smp_processor_id();
+
+   /*
+* Check if we need to update blocked load
+*/
+   nohz_run_idle_balance(cpu);
+
/*
 * If the arch has a polling bit, we maintain an invariant:
 *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..0ddc9a6ff03a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2385,9 +2385,11 @@ extern void cfs_bandwidth_usage_dec(void);
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_BALANCE_KICK_BIT  0
 #define NOHZ_STATS_KICK_BIT    1
+#define NOHZ_NEWILB_KICK_BIT   2
 
 #define NOHZ_BALANCE_KICK  BIT(NOHZ_BALANCE_KICK_BIT)
 #define NOHZ_STATS_KICK        BIT(NOHZ_STATS_KICK_BIT)
+#define NOHZ_NEWILB_KICK   BIT(NOHZ_NEWILB_KICK_BIT)
 
 #define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
@@ -2398,6 +2400,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
 
 #ifdef CONFIG_SMP
 static inline
-- 
2.17.1



[PATCH 7/7 v4] sched/fair: reduce the window for duplicated update

2021-02-24 Thread Vincent Guittot
Start updating last_blocked_load_update_tick at the beginning of
update_blocked_averages() to reduce the possibility of another CPU
starting the same update one more time.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46c220a4f7ed..38a1297edd76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7852,16 +7852,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
 }
 
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
 {
-   rq->last_blocked_load_update_tick = jiffies;
+   WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
 
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
 }
 #else
 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
 static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 
{}
 #endif
 
@@ -8022,6 +8026,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
 
rq_lock_irqsave(rq, &rf);
+   update_blocked_load_tick(rq);
update_rq_clock(rq);
 
decayed |= __update_blocked_others(rq, &done);
@@ -8363,7 +8368,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
 
update_blocked_averages(cpu);
-- 
2.17.1



[PATCH 5/7 v4] sched/fair: reorder newidle_balance pulled_task tests

2021-02-24 Thread Vincent Guittot
Reorder the tests and skip useless ones when no load balance has been
performed and rq lock has not been released.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0323fda07682..586f6ce0d302 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10584,7 +10584,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
 
-out:
/*
 * While browsing the domains, we released the rq lock, a task could
 * have been enqueued in the meantime. Since we're not going idle,
@@ -10593,14 +10592,15 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
 
-   /* Move the next balance forward */
-   if (time_after(this_rq->next_balance, next_balance))
-   this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
+out:
+   /* Move the next balance forward */
+   if (time_after(this_rq->next_balance, next_balance))
+   this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
else
-- 
2.17.1



[PATCH 4/7 v4] sched/fair: merge for each idle cpu loop of ILB

2021-02-24 Thread Vincent Guittot
Remove the specific case for handling this_cpu outside the for_each_cpu()
loop when running the ILB. Instead we use for_each_cpu_wrap() and start with
the next cpu after this_cpu so we will still finish with this_cpu.

update_nohz_stats() is now used for this_cpu too and will prevent an
unnecessary update. We don't need a special case for handling the update of
nohz.next_balance for this_cpu anymore because it is now handled by the
loop like the others.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 32 +++-
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f52f4dd3fb9e..0323fda07682 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10043,22 +10043,9 @@ static void rebalance_domains(struct rq *rq, enum 
cpu_idle_type idle)
 * When the cpu is attached to null domain for ex, it will not be
 * updated.
 */
-   if (likely(update_next_balance)) {
+   if (likely(update_next_balance))
rq->next_balance = next_balance;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   /*
-* If this CPU has been elected to perform the nohz idle
-* balance. Other idle CPUs have already rebalanced with
-* nohz_idle_balance() and nohz.next_balance has been
-* updated accordingly. This CPU is now running the idle load
-* balance for itself and we need to update the
-* nohz.next_balance accordingly.
-*/
-   if ((idle == CPU_IDLE) && time_after(nohz.next_balance, 
rq->next_balance))
-   nohz.next_balance = rq->next_balance;
-#endif
-   }
 }
 
 static inline int on_null_domain(struct rq *rq)
@@ -10385,8 +10372,12 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 */
smp_mb();
 
-   for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-   if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+   /*
+* Start with the next CPU after this_cpu so we will end with this_cpu 
and let a
+* chance for other idle cpu to pull load.
+*/
+   for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+   if (!idle_cpu(balance_cpu))
continue;
 
/*
@@ -10432,15 +10423,6 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
 
-   /* Newly idle CPU doesn't need an update */
-   if (idle != CPU_NEWLY_IDLE) {
-   update_blocked_averages(this_cpu);
-   has_blocked_load |= this_rq->has_blocked_load;
-   }
-
-   if (flags & NOHZ_BALANCE_KICK)
-   rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
-- 
2.17.1



[PATCH 3/7 v4] sched/fair: remove unused parameter of update_nohz_stats

2021-02-24 Thread Vincent Guittot
The idle load balance is the only user of update_nohz_stats() and doesn't
use the force parameter. Remove it.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e23709f6854b..f52f4dd3fb9e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8352,7 +8352,7 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
-static bool update_nohz_stats(struct rq *rq, bool force)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
@@ -8363,7 +8363,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
 
update_blocked_averages(cpu);
@@ -10401,7 +10401,7 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 
rq = cpu_rq(balance_cpu);
 
-   has_blocked_load |= update_nohz_stats(rq, true);
+   has_blocked_load |= update_nohz_stats(rq);
 
/*
 * If time for next balance is due,
-- 
2.17.1



[PATCH 1/7 v4] sched/fair: remove update of blocked load from newidle_balance

2021-02-24 Thread Vincent Guittot
newidle_balance runs with both preemption and irqs disabled, which prevents
local irqs from running during this period. The duration of the update of
the blocked load of the CPUs varies with the number of CPU cgroups with
non-decayed load and extends this critical period to an uncontrolled
level.

Remove the update from newidle_balance and trigger a normal ILB that
will take care of the update instead.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 33 +
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a8bd7b13634..0d45b7716384 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,8 +7392,6 @@ enum migration_type {
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED    0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
 
 struct lb_env {
struct sched_domain *sd;
@@ -8397,9 +8395,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
 
-   if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, 
false))
-   env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8940,11 +8935,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
-   env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8981,14 +8971,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if ((env->flags & LBF_NOHZ_AGAIN) &&
-   cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
-   WRITE_ONCE(nohz.next_blocked,
-  jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
-   }
-#endif
 
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
@@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
 
-   raw_spin_unlock(&this_rq->lock);
/*
-* This CPU is going to be idle and blocked load of idle CPUs
-* need to be updated. Run the ilb locally as it is a good
-* candidate for ilb instead of waking up another idle CPU.
-* Kick an normal ilb if we failed to do the update.
+* Blocked load of idle CPUs need to be updated.
+* Kick an ILB to update statistics.
 */
-   if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
-   kick_ilb(NOHZ_STATS_KICK);
-   raw_spin_lock(&this_rq->lock);
+   kick_ilb(NOHZ_STATS_KICK);
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10564,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
update_next_balance(sd, &next_balance);
rcu_read_unlock();
 
-   nohz_newidle_balance(this_rq);
-
goto out;
}
 
@@ -10654,6 +10629,8 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
 
if (pulled_task)
this_rq->idle_stamp = 0;
+   else
+   nohz_newidle_balance(this_rq);
 
rq_repin_lock(this_rq, rf);
 
-- 
2.17.1



[PATCH 2/7 v4] sched/fair: remove unused return of _nohz_idle_balance

2021-02-24 Thread Vincent Guittot
The return value of _nohz_idle_balance() is not used anymore, so we can
remove it.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d45b7716384..e23709f6854b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10354,10 +10354,8 @@ void nohz_balance_enter_idle(int cpu)
  * Internal function that runs load balance for all idle cpus. The load balance
  * can be a simple update of blocked load or a complete load balance with
  * tasks movement depending of flags.
- * The function returns false if the loop has stopped before running
- * through all idle CPUs.
  */
-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
+static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
   enum cpu_idle_type idle)
 {
/* Earliest time when we have to do rebalance again */
@@ -10367,7 +10365,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
int update_next_balance = 0;
int this_cpu = this_rq->cpu;
int balance_cpu;
-   int ret = false;
struct rq *rq;
 
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
@@ -10447,15 +10444,10 @@ static bool _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
-   /* The full idle balance loop has been done */
-   ret = true;
-
 abort:
/* There is still blocked load, enable periodic update */
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
-
-   return ret;
 }
 
 /*
-- 
2.17.1



[PATCH 0/7 v4] move update blocked load outside newidle_balance

2021-02-24 Thread Vincent Guittot
Joel reported long preempt- and irq-off sequences in newidle_balance because
of the large number of CPU cgroups in use and having to be updated. This
patchset moves the update outside newidle_balance. This enables an early
abort during the updates, in case of a pending irq for example.

Instead of kicking a normal ILB that would wake up a CPU which is already
idle, patch 6 triggers the update of statistics in the idle thread of
the CPU before selecting and entering an idle state.
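
The mechanism of patch 6 boils down to a small flag handshake: the newidle
path only records that an update is needed, and the idle entry path consumes
the flag and runs the update outside the newidle critical section. A toy
user-space model of that handshake is sketched below; the flag name and
value are invented for the example and it is not the kernel code.

/* Toy user-space model of the flag handshake, not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

#define NEWILB_KICK 0x4u	/* invented flag value for the example */

static atomic_uint nohz_flags;

/* newidle path: only record that the blocked load needs an update */
static void newidle_balance_model(void)
{
	atomic_fetch_or(&nohz_flags, NEWILB_KICK);
}

/* idle entry path: consume the flag and do the update, outside any rq lock */
static void run_idle_balance_model(void)
{
	unsigned int flags = atomic_fetch_and(&nohz_flags, ~NEWILB_KICK);

	if (flags & NEWILB_KICK)
		printf("updating blocked load before entering idle\n");
}

int main(void)
{
	newidle_balance_model();
	run_idle_balance_model();
	return 0;
}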

Changes on v4:
- Add a dedicated bit for updating blocked load when entering idle.
  This simplifies the management of concurrency with kick_ilb.

Changes on v3:
- Fixed a compilation error for !CONFIG_SMP && CONFIG_NO_HZ_COMMON
  reported by kernel test robot 
- Took advantage of this new version to add a short description for
  nohz_run_idle_balance

Changes on v2:
- Fixed some typos and updated some comments
- Added more cleanup
- Changed the way to trigger the ILB in idle thread context to remove a possible
  race condition between the normal softirq ILB and this new mechanism. The
  cpu can already be set in idle_cpus_mask because, even if the cpu is added
  later when entering idle, it might not have been removed yet from the previous
  idle phase.
  
Vincent Guittot (7):
  sched/fair: remove update of blocked load from newidle_balance
  sched/fair: remove unused return of _nohz_idle_balance
  sched/fair: remove unused parameter of update_nohz_stats
  sched/fair: merge for each idle cpu loop of ILB
  sched/fair: reorder newidle_balance pulled_task tests
  sched/fair: trigger the update of blocked load on newly idle cpu
  sched/fair: reduce the window for duplicated update

 kernel/sched/core.c  |   2 +-
 kernel/sched/fair.c  | 118 +--
 kernel/sched/idle.c  |   6 +++
 kernel/sched/sched.h |   7 +++
 4 files changed, 60 insertions(+), 73 deletions(-)

-- 
2.17.1



Re: [RFC PATCH v1] sched/fair: limit load balance redo times at the same sched_domain level

2021-02-23 Thread Vincent Guittot
On Tue, 23 Feb 2021 at 06:41, Li, Aubrey  wrote:
>
> Hi Vincent,
>
> Sorry for the delay, I just returned from Chinese New Year holiday.
>
> On 2021/1/25 22:51, Vincent Guittot wrote:
> > On Mon, 25 Jan 2021 at 15:00, Li, Aubrey  wrote:
> >>
> >> On 2021/1/25 18:56, Vincent Guittot wrote:
> >>> On Mon, 25 Jan 2021 at 06:50, Aubrey Li  wrote:
> >>>>
> >>>> A long-tail load balance cost is observed on the newly idle path,
> >>>> this is caused by a race window between the first nr_running check
> >>>> of the busiest runqueue and its nr_running recheck in detach_tasks.
> >>>>
> >>>> Before the busiest runqueue is locked, the tasks on the busiest
> >>>> runqueue could be pulled by other CPUs and nr_running of the busiest
> >>>> runqueu becomes 1, this causes detach_tasks breaks with LBF_ALL_PINNED
> >>>
> >>> We should better detect that when trying to detach task like below
> >>
> >> This should be a compromise from my understanding. If we give up load 
> >> balance
> >> this time due to the race condition, we do reduce the load balance cost on 
> >> the
> >> newly idle path, but if there is an imbalance indeed at the same 
> >> sched_domain
> >
> > Redo path is there in case, LB has found an imbalance but it can't
> > move some loads from this busiest rq to dest rq because of some cpu
> > affinity. So it tries to fix the imbalance by moving load onto another
> > rq of the group. In your case, the imbalance has disappeared because
> > it has already been pulled by another rq so you don't have to try to
> > find another imbalance. And I would even say you should not in order
> > to let other level to take a chance to spread the load
> >
> >> level, we have to wait the next softirq entry to handle that imbalance. 
> >> This
> >> means the tasks on the second busiest runqueue have to stay longer, which 
> >> could
> >> introduce tail latency as well. That's why I introduced a variable to 
> >> control
> >> the redo loops. I'll send this to the benchmark queue to see if it makes 
> >> any
> >
> > TBH, I don't like multiplying the number of knobs
>
> Sure, I can take your approach, :)
>
> >>>
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -7688,6 +7688,16 @@ static int detach_tasks(struct lb_env *env)
> >>>
> >>> lockdep_assert_held(&env->src_rq->lock);
> >>>
> >>> +   /*
> >>> +* Another CPU has emptied this runqueue in the meantime.
> >>> +* Just return and leave the load_balance properly.
> >>> +*/
> >>> +   if (env->src_rq->nr_running <= 1 && !env->loop) {
>
> May I know why !env->loop is needed here? IIUC, if detach_tasks is invoked

IIRC, my point was to do the test only when trying to detach the 1st
task. A lot of things can happen when a break is involved, but TBH I
can't remember a precise use case. It may be overcautious.

> from LBF_NEED_BREAK, env->loop could be non-zero, but as long as src_rq's
> nr_running <=1, we should return immediately with LBF_ALL_PINNED flag cleared.
>
> How about the following change?
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 04a3ce20da67..1761d33accaa 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7683,8 +7683,11 @@ static int detach_tasks(struct lb_env *env)
>  * We don't want to steal all, otherwise we may be treated 
> likewise,
>  * which could at worst lead to a livelock crash.
>  */
> -   if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
> +   if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 
> 1) {

IMO, we must do the test before the  while (!list_empty(tasks)) {  loop,
because src_rq might have become empty if the waiting tasks have been
pulled by another cpu and the running one became idle in the meantime.
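
A small stand-alone model of that placement, i.e. doing the emptiness check
once before the task-list loop and clearing the all-pinned flag, is sketched
below; types, names and the flag value are invented for the example and this
is not the kernel's detach_tasks().

/* Stand-alone model only, not the kernel's detach_tasks(). */
#include <stdio.h>

#define LBF_ALL_PINNED	0x01

struct busiest_rq {
	int nr_running;		/* tasks still runnable on the busiest rq */
};

static int detach_tasks_model(struct busiest_rq *src, unsigned int *flags)
{
	int detached = 0;

	/*
	 * Check once, before walking the task list: if another CPU already
	 * emptied the runqueue, clear LBF_ALL_PINNED (no task was tested)
	 * and give up without looping at all.
	 */
	if (src->nr_running <= 1) {
		*flags &= ~LBF_ALL_PINNED;
		return 0;
	}

	while (src->nr_running > 1) {
		/* pretend every remaining task can be migrated */
		src->nr_running--;
		detached++;
	}

	return detached;
}

int main(void)
{
	unsigned int flags = LBF_ALL_PINNED;
	struct busiest_rq src = { .nr_running = 1 };	/* raced: already emptied */

	printf("detached=%d all_pinned=%u\n",
	       detach_tasks_model(&src, &flags), flags & LBF_ALL_PINNED);
	return 0;
}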

> +   /* Clear the flag as we will not test any task */
> +   env->flags &= ~LBF_ALL_PINNED;
> break;
> +   }
>
> p = list_last_entry(tasks, struct task_struct, se.group_node);
>
> Thanks,
> -Aubrey


Re: UBSAN: shift-out-of-bounds in load_balance

2021-02-23 Thread Vincent Guittot
On Tue, 23 Feb 2021 at 13:03, Valentin Schneider
 wrote:
>
>
> +Vincent
>
> On 22/02/21 09:12, syzbot wrote:
> > syzbot has found a reproducer for the following issue on:
> >
> > HEAD commit:31caf8b2 Merge branch 'linus' of git://git.kernel.org/pub/..
> > git tree:   upstream
> > console output: https://syzkaller.appspot.com/x/log.txt?x=16ab2682d0
> > kernel config:  https://syzkaller.appspot.com/x/.config?x=b81388f0b32761d4
> > dashboard link: https://syzkaller.appspot.com/bug?extid=d7581744d5fd27c9fbe1
> > syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1277457f50
> >
> > IMPORTANT: if you fix the issue, please add the following tag to the commit:
> > Reported-by: syzbot+d7581744d5fd27c9f...@syzkaller.appspotmail.com
> >
> > 
> > UBSAN: shift-out-of-bounds in kernel/sched/fair.c:7712:14
> > shift exponent 149 is too large for 64-bit type 'long unsigned int'
>
> That 149 is surprising.

Yes, surprising. But is it really a problem in itself? Shifting left
would be a problem because of the overflow, but here we shift right to
divide, and the result is correct.

Besides this, it seems that a significant number of previous attempts
to balance load have been done with another migration_type, otherwise it
would have raised a problem earlier (at 65) if the previous LBs were also
migrate_load. It would be good to understand why the 148 previous
ones failed.

>
> sd->cache_nice_tries is \in {1, 2}, and sd->nr_balance_failed should be in
> the same ballpark.
>
> A successful load_balance() resets it to 0; a failed one increments
> it. Once it gets to sd->cache_nice_tries + 3, this should trigger an active
> balance, which will either set it to sd->cache_nice_tries+1 or reset it to
> 0. There is this one condition that could let it creep up uncontrollably:
>
>   /*
>* Don't kick the active_load_balance_cpu_stop,
>* if the curr task on busiest CPU can't be
>* moved to this_cpu:
>*/
>   if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
>   raw_spin_unlock_irqrestore(&busiest->lock,
>   flags);
>   goto out_one_pinned;
>   }
>
> So despite the resulting sd->balance_interval increase, repeatedly hitting
> this might yield the above. Would we then want something like this?
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8a8bd7b13634..b65c24b5ae91 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7422,6 +7422,11 @@ struct lb_env {
> struct list_head        tasks;
>  };
>
> +static inline unsigned int sd_balance_failed_cap(struct sched_domain *sd)
> +{
> +   return sd->cache_nice_tries + 3;
> +}
> +
>  /*
>   * Is this task likely cache-hot:
>   */
> @@ -9493,7 +9498,7 @@ imbalanced_active_balance(struct lb_env *env)
>  * threads on a system with spare capacity
>  */
> if ((env->migration_type == migrate_task) &&
> -   (sd->nr_balance_failed > sd->cache_nice_tries+2))
> +   (sd->nr_balance_failed >= sd_balance_failed_cap(sd)))
> return 1;
>
> return 0;
> @@ -9737,8 +9742,10 @@ static int load_balance(int this_cpu, struct rq 
> *this_rq,
>  * frequent, pollute the failure counter causing
>  * excessive cache_hot migrations and active balances.
>  */
> -   if (idle != CPU_NEWLY_IDLE)
> -   sd->nr_balance_failed++;
> +   if (idle != CPU_NEWLY_IDLE) {
> +   sd->nr_balance_failed = min(sd->nr_balance_failed + 1,
> +   sd_balance_failed_cap(sd));

nr_balance_failed is an interesting metric that we sometimes want to
monitor, and we would like to be able to divide by more than
2^(sd->cache_nice_tries + 3).

If we really want to prevent an out-of-bounds shift, the below is more
appropriate IMO:

index 636741fa27c9..4d0b3fa30849 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7707,7 +7707,7 @@ static int detach_tasks(struct lb_env *env)
 * migrate.
 */

-   if ((load >> env->sd->nr_balance_failed) > env->imbalance)
+   if ((load >> min_t(int, env->sd->nr_balance_failed, BITS_PER_LONG)) > env->imbalance)
goto next;

env->imbalance -= load;


> +   }
>
> if (need_active_balance(&env)) {
> unsigned long flags;
>


Re: [PATCH] sched/pelt: Fix task util_est update filtering

2021-02-19 Thread Vincent Guittot
On Tue, 16 Feb 2021 at 17:39,  wrote:
>
> From: Vincent Donnefort 
>
> Being called for each dequeue, util_est reduces the number of its updates
> by filtering out when the EWMA signal is different from the task util_avg
> by less than 1%. It is a problem for a sudden util_avg ramp-up. Due to the
> decay from a previous high util_avg, EWMA might now be close enough to
> the new util_avg. No update would then happen while it would leave
> ue.enqueued with an out-of-date value.
>
> Taking into consideration the two util_est members, EWMA and enqueued for
> the filtering, ensures, for both, an up-to-date value.
>
> This is for now an issue only for the trace probe that might return the
> stale value. Functional-wise, it isn't (yet) a problem, as the value is

What do you mean by "it isn't (yet) a problem"? How could this become
a problem?

> always accessed through max(enqueued, ewma).
>

This adds more tests and/or updates of struct avg.util_est. It would
be good to have an idea of the perf impact, especially because this
only fixes a tracing problem.


> This problem has been observed using LISA's UtilConvergence:test_means on
> the sd845c board.
>
> Signed-off-by: Vincent Donnefort 
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 794c2cb945f8..9008e0c42def 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3941,24 +3941,27 @@ static inline void util_est_dequeue(struct cfs_rq 
> *cfs_rq,
> trace_sched_util_est_cfs_tp(cfs_rq);
>  }
>
> +#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
> +
>  /*
> - * Check if a (signed) value is within a specified (unsigned) margin,
> + * Check if a (signed) value is within the (unsigned) util_est margin,
>   * based on the observation that:
>   *
>   * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
>   *
> - * NOTE: this only works when value + maring < INT_MAX.
> + * NOTE: this only works when value + UTIL_EST_MARGIN < INT_MAX.
>   */
> -static inline bool within_margin(int value, int margin)
> +static inline bool util_est_within_margin(int value)
>  {
> -   return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
> +   return ((unsigned int)(value + UTIL_EST_MARGIN - 1) <
> +   (2 * UTIL_EST_MARGIN - 1));
>  }
>
>  static inline void util_est_update(struct cfs_rq *cfs_rq,
>struct task_struct *p,
>bool task_sleep)
>  {
> -   long last_ewma_diff;
> +   long last_ewma_diff, last_enqueued_diff;
> struct util_est ue;
>
> if (!sched_feat(UTIL_EST))
> @@ -3979,6 +3982,8 @@ static inline void util_est_update(struct cfs_rq 
> *cfs_rq,
> if (ue.enqueued & UTIL_AVG_UNCHANGED)
> return;
>
> +   last_enqueued_diff = ue.enqueued;
> +
> /*
>  * Reset EWMA on utilization increases, the moving average is used 
> only
>  * to smooth utilization decreases.
> @@ -3992,12 +3997,19 @@ static inline void util_est_update(struct cfs_rq 
> *cfs_rq,
> }
>
> /*
> -* Skip update of task's estimated utilization when its EWMA is
> +* Skip update of task's estimated utilization when its members are
>  * already ~1% close to its last activation value.
>  */
> last_ewma_diff = ue.enqueued - ue.ewma;
> -   if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
> +   last_enqueued_diff -= ue.enqueued;
> +   if (util_est_within_margin(last_ewma_diff)) {
> +   if (!util_est_within_margin(last_enqueued_diff)) {
> +   ue.ewma = ue.enqueued;
> +   goto done;
> +   }
> +
> return;
> +   }
>
> /*
>  * To avoid overestimation of actual task utilization, skip updates if
> --
> 2.25.1
>


Re: [PATCH 6/7 v3] sched/fair: trigger the update of blocked load on newly idle cpu

2021-02-17 Thread Vincent Guittot
On Wed, 17 Feb 2021 at 12:51, Valentin Schneider
 wrote:
>
> On 15/02/21 16:02, Vincent Guittot wrote:
> > On Fri, 12 Feb 2021 at 20:19, Valentin Schneider
> >  wrote:
> >> I don't think there is anything inherently wrong with it - the
> >> nohz_idle_balance() call resulting from the kick_ilb() IPI will just bail
> >> out due to the flags being cleared here. This wasn't immediately clear to
> >> me however.
> >
> > In fact, I forgot to replace the WARN_ON in nohz_csd_func() by a
> > simple return as reported by kernel test robot / oliver.s...@intel.com
> >
>
> Can't that actually be a problem? kick_ilb() says:
>
>  * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
>  * the first flag owns it; cleared by nohz_csd_func().
>
> So if you have:
>
>   kick_ilb() -> kicks CPU42
>
> And then said CPU42 goes through, before nohz_csd_func(),:
>
>   do_idle() -> nohz_run_idle_balance()
>
> you could have yet another CPU do:
>
>   kick_ilb() -> kicks CPU42
>
> which would break rq->nohz_csd serialization.

Yeah, there are even further problems, and I get some rcu_sched logs on
my large server with one benchmark with one specific parameter, which I
can't reproduce on my smaller system. Right now, I'm working on making
both exclusive, which should mainly be about testing whether this_cpu is
set in nohz.idle_cpus_mask.

>
> >>
> >> > +}
> >> > +


Re: [PATCH 6/7 v3] sched/fair: trigger the update of blocked load on newly idle cpu

2021-02-15 Thread Vincent Guittot
On Fri, 12 Feb 2021 at 20:19, Valentin Schneider
 wrote:
>
> On 12/02/21 15:17, Vincent Guittot wrote:
> > Instead of waking up a random and already idle CPU, we can take advantage
> > of this_cpu being about to enter idle to run the ILB and update the
> > blocked load.
> >
> > Signed-off-by: Vincent Guittot 
> > ---
> >  kernel/sched/fair.c  | 24 +---
> >  kernel/sched/idle.c  |  6 ++
> >  kernel/sched/sched.h |  5 +
> >  3 files changed, 32 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 5d285d93e433..cd0ea635225e 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -10453,6 +10453,24 @@ static bool nohz_idle_balance(struct rq *this_rq, 
> > enum cpu_idle_type idle)
> >   return true;
> >  }
> >
> > +/*
> > + * Check if we need to run the ILB for updating blocked load before 
> > entering
> > + * idle state.
> > + */
> > +void nohz_run_idle_balance(int cpu)
> > +{
> > + unsigned int flags;
> > +
> > + flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
> > +
> > + if (flags && !need_resched()) {
> > + struct rq *rq = cpu_rq(cpu);
> > +
> > + rq->nohz_idle_balance = flags;
> > + nohz_idle_balance(rq, CPU_IDLE);
> > + }
>
> So this can now run a full fledged nohz_idle_balance() if NOHZ_BALANCE_MASK
> is set.

Yes.
>
> I don't think there is anything inherently wrong with it - the
> nohz_idle_balance() call resulting from the kick_ilb() IPI will just bail
> out due to the flags being cleared here. This wasn't immediately clear to
> me however.

In fact, I forgot to replace the WARN_ON in nohz_csd_func() by a
simple return as reported by kernel test robot / oliver.s...@intel.com

>
> > +}
> > +


[PATCH 7/7 v3] sched/fair: reduce the window for duplicated update

2021-02-12 Thread Vincent Guittot
Start updating last_blocked_load_update_tick at the beginning of
update_blocked_averages() to reduce the possibility of another CPU
starting the same update one more time.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cd0ea635225e..7ef0911529ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7852,16 +7852,20 @@ static inline bool others_have_blocked(struct rq *rq)
return false;
 }
 
-static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+static inline void update_blocked_load_tick(struct rq *rq)
 {
-   rq->last_blocked_load_update_tick = jiffies;
+   WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
+}
 
+static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
+{
if (!has_blocked)
rq->has_blocked_load = 0;
 }
 #else
 static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
 static inline bool others_have_blocked(struct rq *rq) { return false; }
+static inline void update_blocked_load_tick(struct rq *rq) {}
 static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) 
{}
 #endif
 
@@ -8022,6 +8026,7 @@ static void update_blocked_averages(int cpu)
struct rq_flags rf;
 
rq_lock_irqsave(rq, &rf);
+   update_blocked_load_tick(rq);
update_rq_clock(rq);
 
decayed |= __update_blocked_others(rq, &done);
@@ -8363,7 +8368,7 @@ static bool update_nohz_stats(struct rq *rq)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
return true;
 
update_blocked_averages(cpu);
-- 
2.17.1



[PATCH 6/7 v3] sched/fair: trigger the update of blocked load on newly idle cpu

2021-02-12 Thread Vincent Guittot
Instead of waking up a random and already idle CPU, we can take advantage
of this_cpu being about to enter idle to run the ILB and update the
blocked load.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c  | 24 +---
 kernel/sched/idle.c  |  6 ++
 kernel/sched/sched.h |  5 +
 3 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5d285d93e433..cd0ea635225e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10453,6 +10453,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum 
cpu_idle_type idle)
return true;
 }
 
+/*
+ * Check if we need to run the ILB for updating blocked load before entering
+ * idle state.
+ */
+void nohz_run_idle_balance(int cpu)
+{
+   unsigned int flags;
+
+   flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
+
+   if (flags && !need_resched()) {
+   struct rq *rq = cpu_rq(cpu);
+
+   rq->nohz_idle_balance = flags;
+   nohz_idle_balance(rq, CPU_IDLE);
+   }
+}
+
 static void nohz_newidle_balance(struct rq *this_rq)
 {
int this_cpu = this_rq->cpu;
@@ -10474,10 +10492,10 @@ static void nohz_newidle_balance(struct rq *this_rq)
return;
 
/*
-* Blocked load of idle CPUs need to be updated.
-* Kick an ILB to update statistics.
+* Set the need to trigger ILB in order to update blocked load
+* before entering idle state.
 */
-   kick_ilb(NOHZ_STATS_KICK);
+   atomic_or(NOHZ_STATS_KICK, nohz_flags(this_cpu));
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 305727ea0677..52a4e9ce2f9b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -261,6 +261,12 @@ static void cpuidle_idle_call(void)
 static void do_idle(void)
 {
int cpu = smp_processor_id();
+
+   /*
+* Check if we need to update some blocked load
+*/
+   nohz_run_idle_balance(cpu);
+
/*
 * If the arch has a polling bit, we maintain an invariant:
 *
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6edc67df3554..17de50acb88d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2374,6 +2374,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
 
 #ifdef CONFIG_SMP
 static inline
-- 
2.17.1



[PATCH 5/7 v3] sched/fair: reorder newidle_balance pulled_task tests

2021-02-12 Thread Vincent Guittot
Reorder the tests and skip useless ones when no load balance has been
performed and rq lock has not been released.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac4add026f32..5d285d93e433 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10584,7 +10584,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
 
-out:
/*
 * While browsing the domains, we released the rq lock, a task could
 * have been enqueued in the meantime. Since we're not going idle,
@@ -10593,14 +10592,15 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
 
-   /* Move the next balance forward */
-   if (time_after(this_rq->next_balance, next_balance))
-   this_rq->next_balance = next_balance;
-
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
+out:
+   /* Move the next balance forward */
+   if (time_after(this_rq->next_balance, next_balance))
+   this_rq->next_balance = next_balance;
+
if (pulled_task)
this_rq->idle_stamp = 0;
else
-- 
2.17.1



[PATCH 3/7 v3] sched/fair: remove unused parameter of update_nohz_stats

2021-02-12 Thread Vincent Guittot
The idle load balance is the only user of update_nohz_stats() and doesn't
use the force parameter. Remove it.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f3f0f8cca061..4573a0abd38a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8352,7 +8352,7 @@ group_type group_classify(unsigned int imbalance_pct,
return group_has_spare;
 }
 
-static bool update_nohz_stats(struct rq *rq, bool force)
+static bool update_nohz_stats(struct rq *rq)
 {
 #ifdef CONFIG_NO_HZ_COMMON
unsigned int cpu = rq->cpu;
@@ -8363,7 +8363,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
return false;
 
-   if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
+   if (!time_after(jiffies, rq->last_blocked_load_update_tick))
return true;
 
update_blocked_averages(cpu);
@@ -10401,7 +10401,7 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 
rq = cpu_rq(balance_cpu);
 
-   has_blocked_load |= update_nohz_stats(rq, true);
+   has_blocked_load |= update_nohz_stats(rq);
 
/*
 * If time for next balance is due,
-- 
2.17.1



[PATCH 1/7 v3] sched/fair: remove update of blocked load from newidle_balance

2021-02-12 Thread Vincent Guittot
newidle_balance() runs with both preemption and IRQs disabled, which
prevents local IRQs from running during this period. The time needed to
update the blocked load of the CPUs varies with the number of CPU cgroups
that still carry non-decayed load, and extends this critical section to an
uncontrolled level.

Remove the update from newidle_balance() and trigger a normal ILB that
will take care of the update instead.
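
For clarity, a minimal sketch of nohz_newidle_balance() once this patch is
applied; the early bail-out checks ahead of the hunk below are elided, and
nohz.has_blocked, nohz.next_blocked and kick_ilb() are the existing NOHZ
bookkeeping and helpers in fair.c:

static void nohz_newidle_balance(struct rq *this_rq)
{
        /* ... early bail-out checks ahead of this hunk elided ... */

        /* Nothing stale to refresh yet */
        if (!READ_ONCE(nohz.has_blocked) ||
            time_before(jiffies, READ_ONCE(nohz.next_blocked)))
                return;

        /*
         * Blocked load of idle CPUs needs to be updated, but don't do it
         * here with preemption and IRQs disabled: kick an ILB instead.
         */
        kick_ilb(NOHZ_STATS_KICK);
}

The rq lock is no longer released and re-taken around the update, which also
removes a window where the lock was dropped on the newly idle path.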

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 33 +
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 59b645e3c4fd..bfe1e235fe01 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7392,8 +7392,6 @@ enum migration_type {
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
 #define LBF_SOME_PINNED0x08
-#define LBF_NOHZ_STATS 0x10
-#define LBF_NOHZ_AGAIN 0x20
 
 struct lb_env {
struct sched_domain *sd;
@@ -8397,9 +8395,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
 
-   if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, 
false))
-   env->flags |= LBF_NOHZ_AGAIN;
-
sgs->group_load += cpu_load(rq);
sgs->group_util += cpu_util(i);
sgs->group_runnable += cpu_runnable(rq);
@@ -8940,11 +8935,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
struct sg_lb_stats tmp_sgs;
int sg_status = 0;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
-   env->flags |= LBF_NOHZ_STATS;
-#endif
-
do {
struct sg_lb_stats *sgs = _sgs;
int local_group;
@@ -8981,14 +8971,6 @@ static inline void update_sd_lb_stats(struct lb_env 
*env, struct sd_lb_stats *sd
/* Tag domain that child domain prefers tasks go to siblings first */
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   if ((env->flags & LBF_NOHZ_AGAIN) &&
-   cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
-
-   WRITE_ONCE(nohz.next_blocked,
-  jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
-   }
-#endif
 
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(>busiest_stat);
@@ -10517,16 +10499,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
return;
 
-   raw_spin_unlock(_rq->lock);
/*
-* This CPU is going to be idle and blocked load of idle CPUs
-* need to be updated. Run the ilb locally as it is a good
-* candidate for ilb instead of waking up another idle CPU.
-* Kick an normal ilb if we failed to do the update.
+* Blocked load of idle CPUs needs to be updated.
+* Kick an ILB to update statistics.
 */
-   if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
-   kick_ilb(NOHZ_STATS_KICK);
-   raw_spin_lock(_rq->lock);
+   kick_ilb(NOHZ_STATS_KICK);
 }
 
 #else /* !CONFIG_NO_HZ_COMMON */
@@ -10587,8 +10564,6 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
update_next_balance(sd, _balance);
rcu_read_unlock();
 
-   nohz_newidle_balance(this_rq);
-
goto out;
}
 
@@ -10654,6 +10629,8 @@ static int newidle_balance(struct rq *this_rq, struct 
rq_flags *rf)
 
if (pulled_task)
this_rq->idle_stamp = 0;
+   else
+   nohz_newidle_balance(this_rq);
 
rq_repin_lock(this_rq, rf);
 
-- 
2.17.1



[PATCH 4/7 v3] sched/fair: merge for each idle cpu loop of ILB

2021-02-12 Thread Vincent Guittot
Remove the special case that handles this_cpu outside the for_each_cpu()
loop when running the ILB. Instead, use for_each_cpu_wrap() and start with
the CPU after this_cpu, so the iteration still finishes with this_cpu.

update_nohz_stats() is now used for this_cpu too and prevents unnecessary
updates. We no longer need a special case to update nohz.next_balance for
this_cpu because it is now handled by the loop like the other CPUs.
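
For illustration, a minimal sketch of the merged iteration inside
_nohz_idle_balance() after this change; balance_cpu, rq, this_cpu and
has_blocked_load are locals of that function, and update_nohz_stats() is the
helper simplified in patch 3/7:

        /*
         * Start with the CPU after this_cpu so the walk ends with this_cpu,
         * giving the other idle CPUs a chance to pull load first.
         */
        for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu + 1) {
                if (!idle_cpu(balance_cpu))
                        continue;

                rq = cpu_rq(balance_cpu);

                /* Rate limited by rq->last_blocked_load_update_tick */
                has_blocked_load |= update_nohz_stats(rq);

                /* ... rebalance_domains() and next_balance tracking elided ... */
        }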

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 32 +++-
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4573a0abd38a..ac4add026f32 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10043,22 +10043,9 @@ static void rebalance_domains(struct rq *rq, enum 
cpu_idle_type idle)
 * When the cpu is attached to null domain for ex, it will not be
 * updated.
 */
-   if (likely(update_next_balance)) {
+   if (likely(update_next_balance))
rq->next_balance = next_balance;
 
-#ifdef CONFIG_NO_HZ_COMMON
-   /*
-* If this CPU has been elected to perform the nohz idle
-* balance. Other idle CPUs have already rebalanced with
-* nohz_idle_balance() and nohz.next_balance has been
-* updated accordingly. This CPU is now running the idle load
-* balance for itself and we need to update the
-* nohz.next_balance accordingly.
-*/
-   if ((idle == CPU_IDLE) && time_after(nohz.next_balance, 
rq->next_balance))
-   nohz.next_balance = rq->next_balance;
-#endif
-   }
 }
 
 static inline int on_null_domain(struct rq *rq)
@@ -10385,8 +10372,12 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
 */
smp_mb();
 
-   for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
-   if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
+   /*
+* Start with the next CPU after this_cpu so we will end with this_cpu
+* and give other idle CPUs a chance to pull load.
+*/
+   for_each_cpu_wrap(balance_cpu,  nohz.idle_cpus_mask, this_cpu+1) {
+   if (!idle_cpu(balance_cpu))
continue;
 
/*
@@ -10432,15 +10423,6 @@ static void _nohz_idle_balance(struct rq *this_rq, 
unsigned int flags,
if (likely(update_next_balance))
nohz.next_balance = next_balance;
 
-   /* Newly idle CPU doesn't need an update */
-   if (idle != CPU_NEWLY_IDLE) {
-   update_blocked_averages(this_cpu);
-   has_blocked_load |= this_rq->has_blocked_load;
-   }
-
-   if (flags & NOHZ_BALANCE_KICK)
-   rebalance_domains(this_rq, CPU_IDLE);
-
WRITE_ONCE(nohz.next_blocked,
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
 
-- 
2.17.1


