[PATCH v5 8/8] arm64: add sysfs cpu_capacity attribute

2016-06-15 Thread Juri Lelli
Add a sysfs cpu_capacity attribute with which it is possible to read and
write (thus over-writing the default values) CPU capacities. This might be
useful in situations where values need changing after boot.

The new attribute shows up as:

 /sys/devices/system/cpu/cpu*/cpu_capacity
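
For illustration only (not part of this patch; the cpu0 path and the value
512 below are arbitrary), the attribute could be exercised from userspace
roughly like this:

	/* read cpu0's capacity, then over-write it -- illustrative sketch */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[32];
		ssize_t n;
		int fd = open("/sys/devices/system/cpu/cpu0/cpu_capacity", O_RDWR);

		if (fd < 0)
			return 1;

		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("cpu0 capacity: %s", buf);
		}

		lseek(fd, 0, SEEK_SET);
		if (write(fd, "512", 3) < 0)	/* over-write the default value */
			perror("write");

		close(fd);
		return 0;
	}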

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Mark Brown 
Cc: Sudeep Holla 
Signed-off-by: Juri Lelli 
---
 arch/arm64/kernel/topology.c | 68 
 1 file changed, 68 insertions(+)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 780c2c7..33a35411 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -37,6 +37,74 @@ static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
+#ifdef CONFIG_PROC_SYSCTL
+#include 
+#include 
+static ssize_t show_cpu_capacity(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   ssize_t rc;
+   int cpunum = cpu->dev.id;
+   unsigned long capacity = arch_scale_cpu_capacity(NULL, cpunum);
+
+   rc = sprintf(buf, "%lu\n", capacity);
+
+   return rc;
+}
+
+static ssize_t store_cpu_capacity(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   int this_cpu = cpu->dev.id, i;
+   unsigned long new_capacity;
+   ssize_t ret;
+
+   if (count) {
+   char *p = (char *) buf;
+
+   ret = kstrtoul(p, 0, &new_capacity);
+   if (ret)
+   return ret;
+   if (new_capacity > SCHED_CAPACITY_SCALE)
+   return -EINVAL;
+
+   for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
+   set_capacity_scale(i, new_capacity);
+   }
+
+   return count;
+}
+
+static DEVICE_ATTR(cpu_capacity,
+  0644,
+  show_cpu_capacity,
+  store_cpu_capacity);
+
+static int register_cpu_capacity_sysctl(void)
+{
+   int i;
+   struct device *cpu;
+
+   for_each_possible_cpu(i) {
+   cpu = get_cpu_device(i);
+   if (!cpu) {
+   pr_err("%s: too early to get CPU%d device!\n",
+  __func__, i);
+   continue;
+   }
+   device_create_file(cpu, &dev_attr_cpu_capacity);
+   }
+
+   return 0;
+}
+late_initcall(register_cpu_capacity_sysctl);
+#endif
+
 static u32 capacity_scale;
 static u32 *raw_capacity;
 static bool cap_parsing_failed;
-- 
2.7.0



[PATCH v5 7/8] arm: add sysfs cpu_capacity attribute

2016-06-15 Thread Juri Lelli
Add a sysfs cpu_capacity attribute with which it is possible to read and
write (thus over-writing the default values) CPU capacities. This might be
useful in situations where values need changing after boot.

The new attribute shows up as:

 /sys/devices/system/cpu/cpu*/cpu_capacity

Cc: Russell King 
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c | 68 ++
 1 file changed, 68 insertions(+)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 2bab290..a240bbd 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -52,6 +52,74 @@ static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
+#ifdef CONFIG_PROC_SYSCTL
+#include 
+#include 
+static ssize_t show_cpu_capacity(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   ssize_t rc;
+   int cpunum = cpu->dev.id;
+   unsigned long capacity = arch_scale_cpu_capacity(NULL, cpunum);
+
+   rc = sprintf(buf, "%lu\n", capacity);
+
+   return rc;
+}
+
+static ssize_t store_cpu_capacity(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   int this_cpu = cpu->dev.id, i;
+   unsigned long new_capacity;
+   ssize_t ret;
+
+   if (count) {
+   char *p = (char *) buf;
+
+   ret = kstrtoul(p, 0, &new_capacity);
+   if (ret)
+   return ret;
+   if (new_capacity > SCHED_CAPACITY_SCALE)
+   return -EINVAL;
+
+   for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
+   set_capacity_scale(i, new_capacity);
+   }
+
+   return count;
+}
+
+static DEVICE_ATTR(cpu_capacity,
+  0644,
+  show_cpu_capacity,
+  store_cpu_capacity);
+
+static int register_cpu_capacity_sysctl(void)
+{
+   int i;
+   struct device *cpu;
+
+   for_each_possible_cpu(i) {
+   cpu = get_cpu_device(i);
+   if (!cpu) {
+   pr_err("%s: too early to get CPU%d device!\n",
+  __func__, i);
+   continue;
+   }
+   device_create_file(cpu, &dev_attr_cpu_capacity);
+   }
+
+   return 0;
+}
+late_initcall(register_cpu_capacity_sysctl);
+#endif
+
 #ifdef CONFIG_OF
 struct cpu_efficiency {
const char *compatible;
-- 
2.7.0



[PATCH v5 3/8] arm, dts: add TC2 cpu capacity-dmips-mhz information

2016-06-15 Thread Juri Lelli
Add TC2 cpu capacity binding information.

Cc: Liviu Dudau 
Cc: Sudeep Holla 
Cc: Lorenzo Pieralisi 
Cc: Rob Herring 
Cc: Pawel Moll 
Cc: Mark Rutland 
Cc: Ian Campbell 
Cc: Kumar Gala 
Cc: Russell King 
Cc: devicet...@vger.kernel.org
Signed-off-by: Juri Lelli 
---

Changes from v1:
  - capacity-scale removed

Changes from v4:
  - binding changed to capacity-dmips-mhz
---
 arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
index 0205c97..45d08cc 100644
--- a/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
+++ b/arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
@@ -39,6 +39,7 @@
reg = <0>;
cci-control-port = <&cci_control1>;
cpu-idle-states = <&CLUSTER_SLEEP_BIG>;
+   capacity-dmips-mhz = <1024>;
};
 
cpu1: cpu@1 {
@@ -47,6 +48,7 @@
reg = <1>;
cci-control-port = <&cci_control1>;
cpu-idle-states = <&CLUSTER_SLEEP_BIG>;
+   capacity-dmips-mhz = <1024>;
};
 
cpu2: cpu@2 {
@@ -55,6 +57,7 @@
reg = <0x100>;
cci-control-port = <&cci_control2>;
cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>;
+   capacity-dmips-mhz = <516>;
};
 
cpu3: cpu@3 {
@@ -63,6 +66,7 @@
reg = <0x101>;
cci-control-port = <&cci_control2>;
cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>;
+   capacity-dmips-mhz = <516>;
};
 
cpu4: cpu@4 {
@@ -71,6 +75,7 @@
reg = <0x102>;
cci-control-port = <&cci_control2>;
cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>;
+   capacity-dmips-mhz = <516>;
};
 
idle-states {
-- 
2.7.0



[PATCH v5 6/8] arm64, dts: add Juno r2 cpu capacity-dmips-mhz information

2016-06-15 Thread Juri Lelli
Add Juno r2 cpu capacity-dmips-mhz binding information.

Cc: Rob Herring 
Cc: Pawel Moll 
Cc: Mark Rutland 
Cc: Ian Campbell 
Cc: Kumar Gala 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Liviu Dudau 
Cc: Sudeep Holla 
Cc: Arnd Bergmann 
Cc: Jon Medhurst 
Cc: Olof Johansson 
Cc: Robin Murphy 
Cc: devicet...@vger.kernel.org
Signed-off-by: Juri Lelli 
---

Changes from v4:
  - new patch since Juno r2 dt has been merged
---
 arch/arm64/boot/dts/arm/juno-r2.dts | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/arm64/boot/dts/arm/juno-r2.dts b/arch/arm64/boot/dts/arm/juno-r2.dts
index 88ecd61..823b614b 100644
--- a/arch/arm64/boot/dts/arm/juno-r2.dts
+++ b/arch/arm64/boot/dts/arm/juno-r2.dts
@@ -90,6 +90,7 @@
next-level-cache = <&A72_L2>;
clocks = <&scpi_dvfs 0>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+   capacity-dmips-mhz = <1024>;
};
 
A72_1: cpu@1 {
@@ -100,6 +101,7 @@
next-level-cache = <&A72_L2>;
clocks = <&scpi_dvfs 0>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+   capacity-dmips-mhz = <1024>;
};
 
A53_0: cpu@100 {
@@ -110,6 +112,7 @@
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+   capacity-dmips-mhz = <485>;
};
 
A53_1: cpu@101 {
@@ -120,6 +123,7 @@
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+   capacity-dmips-mhz = <485>;
};
 
A53_2: cpu@102 {
@@ -130,6 +134,7 @@
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+   capacity-dmips-mhz = <485>;
};
 
A53_3: cpu@103 {
@@ -140,6 +145,7 @@
next-level-cache = <&A53_L2>;
clocks = <&scpi_dvfs 1>;
cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+   capacity-dmips-mhz = <485>;
};
 
A72_L2: l2-cache0 {
-- 
2.7.0



[PATCH v5 4/8] arm64: parse cpu capacity-dmips-mhz from DT

2016-06-15 Thread Juri Lelli
With the introduction of cpu capacity-dmips-mhz bindings, CPU capacities
can now be calculated from values extracted from DT and information
coming from cpufreq. Add parsing of DT information at boot time, and
complement it with cpufreq information. Also, store such information
using per CPU variables, as we do for arm.

Caveat: the information provided by this patch will only start to be used
in the future. We first need to #define arch_scale_cpu_capacity to
something provided by the arch, so that the scheduler's default
implementation (which gets used if arch_scale_cpu_capacity is not defined)
is overridden.
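
A minimal sketch of the kind of override meant here (illustrative only and
not part of this patch; the header placement is an assumption):

	/* e.g. in the arch topology header */
	struct sched_domain;
	unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
	#define arch_scale_cpu_capacity arch_scale_cpu_capacity

With the macro defined, the scheduler's #ifndef'd default falls away and
the per-CPU values stored by this patch get used instead.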

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Mark Brown 
Cc: Sudeep Holla 
Signed-off-by: Juri Lelli 
---

Changes from v1:
  - normalize w.r.t. highest capacity found in DT
  - bailout conditions (all-or-nothing)

Changes from v4:
  - parsing modified to reflect change in binding (capacity-dmips-mhz)
---
 arch/arm64/kernel/topology.c | 145 +++
 1 file changed, 145 insertions(+)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 694f6de..780c2c7 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -19,10 +19,152 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
 
+static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+{
+   return per_cpu(cpu_scale, cpu);
+}
+
+static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+{
+   per_cpu(cpu_scale, cpu) = capacity;
+}
+
+static u32 capacity_scale;
+static u32 *raw_capacity;
+static bool cap_parsing_failed;
+
+static void __init parse_cpu_capacity(struct device_node *cpu_node, int cpu)
+{
+   int ret;
+   u32 cpu_capacity;
+
+   if (cap_parsing_failed)
+   return;
+
+   ret = of_property_read_u32(cpu_node,
+  "capacity-dmips-mhz",
+  &cpu_capacity);
+   if (!ret) {
+   if (!raw_capacity) {
+   raw_capacity = kzalloc(sizeof(*raw_capacity) *
+   num_possible_cpus(), GFP_KERNEL);
+   if (!raw_capacity) {
+   pr_err("cpu_capacity: failed to allocate memory"
+  " for raw capacities\n");
+   cap_parsing_failed = true;
+   return;
+   }
+   }
+   capacity_scale = max(cpu_capacity, capacity_scale);
+   raw_capacity[cpu] = cpu_capacity;
+   pr_debug("cpu_capacity: %s cpu_capacity=%u (raw)\n",
+   cpu_node->full_name, raw_capacity[cpu]);
+   } else {
+   pr_err("cpu_capacity: missing %s raw capacity "
+  "(fallback to 1024 for all CPUs)\n",
+   cpu_node->full_name);
+   cap_parsing_failed = true;
+   kfree(raw_capacity);
+   }
+}
+
+static void normalize_cpu_capacity(void)
+{
+   u64 capacity;
+   int cpu;
+
+   if (WARN_ON(!raw_capacity) || cap_parsing_failed)
+   return;
+
+   pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale);
+   for_each_possible_cpu(cpu) {
+   pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n",
+cpu, raw_capacity[cpu]);
+   capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT)
+   / capacity_scale;
+   set_capacity_scale(cpu, capacity);
+   pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
+   cpu, arch_scale_cpu_capacity(NULL, cpu));
+   }
+}
+
+#ifdef CONFIG_CPU_FREQ
+static cpumask_var_t cpus_to_visit;
+static bool cap_parsing_done;
+
+static int
+init_cpu_capacity_callback(struct notifier_block *nb,
+  unsigned long val,
+  void *data)
+{
+   struct cpufreq_policy *policy = data;
+   int cpu;
+
+   if (cap_parsing_failed || cap_parsing_done)
+   return 0;
+
+   switch (val) {
+   case CPUFREQ_NOTIFY:
+   pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] "
+   "(to_visit=%*pbl)\n",
+   cpumask_pr_args(policy->related_cpus),
+   cpumask_pr_args(cpus_to_visit));
+   cpumask_andnot(cpus_to_visit,
+  cpus_to_visit,
+  policy->related_cpus);
+   for_each_cpu(cpu, policy->related_cpus) {
+   raw_capacity[cpu] = arch_scale_cpu_capacity(NULL, cpu) *
+   

[PATCH v5 2/8] arm: parse cpu capacity-dmips-mhz from DT

2016-06-15 Thread Juri Lelli
With the introduction of cpu capacity-dmips-mhz bindings, CPU capacities
can now be calculated from values extracted from DT and information
coming from cpufreq. Add parsing of DT information at boot time, and
complement it with cpufreq information. We keep the code that can produce
the same information, based on different DT properties and hard-coded
values, as a fallback for backward compatibility.

Caveat: the information provided by this patch will only start to be used
in the future. We first need to #define arch_scale_cpu_capacity to
something provided by the arch, so that the scheduler's default
implementation (which gets used if arch_scale_cpu_capacity is not defined)
is overridden.

Cc: Russell King 
Signed-off-by: Juri Lelli 
---

Changes from v1:
  - normalize w.r.t. highest capacity found in DT
  - bailout conditions (all-or-nothing)

Changes from v4:
  - parsing modified to reflect change in binding (capacity-dmips-mhz)
---
 arch/arm/kernel/topology.c | 145 -
 1 file changed, 144 insertions(+), 1 deletion(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ec279d1..2bab290 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -78,6 +78,134 @@ static unsigned long *__cpu_capacity;
 #define cpu_capacity(cpu)  __cpu_capacity[cpu]
 
 static unsigned long middle_capacity = 1;
+static bool cap_from_dt = true;
+static u32 *raw_capacity;
+static bool cap_parsing_failed;
+static u32 capacity_scale;
+
+static int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu)
+{
+   int ret = 1;
+   u32 cpu_capacity;
+
+   if (cap_parsing_failed)
+   return !ret;
+
+   ret = of_property_read_u32(cpu_node,
+  "capacity-dmips-mhz",
+  &cpu_capacity);
+   if (!ret) {
+   if (!raw_capacity) {
+   raw_capacity = kzalloc(sizeof(*raw_capacity) *
+   num_possible_cpus(), GFP_KERNEL);
+   if (!raw_capacity) {
+   pr_err("cpu_capacity: failed to allocate memory"
+  " for raw capacities\n");
+   cap_parsing_failed = true;
+   return !ret;
+   }
+   }
+   capacity_scale = max(cpu_capacity, capacity_scale);
+   raw_capacity[cpu] = cpu_capacity;
+   pr_debug("cpu_capacity: %s cpu_capacity=%u (raw)\n",
+   cpu_node->full_name, raw_capacity[cpu]);
+   } else {
+   pr_err("cpu_capacity: missing %s raw capacity "
+  "(fallback to 1024 for all CPUs)\n",
+  cpu_node->full_name);
+   cap_parsing_failed = true;
+   kfree(raw_capacity);
+   }
+
+   return !ret;
+}
+
+static void normalize_cpu_capacity(void)
+{
+   u64 capacity;
+   int cpu;
+
+   if (WARN_ON(!raw_capacity) || cap_parsing_failed)
+   return;
+
+   pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale);
+   for_each_possible_cpu(cpu) {
+   capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT)
+   / capacity_scale;
+   set_capacity_scale(cpu, capacity);
+   pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
+   cpu, arch_scale_cpu_capacity(NULL, cpu));
+   }
+}
+
+#ifdef CONFIG_CPU_FREQ
+static cpumask_var_t cpus_to_visit;
+static bool cap_parsing_done;
+
+static int
+init_cpu_capacity_callback(struct notifier_block *nb,
+  unsigned long val,
+  void *data)
+{
+   struct cpufreq_policy *policy = data;
+   int cpu;
+
+   if (cap_parsing_failed || cap_parsing_done)
+   return 0;
+
+   switch (val) {
+   case CPUFREQ_NOTIFY:
+   pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] "
+   "(to_visit=%*pbl)\n",
+   cpumask_pr_args(policy->related_cpus),
+   cpumask_pr_args(cpus_to_visit));
+   cpumask_andnot(cpus_to_visit,
+  cpus_to_visit,
+  policy->related_cpus);
+   for_each_cpu(cpu, policy->related_cpus) {
+   raw_capacity[cpu] = arch_scale_cpu_capacity(NULL, cpu) *
+   policy->max / 1000UL;
+   capacity_scale = max(raw_capacity[cpu], capacity_scale);
+   }
+   if (cpumask_empty(cpus_to_visit)) {
+   normalize_cpu_capacity();
+   kfree(raw_capacity);
+   

Re: [PATCH v5 4/8] arm64: parse cpu capacity-dmips-mhz from DT

2016-06-15 Thread Juri Lelli
On 15/06/16 11:20, Mark Brown wrote:
> On 15 June 2016 at 11:17, Juri Lelli  wrote:
> 
> 
> > Cc: Mark Brown 
> >
> 
> Please use broo...@kernel.org for any upstream work, I'm really not set up
> to handle upstream stuff on my Linaro address.

Sure. Will update the address.

Best,

- Juri


Re: [PATCH v5 1/8] Documentation: arm: define DT cpu capacity-dmips-mhz bindings

2016-06-15 Thread Juri Lelli
On 15/06/16 15:04, Mark Brown wrote:
> On Wed, Jun 15, 2016 at 11:17:50AM +0100, Juri Lelli wrote:
> 
> > +CPU capacities are obtained by running a suitable benchmark. This binding makes
> > +no aspersions on the validity or suitability of any particular benchmark, the
> > +final capacity should, however, be:
> 
> Makes no guarantees.  An aspersion is an insult!
> 

Whoops! :-/

Sure, I'll change that.

> > +CPU capacities are obtained by running the Dhrystone benchmark on each CPU at
> > +max frequency. The obtained DMIPS score is then divided by the frequency (in
> > +MHz) at which the benchmark has been run, so that DMIPS/MHz are obtained.
> > +Such values are then normalized w.r.t. the highest score obtained in the
> > +system.
> 
> Perhaps worth mentioning that caches and so on should be enabled (based
> on previous experience with people doing bringup)?

OK. I'll add something along this line.

Thanks,

- Juri


Re: [PATCH v5 1/8] Documentation: arm: define DT cpu capacity-dmips-mhz bindings

2016-06-15 Thread Juri Lelli
On 15/06/16 14:51, Vincent Guittot wrote:
> On 15 June 2016 at 12:17, Juri Lelli  wrote:

[...]

> > +==
> > +ARM CPUs capacity bindings
> > +==
> > +
> > +==
> > +1 - Introduction
> > +==
> > +
> > +ARM systems may be configured to have cpus with different power/performance
> > +characteristics within the same chip. In this case, additional information
> > +has to be made available to the kernel (the scheduler in particular) for
> 
> not sure that it's worth mentioning the scheduler in particular here
> 

OK. I can remove that bit.

> > +it to be aware of such differences and take decisions accordingly.
> > +

[...]

> > +==
> > +3 - capacity-dmips-mhz
> > +==
> > +
> > +capacity-dmips-mhz is an optional cpu node [1] property: u32 value
> > +representing CPU capacity expressed in normalized DMIPS/MHz. At boot time, the
> > +maximum frequency available to the cpu is then used to calculate the capacity
> > +value internally used by the kernel.
> > +
> > +capacity-dmips-mhz property is all-or-nothing: if it is specified for a cpu
> > +node, it has to be specified for every other cpu nodes, or the system will
> > +fall back to the default capacity value for every CPU. If cpufreq is not
> > +available, final capacities are calculated by directly using capacity-dmips-
> > +mhz values (normalized w.r.t. the highest value found while parsing the DT).
> 
> looks good to me
> 

Great! Thanks for reviewing this.

Best,

- Juri


Re: [PATCH v5 4/8] arm64: parse cpu capacity-dmips-mhz from DT

2016-06-15 Thread Juri Lelli
On 15/06/16 14:49, Mark Brown wrote:
> On Wed, Jun 15, 2016 at 11:17:53AM +0100, Juri Lelli wrote:
> 
> > +   if (!raw_capacity) {
> > +   raw_capacity = kzalloc(sizeof(*raw_capacity) *
> > +   num_possible_cpus(), GFP_KERNEL);
> 
> kcalloc()?
> 

Right. Will change.

> > +   if (!raw_capacity) {
> > +   pr_err("cpu_capacity: failed to allocate memory"
> > +  " for raw capacities\n");
> 
> It's normally better to avoid splitting errors message so people can
> grep if they see the error.
> 

Tried to avoid breaking 80 columns. But, we seem to have longer pr_err
strings already. I'll change that.
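
For reference, a sketch of how the two changes agreed above might look
(illustrative, not the final patch):

	raw_capacity = kcalloc(num_possible_cpus(), sizeof(*raw_capacity),
			       GFP_KERNEL);
	if (!raw_capacity) {
		pr_err("cpu_capacity: failed to allocate memory for raw capacities\n");
		cap_parsing_failed = true;
		return;
	}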

> > +   } else {
> > +   pr_err("cpu_capacity: missing %s raw capacity "
> > +  "(fallback to 1024 for all CPUs)\n",
> > +   cpu_node->full_name);
> 
> That's going to complain fairly loudly for all existing DTs isn't it and
> it's kind of redundant if all the cores have the same capacity (which is
> a very common case)?  How about printing an error only if we already
> found one, or printing a single warning at the end if we didn't get
> anything?

Right, I'll change the condition for which pr_err is emitted. I think
the situation for which we care the most about is when we find partial
information in DT.

Thanks,

- Juri


Re: [PATCH v5 1/8] Documentation: arm: define DT cpu capacity-dmips-mhz bindings

2016-06-16 Thread Juri Lelli
Hi,

On 15/06/16 17:11, Rob Herring wrote:
> On Wed, Jun 15, 2016 at 5:17 AM, Juri Lelli  wrote:

[...]

> > +==
> > +2 - CPU capacity definition
> > +==
> > +
> > +CPU capacity is a number that provides the scheduler information about CPUs
> > +heterogeneity. Such heterogeneity can come from micro-architectural differences
> > +(e.g., ARM big.LITTLE systems) or maximum frequency at which CPUs can run
> > +(e.g., SMP systems with multiple frequency domains). Heterogeneity in this
> > +context is about differing performance characteristics; this binding tries to
> > +capture a first-order approximation of the relative performance of CPUs.
> > +
> > +CPU capacities are obtained by running a suitable benchmark. This binding makes
> > +no aspersions on the validity or suitability of any particular benchmark, the
> > +final capacity should, however, be:
> > +
> > +* A "single-threaded" or CPU affine benchmark
> > +* Divided by the running frequency of the CPU executing the benchmark
> > +* Not subject to dynamic frequency scaling of the CPU
> > +
> > +For the time being we however advise usage of the Dhrystone benchmark. What
> > +above thus becomes:
> > +
> > +CPU capacities are obtained by running the Dhrystone benchmark on each CPU at
> > +max frequency. The obtained DMIPS score is then divided by the frequency (in
> > +MHz) at which the benchmark has been run, so that DMIPS/MHz are obtained.
> > +Such values are then normalized w.r.t. the highest score obtained in the
> > +system.
> 
> So the property says it represents DMIPS/MHz, but we take that and
> "normalize" them back to a made up numbers?

The normalization step is required if one wants to prevent
cross-platform comparisons (I think that's what vendors generally want).
They are not made up, they still come from measured DMIPS/MHz values.

> Perhaps that step should
> be optional. Then paranoid Si vendors can put their fake numbers in
> and end users can update the dts files with real numbers.
> 

But you can also decide to skip that step and put non-normalized
numbers in. This documentation is advising people on what seems to
be the most common way of coming up with values that, once in a DT,
won't be used to compare the performance of different platforms.

Maybe we want to add a paragraph clearly stating this point?

> Is there any point in allowing people to pick their own scale? Why not
> just 100 (as in percent)?
> 

I think we agreed that not picking any particular scale is more flexible
and easier to use. For example, if there is a 2x factor between your cpus
you can simply put 1 and 2 there. But, if you need higher "resolution"
you can put whatever suits you better and we'll use the max as scale.
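
As a worked example (values made up for illustration), the in-kernel
normalisation done by the patch is

	capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT) / capacity_scale;

so with capacity-dmips-mhz = <2> and <1> the two cpus end up with 1024 and
512, while with <1024> and <516> (the TC2 values earlier in this series)
they end up with 1024 and 516.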

Thanks a lot for the review.

Best,

- Juri


[PATCH v4] sched/deadline: remove useless param from setup_new_dl_entity

2016-07-13 Thread Juri Lelli
setup_new_dl_entity() takes two parameters, but it actually uses only
one of them, under a different name, to set up a new dl_entity, after:

 2f9f3fdc928 "sched/deadline: Remove dl_new from struct sched_dl_entity"

as we currently do

 setup_new_dl_entity(&p->dl, &p->dl)

However, before Luca's change we were doing

 setup_new_dl_entity(dl_se, pi_se)

in update_dl_entity() for a dl_se->new entity: we were using pi_se's
parameters (the potential PI donor) for setting up a new entity.

Restore this behaviour (as we want to correctly initialize parameters of
a boosted task that enters DEADLINE) by removing the useless second
parameter of setup_new_dl_entity() and retrieving the top waiter
directly from inside that function.

While we are at it, we also optimize things further by calling
setup_new_dl_entity() only for already queued tasks, since (as pointed out
by Xunlei) we already do the very same update at task wakeup time anyway.

Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Cc: Luca Abeni 
Cc: Xunlei Pang 
Signed-off-by: Juri Lelli 

---
 Changes from v3:
   - call setup_new_dl_entity only if task is enqueued already

 Changes from v2:
   - optimize by calling rt_mutex_get_top_task only if the task is
 boosted (as suggested by Steve)

 Changes from v1:
   - Steve pointed out that we were actually using the second parameter
  to perform initialization
   - Luca confirmed that behavior is slightly changed w.r.t. before his
 change
   - changelog updated and original behavior restored
---
 kernel/sched/deadline.c | 39 ---
 1 file changed, 28 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..dc56f5be0112 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -346,11 +346,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
  * one, and to (try to!) reconcile itself with its own scheduling
  * parameters.
  */
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
-  struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
 {
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+   struct task_struct *pi_task;
+   struct sched_dl_entity *pi_se = dl_se;
 
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
 
@@ -363,6 +364,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
return;
 
/*
+* Use the scheduling parameters of the top pi-waiter task,
+* if we have one from which we can inherit a deadline.
+*/
+   if (dl_se->dl_boosted &&
+   (pi_task = rt_mutex_get_top_task(dl_task_of(dl_se))) &&
+   dl_prio(pi_task->normal_prio))
+   pi_se = &pi_task->dl;
+
+   /*
 * We use the regular wall clock time to set deadlines in the
 * future; in fact, we must consider execution overheads (time
 * spent on hardirq context, etc.).
@@ -1720,19 +1730,26 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-   if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-   setup_new_dl_entity(&p->dl, &p->dl);
 
-   if (task_on_rq_queued(p) && rq->curr != p) {
+   if (task_on_rq_queued(p)) {
+   /*
+* If p is not queued we will update its parameters at next
+* wakeup.
+*/
+   if (dl_time_before(p->dl.deadline, rq_clock(rq)))
+   setup_new_dl_entity(&p->dl);
+
+   if (rq->curr != p) {
 #ifdef CONFIG_SMP
-   if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
-   queue_push_tasks(rq);
+   if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
+   queue_push_tasks(rq);
 #else
-   if (dl_task(rq->curr))
-   check_preempt_curr_dl(rq, p, 0);
-   else
-   resched_curr(rq);
+   if (dl_task(rq->curr))
+   check_preempt_curr_dl(rq, p, 0);
+   else
+   resched_curr(rq);
 #endif
+   }
}
 }
 
-- 
2.7.4



Re: [PATCH v2] sched/deadline: remove useless param from setup_new_dl_entity

2016-07-04 Thread Juri Lelli
On 04/07/16 11:03, Luca Abeni wrote:
> Hi Juri,
> 
> On Wed, 29 Jun 2016 20:07:43 +0100
> Juri Lelli  wrote:
> 
> > setup_new_dl_entity() takes two parameters, but it only actually uses
> > one of them, under a different name, to setup a new dl_entity, after:
> > 
> >  2f9f3fdc928 "sched/deadline: Remove dl_new from struct
> > sched_dl_entity"
> > 
> > as we currently do
> > 
> >  setup_new_dl_entity(&p->dl, &p->dl)
> > 
> > However, before Luca's change we were doing
> > 
> >  setup_new_dl_entity(dl_se, pi_se)
> > 
> > in update_dl_entity() for a dl_se->new entity: we were using pi_se's
> > parameters (the potential PI donor) for setting up a new entity.
> > 
> > Restore this behaviour (as we want to correctly initialize parameters
> > of a boosted task that enters DEADLINE) by removing the useless second
> > parameter of setup_new_dl_entity() and retrieving the top waiter
> > directly from inside that function.
> I did not have time to test this patch yet, but it still looks good to
> me.
> 

Thanks for reviewing it.

Best,

- Juri


Re: [PATCH v2] sched/deadline: remove useless param from setup_new_dl_entity

2016-07-05 Thread Juri Lelli
On 05/07/16 15:37, Wanpeng Li wrote:
> 2016-06-30 3:07 GMT+08:00 Juri Lelli :
> > setup_new_dl_entity() takes two parameters, but it only actually uses
> > one of them, under a different name, to setup a new dl_entity, after:
> >
> >  2f9f3fdc928 "sched/deadline: Remove dl_new from struct sched_dl_entity"
> >
> > as we currently do
> >
> >  setup_new_dl_entity(&p->dl, &p->dl)
> >
> > However, before Luca's change we were doing
> >
> >  setup_new_dl_entity(dl_se, pi_se)
> >
> > in update_dl_entity() for a dl_se->new entity: we were using pi_se's
> > parameters (the potential PI donor) for setting up a new entity.
> >
> > Restore this behaviour (as we want to correctly initialize parameters of
> > a boosted task that enters DEADLINE) by removing the useless second
> > parameter of setup_new_dl_entity() and retrieving the top waiter
> > directly from inside that function.
> >
> > Cc: Ingo Molnar 
> > Cc: Peter Zijlstra 
> > Cc: Steven Rostedt 
> > Cc: Luca Abeni 
> > Signed-off-by: Juri Lelli 
> >
> 
> Reviewed-by: Wanpeng Li 
> 

Thanks!

- Juri


Re: [PATCH v2] sched/deadline: remove useless param from setup_new_dl_entity

2016-07-05 Thread Juri Lelli
On 05/07/16 10:20, Steven Rostedt wrote:
> On Wed, 29 Jun 2016 20:07:43 +0100
> Juri Lelli  wrote:
> 
> 
> > ---
> >  kernel/sched/deadline.c | 14 +++---
> >  1 file changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index fcb7f0217ff4..2000ad2294d5 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -346,11 +346,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
> >   * one, and to (try to!) reconcile itself with its own scheduling
> >   * parameters.
> >   */
> > -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
> > -  struct sched_dl_entity *pi_se)
> > +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> >  {
> > struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > struct rq *rq = rq_of_dl_rq(dl_rq);
> > +   struct task_struct *pi_task = rt_mutex_get_top_task(dl_task_of(dl_se));
> > +   struct sched_dl_entity *pi_se = dl_se;
> >  
> > WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
> >  
> > @@ -363,6 +364,13 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
> > return;
> >  
> > /*
> > +* Use the scheduling parameters of the top pi-waiter task,
> > +* if we have one from which we can inherit a deadline.
> > +*/
> > +   if (pi_task && dl_se->dl_boosted && dl_prio(pi_task->normal_prio))
> > +   pi_se = &pi_task->dl;
> > +
> 
> OK, I'm micro-optimizing now, but hey, isn't this a fast path?
> 
> What about changing the above to:
> 
>   struct task_struct *pi_task;
>   [...]
> 
>   if (dl_se->dl_boosted && dl_prio(pi_task->normal_prio &&
^
OK, we need to reorder these two
V
>   (pi_task = rt_mutex_get_top_task(dl_task_of(dl_se)))
>   pe_se = &pi_task->dl;
> 
> This way we don't need to do any work of looking at
> rt_mutex_get_top_task() for the normal case.
> 

But, yes. Looks good to me. I'll shoot a v3 ASAP.

Thanks,

- Juri


Re: [PATCH v2] sched/deadline: remove useless param from setup_new_dl_entity

2016-07-05 Thread Juri Lelli
On 05/07/16 12:47, Steven Rostedt wrote:
> On Tue, 5 Jul 2016 15:39:33 +0100
> Juri Lelli  wrote:
> 
>   return;
> > > >  
> > > > /*
> > > > +* Use the scheduling parameters of the top pi-waiter task,
> > > > +* if we have one from which we can inherit a deadline.
> > > > +*/
> > > > +   if (pi_task && dl_se->dl_boosted && 
> > > > dl_prio(pi_task->normal_prio))
> > > > +   pi_se = &pi_task->dl;
> > > > +  
> > > 
> > > OK, I'm micro-optimizing now, but hey, isn't this a fast path?
> > > 
> > > What about changing the above to:
> > > 
> > >   struct task_struct *pi_task;
> > >   [...]
> > > 
> > >   if (dl_se->dl_boosted && dl_prio(pi_task->normal_prio &&  
> > ^
> > OK, we need to reorder these two
> > V
> > >   (pi_task = rt_mutex_get_top_task(dl_task_of(dl_se)))
> > >   pe_se = &pi_task->dl;
> 
> Opps, you're right.
> 
> > > 
> > > This way we don't need to do any work of looking at
> > > rt_mutex_get_top_task() for the normal case.
> > >   
> > 
> > But, yes. Looks good to me. I'll shoot a v3 ASAP.
> 
> I have to ask, should there be any check if the dl_se has a shorter
> deadline than the pi one?
> 

Yeah. I wondered the same actually. I convinced myself that, since the
task is boosted, we assume that the donor will have a shorter deadline.
We seem to be doing the same elsewhere, but Luca was saying some time
ago that the DI thing may have some problems and needs to be revised.
Is it fair enough to fix this bit in accordance with the current (maybe
broken) behaviour and then spend time reviewing the whole thing, or do
we want to do both at the same time (which will of course require more
time)?

Best,

- Juri


Re: [RFC PATCH 1/3] sched/fair: Aggregate task utilization only on root cfs_rq

2016-06-02 Thread Juri Lelli
Hi,

minor comment below.

On 01/06/16 20:39, Dietmar Eggemann wrote:
> cpu utilization (cpu_util()) is defined as the cpu (original) capacity
> capped cfs_rq->avg->util_avg signal of the root cfs_rq.
> 
> With the current pelt version, the utilization of a task [en|de]queued
> on/from a cfs_rq, representing a task group other than the root task group
> on a cpu, is not immediately propagated down to the root cfs_rq.
> 
> This makes decisions based on cpu_util() for scheduling or cpu frequency
> settings less accurate in case tasks are running in task groups.
> 
> This patch aggregates the task utilization only on the root cfs_rq,
> essentially avoiding maintaining utilization for a se/cfs_rq representing
> task groups other than the root task group (!entity_is_task(se) and
> &rq_of(cfs_rq)->cfs != cfs_rq).
> 
> The additional if/else condition to set @update_util in
> __update_load_avg() is replaced in 'sched/fair: Change @running of
> __update_load_avg() to @update_util' by providing the information whether
> utilization has to be maintained via an argument to this function.
> 
> The additional requirements for the alignment of the last_update_time of a
> se and the root cfs_rq are handled by the patch 'sched/fair: Sync se with
> root cfs_rq'.
> 
> Signed-off-by: Dietmar Eggemann 
> ---
>  kernel/sched/fair.c | 48 
>  1 file changed, 36 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 218f8e83db73..212becd3708f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2705,6 +2705,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>   u32 contrib;
>   unsigned int delta_w, scaled_delta_w, decayed = 0;
>   unsigned long scale_freq, scale_cpu;
> + int update_util = 0;
>  
>   delta = now - sa->last_update_time;
>   /*
> @@ -2725,6 +2726,12 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>   return 0;
>   sa->last_update_time = now;
>  
> + if (cfs_rq) {
> + if (&rq_of(cfs_rq)->cfs == cfs_rq)

Maybe we can wrap this sort of check in a static inline to improve
readability?

Best,

- Juri


Re: [RFC PATCH 3/3] sched/fair: Change @running of __update_load_avg() to @update_util

2016-06-02 Thread Juri Lelli
Hi,

another minor comment below. :-)

On 01/06/16 20:39, Dietmar Eggemann wrote:
> The information whether a se/cfs_rq should get its load and
> utilization (se representing a task and root cfs_rq) or only its load
> (se representing a task group and cfs_rq owned by this se) updated can
> be passed into __update_load_avg() to avoid the additional if/else
> condition to set update_util.
> 
> @running is changed to @update_util which now carries the information if
> the utilization of the se/cfs_rq should be updated and if the se/cfs_rq
> is running or not.
> 
> Signed-off-by: Dietmar Eggemann 
> ---
>  kernel/sched/fair.c | 42 +-
>  1 file changed, 21 insertions(+), 21 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 3ae8e79fb687..a1c13975cf56 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2669,6 +2669,10 @@ static u32 __compute_runnable_contrib(u64 n)
>  
>  #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
>  
> +#define upd_util_se(se, rng) ((entity_is_task(se) << 1) | (rng))
> +#define upd_util_cfs_rq(cfs_rq) \
> + (((&rq_of(cfs_rq)->cfs == cfs_rq) << 1) | !!cfs_rq->curr)
> +
>  /*
>   * We can represent the historical contribution to runnable average as the
>   * coefficients of a geometric series.  To do this we sub-divide our runnable
> @@ -2699,13 +2703,12 @@ static u32 __compute_runnable_contrib(u64 n)
>   */
>  static __always_inline int
>  __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
> -   unsigned long weight, int running, struct cfs_rq *cfs_rq)
> +   unsigned long weight, int update_util, struct cfs_rq *cfs_rq)
>  {
>   u64 delta, scaled_delta, periods;
>   u32 contrib;
>   unsigned int delta_w, scaled_delta_w, decayed = 0;
>   unsigned long scale_freq, scale_cpu;
> - int update_util = 0;
>  
>   delta = now - sa->last_update_time;
>   /*
> @@ -2726,12 +2729,6 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>   return 0;
>   sa->last_update_time = now;
>  
> - if (cfs_rq) {
> - if (&rq_of(cfs_rq)->cfs == cfs_rq)
> - update_util = 1;
> - } else if (entity_is_task(container_of(sa, struct sched_entity, avg)))
> - update_util = 1;
> -
>   scale_freq = arch_scale_freq_capacity(NULL, cpu);
>   scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
>  
> @@ -2757,7 +2754,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
>   weight * scaled_delta_w;
>   }
>   }
> - if (update_util && running)
> + if (update_util == 0x3)

How about a define for these masks?
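
For instance, something along these lines (a hedged sketch, the names are
made up):

	#define UTIL_AVG_MAINTAIN	0x2	/* se is a task / cfs_rq is the root one */
	#define UTIL_AVG_RUNNING	0x1	/* se/cfs_rq is currently running */

	if (update_util == (UTIL_AVG_MAINTAIN | UTIL_AVG_RUNNING))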

Best,

- Juri


Re: [RFC PATCH 1/3] sched/fair: Aggregate task utilization only on root cfs_rq

2016-06-02 Thread Juri Lelli
On 02/06/16 16:53, Dietmar Eggemann wrote:
> On 02/06/16 10:23, Juri Lelli wrote:
> 
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 218f8e83db73..212becd3708f 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -2705,6 +2705,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg 
> >> *sa,
> >>u32 contrib;
> >>unsigned int delta_w, scaled_delta_w, decayed = 0;
> >>unsigned long scale_freq, scale_cpu;
> >> +  int update_util = 0;
> >>  
> >>delta = now - sa->last_update_time;
> >>/*
> >> @@ -2725,6 +2726,12 @@ __update_load_avg(u64 now, int cpu, struct 
> >> sched_avg *sa,
> >>return 0;
> >>sa->last_update_time = now;
> >>  
> >> +  if (cfs_rq) {
> >> +  if (&rq_of(cfs_rq)->cfs == cfs_rq)
> > 
> > Maybe we can wrap this sort of checks in a static inline improving
> > readability?
> 
> Something like this?
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 218f8e83db73..01b0fa689ef9 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -251,6 +251,18 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> return cfs_rq->rq;
>  }
>  
> +/* cfs_rq "owned" by the root task group */
> +static inline struct cfs_rq *root_rq_of(struct cfs_rq *cfs_rq)
> +{
> +   return &rq_of(cfs_rq)->cfs;
> +}
> +
> +/* Is this cfs_rq "owned" by the root task group ? */
> +static inline bool rq_is_root(struct cfs_rq *cfs_rq)
> +{
> +   return root_rq_of(cfs_rq) == cfs_rq;
> +}
> +
>  /* An entity is a task if it doesn't "own" a runqueue */
>  #define entity_is_task(se) (!se->my_q)
>  
> @@ -376,6 +388,16 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
> return container_of(cfs_rq, struct rq, cfs);
>  }
>  
> +static inline struct cfs_rq *root_rq_of(struct cfs_rq *cfs_rq)
> +{
> +   return cfs_rq;
> +}
> +
> +static inline bool rq_is_root(struct cfs_rq *cfs_rq)
> +{
> +   return true;
> +}
> +
>  #define entity_is_task(se) 1
> 

Looks good to me.

Thanks,

- Juri


Re: [PATCH] sched/rt: RT_RUNTIME_GREED sched feature

2016-11-08 Thread Juri Lelli
Hi Daniel,

On 07/11/16 14:51, Daniel Bristot de Oliveira wrote:
> On 11/07/2016 11:31 AM, Tommaso Cucinotta wrote:

[...]

> > -) only issue might be that, if a non-RT task wakes up after the
> > unthrottle, it will have to wait, but worst-case it will have a chance
> > in the next throttling window
> 
> In the current default behavior (RT_RUNTIME_SHARING), in a domain with
> more than two CPUs, the worst case easily become "infinity," because a
> CPU can borrow runtime from another CPU. There is no guarantee for
> minimum latency for non-rt tasks. Anyway, if the user wants to provide
> such guarantee, they just need not enable this feature, while disabling
> RT_RUNTIME_SHARING (or run the non-rt task as a deadline task ;-))
> 

I could only skim through the patch, so please forgive me if I'm talking
gibberish, but I think what Tommaso is saying is that with your current
approach if an unlucky OTHER task wakes up just after you unthrottled an
rt_rq (by replenishment), it will have to wait until the next throttling
event. I agree that this is still better than current status, and that
you can still configure the system to avoid this from happening. What
I'm wondering though is if we could modify your implementation and only
do the replenishment when the replenishment timer actually fires, but
let RT tasks continue to run, while their rt_rq is throttled, if no
OTHER task is present, or wakes up. I guess this will complicate things,
and maybe doesn't buy us much, just an idea. :)

Otherwise, the patch looks good and useful to me.

Best,

- Juri


Re: [RFC v3 1/6] Track the active utilisation

2016-11-08 Thread Juri Lelli
On 01/11/16 22:10, Luca Abeni wrote:
> Hi Juri,
> 
> On Tue, 1 Nov 2016 16:45:43 +0000
> Juri Lelli  wrote:
> 
> > Hi,
> > 
> > a few nitpicks on subject and changelog and a couple of questions below.
> > 
> > Subject should be changed to something like
> > 
> >  sched/deadline: track the active utilisation
> Ok; that's easy :)
> I guess a similar change should be applied to the subjects of all the
> other patches, right?
> 

Yep. Subjects usually have the form:

 <subsystem>: <short description>

> 
> > 
> > On 24/10/16 16:06, Luca Abeni wrote:
> > > The active utilisation here is defined as the total utilisation of the  
> > 
> > s/The active/Active/
> > s/here//
> > s/of the active/of active/
> Ok; I'll do this in the next revision of the patchset.
> 

Thanks.

> 
> > > active (TASK_RUNNING) tasks queued on a runqueue. Hence, it is increased
> > > when a task wakes up and is decreased when a task blocks.
> > > 
> > > When a task is migrated from CPUi to CPUj, immediately subtract the task's
> > > utilisation from CPUi and add it to CPUj. This mechanism is implemented by
> > > modifying the pull and push functions.
> > > Note: this is not fully correct from the theoretical point of view
> > > (the utilisation should be removed from CPUi only at the 0 lag time),  
> > 
> > a more theoretically sound solution will follow.
> Notice that even the next patch (introducing the "inactive timer") ends up
> migrating the utilisation immediately (on tasks' migration), without waiting
> for the 0-lag time.
> This is because of the reason explained in the following paragraph:
> 

OK, but it is still _more_ theoretically sound. :)

> > > but doing the right thing would be _MUCH_ more complex (leaving the
> > > timer armed when the task is on a different CPU... Inactive timers should
> > > be moved from per-task timers to per-runqueue lists of timers! Bah...)  
> > 
> > I'd remove this paragraph above.
> Ok. Re-reading the changelog, I suspect this is not the correct place for this
> comment.
> 
> 
> > > The utilisation tracking mechanism implemented in this commit can be
> > > fixed / improved by decreasing the active utilisation at the so-called
> > > "0-lag time" instead of when the task blocks.  
> > 
> > And maybe this as well, or put it as more information about the "more
> > theoretically sound" solution?
> Ok... I can remove the paragraph, or point to the next commit (which
> implements the more theoretically sound solution). Is such a "forward
> reference" in changelogs ok?
> 

I'd just say that a better solution will follow. The details about why
it's better might then be put in the changelog and as comments in the
code of the next patch.

> [...]
> > > @@ -947,14 +965,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
> > >   return;
> > >   }
> > >  
> > > + if (p->on_rq == TASK_ON_RQ_MIGRATING)
> > > + add_running_bw(&p->dl, &rq->dl);
> > > +
> > >   /*
> > >* If p is throttled, we do nothing. In fact, if it exhausted
> > >* its budget it needs a replenishment and, since it now is on
> > >* its rq, the bandwidth timer callback (which clearly has not
> > >* run yet) will take care of this.
> > >*/
> > > - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
> > > + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
> > > + add_running_bw(&p->dl, &rq->dl);  
> > 
> > Don't rememeber if we discussed this already, but do we need to add the bw 
> > here
> > even if the task is not actually enqueued until after the replenishment 
> > timer
> > fires?
> I think yes... The active utilization does not depend on the fact that the 
> task
> is on the runqueue or not, but depends on the task's state (in GRUB parlance,
> "inactive" vs "active contending"). In other words, even when a task is 
> throttled
> its utilization must be counted in the active utilization.
> 

OK. Could you add a comment about this point please (so that I don't
forget again :)?

> 
> [...]
> > >   /*
> > >* Since this might be the only -deadline task on the rq,
> > >* this is the right place to try to pull some other one
> > > @@ -1712,6 +1748,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)

Re: [RFC v3 1/6] Track the active utilisation

2016-11-08 Thread Juri Lelli
On 08/11/16 19:17, Luca Abeni wrote:
> Hi Juri,
> 
> On Tue, 8 Nov 2016 17:56:35 +0000
> Juri Lelli  wrote:
> [...]
> > > > >  static void switched_to_dl(struct rq *rq, struct task_struct
> > > > > *p) {
> > > > > + add_running_bw(&p->dl, &rq->dl);
> > > > >  
> > > > >   /* If p is not queued we will update its parameters at
> > > > > next wakeup. */ if (!task_on_rq_queued(p))  
> > > > 
> > > > Don't we also need to remove bw in task_dead_dl()?
> > > I think task_dead_dl() is invoked after invoking dequeue_task_dl(),
> > > which takes care of this... Or am I wrong? (I think I explicitly
> > > tested this, and modifications to task_dead_dl() turned out to be
> > > unneeded)
> > > 
> > 
> > Mmm. You explicitly check that TASK_ON_RQ_MIGRATING or DEQUEUE_SLEEP
> > (which btw can be actually put together with an or condition), so I
> > don't think that any of those turn out to be true when the task dies.
> I might be very wrong here, but I think do_exit() just does something
> like
>   tsk->state = TASK_DEAD;
> and then invokes schedule(), and __schedule() does
> if (!preempt && prev->state) {
> if (unlikely(signal_pending_state(prev->state, prev))) {
> prev->state = TASK_RUNNING;
> } else {
> deactivate_task(rq, prev, DEQUEUE_SLEEP);
>   [...]
> so dequeue_task_dl() will see DEQUEUE_SLEEP... Or am I misunderstanding
> what you are saying?
> 
> > Also, AFAIU, do_exit() works on current and the TASK_DEAD case is
> > handled in finish_task_switch(), so I don't think we are taking care
> > of the "task is dying" condition.
> Ok, so I am missing something... The state is set to TASK_DEAD, and
> then schedule() is called... So, __schedule() sees the dying task as
> "prev" and invokes deactivate_task() with the DEQUEUE_SLEEP flag...
> After that, finish_task_switch() calls task_dead_dl(). Is this wrong?
> If not, why aren't we taking care of the "task is dying" condition?
> 

No, I think you are right. But, semantically this cleanup goes in
task_dead_dl(), IMHO. It's most probably moot if it complicates things,
but it might be helpful to differentiate the case between a task that is
actually going to sleep (and for which we want to activate the timer)
and a task that is dying (and for which we want to release bw
immediately). So, it actually matters for the next patch, not here. But
maybe we want to do things cleanly from the start?

> 
> > Peter, does what I'm saying make any sense? :)
> > 
> > I still have to set up things here to test these patches (sorry, I was
> > travelling), but could you try to create some tasks and that kill them
> > from another shell to see if the accounting deviates or not? Or did
> > you already do this test?
> I think this is one of the tests I tried... 
> I have to check if I changed this code after the test (but I do not
> think I did). Anyway, tomorrow I'll write a script for automating this
> test, and I'll leave it running for some hours.
> 

OK, thanks. As said I think that you actually handle the case already,
but I'll try to set up testing as well soon.

Thanks,

- Juri


Re: [RFC v3 1/6] Track the active utilisation

2016-11-08 Thread Juri Lelli
On 08/11/16 20:09, Luca Abeni wrote:
> Hi again,
> 
> On Tue, 8 Nov 2016 18:53:09 +0000
> Juri Lelli  wrote:
> [...]
> > > > Also, AFAIU, do_exit() works on current and the TASK_DEAD case is
> > > > handled in finish_task_switch(), so I don't think we are taking
> > > > care of the "task is dying" condition.
> > > Ok, so I am missing something... The state is set to TASK_DEAD, and
> > > then schedule() is called... So, __schedule() sees the dying task as
> > > "prev" and invokes deactivate_task() with the DEQUEUE_SLEEP flag...
> > > After that, finish_task_switch() calls task_dead_dl(). Is this
> > > wrong? If not, why aren't we taking care of the "task is dying"
> > > condition?
> > > 
> > 
> > No, I think you are right. But, semantically this cleanup goes in
> > task_dead_dl(), IMHO.
> Just to be sure I understand correctly: you suggest to add a check for
> "state == TASK_DEAD" (skipping the cleanup if the condition is true) in
> dequeue_task_dl(), and to add a sub_running_bw() in task_dead_dl()...
> Is this understanding correct?

This is uglier, I know. It probably makes sense though if we then need it in
the next patch. But you are saying the contrary (we don't actually need it), so
in that case we might just want to add a comment here explaining why we handle
the "task is dying" case together with "task is going to sleep" (so that I
don't forget? :).

> 
> > It's most probably moot if it complicates
> > things, but it might be helpful to differentiate the case between a
> > task that is actually going to sleep (and for which we want to
> > activate the timer) and a task that is dying (and for which we want
> > to release bw immediately).
> I suspect the two cases should be handled in the same way :)
> 
> > So, it actually matters for next patch,
> > not here. But, maybe we want to do things clean from start?
> You mean, because patch 2/6 adds
> +   if (hrtimer_active(&p->dl.inactive_timer)) {
> +   raw_spin_lock_irq(&task_rq(p)->lock);
> +   sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
> +   raw_spin_unlock_irq(&task_rq(p)->lock);
> +   }
> in task_dead_dl()? I suspect this hunk is actually unneeded (worse, it
> is wrong :). I am trying to remember why it is there, but I cannot find
> any reason... In the next days, I'll run some tests to check if that
> hunk is actually needed. If yes, then I'll modify patch 1/6 as you
> suggest; if it is not needed, I'll remove it from patch 2/6 and I'll
> not do this change to patch 1/6... Is this ok?
> 

I guess yes, if we don't need to differentiate. Maybe just add a comment as I
am saying above?

Thanks,

- Juri


Re: [PATCH v7 REPOST 8/9] arm: add sysfs cpu_capacity attribute

2016-11-18 Thread Juri Lelli
Hi Russell,

On 03/11/16 05:28, Juri Lelli wrote:
> Hi,
> 
> apologies for the delay in replying, but I'm attending Linux Plumbers
> this week.
> 
> On 30/10/16 20:45, Russell King - ARM Linux wrote:
> > On Mon, Oct 17, 2016 at 04:46:49PM +0100, Juri Lelli wrote:

[...]

> 
> I should have addressed your comments with the updated version below. If
> it looks good to you I'll superseed the old version with this new one.
> 

The two updated patches are still listed as incoming in your system.
Do you think we will be able to queue them for 4.10? IMHO, it would be
good to have all the pieces go in together at once (Catalin and Sudeep have
already queued their respective bits).

Thanks a lot.

Best,

- Juri


Re: [RFC v3 2/6] Improve the tracking of active utilisation

2016-11-10 Thread Juri Lelli
On 02/11/16 03:35, Luca Abeni wrote:
> On Tue, 1 Nov 2016 22:46:33 +0100
> luca abeni  wrote:
> [...]
> > > > @@ -1074,6 +1161,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
> > > > }
> > > > rcu_read_unlock();
> > > >  
> > > > +   rq = task_rq(p);
> > > > +   raw_spin_lock(&rq->lock);
> > > > +   if (hrtimer_active(&p->dl.inactive_timer)) {
> > > > +   sub_running_bw(&p->dl, &rq->dl);
> > > > +   hrtimer_try_to_cancel(&p->dl.inactive_timer);
> > > 
> > > > Can't we subtract twice if it happens that after we grabbed rq_lock the timer
> > > > fired, so it's now waiting for that lock and it goes ahead and sub_running_bw
> > > > again after we release the lock?
> > Uhm... I somehow convinced myself that this could not happen, but I do not
> > remember the details, sorry :(
> I think I remember the answer now: pi_lock is acquired before invoking select_task_rq
> and is released after invoking enqueue_task... So, if there is a pending inactive
> timer, its handler will be executed after the task is enqueued... It will see the task
> as RUNNING, and will not decrease the active utilisation.
> 

Oh, because we do task_rq_lock() in inactive_task_timer(). So, that should
save us from the double subtract. Would you mind adding something along
the lines of what you said above as a comment for the next version?

Thanks,

- Juri


Re: [RFC v3 2/6] Improve the tracking of active utilisation

2016-11-10 Thread Juri Lelli
On 10/11/16 10:04, Juri Lelli wrote:
> On 02/11/16 03:35, Luca Abeni wrote:
> > On Tue, 1 Nov 2016 22:46:33 +0100
> > luca abeni  wrote:
> > [...]
> > > > > @@ -1074,6 +1161,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
> > > > >   }
> > > > >   rcu_read_unlock();
> > > > >  
> > > > > + rq = task_rq(p);
> > > > > + raw_spin_lock(&rq->lock);
> > > > > + if (hrtimer_active(&p->dl.inactive_timer)) {
> > > > > + sub_running_bw(&p->dl, &rq->dl);
> > > > > + hrtimer_try_to_cancel(&p->dl.inactive_timer);
> > > > 
> > > > Can't we subtract twice if it happens that after we grabbed rq_lock the 
> > > > timer
> > > > fired, so it's now waiting for that lock and it goes ahead and 
> > > > sub_running_bw
> > > > again after we release the lock?  
> > > Uhm... I somehow convinced myself that this could not happen, but I do not
> > > remember the details, sorry :(
> > I think I remember the answer now: pi_lock is acquired before invoking 
> > select_task_rq
> > and is released after invoking enqueue_task... So, if there is a pending 
> > inactive
> > timer, its handler will be executed after the task is enqueued... It will 
> > see the task
> > as RUNNING, and will not decrease the active utilisation.
> > 
> 
> Oh, because we do task_rq_lock() inactive_task_timer(). So, that should
> save us from the double subtract. Would you mind adding something along
> the line of what you said above as a comment for next version?
> 

Mmm, wait again.

Cannot the following happen?

 - inactive_timer fires and does sub_running_bw (as the task is not
   RUNNING)
 - another cpu does try_to_wake_up and blocks on pi_lock
 - inactive timer releases both pi and rq locks (but is still executing,
   let's say it is doing put_task_struct())
 - try_to_wake_up goes ahead and calls select_task_rq_dl
   + it finds inactive_timer active
   + sub_running_bw again :(


Re: [RFC v3 2/6] Improve the tracking of active utilisation

2016-11-10 Thread Juri Lelli
On 10/11/16 13:15, Luca Abeni wrote:
> On Thu, 10 Nov 2016 11:56:10 +
> Juri Lelli  wrote:
> 
> > On 10/11/16 10:04, Juri Lelli wrote:
> > > On 02/11/16 03:35, Luca Abeni wrote:  
> > > > On Tue, 1 Nov 2016 22:46:33 +0100
> > > > luca abeni  wrote:
> > > > [...]  
> > > > > > > @@ -1074,6 +1161,14 @@ select_task_rq_dl(struct task_struct
> > > > > > > *p, int cpu, int sd_flag, int flags) }
> > > > > > >   rcu_read_unlock();
> > > > > > >  
> > > > > > > + rq = task_rq(p);
> > > > > > > + raw_spin_lock(&rq->lock);
> > > > > > > + if (hrtimer_active(&p->dl.inactive_timer)) {
> > > > > > > + sub_running_bw(&p->dl, &rq->dl);
> > > > > > > +
> > > > > > > hrtimer_try_to_cancel(&p->dl.inactive_timer);  
> > > > > > 
> > > > > > Can't we subtract twice if it happens that after we grabbed
> > > > > > rq_lock the timer fired, so it's now waiting for that lock
> > > > > > and it goes ahead and sub_running_bw again after we release
> > > > > > the lock?
> > > > > Uhm... I somehow convinced myself that this could not happen,
> > > > > but I do not remember the details, sorry :(  
> > > > I think I remember the answer now: pi_lock is acquired before
> > > > invoking select_task_rq and is released after invoking
> > > > enqueue_task... So, if there is a pending inactive timer, its
> > > > handler will be executed after the task is enqueued... It will
> > > > see the task as RUNNING, and will not decrease the active
> > > > utilisation. 
> > > 
> > > Oh, because we do task_rq_lock() inactive_task_timer(). So, that
> > > should save us from the double subtract. Would you mind adding
> > > something along the line of what you said above as a comment for
> > > next version? 
> > 
> > Mmm, wait again.
> > 
> > Cannot the following happen?
> > 
> >  - inactive_timer fires and does sub_running_bw (as the task is not
> >RUNNING)
> >  - another cpu does try_to_wake_up and blocks on pi_lock
> >  - inactive timer releases both pi and rq locks (but is still
> > executing, let's say it is doing put_task_struct())
> >  - try_to_wake_up goes ahead and calls select_task_rq_dl
> >+ it finds inactive_timer active
> >+ sub_running_bw again :(
> Uhm... Right; this can happen :(
> 

:(

> Ok; I'll think about some possible solution for this race... If I do
> not find any simple way to solve it, I'll add a "contending" flag,
> which allows to know if the inactive timer handler already executed or
> not.
> 

Right, this might help.

Another thing that I was thinking of is whether we can use the return
value of hrtimer_try_to_cancel() to decide what to do (rough sketch
below):

 - if it returns 0 it means that the callback executed or the timer was
   never set, so nothing to do (as in nothing to sub_running_bw from)
 - if it returns 1 we succeeded, so we need to actively sub_running_bw
 - if -1 we can assume that it will eventually do sub_running_bw() so we
   don't need to care explicitly
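
Something like the below, just to make the idea concrete (rough and
untested, and not claiming it closes the race above by itself):

	rq = task_rq(p);
	raw_spin_lock(&rq->lock);
	switch (hrtimer_try_to_cancel(&p->dl.inactive_timer)) {
	case 1:
		/*
		 * Timer was armed and we cancelled it before it ran: the
		 * utilisation is still accounted, so remove it here.
		 */
		sub_running_bw(&p->dl, &rq->dl);
		break;
	case 0:
		/*
		 * Callback already executed (or timer never armed):
		 * nothing left to subtract.
		 */
		break;
	case -1:
		/*
		 * Callback is running and will (or already did) do its
		 * own sub_running_bw().
		 */
		break;
	}
	raw_spin_unlock(&rq->lock);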

Now I guess the problem is that the task can be migrated while its
inactive_timer is set (by select_task_rq_dl or by other classes' load
balancing if it gets setscheduled to a different class). Can't we store a
back reference to the rq from which the inactive_timer was queued and use
that to sub_running_bw() from? It seems that we might end up with some
"shadow" bandwidth, say when we do a wakeup migration, but maybe this is
something we can tolerate? Just thinking aloud. :)

> BTW, talking about sched_dl_entity flags: I see there are three
> different int fields "dl_throttled, "dl_boosted" and "dl_yielded"; any
> reason for doing this instead of having a "dl_flags" field and setting
> its different bits when the entity is throttled, boosted or yielded? In
> other words: if I need this "contending" flag, should I add a new
> "dl_contending" field?
> 

I think you might want to add a clean-up patch to your series (or a
separate one) fixing the current situation, and then build on it to add
the new flag if needed.
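
Something like the below, maybe (completely untested, names made up just
to sketch what you describe):

	/* in struct sched_dl_entity, replacing the three separate ints */
	unsigned int		dl_flags;

	#define DL_FLAG_THROTTLED	0x01
	#define DL_FLAG_BOOSTED		0x02
	#define DL_FLAG_YIELDED		0x04
	#define DL_FLAG_CONTENDING	0x08	/* the new one */

so that the new information becomes just another bit instead of yet
another int field.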

Thanks,

- Juri


Re: [RFC v3 2/6] Improve the tracking of active utilisation

2016-11-01 Thread Juri Lelli
Hi,

On 24/10/16 16:06, Luca Abeni wrote:
> This patch implements a more theoretically sound algorithm for
> thracking the active utilisation: instead of decreasing it when a

s/thracking/tracking/
s/the//

> task blocks, use a timer (the "inactive timer", named after the
> "Inactive" task state of the GRUB algorithm) to decrease the
> active utilisaation at the so called "0-lag time".

s/utilisaation/utilisation/

> 
> Signed-off-by: Luca Abeni 
> ---
>  include/linux/sched.h   |   1 +
>  kernel/sched/core.c |   1 +
>  kernel/sched/deadline.c | 139 
> ++--
>  kernel/sched/sched.h|   1 +
>  4 files changed, 126 insertions(+), 16 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 348f51b..22543c6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1433,6 +1433,7 @@ struct sched_dl_entity {
>* own bandwidth to be enforced, thus we need one timer per task.
>*/
>   struct hrtimer dl_timer;
> + struct hrtimer inactive_timer;
>  };
>  
>  union rcu_special {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 94732d1..664c618 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2217,6 +2217,7 @@ static void __sched_fork(unsigned long clone_flags, 
> struct task_struct *p)
>  
>   RB_CLEAR_NODE(&p->dl.rb_node);
>   init_dl_task_timer(&p->dl);
> + init_inactive_task_timer(&p->dl);
>   __dl_clear_params(p);
>  
>   INIT_LIST_HEAD(&p->rt.run_list);
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 3d95c1d..80d1541 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -47,6 +47,7 @@ static void add_running_bw(struct sched_dl_entity *dl_se, 
> struct dl_rq *dl_rq)
>  {
>   u64 se_bw = dl_se->dl_bw;
>  
> + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);

This and the one below go in 1/6.

>   dl_rq->running_bw += se_bw;
>  }
>  
> @@ -54,11 +55,52 @@ static void sub_running_bw(struct sched_dl_entity *dl_se, 
> struct dl_rq *dl_rq)
>  {
>   u64 se_bw = dl_se->dl_bw;
>  
> + lockdep_assert_held(&(rq_of_dl_rq(dl_rq))->lock);
>   dl_rq->running_bw -= se_bw;
>   if (WARN_ON(dl_rq->running_bw < 0))
>   dl_rq->running_bw = 0;
>  }
>  
> +static void task_go_inactive(struct task_struct *p)
> +{
> + struct sched_dl_entity *dl_se = &p->dl;
> + struct hrtimer *timer = &dl_se->inactive_timer;
> + struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> + struct rq *rq = rq_of_dl_rq(dl_rq);
> + s64 zerolag_time;
> +
> + WARN_ON(dl_se->dl_runtime == 0);
> +
> + /* If the inactive timer is already armed, return immediately */
> + if (hrtimer_active(&dl_se->inactive_timer))
> + return;
> +
> + zerolag_time = dl_se->deadline -
> +  div64_long((dl_se->runtime * dl_se->dl_period),
> + dl_se->dl_runtime);
> +
> + /*
> +  * Using relative times instead of the absolute "0-lag time"
> +  * allows to simplify the code
> +  */
> + zerolag_time -= rq_clock(rq);
> +
> + /*
> +  * If the "0-lag time" already passed, decrease the active
> +  * utilization now, instead of starting a timer
> +  */
> + if (zerolag_time < 0) {
> + sub_running_bw(dl_se, dl_rq);
> + if (!dl_task(p))
> + __dl_clear_params(p);
> +
> + return;
> + }
> +
> + get_task_struct(p);
> + hrtimer_start(timer, ns_to_ktime(zerolag_time), HRTIMER_MODE_REL);
> +}
> +
>  static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
>  {
>   struct sched_dl_entity *dl_se = &p->dl;
> @@ -514,7 +556,20 @@ static void update_dl_entity(struct sched_dl_entity 
> *dl_se,
>   struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>   struct rq *rq = rq_of_dl_rq(dl_rq);
>  
> - add_running_bw(dl_se, dl_rq);
> + if (hrtimer_is_queued(&dl_se->inactive_timer)) {
> + hrtimer_try_to_cancel(&dl_se->inactive_timer);

Why are we OK with just trying to cancel the inactive timer?

> + WARN_ON(dl_task_of(dl_se)->nr_cpus_allowed > 1);

What's wrong with nr_cpus_allowed > 1 tasks?

> + } else {
> + /*
> +  * The "inactive timer" has been cancelled in
> +  * select_task_rq_dl() (and the acvive utilisation has
> +  * been decreased). So, increase the active utilisation.
> +  * If select_task_rq_dl() could not cancel the timer,
> +  * inactive_task_timer() will * find the task state as
> +  * TASK_RUNNING, and will do nothing, so we are still safe.
> +  */
> + add_running_bw(dl_se, dl_rq);
> + }
>  
>   if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
>   dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
> @@ -602,14 +657,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer 
> *timer)
>  
> 

Re: [RFC v3 1/6] Track the active utilisation

2016-11-01 Thread Juri Lelli
Hi,

a few nitpicks on subject and changelog and a couple of questions below.

Subject should be changed to something like

 sched/deadline: track the active utilisation

On 24/10/16 16:06, Luca Abeni wrote:
> The active utilisation here is defined as the total utilisation of the

s/The active/Active/
s/here//
s/of the active/of active/

> active (TASK_RUNNING) tasks queued on a runqueue. Hence, it is increased
> when a task wakes up and is decreased when a task blocks.
> 
> When a task is migrated from CPUi to CPUj, immediately subtract the task's
> utilisation from CPUi and add it to CPUj. This mechanism is implemented by
> modifying the pull and push functions.
> Note: this is not fully correct from the theoretical point of view
> (the utilisation should be removed from CPUi only at the 0 lag time),

a more theoretically sound solution will follow.

> but doing the right thing would be _MUCH_ more complex (leaving the
> timer armed when the task is on a different CPU... Inactive timers should
> be moved from per-task timers to per-runqueue lists of timers! Bah...)

I'd remove this paragraph above.

> 
> The utilisation tracking mechanism implemented in this commit can be
> fixed / improved by decreasing the active utilisation at the so-called
> "0-lag time" instead of when the task blocks.

And maybe this as well, or put it as more information about the "more
theoretically sound" solution?

> 
> Signed-off-by: Juri Lelli 
> Signed-off-by: Luca Abeni 
> ---
>  kernel/sched/deadline.c | 39 ++-
>  kernel/sched/sched.h|  6 ++
>  2 files changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 37e2449..3d95c1d 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -43,6 +43,22 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se)
>   return !RB_EMPTY_NODE(&dl_se->rb_node);
>  }
>  
> +static void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq 
> *dl_rq)
> +{
> + u64 se_bw = dl_se->dl_bw;
> +
> + dl_rq->running_bw += se_bw;
> +}
> +
> +static void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq 
> *dl_rq)
> +{
> + u64 se_bw = dl_se->dl_bw;
> +
> + dl_rq->running_bw -= se_bw;
> + if (WARN_ON(dl_rq->running_bw < 0))
> + dl_rq->running_bw = 0;
> +}
> +
>  static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
>  {
>   struct sched_dl_entity *dl_se = &p->dl;
> @@ -498,6 +514,8 @@ static void update_dl_entity(struct sched_dl_entity 
> *dl_se,
>   struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>   struct rq *rq = rq_of_dl_rq(dl_rq);
>  
> + add_running_bw(dl_se, dl_rq);
> +
>   if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
>   dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
>   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> @@ -947,14 +965,19 @@ static void enqueue_task_dl(struct rq *rq, struct 
> task_struct *p, int flags)
>   return;
>   }
>  
> + if (p->on_rq == TASK_ON_RQ_MIGRATING)
> + add_running_bw(&p->dl, &rq->dl);
> +
>   /*
>* If p is throttled, we do nothing. In fact, if it exhausted
>* its budget it needs a replenishment and, since it now is on
>* its rq, the bandwidth timer callback (which clearly has not
>* run yet) will take care of this.
>*/
> - if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
> + if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH)) {
> + add_running_bw(&p->dl, &rq->dl);

Don't remember if we discussed this already, but do we need to add the bw here
even if the task is not actually enqueued until after the replenishment timer
fires?

>   return;
> + }
>  
>   enqueue_dl_entity(&p->dl, pi_se, flags);
>  
> @@ -972,6 +995,12 @@ static void dequeue_task_dl(struct rq *rq, struct 
> task_struct *p, int flags)
>  {
>   update_curr_dl(rq);
>   __dequeue_task_dl(rq, p, flags);
> +
> + if (p->on_rq == TASK_ON_RQ_MIGRATING)
> + sub_running_bw(&p->dl, &rq->dl);
> +
> + if (flags & DEQUEUE_SLEEP)
> + sub_running_bw(&p->dl, &rq->dl);
>  }
>  
>  /*
> @@ -1501,7 +1530,9 @@ static int push_dl_task(struct rq *rq)
>   }
>  
>   deactivate_task(rq, next_task, 0);
> + sub_running_bw(&next_task->dl, &rq->dl);
>   set_task_cpu(next_task, later_rq->cpu);

Re: [RFC PATCH] sched/deadline: show leftover runtime and abs deadline in /proc/*/sched

2016-10-25 Thread Juri Lelli
On 25/10/16 11:25, Peter Zijlstra wrote:
> On Tue, Oct 25, 2016 at 12:32:53AM +0200, Tommaso Cucinotta wrote:
> > Hi all,
> > 
> > this is a tiny patch providing readings of the current (leftover)
> > runtime and absolute deadline in /proc/*/sched. Mostly useful for
> > debugging, I heard others playing with SCHED_DEADLINE had some need
> > for similar patches as well.
> > 
> > In addition to debugging, reading the leftover runtime is generally
> > useful for adaptive/incremental RT logics that need to check whether
> > there's enough runtime left for refining the computed result, or just
> > use what we've computed so far and block till the next instance.
> > Also, knowing what the absolute scheduling deadline is (along with
> > what clock it refers to) might be useful for synchronization purposes.
> > (albeit, for real production code, I wouldn't like to parse /proc anyway,
> > rather I'd prefer to retrieve those params via eg sched_getscheduler()?)
> 
> So for programmatic use, this interface is not recommended. For
> debugging this is fine.
> 
> Not sure what form the programmatic interface should take, we have
> precedence in sys_sched_rr_get_interval() for a syscall (we could even
> abuse this one).
> 
> Anybody any ideas?
> 

Maybe extend getattr() to return the actual runtime params? (instead of the
static ones, or alongside them)
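
Something along these lines, maybe (the new fields are completely made up,
just to illustrate the idea):

	struct sched_attr attr;

	/* today this returns the static reservation parameters */
	sched_getattr(pid, &attr, sizeof(attr), 0);

	/*
	 * The idea would be to also report the current values, e.g. via new
	 * (hypothetical) fields filled in from p->dl.runtime/p->dl.deadline:
	 *
	 *	attr.sched_curr_runtime
	 *	attr.sched_curr_deadline
	 */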


Re: [RFC PATCH] sched/deadline: show leftover runtime and abs deadline in /proc/*/sched

2016-10-25 Thread Juri Lelli
Hi,

On 25/10/16 00:32, Tommaso Cucinotta wrote:
> From: Tommaso Cucinotta 

You should probably commit from your sssup address?

Also the changelog is missing. You can simply put part of your cover
letter wording here.

> 
> ---
>  kernel/sched/debug.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index fa178b6..109adc0 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct 
> seq_file *m)
>  #endif
>   P(policy);
>   P(prio);
> + if (p->policy == SCHED_DEADLINE) {
> + P(dl.runtime);
> + P(dl.deadline);
> + }

For testing purposes this looks ok to me.

Best,

- Juri


Re: [PATCH] cpufreq: schedutil: add up/down frequency transition rate limits

2016-11-21 Thread Juri Lelli
On 21/11/16 11:19, Peter Zijlstra wrote:
> On Mon, Nov 21, 2016 at 03:38:05PM +0530, Viresh Kumar wrote:
> > On 17-11-16, 10:48, Viresh Kumar wrote:

[...]

> > 
> > (Background story for others from my discussion with Rafael on IRC: Rafael
> > proposed that instead of this patch we can add down_rate_limit_delta_us (>0 
> > =)
> > which can be added to rate_limit_us (rate limit while increasing freq) to 
> > find
> > the rate limit to be used in the downward direction. And I raised the point
> > that it looks much neater to have separate up and down rate_limit_us. I also
> > said that people may have a valid case where they want to keep 
> > down_rate_limit
> > lower than up_rate_limit and Rafael wasn't fully sure of any such cases).
> > 
> 
> Urgh...
> 
> 
> So no tunables and rate limits here at all please.
> 
> During LPC we discussed the rampup and decay issues and decided that we
> should very much first address them by playing with the PELT stuff.
> Morton was going to play with capping the decay on the util signal. This
> should greatly improve the ramp-up scenario and cure some other wobbles.
> 
> The decay can be set by changing the over-all pelt decay, if so desired.
> 

Do you mean we might want to change the decay (make it different from
ramp-up) once and for all, or maybe make it tunable so that we can
address different power/perf requirements?

> Also, there was the idea of; once the above ideas have all been
> explored; tying the freq ram rate to the power curve.
> 

Yep. That's an interesting one to look at, but it might require some
time.

> So NAK on everything tunable here.


Re: [PATCH] cpufreq: schedutil: add up/down frequency transition rate limits

2016-11-21 Thread Juri Lelli
On 21/11/16 13:26, Peter Zijlstra wrote:
> On Mon, Nov 21, 2016 at 12:14:32PM +0000, Juri Lelli wrote:
> > On 21/11/16 11:19, Peter Zijlstra wrote:
> 
> > > So no tunables and rate limits here at all please.
> > > 
> > > During LPC we discussed the rampup and decay issues and decided that we
> > > should very much first address them by playing with the PELT stuff.
> > > Morton was going to play with capping the decay on the util signal. This
> > > should greatly improve the ramp-up scenario and cure some other wobbles.
> > > 
> > > The decay can be set by changing the over-all pelt decay, if so desired.
> > > 
> > 
> > Do you mean we might want to change the decay (make it different from
> > ramp-up) once for all, or maybe we make it tunable so that we can
> > address different power/perf requirements?
> 
> So the limited decay would be the dominant factor in ramp-up time,
> leaving the regular PELT period the dominant factor for ramp-down.
> 

Hmmm, AFAIU the limited decay will help avoid completely forgetting the
contribution of tasks that sleep for a long time, but it won't modify
the actual ramp-up of the signal. So, for new tasks we will need to play
with a sensible initial value (trading off perf and power as usual).
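
Back-of-the-envelope (assuming the current 32ms half-life, just to make
the concern concrete): a brand new task that starts from zero and runs
flat out sees its util_avg grow roughly as 1024 * (1 - 0.5^(t/32ms)), so
it is only at ~50% after 32ms, ~75% after 64ms and ~90% after ~100ms;
without fudging the initial value, the first OPP requests for it would be
based on a heavily underestimated utilization.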

> (Note that the decay limit would only be applied on the per-task signal,
> not the accumulated signal.)
> 

Right, and since schedutil consumes the latter, we could still suffer
from too frequent frequency switch events I guess (this is where the
down threshold thing came as a quick and dirty fix). Maybe we can think
of some smoothing applied to the accumulated signal, or make it decay
slower (don't really know what this means in practice, though :) ?

> It could be an option, for some, to build the kernel with a PELT window
> of 16ms or so (half its current size), this of course means regenerating
> all the constants etc.. And this very much is a compile time thing.
> 

Right. I seem to remember that helped a bit for mobile type of
workloads. But never did a thorough evaluation.

> We could fairly easy; if this is so desired; make the PELT window size a
> CONFIG option (hidden by default).
> 
> But like everything; patches should come with numbers justifying them
> etc..
> 

Sure. :)

> > > Also, there was the idea of; once the above ideas have all been
> > > explored; tying the freq ram rate to the power curve.
> > > 
> > 
> > Yep. That's an interesting one to look at, but it might require some
> > time.
> 
> Sure, just saying that we should resist knobs until all other avenues
> have been explored. Never start with a knob.


Re: [PATCH] cpufreq: schedutil: add up/down frequency transition rate limits

2016-11-21 Thread Juri Lelli
On 21/11/16 15:43, Peter Zijlstra wrote:
> On Mon, Nov 21, 2016 at 02:37:27PM +0000, Juri Lelli wrote:
> > On 21/11/16 15:17, Peter Zijlstra wrote:
> 
> > > Not sure I follow. So by limiting decay to the task value, the moment we
> > > add it back to the accumulated signal (wakeup), the accumulated signal
> > > jumps up quickly and ramp-up is achieved.
> > > 
> > 
> > This is true, but it seems that this potentially spiky behaviour
> > (which in general depends on tasks composition and periodicity) might
> > affect power savings (as in you don't generally want to switch between
> > high and low freqs too often). So that's why I was just thinking that
> > some sort of smoothing applied to the signal schedutil uses might help.
> 
> Hurm.. so during LPC it was said that fast ramp-up was desired. Note
> that we'll not ramp down this fast, the accumulated signal will decay
> slowly as per blocked-load PELT rules. So only ramp-up is spiky, but
> that is what was desired AFAIU.
> 

Yep, fast ramp-up is quite crucial I'd say. And it's also true that we
should in theory already ramp down more slowly. My worries originate mostly
from our experience with Android devices, for which we ended up
introducing thresholds as per the subject of this thread. But it's also true
that the landscape is different there (e.g., slightly different
governor, different utilization signal, etc.), so I guess we should now
re-evaluate things in light of what we discussed here and at LPC.


Re: [PATCH] cpufreq: schedutil: add up/down frequency transition rate limits

2016-11-21 Thread Juri Lelli
On 21/11/16 15:17, Peter Zijlstra wrote:
> On Mon, Nov 21, 2016 at 01:53:08PM +0000, Juri Lelli wrote:
> > On 21/11/16 13:26, Peter Zijlstra wrote:
> 
> > > So the limited decay would be the dominant factor in ramp-up time,
> > > leaving the regular PELT period the dominant factor for ramp-down.
> > > 
> > 
> > Hmmm, AFAIU the limited decay will help not forgetting completely the
> > contribution of tasks that sleep for a long time, but it won't modify
> > the actual ramp-up of the signal. So, for new tasks we will need to play
> > with a sensible initial value (trading off perf and power as usual).
> 
> Oh, you mean ramp-up for bright spanking new tasks? I forgot the
> details, but I think we can fudge the 'history' such that those too ramp
> up quickly.
> 

Right. I think Vincent had some ideas on this front already.

> > > (Note that the decay limit would only be applied on the per-task signal,
> > > not the accumulated signal.)
> > > 
> > 
> > Right, and since schedutil consumes the latter, we could still suffer
> > from too frequent frequency switch events I guess (this is where the
> > down threshold thing came as a quick and dirty fix). Maybe we can think
> > of some smoothing applied to the accumulated signal, or make it decay
> > slower (don't really know what this means in practice, though :) ?
> 
> Not sure I follow. So by limiting decay to the task value, the moment we
> add it back to the accumulated signal (wakeup), the accumulated signal
> jumps up quickly and ramp-up is achieved.
> 

This is true, but it seems that this potentially spiky behaviour
(which in general depends on task composition and periodicity) might
affect power savings (as in you don't generally want to switch between
high and low freqs too often). That's why I was just thinking that
some sort of smoothing applied to the signal schedutil uses might help.


Re: [PATCH v7 1/8] Documentation: arm: define DT cpu capacity-dmips-mhz bindings

2016-12-12 Thread Juri Lelli
On 12/12/16 12:40, Lorenzo Pieralisi wrote:
> On Mon, Sep 05, 2016 at 03:22:45PM +0100, Juri Lelli wrote:
> 
> [...]
> 
> > +===
> > +5 - References
> > +===
> > +
> > +[1] ARM Linux Kernel documentation - CPUs bindings
> > +Documentation/devicetree/bindings/arm/cpus.txt
> > diff --git a/Documentation/devicetree/bindings/arm/cpus.txt 
> > b/Documentation/devicetree/bindings/arm/cpus.txt
> > index e6782d50cbcd..c1dcf4cade2e 100644
> > --- a/Documentation/devicetree/bindings/arm/cpus.txt
> > +++ b/Documentation/devicetree/bindings/arm/cpus.txt
> > @@ -241,6 +241,14 @@ nodes to be present and contain the properties 
> > described below.
> > # List of phandles to idle state nodes supported
> >   by this cpu [3].
> >  
> > +   - capacity-dmips-mhz
> > +   Usage: Optional
> > +   Value type: 
> > +   Definition:
> > +   # u32 value representing CPU capacity [3] in
> > + DMIPS/MHz, relative to highest capacity-dmips-mhz
> > + in the system.
> > +
> > - rockchip,pmu
> > Usage: optional for systems that have an "enable-method"
> >property value of "rockchip,rk3066-smp"
> > @@ -464,3 +472,5 @@ cpus {
> >  [2] arm/msm/qcom,kpss-acc.txt
> >  [3] ARM Linux kernel documentation - idle states bindings
> >  Documentation/devicetree/bindings/arm/idle-states.txt
> > +[3] ARM Linux kernel documentation - cpu capacity bindings
> > +Documentation/devicetree/bindings/arm/cpu-capacity.txt
> 
> Trivia: bumped into this while reviewing something else, too many
> threes, you will have to fix the added reference up.

Argh! Fixed locally, thanks for catching it.

Best,

- Juri


[RFD PATCH 1/5] sched/cpufreq_schedutil: make use of DEADLINE utilization signal

2017-03-24 Thread Juri Lelli
SCHED_DEADLINE tracks the active utilization signal with a per-rq
variable named running_bw.

Make use of that to drive cpu frequency selection: add up the FAIR and
DEADLINE contributions to get the required CPU capacity to handle both
requirements.
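
Just to put numbers on it (assuming the usual 20-bit fixed point used for
deadline bandwidth, hence the >> 20 below): a 10ms/100ms reservation
contributes dl_bw ~= 0.1 << 20 ~= 104857 to running_bw; converted to
capacity units that becomes (104857 * 1024) >> 20 ~= 102, i.e. roughly 10%
of SCHED_CAPACITY_SCALE, which then gets added on top of the CFS util_avg.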

Co-authored-by: Claudio Scordino 
Signed-off-by: Juri Lelli 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Rafael J. Wysocki 
Cc: Viresh Kumar 
Cc: Luca Abeni 
---
 include/linux/sched/cpufreq.h|  2 --
 kernel/sched/cpufreq_schedutil.c | 13 ++---
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index d2be2ccbb372..39640bb3a8ee 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -11,8 +11,6 @@
 #define SCHED_CPUFREQ_DL   (1U << 1)
 #define SCHED_CPUFREQ_IOWAIT   (1U << 2)
 
-#define SCHED_CPUFREQ_RT_DL(SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
-
 #ifdef CONFIG_CPU_FREQ
 struct update_util_data {
void (*func)(struct update_util_data *data, u64 time, unsigned int 
flags);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index f5ffe241812e..05f5625ea005 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -154,12 +154,11 @@ static unsigned int get_next_freq(struct sugov_policy 
*sg_policy,
 static void sugov_get_util(unsigned long *util, unsigned long *max)
 {
struct rq *rq = this_rq();
-   unsigned long cfs_max;
+   unsigned long dl_util = (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> 
20;
 
-   cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
+   *max = arch_scale_cpu_capacity(NULL, smp_processor_id());
 
-   *util = min(rq->cfs.avg.util_avg, cfs_max);
-   *max = cfs_max;
+   *util = min(rq->cfs.avg.util_avg + dl_util, *max);
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
@@ -207,7 +206,7 @@ static void sugov_update_single(struct update_util_data 
*hook, u64 time,
if (!sugov_should_update_freq(sg_policy, time))
return;
 
-   if (flags & SCHED_CPUFREQ_RT_DL) {
+   if (flags & SCHED_CPUFREQ_RT) {
next_f = policy->cpuinfo.max_freq;
} else {
sugov_get_util(&util, &max);
@@ -242,7 +241,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu 
*sg_cpu)
j_sg_cpu->iowait_boost = 0;
continue;
}
-   if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+   if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
 
j_util = j_sg_cpu->util;
@@ -278,7 +277,7 @@ static void sugov_update_shared(struct update_util_data 
*hook, u64 time,
sg_cpu->last_update = time;
 
if (sugov_should_update_freq(sg_policy, time)) {
-   if (flags & SCHED_CPUFREQ_RT_DL)
+   if (flags & SCHED_CPUFREQ_RT)
next_f = sg_policy->policy->cpuinfo.max_freq;
else
next_f = sugov_next_freq_shared(sg_cpu);
-- 
2.10.0



[RFD PATCH 2/5] sched/deadline: move cpu frequency selection triggering points

2017-03-24 Thread Juri Lelli
Since SCHED_DEADLINE doesn't track a utilization signal (but reserves a
fraction of CPU bandwidth for tasks admitted to the system), there is no
point in evaluating frequency changes during each tick event.

Move frequency selection triggering points to where running_bw changes.

Co-authored-by: Claudio Scordino 
Signed-off-by: Juri Lelli 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Rafael J. Wysocki 
Cc: Viresh Kumar 
Cc: Luca Abeni 
---
 kernel/sched/deadline.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 55471016d73c..5c1a205e830f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -52,6 +52,8 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
dl_rq->running_bw += dl_bw;
SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+   /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+   cpufreq_update_this_cpu(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
 }
 
 static inline
@@ -64,6 +66,8 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
if (dl_rq->running_bw > old)
dl_rq->running_bw = 0;
+   /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+   cpufreq_update_this_cpu(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
 }
 
 static inline
@@ -953,9 +957,6 @@ static void update_curr_dl(struct rq *rq)
return;
}
 
-   /* kick cpufreq (see the comment in kernel/sched/sched.h). */
-   cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
-
schedstat_set(curr->se.statistics.exec_max,
  max(curr->se.statistics.exec_max, delta_exec));
 
-- 
2.10.0



[RFD PATCH 0/5] SCHED_DEADLINE freq/cpu invariance and OPP selection

2017-03-24 Thread Juri Lelli
Hi,

this is a very exploratory set implementing frequency/cpu invariance and OPP
selection for SCHED_DEADLINE. The set has been lightly tested on a Juno
platform. While the actual implementation is very premature, I'm posting this
early to facilitate discussion at OSPM-summit [1].

Results of the testing, highlighting why these features are useful, are
available here:

 - without the set
   https://gist.github.com/a6e3ee99cec32e00cc537b53cd3d54d2

 - with the set
   https://gist.github.com/1f7d485fc3ce9234fe627dcb53b2935c

The set is based on tip/sched/core as of today (bc4278987e38) plus a couple of
schedutil fixes coming from linux-pm/linux-next and Luca's "CPU reclaiming for
SCHED_DEADLINE" v5 [2].

Patches high level description:

 o [01-02]/05 add the necessary links to start accounting DEADLINE contribution
  to OPP selection 
 o 03/05  it's a hack to make it possible (on ARM) to change frequency for
  DEADLINE tasks (that would possibly delay the SCHED_FIFO worker
  kthread); suggestions on how to do this properly are very welcome
 o 04/05  it's a schedutil change that copes with the fact that DEADLINE
  doesn't require periodic OPP selection triggering points
 o 05/05  implements frequency/cpu invariance for tasks' reservation
  parameters*; which basically means that we implement GRUB-PA [3]

Please have a look. Feedback on how we want to shape this is the sole purpose
of this posting.

In case you would like to test this out:

 git://linux-arm.org/linux-jl.git upstream/deadline/freq-rfd

Best,

- Juri

[1] http://retis.sssup.it/ospm-summit/index.html
[2] https://marc.info/?l=linux-kernel&m=149029880524038
[3] C. Scordino, G. Lipari, A Resource Reservation Algorithm for Power-Aware
Scheduling of Periodic and Aperiodic Real-Time Tasks, IEEE Transactions
on Computers, December 2006.

* Notice that this currently breaks !CONFIG_SMP, as
arch_scale_{freq,cpu}_capacity and SCHED_CAPACITY_SCALE are only defined on
CONFIG_SMP. Fixing this particular issue is straightforward, but we should
probably look into making frequency scaling (and PELT averages) available on
!CONFIG_SMP as well (so that schedutil can work on such configurations, for
example). Since this is only an RFD and since a proper rework might be
non-trivial, I decided to leave it out of scope for the time being.

Juri Lelli (5):
  sched/cpufreq_schedutil: make use of DEADLINE utilization signal
  sched/deadline: move cpu frequency selection triggering points
  sched/cpufreq_schedutil: make worker kthread be SCHED_DEADLINE
  sched/cpufreq_schedutil: always consider all CPUs when deciding next
freq
  sched/deadline: make bandwidth enforcement scale-invariant

 include/linux/sched.h|  1 +
 include/linux/sched/cpufreq.h|  2 --
 include/uapi/linux/sched.h   |  1 +
 kernel/sched/core.c  | 19 +--
 kernel/sched/cpufreq_schedutil.c | 37 ++---
 kernel/sched/deadline.c  | 40 +---
 kernel/sched/fair.c  |  2 --
 kernel/sched/sched.h | 10 +-
 8 files changed, 83 insertions(+), 29 deletions(-)

-- 
2.10.0



[RFD PATCH 5/5] sched/deadline: make bandwidth enforcement scale-invariant

2017-03-24 Thread Juri Lelli
Apply frequency and cpu scale-invariance correction factor to bandwidth
enforcement (similar to what we already do to fair utilization tracking).

Each delta_exec gets scaled considering the current frequency and maximum
cpu capacity, which means that the reservation runtime parameter (that
needs to be specified by profiling the task execution at max frequency on
the biggest capacity core) gets scaled accordingly.
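
As a concrete example (numbers made up): with cap_scale() as used below, a
task running for delta_exec = 10ms on a CPU currently at half its maximum
frequency (scale_freq = 512) and with cpu capacity 512 (a little core) gets
charged 10ms * 512/1024 * 512/1024 = 2.5ms of runtime, i.e. its budget
depletes as if the task had been profiled at max frequency on the biggest
core.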

Signed-off-by: Juri Lelli 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Rafael J. Wysocki 
Cc: Viresh Kumar 
Cc: Luca Abeni 
Cc: Claudio Scordino 
---
 kernel/sched/deadline.c | 27 +++
 kernel/sched/fair.c |  2 --
 kernel/sched/sched.h|  2 ++
 3 files changed, 25 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 853de524c6c6..7141d6f51ee0 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -940,7 +940,9 @@ static void update_curr_dl(struct rq *rq)
 {
struct task_struct *curr = rq->curr;
struct sched_dl_entity *dl_se = &curr->dl;
-   u64 delta_exec;
+   u64 delta_exec, scaled_delta_exec;
+   unsigned long scale_freq, scale_cpu;
+   int cpu = cpu_of(rq);
 
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
@@ -974,9 +976,26 @@ static void update_curr_dl(struct rq *rq)
if (unlikely(dl_entity_is_special(dl_se)))
return;
 
-   if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
-   delta_exec = grub_reclaim(delta_exec, rq, curr->dl.dl_bw);
-   dl_se->runtime -= delta_exec;
+   /*
+* XXX When clock frequency is controlled by the scheduler (via
+* schedutil governor) we implement GRUB-PA: the spare reclaimed
+* bandwidth is used to clock down frequency.
+*
+* However, what below seems to assume scheduler to always be in
+* control of clock frequency; when running at a fixed frequency
+* (e.g., performance or userspace governor), shouldn't we instead
+* use the grub_reclaim mechanism below?
+*
+* if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
+*  delta_exec = grub_reclaim(delta_exec, rq, curr->dl.dl_bw);
+* dl_se->runtime -= delta_exec;
+*/
+   scale_freq = arch_scale_freq_capacity(NULL, cpu);
+   scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+   scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+   scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+   dl_se->runtime -= scaled_delta_exec;
 
 throttle:
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2805bd7c8994..37f12d0a3bc4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2818,8 +2818,6 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
 }
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
 /*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series.  To do this we sub-divide our runnable
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b5e81120813..81bd048ed181 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -155,6 +155,8 @@ static inline int task_has_dl_policy(struct task_struct *p)
return dl_policy(p->policy);
 }
 
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 static inline int dl_entity_is_special(struct sched_dl_entity *dl_se)
 {
return dl_se->flags & SCHED_FLAG_SPECIAL;
-- 
2.10.0



[RFD PATCH 3/5] sched/cpufreq_schedutil: make worker kthread be SCHED_DEADLINE

2017-03-24 Thread Juri Lelli
Worker kthread needs to be able to change frequency for all other
threads.

Make it special, just under STOP class.

Signed-off-by: Juri Lelli 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Rafael J. Wysocki 
Cc: Viresh Kumar 
Cc: Luca Abeni 
Cc: Claudio Scordino 
---
 include/linux/sched.h|  1 +
 include/uapi/linux/sched.h   |  1 +
 kernel/sched/core.c  | 19 +--
 kernel/sched/cpufreq_schedutil.c | 15 ---
 kernel/sched/deadline.c  |  6 ++
 kernel/sched/sched.h |  8 +++-
 6 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 952cac87e433..6f508980f320 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,6 +1351,7 @@ extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct 
sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct 
sched_param *);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
+extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr 
*);
 extern struct task_struct *idle_task(int cpu);
 
 /**
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index e2a6c7b3510b..72723859ef74 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -48,5 +48,6 @@
  */
 #define SCHED_FLAG_RESET_ON_FORK   0x01
 #define SCHED_FLAG_RECLAIM 0x02
+#define SCHED_FLAG_SPECIAL 0x04
 
 #endif /* _UAPI_LINUX_SCHED_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 378d402ee7a6..9b211c77cb54 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2495,6 +2495,9 @@ static int dl_overflow(struct task_struct *p, int policy,
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
 
+   if (attr->sched_flags & SCHED_FLAG_SPECIAL)
+   return 0;
+
/* !deadline task may carry old deadline bandwidth */
if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
return 0;
@@ -4052,6 +4055,10 @@ __getparam_dl(struct task_struct *p, struct sched_attr 
*attr)
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
+   /* special dl tasks don't actually use any parameter */
+   if (attr->sched_flags & SCHED_FLAG_SPECIAL)
+   return true;
+
/* deadline != 0 */
if (attr->sched_deadline == 0)
return false;
@@ -4138,7 +4145,9 @@ static int __sched_setscheduler(struct task_struct *p,
}
 
if (attr->sched_flags &
-   ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+   ~(SCHED_FLAG_RESET_ON_FORK |
+ SCHED_FLAG_RECLAIM |
+ SCHED_FLAG_SPECIAL))
return -EINVAL;
 
/*
@@ -4260,7 +4269,8 @@ static int __sched_setscheduler(struct task_struct *p,
}
 #endif
 #ifdef CONFIG_SMP
-   if (dl_bandwidth_enabled() && dl_policy(policy)) {
+   if (dl_bandwidth_enabled() && dl_policy(policy) &&
+   !(attr->sched_flags & SCHED_FLAG_SPECIAL)) {
cpumask_t *span = rq->rd->span;
 
/*
@@ -4390,6 +4400,11 @@ int sched_setattr(struct task_struct *p, const struct 
sched_attr *attr)
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+   return __sched_setscheduler(p, attr, false, true);
+}
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT 
priority of a thread from kernelspace.
  * @p: the task in question.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 05f5625ea005..da67a1cf91e7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -394,7 +394,16 @@ static void sugov_policy_free(struct sugov_policy 
*sg_policy)
 static int sugov_kthread_create(struct sugov_policy *sg_policy)
 {
struct task_struct *thread;
-   struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+   struct sched_attr attr = {
+   .size = sizeof(struct sched_attr),
+   .sched_policy = SCHED_DEADLINE,
+   .sched_flags = SCHED_FLAG_SPECIAL,
+   .sched_nice = 0,
+   .sched_priority = 0,
+   .sched_runtime = 0,
+   .sched_deadline = 0,
+   .sched_period = 0,
+   };
struct cpufreq_policy *policy = sg_policy->policy;
int ret;
 
@@ -412,10 +421,10 @@ static int sugov_kthread_create(struct sugov_policy 
*sg_policy)
return PTR_ERR(thread);
}
 
-   ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+   ret = sched_setattr_nocheck(thread, &attr);


[RFD PATCH 4/5] sched/cpufreq_schedutil: always consider all CPUs when deciding next freq

2017-03-24 Thread Juri Lelli
No assumption can be made upon the rate at which frequency updates get
triggered, as there are scheduling policies (like SCHED_DEADLINE) which
don't trigger them so frequently.

Remove such assumption from the code.

Signed-off-by: Juri Lelli 
Cc: Peter Zijlstra 
Cc: Ingo Molnar 
Cc: Rafael J. Wysocki 
Cc: Viresh Kumar 
Cc: Luca Abeni 
Cc: Claudio Scordino 
---
 kernel/sched/cpufreq_schedutil.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index da67a1cf91e7..40f30373b709 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -233,14 +233,13 @@ static unsigned int sugov_next_freq_shared(struct 
sugov_cpu *sg_cpu)
 * If the CPU utilization was last updated before the previous
 * frequency update and the time elapsed between the last update
 * of the CPU utilization and the last frequency update is long
-* enough, don't take the CPU into account as it probably is
-* idle now (and clear iowait_boost for it).
+* enough, reset iowait_boost, as it probably is not boosted
+* anymore now.
 */
delta_ns = last_freq_update_time - j_sg_cpu->last_update;
-   if (delta_ns > TICK_NSEC) {
+   if (delta_ns > TICK_NSEC)
j_sg_cpu->iowait_boost = 0;
-   continue;
-   }
+
if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
return policy->cpuinfo.max_freq;
 
-- 
2.10.0



Re: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization

2017-03-27 Thread Juri Lelli
On 24/03/17 22:47, Luca Abeni wrote:
> Hi Peter,
> 
> On Fri, 24 Mar 2017 14:20:41 +0100
> Peter Zijlstra  wrote:
> 
> > On Fri, Mar 24, 2017 at 04:52:55AM +0100, luca abeni wrote:
> > 

[...]

> > 
> > In general I feel it would be nice to have a state diagram included
> > somewhere near these two functions. It would be nice to not have to
> > dig out the PDF every time.
> 
> Ok... Since I am not good at ascii art, would it be ok to add a textual
> description? If yes, I'll add a comment like:
> "
> The utilization of a task is added to the runqueue's active utilization
> when the task becomes active (is enqueued in the runqueue), and is

Is enqueued for the first time on a new period, maybe? It seems to be
contradictory w.r.t. what's below (if wakeup before 0 lag time) otherwise.

> removed when the task becomes inactive. A task does not become
> immediately inactive when it blocks, but becomes inactive at the so
> called "0 lag time"; so, we setup the "inactive timer" to fire at the
> "0 lag time". When the "inactive timer" fires, the task utilization is
> removed from the runqueue's active utilization. If the task wakes up
> again on the same runqueue before the "0 lag time", the active
> utilization must not be changed and the "inactive timer" must be
> cancelled. If the task wakes up again on a different runqueue before
> the "0 lag time", then the task's utilization must be removed from the
> previous runqueue's active utilization and must be added to the new
> runqueue's active utilization.
> In order to avoid races between a task waking up on a runqueue while the
> "inactive timer" is running on a different CPU, the "dl_non_contending"
> flag is used to indicate that a task is not on a runqueue but is active
> (so, the flag is set when the task blocks and is cleared when the
> "inactive timer" fires or when the task  wakes up).
> "
> (if this is ok, where can I add this comment?)
> 

Thanks for this Luca. Not sure it adds much to your text above, but we
might want to consider adding something like below?

--->8---
               1st enqueue    +--------------------+
             +--------------->|  ACTIVEcontending  |
             |                +---------+------+---+
             |                          |      ^
      +------+-----+            dequeue |      | wakeup before
      |  INACTIVE  |                    |      | 0 lag time
      +------+-----+                    |      |
             ^                          v      |
             |                +---------+------+---+
             +----------------+ ACTIVEnonCONTENDING|
          0 lag time          +--------------------+
          elapsed
--->8---
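
(As a concrete example of the "0 lag time", if I read the formula in the
patch right: a task with dl_runtime = 10ms and dl_period = 100ms that
blocks with 4ms of runtime left has its 0-lag time at
deadline - 4ms * 100ms / 10ms = deadline - 40ms, which is when the
inactive timer fires and the task moves from ACTIVEnonCONTENDING to
INACTIVE.)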

Thanks,

- Juri


Re: [RFC v5 2/9] sched/deadline: improve the tracking of active utilization

2017-03-27 Thread Juri Lelli
On 27/03/17 09:43, Luca Abeni wrote:
> Hi Juri,
> 
> On Mon, 27 Mar 2017 08:17:45 +0100
> Juri Lelli  wrote:
> [...]
> > > > In general I feel it would be nice to have a state diagram
> > > > included somewhere near these two functions. It would be nice to
> > > > not have to dig out the PDF every time.  
> > > 
> > > Ok... Since I am not good at ascii art, would it be ok to add a
> > > textual description? If yes, I'll add a comment like:
> > > "
> > > The utilization of a task is added to the runqueue's active
> > > utilization when the task becomes active (is enqueued in the
> > > runqueue), and is  
> > 
> > Is enqueued for the first time on a new period, maybe? It seems to be
> > contradictory w.r.t. what below (if wakeup before 0 lag time)
> > otherwise.
> I think it should be "is enqueued in the runqueue and was previously
> not active" (I did not write the "and was previously not active" to

Right.

> avoid complicanting the sentence even more... But this
> "simplification" was not a good idea :). The fact that this happens in a
> new period or not is (in my understanding) irrelevant...
> 
> 
> > > removed when the task becomes inactive. A task does not become
> > > immediately inactive when it blocks, but becomes inactive at the so
> > > called "0 lag time"; so, we setup the "inactive timer" to fire at
> > > the "0 lag time". When the "inactive timer" fires, the task
> > > utilization is removed from the runqueue's active utilization. If
> > > the task wakes up again on the same runqueue before the "0 lag
> > > time", the active utilization must not be changed and the "inactive
> > > timer" must be cancelled. If the task wakes up again on a different
> > > runqueue before the "0 lag time", then the task's utilization must
> > > be removed from the previous runqueue's active utilization and must
> > > be added to the new runqueue's active utilization.
> > > In order to avoid races between a task waking up on a runqueue
> > > while the "inactive timer" is running on a different CPU, the
> > > "dl_non_contending" flag is used to indicate that a task is not on
> > > a runqueue but is active (so, the flag is set when the task blocks
> > > and is cleared when the "inactive timer" fires or when the task
> > > wakes up). "
> > > (if this is ok, where can I add this comment?)
> > >   
> > 
> > Thanks for this Luca. Not sure it adds much to your text above, but we
> > might want to consider adding something like below?
> > 
> > --->8---  
> >                1st enqueue    +--------------------+
> >              +--------------->|  ACTIVEcontending  |
> >              |                +---------+------+---+
> >              |                          |      ^
> >       +------+-----+            dequeue |      | wakeup before
> >       |  INACTIVE  |                    |      | 0 lag time
> >       +------+-----+                    |      |
> >              ^                          v      |
> >              |                +---------+------+---+
> >              +----------------+ ACTIVEnonCONTENDING|
> >           0 lag time          +--------------------+
> >           elapsed
> > --->8---  
> 
> I am not sure if introducing the "active non contending" name is a good
> idea or not (see my previous email), but I am not the best person to
> decide this... If people like this figure, I am more than happy to add
> it :)
> (but then maybe we can change "0 lag time elapsed" with "inactive timer
> fires" and we can display in the figure the state of the
> "dl_non_contending"/"inactive_timer_armed" flag)
> 

Sure. Let's see what people think about what you say in the other email
and I'll update the figure accordingly.


[PATCH v3 1/9] Documentation: arm: fix wrong reference number in DT definition

2017-03-27 Thread Juri Lelli
Reference to cpu capacity binding has a wrong number. Fix it.

Reported-by: Lorenzo Pieralisi 
Signed-off-by: Juri Lelli 
Acked-by: Rob Herring 
---
 Documentation/devicetree/bindings/arm/cpus.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/cpus.txt 
b/Documentation/devicetree/bindings/arm/cpus.txt
index 698ad1f097fa..83ec3fa0a05e 100644
--- a/Documentation/devicetree/bindings/arm/cpus.txt
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -248,7 +248,7 @@ nodes to be present and contain the properties described 
below.
Usage: Optional
Value type: 
Definition:
-   # u32 value representing CPU capacity [3] in
+   # u32 value representing CPU capacity [4] in
  DMIPS/MHz, relative to highest capacity-dmips-mhz
  in the system.
 
@@ -475,5 +475,5 @@ cpus {
 [2] arm/msm/qcom,kpss-acc.txt
 [3] ARM Linux kernel documentation - idle states bindings
 Documentation/devicetree/bindings/arm/idle-states.txt
-[3] ARM Linux kernel documentation - cpu capacity bindings
+[4] ARM Linux kernel documentation - cpu capacity bindings
 Documentation/devicetree/bindings/arm/cpu-capacity.txt
-- 
2.10.0



[PATCH v3 0/9] Fix issues and factorize arm/arm64 capacity information code

2017-03-27 Thread Juri Lelli
Hi,

arm and arm64 topology.c share a lot of code related to parsing of capacity
information. This is v3 of a solution [1] (based on Will's, Catalin's and
Mark's off-line suggestions) to move such common code to a single place:
drivers/base/arch_topology.c (by creating that file and conditionally compiling
it for arm and arm64 only).

First 4 patches are actually fixes for the current code.

Patch 5 is the actual refactoring.

Patch 6 is a minor change suggested by Greg and can be squashed as needed.

Patch 7 removes one of the extern symbols by slightly changing the
now-common code.

Patch 8 removes the remaining externs (as required by Russell during v1 review)
by creating a new header file include/linux/arch_topology.h and including that
from arm, arm64 and drivers.

Last patch addresses Dietmar's comments to v1 and adds an 'atd_' prefix to
interfaces exported by drivers code and used by arch (and potentially others in
the future).

Changes from v2:

 - rebase on top of 4.11-rc4
 - fix various problems pointed out by Greg, thanks for the review!
   (see patch 5 for details)

The set is based on top of linux/master (4.11-rc4 c02ed2e75ef4) and it is also
available from:

 git://linux-arm.org/linux-jl.git upstream/default_caps_factorize-v3

Best,

- Juri

[1] v1 - https://marc.info/?l=linux-kernel&m=148483680119355&w=2
v2 - https://marc.info/?l=linux-kernel&m=148663344018205&w=2

Juri Lelli (9):
  Documentation: arm: fix wrong reference number in DT definition
  Documentation/ABI: add information about cpu_capacity
  arm: fix return value of parse_cpu_capacity
  arm: remove wrong CONFIG_PROC_SYSCTL ifdef
  arm, arm64: factorize common cpu capacity default code
  drivers: remove useless comment from base/arch_topology.c
  arm,arm64,drivers: reduce scope of cap_parsing_failed
  arm,arm64,drivers: move externs in a new header file
  arm,arm64,drivers: add a prefix to drivers arch_topology interfaces

 Documentation/ABI/testing/sysfs-devices-system-cpu |   7 +
 Documentation/devicetree/bindings/arm/cpus.txt |   4 +-
 arch/arm/Kconfig   |   1 +
 arch/arm/kernel/topology.c | 221 +--
 arch/arm64/Kconfig |   1 +
 arch/arm64/kernel/topology.c   | 226 +--
 drivers/base/Kconfig   |   8 +
 drivers/base/Makefile  |   1 +
 drivers/base/arch_topology.c   | 241 +
 include/linux/arch_topology.h  |  17 ++
 10 files changed, 288 insertions(+), 439 deletions(-)
 create mode 100644 drivers/base/arch_topology.c
 create mode 100644 include/linux/arch_topology.h

-- 
2.10.0



[PATCH v3 2/9] Documentation/ABI: add information about cpu_capacity

2017-03-27 Thread Juri Lelli
/sys/devices/system/cpu/cpu#/cpu_capacity describes information about
CPUs heterogeneity (ref. to
Documentation/devicetree/bindings/arm/cpu-capacity.txt).

Add such description.
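
For example (values purely illustrative), on a big.LITTLE platform a
LITTLE CPU could show something like

 /sys/devices/system/cpu/cpu0/cpu_capacity : 446

while the big CPUs would show 1024, since capacities are normalized so
that the highest-capacity CPU reads SCHED_CAPACITY_SCALE.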

Signed-off-by: Juri Lelli 
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 2a4a423d08e0..f3d5817c4ef0 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -366,3 +366,10 @@ Contact:   Linux ARM Kernel Mailing list 

 Description:   AArch64 CPU registers
'identification' directory exposes the CPU ID registers for
 identifying model and revision of the CPU.
+
+What:  /sys/devices/system/cpu/cpu#/cpu_capacity
+Date:  December 2016
+Contact:   Linux kernel mailing list 
+Description:   information about CPUs heterogeneity.
+
+   cpu_capacity: capacity of cpu#.
-- 
2.10.0



[PATCH v3 4/9] arm: remove wrong CONFIG_PROC_SYSCTL ifdef

2017-03-27 Thread Juri Lelli
The sysfs cpu_capacity entry for each CPU has nothing to do with
PROC_FS, nor is it in the /proc/sys path.

Remove such ifdef.

Cc: Russell King 
Reported-and-suggested-by: Sudeep Holla 
Fixes: 7e5930aaef5d ('ARM: 8622/3: add sysfs cpu_capacity attribute')
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 4e4af809606a..162c82aeed96 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -57,7 +57,6 @@ static void set_capacity_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
-#ifdef CONFIG_PROC_SYSCTL
 static ssize_t cpu_capacity_show(struct device *dev,
 struct device_attribute *attr,
 char *buf)
@@ -114,7 +113,6 @@ static int register_cpu_capacity_sysctl(void)
return 0;
 }
 subsys_initcall(register_cpu_capacity_sysctl);
-#endif
 
 #ifdef CONFIG_OF
 struct cpu_efficiency {
-- 
2.10.0



[PATCH v3 7/9] arm,arm64,drivers: reduce scope of cap_parsing_failed

2017-03-27 Thread Juri Lelli
Reduce the scope of cap_parsing_failed (making it static in
drivers/base/arch_topology.c) by slightly changing {arm,arm64} DT
parsing code.

For arm, checking for !cap_parsing_failed before calling
normalize_cpu_capacity() is superfluous, as returning an error from
parse_cpu_capacity() (above) means cap_from_dt is set to false.

For arm64 we can simply check if raw_capacity points to something,
which it does not if capacity parsing has failed.

Suggested-by: Morten Rasmussen 
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c   | 3 +--
 arch/arm64/kernel/topology.c | 5 +
 drivers/base/arch_topology.c | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 49ef025ffaa0..1e35a3265ddf 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -76,7 +76,6 @@ static unsigned long *__cpu_capacity;
 
 static unsigned long middle_capacity = 1;
 static bool cap_from_dt = true;
-extern bool cap_parsing_failed;
 extern void normalize_cpu_capacity(void);
 extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
@@ -165,7 +164,7 @@ static void __init parse_dt_topology(void)
middle_capacity = ((max_capacity / 3)
>> (SCHED_CAPACITY_SHIFT-1)) + 1;
 
-   if (cap_from_dt && !cap_parsing_failed)
+   if (cap_from_dt)
normalize_cpu_capacity();
 }
 
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index c5bc31eb97e8..7e1f6f75185b 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -27,7 +27,6 @@
 #include 
 #include 
 
-extern bool cap_parsing_failed;
 extern void normalize_cpu_capacity(void);
 extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
@@ -187,10 +186,8 @@ static int __init parse_dt_topology(void)
 * cluster with restricted subnodes.
 */
map = of_get_child_by_name(cn, "cpu-map");
-   if (!map) {
-   cap_parsing_failed = true;
+   if (!map)
goto out;
-   }
 
ret = parse_cluster(map, 0);
if (ret != 0)
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index b24d9a2af2c5..6543a032b332 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -93,7 +93,7 @@ subsys_initcall(register_cpu_capacity_sysctl);
 
 static u32 capacity_scale;
 static u32 *raw_capacity;
-bool cap_parsing_failed;
+static bool cap_parsing_failed;
 
 void normalize_cpu_capacity(void)
 {
@@ -208,7 +208,7 @@ static int __init register_cpufreq_notifier(void)
 * until we have the necessary code to parse the cpu capacity, so
 * skip registering cpufreq notifier.
 */
-   if (!acpi_disabled || cap_parsing_failed)
+   if (!acpi_disabled || !raw_capacity)
return -EINVAL;
 
if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) {
-- 
2.10.0



[PATCH v3 6/9] drivers: remove useless comment from base/arch_topology.c

2017-03-27 Thread Juri Lelli
Printing out an error message when we failed to get the cpu device is
not helping anyone. Remove it.

Signed-off-by: Juri Lelli 
---
 drivers/base/arch_topology.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index c33482121b7d..b24d9a2af2c5 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -81,11 +81,9 @@ static int register_cpu_capacity_sysctl(void)
 
for_each_possible_cpu(i) {
cpu = get_cpu_device(i);
-   if (!cpu) {
-   pr_err("%s: too early to get CPU%d device!\n",
-  __func__, i);
+   if (!cpu)
continue;
-   }
+
device_create_file(cpu, &dev_attr_cpu_capacity);
}
 
-- 
2.10.0



[PATCH v3 3/9] arm: fix return value of parse_cpu_capacity

2017-03-27 Thread Juri Lelli
parse_cpu_capacity() has to return 0 on failure, but it currently returns
1 instead if the raw_capacity kcalloc fails.

Fix it by removing the negation of the return value.

Cc: Russell King 
Reported-by: Morten Rasmussen 
Fixes: 06073ee26775 ('ARM: 8621/3: parse cpu capacity-dmips-mhz from DT')
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index f8a3ab82e77f..4e4af809606a 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -166,7 +166,7 @@ static int __init parse_cpu_capacity(struct device_node 
*cpu_node, int cpu)
if (!raw_capacity) {
pr_err("cpu_capacity: failed to allocate memory 
for raw capacities\n");
cap_parsing_failed = true;
-   return !ret;
+   return ret;
}
}
capacity_scale = max(cpu_capacity, capacity_scale);
-- 
2.10.0



[PATCH v3 9/9] arm,arm64,drivers: add a prefix to drivers arch_topology interfaces

2017-03-27 Thread Juri Lelli
Now that some functions that deal with arch topology information live
under drivers, there is a naming clash that might create confusion.

Tidy things up by creating a drivers namespace for the interfaces used
by arch code; achieve this by prepending an 'atd_' (arch topology
driver) prefix to the driver interfaces.

Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c|  8 
 arch/arm64/kernel/topology.c  |  4 ++--
 drivers/base/arch_topology.c  | 20 ++--
 include/linux/arch_topology.h |  8 
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 557be4f1d2d7..e53391026c1b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -111,7 +111,7 @@ static void __init parse_dt_topology(void)
continue;
}
 
-   if (parse_cpu_capacity(cn, cpu)) {
+   if (atd_parse_cpu_capacity(cn, cpu)) {
of_node_put(cn);
continue;
}
@@ -160,7 +160,7 @@ static void __init parse_dt_topology(void)
>> (SCHED_CAPACITY_SHIFT-1)) + 1;
 
if (cap_from_dt)
-   normalize_cpu_capacity();
+   atd_normalize_cpu_capacity();
 }
 
 /*
@@ -173,10 +173,10 @@ static void update_cpu_capacity(unsigned int cpu)
if (!cpu_capacity(cpu) || cap_from_dt)
return;
 
-   set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+   atd_set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
 
pr_info("CPU%u: update cpu_capacity %lu\n",
-   cpu, arch_scale_cpu_capacity(NULL, cpu));
+   cpu, atd_scale_cpu_capacity(NULL, cpu));
 }
 
 #else
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 255230c3e835..5f24faa09c05 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -39,7 +39,7 @@ static int __init get_cpu_for_node(struct device_node *node)
 
for_each_possible_cpu(cpu) {
if (of_get_cpu_node(cpu, NULL) == cpu_node) {
-   parse_cpu_capacity(cpu_node, cpu);
+   atd_parse_cpu_capacity(cpu_node, cpu);
of_node_put(cpu_node);
return cpu;
}
@@ -191,7 +191,7 @@ static int __init parse_dt_topology(void)
if (ret != 0)
goto out_map;
 
-   normalize_cpu_capacity();
+   atd_normalize_cpu_capacity();
 
/*
 * Check that all cores are in the topology; the SMP code will
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 0fc77fa900a9..87dca3320536 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -25,12 +25,12 @@
 static DEFINE_MUTEX(cpu_scale_mutex);
 static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
 
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long atd_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
return per_cpu(cpu_scale, cpu);
 }
 
-void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+void atd_set_capacity_scale(unsigned int cpu, unsigned long capacity)
 {
per_cpu(cpu_scale, cpu) = capacity;
 }
@@ -42,7 +42,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
struct cpu *cpu = container_of(dev, struct cpu, dev);
 
return sprintf(buf, "%lu\n",
-   arch_scale_cpu_capacity(NULL, cpu->dev.id));
+   atd_scale_cpu_capacity(NULL, cpu->dev.id));
 }
 
 static ssize_t cpu_capacity_store(struct device *dev,
@@ -67,7 +67,7 @@ static ssize_t cpu_capacity_store(struct device *dev,
 
mutex_lock(&cpu_scale_mutex);
for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
-   set_capacity_scale(i, new_capacity);
+   atd_set_capacity_scale(i, new_capacity);
mutex_unlock(&cpu_scale_mutex);
 
return count;
@@ -96,7 +96,7 @@ static u32 capacity_scale;
 static u32 *raw_capacity;
 static bool cap_parsing_failed;
 
-void normalize_cpu_capacity(void)
+void atd_normalize_cpu_capacity(void)
 {
u64 capacity;
int cpu;
@@ -111,14 +111,14 @@ void normalize_cpu_capacity(void)
 cpu, raw_capacity[cpu]);
capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT)
/ capacity_scale;
-   set_capacity_scale(cpu, capacity);
+   atd_set_capacity_scale(cpu, capacity);
pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
-   cpu, arch_scale_cpu_capacity(NULL, cpu));
+   cpu, atd_scale_cpu_capacity(NULL, cpu));
}
mutex_unlock(&cpu_scale_mutex);
 }
 
-int __init parse_cpu_capacity(struct device_node

[PATCH v3 8/9] arm,arm64,drivers: move externs in a new header file

2017-03-27 Thread Juri Lelli
Create a new header file (include/linux/arch_topology.h) and put there
declarations of interfaces used by arm, arm64 and drivers code.

Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c|  7 +--
 arch/arm64/kernel/topology.c  |  4 +---
 drivers/base/arch_topology.c  |  1 +
 include/linux/arch_topology.h | 17 +
 4 files changed, 20 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/arch_topology.h

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 1e35a3265ddf..557be4f1d2d7 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -11,6 +11,7 @@
  * for more details.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -45,10 +46,6 @@
  * updated during this sequence.
  */
 
-extern unsigned long
-arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
-extern void set_capacity_scale(unsigned int cpu, unsigned long capacity);
-
 #ifdef CONFIG_OF
 struct cpu_efficiency {
const char *compatible;
@@ -76,8 +73,6 @@ static unsigned long *__cpu_capacity;
 
 static unsigned long middle_capacity = 1;
 static bool cap_from_dt = true;
-extern void normalize_cpu_capacity(void);
-extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
 /*
  * Iterate all CPUs' descriptor in DT and compute the efficiency
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 7e1f6f75185b..255230c3e835 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -11,6 +11,7 @@
  * for more details.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -27,9 +28,6 @@
 #include 
 #include 
 
-extern void normalize_cpu_capacity(void);
-extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
-
 static int __init get_cpu_for_node(struct device_node *node)
 {
struct device_node *cpu_node;
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 6543a032b332..0fc77fa900a9 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -13,6 +13,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
new file mode 100644
index ..4edae9fe8cdd
--- /dev/null
+++ b/include/linux/arch_topology.h
@@ -0,0 +1,17 @@
+/*
+ * include/linux/arch_topology.h - arch specific cpu topology information
+ */
+#ifndef _LINUX_ARCH_TOPOLOGY_H_
+#define _LINUX_ARCH_TOPOLOGY_H_
+
+void normalize_cpu_capacity(void);
+
+struct device_node;
+int parse_cpu_capacity(struct device_node *cpu_node, int cpu);
+
+struct sched_domain;
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
+
+void set_capacity_scale(unsigned int cpu, unsigned long capacity);
+
+#endif /* _LINUX_ARCH_TOPOLOGY_H_ */
-- 
2.10.0



Re: [PATCH 0/8] sched/deadline: Return the best satisfying affinity and dl in cpudl_find

2017-03-27 Thread Juri Lelli
Hi,

On 23/03/17 19:32, Byungchul Park wrote:
> cpudl_find() is used to find a cpu having the latest dl. The function
> should return the latest cpu among ones satisfying task's affinity and
> dl constraint, but current code gives up immediately and just return
> fail when it fails at the test *only with* the maximum cpu.
> 
> For example:
> 
>cpu 0 is running a task (dl: 10).
>cpu 1 is running a task (dl: 9).
>cpu 2 is running a task (dl: 8).
>cpu 3 is running a task (dl: 2).
> 
>where cpu 3 want to push a task (affinity is 1 2 3 and dl is 1).

Hummm, but this should only happen if you disable admission control,
right? Otherwise the task's affinity can't be smaller than 0-3.

> 
> In this case, the task should be migrated from cpu 3 to cpu 1, and
> preempt cpu 1's task. However, current code just returns fail because
> it fails at the affinity test with the maximum cpu, that is, cpu 0.
> 
> This patch set tries to find the best among ones satisfying task's
> affinity and dl constraint until success or no more to see.
> 

Anyway, do you have numbers showing how common your failure scenario is?
It would be interesting to understand how often the slow path is actually
used, IMHO.
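
For readers skimming the thread, here is a stand-alone toy model (plain C;
the arrays and helper names are invented for this illustration and are not
the kernel's cpudl code) of the difference being discussed: checking only
the CPU with the latest deadline versus picking the best CPU among those
that satisfy both the affinity mask and the deadline constraint. It uses
the numbers from the example quoted above.

/*
 * Toy model only: NOT kernel code.  The arrays and helpers below are
 * invented to illustrate the cpudl_find() discussion above.
 */
#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

/* Per-CPU deadline of the currently running task (from the example). */
static unsigned long long curr_dl[NR_CPUS] = { 10, 9, 8, 2 };
/* The task to be pushed may run on CPUs 1, 2 and 3 only. */
static bool affinity[NR_CPUS] = { false, true, true, true };
/* Deadline of the task to be pushed. */
static unsigned long long task_dl = 1;

/* Current behaviour: only the CPU with the latest deadline is checked. */
static int find_latest_only(void)
{
	int max_cpu = 0;

	for (int i = 1; i < NR_CPUS; i++)
		if (curr_dl[i] > curr_dl[max_cpu])
			max_cpu = i;

	if (affinity[max_cpu] && curr_dl[max_cpu] > task_dl)
		return max_cpu;

	return -1;	/* gives up, even though CPU 1 would be fine */
}

/* Proposed behaviour: best CPU among those satisfying affinity and dl. */
static int find_best_satisfying(void)
{
	int best = -1;

	for (int i = 0; i < NR_CPUS; i++) {
		if (!affinity[i] || curr_dl[i] <= task_dl)
			continue;
		if (best < 0 || curr_dl[i] > curr_dl[best])
			best = i;
	}

	return best;
}

int main(void)
{
	printf("latest-only: %d, best-satisfying: %d\n",
	       find_latest_only(), find_best_satisfying());
	return 0;
}

With the example values this prints "latest-only: -1, best-satisfying: 1",
which is exactly the scenario the cover letter describes.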

Thanks,

- Juri


[PATCH v3 5/9] arm, arm64: factorize common cpu capacity default code

2017-03-27 Thread Juri Lelli
arm and arm64 share a lot of code related to parsing CPU capacity
information from DT, using that information for appropriate scaling and
exposing a sysfs interface for changing such values at runtime.

Factorize such code into a common place (drivers/base/arch_topology.c) in
preparation for further additions.

Suggested-by: Will Deacon 
Suggested-by: Mark Rutland 
Suggested-by: Catalin Marinas 
Cc: Russell King 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Greg Kroah-Hartman 
Signed-off-by: Juri Lelli 
---

Changes from v2:
 - make capacity_scale and raw_capacity static
 - added SPDX header
 - improved indent
 - misc. whitespaces/newlines fixes

Changes from v1:
 - keep the original GPLv2 header
---
 arch/arm/Kconfig |   1 +
 arch/arm/kernel/topology.c   | 213 ++---
 arch/arm64/Kconfig   |   1 +
 arch/arm64/kernel/topology.c | 219 +--
 drivers/base/Kconfig |   8 ++
 drivers/base/Makefile|   1 +
 drivers/base/arch_topology.c | 242 +++
 7 files changed, 262 insertions(+), 423 deletions(-)
 create mode 100644 drivers/base/arch_topology.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0d4e71b42c77..cd61154bb6d0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -25,6 +25,7 @@ config ARM
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select GENERIC_ALLOCATOR
+   select GENERIC_ARCH_TOPOLOGY if ARM_CPU_TOPOLOGY
select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_EARLY_IOREMAP
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 162c82aeed96..49ef025ffaa0 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -44,75 +44,10 @@
  * to run the rebalance_domains for all idle cores and the cpu_capacity can be
  * updated during this sequence.
  */
-static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
-static DEFINE_MUTEX(cpu_scale_mutex);
 
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-   return per_cpu(cpu_scale, cpu);
-}
-
-static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
-{
-   per_cpu(cpu_scale, cpu) = capacity;
-}
-
-static ssize_t cpu_capacity_show(struct device *dev,
-struct device_attribute *attr,
-char *buf)
-{
-   struct cpu *cpu = container_of(dev, struct cpu, dev);
-
-   return sprintf(buf, "%lu\n",
-   arch_scale_cpu_capacity(NULL, cpu->dev.id));
-}
-
-static ssize_t cpu_capacity_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf,
- size_t count)
-{
-   struct cpu *cpu = container_of(dev, struct cpu, dev);
-   int this_cpu = cpu->dev.id, i;
-   unsigned long new_capacity;
-   ssize_t ret;
-
-   if (count) {
-   ret = kstrtoul(buf, 0, &new_capacity);
-   if (ret)
-   return ret;
-   if (new_capacity > SCHED_CAPACITY_SCALE)
-   return -EINVAL;
-
-   mutex_lock(&cpu_scale_mutex);
-   for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
-   set_capacity_scale(i, new_capacity);
-   mutex_unlock(&cpu_scale_mutex);
-   }
-
-   return count;
-}
-
-static DEVICE_ATTR_RW(cpu_capacity);
-
-static int register_cpu_capacity_sysctl(void)
-{
-   int i;
-   struct device *cpu;
-
-   for_each_possible_cpu(i) {
-   cpu = get_cpu_device(i);
-   if (!cpu) {
-   pr_err("%s: too early to get CPU%d device!\n",
-  __func__, i);
-   continue;
-   }
-   device_create_file(cpu, &dev_attr_cpu_capacity);
-   }
-
-   return 0;
-}
-subsys_initcall(register_cpu_capacity_sysctl);
+extern unsigned long
+arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
+extern void set_capacity_scale(unsigned int cpu, unsigned long capacity);
 
 #ifdef CONFIG_OF
 struct cpu_efficiency {
@@ -141,145 +76,9 @@ static unsigned long *__cpu_capacity;
 
 static unsigned long middle_capacity = 1;
 static bool cap_from_dt = true;
-static u32 *raw_capacity;
-static bool cap_parsing_failed;
-static u32 capacity_scale;
-
-static int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu)
-{
-   int ret = 1;
-   u32 cpu_capacity;
-
-   if (cap_parsing_failed)
-   return !ret;
-
-   ret = of_property_read_u32(cpu_node,
-  "capacity-dmips-mhz",
-  &cpu_capacity);
-   if (!ret) {
- 

Re: [PATCH v3 1/3] sched/deadline: Make find_later_rq() choose a closer cpu in topology

2017-03-27 Thread Juri Lelli
Hi,

On 23/03/17 11:12, Byungchul Park wrote:
> When cpudl_find() returns any among free_cpus, the cpu might not be
> closer than others, considering sched domain. For example:
> 
>this_cpu: 15
>free_cpus: 0, 1,..., 14 (== later_mask)
>best_cpu: 0
> 
>topology:
> 
>0 --+
>+--+
>1 --+  |
>   +-- ... --+
>2 --+  | |
>+--+ |
>3 --+|
> 
>... ...
> 
>12 --+   |
> +--+|
>13 --+  ||
>+-- ... -+
>14 --+  |
> +--+
>15 --+
> 
> In this case, it would be best to select 14 since it's a free cpu and
> closest to 15(this_cpu). However, currently the code select 0(best_cpu)
> even though that's just any among free_cpus. Fix it.
> 
> Signed-off-by: Byungchul Park 
> ---
>  kernel/sched/deadline.c | 29 +++--
>  1 file changed, 15 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index a2ce590..49c93b9 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1324,7 +1324,7 @@ static int find_later_rq(struct task_struct *task)
>   struct sched_domain *sd;
>   struct cpumask *later_mask = 
> this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
>   int this_cpu = smp_processor_id();
> - int best_cpu, cpu = task_cpu(task);
> + int cpu = task_cpu(task);
>  
>   /* Make sure the mask is initialized first */
>   if (unlikely(!later_mask))
> @@ -1337,17 +1337,14 @@ static int find_later_rq(struct task_struct *task)
>* We have to consider system topology and task affinity
>* first, then we can look for a suitable cpu.
>*/
> - best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
> - task, later_mask);
> - if (best_cpu == -1)
> + if (cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask) == -1)

It seems that with this we lose the last user of the current return
value of cpudl_find() (heap maximum). I guess we want to change the
return value to be (int)bool, as in rt, so that we can simplify this and
the conditions in check_preempt_equal_dl.

>   return -1;
>  
>   /*
> -  * If we are here, some target has been found,
> -  * the most suitable of which is cached in best_cpu.
> -  * This is, among the runqueues where the current tasks
> -  * have later deadlines than the task's one, the rq
> -  * with the latest possible one.
> +  * If we are here, some targets have been found, including
> +  * the most suitable which is, among the runqueues where the
> +  * current tasks have later deadlines than the task's one, the
> +  * rq with the latest possible one.
>*
>* Now we check how well this matches with task's
>* affinity and system topology.
> @@ -1367,6 +1364,7 @@ static int find_later_rq(struct task_struct *task)
>   rcu_read_lock();
>   for_each_domain(cpu, sd) {
>   if (sd->flags & SD_WAKE_AFFINE) {
> + int closest_cpu;

Can we still call this best_cpu, so that we are aligned with rt?

Thanks,

- Juri


Re: [RFD PATCH 3/5] sched/cpufreq_schedutil: make worker kthread be SCHED_DEADLINE

2017-03-27 Thread Juri Lelli
On 27/03/17 18:50, Peter Zijlstra wrote:
> On Fri, Mar 24, 2017 at 02:08:58PM +0000, Juri Lelli wrote:
> > Worker kthread needs to be able to change frequency for all other
> > threads.
> > 
> > Make it special, just under STOP class.
> 
> *yuck* ;-)
> 

Eh, I know. :/

> So imagine our I2C/SPI bus is 'busy' and its mutex taken, then this
> 'special' task will need to boost it. Now add BWI to your thinking and
> shudder.
> 

Currently that kthread is FIFO already, so boosting still applies. Not as
bad as in the BWI case though. More thinking required.

> 
> On IRC broonie mentioned that:
> 
>  - most PMIC operations are fire and forget (no need to wait for a
>response).
>  - PMIC 'packets' are 'small'.
>  - SPI has the possibility to push stuff on the queue.
> 
> Taken together this seems to suggest we can rework cpufreq drivers to
> function in-context, either directly push the packet on the bus if
> available, or queue it and let whoever owns it sort it without blocking.
> 
> It might be possible to rework/augment I2C to also support pushing stuff
> on a queue.
> 
> 
> So if we can make all that work, we can do away with this horrible
> horrible kthread. Which is, IMO, a much better solution.
> 
> Thoughts?

Right. This is more a schedutil (cpufreq) problem though, IMHO. Even if
I agree that what you are proposing is way cleaner (and here I actually
assume it's feasible at all), I fear it will take quite some time to get
reworked.

Do we want to wait until that moment to get DEADLINE contribution
accounted for? :(

Thanks,

- Juri


Re: [RFD PATCH 3/5] sched/cpufreq_schedutil: make worker kthread be SCHED_DEADLINE

2017-03-27 Thread Juri Lelli
On 27/03/17 19:05, Rafael J. Wysocki wrote:
> On Monday, March 27, 2017 06:01:34 PM Juri Lelli wrote:
> > On 27/03/17 18:50, Peter Zijlstra wrote:
> > > On Fri, Mar 24, 2017 at 02:08:58PM +, Juri Lelli wrote:
> > > > Worker kthread needs to be able to change frequency for all other
> > > > threads.
> > > > 
> > > > Make it special, just under STOP class.
> > > 
> > > *yuck* ;-)
> > > 
> > 
> > Eh, I know. :/
> > 
> > > So imagine our I2C/SPI bus is 'busy' and its mutex taken, then this
> > > 'special' task will need to boost it. Now add BWI to your thinking and
> > > shudder.
> > > 
> > 
> > Currently that kthread is FIFO already, so boosting still applies. Not as
> > bad as in the BWI case though. More thinking required.
> > 
> > > 
> > > On IRC broonie mentioned that:
> > > 
> > >  - most PMIC operations are fire and forget (no need to wait for a
> > >response).
> > >  - PMIC 'packets' are 'small'.
> > >  - SPI has the possibility to push stuff on the queue.
> > > 
> > > Taken together this seems to suggest we can rework cpufreq drivers to
> > > function in-context, either directly push the packet on the bus if
> > > available, or queue it and let whoever owns it sort it without blocking.
> > > 
> > > It might be possible to rework/augment I2C to also support pushing stuff
> > > on a queue.
> > > 
> > > 
> > > So if we can make all that work, we can do away with this horrible
> > > horrible kthread. Which is, IMO, a much better solution.
> > > 
> > > Thoughts?
> > 
> > Right. This is more a schedutil (cpufreq) problem though, IMHO. Even if
> > I agree that what you are proposing is way more clean (and here I
> > actually assume it's feasible at all), I fear it will take quite some
> > time to get reworked.
> 
> Why do you think so?
> 

It simply seemed a major rework to me. :)


Re: [PATCH] sched/deadline: Add missing update_rq_clock() in dl_task_timer()

2017-03-15 Thread Juri Lelli
Hi,

On 06/03/17 21:51, Wanpeng Li wrote:
> From: Wanpeng Li 
> 
> The following warning can be triggered by hot-unplugging the CPU
> on which an active SCHED_DEADLINE task is running on:
> 
>  [ cut here ]
>  WARNING: CPU: 7 PID: 0 at kernel/sched/sched.h:833 
> replenish_dl_entity+0x71e/0xc40
>  rq->clock_update_flags < RQCF_ACT_SKIP
>  CPU: 7 PID: 0 Comm: swapper/7 Tainted: GB   4.11.0-rc1+ #24
>  Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 
> 02/16/2016
>  Call Trace:
>   
>   dump_stack+0x85/0xc4
>   __warn+0x172/0x1b0
>   warn_slowpath_fmt+0xb4/0xf0
>   ? __warn+0x1b0/0x1b0
>   ? debug_check_no_locks_freed+0x2c0/0x2c0
>   ? cpudl_set+0x3d/0x2b0
>   replenish_dl_entity+0x71e/0xc40
>   enqueue_task_dl+0x2ea/0x12e0
>   ? dl_task_timer+0x777/0x990
>   ? __hrtimer_run_queues+0x270/0xa50
>   dl_task_timer+0x316/0x990
>   ? enqueue_task_dl+0x12e0/0x12e0
>   ? enqueue_task_dl+0x12e0/0x12e0
>   __hrtimer_run_queues+0x270/0xa50
>   ? hrtimer_cancel+0x20/0x20
>   ? hrtimer_interrupt+0x119/0x600
>   hrtimer_interrupt+0x19c/0x600
>   ? trace_hardirqs_off+0xd/0x10
>   local_apic_timer_interrupt+0x74/0xe0
>   smp_apic_timer_interrupt+0x76/0xa0
>   apic_timer_interrupt+0x93/0xa0
> 
> The DL task will be migrated to a suitable later deadline rq once the DL 
> timer fires and currnet rq is offline. The rq clock of the new rq should 
> be updated. This patch fixes it by updating the rq clock after holding 
> the new rq's rq lock.
> 
> Cc: Juri Lelli 
> Cc: Peter Zijlstra 
> Cc: Ingo Molnar 
> Cc: Thomas Gleixner 
> Cc: Matt Fleming 
> Signed-off-by: Wanpeng Li 
> ---
>  kernel/sched/deadline.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 99b2c33..c6db3fd 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -638,6 +638,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer 
> *timer)
>   lockdep_unpin_lock(&rq->lock, rf.cookie);
>   rq = dl_task_offline_migration(rq, p);
>   rf.cookie = lockdep_pin_lock(&rq->lock);
> + update_rq_clock(rq);

Looks good to me.

Acked-by: Juri Lelli 

Thanks,

- Juri


Re: [RFC v3 5/5] sched/{core,cpufreq_schedutil}: add capacity clamping for RT/DL tasks

2017-03-15 Thread Juri Lelli
Hi Joel,

On 15/03/17 05:59, Joel Fernandes wrote:
> On Wed, Mar 15, 2017 at 4:40 AM, Patrick Bellasi
>  wrote:
> > On 13-Mar 03:08, Joel Fernandes (Google) wrote:
> >> Hi Patrick,
> >>
> >> On Tue, Feb 28, 2017 at 6:38 AM, Patrick Bellasi
> >>  wrote:
> >> > Currently schedutil enforce a maximum OPP when RT/DL tasks are RUNNABLE.
> >> > Such a mandatory policy can be made more tunable from userspace thus
> >> > allowing for example to define a reasonable max capacity (i.e.
> >> > frequency) which is required for the execution of a specific RT/DL
> >> > workload. This will contribute to make the RT class more "friendly" for
> >> > power/energy sensible applications.
> >> >
> >> > This patch extends the usage of capacity_{min,max} to the RT/DL classes.
> >> > Whenever a task in these classes is RUNNABLE, the capacity required is
> >> > defined by the constraints of the control group that task belongs to.
> >> >
> >>
> >> We briefly discussed this at Linaro Connect that this works well for
> >> sporadic RT tasks that run briefly and then sleep for long periods of
> >> time - so certainly this patch is good, but its only a partial
> >> solution to the problem of frequent and short-sleepers and something
> >> is required to keep the boost active for short non-RUNNABLE as well.
> >> The behavior with many periodic RT tasks is that they will sleep for
> >> short intervals and run for short intervals periodically. In this case
> >> removing the clamp (or the boost as in schedtune v2) on a dequeue will
> >> essentially mean during a narrow window cpufreq can drop the frequency
> >> and only to make it go back up again.
> >>
> >> Currently for schedtune v2, I am working on prototyping something like
> >> the following for Android:
> >> - if RT task is enqueue, introduce the boost.
> >> - When task is dequeued, start a timer for a  "minimum deboost delay
> >> time" before taking out the boost.
> >> - If task is enqueued again before the timer fires, then cancel the timer.
> >>
> >> I don't think any "fix" to this particular issue should be to the
> >> schedutil governor and should be sorted before going to cpufreq itself
> >> (that is before making the request). What do you think about this?
> >
> > My short observations are:
> >
> > 1) for certain RT tasks, which have a quite "predictable" activation
> >pattern, we should definitively try to use DEADLINE... which will
> >factor out all "boosting potential races" since the bandwidth
> >requirements are well defined at task description time.
> 
> I don't immediately see how deadline can fix this, when a task is
> dequeued after end of its current runtime, its bandwidth will be
> subtracted from the active running bandwidth. This is what drives the
> DL part of the capacity request. In this case, we run into the same
> issue as with the boost-removal on dequeue. Isn't it?
> 

Unfortunately, I still have to post the set of patches (based on Luca's
reclaiming set) that introduces driving of clock frequency from
DEADLINE, so I guess any discussion about how DEADLINE might help here
could be difficult to follow. :(

I should definitely fix that.

However, trying to quickly summarize how that would work (for who is
already somewhat familiar with reclaiming bits):

 - a task utilization contribution is accounted for (at rq level) as
   soon as it wakes up for the first time in a new period
 - its contribution is then removed after the 0lag time (or when the
   task gets throttled)
 - frequency transitions are triggered accordingly

So, I don't see why triggering a go down request after the 0lag time
expired and quickly reacting to tasks waking up would create
problems in your case?
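
To make the summary above a bit more concrete, below is a tiny stand-alone
model (plain C; the names and the BW_SHIFT fixed-point scale are invented,
this is not the reclaiming/schedutil code) of the idea: a task's bandwidth
is added to a per-rq sum when it wakes up, removed again at its 0lag time
(or when it gets throttled), and the frequency request simply tracks that
sum.

/*
 * Stand-alone sketch of DEADLINE-driven frequency selection as outlined
 * above.  All names and the BW_SHIFT fixed-point scale are assumptions
 * made for this example; this is not the kernel implementation.
 */
#include <stdio.h>

#define BW_SHIFT	10		/* bandwidth expressed in 1/1024 units */
#define MAX_FREQ_KHZ	2000000UL	/* made-up maximum OPP */

static unsigned long running_bw;	/* sum of active tasks' bandwidth */

static void request_freq(void)
{
	unsigned long freq = (MAX_FREQ_KHZ * running_bw) >> BW_SHIFT;

	printf("running_bw=%lu/1024 -> request %lu kHz\n", running_bw, freq);
}

/* Contribution accounted as soon as the task wakes up in a new period. */
static void task_wakes_up(unsigned long bw)
{
	running_bw += bw;
	request_freq();
}

/* Contribution removed after the 0lag time (or on throttling). */
static void zero_lag_elapsed(unsigned long bw)
{
	running_bw -= bw;
	request_freq();
}

int main(void)
{
	task_wakes_up(256);		/* a 25% task becomes active */
	task_wakes_up(512);		/* a 50% task wakes up too */
	zero_lag_elapsed(256);		/* first task's 0lag time has passed */
	return 0;
}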

Thanks,

- Juri


Re: [RFC v3 5/5] sched/{core,cpufreq_schedutil}: add capacity clamping for RT/DL tasks

2017-03-15 Thread Juri Lelli
On 15/03/17 09:13, Joel Fernandes wrote:
> On Wed, Mar 15, 2017 at 7:44 AM, Juri Lelli  wrote:
> > Hi Joel,
> >
> > On 15/03/17 05:59, Joel Fernandes wrote:
> >> On Wed, Mar 15, 2017 at 4:40 AM, Patrick Bellasi
> >>  wrote:
> >> > On 13-Mar 03:08, Joel Fernandes (Google) wrote:
> >> >> Hi Patrick,
> >> >>
> >> >> On Tue, Feb 28, 2017 at 6:38 AM, Patrick Bellasi
> >> >>  wrote:
> >> >> > Currently schedutil enforce a maximum OPP when RT/DL tasks are 
> >> >> > RUNNABLE.
> >> >> > Such a mandatory policy can be made more tunable from userspace thus
> >> >> > allowing for example to define a reasonable max capacity (i.e.
> >> >> > frequency) which is required for the execution of a specific RT/DL
> >> >> > workload. This will contribute to make the RT class more "friendly" 
> >> >> > for
> >> >> > power/energy sensible applications.
> >> >> >
> >> >> > This patch extends the usage of capacity_{min,max} to the RT/DL 
> >> >> > classes.
> >> >> > Whenever a task in these classes is RUNNABLE, the capacity required is
> >> >> > defined by the constraints of the control group that task belongs to.
> >> >> >
> >> >>
> >> >> We briefly discussed this at Linaro Connect that this works well for
> >> >> sporadic RT tasks that run briefly and then sleep for long periods of
> >> >> time - so certainly this patch is good, but its only a partial
> >> >> solution to the problem of frequent and short-sleepers and something
> >> >> is required to keep the boost active for short non-RUNNABLE as well.
> >> >> The behavior with many periodic RT tasks is that they will sleep for
> >> >> short intervals and run for short intervals periodically. In this case
> >> >> removing the clamp (or the boost as in schedtune v2) on a dequeue will
> >> >> essentially mean during a narrow window cpufreq can drop the frequency
> >> >> and only to make it go back up again.
> >> >>
> >> >> Currently for schedtune v2, I am working on prototyping something like
> >> >> the following for Android:
> >> >> - if RT task is enqueue, introduce the boost.
> >> >> - When task is dequeued, start a timer for a  "minimum deboost delay
> >> >> time" before taking out the boost.
> >> >> - If task is enqueued again before the timer fires, then cancel the 
> >> >> timer.
> >> >>
> >> >> I don't think any "fix" to this particular issue should be to the
> >> >> schedutil governor and should be sorted before going to cpufreq itself
> >> >> (that is before making the request). What do you think about this?
> >> >
> >> > My short observations are:
> >> >
> >> > 1) for certain RT tasks, which have a quite "predictable" activation
> >> >pattern, we should definitively try to use DEADLINE... which will
> >> >factor out all "boosting potential races" since the bandwidth
> >> >requirements are well defined at task description time.
> >>
> >> I don't immediately see how deadline can fix this, when a task is
> >> dequeued after end of its current runtime, its bandwidth will be
> >> subtracted from the active running bandwidth. This is what drives the
> >> DL part of the capacity request. In this case, we run into the same
> >> issue as with the boost-removal on dequeue. Isn't it?
> >>
> >
> > Unfortunately, I still have to post the set of patches (based on Luca's
> > reclaiming set) that introduces driving of clock frequency from
> > DEADLINE, so I guess everything we can discuss about how DEADLINE might
> > help here might be difficult to understand. :(
> >
> > I should definitely fix that.
> 
> I fully understand, Sorry to be discussing this too soon here...
> 

No problem. I just thought I should clarify before people go WTH are
these guys talking about?! :)

> > However, trying to quickly summarize how that would work (for who is
> > already somewhat familiar with reclaiming bits):
> >
> >  - a task utilization contribution is accounted for (at rq level) as
> >soon as it wakes up for the first time in a new period
> >  - its contribution is then removed after the

Re: [RFC v3 5/5] sched/{core,cpufreq_schedutil}: add capacity clamping for RT/DL tasks

2017-03-16 Thread Juri Lelli
On 15/03/17 16:40, Joel Fernandes wrote:
> On Wed, Mar 15, 2017 at 9:24 AM, Juri Lelli  wrote:
> [..]
> >
> >> > However, trying to quickly summarize how that would work (for who is
> >> > already somewhat familiar with reclaiming bits):
> >> >
> >> >  - a task utilization contribution is accounted for (at rq level) as
> >> >soon as it wakes up for the first time in a new period
> >> >  - its contribution is then removed after the 0lag time (or when the
> >> >task gets throttled)
> >> >  - frequency transitions are triggered accordingly
> >> >
> >> > So, I don't see why triggering a go down request after the 0lag time
> >> > expired and quickly reacting to tasks waking up would create
> >> > problems in your case?
> >>
> >> In my experience, the 'reacting to tasks' bit doesn't work very well.
> >
> > Humm.. but in this case we won't be 'reacting', we will be
> > 'anticipating' tasks' needs, right?
> 
> Are you saying we will start ramping frequency before the next
> activation so that we're ready for it?
> 

I'm saying that there is no need to ramp: we simply select the frequency
that is needed for a task (or a set of them).

> If not, it sounds like it will only make the frequency request on the
> next activation when the Active bandwidth increases due to the task
> waking up. By then task has already started to run, right?
> 

When the task is enqueued back we select the frequency considering its
bandwidth request (and the bandwidth/utilization of the others). So,
when it actually starts running it will already have enough capacity to
finish in time.


Re: [RFC v3 5/5] sched/{core,cpufreq_schedutil}: add capacity clamping for RT/DL tasks

2017-03-16 Thread Juri Lelli
On 16/03/17 12:27, Patrick Bellasi wrote:
> On 16-Mar 11:16, Juri Lelli wrote:
> > On 15/03/17 16:40, Joel Fernandes wrote:
> > > On Wed, Mar 15, 2017 at 9:24 AM, Juri Lelli  wrote:
> > > [..]
> > > >
> > > >> > However, trying to quickly summarize how that would work (for who is
> > > >> > already somewhat familiar with reclaiming bits):
> > > >> >
> > > >> >  - a task utilization contribution is accounted for (at rq level) as
> > > >> >soon as it wakes up for the first time in a new period
> > > >> >  - its contribution is then removed after the 0lag time (or when the
> > > >> >task gets throttled)
> > > >> >  - frequency transitions are triggered accordingly
> > > >> >
> > > >> > So, I don't see why triggering a go down request after the 0lag time
> > > >> > expired and quickly reacting to tasks waking up would create
> > > >> > problems in your case?
> > > >>
> > > >> In my experience, the 'reacting to tasks' bit doesn't work very well.
> > > >
> > > > Humm.. but in this case we won't be 'reacting', we will be
> > > > 'anticipating' tasks' needs, right?
> > > 
> > > Are you saying we will start ramping frequency before the next
> > > activation so that we're ready for it?
> > > 
> > 
> > I'm saying that there is no need to ramp, simply select the frequency
> > that is needed for a task (or a set of them).
> > 
> > > If not, it sounds like it will only make the frequency request on the
> > > next activation when the Active bandwidth increases due to the task
> > > waking up. By then task has already started to run, right?
> > > 
> > 
> > When the task is enqueued back we select the frequency considering its
> > bandwidth request (and the bandwidth/utilization of the others). So,
> > when it actually starts running it will already have enough capacity to
> > finish in time.
> 
> Here we are factoring out the time required to actually switch to the
> required OPP. I think Joel was referring to this time.
> 

Right. But this is an HW limitation. It seems to be a problem that every
scheduler-driven decision will have to take into account. So, doesn't it
make more sense to let the driver (or the governor shim layer) introduce
some sort of hysteresis to frequency changes if needed?

> That time cannot really be eliminated but from having faster OPP
> switching HW support. Still, jumping straight to the "optimal" OPP
> instead of ramping up is a big improvement.
> 
> 
> -- 
> #include 
> 
> Patrick Bellasi


Re: [RFC v3 5/5] sched/{core,cpufreq_schedutil}: add capacity clamping for RT/DL tasks

2017-03-16 Thread Juri Lelli
On 16/03/17 09:58, Joel Fernandes wrote:
> On Thu, Mar 16, 2017 at 5:44 AM, Juri Lelli  wrote:
> > On 16/03/17 12:27, Patrick Bellasi wrote:
> >> On 16-Mar 11:16, Juri Lelli wrote:
> >> > On 15/03/17 16:40, Joel Fernandes wrote:
> >> > > On Wed, Mar 15, 2017 at 9:24 AM, Juri Lelli  wrote:
> >> > > [..]
> >> > > >
> >> > > >> > However, trying to quickly summarize how that would work (for who 
> >> > > >> > is
> >> > > >> > already somewhat familiar with reclaiming bits):
> >> > > >> >
> >> > > >> >  - a task utilization contribution is accounted for (at rq level) 
> >> > > >> > as
> >> > > >> >soon as it wakes up for the first time in a new period
> >> > > >> >  - its contribution is then removed after the 0lag time (or when 
> >> > > >> > the
> >> > > >> >task gets throttled)
> >> > > >> >  - frequency transitions are triggered accordingly
> >> > > >> >
> >> > > >> > So, I don't see why triggering a go down request after the 0lag 
> >> > > >> > time
> >> > > >> > expired and quickly reacting to tasks waking up would create
> >> > > >> > problems in your case?
> >> > > >>
> >> > > >> In my experience, the 'reacting to tasks' bit doesn't work very 
> >> > > >> well.
> >> > > >
> >> > > > Humm.. but in this case we won't be 'reacting', we will be
> >> > > > 'anticipating' tasks' needs, right?
> >> > >
> >> > > Are you saying we will start ramping frequency before the next
> >> > > activation so that we're ready for it?
> >> > >
> >> >
> >> > I'm saying that there is no need to ramp, simply select the frequency
> >> > that is needed for a task (or a set of them).
> >> >
> >> > > If not, it sounds like it will only make the frequency request on the
> >> > > next activation when the Active bandwidth increases due to the task
> >> > > waking up. By then task has already started to run, right?
> >> > >
> >> >
> >> > When the task is enqueued back we select the frequency considering its
> >> > bandwidth request (and the bandwidth/utilization of the others). So,
> >> > when it actually starts running it will already have enough capacity to
> >> > finish in time.
> >>
> >> Here we are factoring out the time required to actually switch to the
> >> required OPP. I think Joel was referring to this time.
> >>
> 
> Yes, that's what I meant.
> 
> >
> > Right. But this is an HW limitation. It seems to be a problem that every
> > scheduler-driven decision will have to take into account. So, doesn't it
> > make more sense to let the driver (or the governor shim layer) introduce
> > some sort of hysteresis to frequency changes if needed?
> 
> The problem IMO which Hysterisis in the governor will not help is what
> if you had a DL task that is not waking up for several periods and
> then wakes up, then for that wake up, we would still be subject to the
> HW limitation of time taken to switch to needed OPP. Right?
> 

True, but in this case the problem is that you cannot really predict the
future anyway. So, if your HW is so slow to react that it always causes
latency problems, then I guess you'll be forced to statically raise your
min_freq value to cope with that HW limitation, independently of
scheduling policies/heuristics?

OTOH, hysteresis, when properly tuned, should cover the 'normal' cases.
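
Purely as an illustration of the kind of governor-side hysteresis
mentioned here (names, time units and the 20ms threshold are all invented
and have nothing to do with the actual schedutil code): up requests are
applied immediately, while a lower request is only honoured once it has
persisted for a configurable interval, so a short sleep does not cause a
frequency dip.

/*
 * Toy sketch of down-transition hysteresis in a frequency governor.
 * All names, time units and thresholds are invented for illustration.
 */
#include <stdio.h>

#define DOWN_DELAY_MS	20	/* lower requests must persist this long */

static unsigned int cur_khz;
static unsigned int pending_khz;
static unsigned long pending_since_ms;

static void governor_request(unsigned int khz, unsigned long now_ms)
{
	if (khz >= cur_khz) {			/* go up (or stay) immediately */
		cur_khz = khz;
		pending_khz = 0;
		printf("%4lums: at %u kHz\n", now_ms, cur_khz);
		return;
	}

	if (khz != pending_khz) {		/* new lower request: arm the delay */
		pending_khz = khz;
		pending_since_ms = now_ms;
		return;
	}

	if (now_ms - pending_since_ms >= DOWN_DELAY_MS) {
		cur_khz = khz;			/* lower request persisted: apply it */
		pending_khz = 0;
		printf("%4lums: down to %u kHz\n", now_ms, cur_khz);
	}
}

int main(void)
{
	governor_request(1800000, 0);	/* RT/DL task runs */
	governor_request(600000, 5);	/* task sleeps: lower request armed */
	governor_request(1800000, 12);	/* woken up shortly after: no dip */
	governor_request(600000, 30);	/* longer idle period starts */
	governor_request(600000, 55);	/* lower request finally taken */
	return 0;
}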

> >> That time cannot really be eliminated but from having faster OPP
> >> switching HW support. Still, jumping straight to the "optimal" OPP
> >> instead of ramping up is a big improvement.
> 
> Yes I think so.
> 
> Thanks,
> Joel


Re: [PATCH v2 6/9] arm, arm64: factorize common cpu capacity default code

2017-03-09 Thread Juri Lelli
Hi Greg,

did you have a chance to have a look at my replies below?

It would be really helpful to understand from you how to move forward
with this set.

Best Regards,

- Juri

On 13/02/17 15:09, Juri Lelli wrote:
> Hi Greg,
> 
> On 10/02/17 15:28, Greg KH wrote:
> > On Thu, Feb 09, 2017 at 09:25:22AM +0000, Juri Lelli wrote:
> > > arm and arm64 share a lot of code related to parsing CPU capacity
> > > information from DT, using that information for appropriate scaling and
> > > exposing a sysfs interface for changing such values at runtime.
> > > 
> > > Factorize such code in a common place (driver/base/arch_topology.c) in
> > > preparation for further additions.
> > > 
> > > Suggested-by: Will Deacon 
> > > Suggested-by: Mark Rutland 
> > > Suggested-by: Catalin Marinas 
> > > Cc: Russell King 
> > > Cc: Catalin Marinas 
> > > Cc: Will Deacon 
> > > Cc: Greg Kroah-Hartman 
> > > Signed-off-by: Juri Lelli 
> > > ---
> > > 
> > > Changes from v1:
> > >  - keep the original GPLv2 header
> > > ---
> > >  arch/arm/Kconfig |   1 +
> > >  arch/arm/kernel/topology.c   | 213 ++
> > >  arch/arm64/Kconfig   |   1 +
> > >  arch/arm64/kernel/topology.c | 219 
> > > +--
> > >  drivers/base/Kconfig |   8 ++
> > >  drivers/base/Makefile|   1 +
> > >  drivers/base/arch_topology.c | 237 
> > > +++
> > >  7 files changed, 257 insertions(+), 423 deletions(-)
> > >  create mode 100644 drivers/base/arch_topology.c
> > 
> > Ah, so you want _me_ to maintain this, ok, I better review it...
> > 
> 
> This has been suggested as a possible way to stop replicating code between arm
> and arm64 (and possibly other archs in the future). Are you in principle OK
> with it?
> 
> Thanks a lot for your comments, please find my answers below.
> 
> > > --- a/drivers/base/Kconfig
> > > +++ b/drivers/base/Kconfig
> > > @@ -339,4 +339,12 @@ config CMA_ALIGNMENT
> > >  
> > >  endif
> > >  
> > > +config GENERIC_ARCH_TOPOLOGY
> > > + bool
> > > + help
> > > +   Enable support for architectures common topology code: e.g., parsing
> > > +   CPU capacity information from DT, usage of such information for
> > > +   appropriate scaling, sysfs interface for changing capacity values at
> > > +  runtime.
> > 
> > Mix of spaces and tabs :(
> > 
> 
> Argh. :(
> 
> > > +
> > >  endmenu
> > > diff --git a/drivers/base/Makefile b/drivers/base/Makefile
> > > index f2816f6ff76a..397e5c344e6a 100644
> > > --- a/drivers/base/Makefile
> > > +++ b/drivers/base/Makefile
> > > @@ -23,6 +23,7 @@ obj-$(CONFIG_SOC_BUS) += soc.o
> > >  obj-$(CONFIG_PINCTRL) += pinctrl.o
> > >  obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
> > >  obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
> > > +obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
> > >  
> > >  obj-y+= test/
> > >  
> > > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> > > new file mode 100644
> > > index ..c1dd430adad2
> > > --- /dev/null
> > > +++ b/drivers/base/arch_topology.c
> > > @@ -0,0 +1,237 @@
> > > +/*
> > > + * driver/base/arch_topology.c - Arch specific cpu topology information
> > 
> > No need to keep the filename in the file, you know what it is called :)
> > 
> 
> OK, removed.
> 
> > > + *
> > > + * Copyright (C) 2016, ARM Ltd.
> > > + * Written by: Juri Lelli, ARM Ltd.
> > > + *
> > > + * This file is subject to the terms and conditions of the GNU General 
> > > Public
> > > + * License.  See the file "COPYING" in the main directory of this archive
> > > + * for more details.
> > 
> > So, v2 only?  Please be specific.  Even better yet, use a SPDX header if
> > you want to, those are always nice.
> > 
> 
> Yes, v2 only.
> 
>   * for more details. 
>   
>  
> + *   
>

[PATCH] sched/deadline: remove useless param from setup_new_dl_entity

2016-06-17 Thread Juri Lelli
setup_new_dl_entity() takes two parameters, but it only actually uses
one of them to set up a new dl_entity.

Remove the second, useless, parameter.

Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Steven Rostedt 
Cc: Luca Abeni 
Signed-off-by: Juri Lelli 
---
 kernel/sched/deadline.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217ff4..5229788a4765 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -346,8 +346,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct 
task_struct *p,
  * one, and to (try to!) reconcile itself with its own scheduling
  * parameters.
  */
-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
-  struct sched_dl_entity *pi_se)
+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
 {
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
@@ -367,8 +366,8 @@ static inline void setup_new_dl_entity(struct 
sched_dl_entity *dl_se,
 * future; in fact, we must consider execution overheads (time
 * spent on hardirq context, etc.).
 */
-   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-   dl_se->runtime = pi_se->dl_runtime;
+   dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
+   dl_se->runtime = dl_se->dl_runtime;
 }
 
 /*
@@ -1721,7 +1720,7 @@ static void switched_from_dl(struct rq *rq, struct 
task_struct *p)
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
-   setup_new_dl_entity(&p->dl, &p->dl);
+   setup_new_dl_entity(&p->dl);
 
if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-- 
2.7.0



Re: [PATCH] sched/deadline: remove useless param from setup_new_dl_entity

2016-06-17 Thread Juri Lelli
On 17/06/16 11:58, Luca Abeni wrote:
> Hi,
> 
> On Fri, 17 Jun 2016 10:48:41 +0100
> Juri Lelli  wrote:
> 
> > setup_new_dl_entity() takes two parameters, but it only actually uses
> > one of them to setup a new dl_entity.
> > 
> > Remove the second, useless, parameter.
> 
> Funnily enough, I was adding something similar in my local queue :)
> Looks good to me
> 

Yeah, noticed this while looking at your reclaiming patches. :)

Thanks for reviewing,

- Juri

> 
> > 
> > Cc: Ingo Molnar 
> > Cc: Peter Zijlstra 
> > Cc: Steven Rostedt 
> > Cc: Luca Abeni 
> > Signed-off-by: Juri Lelli 
> > ---
> >  kernel/sched/deadline.c | 9 -
> >  1 file changed, 4 insertions(+), 5 deletions(-)
> > 
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index fcb7f0217ff4..5229788a4765 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -346,8 +346,7 @@ static void check_preempt_curr_dl(struct rq *rq,
> > struct task_struct *p,
> >   * one, and to (try to!) reconcile itself with its own scheduling
> >   * parameters.
> >   */
> > -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
> > -  struct sched_dl_entity *pi_se)
> > +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> >  {
> > struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > struct rq *rq = rq_of_dl_rq(dl_rq);
> > @@ -367,8 +366,8 @@ static inline void setup_new_dl_entity(struct
> > sched_dl_entity *dl_se,
> >  * future; in fact, we must consider execution overheads
> > (time
> >  * spent on hardirq context, etc.).
> >  */
> > -   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> > -   dl_se->runtime = pi_se->dl_runtime;
> > +   dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
> > +   dl_se->runtime = dl_se->dl_runtime;
> >  }
> >  
> >  /*
> > @@ -1721,7 +1720,7 @@ static void switched_from_dl(struct rq *rq,
> > struct task_struct *p) static void switched_to_dl(struct rq *rq,
> > struct task_struct *p) {
> > if (dl_time_before(p->dl.deadline, rq_clock(rq)))
> > -   setup_new_dl_entity(&p->dl, &p->dl);
> > +   setup_new_dl_entity(&p->dl);
> >  
> > if (task_on_rq_queued(p) && rq->curr != p) {
> >  #ifdef CONFIG_SMP
> 


Re: [PATCH] sched/deadline: remove useless param from setup_new_dl_entity

2016-06-17 Thread Juri Lelli
On 17/06/16 09:49, Steven Rostedt wrote:
> On Fri, 17 Jun 2016 10:48:41 +0100
> Juri Lelli  wrote:
> 
> > setup_new_dl_entity() takes two parameters, but it only actually uses
> > one of them to setup a new dl_entity.
> > 
> 
> Actually this patch is making it so that setup_new_dl_entity() only
> uses one of the parameters. Can you note why that change happened.
> Because this change log implies that the second parameter wasn't used
> before this patch, and that is incorrect.
> 

True, but we were practically already using the same parameter, under a
different name though, after

2f9f3fdc928 "sched/deadline: Remove dl_new from struct sched_dl_entity"

as we currently do:

  setup_new_dl_entity(&p->dl, &p->dl)

> This patch reverts part of the change done in
> commit 2d3d891d334 "sched/deadline: Add SCHED_DEADLINE inheritance
> logic"
> 

Before Luca's change we were doing

 setup_new_dl_entity(dl_se, pi_se)

in update_dl_entity() for a dl_se->new entity. So, I guess the question
is actually why we wanted to use pi_se's parameters (the potential PI
donor) for setting up a new entity? Maybe we broke the situation where a
task is currently boosted by a DEADLINE waiter and we switch the holder
to DEADLINE?

> It would be nice to have the reason in the change log.
> 

Thanks a lot for pointing out what might be more than an inaccuracy in
the changelog.

Best,

- Juri

> 
> > Remove the second, useless, parameter.
> > 
> > Cc: Ingo Molnar 
> > Cc: Peter Zijlstra 
> > Cc: Steven Rostedt 
> > Cc: Luca Abeni 
> > Signed-off-by: Juri Lelli 
> > ---
> >  kernel/sched/deadline.c | 9 -
> >  1 file changed, 4 insertions(+), 5 deletions(-)
> > 
> > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > index fcb7f0217ff4..5229788a4765 100644
> > --- a/kernel/sched/deadline.c
> > +++ b/kernel/sched/deadline.c
> > @@ -346,8 +346,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct 
> > task_struct *p,
> >   * one, and to (try to!) reconcile itself with its own scheduling
> >   * parameters.
> >   */
> > -static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
> > -  struct sched_dl_entity *pi_se)
> > +static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
> >  {
> > struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > struct rq *rq = rq_of_dl_rq(dl_rq);
> > @@ -367,8 +366,8 @@ static inline void setup_new_dl_entity(struct 
> > sched_dl_entity *dl_se,
> >  * future; in fact, we must consider execution overheads (time
> >  * spent on hardirq context, etc.).
> >  */
> > -   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> > -   dl_se->runtime = pi_se->dl_runtime;
> > +   dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
> > +   dl_se->runtime = dl_se->dl_runtime;
> >  }
> >  
> >  /*
> > @@ -1721,7 +1720,7 @@ static void switched_from_dl(struct rq *rq, struct 
> > task_struct *p)
> >  static void switched_to_dl(struct rq *rq, struct task_struct *p)
> >  {
> > if (dl_time_before(p->dl.deadline, rq_clock(rq)))
> > -   setup_new_dl_entity(&p->dl, &p->dl);
> > +   setup_new_dl_entity(&p->dl);
> >  
> > if (task_on_rq_queued(p) && rq->curr != p) {
> >  #ifdef CONFIG_SMP
> 


Re: [PATCH v7 REPOST 8/9] arm: add sysfs cpu_capacity attribute

2016-11-02 Thread Juri Lelli
Hi,

apologies for the delay in replying, but I'm attending Linux Plumbers
this week.

On 30/10/16 20:45, Russell King - ARM Linux wrote:
> On Mon, Oct 17, 2016 at 04:46:49PM +0100, Juri Lelli wrote:
> > +#ifdef CONFIG_PROC_SYSCTL
> > +#include 
> > +#include 
> 
> Include files at the top of the file please.  No need to ifdef them.
> They're sorted alphabetically, so new additions should be alphabetical.
> (That's a general rule - if something is already alphabetical, do not
> make it non-alphabetical.)
> 
> > +static ssize_t show_cpu_capacity(struct device *dev,
> > +struct device_attribute *attr,
> > +char *buf)
> > +{
> > +   struct cpu *cpu = container_of(dev, struct cpu, dev);
> > +   ssize_t rc;
> > +   int cpunum = cpu->dev.id;
> > +   unsigned long capacity = arch_scale_cpu_capacity(NULL, cpunum);
> > +
> > +   rc = sprintf(buf, "%lu\n", capacity);
> > +
> > +   return rc;
> 
> Way too many lines for such a simple function.  This can be simplified
> to just:
> 
>   struct cpu *cpu = container_of(dev, struct cpu, dev);
> 
>   return sprintf(buf, "%lu\n", arch_scale_cpu_capacity(NULL, cpu->dev.id);
> 
> If you don't like the last line ending on column 79, then feel free to
> break it across two lines after the format string.
> 
> > +}
> > +
> > +static ssize_t store_cpu_capacity(struct device *dev,
> > + struct device_attribute *attr,
> > + const char *buf,
> > + size_t count)
> > +{
> > +   struct cpu *cpu = container_of(dev, struct cpu, dev);
> > +   int this_cpu = cpu->dev.id, i;
> > +   unsigned long new_capacity;
> > +   ssize_t ret;
> > +
> > +   if (count) {
> > +   char *p = (char *) buf;
> > +
> > +   ret = kstrtoul(p, 0, &new_capacity);
> 
> Unnecessary cast - kstrtoul takes a const char pointer, and in any case
> it's really bad form to cast away the "const-ness" of any pointer.  So,
> just:
> 
>   if (count) {
>   ret = kstrtoul(buf, 0, &new_capacity);
> 
> should work just fine.
> 
> > +   if (ret)
> > +   return ret;
> > +   if (new_capacity > SCHED_CAPACITY_SCALE)
> > +   return -EINVAL;
> > +
> > +   mutex_lock(&cpu_scale_mutex);
> > +   for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
> > +   set_capacity_scale(i, new_capacity);
> > +   mutex_unlock(&cpu_scale_mutex);
> > +   }
> > +
> > +   return count;
> > +}
> > +
> > +static DEVICE_ATTR(cpu_capacity,
> > +  0644,
> > +  show_cpu_capacity,
> > +  store_cpu_capacity);
> 
> There's a move to use the named DEVICE_ATTR_RW() for this kind of thing,
> it'll want the functions named xxx_show() and xxx_store().  I see
> there's some recent patches to do this conversion across the kernel, so
> this should probably be done before submission.
> 
> > +
> > +static int register_cpu_capacity_sysctl(void)
> > +{
> > +   int i;
> > +   struct device *cpu;
> > +
> > +   for_each_possible_cpu(i) {
> > +   cpu = get_cpu_device(i);
> > +   if (!cpu) {
> > +   pr_err("%s: too early to get CPU%d device!\n",
> > +  __func__, i);
> > +   continue;
> > +   }
> > +   device_create_file(cpu, &dev_attr_cpu_capacity);
> > +   }
> > +
> > +   return 0;
> > +}
> > +late_initcall(register_cpu_capacity_sysctl);
> 
> Hmm, this is really weird.  topology_init() in arch/arm/kernel/setup.c
> is where these devices get created, and they're created at
> subsys_initcall() time.  By that point, the list of possible CPUs has
> to be static, it's not going to change.  I don't see why this has to be
> done at late_initcall() - and since topology.c will be linked after
> setup.c, I don't see why it shouldn't be at subsys_initcall() level to
> follow on after topology_init().
> 

I should have addressed your comments with the updated version below. If
it looks good to you, I'll supersede the old version with this new one.

Best,

- Juri

--->8---
>From 14c0f21d403ad47843896eecc042334d4e0ed8dd Mon Sep 17 00:00:00 2001
From: Juri Lelli 
Date: Thu, 15 Oct 2015 13:53:37 +0100
Subject: [PATCH v7.1 8/9] arm: 

Re: [PATCH v7 REPOST 0/9] CPUs capacity information for heterogeneous systems

2016-11-02 Thread Juri Lelli
Hi Catalin,

On 30/10/16 14:22, Catalin Marinas wrote:
> On Mon, Oct 17, 2016 at 04:46:41PM +0100, Juri Lelli wrote:
> > I'm thus now assuming that everybody is OK with the patches and that they 
> > can
> > be queued for 4.10 (we certainly need this plumbing at this point). Please
> > speak if my assumption is wrong (and provide feedback! :).
> > Otherwise I'm going to:
> > 
> >  - use Russell's patching system for patches 2 and 8
> >  - ask Sudeep to pull patches 3,5,6 and 7
> >  - ask Catalin/Will to pull patches 1,4 and 9
> 
> I'm happy to queue patches 1, 4 and 9 for 4.10 (though it might have
> been easier for the whole series to go through arm-soc).
> 
> > Do you think we might get into trouble splitting the merge process this way?
> 
> Probably not. The only minor downside is that I have to grab a new DT
> for Juno from Sudeep to test the patches. Not an issue, though.
> 

Thanks and apologies if merging through different trees generates some
confusion.

I updated the arm patches to address Russell's comments. I did the same for
arm64. I'll reply with the updated version, so you can see if it looks
good to you as well. In case it is OK, I already updated the for-arm64
branch with the new version:
 
 git://linux-arm.org/linux-jl.git upstream/default_caps_for-arm64 

Best,

- Juri


Re: [PATCH v7 REPOST 9/9] arm64: add sysfs cpu_capacity attribute

2016-11-02 Thread Juri Lelli
Hi,

small update to be in sync with Russell's comments on the corresponding
arm patch.

On 17/10/16 16:46, Juri Lelli wrote:
> Add a sysfs cpu_capacity attribute with which it is possible to read and
> write (thus over-writing default values) CPUs capacity. This might be
> useful in situations where values needs changing after boot.
> 
> The new attribute shows up as:
> 
>  /sys/devices/system/cpu/cpu*/cpu_capacity
> 
> Cc: Catalin Marinas 
> Cc: Will Deacon 
> Cc: Mark Brown 
> Cc: Sudeep Holla 
> Signed-off-by: Juri Lelli 
> ---
> 
> Changes from v5:
>   - add mutex to protect cpu_scale (as pointed out by Morten off-line)

--->8---
>From 17684f3db6d74342da424997badcb3714a1a9e63 Mon Sep 17 00:00:00 2001
From: Juri Lelli 
Date: Wed, 14 Oct 2015 12:02:05 +0100
Subject: [PATCH v7.1 9/9] arm64: add sysfs cpu_capacity attribute

Add a sysfs cpu_capacity attribute with which it is possible to read and
write (thus over-writing the default values) CPU capacities. This might be
useful in situations where values need changing after boot.

The new attribute shows up as:

 /sys/devices/system/cpu/cpu*/cpu_capacity

Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Mark Brown 
Cc: Sudeep Holla 
Signed-off-by: Juri Lelli 
---

Changes from v5:
  - add mutex to protect cpu_scale (as pointed out by Morten off-line)

Changes from v7:
  - include files moved to the top of the file
  - show_cpu_capacity simplified to fewer lines of code
  - unnecessary cast removed in store_cpu_capacity
  - use DEVICE_ATTR_RW() instead of DEVICE_ATTR()
  - use subsys_initcall instead of late_initcall
---
 arch/arm64/kernel/topology.c | 64 
 1 file changed, 64 insertions(+)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index b75b0ba2e113..23e9e13bd2aa 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -20,12 +20,15 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
+#include 
 #include 
 #include 
 
 static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+static DEFINE_MUTEX(cpu_scale_mutex);
 
 unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
@@ -37,6 +40,65 @@ static void set_capacity_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
+#ifdef CONFIG_PROC_SYSCTL
+static ssize_t cpu_capacity_show(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+
+   return sprintf(buf, "%lu\n",
+   arch_scale_cpu_capacity(NULL, cpu->dev.id));
+}
+
+static ssize_t cpu_capacity_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+   struct cpu *cpu = container_of(dev, struct cpu, dev);
+   int this_cpu = cpu->dev.id, i;
+   unsigned long new_capacity;
+   ssize_t ret;
+
+   if (count) {
+   ret = kstrtoul(buf, 0, &new_capacity);
+   if (ret)
+   return ret;
+   if (new_capacity > SCHED_CAPACITY_SCALE)
+   return -EINVAL;
+
+   mutex_lock(&cpu_scale_mutex);
+   for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
+   set_capacity_scale(i, new_capacity);
+   mutex_unlock(&cpu_scale_mutex);
+   }
+
+   return count;
+}
+
+static DEVICE_ATTR_RW(cpu_capacity);
+
+static int register_cpu_capacity_sysctl(void)
+{
+   int i;
+   struct device *cpu;
+
+   for_each_possible_cpu(i) {
+   cpu = get_cpu_device(i);
+   if (!cpu) {
+   pr_err("%s: too early to get CPU%d device!\n",
+  __func__, i);
+   continue;
+   }
+   device_create_file(cpu, &dev_attr_cpu_capacity);
+   }
+
+   return 0;
+}
+subsys_initcall(register_cpu_capacity_sysctl);
+#endif
+
 static u32 capacity_scale;
 static u32 *raw_capacity;
 static bool cap_parsing_failed;
@@ -87,6 +149,7 @@ static void normalize_cpu_capacity(void)
return;
 
pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale);
+   mutex_lock(&cpu_scale_mutex);
for_each_possible_cpu(cpu) {
pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n",
 cpu, raw_capacity[cpu]);
@@ -96,6 +159,7 @@ static void normalize_cpu_capacity(void)
pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
cpu, arch_scale_cpu_capacity(NULL, cpu));
}
+   mutex_unlock(&cpu_scale_mutex);
 }
 
 #ifdef CONFIG_CPU_FREQ
-- 
2.10.0
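
As a side note on exercising the attribute above: the following is a
minimal, purely illustrative userspace sketch (plain C, not part of this
series) that reads, and optionally writes, a CPU's cpu_capacity file. The
path comes from the patch above; values are expected in the
[0, SCHED_CAPACITY_SCALE] range and, per the store handler, a write is
propagated to all core siblings of the target CPU. Writing typically
requires root; error handling is kept minimal.

/*
 * Illustrative-only: read a CPU's cpu_capacity attribute and optionally
 * write a new value. Compile with: cc -o captool captool.c
 */
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	char path[128], buf[32];
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <cpu> [new_capacity]\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%s/cpu_capacity", argv[1]);

	if (argc > 2) {			/* write a new capacity value */
		f = fopen(path, "w");
		if (!f || fprintf(f, "%s\n", argv[2]) < 0) {
			perror(path);
			return 1;
		}
		fclose(f);
	}

	f = fopen(path, "r");		/* read back the current value */
	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror(path);
		return 1;
	}
	printf("cpu%s capacity: %s", argv[1], buf);
	fclose(f);
	return 0;
}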



Re: [PATCH v3] sched/deadline: Change the time to replenish runtime for sleep tasks

2017-02-28 Thread Juri Lelli
On 28/02/17 22:09, Byungchul Park wrote:
> On Tue, Feb 28, 2017 at 11:35:15AM +0000, Juri Lelli wrote:
> > Hi,
> > 
> > On 23/02/17 15:14, Byungchul Park wrote:
> > > Let's consider the following example.
> > > 
> > > timeline : o...o.o...o..o
> > >^   ^ ^   ^  ^
> > >|   | |   |  |
> > >start   | |   |  |
> > > original runtime |   |  |
> > >  sleep with (-)runtime   |  |
> > >  original deadline  |
> > >   wake up
> > > 
> > > When this task is woken up, a negative runtime should be considered,
> > > which means that the task should get penalized when assigning runtime,
> > > becasue it already spent more than expected. Current code handles this
> > > by replenishing a runtime in hrtimer callback for deadline. But this
> > > approach has room for improvement:
> > > 
> > >It will be replenished twice unnecessarily if the task sleeps for
> > >long time so that the deadline, assigned in the hrtimer callback,
> > >also passed. In other words, one happens in the callback and the
> > >other happens in update_dl_entiry() when waking it up.
> > > 
> > > So force to replenish it for sleep tasks when waking it up.
> > > 
> > > Signed-off-by: Byungchul Park 
> > > ---
> > >  kernel/sched/deadline.c | 13 ++---
> > >  1 file changed, 6 insertions(+), 7 deletions(-)
> > > 
> > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > index 27737f3..cb43ce9 100644
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -498,8 +498,9 @@ static void update_dl_entity(struct sched_dl_entity 
> > > *dl_se,
> > >   struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
> > >   struct rq *rq = rq_of_dl_rq(dl_rq);
> > >  
> > > - if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
> > > - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
> > > + if (dl_time_before(dl_se->deadline, rq_clock(rq)))
> > > + replenish_dl_entity(dl_se, pi_se);
> > > + else if (dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
> > >   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> > >   dl_se->runtime = pi_se->dl_runtime;
> > >   }
> > > @@ -621,13 +622,11 @@ static enum hrtimer_restart dl_task_timer(struct 
> > > hrtimer *timer)
> > >* __dequeue_task_dl()
> > >* prev->on_rq = 0;
> > >*
> > > -  * We can be both throttled and !queued. Replenish the counter
> > > -  * but do not enqueue -- wait for our wakeup to do that.
> > > +  * We can be both throttled and !queued. Wait for our wakeup to
> > > +  * replenish runtime and enqueue p.
> > >*/
> > > - if (!task_on_rq_queued(p)) {
> > > - replenish_dl_entity(dl_se, dl_se);
> > 
> > Hasn't this patch the same problem we discussed a couple of weeks ago?
> 
> No. This patch solves the problem by calling replenish_dl_entity() when
> a dl task is woken up.
> 

So, if the task was throttled in the "going to sleep" path we set the
replenishment timer to fire at your "original deadline" instant of time
above. Now, 3 things can happen I guess:

 - task wakes up before the replenishment timer ("original deadline")
   -> it is throttled, so we do nothing
 
 - task wakes up after the replenishment timer
   -> we replenished it in the timer callback (which considers negative
  runtime from previous execution)
  + deadline should be in the future
  + dl_entity shouldn't overflow
  + we don't touch its parameters, but we simply enqueue it back on dl_rq

 - task wakes up even after the deadline it has got from previous
   replenishment expired
   -> we assign it completely new parameters, but since it didn't
   use the previous runtime at all, this should be fine I guess

What am I still missing? :)
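
(Schematically, the three cases above map onto something like the
following purely illustrative C sketch; this is not kernel code, and
every name and number in it is invented for the example.)

#include <stdio.h>
#include <stdint.h>

enum wake_action { DO_NOTHING, ENQUEUE_ONLY, FRESH_PARAMETERS };

static enum wake_action classify_wakeup(uint64_t now, uint64_t repl_timer,
					uint64_t deadline_after_repl)
{
	if (now < repl_timer)
		return DO_NOTHING;	  /* case 1: still throttled      */
	if (now < deadline_after_repl)
		return ENQUEUE_ONLY;	  /* case 2: timer already did it */
	return FRESH_PARAMETERS;	  /* case 3: deadline also passed */
}

int main(void)
{
	/* timer fires at t=100, replenishment sets the deadline to t=200 */
	uint64_t wakeups[] = { 50, 150, 250 };
	const char *names[] = { "do nothing (throttled)",
				"enqueue with current parameters",
				"assign completely new parameters" };

	for (int i = 0; i < 3; i++)
		printf("wakeup at t=%3llu -> %s\n",
		       (unsigned long long)wakeups[i],
		       names[classify_wakeup(wakeups[i], 100, 200)]);
	return 0;
}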

> The problem was that it cannot consider negative runtime if we replenish
> the task when it's woken up. So I made replenish_dl_entity() called even
> on wake-up path, instead of simple assignment.
> 
> IMHO, this patch avoids double-replenishing properly, but adds additional
> condition on wake-up path to acheive it. To be honest, I don't think it's
> worth as I expected.
> 
> Thank you,
> Byungchul
> 
> > 
> > https://marc.info/?l=linux-kernel&m=148699950802995
> > 
> > Thanks,
> > 
> > - Juri


Re: [PATCH v3] sched/deadline: Change the time to replenish runtime for sleep tasks

2017-03-01 Thread Juri Lelli
On 01/03/17 12:37, Byungchul Park wrote:
> On Tue, Feb 28, 2017 at 05:47:53PM +0000, Juri Lelli wrote:
> > > > > Let's consider the following example.
> > > > > 
> > > > > timeline : o...o.o...o..o
> > > > >^   ^ ^   ^  ^
> > > > >|   | |   |  |
> > > > >start   | |   |  |
> > > > > original runtime |   |  |
> > > > >  sleep with (-)runtime   |  |
> > > > >  original deadline  |
> > > > >   wake up
> > > > > 
> > > > > When this task is woken up, a negative runtime should be considered,
> > > > > which means that the task should get penalized when assigning runtime,
> > > > > becasue it already spent more than expected. Current code handles this
> > > > > by replenishing a runtime in hrtimer callback for deadline. But this
> > > > > approach has room for improvement:
> > > > > 
> > > > >It will be replenished twice unnecessarily if the task sleeps for
> > > > >long time so that the deadline, assigned in the hrtimer callback,
> > > > >also passed. In other words, one happens in the callback and the
> > > > >other happens in update_dl_entiry() when waking it up.
> > > > > 
> > > > > So force to replenish it for sleep tasks when waking it up.
> > > > > 
> > > > > Signed-off-by: Byungchul Park 
> > > > > ---
> 
> ...
> 
> > So, if the task was throttled in the "going to sleep" path we set the
> > replenishment timer to fire at your "original deadline" instant of time
> 
> Hi,
> 
> Precisely speaking, we set the timer if the task was throttled, no
> matter whether it's in the 'going to sleep' path or not. IWO, the timer
> is set even in the path. And I want to say it's unnecessary.
> 
> > above. Now, 3 things can happen I guess:
> > 
> >  - task wakes up before the replenishment timer ("original deadline")
> >-> it is throttled, so we do nothing
> >  
> >  - task wakes up after the replenishment timer
> >-> we replenished it in the timer callback (which considers negative
> >   runtime from previous execution)
> >   + deadline should be in the future
> >   + dl_entity shouldn't overflow
> >   + we don't touch its parameters, but we simply enqueue it back on 
> > dl_rq
> > 
> >  - task wakes up even after the deadline it has got from previous
> >replenishment expired
> >-> we assign to him completely new parameters, but since he didn't
> >   use the previous runtime at all, this should be fine I guess
> 
> Exactly same as what I thought. I agree that current code works properly.
> However, the code has room for improvement because tasks being throttled
> in 'going to sleep' path do not need to set additional timer.
> 
> To be honest, I also tried to remove setting timer in the case, but it
> makes code a bit cluttered because I should handle the your first case
> above, that means I should add a proper timer on the wake-up path if
> necessary.
> 
> The try would be worthy if avoiding unnecessary timer is more important
> than making code cluttered. But I did not include the try since I'm not
> sure. What do you think about that? Which one is more important between
> avoiding unnecessary timer and avoiding making code cluttered?
> 

I'd err on the side of clarity and safety (where I assume the current
code is safer only because it has been tested for a while :). However,
if you can show in a reproducible manner that the changes you are
proposing significantly reduce overhead, we might still consider them.

Anyway, IMVHO, there are still lots of known issues and missing features
that deserve time; so, if you have some time to look at DEADLINE stuff,
I'd kindly point you at

https://github.com/jlelli/sched-deadline/wiki/TODOs

You would be very welcome to pick up any of the items above; in that case
we should synchronize, as several people are already working on some of
the aforementioned bits.

Thanks,

- Juri


Re: [PATCH 3/2] sched/deadline: Use deadline instead of period when calculating overflow

2017-02-15 Thread Juri Lelli
Hi,

On 15/02/17 08:40, Luca Abeni wrote:
> Hi Steven,
> 
> On Tue, 14 Feb 2017 19:14:17 -0500
> Steven Rostedt  wrote:
> [...]
> > > I am not sure about the correct fix (wouldn't
> > > "runtime / (deadline - t) > dl_runtime / dl_deadline" allow the
> > > task to use a fraction of CPU time equal to dl_runtime /
> > > dl_deadline?)
> > > 
> > > The current code is clearly wrong (as shown by Daniel), but I do not
> > > understand how the current check can allow the task to consume more
> > > than dl_runtime / dl_period... I need some more time to think about
> > > this issue. 
> > >   
> > 
> > This is in dl_entity_overflow() which is called by update_dl_entity()
> > which has this:
> > 
> > if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
> > dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
> > dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> > dl_se->runtime = pi_se->dl_runtime;
> > }
> > 
> > 
> > The comments in this code state:
> > 
> >  * The policy here is that we update the deadline of the entity only
> > if:
> >  *  - the current deadline is in the past,
> >  *  - using the remaining runtime with the current deadline would make
> >  *the entity exceed its bandwidth.
> > 
> > That second comment is saying that when this task woke up, if the
> > percentage left to run will exceed its bandwidth with the rest of the
> > system then reset its deadline and its runtime.
> 
> Right; this is the problem. When the relative deadline is different
> from the period, the term "bandwidth" is ambiguous... We can consider
> the utilisation (maximum runtime / period), or the density (maximum
> runtime / relative deadline). In some sense, the two approaches are
> both correct (if we use density, we are more pessimistic but we try to
> respect deadlines in a hard way; if we use utilisation, we allow more
> tasks to be admitted but we can only provide bounded tardiness).
> 
> What the current code is doing is to mix the two approaches (resulting
> in a wrong runtime/deadline assignment).
> 
> > What happens in the current logic, is that overflow() check says, when
> > the deadline is much smaller than the period, "yeah, we're going to
> > exceed our percentage!" so give us more, even though it wont exceed
> > its percentage if we compared runtime with deadline.
> > 
> > The relative-runtime / relative-period is a tiny percentage, which
> > does not reflect the percentage that the task is allowed to have
> > before the deadline is hit. The tasks bandwidth should be calculated
> > by the relative-runtime / relative-deadline, as runtime <= deadline
> > <= period, and the runtime should happen within the deadline.
> > 
> > When the task wakes up, it currently looks at how much time is left
> > absolute-deadline - t, and compares it to the amount of runtime left.
> > The percentage allowed should still be compared with the percentage
> > between relative-runtime and relative-deadline. The relative-period or
> > even absolute-period, should have no influence in this decision.
> 
> Ok, thanks; I think I can now see why this can result in a task
> consuming more than the reserved utilisation. I still need some time to
> convince me that "runtime / (deadline - t) > dl_runtime / dl_deadline"
> is the correct check to use (in this case, shouldn't we also change the
> admission test to use densities instead of utilisations?)
> 

Right, this is what I was wondering as well, as dl_overflow() currently
looks at the period. And I also have some recollection of this
discussion happening already in the past, unfortunately it was not on
the list.

That discussion started with the following patch

--->8---
From 6cd9b6f3c2b9f144828aa09ad2a355b00a153348 Mon Sep 17 00:00:00 2001
From: Juri Lelli 
Date: Fri, 4 Sep 2015 15:41:42 +0100
Subject: [PATCH] sched/core: fix SCHED_DEADLINE admission control

As Documentation/sched/sched-deadline.txt says, a new task can pass
through admission control if sum(WCET_i / min{D_i, P_i}) <= 1.
However, if the user specifies both sched_period and sched_deadline,
we actually check that sum(WCET_i / P_i) <= 1; and this is a less
restrictive check w.r.t. the former.

Fix this by always using sched_deadline parameter to compute new_bw,
as we also impose that runtime <= deadline <= period (if period != 0)
and deadline != 0.

Fixes: 4df1638cfaf9 ("sched/deadline: Fix overflow to handle period==0 and 
deadline!=0")
Signed-off-by: Juri Lelli 
---
 kernel/sched/core.c | 4 
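
As a concrete, made-up illustration of why the period-based admission
test mentioned in the patch above is less restrictive than the
deadline-based one, consider a task with runtime 5ms, deadline 10ms and
period 100ms. The sketch below (plain userspace C, not kernel code)
simply evaluates both sums for 15 such tasks on a single CPU; the
parameters are invented for the example.

#include <stdio.h>

int main(void)
{
	/* 15 identical tasks: runtime 5ms, deadline 10ms, period 100ms */
	const double runtime = 5.0, deadline = 10.0, period = 100.0;
	const int ntasks = 15;

	double util_sum = ntasks * (runtime / period);	  /* 0.75 */
	double dens_sum = ntasks * (runtime / deadline);  /* 7.5  */

	printf("utilization sum = %.2f -> %s\n", util_sum,
	       util_sum <= 1.0 ? "admitted" : "rejected");
	printf("density sum     = %.2f -> %s\n", dens_sum,
	       dens_sum <= 1.0 ? "admitted" : "rejected");
	return 0;
}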

Re: [PATCH] sched/deadline: Remove unnecessary condition in push_dl_task()

2017-02-15 Thread Juri Lelli
[+Steve, Luca]

Hi,

On 15/02/17 14:11, Byungchul Park wrote:
> Once pick_next_pushable_dl_task(rq) return a task, it guarantees that
> the task's cpu is rq->cpu, so task_cpu(next_task) is always rq->cpu if
> task == next_task. Remove a redundant condition and make code simpler.
> 
> Signed-off-by: Byungchul Park 
> ---
>  kernel/sched/deadline.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 27737f3..ad8d577 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1483,7 +1483,7 @@ static int push_dl_task(struct rq *rq)
>* then possible that next_task has migrated.
>*/
>   task = pick_next_pushable_dl_task(rq);
> - if (task_cpu(next_task) == rq->cpu && task == next_task) {
> + if (task == next_task) {

Seems a sensible optimization to me. Actually, we are doing the same for
rt.c; Steve, Peter, do you think we should optimize that as well?

Thanks,

- Juri


Re: [PATCH 3/2] sched/deadline: Use deadline instead of period when calculating overflow

2017-02-15 Thread Juri Lelli
On 15/02/17 13:31, Luca Abeni wrote:
> Hi Juri,
> 
> On Wed, 15 Feb 2017 10:29:19 +0000
> Juri Lelli  wrote:
> [...]
> > > Ok, thanks; I think I can now see why this can result in a task
> > > consuming more than the reserved utilisation. I still need some
> > > time to convince me that "runtime / (deadline - t) > dl_runtime /
> > > dl_deadline" is the correct check to use (in this case, shouldn't
> > > we also change the admission test to use densities instead of
> > > utilisations?) 
> > 
> > Right, this is what I was wondering as well, as dl_overflow()
> > currently looks at the period. And I also have some recollection of
> > this discussion happening already in the past, unfortunately it was
> > not on the list.
> > 
> > That discussion started with the following patch
> [...]
that we then decided not to propose since (note that these are just my
memories of the discussion, so everything is up for further discussion,
also in light of the problem highlighted by Daniel)
> > 
> >  - SCHED_DEADLINE, as the documentation says, does AC using
> > utilization
> >  - it is however true that a sufficient (but not necessary) test on
> > UP for D_i != P_i cases is the one of my patch above
> >  - we have agreed in the past that the kernel should only check that
> > we don't cause "overload" in the system (which is still the case if we
> >consider utilizations), not "hard schedulability"
> I remember a similar discussion; I think the decision about what to do
> depends on what are the requirements: hard deadline guarantees (but in
> this case global EDF is just a bad choice) or tardines no overload
> guarantees?
> 
> My understanding was that the kernel guarantees that deadline tasks
> will not starve non-deadline tasks, and that there is an upper bound
> for the tardiness experienced by deadline tasks. If this understanding
> is correct, then the current admission test is ok. But if I
> misunderstood the purpose of the kernel admission test, then maybe your
> patch is ok.
> 
> Then, it is important to keep the admission test consistent with the
> checks performed in dl_entity_overflow() (but whatever we decide to do,
> dl_entity_overflow() should be fixed).
> 

I'm sorry, but I'm a bit lost. :(

Why do you say 'whatever we decide to do'?

In my understanding:

 - if we decide AC shouldn't change (as we care about not-starving
   others and having bounded tardiness), then I'd say dl_entity_overflow
   shouldn't change as well, since it's using dl_runtime/dl_period as
   'static bandwidth' (as AC does)
 
 - if we instead move to use densities when doing AC
   (dl_runtime/dl_deadline), I think we should also change the check in
   dl_entity_overflow, as Steve is proposing

 - in both cases Daniel's fixes look sensible to have

Where am I wrong? :)

Actually, another thing that we noticed, talking on IRC with Peter, is
that we seem to be replenishing differently on different occasions:

 - on wakeup (if overflowing) we do

   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
   dl_se->runtime = pi_se->dl_runtime;

 - when the replenishment timer fires (un-throttle and with runtime < 0)

   dl_se->deadline += pi_se->dl_period;
   dl_se->runtime += pi_se->dl_runtime;

Isn't this problematic as well?

Thanks,

- Juri


Re: [PATCH 3/2] sched/deadline: Use deadline instead of period when calculating overflow

2017-02-15 Thread Juri Lelli
On 15/02/17 14:13, Luca Abeni wrote:
> On Wed, 15 Feb 2017 12:59:25 +
> Juri Lelli  wrote:
> 
> > On 15/02/17 13:31, Luca Abeni wrote:
> > > Hi Juri,
> > > 
> > > On Wed, 15 Feb 2017 10:29:19 +
> > > Juri Lelli  wrote:
> > > [...]  
> > > > > Ok, thanks; I think I can now see why this can result in a task
> > > > > consuming more than the reserved utilisation. I still need some
> > > > > time to convince me that "runtime / (deadline - t) >
> > > > > dl_runtime / dl_deadline" is the correct check to use (in this
> > > > > case, shouldn't we also change the admission test to use
> > > > > densities instead of utilisations?)   
> > > > 
> > > > Right, this is what I was wondering as well, as dl_overflow()
> > > > currently looks at the period. And I also have some recollection
> > > > of this discussion happening already in the past, unfortunately
> > > > it was not on the list.
> > > > 
> > > > That discussion started with the following patch  
> > > [...]  
> > > > that we then dediced not to propose since (note that these are
> > > > just my memories of the dicussion, so everything it's up for
> > > > further discussion, also in light of the problem highlighted by
> > > > Daniel)
> > > > 
> > > >  - SCHED_DEADLINE, as the documentation says, does AC using
> > > > utilization
> > > >  - it is however true that a sufficient (but not necessary) test
> > > > on UP for D_i != P_i cases is the one of my patch above
> > > >  - we have agreed in the past that the kernel should only check
> > > > that we don't cause "overload" in the system (which is still the
> > > > case if we consider utilizations), not "hard schedulability"  
> > > I remember a similar discussion; I think the decision about what to
> > > do depends on what are the requirements: hard deadline guarantees
> > > (but in this case global EDF is just a bad choice) or tardines no
> > > overload guarantees?
> > > 
> > > My understanding was that the kernel guarantees that deadline tasks
> > > will not starve non-deadline tasks, and that there is an upper bound
> > > for the tardiness experienced by deadline tasks. If this
> > > understanding is correct, then the current admission test is ok.
> > > But if I misunderstood the purpose of the kernel admission test,
> > > then maybe your patch is ok.
> > > 
> > > Then, it is important to keep the admission test consistent with the
> > > checks performed in dl_entity_overflow() (but whatever we decide to
> > > do, dl_entity_overflow() should be fixed).
> > >   
> > 
> > I'm sorry, but I'm a bit lost. :(
> > 
> > Why do you say 'whatever we decide to do'?
> > 
> > In my understanding:
> > 
> >  - if we decide AC shouldn't change (as we care about not-starving
> >others and having bounded tardiness), then I'd say
> > dl_entity_overflow shouldn't change as well, since it's using
> > dl_runtime/dl_period as 'static bandwidth' (as AC does)
> 
> Yes, but it is comparing dl_runtime/dl_period with
> runtime/(deadline-t), mixing different things. I still need to think
> more about this, but I think it should either compare
> runtime/(deadline-t) with dl_runtime/dl_deadline or
> runtime/(end_of_reservation_period-t) with dl_runtime/dl_period...
> Otherwise we risk to have issues as shown by Daniel and Steven.

OK.

> 
> 
> >  - if we instead move to use densities when doing AC (dl_runtime/dl_
> >deadline), I think we should also change the check in dl_entity_
> >overflow, as Steve is proposing
> > 
> >  - in both cases Daniel's fixes look sensible to have
> Yes, Daniel's fixes fix a possible DoS, so they should go in... Then,
> we can decide how to improve the situation.
> 
> > 
> > Where am I wrong? :)
> > 
> > Actually, another thing that we noticed, talking on IRC with Peter, is
> > that we seem to be replenishing differently on different occasions:
> > 
> >  - on wakeup (if overflowing) we do
> > 
> >dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> >dl_se->runtime = pi_se->dl_runtime;
> > 
> >  - when the replenishment timer fires (un-thottle and with runtime <
> > 0)
> > 
> >dl_se->deadline += pi_se->dl_period;
> >dl_se->runtime += pi_se->dl_runtime;
> > 
> > Isn't this problematic as well?
> I _think_ this is correct, because they are two different things: in
> the first case, we generate a new scheduling deadline starting from
> current time (so, the deadline must be computed based on the relative
> deadline); in the second case, we postpone an existing scheduling
> deadline (so, it must be postponed by one period)[*]... No? Or am I
> misunderstanding the issue you saw?
> 

No, what you are saying makes sense, we don't actually have a problem.


Re: [PATCH 3/2] sched/deadline: Use deadline instead of period when calculating overflow

2017-02-15 Thread Juri Lelli
On 15/02/17 14:33, Daniel Bristot de Oliveira wrote:
> On 02/15/2017 01:59 PM, Juri Lelli wrote:
> > Actually, another thing that we noticed, talking on IRC with Peter, is
> > that we seem to be replenishing differently on different occasions:
> 
> When a task is awakened (not by the replenishment timer), it is not
> possible to know if the absolute deadline var stores the absolute
> deadline of activation which took place in the instant
> (current time) - dl_period.
> 
> Therefore, assuming the next deadline is one dl_deadline away from now
> is correct.
> 
> IOW: that is a sporadic activation - the task is activated after at
> least minimum inter-arrival time between activation/replenishment:
> 
> >  - on wakeup (if overflowing) we do
> > 
> >dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
> >dl_se->runtime = pi_se->dl_runtime;
> 
> 
> In the replenishment timer, it is known that the absolute deadline
> instant of the previous activation is in the deadline var. So
> putting the absolute deadline one dl_period away is correct [1].
> 
> Another point is that this case avoids creating time drift due
> to latencies. For instance, in the case of a 1 ms delay of the timer
> (interrupts disabled?), the wakeup replenishment would push the
> absolute a relative deadline + 1 ms away from the previous deadline.
> 
> IOW: the replenishment timer makes the periodic case - a fixed time
> offset from the previous activation/replenishment.
> 
> >  - when the replenishment timer fires (un-thottle and with runtime < 0)
> > 
> >dl_se->deadline += pi_se->dl_period;
> >dl_se->runtime += pi_se->dl_runtime;
> 
> So I think it is correct. Am I missing something?
> 

Nope, you are right, no problems here.

Thanks,

- Juri
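
(To make Daniel's point about time drift concrete, here is a small,
purely illustrative C sketch with made-up numbers: postponing by
dl_period keeps the periodic case anchored to the original schedule even
when the replenishment event is handled late, while re-anchoring at the
current time would accumulate the latency at every activation. Not
kernel code.)

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t dl_period = 100, dl_deadline = 100, latency = 1;
	uint64_t dl_periodic = 100;	/* timer rule: deadline += dl_period */
	uint64_t dl_anchored = 100;	/* wakeup rule: deadline = now + D   */

	for (int i = 1; i <= 3; i++) {
		/* the replenishment event is handled 'latency' after it was due */
		uint64_t now = dl_anchored + latency;

		dl_periodic += dl_period;	   /* periodic rule: no drift  */
		dl_anchored = now + dl_deadline;   /* re-anchoring: drifts     */

		printf("activation %d: periodic=%llu  re-anchored=%llu\n", i,
		       (unsigned long long)dl_periodic,
		       (unsigned long long)dl_anchored);
	}
	return 0;
}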


Re: [PATCH] sched/deadline: Remove unnecessary condition in push_dl_task()

2017-02-15 Thread Juri Lelli
On 15/02/17 09:25, Steven Rostedt wrote:
> On Wed, 15 Feb 2017 10:47:49 +
> Juri Lelli  wrote:
> 
> > [+Steve, Luca]
> > 
> > Hi,
> > 
> > On 15/02/17 14:11, Byungchul Park wrote:
> > > Once pick_next_pushable_dl_task(rq) return a task, it guarantees that
> > > the task's cpu is rq->cpu, so task_cpu(next_task) is always rq->cpu if
> > > task == next_task. Remove a redundant condition and make code simpler.
> > > 
> > > Signed-off-by: Byungchul Park 
> > > ---
> > >  kernel/sched/deadline.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > index 27737f3..ad8d577 100644
> > > --- a/kernel/sched/deadline.c
> > > +++ b/kernel/sched/deadline.c
> > > @@ -1483,7 +1483,7 @@ static int push_dl_task(struct rq *rq)
> > >* then possible that next_task has migrated.
> > >*/
> > >   task = pick_next_pushable_dl_task(rq);
> > > - if (task_cpu(next_task) == rq->cpu && task == next_task) {
> > > + if (task == next_task) {  
> > 
> > Seems a sensible optimization to me. Actually, we are doing the same for
> > rt.c; Steve, Peter, do you think we should optimize that as well?
> > 
> 
> Are we doing the same for push_rt_task()? I don't see it, and I don't
> see it in tip/sched/core either. What I have is:
> 
>   if (task_cpu(next_task) == rq->cpu && task == next_task) {

Sorry, bad wording on my side. I meant that we are currently checking the
same conditions for both DL and RT, and we should probably optimize RT as
well if we are going to take this patch.

> 
> But that said, I believe this patch is correct, and we should change
> rt.c as well.
> 

That's what I meant. :)

> 
>   task = pick_next_pushable_dl_task(rq);
> 
> Which has:
> 
>   BUG_ON(rq->cpu != task_cpu(task))
> 
> when it returns a task other than NULL. Which means that task_cpu(task)
> must be rq->cpu. Then if task == next_task, then task_cpu(next_task)
> must be rq->cpu as well.

Right.

> 
> Reviewed-by: Steven Rostedt (VMware) 
> 

You can also add mine

Reviewed-by: Juri Lelli 

> Mind fixing rt.c if it hasn't been fixed already.
> 
> -- Steve


Re: [PATCH v3 2/2] sched/rt: Remove unnecessary condition in push_rt_task()

2017-02-16 Thread Juri Lelli
Hi,

On 16/02/17 16:15, byungchul.park wrote:
> > -Original Message-
> > From: Steven Rostedt [mailto:rost...@goodmis.org]
> > Sent: Thursday, February 16, 2017 11:46 AM
> > To: Byungchul Park
> > Cc: pet...@infradead.org; mi...@kernel.org; linux-kernel@vger.kernel.org;
> > juri.le...@gmail.com; kernel-t...@lge.com
> > Subject: Re: [PATCH v3 2/2] sched/rt: Remove unnecessary condition in
> > push_rt_task()
> > 
> > On Thu, 16 Feb 2017 11:34:17 +0900
> > Byungchul Park  wrote:
> > 
> > > pick_next_pushable_task(rq) has BUG_ON(rq_cpu != task_cpu(task)) when
> > > it returns a task other than NULL, which means that task_cpu(task) must
> > > be rq->cpu. So if task == next_task, then task_cpu(next_task) must be
> > > rq->cpu as well. Remove the redundant condition and make code simpler.
> > >
> > > By this patch, unnecessary one branch and two LOAD operations in 'if'
> > > statement can be avoided.
> > >
> > > Signed-off-by: Byungchul Park 
> > > Reviewed-by: Steven Rostedt (VMware) 
> > > Reviewed-by: Juri Lelli 
> > > ---
> > 
> > This is a different patch, I don't believe Juri reviewed it yet.
> 
> Hello juri,
> 
> I should have asked you first and been more careful before I added
> 'reviewed-by'. Can I ask you for your opinion about the additional one?
> 

Looks good to me, you can leave my Reviewed-by.

Steve, thanks for pointing out that I hadn't yet reviewed it. :)

Thanks,

- Juri


Re: [PATCH] sched/deadline: Remove redundant code replenishing runtime

2017-02-10 Thread Juri Lelli
Hi,

On 10/02/17 18:11, Byungchul Park wrote:
> For a task passing its deadline while !rq, it will be replenished
> in the following path because dl_se->deadline < rq_lock.
> 
>enqueue_dl_entity(ENQUEUE_WAKEUP)
>   update_dl_entity
> 
> Therefore, code replenishing it in the timer callback in the case is
> unnecessary. This is not for enhancing performance but just for removing
> a redundant code.
> 
> Signed-off-by: Byungchul Park 
> ---
>  kernel/sched/deadline.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 27737f3..9c77696 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -624,10 +624,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer 
> *timer)
>* We can be both throttled and !queued. Replenish the counter
>* but do not enqueue -- wait for our wakeup to do that.
>*/
> - if (!task_on_rq_queued(p)) {
> - replenish_dl_entity(dl_se, dl_se);

I think we actually want to replenish and set the next deadline at this
point in time, not the one we get when the task eventually wakes up.

Best,

- Juri


Re: [PATCH v2 6/9] arm, arm64: factorize common cpu capacity default code

2017-02-13 Thread Juri Lelli
Hi Greg,

On 10/02/17 15:28, Greg KH wrote:
> On Thu, Feb 09, 2017 at 09:25:22AM +0000, Juri Lelli wrote:
> > arm and arm64 share lot of code relative to parsing CPU capacity
> > information from DT, using that information for appropriate scaling and
> > exposing a sysfs interface for chaging such values at runtime.
> > 
> > Factorize such code in a common place (driver/base/arch_topology.c) in
> > preparation for further additions.
> > 
> > Suggested-by: Will Deacon 
> > Suggested-by: Mark Rutland 
> > Suggested-by: Catalin Marinas 
> > Cc: Russell King 
> > Cc: Catalin Marinas 
> > Cc: Will Deacon 
> > Cc: Greg Kroah-Hartman 
> > Signed-off-by: Juri Lelli 
> > ---
> > 
> > Changes from v1:
> >  - keep the original GPLv2 header
> > ---
> >  arch/arm/Kconfig |   1 +
> >  arch/arm/kernel/topology.c   | 213 ++
> >  arch/arm64/Kconfig   |   1 +
> >  arch/arm64/kernel/topology.c | 219 +--
> >  drivers/base/Kconfig |   8 ++
> >  drivers/base/Makefile|   1 +
> >  drivers/base/arch_topology.c | 237 
> > +++
> >  7 files changed, 257 insertions(+), 423 deletions(-)
> >  create mode 100644 drivers/base/arch_topology.c
> 
> Ah, so you want _me_ to maintain this, ok, I better review it...
> 

This has been suggested as a possible way to stop replicating code between arm
and arm64 (and possibly other archs in the future). Are you in principle OK
with it?

Thanks a lot for your comments, please find my answers below.

> > --- a/drivers/base/Kconfig
> > +++ b/drivers/base/Kconfig
> > @@ -339,4 +339,12 @@ config CMA_ALIGNMENT
> >  
> >  endif
> >  
> > +config GENERIC_ARCH_TOPOLOGY
> > +   bool
> > +   help
> > + Enable support for architectures common topology code: e.g., parsing
> > + CPU capacity information from DT, usage of such information for
> > + appropriate scaling, sysfs interface for changing capacity values at
> > +  runtime.
> 
> Mix of spaces and tabs :(
> 

Argh. :(

> > +
> >  endmenu
> > diff --git a/drivers/base/Makefile b/drivers/base/Makefile
> > index f2816f6ff76a..397e5c344e6a 100644
> > --- a/drivers/base/Makefile
> > +++ b/drivers/base/Makefile
> > @@ -23,6 +23,7 @@ obj-$(CONFIG_SOC_BUS) += soc.o
> >  obj-$(CONFIG_PINCTRL) += pinctrl.o
> >  obj-$(CONFIG_DEV_COREDUMP) += devcoredump.o
> >  obj-$(CONFIG_GENERIC_MSI_IRQ_DOMAIN) += platform-msi.o
> > +obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += arch_topology.o
> >  
> >  obj-y  += test/
> >  
> > diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> > new file mode 100644
> > index ..c1dd430adad2
> > --- /dev/null
> > +++ b/drivers/base/arch_topology.c
> > @@ -0,0 +1,237 @@
> > +/*
> > + * driver/base/arch_topology.c - Arch specific cpu topology information
> 
> No need to keep the filename in the file, you know what it is called :)
> 

OK, removed.

> > + *
> > + * Copyright (C) 2016, ARM Ltd.
> > + * Written by: Juri Lelli, ARM Ltd.
> > + *
> > + * This file is subject to the terms and conditions of the GNU General 
> > Public
> > + * License.  See the file "COPYING" in the main directory of this archive
> > + * for more details.
> 
> So, v2 only?  Please be specific.  Even better yet, use a SPDX header if
> you want to, those are always nice.
> 

Yes, v2 only.

  * for more details.
+ *
+ * Released under the GPLv2 only.
+ * SPDX-License-Identifier: GPL-2.0

Would do, right?

> > + */
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +static DEFINE_MUTEX(cpu_scale_mutex);
> > +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
> > 

Re: [PATCH] sched/deadline: Remove redundant code replenishing runtime

2017-02-13 Thread Juri Lelli
[+Luca]

On 13/02/17 13:29, Byungchul Park wrote:
> On Mon, Feb 13, 2017 at 11:30:09AM +0900, Byungchul Park wrote:
> > On Fri, Feb 10, 2017 at 01:39:33PM +0000, Juri Lelli wrote:
> > > Hi,
> > > 
> > > On 10/02/17 18:11, Byungchul Park wrote:
> > > > For a task passing its deadline while !rq, it will be replenished
> > > > in the following path because dl_se->deadline < rq_lock.
> > > > 
> > > >enqueue_dl_entity(ENQUEUE_WAKEUP)
> > > >   update_dl_entity
> > > > 
> > > > Therefore, code replenishing it in the timer callback in the case is
> > > > unnecessary. This is not for enhancing performance but just for removing
> > > > a redundant code.
> > > > 
> > > > Signed-off-by: Byungchul Park 
> > > > ---
> > > >  kernel/sched/deadline.c | 4 +---
> > > >  1 file changed, 1 insertion(+), 3 deletions(-)
> > > > 
> > > > diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> > > > index 27737f3..9c77696 100644
> > > > --- a/kernel/sched/deadline.c
> > > > +++ b/kernel/sched/deadline.c
> > > > @@ -624,10 +624,8 @@ static enum hrtimer_restart dl_task_timer(struct 
> > > > hrtimer *timer)
> > > >  * We can be both throttled and !queued. Replenish the counter
> > > >  * but do not enqueue -- wait for our wakeup to do that.
> > > >  */
> > > > -   if (!task_on_rq_queued(p)) {
> > > > -   replenish_dl_entity(dl_se, dl_se);
> > > 
> > > I think we actually want to replenish and set the next deadline at this
> > > point of time, not the one that we get when the task will eventually wake 
> > > up.
> > 
> > Hello juri,
> > 
> > But I wonder if it's meaningful to set a next deadline for a 'sleeping
> > task', which, rather, could be worse because its bandwidth might be
> > distorted at the time it's woken up.
> > 

What do you mean by 'distorted'? AFAIU, we just want to replenish when
needed. The instant when the task will eventually wake up is something we
cannot rely upon, and it could introduce errors.

IIUC, your situation looks like the below

   |---vxxx^ooo
   |   |   |
   |   |   |
sleep/throttle |   |
 r. timer  |
   wakeup

The task gets throttled while going to sleep; when the replenishment
timer fires you are proposing we do nothing, and we actually replenish
using the wakeup rq_clock() as reference. My worry is that, by doing so,
we make the task potentially lose some of its bandwidth, as we will
have lost some time (the 3 x-es in the diagram above) when calculating
its next dynamic deadline.

> > IMHO, it's neat to set its deadline and runtime when being woken up, in
> > the case already passed its deadline. Am I wrong?
> 
> And I found that dl_entity_overflow() returns true and replenishes the
> task unconditionally in update_dl_entity() again when the task is woken
> up, because 'runtime / (deadline - t) > dl_runtime / dl_period' is true.
> 

Why 'unconditionally'? It will postpone and replenish if the task is
going to overflow; if not, it will keep the runtime and deadline we set
when the replenishment timer fired.


Re: [PATCH 1/2] sched/deadline: Replenishment timer should fire in the next period

2017-02-13 Thread Juri Lelli
On 13/02/17 09:59, Steven Rostedt wrote:
> On Mon, 13 Feb 2017 12:10:25 +0100
> Peter Zijlstra  wrote:
> 
> 
> > I've interpreted this as:
> > 
> > Reviewed-by: Luca Abeni 
> > 
> > Holler if you disagree.
> 
> You can add mine too. I put in a lot of trace_printk()s and it all
> appears to be exactly as Daniel describes.
> 
> Reviewed-by: Steven Rostedt (VMware) 
> 

I'm a bit late, but I looked at it as well and it looks good.

If you want,

Reviewed-by: Juri Lelli 

Thanks!

- Juri


Re: [PATCH] sched/deadline: Remove redundant code replenishing runtime

2017-02-14 Thread Juri Lelli
On 14/02/17 11:06, Byungchul Park wrote:
> On Tue, Feb 14, 2017 at 08:42:43AM +0900, Byungchul Park wrote:
> > On Mon, Feb 13, 2017 at 03:24:55PM +0000, Juri Lelli wrote:
> > > > > > I think we actually want to replenish and set the next deadline at 
> > > > > > this
> > > > > > point of time, not the one that we get when the task will 
> > > > > > eventually wake up.
> > > > > 
> > > > > Hello juri,
> > > > > 
> > > > > But I wonder if it's meaningful to set a next deadline for a 'sleeping
> > > > > task', which, rather, could be worse because its bandwidth might be
> > > > > distorted at the time it's woken up.
> > > > > 
> > > 
> > > What you mean by 'distorted'. AFAIU, we just want to replenish when
> > > needed. The instant of time when the task will eventually wake up it is
> > > something we cannot rely upon, and could introduce errors.
> > > 
> > > IIUC, your situation looks like the below
> > 
> > Exactly.
> > 
> > > 
> > >|---vxxx^ooo
> > >|   |   |
> > >|   |   |
> > > sleep/throttle |   |
> > >  r. timer  |
> > >  wakeup
> > 
> > Sorry for bothering you..
> > 
> > > The task gets throttled while going to sleep, when the replenishment
> > > timer fires you are proposing we do nothing and we actually replenishing
> > > using the wakeup rq_clock() as reference. My worry is that, by doing so,
> > > we make the task potentially loose some of its bandwidth, as we will
> > > have lost some time (the 3 x-es in the diagram above) when calculating
> > > its next dynamic deadline.
> > 
> > I meant, when we decide whether it's overflowed in dl_entiry_overflow(),
> > 'right' might be smaller than 'left' because 't' is the time the 3 x-es
> > already passed.
> > 
> > Of course, here I assumed that runtime ~= 0 and deadline ~= rq_clock
> > when it was throttled, if scheduler works nicely.
> > 
> > > > > IMHO, it's neat to set its deadline and runtime when being woken up, 
> > > > > in
> > > > > the case already passed its deadline. Am I wrong?
> > > > 
> > > > And I found that dl_entity_overflow() returns true and replenishes the
> > > > task unconditionally in update_dl_entity() again when the task is woken
> > > > up, because 'runtime / (deadline - t) > dl_runtime / dl_period' is true.
> > > > 
> > > 
> > > Why 'unconditionally'? It will postpone and replenish if the task is
> > 
> > Not exactly 'unconditially' if my assumption is broken. Sorry for
> > choosing a word that is not careful.
> > 
> > > going to overflow, if not, it will keep its runtime and deadline we set
> > 
> > I meant the task will be almost always considered 'overflow', as I
> > explained above. So it will be overwritten again when waking up the task
> > than keep what we set in timer callback.
> 
> I don't want to argue strongly, keeping current code unchanged is ok.
> I just wanted to say it will be replenished twice in most cases if:
> 
>1. The task was throttled and passed its deadline while sleeping.
> 
> Of course, it also depends on how much negative runtime it had when
> throttled. Sorry for bothering you and thanks for explaining it.

No bother at all! Thanks for raising a potential problem, but I guess
we need to be correct 100% of the time, rather than optimizing for
the most common case.

Best,

- Juri


Re: [PATCH v3] sched/deadline: Change the time to replenish runtime for sleep tasks

2017-02-28 Thread Juri Lelli
Hi,

On 23/02/17 15:14, Byungchul Park wrote:
> Let's consider the following example.
> 
> timeline : o...o.o...o..o
>^   ^ ^   ^  ^
>|   | |   |  |
>start   | |   |  |
> original runtime |   |  |
>  sleep with (-)runtime   |  |
>  original deadline  |
>   wake up
> 
> When this task is woken up, a negative runtime should be considered,
> which means that the task should get penalized when assigning runtime,
> becasue it already spent more than expected. Current code handles this
> by replenishing a runtime in hrtimer callback for deadline. But this
> approach has room for improvement:
> 
>It will be replenished twice unnecessarily if the task sleeps for
>long time so that the deadline, assigned in the hrtimer callback,
>also passed. In other words, one happens in the callback and the
>other happens in update_dl_entiry() when waking it up.
> 
> So force to replenish it for sleep tasks when waking it up.
> 
> Signed-off-by: Byungchul Park 
> ---
>  kernel/sched/deadline.c | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 27737f3..cb43ce9 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -498,8 +498,9 @@ static void update_dl_entity(struct sched_dl_entity 
> *dl_se,
>   struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>   struct rq *rq = rq_of_dl_rq(dl_rq);
>  
> - if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
> - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
> + if (dl_time_before(dl_se->deadline, rq_clock(rq)))
> + replenish_dl_entity(dl_se, pi_se);
> + else if (dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
>   dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
>   dl_se->runtime = pi_se->dl_runtime;
>   }
> @@ -621,13 +622,11 @@ static enum hrtimer_restart dl_task_timer(struct 
> hrtimer *timer)
>* __dequeue_task_dl()
>* prev->on_rq = 0;
>*
> -  * We can be both throttled and !queued. Replenish the counter
> -  * but do not enqueue -- wait for our wakeup to do that.
> +  * We can be both throttled and !queued. Wait for our wakeup to
> +  * replenish runtime and enqueue p.
>*/
> - if (!task_on_rq_queued(p)) {
> - replenish_dl_entity(dl_se, dl_se);

Doesn't this patch have the same problem we discussed a couple of weeks ago?

https://marc.info/?l=linux-kernel&m=148699950802995

Thanks,

- Juri


Re: [PATCH] sched: deadline: Fix confusing comments about selection of top pi-waiter

2017-02-28 Thread Juri Lelli
Hi,

On 22/02/17 20:09, Joel Fernandes wrote:
> This comment in the code is incomplete, and I believe it begs a definition of
> dl_boosted to make sense of the condition that follows. Rewrite the comment 
> and
> also rearrange the condition that follows to reflect the first condition "we
> have a top pi-waiter which is a SCHED_DEADLINE task" in that order. Also fix a
> typo that follows.
> 
> Cc: Juri Lelli 
> Signed-off-by: Joel Fernandes 

Looks good to me.

Acked-by: Juri Lelli 

Thanks,

- Juri


[PATCH v2 7/9] arm,arm64,drivers: reduce scope of cap_parsing_failed

2017-02-09 Thread Juri Lelli
Reduce the scope of cap_parsing_failed (making it static in
drivers/base/arch_topology.c) by slightly changing {arm,arm64} DT
parsing code.

For arm, checking for !cap_parsing_failed before calling
normalize_cpu_capacity() is superfluous, as returning an error from
parse_cpu_capacity() (above) means cap_from_dt is set to false.

For arm64 we can simply check whether raw_capacity points to something,
which it does not if capacity parsing has failed.

Suggested-by: Morten Rasmussen 
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c   | 3 +--
 arch/arm64/kernel/topology.c | 5 +
 drivers/base/arch_topology.c | 4 ++--
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 51e9ed6439f1..5d4679a5418b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -75,7 +75,6 @@ static unsigned long *__cpu_capacity;
 
 static unsigned long middle_capacity = 1;
 static bool cap_from_dt = true;
-extern bool cap_parsing_failed;
 extern void normalize_cpu_capacity(void);
 extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
@@ -164,7 +163,7 @@ static void __init parse_dt_topology(void)
middle_capacity = ((max_capacity / 3)
>> (SCHED_CAPACITY_SHIFT-1)) + 1;
 
-   if (cap_from_dt && !cap_parsing_failed)
+   if (cap_from_dt)
normalize_cpu_capacity();
 }
 
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index f629f7524d65..deb5ebc1bdfe 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -26,7 +26,6 @@
 #include 
 #include 
 
-extern bool cap_parsing_failed;
 extern void normalize_cpu_capacity(void);
 extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
@@ -186,10 +185,8 @@ static int __init parse_dt_topology(void)
 * cluster with restricted subnodes.
 */
map = of_get_child_by_name(cn, "cpu-map");
-   if (!map) {
-   cap_parsing_failed = true;
+   if (!map)
goto out;
-   }
 
ret = parse_cluster(map, 0);
if (ret != 0)
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index c1dd430adad2..c75fa5ab732b 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -90,7 +90,7 @@ subsys_initcall(register_cpu_capacity_sysctl);
 
 u32 capacity_scale;
 u32 *raw_capacity;
-bool cap_parsing_failed;
+static bool cap_parsing_failed;
 
 void normalize_cpu_capacity(void)
 {
@@ -205,7 +205,7 @@ static int __init register_cpufreq_notifier(void)
 * until we have the necessary code to parse the cpu capacity, so
 * skip registering cpufreq notifier.
 */
-   if (!acpi_disabled || cap_parsing_failed)
+   if (!acpi_disabled || !raw_capacity)
return -EINVAL;
 
if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) {
-- 
2.10.0
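
For context on normalize_cpu_capacity(), which the hunks above call into:
it scales the raw capacity-dmips-mhz values parsed from DT so that the
biggest CPU ends up at SCHED_CAPACITY_SCALE. The sketch below is a purely
illustrative userspace rendition of that arithmetic; it assumes
SCHED_CAPACITY_SHIFT is 10, uses made-up raw values, and capacity_scale
here stands for the largest raw value seen while parsing.

#include <stdio.h>
#include <stdint.h>

#define SCHED_CAPACITY_SHIFT	10

int main(void)
{
	/* e.g. a big.LITTLE system: two LITTLE and two big CPUs */
	uint32_t raw_capacity[] = { 578, 578, 1024, 1024 };
	uint32_t capacity_scale = 0;
	int cpu, ncpus = 4;

	for (cpu = 0; cpu < ncpus; cpu++)	/* max of the raw values */
		if (raw_capacity[cpu] > capacity_scale)
			capacity_scale = raw_capacity[cpu];

	for (cpu = 0; cpu < ncpus; cpu++) {
		uint64_t capacity = ((uint64_t)raw_capacity[cpu]
				     << SCHED_CAPACITY_SHIFT) / capacity_scale;
		printf("cpu%d: raw=%u normalized=%llu\n", cpu,
		       raw_capacity[cpu], (unsigned long long)capacity);
	}
	return 0;
}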



[PATCH v2 5/9] arm64: remove wrong CONFIG_PROC_SYSCTL ifdef

2017-02-09 Thread Juri Lelli
The sysfs cpu_capacity entry for each CPU has nothing to do with
PROC_FS, nor is it in the /proc/sys path.

Remove such ifdef.

Cc: Will Deacon 
Cc: Catalin Marinas 
Reported-and-suggested-by: Sudeep Holla 
Fixes: be8f185d8af4 ('arm64: add sysfs cpu_capacity attribute')
Signed-off-by: Juri Lelli 
---
 arch/arm64/kernel/topology.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 655e65f38f31..565dd69888cc 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -41,7 +41,6 @@ static void set_capacity_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
-#ifdef CONFIG_PROC_SYSCTL
 static ssize_t cpu_capacity_show(struct device *dev,
 struct device_attribute *attr,
 char *buf)
@@ -98,7 +97,6 @@ static int register_cpu_capacity_sysctl(void)
return 0;
 }
 subsys_initcall(register_cpu_capacity_sysctl);
-#endif
 
 static u32 capacity_scale;
 static u32 *raw_capacity;
-- 
2.10.0



[PATCH v2 8/9] arm,arm64,drivers: move externs in a new header file

2017-02-09 Thread Juri Lelli
Create a new header file (include/linux/arch_topology.h) containing
declarations of interfaces used by arm, arm64 and drivers code.

Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c|  7 +--
 arch/arm64/kernel/topology.c  |  4 +---
 drivers/base/arch_topology.c  |  1 +
 include/linux/arch_topology.h | 17 +
 4 files changed, 20 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/arch_topology.h

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 5d4679a5418b..1855632105a4 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -11,6 +11,7 @@
  * for more details.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -44,10 +45,6 @@
  * updated during this sequence.
  */
 
-extern unsigned long
-arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
-extern void set_capacity_scale(unsigned int cpu, unsigned long capacity);
-
 #ifdef CONFIG_OF
 struct cpu_efficiency {
const char *compatible;
@@ -75,8 +72,6 @@ static unsigned long *__cpu_capacity;
 
 static unsigned long middle_capacity = 1;
 static bool cap_from_dt = true;
-extern void normalize_cpu_capacity(void);
-extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
 /*
  * Iterate all CPUs' descriptor in DT and compute the efficiency
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index deb5ebc1bdfe..6d2af4d9479f 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -11,6 +11,7 @@
  * for more details.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -26,9 +27,6 @@
 #include 
 #include 
 
-extern void normalize_cpu_capacity(void);
-extern int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu);
-
 static int __init get_cpu_for_node(struct device_node *node)
 {
struct device_node *cpu_node;
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index c75fa5ab732b..409a8bb613f2 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -10,6 +10,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
new file mode 100644
index ..4edae9fe8cdd
--- /dev/null
+++ b/include/linux/arch_topology.h
@@ -0,0 +1,17 @@
+/*
+ * include/linux/arch_topology.h - arch specific cpu topology information
+ */
+#ifndef _LINUX_ARCH_TOPOLOGY_H_
+#define _LINUX_ARCH_TOPOLOGY_H_
+
+void normalize_cpu_capacity(void);
+
+struct device_node;
+int parse_cpu_capacity(struct device_node *cpu_node, int cpu);
+
+struct sched_domain;
+unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
+
+void set_capacity_scale(unsigned int cpu, unsigned long capacity);
+
+#endif /* _LINUX_ARCH_TOPOLOGY_H_ */
-- 
2.10.0



[PATCH v2 9/9] arm,arm64,drivers: add a prefix to drivers arch_topology interfaces

2017-02-09 Thread Juri Lelli
Now that some functions that deal with arch topology information live
under drivers, there is a clash of naming that might create confusion.

Tidy things up by creating a drivers namespace for interfaces used by
arch code; achieve this by prepending an 'atd_' (arch topology driver)
prefix to driver interfaces.

Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c|  8 
 arch/arm64/kernel/topology.c  |  4 ++--
 drivers/base/arch_topology.c  | 20 ++--
 include/linux/arch_topology.h |  8 
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 1855632105a4..a33e0891fee2 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -110,7 +110,7 @@ static void __init parse_dt_topology(void)
continue;
}
 
-   if (parse_cpu_capacity(cn, cpu)) {
+   if (atd_parse_cpu_capacity(cn, cpu)) {
of_node_put(cn);
continue;
}
@@ -159,7 +159,7 @@ static void __init parse_dt_topology(void)
>> (SCHED_CAPACITY_SHIFT-1)) + 1;
 
if (cap_from_dt)
-   normalize_cpu_capacity();
+   atd_normalize_cpu_capacity();
 }
 
 /*
@@ -172,10 +172,10 @@ static void update_cpu_capacity(unsigned int cpu)
if (!cpu_capacity(cpu) || cap_from_dt)
return;
 
-   set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+   atd_set_capacity_scale(cpu, cpu_capacity(cpu) / middle_capacity);
 
pr_info("CPU%u: update cpu_capacity %lu\n",
-   cpu, arch_scale_cpu_capacity(NULL, cpu));
+   cpu, atd_scale_cpu_capacity(NULL, cpu));
 }
 
 #else
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 6d2af4d9479f..1f6b34b6b359 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -38,7 +38,7 @@ static int __init get_cpu_for_node(struct device_node *node)
 
for_each_possible_cpu(cpu) {
if (of_get_cpu_node(cpu, NULL) == cpu_node) {
-   parse_cpu_capacity(cpu_node, cpu);
+   atd_parse_cpu_capacity(cpu_node, cpu);
of_node_put(cpu_node);
return cpu;
}
@@ -190,7 +190,7 @@ static int __init parse_dt_topology(void)
if (ret != 0)
goto out_map;
 
-   normalize_cpu_capacity();
+   atd_normalize_cpu_capacity();
 
/*
 * Check that all cores are in the topology; the SMP code will
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 409a8bb613f2..5c20280bb746 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,12 +22,12 @@
 static DEFINE_MUTEX(cpu_scale_mutex);
 static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
 
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
+unsigned long atd_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 {
return per_cpu(cpu_scale, cpu);
 }
 
-void set_capacity_scale(unsigned int cpu, unsigned long capacity)
+void atd_set_capacity_scale(unsigned int cpu, unsigned long capacity)
 {
per_cpu(cpu_scale, cpu) = capacity;
 }
@@ -39,7 +39,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
struct cpu *cpu = container_of(dev, struct cpu, dev);
 
return sprintf(buf, "%lu\n",
-   arch_scale_cpu_capacity(NULL, cpu->dev.id));
+   atd_scale_cpu_capacity(NULL, cpu->dev.id));
 }
 
 static ssize_t cpu_capacity_store(struct device *dev,
@@ -61,7 +61,7 @@ static ssize_t cpu_capacity_store(struct device *dev,
 
mutex_lock(&cpu_scale_mutex);
for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
-   set_capacity_scale(i, new_capacity);
+   atd_set_capacity_scale(i, new_capacity);
mutex_unlock(&cpu_scale_mutex);
}
 
@@ -93,7 +93,7 @@ u32 capacity_scale;
 u32 *raw_capacity;
 static bool cap_parsing_failed;
 
-void normalize_cpu_capacity(void)
+void atd_normalize_cpu_capacity(void)
 {
u64 capacity;
int cpu;
@@ -108,14 +108,14 @@ void normalize_cpu_capacity(void)
 cpu, raw_capacity[cpu]);
capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT)
/ capacity_scale;
-   set_capacity_scale(cpu, capacity);
+   atd_set_capacity_scale(cpu, capacity);
pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
-   cpu, arch_scale_cpu_capacity(NULL, cpu));
+   cpu, atd_scale_cpu_capacity(NULL, cpu));
}
mutex_unlock(&cpu_scale_mutex);
 }
 
-int __init parse_cpu_capacity(struct

[PATCH v2 6/9] arm, arm64: factorize common cpu capacity default code

2017-02-09 Thread Juri Lelli
arm and arm64 share a lot of code related to parsing CPU capacity
information from DT, using that information for appropriate scaling and
exposing a sysfs interface for changing such values at runtime.

Factorize such code in a common place (driver/base/arch_topology.c) in
preparation for further additions.

Suggested-by: Will Deacon 
Suggested-by: Mark Rutland 
Suggested-by: Catalin Marinas 
Cc: Russell King 
Cc: Catalin Marinas 
Cc: Will Deacon 
Cc: Greg Kroah-Hartman 
Signed-off-by: Juri Lelli 
---

Changes from v1:
 - keep the original GPLv2 header
---
 arch/arm/Kconfig |   1 +
 arch/arm/kernel/topology.c   | 213 ++
 arch/arm64/Kconfig   |   1 +
 arch/arm64/kernel/topology.c | 219 +--
 drivers/base/Kconfig |   8 ++
 drivers/base/Makefile|   1 +
 drivers/base/arch_topology.c | 237 +++
 7 files changed, 257 insertions(+), 423 deletions(-)
 create mode 100644 drivers/base/arch_topology.c

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 186c4c214e0a..6dd5736c1e3c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -19,6 +19,7 @@ config ARM
select EDAC_SUPPORT
select EDAC_ATOMIC_SCRUB
select GENERIC_ALLOCATOR
+   select GENERIC_ARCH_TOPOLOGY if ARM_CPU_TOPOLOGY
select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_EARLY_IOREMAP
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index c760a321935b..51e9ed6439f1 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -43,75 +43,10 @@
  * to run the rebalance_domains for all idle cores and the cpu_capacity can be
  * updated during this sequence.
  */
-static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
-static DEFINE_MUTEX(cpu_scale_mutex);
 
-unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
-{
-   return per_cpu(cpu_scale, cpu);
-}
-
-static void set_capacity_scale(unsigned int cpu, unsigned long capacity)
-{
-   per_cpu(cpu_scale, cpu) = capacity;
-}
-
-static ssize_t cpu_capacity_show(struct device *dev,
-struct device_attribute *attr,
-char *buf)
-{
-   struct cpu *cpu = container_of(dev, struct cpu, dev);
-
-   return sprintf(buf, "%lu\n",
-   arch_scale_cpu_capacity(NULL, cpu->dev.id));
-}
-
-static ssize_t cpu_capacity_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf,
- size_t count)
-{
-   struct cpu *cpu = container_of(dev, struct cpu, dev);
-   int this_cpu = cpu->dev.id, i;
-   unsigned long new_capacity;
-   ssize_t ret;
-
-   if (count) {
-   ret = kstrtoul(buf, 0, &new_capacity);
-   if (ret)
-   return ret;
-   if (new_capacity > SCHED_CAPACITY_SCALE)
-   return -EINVAL;
-
-   mutex_lock(&cpu_scale_mutex);
-   for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
-   set_capacity_scale(i, new_capacity);
-   mutex_unlock(&cpu_scale_mutex);
-   }
-
-   return count;
-}
-
-static DEVICE_ATTR_RW(cpu_capacity);
-
-static int register_cpu_capacity_sysctl(void)
-{
-   int i;
-   struct device *cpu;
-
-   for_each_possible_cpu(i) {
-   cpu = get_cpu_device(i);
-   if (!cpu) {
-   pr_err("%s: too early to get CPU%d device!\n",
-  __func__, i);
-   continue;
-   }
-   device_create_file(cpu, &dev_attr_cpu_capacity);
-   }
-
-   return 0;
-}
-subsys_initcall(register_cpu_capacity_sysctl);
+extern unsigned long
+arch_scale_cpu_capacity(struct sched_domain *sd, int cpu);
+extern void set_capacity_scale(unsigned int cpu, unsigned long capacity);
 
 #ifdef CONFIG_OF
 struct cpu_efficiency {
@@ -140,145 +75,9 @@ static unsigned long *__cpu_capacity;
 
 static unsigned long middle_capacity = 1;
 static bool cap_from_dt = true;
-static u32 *raw_capacity;
-static bool cap_parsing_failed;
-static u32 capacity_scale;
-
-static int __init parse_cpu_capacity(struct device_node *cpu_node, int cpu)
-{
-   int ret = 1;
-   u32 cpu_capacity;
-
-   if (cap_parsing_failed)
-   return !ret;
-
-   ret = of_property_read_u32(cpu_node,
-  "capacity-dmips-mhz",
-  &cpu_capacity);
-   if (!ret) {
-   if (!raw_capacity) {
-   raw_capacity = kcalloc(num_possible_cpus(),
-   

[PATCH v2 4/9] arm: remove wrong CONFIG_PROC_SYSCTL ifdef

2017-02-09 Thread Juri Lelli
The sysfs cpu_capacity entry for each CPU has nothing to do with
PROC_FS, nor is it under the /proc/sys path.

Remove such ifdef.

Cc: Russell King 
Reported-and-suggested-by: Sudeep Holla 
Fixes: 7e5930aaef5d ('ARM: 8622/3: add sysfs cpu_capacity attribute')
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index b439f7fff86b..c760a321935b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -56,7 +56,6 @@ static void set_capacity_scale(unsigned int cpu, unsigned 
long capacity)
per_cpu(cpu_scale, cpu) = capacity;
 }
 
-#ifdef CONFIG_PROC_SYSCTL
 static ssize_t cpu_capacity_show(struct device *dev,
 struct device_attribute *attr,
 char *buf)
@@ -113,7 +112,6 @@ static int register_cpu_capacity_sysctl(void)
return 0;
 }
 subsys_initcall(register_cpu_capacity_sysctl);
-#endif
 
 #ifdef CONFIG_OF
 struct cpu_efficiency {
-- 
2.10.0



[PATCH v2 2/9] Documentation/ABI: add information about cpu_capacity

2017-02-09 Thread Juri Lelli
/sys/devices/system/cpu/cpu#/cpu_capacity describes information about
CPU heterogeneity (see Documentation/devicetree/bindings/arm/
cpu-capacity.txt).

Add such description.
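
For illustration, a minimal user-space sketch (plain C, not part of the
patch) that reads the new attribute for cpu0:

#include <stdio.h>

/* Read the capacity of cpu0 via the sysfs attribute documented above. */
int main(void)
{
	unsigned long capacity;
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/cpu_capacity", "r");

	if (!f) {
		perror("cpu_capacity");
		return 1;
	}
	if (fscanf(f, "%lu", &capacity) == 1)
		printf("cpu0 capacity: %lu\n", capacity);
	fclose(f);
	return 0;
}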

Signed-off-by: Juri Lelli 
---
 Documentation/ABI/testing/sysfs-devices-system-cpu | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu 
b/Documentation/ABI/testing/sysfs-devices-system-cpu
index 2a4a423d08e0..f3d5817c4ef0 100644
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -366,3 +366,10 @@ Contact:   Linux ARM Kernel Mailing list 

 Description:   AArch64 CPU registers
'identification' directory exposes the CPU ID registers for
 identifying model and revision of the CPU.
+
+What:  /sys/devices/system/cpu/cpu#/cpu_capacity
+Date:  December 2016
+Contact:   Linux kernel mailing list 
+Description:   information about CPUs heterogeneity.
+
+   cpu_capacity: capacity of cpu#.
-- 
2.10.0



[PATCH v2 0/9] Fix issues and factorize arm/arm64 capacity information code

2017-02-09 Thread Juri Lelli
Hi,

arm and arm64 topology.c share a lot of code related to parsing of capacity
information. This is v2 of a solution [1] (based on Will's, Catalin's and
Mark's off-line suggestions) to move such common code to a single place:
drivers/base/arch_topology.c (by creating such a file and conditionally compiling
it for arm and arm64 only).

First 5 patches are actually fixes for the current code.

Patch 6 is the actual refactoring.

Patch 7 removes one of the extern symbols by slightly changing the now common
code.

Patch 8 removes the remaining externs (as required by Russell during v1 review)
by creating a new header file include/linux/arch_topology.h and including that
from arm, arm64 and drivers.

The last patch addresses Dietmar's comments on v1 and adds an 'atd_' prefix to
the interfaces exported by the drivers code and used by arch code (and
potentially others in the future).

Changes from v1:

 - rebase on top of 4.10-rc7
 - fix licence issue as pointed out by Russell
 - propose a solution for removing all remaining externs
 - addressed Dietmar's comments regarding better namespaces

The set is based on top of linux/master (4.10-rc7 d5adbfcd5f7b) and it is also
available from:

 git://linux-arm.org/linux-jl.git upstream/default_caps_factorize-v2

Best,

- Juri

[1] v1 - https://marc.info/?l=linux-kernel&m=148483680119355&w=2

Juri Lelli (9):
  Documentation: arm: fix wrong reference number in DT definition
  Documentation/ABI: add information about cpu_capacity
  arm: fix return value of parse_cpu_capacity
  arm: remove wrong CONFIG_PROC_SYSCTL ifdef
  arm64: remove wrong CONFIG_PROC_SYSCTL ifdef
  arm, arm64: factorize common cpu capacity default code
  arm,arm64,drivers: reduce scope of cap_parsing_failed
  arm,arm64,drivers: move externs in a new header file
  arm,arm64,drivers: add a prefix to drivers arch_topology interfaces

 Documentation/ABI/testing/sysfs-devices-system-cpu |   7 +
 Documentation/devicetree/bindings/arm/cpus.txt |   4 +-
 arch/arm/Kconfig   |   1 +
 arch/arm/kernel/topology.c | 221 +--
 arch/arm64/Kconfig |   1 +
 arch/arm64/kernel/topology.c   | 228 +---
 drivers/base/Kconfig   |   8 +
 drivers/base/Makefile  |   1 +
 drivers/base/arch_topology.c   | 238 +
 include/linux/arch_topology.h  |  17 ++
 10 files changed, 285 insertions(+), 441 deletions(-)
 create mode 100644 drivers/base/arch_topology.c
 create mode 100644 include/linux/arch_topology.h

-- 
2.10.0



[PATCH v2 1/9] Documentation: arm: fix wrong reference number in DT definition

2017-02-09 Thread Juri Lelli
The reference to the cpu capacity binding uses the wrong number. Fix it.

Reported-by: Lorenzo Pieralisi 
Signed-off-by: Juri Lelli 
Acked-by: Rob Herring 
---
 Documentation/devicetree/bindings/arm/cpus.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/cpus.txt 
b/Documentation/devicetree/bindings/arm/cpus.txt
index a1bcfeed5f24..c27376a27a92 100644
--- a/Documentation/devicetree/bindings/arm/cpus.txt
+++ b/Documentation/devicetree/bindings/arm/cpus.txt
@@ -246,7 +246,7 @@ nodes to be present and contain the properties described 
below.
Usage: Optional
Value type: 
Definition:
-   # u32 value representing CPU capacity [3] in
+   # u32 value representing CPU capacity [4] in
  DMIPS/MHz, relative to highest capacity-dmips-mhz
  in the system.
 
@@ -473,5 +473,5 @@ cpus {
 [2] arm/msm/qcom,kpss-acc.txt
 [3] ARM Linux kernel documentation - idle states bindings
 Documentation/devicetree/bindings/arm/idle-states.txt
-[3] ARM Linux kernel documentation - cpu capacity bindings
+[4] ARM Linux kernel documentation - cpu capacity bindings
 Documentation/devicetree/bindings/arm/cpu-capacity.txt
-- 
2.10.0



[PATCH v2 3/9] arm: fix return value of parse_cpu_capacity

2017-02-09 Thread Juri Lelli
parse_cpu_capacity() has to return 0 on failure, but it currently returns
1 instead if the raw_capacity kcalloc fails.

Fix it by removing the negation of the return value.

Cc: Russell King 
Reported-by: Morten Rasmussen 
Fixes: 06073ee26775 ('ARM: 8621/3: parse cpu capacity-dmips-mhz from DT')
Signed-off-by: Juri Lelli 
---
 arch/arm/kernel/topology.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index ebf47d91b804..b439f7fff86b 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -165,7 +165,7 @@ static int __init parse_cpu_capacity(struct device_node 
*cpu_node, int cpu)
if (!raw_capacity) {
pr_err("cpu_capacity: failed to allocate memory 
for raw capacities\n");
cap_parsing_failed = true;
-   return !ret;
+   return ret;
}
}
capacity_scale = max(cpu_capacity, capacity_scale);
-- 
2.10.0



Re: [RFC v4 0/6] CPU reclaiming for SCHED_DEADLINE

2017-01-11 Thread Juri Lelli
Hi,

On 04/01/17 19:30, Luca Abeni wrote:
> 2017-01-04 19:00 GMT+01:00, Daniel Bristot de Oliveira :
> [...]
> > Some tasks start to use more CPU time, while others seems to use less
> > CPU than it was reserved for them. See the task 14926, it is using
> > only 23.8 % of the CPU, which is less than its 10/30 reservation.
> 
>  What happened here is that some runqueues have an active utilisation
>  larger than 0.95. So, GRUB is decreasing the amount of time received by
>  the tasks on those runqueues to consume less than 95%... This is the
>  reason for the effect you noticed below:
> >>>
> >>> I see. But, AFAIK, the Linux's sched deadline measures the load
> >>> globally, not locally. So, it is not a problem having a load > than 95%
> >>> in the local queue if the global queue is < 95%.
> >>>
> >>> Am I missing something?
> >>
> >> The version of GRUB reclaiming implemented in my patches tracks a
> >> per-runqueue "active utilization", and uses it for reclaiming.
> >
> > I _think_ that this might be (one of) the source(s) of the problem...
> I agree that this can cause some problems, but I am not sure if it
> justifies the huge difference in utilisations you observed
> 
> > Just exercising...
> >
> > For example, with my taskset, with a hypothetical perfect balance of the
> > whole runqueue, one possible scenario is:
> >
> >    CPU       0   1   2   3
> >    # TASKS   3   3   3   2
> >
> > In this case, CPUs 0 1 2 are with 100% of local utilization. Thus, the
> > current task on these CPUs will have their runtime decreased by GRUB.
> > Meanwhile, the luck tasks in the CPU 3 would use an additional time that
> > they "globally" do not have - because the system, globally, has a load
> > higher than the 66.6...% of the local runqueue. Actually, part of the
> > time decreased from tasks on [0-2] are being used by the tasks on 3,
> > until the next migration of any task, which will change the luck
> > tasks... but without any guaranty that all tasks will be the luck one on
> > every activation, causing the problem.
> >
> > Does it make sense?
> 
> Yes; but my impression is that gEDF will migrate tasks so that the
> distribution of the reclaimed CPU bandwidth is almost uniform...
> Instead, you saw huge differences in the utilisations (and I do not
> think that "compressing" the utilisations from 100% to 95% can
> decrease the utilisation of a task from 33% to 25% / 26%... :)
>

I tried to replicate Daniel's experiment, but I don't see such a skewed
allocation. The tasks get a reasonably uniform bandwidth and the trace
looks fairly good as well (all processes get to run on the different
processors at some point).

> I suspect there is something more going on here (might be some bug in
> one of my patches). I am trying to better understand what happened.
> 

However, playing with this a bit further, I found out one thing that
looks counter-intuitive (at least to me :).

Simplifying Daniel's example, let's say that we have one 10/30 task
running on a CPU with a 500/1000 global limit. Applying grub_reclaim()
formula we have:

 delta_exec = delta * (0.5 + 0.333) = delta * 0.833

Which in practice means that 1ms of real delta (at 1000HZ) corresponds
to 0.833ms of virtual delta. Considering this, a 10ms (over 30ms)
reservation gets "extended" to ~12ms (over 30ms), that is to say the
task consumes 0.4 of the CPU's bandwidth. top seems to back what I'm
saying, but am I still talking nonsense? :)

I was expecting that the task could consume 0.5 worth of bandwidth with
the given global limit. Is the current behaviour intended?

If we want to change this behaviour maybe something like the following
might work?

 delta_exec = (delta * to_ratio((1ULL << 20) - rq->dl.non_deadline_bw,
rq->dl.running_bw)) >> 20

The idea would be to normalize running_bw over the available dl_bw.

Thoughts?
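
To put numbers on this, here is a minimal user-space sketch (plain C, not
kernel code; running_bw and non_dl_bw are just stand-ins for the per-rq
quantities discussed above) comparing the current approximation with the
normalized variant for the 10/30 task under the 500/1000 global limit:

#include <stdio.h>

int main(void)
{
	const double running_bw = 10.0 / 30.0;            /* Uact of the single 10/30 task */
	const double non_dl_bw  = 1.0 - 500.0 / 1000.0;   /* bandwidth not available to -deadline */
	const double runtime_ms = 10.0, period_ms = 30.0;

	/* current approximation: delta_exec = delta * (non_dl_bw + Uact) */
	double scale_cur  = non_dl_bw + running_bw;           /* 0.833 */
	double real_cur   = runtime_ms / scale_cur;           /* ~12ms of wall time per period */

	/* normalized variant: delta_exec = delta * Uact / (1 - non_dl_bw) */
	double scale_norm = running_bw / (1.0 - non_dl_bw);   /* 0.666 */
	double real_norm  = runtime_ms / scale_norm;          /* 15ms of wall time per period */

	printf("current:    scale=%.3f -> %.1fms over %.0fms = %.2f of the CPU\n",
	       scale_cur, real_cur, period_ms, real_cur / period_ms);
	printf("normalized: scale=%.3f -> %.1fms over %.0fms = %.2f of the CPU\n",
	       scale_norm, real_norm, period_ms, real_norm / period_ms);
	return 0;
}

With these numbers the current scaling gives 0.4 of the CPU, while the
normalized one gives the 0.5 I was expecting.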

Best,

- Juri


Re: [RFC v4 0/6] CPU reclaiming for SCHED_DEADLINE

2017-01-11 Thread Juri Lelli
On 11/01/17 13:39, Luca Abeni wrote:
> Hi Juri,
> (I reply from my new email address)
> 
> On Wed, 11 Jan 2017 12:19:51 +
> Juri Lelli  wrote:
> [...]
> > > > For example, with my taskset, with a hypothetical perfect balance
> > > > of the whole runqueue, one possible scenario is:
> > > >
> > > >    CPU       0   1   2   3
> > > >    # TASKS   3   3   3   2
> > > >
> > > > In this case, CPUs 0 1 2 are with 100% of local utilization.
> > > > Thus, the current task on these CPUs will have their runtime
> > > > decreased by GRUB. Meanwhile, the luck tasks in the CPU 3 would
> > > > use an additional time that they "globally" do not have - because
> > > > the system, globally, has a load higher than the 66.6...% of the
> > > > local runqueue. Actually, part of the time decreased from tasks
> > > > on [0-2] are being used by the tasks on 3, until the next
> > > > migration of any task, which will change the luck tasks... but
> > > > without any guaranty that all tasks will be the luck one on every
> > > > activation, causing the problem.
> > > >
> > > > Does it make sense?  
> > > 
> > > Yes; but my impression is that gEDF will migrate tasks so that the
> > > distribution of the reclaimed CPU bandwidth is almost uniform...
> > > Instead, you saw huge differences in the utilisations (and I do not
> > > think that "compressing" the utilisations from 100% to 95% can
> > > decrease the utilisation of a task from 33% to 25% / 26%... :)
> > >  
> > 
> > I tried to replicate Daniel's experiment, but I don't see such a
> > skewed allocation. They get a reasonably uniform bandwidth and the
> > trace looks fairly good as well (all processes get to run on the
> > different processors at some time).
> 
> With some effort, I replicated the issue noticed by Daniel... I think
> it also depends on the CPU speed (and on good or bad luck :), but the
> "unfair" CPU allocation can actually happen.

Yeah, the actual allocation varies in general. I guess the question is: do
we care? We currently don't load balance based on utilizations; only
dynamic deadlines matter.

> I am working on a fix (based on the m-grub modifications proposed at
> last April's SAC - in my original patchset, I over-simplified the
> algorithm).
> 

OK, will have a look to next version.

> 
> > > I suspect there is something more going on here (might be some bug
> > > in one of my patches). I am trying to better understand what
> > > happened. 
> > 
> > However, playing with this a bit further, I found out one thing that
> > looks counter-intuitive (at least to me :).
> > 
> > Simplifying Daniel's example, let's say that we have one 10/30 task
> > running on a CPU with a 500/1000 global limit. Applying grub_reclaim()
> > formula we have:
> > 
> >  delta_exec = delta * (0.5 + 0.333) = delta * 0.833
> > 
> > Which in practice means that 1ms of real delta (at 1000HZ) corresponds
> > to 0.833ms of virtual delta. Considering this, a 10ms (over 30ms)
> > reservation gets "extended" to ~12ms (over 30ms), that is to say the
> > task consumes 0.4 of the CPU's bandwidth. top seems to back what I'm
> > saying, but am I still talking nonsense? :)
> 
> You are right; my "Do not reclaim the whole CPU bandwidth" patch is an
> approximation... I hoped that this approximation could be more precise
> than what it really is.
> I used the "Uact + unreclaimable utilization" equation to avoid
> divisions in grub_reclaim(), but the equation should really be "Uact /
> reclaimable utilization"... So, in your example it is
>   delta * 0.333 / 0.5 = delta * 0.666
> that results in 15ms over 30ms, as expected.
> 
> I'll fix that patch for the next submission.
> 

Right, OK.

> > I was expecting that the task could consume 0.5 worth of bandwidth
> > with the given global limit. Is the current behaviour intended?
> > 
> > If we want to change this behaviour maybe something like the following
> > might work?
> > 
> >  delta_exec = (delta * to_ratio((1ULL << 20) - rq->dl.non_deadline_bw,
> > rq->dl.running_bw)) >> 20
> My current patch does
>   (delta * rq->dl.running_bw * rq->dl.deadline_bw_inv) >> 20 >> 8;
> where rq->dl.deadline_bw_inv has been set to
>   to_ratio(global_rt_runtime(), global_rt_period()) >> 12;
>   
> This seems to work fine, and should introduce less overhead than
> to_ratio().
> 

Sure, we don't want to do divisions if we can avoid them. Why the
intermediate right shifts, though?
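
For reference, here is a minimal user-space sketch of the fixed-point
arithmetic being discussed (to_ratio() approximated as a Q20 ratio,
(runtime << 20) / period; running_bw and deadline_bw_inv mirror the names
used above, not necessarily the actual kernel code), replaying the 10/30
task with the 500/1000 limit:

#include <stdio.h>
#include <stdint.h>

/* Q20 approximation of the ratio runtime/period. */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << 20) / period;
}

int main(void)
{
	const uint64_t rt_runtime = 500000, rt_period = 1000000; /* 500/1000 global limit */
	const uint64_t delta = 1000000;                          /* 1ms of real time, in ns */

	/* Uact of the single 10/30 task, in Q20: ~0.333 * 2^20 */
	uint64_t running_bw = to_ratio(30, 10);

	/* inverse of the global DL bandwidth: period/runtime in Q20, kept as Q8 (2.0 -> 512) */
	uint64_t deadline_bw_inv = to_ratio(rt_runtime, rt_period) >> 12;

	/* delta * Uact(Q20) * inv(Q8), scaled back with >> 20 and >> 8 */
	uint64_t delta_exec = (delta * running_bw * deadline_bw_inv) >> 20 >> 8;

	printf("running_bw=%llu deadline_bw_inv=%llu -> delta_exec=%lluns (~0.666 * delta)\n",
	       (unsigned long long)running_bw,
	       (unsigned long long)deadline_bw_inv,
	       (unsigned long long)delta_exec);
	return 0;
}

In this sketch the >> 12 just turns the Q20 inverse into a Q8 value, so the
final >> 20 >> 8 brings the product back to plain nanoseconds.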

Thanks,

- Juri


Re: [RFC v4 2/6] sched/deadline: improve the tracking of active utilization

2017-01-11 Thread Juri Lelli
Hi,

On 30/12/16 12:33, Luca Abeni wrote:
> From: Luca Abeni 
> 
> This patch implements a more theoretically sound algorithm for
> tracking active utilization: instead of decreasing it when a
> task blocks, use a timer (the "inactive timer", named after the
> "Inactive" task state of the GRUB algorithm) to decrease the
> active utilization at the so called "0-lag time".
> 
> Signed-off-by: Luca Abeni 
> ---

[...]

> +static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
> +{
> + struct sched_dl_entity *dl_se = container_of(timer,
> +  struct sched_dl_entity,
> +  inactive_timer);
> + struct task_struct *p = dl_task_of(dl_se);
> + struct rq_flags rf;
> + struct rq *rq;
> +
> + rq = task_rq_lock(p, &rf);
> +
> + if (!dl_task(p) || p->state == TASK_DEAD) {
> + if (p->state == TASK_DEAD && dl_se->dl_non_contending)
> + sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
> +
> + __dl_clear_params(p);
> +
> + goto unlock;
> + }
> + if (dl_se->dl_non_contending == 0)
> + goto unlock;
> +
> + sched_clock_tick();
> + update_rq_clock(rq);
> +
> + sub_running_bw(dl_se, &rq->dl);
> + dl_se->dl_non_contending = 0;
> +unlock:
> + task_rq_unlock(rq, p, &rf);
> + put_task_struct(p);
> +
> + return HRTIMER_NORESTART;
> +}
> +

[...]

>  static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
> @@ -934,7 +1014,28 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
>   if (flags & ENQUEUE_WAKEUP) {
>   struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
>  
> - add_running_bw(dl_se, dl_rq);
> + if (dl_se->dl_non_contending) {
> + /*
> +  * If the timer handler is currently running and the
> +  * timer cannot be cancelled, inactive_task_timer()
> +  * will see that dl_not_contending is not set, and
> +  * will do nothing, so we are still safe.

Here and below: the timer callback will actually put_task_struct() (see
above) if dl_not_contending is not set; that's why we don't need to do
that if try_to_cancel returned -1 (or 0). Saying "will do nothing" is a
bit misleading, IMHO.

> +  */
> + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1)
> + put_task_struct(dl_task_of(dl_se));
> + WARN_ON(dl_task_of(dl_se)->nr_cpus_allowed > 1);
> + dl_se->dl_non_contending = 0;
> + } else {

[...]

> @@ -1097,6 +1198,22 @@ select_task_rq_dl(struct task_struct *p, int cpu, int 
> sd_flag, int flags)
>   }
>   rcu_read_unlock();
>  
> + rq = task_rq(p);
> + raw_spin_lock(&rq->lock);
> + if (p->dl.dl_non_contending) {
> + sub_running_bw(&p->dl, &rq->dl);
> + p->dl.dl_non_contending = 0;
> + /*
> +  * If the timer handler is currently running and the
> +  * timer cannot be cancelled, inactive_task_timer()
> +  * will see that dl_not_contending is not set, and
> +  * will do nothing, so we are still safe.
> +  */
> + if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
> + put_task_struct(p);
> + }
> + raw_spin_unlock(&rq->lock);
> +
>  out:
>   return cpu;
>  }

We already raised the issue of having to lock the rq in
select_task_rq_dl() while reviewing the previous version; did you give
any thought to possible solutions? Maybe simply bail out (we'd need to
see how frequent this is, however) or use an inner lock?

Best,

- Juri

