[PATCH v3] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-20 Thread Rik van Riel
The try_to_wake_up function has an optimization where it can queue
a task for wakeup on its previous CPU, if the task is still in the
middle of going to sleep inside schedule().

Once schedule() re-enables IRQs, the task will be woken up with an
IPI, and placed back on the runqueue.

If we have such a wakeup pending, there is no need to search other
CPUs for runnable tasks. Just skip (or bail out early from) newidle
balancing, and run the just woken up task.
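
As a purely illustrative sketch (a toy userspace model under my own
simplified assumptions, not the kernel code), the new early exit
boils down to:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for the fields newidle balancing looks at. */
struct toy_rq {
	int nr_running;
	bool ttwu_pending;	/* wakeup already queued by try_to_wake_up() */
};

/* Returns the number of tasks pulled; 0 means "did not pull anything". */
static int toy_newidle_balance(struct toy_rq *rq)
{
	if (rq->ttwu_pending)
		return 0;	/* a task will be enqueued as soon as IRQs are on */

	/* ... the expensive scan of other CPUs would go here ... */
	return 0;
}

int main(void)
{
	struct toy_rq rq = { .nr_running = 0, .ttwu_pending = true };

	printf("pulled %d tasks\n", toy_newidle_balance(&rq));
	return 0;
}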

For a memcache like workload test, this reduces total CPU use by
about 2%, proportionally split between user and system time,
and p99 and p95 application response time by 10% on average.
The schedstats run_delay number shows a similar improvement.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69680158963f..fd80175c3b3e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10594,6 +10594,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
u64 curr_cost = 0;
 
update_misfit_status(NULL, this_rq);
+
+   /*
+* There is a task waiting to run. No need to search for one.
+* Return 0; the task will be enqueued when switching to idle.
+*/
+   if (this_rq->ttwu_pending)
+   return 0;
+
/*
 * We must set idle_stamp _before_ calling idle_balance(), such that we
 * measure the duration of idle_balance() as idle time.
@@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 * Stop searching for tasks to pull if there are
 * now runnable tasks on this rq.
 */
-   if (pulled_task || this_rq->nr_running > 0)
+   if (pulled_task || this_rq->nr_running > 0 ||
+   this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
@@ -10688,7 +10697,12 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
-   if (pulled_task)
+   /*
+* If we are no longer idle, do not let the time spent here pull
+* down this_rq->avg_idle. That could lead to newidle_balance not
+* doing enough work, and the CPU actually going idle.
+*/
+   if (pulled_task || this_rq->ttwu_pending)
this_rq->idle_stamp = 0;
 
rq_repin_lock(this_rq, rf);
-- 
2.25.4




Re: [PATCH v2] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-20 Thread Rik van Riel
On Tue, 2021-04-20 at 11:04 +0200, Vincent Guittot wrote:
> On Mon, 19 Apr 2021 at 18:51, Rik van Riel  wrote:
> > 
> > @@ -10688,7 +10697,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> > if (this_rq->nr_running != this_rq->cfs.h_nr_running)
> > pulled_task = -1;
> > 
> > -   if (pulled_task)
> > +   if (pulled_task || this_rq->ttwu_pending)
> 
> This needs at least a comment to explain why we must clear
> this_rq->idle_stamp when this_rq->ttwu_pending is set whereas it is
> also done during sched_ttwu_pending()
> 
> > this_rq->idle_stamp = 0;

I spent some time staring at sched_ttwu_pending and
the functions it calls, but I can't seem to spot
where it clears rq->idle_stamp, except inside
ttwu_do_wakeup where it will end up adding a
non-idle period into the rq->avg_idle, which seems
wrong.

If we are actually idle, and get woken up with a
ttwu_queue task, we do not come through newidle_balance,
and we end up counting the idle time into the avg_idle
number.

However, if a task is woken up while the CPU is
in newidle_balance, because prev != idle, we should
not count that period towards rq->avg_idle, for
the same reason we do so when we pulled a task.
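
To make that concrete, here is a rough userspace model (my own
simplification, not the scheduler code) of how a sample measured
from idle_stamp would feed the avg_idle estimate, and why clearing
idle_stamp keeps a period in which we were never actually idle out
of that average:

#include <stdint.h>
#include <stdio.h>

/* Simple exponential moving average, similar in spirit to update_avg(). */
static void toy_update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff / 8;
}

int main(void)
{
	uint64_t avg_idle = 500000;		/* ns: current estimate */
	uint64_t idle_stamp = 1000000;		/* ns: set entering newidle balance */
	uint64_t wakeup_time = 1400000;		/* ns: ttwu_pending arrived here */
	int actually_went_idle = 0;		/* prev != idle, so no real idle time */

	if (idle_stamp && actually_went_idle)
		toy_update_avg(&avg_idle, wakeup_time - idle_stamp);
	/* idle_stamp cleared instead: the 400us busy period is not counted */

	printf("avg_idle = %llu ns\n", (unsigned long long)avg_idle);
	return 0;
}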

I'll add a comment in v3 explaining why idle_stamp
needs to be 0.

-- 
All Rights Reversed.




[PATCH v2] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-19 Thread Rik van Riel
The try_to_wake_up function has an optimization where it can queue
a task for wakeup on its previous CPU, if the task is still in the
middle of going to sleep inside schedule().

Once schedule() re-enables IRQs, the task will be woken up with an
IPI, and placed back on the runqueue.

If we have such a wakeup pending, there is no need to search other
CPUs for runnable tasks. Just skip (or bail out early from) newidle
balancing, and run the just woken up task.

For a memcache like workload test, this reduces total CPU use by
about 2%, proportionally split between user and system time,
and p99 and p95 application response time by 2-3% on average.
The schedstats run_delay number shows a similar improvement.

Signed-off-by: Rik van Riel 
---
v2:
 - fix !SMP build error and prev-not-CFS case by moving check into 
newidle_balance
 - fix formatting of if condition
 - audit newidle_balance return value use to make sure we get that right
 - reset idle_stamp when breaking out of the loop due to ->ttwu_pending

 kernel/sched/fair.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69680158963f..5e26f013e182 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10594,6 +10594,14 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
u64 curr_cost = 0;
 
update_misfit_status(NULL, this_rq);
+
+   /*
+* There is a task waiting to run. No need to search for one.
+* Return 0; the task will be enqueued when switching to idle.
+*/
+   if (this_rq->ttwu_pending)
+   return 0;
+
/*
 * We must set idle_stamp _before_ calling idle_balance(), such that we
 * measure the duration of idle_balance() as idle time.
@@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 * Stop searching for tasks to pull if there are
 * now runnable tasks on this rq.
 */
-   if (pulled_task || this_rq->nr_running > 0)
+   if (pulled_task || this_rq->nr_running > 0 ||
+   this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
@@ -10688,7 +10697,7 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
pulled_task = -1;
 
-   if (pulled_task)
+   if (pulled_task || this_rq->ttwu_pending)
this_rq->idle_stamp = 0;
 
rq_repin_lock(this_rq, rf);
-- 
2.25.4




Re: [PATCH] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-19 Thread Rik van Riel
On Mon, 2021-04-19 at 12:22 +0100, Valentin Schneider wrote:
> On 18/04/21 22:17, Rik van Riel wrote:
> > @@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
> >* Stop searching for tasks to pull if there are
> >* now runnable tasks on this rq.
> >*/
> > -   if (pulled_task || this_rq->nr_running > 0)
> > +   if (pulled_task || this_rq->nr_running > 0 ||
> > +   this_rq->ttwu_pending)
> >   break;
> 
> I thought newidle_balance() would already handle this by checking
> idle_cpu(), but that can't work due to rq->curr never being rq->idle
> here
> (we're trying very hard to prevent this!).
> 
> Would there be any point in bunching up these two checks from
> idle_cpu()
> into an inline helper and reusing it here?

I'm not sure; all the sched classes seem to have their own
magic in the balance functions, and there are enough special
situations out there that we might only be able to consolidate
a few callsites, not the rest.
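
Just to show what I understood the suggestion to be, a bunched-up
helper might look roughly like the sketch below (hypothetical, written
against a stubbed struct rq rather than the real one):

#include <stdbool.h>
#include <stdio.h>

/* Stub with only the fields the check needs; not the real struct rq. */
struct rq_stub {
	unsigned int nr_running;
	bool ttwu_pending;
};

/* Hypothetical helper bundling the "is there already work here?" checks. */
static inline bool rq_has_work_pending(const struct rq_stub *rq)
{
	return rq->nr_running > 0 || rq->ttwu_pending;
}

int main(void)
{
	struct rq_stub rq = { .nr_running = 0, .ttwu_pending = true };

	printf("%s\n", rq_has_work_pending(&rq) ? "stop searching" : "keep searching");
	return 0;
}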

Also, it turns out newidle_balance needs a different return
value depending on whether we have ->ttwu_pending, a pulled
task, or a queued realtime task...

I'll send v2 without any consolidation here, since I cannot
figure out a way to make things better here :)

-- 
All Rights Reversed.




[PATCH] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-18 Thread Rik van Riel
The try_to_wake_up function has an optimization where it can queue
a task for wakeup on its previous CPU, if the task is still in the
middle of going to sleep inside schedule().

Once schedule() re-enables IRQs, the task will be woken up with an
IPI, and placed back on the runqueue.

If we have such a wakeup pending, there is no need to search other
CPUs for runnable tasks. Just skip (or bail out early from) newidle
balancing, and run the just woken up task.

For a memcache like workload test, this reduces total CPU use by
about 2%, proportionally split between user and system time,
and p99 and p95 application response time by 2-3% on average.
The schedstats run_delay number shows a similar improvement.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69680158963f..19a92c48939f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7163,6 +7163,14 @@ done: __maybe_unused;
if (!rf)
return NULL;
 
+   /*
+* We have a woken up task pending here. No need to search for ones
+* elsewhere. This task will be enqueued the moment we unblock irqs
+* upon exiting the scheduler.
+*/
+   if (rq->ttwu_pending)
+   return NULL;
+
new_tasks = newidle_balance(rq, rf);
 
/*
@@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 * Stop searching for tasks to pull if there are
 * now runnable tasks on this rq.
 */
-   if (pulled_task || this_rq->nr_running > 0)
+   if (pulled_task || this_rq->nr_running > 0 ||
+   this_rq->ttwu_pending)
break;
}
rcu_read_unlock();
-- 
2.25.4




Re: [PATCH 2/2] sched/fair: Relax task_hot() for misfit tasks

2021-04-15 Thread Rik van Riel
On Thu, 2021-04-15 at 18:58 +0100, Valentin Schneider wrote:
> Consider the following topology:
> 
> Long story short, preempted misfit tasks are affected by task_hot(),
> while
> currently running misfit tasks are intentionally preempted by the
> stopper
> task to migrate them over to a higher-capacity CPU.
> 
> Align detach_tasks() with the active-balance logic and let it pick a
> cache-hot misfit task when the destination CPU can provide a capacity
> uplift.
> 
> Signed-off-by: Valentin Schneider 

Reviewed-by: Rik van Riel 


This patch looks good, but...

> @@ -7672,6 +7698,15 @@ int can_migrate_task(struct task_struct *p,
> struct lb_env *env)
>   if (tsk_cache_hot == -1)
>   tsk_cache_hot = task_hot(p, env);
>  
> + /*
> +  * On a (sane) asymmetric CPU capacity system, the increase in
> compute
> +  * capacity should offset any potential performance hit caused
> by a
> +  * migration.
> +  */
> + if ((env->dst_grp_type == group_has_spare) &&
> + !migrate_degrades_capacity(p, env))
> + tsk_cache_hot = 0;

... I'm starting to wonder if we should not rename the
tsk_cache_hot variable to something else to make this
code more readable. Probably in another patch :)

-- 
All Rights Reversed.




Re: [PATCH 1/2] sched/fair: Filter out locally-unsolvable misfit imbalances

2021-04-15 Thread Rik van Riel
On Thu, 2021-04-15 at 18:58 +0100, Valentin Schneider wrote:
> ...
> Further tweak find_busiest_queue() to ensure this doesn't happen.
> Also note
> find_busiest_queue() can now iterate over CPUs with a higher capacity
> than
> the local CPU's, so add a capacity check there.
> 
> Signed-off-by: Valentin Schneider 

Reviewed-by: Rik van Riel 

-- 
All Rights Reversed.




Re: [PATCH v2 00/16] Multigenerational LRU Framework

2021-04-14 Thread Rik van Riel
On Wed, 2021-04-14 at 13:14 -0600, Yu Zhao wrote:
> On Wed, Apr 14, 2021 at 9:59 AM Rik van Riel 
> wrote:
> > On Wed, 2021-04-14 at 08:51 -0700, Andi Kleen wrote:
> > > >2) It will not scan PTE tables under non-leaf PMD entries that
> > > >   do not have the accessed bit set, when
> > > >   CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
> > > 
> > > This assumes  that workloads have reasonable locality. Could
> > > there
> > > be a worst case where only one or two pages in each PTE are used,
> > > so this PTE skipping trick doesn't work?
> > 
> > Databases with large shared memory segments shared between
> > many processes come to mind as a real-world example of a
> > worst case scenario.
> 
> Well, I don't think you two are talking about the same thing. Andi
> was focusing on sparsity. Your example seems to be about sharing,
> i.e., high mapcount. Of course both can happen at the same time, as
> I tested here:
> https://lore.kernel.org/linux-mm/yhful%2fddtiml4...@google.com/#t
> 
> I'm skeptical that shared memory used by databases is that sparse,
> i.e., one page per PTE table, because the extremely low locality
> would
> heavily penalize their performance. But my knowledge in databases is
> close to zero. So feel free to enlighten me or just ignore what I
> said.

A database may have a 200GB shared memory segment,
and a worker task that gets spun up to handle a
query might access only 1MB of memory to answer
that query.

That memory could be from anywhere inside the
shared memory segment. Maybe some of the accesses
are more dense, and others more sparse, who knows?

A lot of the locality will depend on how memory space inside the
shared memory segment is reclaimed and recycled inside the database.
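
A rough back-of-the-envelope sketch of that worst case (my own
illustrative numbers, assuming 4kB pages and 512 PTEs per page table):

#include <stdio.h>

int main(void)
{
	unsigned long long segment = 200ULL << 30;	/* 200GB shared segment */
	unsigned long long page = 4096;			/* 4kB pages */
	unsigned long long pte_span = 512 * page;	/* 2MB covered per PTE table */
	unsigned long long touched = 1ULL << 20;	/* ~1MB touched per query */

	unsigned long long tables = segment / pte_span;		/* ~102400 tables */
	unsigned long long pages_touched = touched / page;	/* 256 pages */

	/*
	 * Worst case: each touched page sits in a different PTE table, so up
	 * to 256 non-leaf PMD entries get their accessed bit set, and a scan
	 * still walks 256 * 512 PTEs to find just 256 recently used pages.
	 */
	printf("PTE tables: %llu, pages touched: %llu, PTEs walked: %llu\n",
	       tables, pages_touched, pages_touched * 512);
	return 0;
}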

-- 
All Rights Reversed.




Re: [PATCH v2 00/16] Multigenerational LRU Framework

2021-04-14 Thread Rik van Riel
On Wed, 2021-04-14 at 08:51 -0700, Andi Kleen wrote:
> >2) It will not scan PTE tables under non-leaf PMD entries that
> >   do not have the accessed bit set, when
> >   CONFIG_HAVE_ARCH_PARENT_PMD_YOUNG=y.
> 
> This assumes  that workloads have reasonable locality. Could there
> be a worst case where only one or two pages in each PTE are used,
> so this PTE skipping trick doesn't work?

Databases with large shared memory segments shared between
many processes come to mind as a real-world example of a
worst case scenario.

-- 
All Rights Reversed.




Re: [PATCH v2 00/16] Multigenerational LRU Framework

2021-04-14 Thread Rik van Riel
On Wed, 2021-04-14 at 16:27 +0800, Huang, Ying wrote:
> Yu Zhao  writes:
> 
> > On Wed, Apr 14, 2021 at 12:15 AM Huang, Ying 
> > wrote:
> > > 
> > NUMA Optimization
> > -
> > Support NUMA policies and per-node RSS counters.
> > 
> > We only can move forward one step at a time. Fair?
> 
> You don't need to implement that now definitely.  But we can discuss
> the
> possible solution now.

That was my intention, too. I want to make sure we don't
end up "painting ourselves into a corner" by moving in some
direction we have no way to get out of.

The patch set looks promising, but we need some plan to
avoid the worst case behaviors that forced us into rmap
based scanning initially.

> Note that it's possible that only some processes are bound to some
> NUMA
> nodes, while other processes aren't bound.

For workloads like PostgreSQL or Oracle, it is common
to have maybe 70% of memory in a large shared memory
segment, spread between all the NUMA nodes, and mapped
into hundreds, if not thousands, of processes in the
system.

Now imagine we have an 8 node system, and memory
pressure in the DMA32 zone of node 0.

How will the current VM behave?

What will the virtual scanning need to do?

If we can come up with a solution to make virtual
scanning scale for that kind of workload, great.

If not ... if it turns out most of the benefits of
the multigenerational LRU framework come from sorting
the pages into multiple LRUs, and from being able
to easily reclaim unmapped pages before having to
scan mapped ones, could it be an idea to implement
that first, independently from virtual scanning?

I am all for improving our page reclaim system, I just want to
make sure we don't revisit the old traps that forced us where
we are today :)

-- 
All Rights Reversed.




Re: [PATCH v2 00/16] Multigenerational LRU Framework

2021-04-13 Thread Rik van Riel
On Wed, 2021-04-14 at 09:14 +1000, Dave Chinner wrote:
> On Tue, Apr 13, 2021 at 10:13:24AM -0600, Jens Axboe wrote:
> 
> > The initial posting of this patchset did no better, in fact it did
> > a bit
> > worse. Performance dropped to the same levels and kswapd was using
> > as
> > much CPU as before, but on top of that we also got excessive
> > swapping.
> > Not at a high rate, but 5-10MB/sec continually.
> > 
> > I had some back and forths with Yu Zhao and tested a few new
> > revisions,
> > and the current series does much better in this regard. Performance
> > still dips a bit when page cache fills, but not nearly as much, and
> > kswapd is using less CPU than before.
> 
> Profiles would be interesting, because it sounds to me like reclaim
> *might* be batching page cache removal better (e.g. fewer, larger
> batches) and so spending less time contending on the mapping tree
> lock...
> 
> IOWs, I suspect this result might actually be a result of less lock
> contention due to a change in batch processing characteristics of
> the new algorithm rather than it being a "better" algorithm...

That seems quite likely to me, given the issues we have
had with virtual scan reclaim algorithms in the past.

SeongJae, what is this algorithm supposed to do when faced
with situations like this:
1) Running on a system with 8 NUMA nodes, and memory
   pressure in one of those nodes.
2) Running PostgreSQL or Oracle, with hundreds of
   processes mapping the same (very large) shared
   memory segment.

How do you keep your algorithm from falling into the worst
case virtual scanning scenarios that were crippling the
2.4 kernel 15+ years ago on systems with just a few GB of
memory?

-- 
All Rights Reversed.




[tip: sched/core] sched/fair: Bring back select_idle_smt(), but differently

2021-04-09 Thread tip-bot2 for Rik van Riel
The following commit has been merged into the sched/core branch of tip:

Commit-ID: c722f35b513f807629603bbf24640b1a48be21b5
Gitweb:
https://git.kernel.org/tip/c722f35b513f807629603bbf24640b1a48be21b5
Author:Rik van Riel 
AuthorDate:Fri, 26 Mar 2021 15:19:32 -04:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 09 Apr 2021 18:01:39 +02:00

sched/fair: Bring back select_idle_smt(), but differently

Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge
select_idle_core/cpu()"), resulting in the kernel being more efficient
at finding an idle CPU, and in tasks spending less time waiting to be
run, both according to the schedstats run_delay numbers, and according
to measured application latencies. Yay.

The flip side of this is that we see more task migrations (about 30%
more), higher cache misses, higher memory bandwidth utilization, and
higher CPU use, for the same number of requests/second.

This is most pronounced on a memcache type workload, which saw a
consistent 1-3% increase in total CPU use on the system, due to those
increased task migrations leading to higher L2 cache miss numbers, and
higher memory utilization. The exclusive L3 cache on Skylake does us
no favors there.

On our web serving workload, that effect is usually negligible.

It appears that the increased number of CPU migrations is generally a
good thing, since it leads to lower cpu_delay numbers, reflecting the
fact that tasks get to run faster. However, the reduced locality and
the corresponding increase in L2 cache misses hurts a little.

The patch below appears to fix the regression, while keeping the
benefit of the lower cpu_delay numbers, by reintroducing
select_idle_smt with a twist: when a socket has no idle cores, check
to see if the sibling of "prev" is idle, before searching all the
other CPUs.

This fixes both the occasional 9% regression on the web serving
workload, and the continuous 2% CPU use regression on the memcache
type workload.

With Mel's patches and this patch together, task migrations are still
high, but L2 cache misses, memory bandwidth, and CPU time used are
back down to what they were before. The p95 and p99 response times for
the memcache type application improve by about 10% over what they were
before Mel's patches got merged.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Mel Gorman 
Acked-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20210326151932.2c187...@imladris.surriel.com
---
 kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d73bdb..bc34e35 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int cpu, bool def)
 {
struct sched_domain_shared *sds;
 
-   if (static_branch_likely(&sched_smt_present)) {
-   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-   if (sds)
-   return READ_ONCE(sds->has_idle_cores);
-   }
+   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+   if (sds)
+   return READ_ONCE(sds->has_idle_cores);
 
return def;
 }
@@ -6112,6 +6110,24 @@ static int select_idle_core(struct task_struct *p, int 
core, struct cpumask *cpu
return -1;
 }
 
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int 
target)
+{
+   int cpu;
+
+   for_each_cpu(cpu, cpu_smt_mask(target)) {
+   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+   continue;
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+   return cpu;
+   }
+
+   return -1;
+}
+
 #else /* CONFIG_SCHED_SMT */
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6128,6 +6144,11 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
return __select_idle_cpu(core);
 }
 
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
*sd, int target)
+{
+   return -1;
+}
+
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6135,11 +6156,10 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
bool has_idle_core, int target)
 {
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
-   bool smt = test_idle_cores(target, false);
int this = smp_processo

[tip: sched/core] sched/fair: Bring back select_idle_smt(), but differently

2021-04-09 Thread tip-bot2 for Rik van Riel
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 6bcd3e21ba278098920d26d4888f5e6f4087c61d
Gitweb:
https://git.kernel.org/tip/6bcd3e21ba278098920d26d4888f5e6f4087c61d
Author:Rik van Riel 
AuthorDate:Fri, 26 Mar 2021 15:19:32 -04:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 08 Apr 2021 23:09:44 +02:00

sched/fair: Bring back select_idle_smt(), but differently

Mel Gorman did some nice work in 9fe1f127b913 ("sched/fair: Merge
select_idle_core/cpu()"), resulting in the kernel being more efficient
at finding an idle CPU, and in tasks spending less time waiting to be
run, both according to the schedstats run_delay numbers, and according
to measured application latencies. Yay.

The flip side of this is that we see more task migrations (about 30%
more), higher cache misses, higher memory bandwidth utilization, and
higher CPU use, for the same number of requests/second.

This is most pronounced on a memcache type workload, which saw a
consistent 1-3% increase in total CPU use on the system, due to those
increased task migrations leading to higher L2 cache miss numbers, and
higher memory utilization. The exclusive L3 cache on Skylake does us
no favors there.

On our web serving workload, that effect is usually negligible.

It appears that the increased number of CPU migrations is generally a
good thing, since it leads to lower cpu_delay numbers, reflecting the
fact that tasks get to run faster. However, the reduced locality and
the corresponding increase in L2 cache misses hurts a little.

The patch below appears to fix the regression, while keeping the
benefit of the lower cpu_delay numbers, by reintroducing
select_idle_smt with a twist: when a socket has no idle cores, check
to see if the sibling of "prev" is idle, before searching all the
other CPUs.

This fixes both the occasional 9% regression on the web serving
workload, and the continuous 2% CPU use regression on the memcache
type workload.

With Mel's patches and this patch together, task migrations are still
high, but L2 cache misses, memory bandwidth, and CPU time used are
back down to what they were before. The p95 and p99 response times for
the memcache type application improve by about 10% over what they were
before Mel's patches got merged.

Signed-off-by: Rik van Riel 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Mel Gorman 
Acked-by: Vincent Guittot 
Link: https://lkml.kernel.org/r/20210326151932.2c187...@imladris.surriel.com
---
 kernel/sched/fair.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6d73bdb..d0bd861 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6038,11 +6038,9 @@ static inline bool test_idle_cores(int cpu, bool def)
 {
struct sched_domain_shared *sds;
 
-   if (static_branch_likely(&sched_smt_present)) {
-   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-   if (sds)
-   return READ_ONCE(sds->has_idle_cores);
-   }
+   sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+   if (sds)
+   return READ_ONCE(sds->has_idle_cores);
 
return def;
 }
@@ -6112,6 +6110,24 @@ static int select_idle_core(struct task_struct *p, int 
core, struct cpumask *cpu
return -1;
 }
 
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int 
target)
+{
+   int cpu;
+
+   for_each_cpu(cpu, cpu_smt_mask(target)) {
+   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+   continue;
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+   return cpu;
+   }
+
+   return -1;
+}
+
 #else /* CONFIG_SCHED_SMT */
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6128,6 +6144,11 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
return __select_idle_cpu(core);
 }
 
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
*sd, int target)
+{
+   return -1;
+}
+
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6135,11 +6156,10 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, 
bool has_idle_core, int target)
 {
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
-   bool smt = test_idle_cores(target, false);
int this = smp_processo

Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-08 Thread Rik van Riel
On Wed, 2021-04-07 at 12:19 +0200, Peter Zijlstra wrote:
> On Wed, Apr 07, 2021 at 11:54:37AM +0200, Peter Zijlstra wrote:
> 
> > Let me have another poke at it.
> 
> Pretty much what you did, except I also did s/smt/has_idle_core/ and
> fixed that @sd thing.
> 
> Like so then?

Looks good to me. Thank you.

-- 
All Rights Reversed.




Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-06 Thread Rik van Riel
On Tue, 2021-04-06 at 17:31 +0200, Vincent Guittot wrote:
> On Tue, 6 Apr 2021 at 17:26, Rik van Riel  wrote:
> > On Tue, 2021-04-06 at 17:10 +0200, Vincent Guittot wrote:
> > > On Fri, 26 Mar 2021 at 20:19, Rik van Riel 
> > > wrote:
> > > 
> > > > -static int select_idle_cpu(struct task_struct *p, struct
> > > > sched_domain *sd, int target)
> > > > +static int select_idle_cpu(struct task_struct *p, struct
> > > > sched_domain *sd, int prev, int target)
> > > >  {
> > > > struct cpumask *cpus =
> > > > this_cpu_cpumask_var_ptr(select_idle_mask);
> > > > int i, cpu, idle_cpu = -1, nr = INT_MAX;
> > > > @@ -6136,23 +6163,32 @@ static int select_idle_cpu(struct
> > > > task_struct *p, struct sched_domain *sd, int t
> > > > 
> > > > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > > > 
> > > > -   if (sched_feat(SIS_PROP) && !smt) {
> > > > -   u64 avg_cost, avg_idle, span_avg;
> > > > +   if (!smt) {
> > > > +   if (cpus_share_cache(prev, target)) {
> > > 
> > > Have you checked the impact on no smt system ? would worth a
> > > static
> > > branch.
> > > 
> > > Also, this doesn't need to be in select_idle_cpu() which aims to
> > > loop
> > > the sched_domain becaus you only compare  target and prev. So you
> > > can
> > > move this call to select_idle_smt() in select_idle_sibling()
> > 
> > After Mel's rewrite, there no longer are calls to
> > select_idle_core() or select_idle_smt() in select_idle_sibling().
> 
> select_idle_smt() had even disappeared that why it was not in
> select_idle_sibling
> 
> > Everything got folded into one single loop in select_idle_cpu()
> 
> but this is done completely out of the loop so we don't need to
> complify the function with unrelated stuff

Not entirely. The call to select_idle_smt() is still
conditional on test_idle_cores() returning false.

We only look for the other sibling if there is no idle core in
the LLC. If there is an idle core, we prefer that.

Pulling the select_idle_smt() call out of select_idle_cpu()
would mean having to test_idle_cores() twice.
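
In toy form (a simplified model of the ordering, not the actual
scheduler code), the flow I want to keep is:

#include <stdbool.h>
#include <stdio.h>

/* Pretend check; in reality this consults the LLC's shared state once. */
static bool llc_has_idle_core(void)
{
	return false;	/* assume no fully idle core for this example */
}

/* Toy "sibling of prev": on this pretend topology CPUs pair up as 0/1, 2/3, ... */
static int sibling_of(int cpu)
{
	return cpu ^ 1;
}

static int pick_cpu(int prev, int target, bool prev_shares_cache)
{
	if (llc_has_idle_core())
		return target;		/* go hunt for an idle core */

	/* No idle core: cheaply try prev's SMT sibling first... */
	if (prev_shares_cache)
		return sibling_of(prev);

	/* ...and only then fall back to the full idle-CPU scan. */
	return target;
}

int main(void)
{
	printf("picked CPU %d\n", pick_cpu(2, 6, true));
	return 0;
}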

-- 
All Rights Reversed.




Re: [PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-04-06 Thread Rik van Riel
On Tue, 2021-04-06 at 17:10 +0200, Vincent Guittot wrote:
> On Fri, 26 Mar 2021 at 20:19, Rik van Riel  wrote:
> 
> > -static int select_idle_cpu(struct task_struct *p, struct
> > sched_domain *sd, int target)
> > +static int select_idle_cpu(struct task_struct *p, struct
> > sched_domain *sd, int prev, int target)
> >  {
> > struct cpumask *cpus =
> > this_cpu_cpumask_var_ptr(select_idle_mask);
> > int i, cpu, idle_cpu = -1, nr = INT_MAX;
> > @@ -6136,23 +6163,32 @@ static int select_idle_cpu(struct
> > task_struct *p, struct sched_domain *sd, int t
> > 
> > cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > 
> > -   if (sched_feat(SIS_PROP) && !smt) {
> > -   u64 avg_cost, avg_idle, span_avg;
> > +   if (!smt) {
> > +   if (cpus_share_cache(prev, target)) {
> 
> Have you checked the impact on no smt system ? would worth a static
> branch.
> 
> Also, this doesn't need to be in select_idle_cpu() which aims to loop
> the sched_domain becaus you only compare  target and prev. So you can
> move this call to select_idle_smt() in select_idle_sibling()

After Mel's rewrite, there no longer are calls to
select_idle_core() or select_idle_smt() in select_idle_sibling().

Everything got folded into one single loop in select_idle_cpu()

I would be happy to pull the static branch out of select_idle_smt()
and place it into this if condition, though. You are right that
would save some overhead on non-smt systems.

Peter, would you prefer a follow-up patch for that or a version 4
of the patch?

-- 
All Rights Reversed.




[PATCH v3] sched/fair: bring back select_idle_smt, but differently

2021-03-26 Thread Rik van Riel
On Mon, 22 Mar 2021 11:03:06 +
Mel Gorman  wrote:


> Second, select_idle_smt() does not use the cpus mask so consider moving
> the cpus initialisation after select_idle_smt() has been called.
> Specifically this initialisation
> 
>   cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> 
> Alternatively, clear the bits in the SMT sibling scan to avoid checking
> the siblings twice. It's a tradeoff because initialising and clearing
> bits is not free and the cost is wasted if a sibling is free.

I tried a number of different variations on moving the CPU mask
initialization, and clearing CPUs from the mask, and failed to
get any clear results from those in testing, even in workloads
with lots of context switches.

Below is a simple version that seems to perform identically to
more complicated versions :)

---8<---
sched,fair: bring back select_idle_smt, but differently

Mel Gorman did some nice work in 9fe1f127b913
("sched/fair: Merge select_idle_core/cpu()"), resulting in the kernel
being more efficient at finding an idle CPU, and in tasks spending less
time waiting to be run, both according to the schedstats run_delay
numbers, and according to measured application latencies. Yay.

The flip side of this is that we see more task migrations (about
30% more), higher cache misses, higher memory bandwidth utilization,
and higher CPU use, for the same number of requests/second.

This is most pronounced on a memcache type workload, which saw
a consistent 1-3% increase in total CPU use on the system, due
to those increased task migrations leading to higher L2 cache
miss numbers, and higher memory utilization. The exclusive L3
cache on Skylake does us no favors there.

On our web serving workload, that effect is usually negligible.

It appears that the increased number of CPU migrations is generally
a good thing, since it leads to lower cpu_delay numbers, reflecting
the fact that tasks get to run faster. However, the reduced locality
and the corresponding increase in L2 cache misses hurts a little.

The patch below appears to fix the regression, while keeping the
benefit of the lower cpu_delay numbers, by reintroducing select_idle_smt
with a twist: when a socket has no idle cores, check to see if the
sibling of "prev" is idle, before searching all the other CPUs.

This fixes both the occasional 9% regression on the web serving
workload, and the continuous 2% CPU use regression on the memcache
type workload.

With Mel's patches and this patch together, task migrations are still
high, but L2 cache misses, memory bandwidth, and CPU time used are back
down to what they were before. The p95 and p99 response times for the
memcache type application improve by about 10% over what they were
before Mel's patches got merged.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 68 ++++++++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..69680158963f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6098,6 +6098,28 @@ static int select_idle_core(struct task_struct *p, int 
core, struct cpumask *cpu
return -1;
 }
 
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int
+target)
+{
+   int cpu;
+
+   if (!static_branch_likely(&sched_smt_present))
+   return -1;
+
+   for_each_cpu(cpu, cpu_smt_mask(target)) {
+   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+   continue;
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+   return cpu;
+   }
+
+   return -1;
+}
+
 #else /* CONFIG_SCHED_SMT */
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6114,6 +6136,11 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
return __select_idle_cpu(core);
 }
 
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
*sd, int target)
+{
+   return -1;
+}
+
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6121,7 +6148,7 @@ static inline int select_idle_core(struct task_struct *p, 
int core, struct cpuma
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
prev, int target)
 {
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
@@ -6136,23 +6163,32 @@ static int select_idle_cpu(struct task_struct *p, 
struct sched_domain *sd, int t
 
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-  

Re: [PATCH v2] sched/fair: bring back select_idle_smt, but differently

2021-03-22 Thread Rik van Riel
On Mon, 2021-03-22 at 15:33 +, Mel Gorman wrote:

> If trying that, I would put that in a separate patch. At one point
> I did play with clearing prev, target and recent but hit problems.
> Initialising the mask and clearing them in select_idle_sibling() hurt
> the fast path and doing it later was not much better. IIRC, the
> problem
> I hit was that the cost of clearing multiple CPUs before the search
> was
> not offset by gains from a more efficient search.

I'm definitely avoiding the more expensive operations,
and am only using __cpumask_clear_cpu now :)

> If I had to guess, simply initialising cpumask after calling
> select_idle_smt() will be faster for your particular case because you
> have a reasonable expectation that prev's SMT sibling is idle when
> there
> are no idle cores. Checking if prev's sibling is free when there are
> no
> idle cores is fairly cheap in comparison to a cpumask initialisation
> and
> partial clearing.
> 
> If you have the testing capacity and time, test both.

Kicking off more tests soon. I'll get back with a v3 patch
on Wednesday.

-- 
All Rights Reversed.




Re: [PATCH v2] sched/fair: bring back select_idle_smt, but differently

2021-03-22 Thread Rik van Riel
On Mon, 2021-03-22 at 11:03 +, Mel Gorman wrote:
> On Sun, Mar 21, 2021 at 03:03:58PM -0400, Rik van Riel wrote:
> > Mel Gorman did some nice work in 9fe1f127b913
> > ("sched/fair: Merge select_idle_core/cpu()"), resulting in the
> > kernel
> > being more efficient at finding an idle CPU, and in tasks
> > spending less
> > time waiting to be run, both according to the schedstats
> > run_delay
> > numbers, and according to measured application latencies. Yay.
> > 
> 
> Other than unusual indentation of the changelog, yey for part 1.
> 
> > The flip side of this is that we see more task migrations
> > (about
> > 30% more), higher cache misses, higher memory bandwidth
> > utilization,
> > and higher CPU use, for the same number of requests/second.
> > 
> 
> I am having difficulty with this part and whether this patch affects
> task
> migrations in particular.

Sorry, I should be more clear in the changelog for the
next version. Task migrations continue to be high with
this patch applied, but memory bandwidth and L2 cache
misses go back down, due to better cache locality.

> > This is most pronounced on a memcache type workload, which saw
> > a consistent 1-3% increase in total CPU use on the system, due
> > to those increased task migrations leading to higher L2 cache
> > miss numbers, and higher memory utilization. The exclusive L3
> > cache on Skylake does us no favors there.
> > 
> 
> Out of curiousity, what is the load generator for memcache or is this
> based on analysis of a production workload? I ask because mutilate
> (https://github.com/leverich/mutilate) is allegedly a load generator
> that can simulate FaceBook patterns but it is old. I would be
> interested
> in hearing if mutilate is used and if so, what parameters the load
> generator is given.

I had never heard of mutilate, I'll take a look at that.

I am running systems that get real production queries, but
at a higher average load than regular production systems.
Also, the same queries get replicated out to 3 systems on
the A and B side each, which seems to be enough to factor
out random noise for this workload.

> This is understandable because finding an idle CPU and incurring the
> migration cost can be better than stacking a task on a busy CPU for
> workloads that are sensitive to wakeup latency.

It absolutely is, most of the time. Our web servers have
"only" about 70k context switches/second on a 36 CPU system,
while the memcache style workload has about 170k context
switches/second on the same hardware.

> But this is the part I'm having a problem with. If the sibling of
> prev is
> idle then a task migration cost is still incurred. The main
> difference is
> that it's more likely to share a L1 or L2 cache and with no idle
> cores,
> some sharing of resources between HT siblings is inevitable.
> 
> Can you confirm that task migrations are still higher with this
> patch?

They are higher. In fact, with my patch the number of task
migrations increases ever so slightly over what it is with
just your patches.  However, memory bandwidth used, and the
number of L2 cache misses go down.

> > This fixes both the occasional 9% regression on the web serving
> > workload, and the continuous 2% CPU use regression on the
> > memcache
> > type workload.
> > 
> > With Mel's patches and this patch together, the p95 and p99
> > response
> > times for the memcache type application improve by about 20%
> > over what
> > they were before Mel's patches got merged.
> 
> Again, I would be very interested in hearing how this conclusion was
> reached because I can update mmtests accordingly and wire it into the
> "standard battery of scheduler workloads".

I monitor a number of different metrics during this test:
- number of task migrations
- number of context switches
- p95 & p99 application response latency
- memory bandwidth used
- L2 cache misses per 1000 instructions
- CPU utilization (total, user, system)
- the cpu_delay schedstat number

There are other metrics too, but these seem the most useful
ones.
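
For reference, the cpu_delay (run_delay) number comes from the
per-task schedstats. A minimal reader, assuming schedstats are
enabled and the usual three-field /proc/<pid>/schedstat layout,
could look like this:

#include <stdio.h>

int main(void)
{
	unsigned long long cputime, run_delay, slices;
	FILE *f = fopen("/proc/self/schedstat", "r");

	if (!f || fscanf(f, "%llu %llu %llu", &cputime, &run_delay, &slices) != 3) {
		perror("schedstat");
		return 1;
	}
	fclose(f);

	/* time on CPU (ns), time waiting on a runqueue (ns), timeslice count */
	printf("on-cpu %llu ns, run delay %llu ns, %llu slices\n",
	       cputime, run_delay, slices);
	return 0;
}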

With just your patches applied, I see an increase in:
- number of task migrations
- amount of CPU time used
- memory bandwidth & L2 cache misses
and a reduction in:
- cpu_delay
- p95 & p99 application response latency

With my patch on top, I continue to see the benefits in
the cpu_delay and application response latency metrics,
while the CPU time, memory bandwidth, and L2 cache miss
metrics all go back down.

I suspect this is an artifact of the L3 CPU cache on
Skylake being an eviction cache, rather than an
inclusive cache. When a task

[PATCH v2] sched/fair: bring back select_idle_smt, but differently

2021-03-21 Thread Rik van Riel
Mel Gorman did some nice work in 9fe1f127b913
("sched/fair: Merge select_idle_core/cpu()"), resulting in the kernel
being more efficient at finding an idle CPU, and in tasks spending less
time waiting to be run, both according to the schedstats run_delay
numbers, and according to measured application latencies. Yay.

The flip side of this is that we see more task migrations (about
30% more), higher cache misses, higher memory bandwidth utilization,
and higher CPU use, for the same number of requests/second.

This is most pronounced on a memcache type workload, which saw
a consistent 1-3% increase in total CPU use on the system, due
to those increased task migrations leading to higher L2 cache
miss numbers, and higher memory utilization. The exclusive L3
cache on Skylake does us no favors there.

On our web serving workload, that effect is usually negligible,
but occasionally as large as a 9% regression in the number of
requests served, due to some interaction between scheduler latency
and the web load balancer.

It appears that the increased number of CPU migrations is generally
a good thing, since it leads to lower cpu_delay numbers, reflecting
the fact that tasks get to run faster. However, the reduced locality
and the corresponding increase in L2 cache misses hurts a little.

The patch below appears to fix the regression, while keeping the
benefit of the lower cpu_delay numbers, by reintroducing select_idle_smt
with a twist: when a socket has no idle cores, check to see if the
sibling of "prev" is idle, before searching all the other CPUs.

This fixes both the occasional 9% regression on the web serving
workload, and the continuous 2% CPU use regression on the memcache
type workload.

With Mel's patches and this patch together, the p95 and p99 response
times for the memcache type application improve by about 20% over what
they were before Mel's patches got merged.

    Signed-off-by: Rik van Riel 

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..0c986972f4cd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6098,6 +6098,28 @@ static int select_idle_core(struct task_struct *p, int 
core, struct cpumask *cpu
return -1;
 }
 
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int
+target)
+{
+   int cpu;
+
+   if (!static_branch_likely(&sched_smt_present))
+   return -1;
+
+   for_each_cpu(cpu, cpu_smt_mask(target)) {
+   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+   continue;
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+   return cpu;
+   }
+
+   return -1;
+}
+
 #else /* CONFIG_SCHED_SMT */
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6114,6 +6136,11 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
return __select_idle_cpu(core);
 }
 
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
*sd, int target)
+{
+   return -1;
+}
+
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6121,7 +6148,7 @@ static inline int select_idle_core(struct task_struct *p, 
int core, struct cpuma
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
prev, int target)
 {
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
@@ -6155,6 +6182,13 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
time = cpu_clock(this);
}
 
+   if (!smt && cpus_share_cache(prev, target)) {
+   /* No idle core. Check if prev has an idle sibling. */
+   i = select_idle_smt(p, sd, prev);
+   if ((unsigned int)i < nr_cpumask_bits)
+   return i;
+   }
+
for_each_cpu_wrap(cpu, cpus, target) {
if (smt) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6307,7 +6341,7 @@ static int select_idle_sibling(struct task_struct *p, int 
prev, int target)
if (!sd)
return target;
 
-   i = select_idle_cpu(p, sd, target);
+   i = select_idle_cpu(p, sd, prev, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
 



Re: [PATCH] sched/fair: bring back select_idle_smt, but differently

2021-03-21 Thread Rik van Riel
On Sun, 2021-03-21 at 14:48 -0400, Rik van Riel wrote:
> 
> + if (cpus_share_cache(prev, target)) {
> + /* No idle core. Check if prev has an idle sibling. */
> + i = select_idle_smt(p, sd, prev);

Uh, one minute. This is the wrong version of the patch.

There's supposed to be a !smt && in there as well. I first
merged it right, then somehow git ate it, and I merged it
wrong the second time. Sigh.

-- 
All Rights Reversed.




[PATCH] sched/fair: bring back select_idle_smt, but differently

2021-03-21 Thread Rik van Riel
Mel Gorman did some nice work in 9fe1f127b913
("sched/fair: Merge select_idle_core/cpu()"), resulting in the kernel
being more efficient at finding an idle CPU, and in tasks spending less
time waiting to be run, both according to the schedstats run_delay
numbers, and according to measured application latencies. Yay.

The flip side of this is that we see more task migrations (about
30% more), higher cache misses, higher memory bandwidth utilization,
and higher CPU use, for the same number of requests/second.

This is most pronounced on a memcache type workload, which saw
a consistent 1-3% increase in total CPU use on the system, due
to those increased task migrations leading to higher L2 cache
miss numbers, and higher memory utilization. The exclusive L3
cache on Skylake does us no favors there.

On our web serving workload, that effect is usually negligible,
but occasionally as large as a 9% regression in the number of
requests served, due to some interaction between scheduler latency
and the web load balancer.

It appears that the increased number of CPU migrations is generally
a good thing, since it leads to lower cpu_delay numbers, reflecting
the fact that tasks get to run faster. However, the reduced locality
and the corresponding increase in L2 cache misses hurts a little.

The patch below appears to fix the regression, while keeping the
benefit of the lower cpu_delay numbers, by reintroducing select_idle_smt
with a twist: when a socket has no idle cores, check to see if the
sibling of "prev" is idle, before searching all the other CPUs.

This fixes both the occasional 9% regression on the web serving
workload, and the continuous 2% CPU use regression on the memcache
type workload.

With Mel's patches and this patch together, the p95 and p99 response
times for the memcache type application improve by about 20% over what
they were before Mel's patches got merged.

    Signed-off-by: Rik van Riel 

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 794c2cb945f8..fcc47675d160 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6098,6 +6098,28 @@ static int select_idle_core(struct task_struct *p, int 
core, struct cpumask *cpu
return -1;
 }
 
+/*
+ * Scan the local SMT mask for idle CPUs.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int
+target)
+{
+   int cpu;
+
+   if (!static_branch_likely(&sched_smt_present))
+   return -1;
+
+   for_each_cpu(cpu, cpu_smt_mask(target)) {
+   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
+   continue;
+   if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
+   return cpu;
+   }
+
+   return -1;
+}
+
 #else /* CONFIG_SCHED_SMT */
 
 static inline void set_idle_cores(int cpu, int val)
@@ -6114,6 +6136,11 @@ static inline int select_idle_core(struct task_struct 
*p, int core, struct cpuma
return __select_idle_cpu(core);
 }
 
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain 
*sd, int target)
+{
+   return -1;
+}
+
 #endif /* CONFIG_SCHED_SMT */
 
 /*
@@ -6121,7 +6148,7 @@ static inline int select_idle_core(struct task_struct *p, 
int core, struct cpuma
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
  * average idle time for this rq (as found in rq->avg_idle).
  */
-static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int 
prev, int target)
 {
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
int i, cpu, idle_cpu = -1, nr = INT_MAX;
@@ -6155,6 +6182,13 @@ static int select_idle_cpu(struct task_struct *p, struct 
sched_domain *sd, int t
time = cpu_clock(this);
}
 
+   if (cpus_share_cache(prev, target)) {
+   /* No idle core. Check if prev has an idle sibling. */
+   i = select_idle_smt(p, sd, prev);
+   if ((unsigned int)i < nr_cpumask_bits)
+   return i;
+   }
+
for_each_cpu_wrap(cpu, cpus, target) {
if (smt) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -6307,7 +6341,7 @@ static int select_idle_sibling(struct task_struct *p, int 
prev, int target)
if (!sd)
return target;
 
-   i = select_idle_cpu(p, sd, target);
+   i = select_idle_cpu(p, sd, prev, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
 


Re: [PATCH v1 09/14] mm: multigenerational lru: mm_struct list

2021-03-15 Thread Rik van Riel
On Sat, 2021-03-13 at 00:57 -0700, Yu Zhao wrote:

> +/*
> + * After pages are faulted in, they become the youngest generation.
> They must
> + * go through aging process twice before they can be evicted. After
> first scan,
> + * their accessed bit set during initial faults are cleared and they
> become the
> + * second youngest generation. And second scan makes sure they
> haven't been used
> + * since the first.
> + */

I have to wonder if the reductions in OOM kills and
low-memory tab discards are due to this aging policy
change, rather than to the switch to virtual scanning.

-- 
All Rights Reversed.




Re: [PATCH] sched/fair: Prefer idle CPU to cache affinity

2021-02-27 Thread Rik van Riel
On Fri, 2021-02-26 at 22:10 +0530, Srikar Dronamraju wrote:

> Current order of preference to pick a LLC while waking a wake-affine
> task:
> 1. Between the waker CPU and previous CPU, prefer the LLC of the CPU
>that is idle.
> 
> 2. Between the waker CPU and previous CPU, prefer the LLC of the CPU
>that is less lightly loaded.
> 
> In the current situation where waker and previous CPUs are busy, but
> only one of its LLC has an idle CPU, Scheduler may end up picking a
> LLC
> with no idle CPUs. To mitigate this, add a new step between 1 and 2
> where Scheduler compares idle CPUs in waker and previous LLCs and
> picks
> the appropriate one.

I like that idea a lot. That could also solve some of the
issues sometimes observed on multi-node x86 systems, and
probably on the newer AMD chips with several LLCs on chip.

> + if (sched_feat(WA_WAKER) && tnr_busy < tllc_size)
> + return this_cpu;

I wonder if we need to use a slightly lower threshold on
very large LLCs, both to account for the fact that the
select_idle_cpu code may not find the single idle CPU
among a dozen busy ones, and because on a system with
hyperthreading we may often be better off picking another
LLC to avoid HT contention issues?

Maybe we could use "tnr_busy * 4 < tllc_size * 3" or something
like that?

That way we will only try to find the last 5 idle CPUs in a 22 CPU
LLC if the other LLC also has fewer than 6 idle cores.

That might increase our chances of finding an idle CPU
with SIS_PROP enabled, and might allow WA_WAKER to be
true by default.
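
A minimal sketch of that threshold idea (illustrative only, with
made-up numbers):

#include <stdbool.h>
#include <stdio.h>

/* Prefer this LLC only while it is no more than ~75% busy. */
static bool waker_llc_preferred(int tnr_busy, int tllc_size)
{
	return tnr_busy * 4 < tllc_size * 3;
}

int main(void)
{
	/* 22-CPU LLC: 16 busy CPUs is still fine, 17 busy means look elsewhere. */
	printf("16/22 busy -> %d\n", waker_llc_preferred(16, 22));
	printf("17/22 busy -> %d\n", waker_llc_preferred(17, 22));
	return 0;
}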

> + /* For better wakeup latency, prefer idler LLC to cache
> affinity */
> + diff = tnr_busy * pllc_size - sync - pnr_busy * tllc_size;
> + if (!diff)
> + return nr_cpumask_bits;
> + if (diff < 0)
> + return this_cpu;
> +
> + return prev_cpu;
> +}

-- 
All Rights Reversed.




[PATCH 4/3] mm,shmem,thp: limit shmem THP allocations to requested zones

2021-02-24 Thread Rik van Riel
On Wed, 24 Feb 2021 08:55:40 -0800 (PST)
Hugh Dickins  wrote:
> On Wed, 24 Feb 2021, Rik van Riel wrote:
> > On Wed, 2021-02-24 at 00:41 -0800, Hugh Dickins wrote:  
> > > Oh, I'd forgotten all about that gma500 aspect:
> > > well, I can send a fixup later on.  
> > 
> > I already have code to fix that, which somebody earlier
> > in this discussion convinced me to throw away. Want me
> > to send it as a patch 4/3 ?  
> 
> If Andrew wants it all, yes, please do add that - thanks Rik.

Trivial patch to fix the gma500 thing below:

---8<---

mm,shmem,thp: limit shmem THP allocations to requested zones

Hugh pointed out that the gma500 driver uses shmem pages, but needs
to limit them to the DMA32 zone. Ensure the allocations resulting from
the gfp_mask returned by limit_gfp_mask use the zone flags that were
originally passed to shmem_getpage_gfp.

Signed-off-by: Rik van Riel 
Suggested-by: Hugh Dickins 
---
 mm/shmem.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index ee3cea10c2a4..876fec89686f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1539,7 +1539,11 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 {
gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
-   gfp_t result = huge_gfp & ~allowflags;
+   gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
+   gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
+
+   /* Allow allocations only from the originally specified zones. */
+   result |= zoneflags;
 
/*
 * Minimize the result gfp by taking the union with the deny flags,
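
To illustrate the mask algebra (a standalone toy with made-up flag
values, not the real GFP bits), the zone bits from the caller's gfp
replace whatever zone bits the huge gfp carried:

#include <stdio.h>

/* Made-up flag values purely for illustration; the real GFP bits differ. */
#define TOY_ZONEMASK	0x0fu	/* stand-in for GFP_ZONEMASK */
#define TOY_ALLOW	0x30u	/* stand-in for __GFP_IO | __GFP_FS | __GFP_RECLAIM */
#define TOY_DENY	0xc0u	/* stand-in for __GFP_NOWARN | __GFP_NORETRY */

/* Same shape as the zone handling added above: keep only the caller's zones. */
static unsigned int toy_limit_gfp_mask(unsigned int huge_gfp, unsigned int limit_gfp)
{
	unsigned int zoneflags = limit_gfp & TOY_ZONEMASK;
	unsigned int result = huge_gfp & ~(TOY_ALLOW | TOY_ZONEMASK);

	result |= zoneflags;			/* zones come from the caller */
	result |= limit_gfp & TOY_DENY;		/* deny flags are additive */
	result |= huge_gfp & limit_gfp & TOY_ALLOW; /* allow flags must agree */
	return result;
}

int main(void)
{
	/* huge_gfp asks for zone 0x2; the caller (think gma500) only allows 0x4. */
	printf("result = 0x%02x\n", toy_limit_gfp_mask(0x32u, 0xc4u));
	return 0;
}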



Re: [PATCH v6 0/3] mm,thp,shm: limit shmem THP alloc gfp_mask

2021-02-24 Thread Rik van Riel
On Wed, 2021-02-24 at 00:41 -0800, Hugh Dickins wrote:
> On Mon, 14 Dec 2020, Vlastimil Babka wrote:
> 
> > > (There's also a specific issue with the gfp_mask limiting: I have
> > > not yet reviewed the allowing and denying in detail, but it looks
> > > like it does not respect the caller's GFP_ZONEMASK - the gfp in
> > > shmem_getpage_gfp() and shmem_read_mapping_page_gfp() is there to
> > > satisfy the gma500, which wanted to use shmem but could only
> manage
> > > DMA32.  I doubt it wants THPS, but shmem_enabled=force forces
> them.)
> 
> Oh, I'd forgotten all about that gma500 aspect:
> well, I can send a fixup later on.

I already have code to fix that, which somebody earlier
in this discussion convinced me to throw away. Want me
to send it as a patch 4/3 ?

-- 
All Rights Reversed.




Re: [PATCH RFC clocksource 2/5] clocksource: Retry clock read if long delays detected

2021-01-06 Thread Rik van Riel
On Wed, 2021-01-06 at 11:53 -0800, Paul E. McKenney wrote:
> On Wed, Jan 06, 2021 at 11:28:00AM -0500, Rik van Riel wrote:
> 
> > +   wdagain_nsec = clocksource_cyc2ns(delta, watchdog->mult, watchdog->shift);
> > +   if (wdagain_nsec < 0 || wdagain_nsec > WATCHDOG_MAX_SKEW) {
> > +   wderr_nsec = wdagain_nsec;
> > +   if (nretries++ < max_read_retries)
> > +   goto retry;
> > +   }
> > 
> > Given that clocksource_cyc2ns uses unsigned multiplication
> > followed by a right shift, do we need to test for <0?
> 
> I am worried about the possibility of the "shift" argument to
> clocksource_cyc2ns() being zero.  For example, unless I am missing
> something, clocksource_tsc has a zero .shift field.

Oh, good point!
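
To make the concern concrete, here is a minimal user-space sketch (the
conversion mirrors the shape of clocksource_cyc2ns(); the delta and mult
values are made up) showing how a zero shift lets the unsigned product
read back as a negative s64, which is what the "< 0" test catches:

/* Illustrative only; values chosen so the product sets bit 63. */
#include <stdio.h>
#include <stdint.h>

static int64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	/* unsigned multiply, then right shift, returned as signed */
	return (cycles * (uint64_t)mult) >> shift;
}

int main(void)
{
	uint64_t delta = 1ULL << 40;
	uint32_t mult  = 1U << 23;	/* delta * mult == 1ULL << 63 */

	printf("shift=8: %lld\n", (long long)cyc2ns(delta, mult, 8)); /* positive */
	printf("shift=0: %lld\n", (long long)cyc2ns(delta, mult, 0)); /* negative */
	return 0;
}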



Re: [PATCH RFC clocksource 2/5] clocksource: Retry clock read if long delays detected

2021-01-06 Thread Rik van Riel
On Tue, 2021-01-05 at 16:41 -0800, paul...@kernel.org wrote:
> 
> @@ -203,7 +204,6 @@ static void
> clocksource_watchdog_inject_delay(void)
>   injectfail = inject_delay_run;
>   if (!(++injectfail / inject_delay_run % inject_delay_freq)) {
>   printk("%s(): Injecting delay.\n", __func__);
> - injectfail = 0;
>   for (i = 0; i < 2 * WATCHDOG_THRESHOLD / NSEC_PER_MSEC;
> i++)
>   udelay(1000);

Wait, patch 1 just added that line?

Should patch 1 not add it and this patch go without
this removal? :)

+   wdagain_nsec = clocksource_cyc2ns(delta, watchdog->mult, watchdog->shift);
+   if (wdagain_nsec < 0 || wdagain_nsec > WATCHDOG_MAX_SKEW) {
+   wderr_nsec = wdagain_nsec;
+   if (nretries++ < max_read_retries)
+   goto retry;
+   }

Given that clocksource_cyc2ns uses unsigned multiplication
followed by a right shift, do we need to test for <0?



Re: [MOCKUP] x86/mm: Lightweight lazy mm refcounting

2020-12-03 Thread Rik van Riel
On Thu, 2020-12-03 at 12:31 +, Matthew Wilcox wrote:

> And this just makes me think RCU freeing of mm_struct.  I'm sure it's
> more complicated than that (then, or now), but if an anonymous
> process
> is borrowing a freed mm, and the mm is freed by RCU then it will not
> go
> away until the task context switches.  When we context switch back to
> the anon task, it'll borrow some other task's MM and won't even
> notice
> that the MM it was using has gone away.

One major complication here is that most of the
active_mm borrowing is done by the idle task,
but RCU does not wait for idle tasks to context
switch.

That means RCU, as it is today, is not a
mechanism that mm_struct freeing could just
piggyback off.

-- 
All Rights Reversed.




Re: [PATCH 2/3] mm,thp,shm: limit gfp mask to no more than specified

2020-11-30 Thread Rik van Riel
On Mon, 2020-11-30 at 11:00 +0100, Michal Hocko wrote:
> On Fri 27-11-20 14:03:39, Rik van Riel wrote:
> > On Fri, 2020-11-27 at 08:52 +0100, Michal Hocko wrote:
> > > On Thu 26-11-20 13:04:14, Rik van Riel wrote:
> > > > I would be more than happy to implement things differently,
> > > > but I am not sure what alternative you are suggesting.
> > > 
> > > Simply do not alter gfp flags? Or warn in some cases of a serious
> > > mismatch.
> > > E.g. GFP_ZONEMASK mismatch because there are already GFP_KERNEL
> > > users
> > > of
> > > shmem.
> > 
> > Not altering the gfp flags is not really an option,
> > because that would leads to attempting to allocate THPs
> > with GFP_HIGHUSER, which is what is used to allocate
> > regular tmpfs pages.
> 
> Right but that is a completely different reason to alter the mask and
> it
> would be really great to know whether this is a theoretical concern
> or
> those users simply do not ever use THPs. Btw. should they be using
> THPs
> even if they opt themselves into GFP_KERNEL restriction?

I suppose disabling THPs completely if the gfp_mask
passed to shmem_getpage_gfp() is not GFP_HIGHUSER
is another option.

That seems like it might come with its own pitfalls,
though, both functionality wise and maintenance wise.

Does anyone have strong feelings between "limit gfp_mask"
and "disable THP for !GFP_HIGHUSER"?

-- 
All Rights Reversed.




Re: [PATCH 2/3] mm,thp,shm: limit gfp mask to no more than specified

2020-11-27 Thread Rik van Riel
On Fri, 2020-11-27 at 08:52 +0100, Michal Hocko wrote:
> On Thu 26-11-20 13:04:14, Rik van Riel wrote:
> > 
> > I would be more than happy to implement things differently,
> > but I am not sure what alternative you are suggesting.
> 
> Simply do not alter gfp flags? Or warn in some cases of a serious
> mismatch.
> E.g. GFP_ZONEMASK mismatch because there are already GFP_KERNEL users
> of
> shmem.

Not altering the gfp flags is not really an option,
because that would lead to attempting to allocate THPs
with GFP_HIGHUSER, which is what is used to allocate
regular tmpfs pages.

If the THP configuration in sysfs says we should not
be doing compaction/reclaim from THP allocations, we
should obey that configuration setting, and use a
gfp_mask that results in no compaction/reclaim being done.

-- 
All Rights Reversed.




Re: [PATCH 3/3] mm,thp,shmem: make khugepaged obey tmpfs mount flags

2020-11-26 Thread Rik van Riel
On Thu, 2020-11-26 at 20:42 +0100, Vlastimil Babka wrote:
> On 11/26/20 7:14 PM, Rik van Riel wrote:
> > On Thu, 2020-11-26 at 18:18 +0100, Vlastimil Babka wrote:
> > > 
> > This patch makes khugepaged treat the mount options
> > and/or
> > sysfs flag as enabling collapsing of huge pages in case
> > enabled = [always] for regular THPs.
> > 
> > Should I send another patch on top
> > of this that causes
> > khugepaged to be enabled when regular THPs are disabled,
> > but shmem THPs are enabled in any way?
> 
> I think it would make sense. Although it might involve counting
> thp-enabled shmem mounts and only run khugepaged when there are >0 of
> them.

That seems feasible. I can do that as a follow-up 4/3
patch after the Thanksgiving weekend :)

-- 
All Rights Reversed.




Re: [PATCH 3/3] mm,thp,shmem: make khugepaged obey tmpfs mount flags

2020-11-26 Thread Rik van Riel
On Thu, 2020-11-26 at 18:18 +0100, Vlastimil Babka wrote:
> On 11/24/20 8:49 PM, Rik van Riel wrote:
> > Currently if thp enabled=[madvise], mounting a tmpfs filesystem
> > with huge=always and mmapping files from that tmpfs does not
> > result in khugepaged collapsing those mappings, despite the
> > mount flag indicating that it should.
> > 
> > Fix that by breaking up the blocks of tests in hugepage_vma_check
> > a little bit, and testing things in the correct order.
> > 
> > Signed-off-by: Rik van Riel 
> > Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when
> > merging vma for shmem")
> 
> Looks ok. But, it we have sysfs thp enabled=never, and shmem mount
> explicitly 
> thp enabled, then shmem mount overrides the global sysfs setting and
> thp's will 
> be allocated there, right? However, khugepaged_enabled() will be
> false and thus 
> khugepaged won't run at all? So a similar situation than what you're
> fixing here.

Indeed, that is somewhat similar. Whether or not shmem
allocations attempt huge pages is controlled by both
the file /sys/kernel/mm/transparent_hugepage/shmem_enabled
and mount options.

This patch makes khugepaged treat the mount options
and/or sysfs flag as enabling collapsing of huge pages
in case enabled = [always] for regular THPs.

Should I send another patch on top of this that causes
khugepaged to be enabled when regular THPs are disabled,
but shmem THPs are enabled in any way?

-- 
All Rights Reversed.




Re: [PATCH 2/3] mm,thp,shm: limit gfp mask to no more than specified

2020-11-26 Thread Rik van Riel
On Thu, 2020-11-26 at 14:40 +0100, Michal Hocko wrote:
> On Tue 24-11-20 14:49:24, Rik van Riel wrote:
> > Matthew Wilcox pointed out that the i915 driver opportunistically
> > allocates tmpfs memory, but will happily reclaim some of its
> > pool if no memory is available.
> > 
> > Make sure the gfp mask used to opportunistically allocate a THP
> > is always at least as restrictive as the original gfp mask.
> 
> I have brought this up in the previous version review and I feel my
> feedback hasn't been addressed. Please describe the expected behavior
> by
> those shmem users including GFP_KERNEL restriction which would make
> the
> THP flags incompatible. Is this a problem? Is there any actual
> problem
> if the THP uses its own set of flags?

In the case of i915, the gfp flags passed in by the i915
driver expect the VM to easily fail the allocation, in
which case the i915 driver will reclaim some existing
buffers and try again.

Trying harder than the original gfp_mask would change
the OOM behavior of systems using the i915 driver.

> I am also not happy how those two sets of flags are completely
> detached
> and we can only expect surprises there. 

I would be more than happy to implement things differently,
but I am not sure what alternative you are suggesting.

-- 
All Rights Reversed.




[PATCH 1/3] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-11-24 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

Controlling the gfp_mask of THP allocations through the knobs in
sysfs allows users to determine the balance between how aggressively
the system tries to allocate THPs at fault time, and how much the
application may end up stalling attempting those allocations.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.

Signed-off-by: Rik van Riel 
---
 include/linux/gfp.h | 2 ++
 mm/huge_memory.c| 6 +++---
 mm/shmem.c  | 8 +---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..c7615c9ba03c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..c5d03b2f2f2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,9 +649,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
 {
-   const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
 
/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -744,7 +744,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
-   gfp = alloc_hugepage_direct_gfpmask(vma);
+   gfp = vma_thp_gfp_mask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..6c3cb192a88d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
 
	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+			       true);
	shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+   gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
}
 
 alloc_huge:
-   page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+   huge_gfp = vma_thp_gfp_mask(vma);
+   page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,
-- 
2.25.4



[PATCH 2/3] mm,thp,shm: limit gfp mask to no more than specified

2020-11-24 Thread Rik van Riel
Matthew Wilcox pointed out that the i915 driver opportunistically
allocates tmpfs memory, but will happily reclaim some of its
pool if no memory is available.

Make sure the gfp mask used to opportunistically allocate a THP
is always at least as restrictive as the original gfp mask.

Signed-off-by: Rik van Riel 
Suggested-by: Matthew Wilcox 
---
 mm/shmem.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 6c3cb192a88d..ee3cea10c2a4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1531,6 +1531,26 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t 
gfp,
return page;
 }
 
+/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some of the flags set permissions, while others set limitations.
+ */
+static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+   gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+   gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+   gfp_t result = huge_gfp & ~allowflags;
+
+   /*
+* Minimize the result gfp by taking the union with the deny flags,
+* and the intersection of the allow flags.
+*/
+   result |= (limit_gfp & denyflags);
+   result |= (huge_gfp & limit_gfp) & allowflags;
+
+   return result;
+}
+
 static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
 {
@@ -1889,6 +1909,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
 
 alloc_huge:
huge_gfp = vma_thp_gfp_mask(vma);
+   huge_gfp = limit_gfp_mask(huge_gfp, gfp);
page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
-- 
2.25.4



[PATCH 3/3] mm,thp,shmem: make khugepaged obey tmpfs mount flags

2020-11-24 Thread Rik van Riel
Currently if thp enabled=[madvise], mounting a tmpfs filesystem
with huge=always and mmapping files from that tmpfs does not
result in khugepaged collapsing those mappings, despite the
mount flag indicating that it should.

Fix that by breaking up the blocks of tests in hugepage_vma_check
a little bit, and testing things in the correct order.

Signed-off-by: Rik van Riel 
Fixes: c2231020ea7b ("mm: thp: register mm for khugepaged when merging vma for 
shmem")
---
 include/linux/khugepaged.h |  2 ++
 mm/khugepaged.c| 22 --
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
index c941b7377321..2fcc01891b47 100644
--- a/include/linux/khugepaged.h
+++ b/include/linux/khugepaged.h
@@ -3,6 +3,7 @@
 #define _LINUX_KHUGEPAGED_H
 
 #include  /* MMF_VM_HUGEPAGE */
+#include 
 
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -57,6 +58,7 @@ static inline int khugepaged_enter(struct vm_area_struct *vma,
 {
	if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags))
if ((khugepaged_always() ||
+(shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) ||
 (khugepaged_req_madv() && (vm_flags & VM_HUGEPAGE))) &&
!(vm_flags & VM_NOHUGEPAGE) &&
	    !test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4e3dff13eb70..abab394c4206 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -440,18 +440,28 @@ static inline int khugepaged_test_exit(struct mm_struct 
*mm)
 static bool hugepage_vma_check(struct vm_area_struct *vma,
   unsigned long vm_flags)
 {
-   if ((!(vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
-   (vm_flags & VM_NOHUGEPAGE) ||
+   /* Explicitly disabled through madvise. */
+   if ((vm_flags & VM_NOHUGEPAGE) ||
	    test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
 
-   if (shmem_file(vma->vm_file) ||
-   (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
-vma->vm_file &&
-(vm_flags & VM_DENYWRITE))) {
+   /* Enabled via shmem mount options or sysfs settings. */
+   if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
HPAGE_PMD_NR);
}
+
+   /* THP settings require madvise. */
+   if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+   return false;
+
+   /* Read-only file mappings need to be aligned for THP to work. */
+   if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
+   (vm_flags & VM_DENYWRITE)) {
+   return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
+   HPAGE_PMD_NR);
+   }
+
if (!vma->anon_vma || vma->vm_ops)
return false;
if (vma_is_temporary_stack(vma))
-- 
2.25.4



[PATCH v6 0/3] mm,thp,shm: limit shmem THP alloc gfp_mask

2020-11-24 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.

v6: make khugepaged actually obey tmpfs mount flags
v5: reduce gfp mask further if needed, to accommodate i915 (Matthew Wilcox)
v4: rename alloc_hugepage_direct_gfpmask to vma_thp_gfp_mask (Matthew Wilcox)
v3: fix NULL vma issue spotted by Hugh Dickins & tested
v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu




Re: [PATCH 2/2] mm,thp,shm: limit gfp mask to no more than specified

2020-11-23 Thread Rik van Riel
On Thu, 2020-11-19 at 10:38 +0100, Michal Hocko wrote:
> On Fri 13-11-20 22:40:40, Rik van Riel wrote:
> > On Thu, 2020-11-12 at 12:22 +0100, Michal Hocko wrote:
> > > [Cc Chris for i915 and Andray]
> > > 
> > > On Thu 05-11-20 14:15:08, Rik van Riel wrote:
> > > > Matthew Wilcox pointed out that the i915 driver
> > > > opportunistically
> > > > allocates tmpfs memory, but will happily reclaim some of its
> > > > pool if no memory is available.
> > > 
> > > It would be good to explicitly mention the requested gfp flags
> > > for
> > > those
> > > allocations. i915 uses __GFP_NORETRY | __GFP_NOWARN, or
> > > GFP_KERNEL.
> > > Is
> > > __shmem_rw really meant to not allocate from highmeme/movable
> > > zones?
> > > Can
> > > it be ever backed by THPs?
> > 
> > You are right, I need to copy the zone flags __GFP_DMA
> > through
> > __GFP_MOVABLE straight from the limiting gfp_mask
> > into the gfp_mask used for THP allocations, and not use
> > the default THP zone flags if the caller specifies something
> > else.
> > 
> > I'll send out a new version that fixes that.
> 
> Can we make one step back here and actually check whether all this is
> actually needed for those shmem users before adding more hacks here
> and
> there?

It doesn't look like that is needed, after all.

The i915 driver seems to support having its buffer in
highmem, the shmem_pwrite and shmem_pread functions
both do kmap/kunmap.

-- 
All Rights Reversed.




Re: [PATCH 1/2] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-11-13 Thread Rik van Riel
On Thu, 2020-11-12 at 11:52 +0100, Michal Hocko wrote:
> On Thu 05-11-20 14:15:07, Rik van Riel wrote:
> > 
> > This patch applies the same configurated limitation of THPs to
> > shmem
> > hugepage allocations, to prevent that from happening.
> 
> I believe you should also exaplain why we want to control defrag by
> the
> global knob while the enable logic is per mount.

I added that to the changelog for the next version of
the patches.

> > This way a THP defrag setting of "never" or "defer+madvise" will
> > result
> > in quick allocation failures without direct reclaim when no 2MB
> > free
> > pages are available.
> > 
> > With this patch applied, THP allocations for tmpfs will be a little
> > more aggressive than today for files mmapped with MADV_HUGEPAGE,
> > and a little less aggressive for files that are not mmapped or
> > mapped without that flag.
> 
> This begs some numbers. A little is rather bad unit of performance. I
> do
> agree that unifying those makes sense in general though.

The aggressiveness is in changes to the gfp_mask, eg by
adding __GFP_NORETRY. How that translates into THP
allocation success rates is entirely dependent on the
workload and on what else is in memory at the time.

I am not sure any numbers I could gather will be
representative for anything but the workloads I am
testing.

However, I did find an issue in hugepage_vma_check
that prevents khugepaged from collapsing pages on
shmem filesystems mounted with huge=always or
huge=within_size when transparent_hugepage/enabled
is set to [madvise].

The next version of the series will have a third
patch, in order to fix that.

-- 
All Rights Reversed.




Re: [PATCH 2/2] mm,thp,shm: limit gfp mask to no more than specified

2020-11-13 Thread Rik van Riel
On Thu, 2020-11-12 at 12:22 +0100, Michal Hocko wrote:
> [Cc Chris for i915 and Andray]
> 
> On Thu 05-11-20 14:15:08, Rik van Riel wrote:
> > Matthew Wilcox pointed out that the i915 driver opportunistically
> > allocates tmpfs memory, but will happily reclaim some of its
> > pool if no memory is available.
> 
> It would be good to explicitly mention the requested gfp flags for
> those
> allocations. i915 uses __GFP_NORETRY | __GFP_NOWARN, or GFP_KERNEL.
> Is
> __shmem_rw really meant to not allocate from highmeme/movable zones?
> Can
> it be ever backed by THPs?

You are right, I need to copy the zone flags __GFP_DMA
through __GFP_MOVABLE straight from the limiting gfp_mask
into the gfp_mask used for THP allocations, and not use
the default THP zone flags if the caller specifies something
else.

I'll send out a new version that fixes that.

> ttm might want __GFP_RETRY_MAYFAIL while shmem_read_mapping_page use
> the mapping gfp mask which can be NOFS or something else. This is
> quite
> messy already and I suspect that they are more targeting regular
> order-0
> requests. E.g. have a look at cb5f1a52caf23.
> 
> I am worried that this games with gfp flags will lead to
> unmaintainable
> code later on. There is a clear disconnect betwen the core THP
> allocation strategy and what drivers are asking for and those
> requirements might be really conflicting. Not to mention that flags
> might be different between regular and THP pages.

That is exactly why I want to make sure the THP allocations
are never more aggressive than the gfp flags the drivers
request, and the THP allocations may only ever be less
aggressive than the order 0 gfp_mask specified by the drivers.


-- 
All Rights Reversed.




Re: [PATCH 2/2] mm,thp,shm: limit gfp mask to no more than specified

2020-11-06 Thread Rik van Riel
On Fri, 2020-11-06 at 11:05 +0800, Hillf Danton wrote:
> On Thu,  5 Nov 2020 14:15:08 -0500
> > Matthew Wilcox pointed out that the i915 driver opportunistically
> > allocates tmpfs memory, but will happily reclaim some of its
> > pool if no memory is available.
> > 
> > Make sure the gfp mask used to opportunistically allocate a THP
> > is always at least as restrictive as the original gfp mask.
> > 
> > Signed-off-by: Rik van Riel 
> > Suggested-by: Matthew Wilcox 
> > ---
> >  mm/shmem.c | 21 +
> >  1 file changed, 21 insertions(+)
> > 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 6c3cb192a88d..ee3cea10c2a4 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1531,6 +1531,26 @@ static struct page *shmem_swapin(swp_entry_t
> > swap, gfp_t gfp,
> > return page;
> >  }
> >  
> > +/*
> > + * Make sure huge_gfp is always more limited than limit_gfp.
> > + * Some of the flags set permissions, while others set
> > limitations.
> > + */
> > +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
> > +{
> > +   gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
> > +   gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
> > +   gfp_t result = huge_gfp & ~allowflags;
> > +
> > +   /*
> > +* Minimize the result gfp by taking the union with the deny
> > flags,
> > +* and the intersection of the allow flags.
> > +*/
> > +   result |= (limit_gfp & denyflags);
> 
> Currently NORETRY is always set regardless of i915 and if it's
> determined in 1/2 then the i915 thing can be done like
> 
>   return huge_gfp | (limit_gfp & __GFP_RECLAIM);

No, if __GFP_KSWAPD_RECLAIM or __GFP_DIRECT_RECLAIM are
not set in either huge_gfp or limit_gfp, we want to ensure
the resulting gfp does not have it set, either.

Your suggested change would result in __GFP_KSWAPD_RECLAIM
or __GFP_DIRECT_RECLAIM getting set if it was set in either
of the input gfp variables, which is probably not the desired
behavior.
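
To make that concrete, here is a stand-alone sketch of the
limit_gfp_mask() semantics (the flag bit values below are illustrative
stand-ins, not the real kernel definitions): a reclaim bit survives only
when both masks carry it, while the deny flags are simply unioned in.

/* Illustrative only: fake flag values, same combining logic. */
#include <stdio.h>

typedef unsigned int gfp_t;

#define __GFP_IO		0x01u
#define __GFP_FS		0x02u
#define __GFP_DIRECT_RECLAIM	0x04u
#define __GFP_KSWAPD_RECLAIM	0x08u
#define __GFP_RECLAIM		(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
#define __GFP_NOWARN		0x10u
#define __GFP_NORETRY		0x20u

static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
{
	gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
	gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
	gfp_t result = huge_gfp & ~allowflags;

	result |= (limit_gfp & denyflags);		/* deny flags: union */
	result |= (huge_gfp & limit_gfp) & allowflags;	/* allow flags: intersection */
	return result;
}

int main(void)
{
	/* THP mask wants kswapd reclaim; the limiting mask has no reclaim
	 * bits at all, so none may appear in the result. */
	gfp_t huge = __GFP_IO | __GFP_FS | __GFP_KSWAPD_RECLAIM;
	gfp_t limit = __GFP_NORETRY | __GFP_NOWARN;

	/* Prints 0x30: only NORETRY|NOWARN survive. */
	printf("result = %#x\n", limit_gfp_mask(huge, limit));
	return 0;
}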


-- 
All Rights Reversed.




[PATCH 1/2] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-11-05 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.

Signed-off-by: Rik van Riel 
---
 include/linux/gfp.h | 2 ++
 mm/huge_memory.c| 6 +++---
 mm/shmem.c  | 8 +---
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..c7615c9ba03c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..c5d03b2f2f2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,9 +649,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
 {
-   const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
 
/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -744,7 +744,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
-   gfp = alloc_hugepage_direct_gfpmask(vma);
+   gfp = vma_thp_gfp_mask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..6c3cb192a88d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
 
	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+			       true);
	shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+   gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
}
 
 alloc_huge:
-   page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+   huge_gfp = vma_thp_gfp_mask(vma);
+   page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,
-- 
2.25.4



[PATCH 2/2] mm,thp,shm: limit gfp mask to no more than specified

2020-11-05 Thread Rik van Riel
Matthew Wilcox pointed out that the i915 driver opportunistically
allocates tmpfs memory, but will happily reclaim some of its
pool if no memory is available.

Make sure the gfp mask used to opportunistically allocate a THP
is always at least as restrictive as the original gfp mask.

Signed-off-by: Rik van Riel 
Suggested-by: Matthew Wilcox 
---
 mm/shmem.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 6c3cb192a88d..ee3cea10c2a4 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1531,6 +1531,26 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t 
gfp,
return page;
 }
 
+/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some of the flags set permissions, while others set limitations.
+ */
+static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+   gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+   gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+   gfp_t result = huge_gfp & ~allowflags;
+
+   /*
+* Minimize the result gfp by taking the union with the deny flags,
+* and the intersection of the allow flags.
+*/
+   result |= (limit_gfp & denyflags);
+   result |= (huge_gfp & limit_gfp) & allowflags;
+
+   return result;
+}
+
 static struct page *shmem_alloc_hugepage(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
 {
@@ -1889,6 +1909,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
 
 alloc_huge:
huge_gfp = vma_thp_gfp_mask(vma);
+   huge_gfp = limit_gfp_mask(huge_gfp, gfp);
page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
-- 
2.25.4



[PATCH 0/2] mm,thp,shm: limit shmem THP alloc gfp_mask

2020-11-05 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.

v5: reduce gfp mask further if needed, to accommodate i915 (Matthew Wilcox)
v4: rename alloc_hugepage_direct_gfpmask to vma_thp_gfp_mask (Matthew Wilcox)
v3: fix NULL vma issue spotted by Hugh Dickins & tested
v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu




Re: [PATCH] sched/fair: ensure tasks spreading in LLC during LB

2020-11-02 Thread Rik van Riel
On Mon, 2020-11-02 at 11:24 +0100, Vincent Guittot wrote:

> Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()")
> Reported-by: Chris Mason 
> Suggested-by: Rik van Riel 
> Signed-off-by: Vincent Guittot 

Tested-and-reviewed-by: Rik van Riel 

Thank you!

-- 
All Rights Reversed.




Re: [PATCH] fix scheduler regression from "sched/fair: Rework load_balance()"

2020-10-29 Thread Rik van Riel
On Mon, 2020-10-26 at 17:52 +0100, Vincent Guittot wrote:
> On Mon, 26 Oct 2020 at 17:48, Chris Mason  wrote:
> > On 26 Oct 2020, at 12:20, Vincent Guittot wrote:
> > 
> > > what you are suggesting is something like:
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 4978964e75e5..3b6fbf33abc2 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -9156,7 +9156,8 @@ static inline void
> > > calculate_imbalance(struct
> > > lb_env *env, struct sd_lb_stats *s
> > >  * emptying busiest.
> > >  */
> > > if (local->group_type == group_has_spare) {
> > > -   if (busiest->group_type > group_fully_busy) {
> > > +   if ((busiest->group_type > group_fully_busy) &&
> > > +   !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
> > > /*
> > >  * If busiest is overloaded, try to fill
> > > spare
> > >  * capacity. This might end up creating
> > > spare
> > > capacity
> > > 
> > > which also fixes the problem for me and alignes LB with wakeup
> > > path
> > > regarding the migration
> > > in the LLC
> > 
> > Vincent’s patch on top of 5.10-rc1 looks pretty great:
> > 
> > Latency percentiles (usec) runtime 90 (s) (3320 total samples)
> >  50.0th: 161 (1687 samples)
> >  75.0th: 200 (817 samples)
> >  90.0th: 228 (488 samples)
> >  95.0th: 254 (164 samples)
> >  *99.0th: 314 (131 samples)
> >  99.5th: 330 (17 samples)
> >  99.9th: 356 (13 samples)
> >  min=29, max=358
> > 
> > Next we test in prod, which probably won’t have answers until
> > tomorrow.  Thanks again Vincent!
> 
> Great !
> 
> I'm going to run more tests on my setup as well to make sure that it
> doesn't generate unexpected side effects on other kinds of use cases.

We have tested the patch with several pretty demanding
workloads for the past several days, and it seems to
do the trick!

With all the current scheduler code from the Linus tree,
plus this patch on top, performance is as good as it ever
was before with one workload, and slightly better with
the other.

-- 
All Rights Reversed.




Re: [PATCH] fix scheduler regression from "sched/fair: Rework load_balance()"

2020-10-26 Thread Rik van Riel
On Mon, 26 Oct 2020 16:42:14 +0100
Vincent Guittot  wrote:
> On Mon, 26 Oct 2020 at 16:04, Rik van Riel  wrote:

> > Could utilization estimates be off, either lagging or
> > simply having a wrong estimate for a task, resulting
> > in no task getting pulled sometimes, while doing a
> > migrate_task imbalance always moves over something?  
> 
> task and cpu utilization are not always up to fully synced and may lag
> a bit which explains that sometimes LB can fail to migrate for a small
> diff

OK, running with this little snippet below, I see latencies
improve back to near where they used to be:

Latency percentiles (usec) runtime 150 (s)
50.0th: 13
75.0th: 31
90.0th: 69
95.0th: 90
*99.0th: 761
99.5th: 2268
99.9th: 9104
min=1, max=16158

I suspect the right/cleaner approach might be to use
migrate_task more in !CPU_NOT_IDLE cases?

Running a task to an idle CPU immediately, instead of refusing
to have the load balancer move it, improves latencies for fairly
obvious reasons.

I am not entirely clear on why the load balancer should need to
be any more conservative about moving tasks than the wakeup
path is in eg. select_idle_sibling.


diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 35bdc0cccfa6..60acf71a2d39 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7415,7 +7415,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_util:
util = task_util_est(p);
 
-   if (util > env->imbalance)
+   if (util > env->imbalance && env->idle == CPU_NOT_IDLE)
goto next;
 
env->imbalance -= util;


Re: [PATCH] fix scheduler regression from "sched/fair: Rework load_balance()"

2020-10-26 Thread Rik van Riel
On Mon, 2020-10-26 at 15:56 +0100, Vincent Guittot wrote:
> On Mon, 26 Oct 2020 at 15:38, Rik van Riel  wrote:
> > On Mon, 2020-10-26 at 15:24 +0100, Vincent Guittot wrote:
> > > Le lundi 26 oct. 2020 à 08:45:27 (-0400), Chris Mason a écrit :
> > > > On 26 Oct 2020, at 4:39, Vincent Guittot wrote:
> > > > 
> > > > > Hi Chris
> > > > > 
> > > > > On Sat, 24 Oct 2020 at 01:49, Chris Mason  wrote:
> > > > > > Hi everyone,
> > > > > > 
> > > > > > We’re validating a new kernel in the fleet, and compared
> > > > > > with
> > > > > > v5.2,
> > > > > 
> > > > > Which version are you using ?
> > > > > several improvements have been added since v5.5 and the
> > > > > rework of
> > > > > load_balance
> > > > 
> > > > We’re validating v5.6, but all of the numbers referenced in
> > > > this
> > > > patch are
> > > > against v5.9.  I usually try to back port my way to victory on
> > > > this
> > > > kind of
> > > > thing, but mainline seems to behave exactly the same as
> > > > 0b0695f2b34a wrt
> > > > this benchmark.
> > > 
> > > ok. Thanks for the confirmation
> > > 
> > > I have been able to reproduce the problem on my setup.
> > > 
> > > Could you try the fix below ?
> > > 
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -9049,7 +9049,8 @@ static inline void
> > > calculate_imbalance(struct
> > > lb_env *env, struct sd_lb_stats *s
> > >  * emptying busiest.
> > >  */
> > > if (local->group_type == group_has_spare) {
> > > -   if (busiest->group_type > group_fully_busy) {
> > > +   if ((busiest->group_type > group_fully_busy) &&
> > > +   (busiest->group_weight > 1)) {
> > > /*
> > >  * If busiest is overloaded, try to fill
> > > spare
> > >  * capacity. This might end up creating
> > > spare
> > > capacity
> > > 
> > > 
> > > When we calculate an imbalance at te smallest level, ie between
> > > CPUs
> > > (group_weight == 1),
> > > we should try to spread tasks on cpus instead of trying to fill
> > > spare
> > > capacity.
> > 
> > Should we also spread tasks when balancing between
> > multi-threaded CPU cores on the same socket?
> 
> My explanation is probably misleading. In fact we already try to
> spread tasks. we just use spare capacity instead of nr_running when
> there is more than 1 CPU in the group and the group is overloaded.
> Using spare capacity is a bit more conservative because it tries to
> not pull more utilization than spare capacity

Could utilization estimates be off, either lagging or
simply having a wrong estimate for a task, resulting
in no task getting pulled sometimes, while doing a
migrate_task imbalance always moves over something?

Within an LLC we might not need to worry too much
about spare capacity, considering select_idle_sibling 
doesn't give a hoot about capacity, either.

-- 
All Rights Reversed.




Re: [PATCH] fix scheduler regression from "sched/fair: Rework load_balance()"

2020-10-26 Thread Rik van Riel
On Mon, 2020-10-26 at 15:24 +0100, Vincent Guittot wrote:
> Le lundi 26 oct. 2020 à 08:45:27 (-0400), Chris Mason a écrit :
> > On 26 Oct 2020, at 4:39, Vincent Guittot wrote:
> > 
> > > Hi Chris
> > > 
> > > On Sat, 24 Oct 2020 at 01:49, Chris Mason  wrote:
> > > > Hi everyone,
> > > > 
> > > > We’re validating a new kernel in the fleet, and compared with
> > > > v5.2,
> > > 
> > > Which version are you using ?
> > > several improvements have been added since v5.5 and the rework of
> > > load_balance
> > 
> > We’re validating v5.6, but all of the numbers referenced in this
> > patch are
> > against v5.9.  I usually try to back port my way to victory on this
> > kind of
> > thing, but mainline seems to behave exactly the same as
> > 0b0695f2b34a wrt
> > this benchmark.
> 
> ok. Thanks for the confirmation
> 
> I have been able to reproduce the problem on my setup.
> 
> Could you try the fix below ?
> 
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9049,7 +9049,8 @@ static inline void calculate_imbalance(struct
> lb_env *env, struct sd_lb_stats *s
>  * emptying busiest.
>  */
> if (local->group_type == group_has_spare) {
> -   if (busiest->group_type > group_fully_busy) {
> +   if ((busiest->group_type > group_fully_busy) &&
> +   (busiest->group_weight > 1)) {
> /*
>  * If busiest is overloaded, try to fill
> spare
>  * capacity. This might end up creating spare
> capacity
> 
> 
> When we calculate an imbalance at te smallest level, ie between CPUs
> (group_weight == 1),
> we should try to spread tasks on cpus instead of trying to fill spare
> capacity.

Should we also spread tasks when balancing between
multi-threaded CPU cores on the same socket?

Say we have groups of CPUs (0, 2) and CPUs (1, 3),
with CPU 2 idle, and 3 tasks spread between CPUs 1 & 3.

Since they are all on the same LLC, and the task
wakeup code has absolutely no hesitation in moving
them around, should the load balancer also try to
keep tasks within a socket spread across all CPUs?

-- 
All Rights Reversed.




Re: [PATCH v4] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-24 Thread Rik van Riel
On Sat, 2020-10-24 at 03:09 +0100, Matthew Wilcox wrote:
> On Fri, Oct 23, 2020 at 08:48:04PM -0400, Rik van Riel wrote:
> > The allocation flags of anonymous transparent huge pages can be
> > controlled
> > through the files in /sys/kernel/mm/transparent_hugepage/defrag,
> > which can
> > help the system from getting bogged down in the page reclaim and
> > compaction
> > code when many THPs are getting allocated simultaneously.
> > 
> > However, the gfp_mask for shmem THP allocations were not limited by
> > those
> > configuration settings, and some workloads ended up with all CPUs
> > stuck
> > on the LRU lock in the page reclaim code, trying to allocate dozens
> > of
> > THPs simultaneously.
> > 
> > This patch applies the same configurated limitation of THPs to
> > shmem
> > hugepage allocations, to prevent that from happening.
> > 
> > This way a THP defrag setting of "never" or "defer+madvise" will
> > result
> > in quick allocation failures without direct reclaim when no 2MB
> > free
> > pages are available.
> > 
> > With this patch applied, THP allocations for tmpfs will be a little
> > more aggressive than today for files mmapped with MADV_HUGEPAGE,
> > and a little less aggressive for files that are not mmapped or
> > mapped without that flag.
> 
> How about this code path though?
> 
> shmem_get_pages() [ in i915 ]
>   shmem_read_mapping_page_gfp(__GFP_NORETRY | __GFP_NOWARN)
> shmem_getpage_gfp()
>   shmem_alloc_and_acct_page()
> shmem_alloc_hugepage()
> 
> I feel like the NORETRY from i915 should override whatever is set
> in sysfs for anon THPs.  What do others think?

It looks like currently the only way to get a THP
allocation with __GFP_DIRECT_RECLAIM and without
__GFP_NORETRY (which does nothing without
__GFP_DIRECT_RECLAIM) is to explicitly do an
madvise MADV_HUGEPAGE on a VMA.

I am not convinced the i915 driver should
override a userspace madvise.
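
(For reference, that opt-in is just a plain madvise on the mapping; a
minimal user-space sketch, shown here on an anonymous mapping, though a
tmpfs file mapping works the same way:)

#include <stddef.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB, a THP-friendly size */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	/* Ask the kernel to try much harder to back this VMA with THPs. */
	if (madvise(buf, len, MADV_HUGEPAGE))
		return 1;

	return 0;
}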

-- 
All Rights Reversed.




[PATCH v4] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-23 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.

Signed-off-by: Rik van Riel 
--- 
v4: rename alloc_hugepage_direct_gfpmask to vma_thp_gfp_mask (Matthew Wilcox)
v3: fix NULL vma issue spotted by Hugh Dickins & tested
v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..c7615c9ba03c 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..c5d03b2f2f2f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,9 +649,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t vma_thp_gfp_mask(struct vm_area_struct *vma)
 {
-   const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
 
/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
@@ -744,7 +744,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
pte_free(vma->vm_mm, pgtable);
return ret;
}
-   gfp = alloc_hugepage_direct_gfpmask(vma);
+   gfp = vma_thp_gfp_mask(vma);
page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..6c3cb192a88d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
 
	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+			       true);
	shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+   gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
}
 
 alloc_huge:
-   page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+   huge_gfp = vma_thp_gfp_mask(vma);
+   page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,


[PATCH v3] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-23 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
help keep the system from getting bogged down in the page reclaim and
compaction code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

With this patch applied, THP allocations for tmpfs will be a little
more aggressive than today for files mmapped with MADV_HUGEPAGE,
and a little less aggressive for files that are not mmapped or
mapped without that flag.

Signed-off-by: Rik van Riel 
--- 
v3: fix NULL vma issue spotted by Hugh Dickins & tested
v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..0a5b164a26d9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..6296bdff9693 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,9 +649,9 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
-   const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+   const bool vma_madvised = vma && (vma->vm_flags & VM_HUGEPAGE);
 
/* Always do synchronous compaction */
	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..9710b9df91e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
 
	shmem_pseudo_vma_init(&pvma, info, hindex);
-	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+			       true);
	shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+   gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
}
 
 alloc_huge:
-   page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+   huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+   page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,



Re: [PATCH v2] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-22 Thread Rik van Riel
On Thu, 2020-10-22 at 19:54 -0700, Hugh Dickins wrote:
> On Thu, 22 Oct 2020, Rik van Riel wrote:
> 
> > The allocation flags of anonymous transparent huge pages can be
> controlled
> > through the files in /sys/kernel/mm/transparent_hugepage/defrag,
> which can
> > help the system from getting bogged down in the page reclaim and
> compaction
> > code when many THPs are getting allocated simultaneously.
> > 
> > However, the gfp_mask for shmem THP allocations were not limited by
> those
> > configuration settings, and some workloads ended up with all CPUs
> stuck
> > on the LRU lock in the page reclaim code, trying to allocate dozens
> of
> > THPs simultaneously.
> > 
> > This patch applies the same configurated limitation of THPs to
> shmem
> > hugepage allocations, to prevent that from happening.
> > 
> > This way a THP defrag setting of "never" or "defer+madvise" will
> result
> > in quick allocation failures without direct reclaim when no 2MB
> free
> > pages are available.
> > 
> > Signed-off-by: Rik van Riel 
> 
> NAK in its present untested form: see below.

Oops. That issue is easy to fix, but indeed let's figure
out what the desired behavior is.

> I'm open to change here, particularly to Yu Xu's point (in other
> mail)
> about direct reclaim - we avoid that here in Google too: though it's
> not so much to avoid the direct reclaim, as to avoid the latencies of
> direct compaction, which __GFP_DIRECT_RECLAIM allows as a side-
> effect.
> 
> > @@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode
> *inode, pgoff_t index,
> >   }
> >  
> >  alloc_huge:
> > - page = shmem_alloc_and_acct_page(gfp, inode, index, true);
> > + huge_gfp = alloc_hugepage_direct_gfpmask(vma);
> 
> Still looks nice: but what about the crash when vma is NULL?

That's a one line fix, but I suppose we should get the
discussion on what the code behavior should be out of
the way first :)

> Michal is right to remember pushback before, because tmpfs is a
> filesystem, and "huge=" is a mount option: in using a huge=always
> filesystem, the user has already declared a preference for huge
> pages.
> Whereas the original anon THP had to deduce that preference from sys
> tunables and vma madvice.

...

> But it's likely that they have accumulated some defrag wisdom, which
> tmpfs can take on board - but please accept that in using a huge
> mount,
> the preference for huge has already been expressed, so I don't expect
> anon THP alloc_hugepage_direct_gfpmask() choices will map one to one.

In my mind, the huge= mount options for tmpfs corresponded
to the "enabled" anon THP options, denoting a desired end
state, not necessarily how much we will stall allocations
to get there immediately.

The underlying allocation behavior has been changed repeatedly,
with changes to the direct reclaim code and the compaction
deferral code.

The shmem THP gfp_mask never tried really hard anyway,
with __GFP_NORETRY being the default, which matches what
is used for non-VM_HUGEPAGE anon VMAs.

Likewise, the direct reclaim done from the opportunistic
THP allocations done by the shmem code limited itself to
reclaiming 32 4kB pages per THP allocation.

In other words, mounting with huge=always has never behaved
the same as the more aggressive allocations done for
MADV_HUGEPAGE VMAs.

This patch would leave shmem THP allocations for non-MADV_HUGEPAGE
mapped files opportunistic like today, and make shmem THP
allocations for files mapped with MADV_HUGEPAGE more aggressive
than today.

However, I would like to know what people think the shmem
huge= mount options should do, and how things should behave
when memory gets low, before pushing in a patch just because
it makes the system run smoother "without changing current
behavior too much".

What do people want tmpfs THP allocations to do?

-- 
All Rights Reversed.




Re: [PATCH] mm: memcontrol: add file_thp, shmem_thp to memory.stat

2020-10-22 Thread Rik van Riel
On Thu, 2020-10-22 at 12:49 -0400, Rik van Riel wrote:
> On Thu, 2020-10-22 at 11:18 -0400, Johannes Weiner wrote:
> 
> > index e80aa9d2db68..334ce608735c 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -204,9 +204,9 @@ static void unaccount_page_cache_page(struct
> > address_space *mapping,
> > if (PageSwapBacked(page)) {
> > __mod_lruvec_page_state(page, NR_SHMEM, -nr);
> > if (PageTransHuge(page))
> > -   __dec_node_page_state(page, NR_SHMEM_THPS);
> > +   __dec_lruvec_page_state(page, NR_SHMEM_THPS);
> > } else if (PageTransHuge(page)) {
> > -   __dec_node_page_state(page, NR_FILE_THPS);
> > +   __dec_lruvec_page_state(page, NR_FILE_THPS);
> > filemap_nr_thps_dec(mapping);
> > }
> 
> This may be a dumb question, but does that mean the
> NR_FILE_THPS number will no longer be visible in
> /proc/vmstat or is there some magic I overlooked in
> a cursory look of the code?

Never mind, I found it a few levels deep in
__dec_lruvec_page_state.

Reviewed-by: Rik van Riel 

-- 
All Rights Reversed.




Re: [PATCH] mm: memcontrol: add file_thp, shmem_thp to memory.stat

2020-10-22 Thread Rik van Riel
On Thu, 2020-10-22 at 11:18 -0400, Johannes Weiner wrote:

> index e80aa9d2db68..334ce608735c 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -204,9 +204,9 @@ static void unaccount_page_cache_page(struct
> address_space *mapping,
>   if (PageSwapBacked(page)) {
>   __mod_lruvec_page_state(page, NR_SHMEM, -nr);
>   if (PageTransHuge(page))
> - __dec_node_page_state(page, NR_SHMEM_THPS);
> + __dec_lruvec_page_state(page, NR_SHMEM_THPS);
>   } else if (PageTransHuge(page)) {
> - __dec_node_page_state(page, NR_FILE_THPS);
> + __dec_lruvec_page_state(page, NR_FILE_THPS);
>   filemap_nr_thps_dec(mapping);
>   }

This may be a dumb question, but does that mean the
NR_FILE_THPS number will no longer be visible in
/proc/vmstat or is there some magic I overlooked in
a cursory look of the code?




[PATCH v2] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-22 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
keep the system from getting bogged down in the page reclaim and compaction
code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

Signed-off-by: Rik van Riel 
--- 
v2: move gfp calculation to shmem_getpage_gfp as suggested by Yu Xu

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..0a5b164a26d9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..9b08ce5cc387 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,7 +649,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..9710b9df91e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
 
shmem_pseudo_vma_init(&pvma, info, hindex);
-   page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-   HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+   page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+  true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@@ -1802,6 +1802,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
+   gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
@@ -1887,7 +1888,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t 
index,
}
 
 alloc_huge:
-   page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+   huge_gfp = alloc_hugepage_direct_gfpmask(vma);
+   page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
if (IS_ERR(page)) {
 alloc_nohuge:
page = shmem_alloc_and_acct_page(gfp, inode,


Re: [PATCH] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-22 Thread Rik van Riel
On Fri, 2020-10-23 at 00:00 +0800, Yu Xu wrote:
> On 10/22/20 11:48 AM, Rik van Riel wrote:
> > The allocation flags of anonymous transparent huge pages can be
> > controlled
> > through the files in /sys/kernel/mm/transparent_hugepage/defrag,
> > which can
> > keep the system from getting bogged down in the page reclaim and
> > compaction
> > code when many THPs are getting allocated simultaneously.
> > 
> > However, the gfp_mask for shmem THP allocations was not limited by
> > those
> > configuration settings, and some workloads ended up with all CPUs
> > stuck
> > on the LRU lock in the page reclaim code, trying to allocate dozens
> > of
> > THPs simultaneously.
> > 
> > This patch applies the same configured limitation of THPs to
> > shmem
> > hugepage allocations, to prevent that from happening.
> > 
> > This way a THP defrag setting of "never" or "defer+madvise" will
> > result
> > in quick allocation failures without direct reclaim when no 2MB
> > free
> > pages are available.
> > 
> > Signed-off-by: Rik van Riel 
> > ---
> > 
> > diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> > index c603237e006c..0a5b164a26d9 100644
> > --- a/include/linux/gfp.h
> > +++ b/include/linux/gfp.h
> > @@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
> >   extern void pm_restrict_gfp_mask(void);
> >   extern void pm_restore_gfp_mask(void);
> >   
> > +extern gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct
> > *vma);
> > +
> >   #ifdef CONFIG_PM_SLEEP
> >   extern bool pm_suspended_storage(void);
> >   #else
> > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > index 9474dbc150ed..9b08ce5cc387 100644
> > --- a/mm/huge_memory.c
> > +++ b/mm/huge_memory.c
> > @@ -649,7 +649,7 @@ static vm_fault_t
> > __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> >*available
> >* never: never stall for any thp allocation
> >*/
> > -static inline gfp_t alloc_hugepage_direct_gfpmask(struct
> > vm_area_struct *vma)
> > +gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
> >   {
> > const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
> >   
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 537c137698f8..d1290eb508e5 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1545,8 +1545,11 @@ static struct page
> > *shmem_alloc_hugepage(gfp_t gfp,
> > return NULL;
> >   
> > shmem_pseudo_vma_init(&pvma, info, hindex);
> > -   page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY |
> > __GFP_NOWARN,
> > -   HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
> > true);
> > +   /* Limit the gfp mask according to THP configuration. */
> > +   gfp |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
> > +   gfp &= alloc_hugepage_direct_gfpmask(&pvma);
> 
> It is fine to reuse `alloc_hugepage_direct_gfpmask`, but
> `pvma.vm_flags & VM_HUGEPAGE` is always false here, thus,
> `vma_madvised` in `alloc_hugepage_direct_gfpmask` will always
> be false.
> 
> That is why I chose to do the gfp_mask fixup in `shmem_getpage_gfp`,
> using `sgp_huge` to indicate `vma_madvised`, although with some silly
> mistakes pointed out by you, in another mail thread.
> 
> It will be better if vma_madvised is well handled in your solution.

OK, let me send a v2 that does that!

By just passing a correct gfp_mask to shmem_alloc_and_acct_page
we can also avoid the gfp gymnastics in shmem_alloc_hugepage
that Michal rightfully objected to.

-- 
All Rights Reversed.




Re: [PATCH] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-22 Thread Rik van Riel
On Thu, 2020-10-22 at 17:50 +0200, Michal Hocko wrote:
> On Thu 22-10-20 09:25:21, Rik van Riel wrote:
> > On Thu, 2020-10-22 at 10:15 +0200, Michal Hocko wrote:
> > > On Wed 21-10-20 23:48:46, Rik van Riel wrote:
> > > > 
> > > > diff --git a/mm/shmem.c b/mm/shmem.c
> > > > index 537c137698f8..d1290eb508e5 100644
> > > > --- a/mm/shmem.c
> > > > +++ b/mm/shmem.c
> > > > @@ -1545,8 +1545,11 @@ static struct page
> > > > *shmem_alloc_hugepage(gfp_t gfp,
> > > > return NULL;
> > > >  
> > > > shmem_pseudo_vma_init(&pvma, info, hindex);
> > > > -   page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY
> > > > |
> > > > __GFP_NOWARN,
> > > > -   HPAGE_PMD_ORDER, &pvma, 0,
> > > > numa_node_id(),
> > > > true);
> > > > +   /* Limit the gfp mask according to THP configuration.
> > > > */
> > > > +   gfp |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
> > > 
> > > What is the reason for these when alloc_hugepage_direct_gfpmask
> > > provides
> > > the full mask?
> > 
> > The mapping_gfp_mask for the shmem file might have additional
> > restrictions above and beyond the gfp mask returned by
> > alloc_hugepage_direct_gfpmask, and I am not sure we should just
> > ignore the mapping_gfp_mask.
> 
> No, we shouldn't. But I do not see why you should be adding the above
> set of flags on top.

Because THP allocations are higher order and optimistic,
and we want them to:
1) be annotated as compound allocations
2) fail (and fall back to 4kB allocations) when they cannot
   be easily satisfied, and
3) not create a spew of allocation failure backtraces on
   the (serial) console when these THP allocations fail
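
Concretely, here is how those three requirements map onto the gfp bits
used in the patch (just a sketch, one flag per point above):

    gfp |= __GFP_COMP      /* 1) compound (huge) page allocation  */
         | __GFP_NORETRY   /* 2) give up early, fall back to 4kB  */
         | __GFP_NOWARN;   /* 3) no failure splat on the console  */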

> > That is why this patch combines (ANDs) both gfp masks.
> > 
> > However, digging into things more, it looks like shmem inodes
> > always have the mapping gfp mask set to GFP_HIGHUSER_MOVABLE,
> > and it is never changed, so simply using the output from
> > alloc_hugepage_direct_gfpmask should be fine.
> > 
> > I can do the patch either way. Just let me know what you prefer.
> 
> I would just and the given gfp with alloc_hugepage_direct_gfpmask

That would miss the three things we definitely want
from above.

-- 
All Rights Reversed.




Re: [PATCH] mm/shmem: fix up gfpmask for shmem hugepage allocation

2020-10-22 Thread Rik van Riel
On Wed, 2020-10-21 at 16:09 +0800, Xu Yu wrote:

> @@ -1887,6 +1930,7 @@ static int shmem_getpage_gfp(struct inode
> *inode, pgoff_t index,
>   }
>  
>  alloc_huge:
> + gfp = shmem_hugepage_gfpmask_fixup(gfp, sgp_huge);
>   page = shmem_alloc_and_acct_page(gfp, inode, index, true);
>   if (IS_ERR(page)) {
>  alloc_nohuge:

This looks like it could be a bug, because the changed
gfp flags are also used for the non-huge allocation
below the alloc_nohuge: label, when the huge allocation
fails.

Using a separate huge_gfp variable would solve that
issue.
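
Something along these lines would do it, as an untested sketch that
reuses the helper from your patch:

    alloc_huge:
            huge_gfp = shmem_hugepage_gfpmask_fixup(gfp, sgp_huge);
            page = shmem_alloc_and_acct_page(huge_gfp, inode, index, true);
            if (IS_ERR(page)) {
    alloc_nohuge:
                    /* the 4kB fallback keeps the original gfp */
                    page = shmem_alloc_and_acct_page(gfp, inode,
                                                     index, false);
            }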

However, your patch also changes the meaning of
SHMEM_HUGE_FORCE from "override mount flags" to
"aggressively try reclaim and compaction", which
mixes up the equivalents of the anon THP sysctl
"enabled" and "defrag" settings.

I believe it makes sense to continue keeping the
"what should khugepaged do with these pages?" and
"how hard should we try at allocation time?" settings
separately for shmem the same way they are kept
separately for anonymous memory.

I also suspect it is simplest if shmem uses the
same "how hard should we try at allocation time?"
settings from the "defrag" sysfs file, instead
of giving system administrators two knobs that they
will likely want to set to the same value anyway.

Coincidentally, I have been looking at the same
code on and off for the past month, and also sent
a patch to the list to fix this issue yesterday.

I suspect my patch can be simplified a little more
by directly using alloc_hugepage_direct_gfpmask to
create a huge_gfp flag in shmem_getpage_gfp.

https://lore.kernel.org/linux-mm/20201021234846.5cc97...@imladris.surriel.com/

-- 
All Rights Reversed.




Re: [PATCH] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-22 Thread Rik van Riel
On Thu, 2020-10-22 at 10:15 +0200, Michal Hocko wrote:
> On Wed 21-10-20 23:48:46, Rik van Riel wrote:
> > The allocation flags of anonymous transparent huge pages can be
> > controlled
> > through the files in /sys/kernel/mm/transparent_hugepage/defrag,
> > which can
> > keep the system from getting bogged down in the page reclaim and
> > compaction
> > code when many THPs are getting allocated simultaneously.
> > 
> > However, the gfp_mask for shmem THP allocations was not limited by
> > those
> > configuration settings, and some workloads ended up with all CPUs
> > stuck
> > on the LRU lock in the page reclaim code, trying to allocate dozens
> > of
> > THPs simultaneously.
> > 
> > This patch applies the same configured limitation of THPs to
> > shmem
> > hugepage allocations, to prevent that from happening.
> > 
> > This way a THP defrag setting of "never" or "defer+madvise" will
> > result
> > in quick allocation failures without direct reclaim when no 2MB
> > free
> > pages are available.
> 
> I remember I wanted to unify this in the past as well. The patch got
> reverted in the end. It was a long story and I do not have time to
> read
> through lengthy discussions again. I do remember though that there
> were
> some objections pointing out that shmem has its own behavior which is
> controlled by the mount option - at least for the explicitly mounted
> shmems. I might misremember.

That is not entirely true, though.

THP has two main sysfs knobs: "enabled" and "defrag"

The mount options control the shmem equivalent options for "enabled",
but they do not do anything for the "defrag" equivalent options.

This patch applies the "defrag" THP options to shmem.

> [...]
> 
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 537c137698f8..d1290eb508e5 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
> > @@ -1545,8 +1545,11 @@ static struct page
> > *shmem_alloc_hugepage(gfp_t gfp,
> > return NULL;
> >  
> > shmem_pseudo_vma_init(&pvma, info, hindex);
> > -   page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY |
> > __GFP_NOWARN,
> > -   HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
> > true);
> > +   /* Limit the gfp mask according to THP configuration. */
> > +   gfp |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
> 
> What is the reason for these when alloc_hugepage_direct_gfpmask
> provides
> the full mask?

The mapping_gfp_mask for the shmem file might have additional
restrictions above and beyond the gfp mask returned by
alloc_hugepage_direct_gfpmask, and I am not sure we should just
ignore the mapping_gfp_mask.

That is why this patch combines (ANDs) both gfp masks.
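
Roughly like this, as a sketch of the intent rather than the literal
patch (mapping_gfp_mask() is the standard helper; the pseudo-vma name
just follows the shmem code):

    gfp_t gfp = mapping_gfp_mask(inode->i_mapping);

    /* flags every opportunistic THP attempt wants */
    gfp |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;

    /* restrict by the THP "defrag" policy */
    gfp &= alloc_hugepage_direct_gfpmask(&pvma);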

However, digging into things more, it looks like shmem inodes
always have the mapping gfp mask set to GFP_HIGHUSER_MOVABLE,
and it is never changed, so simply using the output from
alloc_hugepage_direct_gfpmask should be fine.

I can do the patch either way. Just let me know what you prefer.

> > +   gfp &= alloc_hugepage_direct_gfpmask(&pvma);
> > +   page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0,
> > numa_node_id(),
> > +  true);
> > shmem_pseudo_vma_destroy(&pvma);
> > if (page)
> > prep_transhuge_page(page);
> > 
> > -- 
> > All rights reversed.
-- 
All Rights Reversed.




[PATCH] mm,thp,shmem: limit shmem THP alloc gfp_mask

2020-10-21 Thread Rik van Riel
The allocation flags of anonymous transparent huge pages can be controlled
through the files in /sys/kernel/mm/transparent_hugepage/defrag, which can
keep the system from getting bogged down in the page reclaim and compaction
code when many THPs are getting allocated simultaneously.

However, the gfp_mask for shmem THP allocations was not limited by those
configuration settings, and some workloads ended up with all CPUs stuck
on the LRU lock in the page reclaim code, trying to allocate dozens of
THPs simultaneously.

This patch applies the same configured limitation of THPs to shmem
hugepage allocations, to prevent that from happening.

This way a THP defrag setting of "never" or "defer+madvise" will result
in quick allocation failures without direct reclaim when no 2MB free
pages are available.

Signed-off-by: Rik van Riel 
---

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c603237e006c..0a5b164a26d9 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -614,6 +614,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask);
 extern void pm_restrict_gfp_mask(void);
 extern void pm_restore_gfp_mask(void);
 
+extern gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma);
+
 #ifdef CONFIG_PM_SLEEP
 extern bool pm_suspended_storage(void);
 #else
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9474dbc150ed..9b08ce5cc387 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -649,7 +649,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
vm_fault *vmf,
  * available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 {
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..d1290eb508e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,11 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
 
shmem_pseudo_vma_init(&pvma, info, hindex);
-   page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-   HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+   /* Limit the gfp mask according to THP configuration. */
+   gfp |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
+   gfp &= alloc_hugepage_direct_gfpmask(&pvma);
+   page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
+  true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);

-- 
All rights reversed.


Re: [PATCH 0/2] mm,swap: skip swap readahead for instant IO (like zswap)

2020-10-09 Thread Rik van Riel
On Mon, 2020-10-05 at 13:32 -0400, Rik van Riel wrote:
> On Tue, 2020-09-22 at 10:12 -0700, Andrew Morton wrote:
> > On Mon, 21 Sep 2020 22:01:46 -0400 Rik van Riel 
> > wrote:

> > Any quantitative testing results?
> 
> I have test results with a real workload now.
> 
> Without this patch, enabling zswap results in about an 
> 8% increase in p99 request latency. With these patches,
> the latency penalty for enabling zswap is under 1%.

Never mind that. On larger tests the effect seems to disappear,
probably because the logic in __swapin_nr_pages() already reduces
the number of pages read ahead to 2 on workloads with lots of
random access.

That reduces the latency effects observed.

Now we might still see some memory waste due to decompressing
pages we don't need, but I have not seen any real effects from
that yet, either.

I think it may be time to focus on a larger memory waste with
zswap: leaving the compressed copy of memory around when we
decompress the memory at swapin time.  More aggressively freeing
the compressed memory will probably buy us more than reducing
readahead.

-- 
All Rights Reversed.




Re: [PATCH 0/2] mm,swap: skip swap readahead for instant IO (like zswap)

2020-10-05 Thread Rik van Riel
On Tue, 2020-09-22 at 10:12 -0700, Andrew Morton wrote:
> On Mon, 21 Sep 2020 22:01:46 -0400 Rik van Riel 
> wrote:
> 
> > Both with frontswap/zswap, and with some extremely fast IO devices,
> > swap IO will be done before the "asynchronous" swap_readpage() call
> > has returned.
> > 
> > In that case, doing swap readahead only wastes memory, increases
> > latency, and increases the chances of needing to evict something
> > more
> > useful from memory. In that case, just skip swap readahead.
> 
> Any quantitative testing results?

I have test results with a real workload now.

Without this patch, enabling zswap results in about an 
8% increase in p99 request latency. With these patches,
the latency penalty for enabling zswap is under 1%.

Enabling zswap allows us to give the main workload a little more
memory, since the spikes in memory demand caused by things like
system management software no longer cause large latency issues.

-- 
All Rights Reversed.




Re: [RFC PATCH 1/1] vmscan: Support multiple kswapd threads per node

2020-10-02 Thread Rik van Riel
On Fri, 2020-10-02 at 09:03 +0200, Michal Hocko wrote:
> On Thu 01-10-20 18:18:10, Sebastiaan Meijer wrote:
> > (Apologies for messing up the mailing list thread, Gmail had fooled
> > me into
> > believing that it properly picked up the thread)
> > 
> > On Thu, 1 Oct 2020 at 14:30, Michal Hocko  wrote:
> > > On Wed 30-09-20 21:27:12, Sebastiaan Meijer wrote:
> > > > > yes it shows the bottleneck but it is quite artificial. Read
> > > > > data is
> > > > > usually processed and/or written back and that changes the
> > > > > picture a
> > > > > lot.
> > > > Apologies for reviving an ancient thread (and apologies in
> > > > advance for my lack
> > > > of knowledge on how mailing lists work), but I'd like to offer
> > > > up another
> > > > reason why merging this might be a good idea.
> > > > 
> > > > From what I understand, zswap runs its compression on the same
> > > > kswapd thread,
> > > > limiting it to a single thread for compression. Given enough
> > > > processing power,
> > > > zswap can get great throughput using heavier compression
> > > > algorithms like zstd,
> > > > but this is currently greatly limited by the lack of threading.
> > > 
> > > Isn't this a problem of the zswap implementation rather than
> > > general
> > > kswapd reclaim? Why zswap doesn't do the same as normal swap out
> > > in a
> > > context outside of the reclaim?

On systems with lots of very fast IO devices, we have
also seen kswapd take 100% CPU time without any zswap
in use.

This seems like a generic issue, though zswap does
manage to bring it out on lower end systems.

-- 
All Rights Reversed.




Re: [PATCH 2/2] mm,swap: skip swap readahead if page was obtained instantaneously

2020-09-22 Thread Rik van Riel
On Tue, 2020-09-22 at 11:13 +0800, huang ying wrote:
> On Tue, Sep 22, 2020 at 10:02 AM Rik van Riel 
> wrote:
> > Check whether a swap page was obtained instantaneously, for example
> > because it is in zswap, or on a very fast IO device which uses busy
> > waiting, and we did not wait on IO to swap in this page.
> > If no IO was needed to get the swap page we want, kicking off
> > readahead
> > on surrounding swap pages is likely to be counterproductive,
> > because the
> > extra loads will cause additional latency, use up extra memory, and
> > chances
> > are the surrounding pages in swap are just as fast to load as this
> > one,
> > making readahead pointless.
> > 
> > Signed-off-by: Rik van Riel 
> > ---
> >  mm/swap_state.c | 14 +++---
> >  1 file changed, 11 insertions(+), 3 deletions(-)
> > 
> > diff --git a/mm/swap_state.c b/mm/swap_state.c
> > index aacb9ba53f63..6919f9d5fe88 100644
> > --- a/mm/swap_state.c
> > +++ b/mm/swap_state.c
> > @@ -637,6 +637,7 @@ static struct page
> > *swap_cluster_read_one(swp_entry_t entry,
> >  struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t
> > gfp_mask,
> > struct vm_fault *vmf)
> 
> Why not do this for swap_vma_readahead()
> too?  swap_cluster_read_one()
> can be used in swap_vma_readahead() too.

Good point, I should do the same thing for swap_vma_readahead()
as well. Let me do that and send in a version 2 of the series.

-- 
All Rights Reversed.




[PATCH 2/2] mm,swap: skip swap readahead if page was obtained instantaneously

2020-09-21 Thread Rik van Riel
Check whether a swap page was obtained instantaneously, for example
because it is in zswap, or on a very fast IO device which uses busy
waiting, and we did not wait on IO to swap in this page.

If no IO was needed to get the swap page we want, kicking off readahead
on surrounding swap pages is likely to be counterproductive, because the
extra loads will cause additional latency, use up extra memory, and chances
are the surrounding pages in swap are just as fast to load as this one,
making readahead pointless.

Signed-off-by: Rik van Riel 
---
 mm/swap_state.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index aacb9ba53f63..6919f9d5fe88 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -637,6 +637,7 @@ static struct page *swap_cluster_read_one(swp_entry_t entry,
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
 {
+   struct page *page;
unsigned long entry_offset = swp_offset(entry);
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
@@ -668,11 +669,18 @@ struct page *swap_cluster_readahead(swp_entry_t entry, 
gfp_t gfp_mask,
end_offset = si->max - 1;
 
blk_start_plug(&plug);
+   /* If we read the page without waiting on IO, skip readahead. */
+   page = swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, false);
+   if (page && PageUptodate(page))
+   goto skip_unplug;
+
+   /* Ok, do the async read-ahead now. */
for (offset = start_offset; offset <= end_offset ; offset++) {
-   /* Ok, do the async read-ahead now */
-   swap_cluster_read_one(entry, offset, gfp_mask, vma, addr,
- offset != entry_offset);
+   if (offset == entry_offset)
+   continue;
+   swap_cluster_read_one(entry, offset, gfp_mask, vma, addr, true);
}
+skip_unplug:
blk_finish_plug(&plug);
 
lru_add_drain();/* Push any new pages onto the LRU now */
-- 
2.25.4



[PATCH 1/2] mm,swap: extract swap single page readahead into its own function

2020-09-21 Thread Rik van Riel
Split swap single page readahead into its own function, to make
the next patch easier to read. No functional changes.

Signed-off-by: Rik van Riel 
---
 mm/swap_state.c | 40 +---
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index c16eebb81d8b..aacb9ba53f63 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -594,6 +594,28 @@ static unsigned long swapin_nr_pages(unsigned long offset)
return pages;
 }
 
+static struct page *swap_cluster_read_one(swp_entry_t entry,
+   unsigned long offset, gfp_t gfp_mask,
+   struct vm_area_struct *vma, unsigned long addr, bool readahead)
+{
+   bool page_allocated;
+   struct page *page;
+
+   page = __read_swap_cache_async(swp_entry(swp_type(entry), offset),
+ gfp_mask, vma, addr, &page_allocated);
+   if (!page)
+   return NULL;
+   if (page_allocated) {
+   swap_readpage(page, false);
+   if (readahead) {
+   SetPageReadahead(page);
+   count_vm_event(SWAP_RA);
+   }
+   }
+   put_page(page);
+   return page;
+}
+
 /**
  * swap_cluster_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
@@ -615,14 +637,13 @@ static unsigned long swapin_nr_pages(unsigned long offset)
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
 {
-   struct page *page;
unsigned long entry_offset = swp_offset(entry);
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
struct swap_info_struct *si = swp_swap_info(entry);
struct blk_plug plug;
-   bool do_poll = true, page_allocated;
+   bool do_poll = true;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
 
@@ -649,19 +670,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, 
gfp_t gfp_mask,
blk_start_plug(&plug);
for (offset = start_offset; offset <= end_offset ; offset++) {
/* Ok, do the async read-ahead now */
-   page = __read_swap_cache_async(
-   swp_entry(swp_type(entry), offset),
-   gfp_mask, vma, addr, &page_allocated);
-   if (!page)
-   continue;
-   if (page_allocated) {
-   swap_readpage(page, false);
-   if (offset != entry_offset) {
-   SetPageReadahead(page);
-   count_vm_event(SWAP_RA);
-   }
-   }
-   put_page(page);
+   swap_cluster_read_one(entry, offset, gfp_mask, vma, addr,
+ offset != entry_offset);
}
blk_finish_plug(&plug);
 
-- 
2.25.4



[PATCH 0/2] mm,swap: skip swap readahead for instant IO (like zswap)

2020-09-21 Thread Rik van Riel
Both with frontswap/zswap, and with some extremely fast IO devices,
swap IO will be done before the "asynchronous" swap_readpage() call
has returned.

In that case, doing swap readahead only wastes memory, increases
latency, and increases the chances of needing to evict something more
useful from memory. In that case, just skip swap readahead.




Re: [PATCH 5/9] btrfs: zstd: Switch to the zstd-1.4.6 API

2020-09-16 Thread Rik van Riel
On Wed, 2020-09-16 at 15:18 -0400, Nick Terrell wrote:

> The zstd version in the kernel works fine. But, you can see that the
> version
> that got imported stagnated where upstream had 14 released versions.
> I
> don't think it makes sense to have kernel developers maintain their
> own copy
> of zstd. Their time would be better spent working on the rest of the
> kernel.
> Using upstream directly lets the kernel profit from the work that we,
> the zstd
> developers, are doing. And it still allows kernel developers to fix
> bugs if any
> show up, and we can back-port them to upstream.

I can't argue with that.

> One possibility is to have a kernel wrapper on top of the zstd API to
> make it
> more ergonomic. I personally don’t really see the value in it, since
> it adds
> another layer of indirection between zstd and the caller, but it
> could be done.

Zstd would not be the first part of the kernel to
come from somewhere else, and have wrappers when
it gets integrated into the kernel. There certainly
is precedent there.

It would be interesting to know what Christoph's
preference is.

-- 
All Rights Reversed.




[PATCH] silence nfscache allocation warnings with kvzalloc

2020-09-14 Thread Rik van Riel
silence nfscache allocation warnings with kvzalloc

Currently nfsd_reply_cache_init attempts hash table allocation through
kmalloc, and manually falls back to vzalloc if that fails. This makes
the code a little larger than needed, and creates a significant amount
of serial console spam if you have enough systems.

Switching to kvzalloc gets rid of the allocation warnings, and makes
the code a little cleaner too as a side effect.

Freeing of nn->drc_hashtbl is already done using kvfree currently.

Signed-off-by: Rik van Riel 
---
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 96352ab7bd81..5125b5ef25b6 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -164,14 +164,10 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
if (!nn->drc_slab)
goto out_shrinker;
 
-   nn->drc_hashtbl = kcalloc(hashsize,
-   sizeof(*nn->drc_hashtbl), GFP_KERNEL);
-   if (!nn->drc_hashtbl) {
-   nn->drc_hashtbl = vzalloc(array_size(hashsize,
-sizeof(*nn->drc_hashtbl)));
-   if (!nn->drc_hashtbl)
-   goto out_slab;
-   }
+   nn->drc_hashtbl = kvzalloc(array_size(hashsize,
+  sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
+   if (!nn->drc_hashtbl)
+   goto out_slab;
 
for (i = 0; i < hashsize; i++) {
INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head);



Re: [RFC PATCH 00/16] 1GB THP support on x86_64

2020-09-10 Thread Rik van Riel
On Thu, 2020-09-10 at 09:32 +0200, Michal Hocko wrote:
> [Cc Vlastimil and Mel - the whole email thread starts
>  http://lkml.kernel.org/r/20200902180628.4052244-1-zi@sent.com
>  but this particular subthread has diverged a bit and you might find
> it
>  interesting]
> 
> On Wed 09-09-20 15:43:55, David Hildenbrand wrote:
> > 
> > I am not sure I like the trend towards CMA that we are seeing,
> > reserving
> > huge buffers for specific users (and eventually even doing it
> > automatically).
> > 
> > What we actually want is ZONE_MOVABLE with relaxed guarantees, such
> > that
> > anybody who requires large, unmovable allocations can use it.
> > 
> > I once played with the idea of having ZONE_PREFER_MOVABLE, which
> > a) Is the primary choice for movable allocations
> > b) Is allowed to contain unmovable allocations (esp., gigantic
> > pages)
> > c) Is the fallback for ZONE_NORMAL for unmovable allocations,
> > instead of
> > running out of memory
> 
> I might be missing something but how can this work longterm? Or put
> in
> another words why would this work any better than existing
> fragmentation
> avoidance techniques that page allocator implements already - 

One big difference is reclaim. If ZONE_NORMAL runs low on
free memory, page reclaim would kick in and evict some
movable/reclaimable things, to free up more space for
unmovable allocations.

The current fragmentation avoidance techniques don't do
things like reclaim, or proactively migrating movable
pages out of unmovable page blocks to prevent unmovable
allocations in currently movable page blocks.

> My suspicion is that a separate zone would work in a similar fashion.
> As
> long as there is a lot of free memory then zone will be effectively
> MOVABLE. Similar applies to normal zone when unmovable allocations
> are
> in minority. As long as the Normal zone gets full of unmovable
> objects
> they start overflowing to ZONE_PREFER_MOVABLE and it will resemble
> page
> block stealing when unmovable objects start spreading over movable
> page
> blocks.

You are right, with the difference being reclaim and/or
migration, which could make a real difference in limiting
the number of pageblocks that have unmovable allocations.

-- 
All Rights Reversed.




Re: [RFC PATCH 00/16] 1GB THP support on x86_64

2020-09-09 Thread Rik van Riel
On Wed, 2020-09-09 at 15:43 +0200, David Hildenbrand wrote:
> On 09.09.20 15:19, Rik van Riel wrote:
> > On Wed, 2020-09-09 at 09:04 +0200, Michal Hocko wrote:
> > 
> > > That CMA has to be pre-reserved, right? That requires a
> > > configuration.
> > 
> > To some extent, yes.
> > 
> > However, because that pool can be used for movable
> > 4kB and 2MB
> > pages as well as for 1GB pages, it would be easy to just set
> > the size of that pool to eg. 1/3 or even 1/2 of memory for every
> > system.
> > 
> > It isn't like the pool needs to be the exact right size. We
> > just need to avoid the "highmem problem" of having too little
> > memory for kernel allocations.
> > 
> 
> I am not sure I like the trend towards CMA that we are seeing,
> reserving
> huge buffers for specific users (and eventually even doing it
> automatically).
> 
> What we actually want is ZONE_MOVABLE with relaxed guarantees, such
> that
> anybody who requires large, unmovable allocations can use it.
> 
> I once played with the idea of having ZONE_PREFER_MOVABLE, which
> a) Is the primary choice for movable allocations
> b) Is allowed to contain unmovable allocations (esp., gigantic pages)
> c) Is the fallback for ZONE_NORMAL for unmovable allocations, instead
> of
> running out of memory
> 
> If someone messes up the zone ratio, issues known from zone
> imbalances
> are avoided - large allocations simply become less likely to succeed.
> In
> contrast to ZONE_MOVABLE, memory offlining is not guaranteed to work.

I really like that idea. This will be easier to deal with than
a "just the right size" CMA area, and seems like it would be
pretty forgiving in both directions.

Keeping unmovable allocations contained to one part of memory
should also make compaction within the ZONE_PREFER_MOVABLE area
a lot easier than compaction for higher order allocations is
today.

I suspect your proposal solves a lot of issues at once.

For (c) from your proposal, we could even claim a whole
2MB or even 1GB area at once for unmovable allocations,
keeping those contained in a limited amount of physical
memory again, to make life easier on compaction.

-- 
All Rights Reversed.




Re: [RFC PATCH 00/16] 1GB THP support on x86_64

2020-09-09 Thread Rik van Riel
On Wed, 2020-09-09 at 09:04 +0200, Michal Hocko wrote:
> On Tue 08-09-20 10:41:10, Rik van Riel wrote:
> > On Tue, 2020-09-08 at 16:35 +0200, Michal Hocko wrote:
> > 
> > > A global knob is insufficient. 1G pages will become a very
> > > precious
> > > resource as it requires a pre-allocation (reservation). So it
> > > really
> > > has
> > > to be an opt-in and the question is whether there is also some
> > > sort
> > > of
> > > access control needed.
> > 
> > The 1GB pages do not require that much in the way of
> > pre-allocation. The memory can be obtained through CMA,
> > which means it can be used for movable 4kB and 2MB
> > allocations when not
> > being used for 1GB pages.
> 
> That CMA has to be pre-reserved, right? That requires a
> configuration.

To some extent, yes.

However, because that pool can be used for movable 4kB and 2MB
pages as well as for 1GB pages, it would be easy to just set the
size of that pool to e.g. 1/3 or even 1/2 of memory for every
system.

It isn't like the pool needs to be the exact right size. We
just need to avoid the "highmem problem" of having too little
memory for kernel allocations.

-- 
All Rights Reversed.




Re: [RFC PATCH 00/16] 1GB THP support on x86_64

2020-09-08 Thread Rik van Riel
On Tue, 2020-09-08 at 16:35 +0200, Michal Hocko wrote:

> A global knob is insufficient. 1G pages will become a very precious
> resource as it requires a pre-allocation (reservation). So it really
> has
> to be an opt-in and the question is whether there is also some sort
> of
> access control needed.

The 1GB pages do not require that much in the way of
pre-allocation. The memory can be obtained through CMA,
which means it can be used for movable 4kB and 2MB
allocations when not being used for 1GB pages.

That makes it relatively easy to set aside some fraction of
system memory in every system for 1GB and movable allocations,
and use it in whatever way is needed depending on what
workload(s) end up running on a system.
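
Purely as an illustrative sketch (the helper names and the cma pointer
are assumptions for this example, not code from any posted series),
a 1GB gigantic page could be carved out of such a CMA pool and handed
back for movable use later:

    #include <linux/cma.h>
    #include <linux/mm.h>

    #define GIGANTIC_ORDER  (30 - PAGE_SHIFT)  /* 1GB of base pages */

    static struct page *grab_1gb_page(struct cma *thp_cma)
    {
            /* migrates movable 4kB/2MB users out of the range first */
            return cma_alloc(thp_cma, 1UL << GIGANTIC_ORDER,
                             GIGANTIC_ORDER, false);
    }

    static void return_1gb_page(struct cma *thp_cma, struct page *page)
    {
            /* the range is available for movable allocations again */
            cma_release(thp_cma, page, 1UL << GIGANTIC_ORDER);
    }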

-- 
All Rights Reversed.




Re: CFS flat runqueue proposal fixes/update

2020-08-20 Thread Rik van Riel
On Thu, 2020-08-20 at 16:56 +0200, Dietmar Eggemann wrote:
> Hi Rik,
> 
> On 31/07/2020 09:42, Rik van Riel wrote:
> 
> [...]
> 
> > Lets revisit the hierarchy from above, and assign priorities
> > to the cgroups, with the fixed point one being 1000. Lets
> > say cgroups A, A1, and B have priority 1000, while cgroup
> > A2 has priority 1.
> > 
> >       /\
> >      /  \
> >     A    B
> >    / \    \
> >   A1  A2   t3
> >   |    |
> >   t1   t2
> > 
> > One consequence of this is that when t1, t2, and t3 each
> > get a time slice, the vruntime of tasks t1 and t3 advances
> > at roughly the same speed as the clock time, while the
> > vruntime of task t2 advances 1000x faster.
> > 
> > This is fine if all three tasks continue to be runnable,
> > since t1, t2 and t3 each get their fair share of CPU time.
> > 
> > However, if t1 goes to sleep, t2 is the only thing running
> > inside cgroup A, which has the same priority as cgroup B,
> > and tasks t2 and t3 should be getting the same amount of
> > CPU time.
> > 
> > They eventually will, but not before task t3 has used up
> > enough CPU time to catch up with the enormous vruntime
> > advance that t2 just suffered.
> > 
> > That needs to be fixed, to get near-immediate convergence,
> > and not convergence after some unknown (potentially long)
> > period of time.
> 
> I'm trying to understand this issue in detail ...
> 
> Since t1 and t2 are single tasks in A1 and A2, this taskgroup level
> shouldn't matter for tick preemption after t1 went to sleep?
> 
> check_preempt_tick() is only invoked for 'cfs_rq->nr_running > 1'
> from
> entity_tick().
> 
> IMHO, tick preemption is handled between A and B and since they have
> the
> same cpu.weight (cpu.shares) t2 and t3 get the same time slice after
> t1
> went to sleep.
> 
> I think that here tick preemption happens in the 'if (delta_exec >
> ideal_runtime)' condition w/ delta_exec = curr->sum_exec_runtime -
> curr->prev_sum_exec_runtime.
> 
> Did I miss anything?

The issue happens with a flat runqueue, when t1 goes
to sleep, but t2 and t3 continue running.

We need to make sure the vruntime for t2 has not been
advanced so far into the future that cgroup A is unable
to get its fair share of CPU while t1 is sleeping.

-- 
All Rights Reversed.




Re: [PATCH for 5.9 v2 1/4] futex: introduce FUTEX_SWAP operation

2020-08-11 Thread Rik van Riel
On Tue, 2020-08-04 at 14:31 +0200, pet...@infradead.org wrote:
> On Mon, Aug 03, 2020 at 03:15:07PM -0700, Peter Oskolkov wrote:
> > A simplified/idealized use case: imagine a multi-user service
> > application
> > (e.g. a DBMS) that has to implement the following user CPU quota
> > policy:
> 
> So the last posting made hackernews; and there a bunch expressed far
> more interest in coroutines, which, if I'm not mistaken, can also be
> implemented using all this.
> 
> Would that not make for a far simpler and more convincing use-case?

Also just worker threads in general. Instead of waking up
an idle worker thread every time a new request comes in,
why not allow worker threads to indicate "I'm almost done",
and the main/receiving thread to enqueue the new request,
to be handled when the worker thread is done with the current
request?

> I really think we want to have block/resume detection sorted before
> this
> goes anywhere, I also strongly feel BPF should not be used for
> functional interfaces like that.
> 
> That is, I want to see a complete interface before I want to commit
> to
> an ABI that we're stuck with.

The work case above also needs to have that figured out,
for the (slow path, but still common) case of the worker
thread actually having gone to sleep while the work got
enqueued, and then needing to be woken up anyway.

Not sure that is a direction you are interested in, given
the description, but it could make coroutines and worker
threads more efficient :)

-- 
All Rights Reversed.




Re: CFS flat runqueue proposal fixes/update

2020-08-07 Thread Rik van Riel
On Fri, 2020-08-07 at 16:14 +0200, Dietmar Eggemann wrote:
> On 31/07/2020 09:42, Rik van Riel wrote:
> > Possible solution
> > ...
> I imagine that I can see what you want to achieve here ;-)
> 
> But it's hard since your v5 RFC
> https://lkml.kernel.org/r/20190906191237.27006-1-r...@surriel.com is
> pretty old by now.

The v5 patches also do not implement this new idea, and
still suffer from the corner cases that Paul Turner
pointed out last year.

> Do you have a version of the patch-set against tip/sched/core? Quite
> a
> lot has changed (runnable load avg replaced by runnable avg, rq->load 
> is
> gone, CFS load balance rework).

Not yet. We got a baby this spring, so I've been busy
with things like milk and diapers, instead of with
code.

I wanted to get this proposal out before Plumbers, so
we could at least talk about it, and maybe find flaws
with this idea before I spend weeks/months implementing it :)

> IMHO it would be extremely helpful to have a current patch-set to
> discuss how these other problems can be covered by patches on top.

Agreed. I hope to get some time to work on that, but
no guarantees about getting it ready before Plumbers :)

-- 
All Rights Reversed.




CFS flat runqueue proposal fixes/update

2020-07-31 Thread Rik van Riel
Hello,

last year at Linux Plumbers conference, I presented on my work
of turning the hierarchical CFS runqueue into a flat runqueue,
and Paul Turner pointed out some corner cases that could not
work with my design as it was last year.

Paul pointed out two corner cases, and I have come up with a
third one myself, but I believe they all revolve around the
same issue, admission control, and can all be solved with the
same solution.

This is a fairly complex thing, so this email is long and
split into four parts:
- hierarchical runqueue problem statement
- quick design overview of the flat runqueue for CFS
- overview of the problems pointed out by Paul Turner
- description of a possible solution


   hierarchical runqueue problem statement

Currently CFS with the cgroups CPU controller uses a
hierarchical design, which means that a task is enqueued
on its cgroup's runqueue, that cgroup is enqueued on its
parent's runqueue, etc all the way up to the root runqueue.
This also means that every time a task in a cgroup wakes
up or goes to sleep, the common case (1 task on the CPU)
is that the entire hierarchy is enqueued or dequeued,
resulting in significant overhead for workloads with lots
of context switches.

For example, the hierarchy in a system with equal priority
cgroups A and B, with cgroup A subdivided in two unequal
priority cgroups A1 and B2, and a task in each leaf cgroup
would look like this:

      /\
     /  \
    A    B
   / \    \
  A1  A2   t3
  |    |
  t1   t2

The common case on most systems is that CPUs are idle
some of the time, and CPUs usually have 0, 1, or 2
running tasks at a time.

That means when task t1 wakes up when the CPU is idle,
t1 first gets enqueued on A1, which then gets enqueued
on A, which then gets enqueued on the root runqueue.
When t1 goes back to sleep, the whole thing is torn back
down.
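
For reference, the enqueue walk being described looks roughly like
this sketch, modeled on enqueue_task_fair() in kernel/sched/fair.c
(not a verbatim copy, just the shape of the loop):

    static void enqueue_up_the_hierarchy(struct sched_entity *se, int flags)
    {
            struct cfs_rq *cfs_rq;

            /* walk every not-yet-queued cgroup level up to the root */
            for_each_sched_entity(se) {
                    if (se->on_rq)
                            break;
                    cfs_rq = cfs_rq_of(se);
                    enqueue_entity(cfs_rq, se, flags);
            }
    }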

In case all three tasks are runnable at the same time,
the scheduler will first choose between A and B in the
root runqueue, and, in case it chose A, between A1 and A2
the next level down.


   CFS flat runqueue design

The code I presented last year operates on the principle of
separating the calculation of a task's hierarchical priority,
which is done periodically, from the runqueues, and of using
that hierarchical priority to scale the vruntime of the task,
as is done for nice levels.

The hierarchical priority of a task is the product of its share
of the weight at each level of the runqueue hierarchy along the
path to that task. In the example above, with equal priority
levels for cgroups A, B, A1, and A2, and a total weight 1000
for the root runqueue, tasks t1, t2, and t3 will have hierarchical
weights of 250, 250, and 500 respectively.

CFS tracks both wall clock run time and vruntime for each
task, where vruntime is the wall clock run time scaled
according to the task's priority. The vruntime of every
entity on a runqueue advances at the same rate.

The vruntime is calculated as follows:

 vruntime = exec_time * FIXED_POINT_ONE / priority

That is, if the priority of the task is equal to the
fixed point constant used, the vruntime corresponds
to the executed wall clock time, while a lower priority
task has its vruntime advance at a faster rate, and
a higher priority task has its vruntime advance slower.

With every entity on a runqueue having its vruntime
advance at the same rate, that translates into higher
priority tasks receiving more CPU time, and lower
priority tasks receiving less CPU time.
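
As a minimal sketch of that scaling (FIXED_POINT_ONE and the helper
name below are made up for the example, not taken from the patches):

    #define FIXED_POINT_ONE 1000ULL

    /* a lower hierarchical priority makes vruntime advance faster */
    static inline u64 scaled_vruntime_delta(u64 exec_ns, u64 hier_prio)
    {
            return exec_ns * FIXED_POINT_ONE / hier_prio;
    }

With the example weights above (250 for t1 and t2, 500 for t3), a 1ms
slice advances the vruntime of t1 or t2 by 4ms and that of t3 by 2ms,
so t3 ends up with twice as much CPU time as either of the others.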


Problems identified by Paul Turner

Lets revisit the hierarchy from above, and assign priorities
to the cgroups, with the fixed point one being 1000. Lets
say cgroups A, A1, and B have priority 1000, while cgroup
A2 has priority 1.

      /\
     /  \
    A    B
   / \    \
  A1  A2   t3
  |    |
  t1   t2

One consequence of this is that when t1, t2, and t3 each
get a time slice, the vruntime of tasks t1 and t3 advances
at roughly the same speed as the clock time, while the
vruntime of task t2 advances 1000x faster.

This is fine if all three tasks continue to be runnable,
since t1, t2 and t3 each get their fair share of CPU time.

However, if t1 goes to sleep, t2 is the only thing running
inside cgroup A, which has the same priority as cgroup B,
and tasks t2 and t3 should be getting the same amount of
CPU time.

They eventually will, but not before task t3 has used up
enough CPU time to catch up with the enormous vruntime
advance that t2 just suffered.

That needs to be fixed, to get near-immediate convergence,
and not convergence after some unknown (potentially long)
period of time.


There are similar issues with thundering herds and cgroup
cpu.max throttling, where a large number of tasks can be
woken up simultaneously.

      /\
     /  \
    A    B
   /      \
  t1    t2 t3 t4 t5 ... t42

In the current, hierarchical runqueue setup, CFS will
time slice between cgroups A and B at the top level,
which means task t1 will never go long without being
scheduled.  Only the tasks 

Re: XHCI vs PCM2903B/PCM2904 part 2

2020-06-30 Thread Rik van Riel
On Tue, 2020-06-30 at 17:27 +0300, Mathias Nyman wrote:
> On 30.6.2020 16.08, Rik van Riel wrote:
> > I misread the code, it's not a bitfield, so state 1 means an
> > endpoint marked with running state. The next urb is never getting a
> > response, though.
> > 
> > However, the xhci spec says an endpoint is halted upon a babble
> > error.
> 
> I was looking at the same, so according to specs this state shouldn't
> be possible. 

The PCM2903B chip, and potentially the hub it is behind,
are USB2 devices, though. Does USB2 know about halted
endpoints?

> > The code right above the babble handling case adds halted into the
> > endpoint state itself. Does the code handling the babble error need
> > to do something similar to trigger cleanup elsewhere? 
> 
> It's a flag to prevent ringing the doorbell for a halted endpoint.
> Anyway, reset endpoint is meant to recover an endpoint in a halted
> state.
> Resetting non-halted endpoints will just lead to a context state
> error, and
> besides, isoc endpoints shouldn't halt.
> 
> Anyways, I haven't got any better idea at the moment.
> You can try and see what a forced reset does with:

So close. Looks like something in the XHCI/USB stack isn't
doing the reset because something is in an unexpected state?

[   51.706798] xhci_hcd :00:14.0: WARN Set TR Deq Ptr cmd failed
due to incorrect slot or ep state.
[   51.706802] got overflow, ep->flags = 2
[   51.932550] usb 3-9.7.5: reset high-speed USB device number 18 using
xhci_hcd
[   68.830396] xhci_hcd :00:14.0: WARN Set TR Deq Ptr cmd failed
due to incorrect slot or ep state.
[   68.830409] got overflow, ep->flags = 2
[   70.077981] rfkill: input handler disabled
[  157.992374] got overflow, ep->flags = 2
[  157.992406] xhci_hcd :00:14.0: WARN Set TR Deq Ptr cmd failed
due to incorrect slot or ep state.

> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-
> ring.c
> index 2c255d0620b0..d79aca0df6d4 100644
> --- a/drivers/usb/host/xhci-ring.c
> +++ b/drivers/usb/host/xhci-ring.c
> @@ -1900,8 +1900,7 @@ static int
> xhci_requires_manual_halt_cleanup(struct xhci_hcd *xhci,
>  * endpoint anyway.  Check if a babble halted the
>  * endpoint.
>  */
> -   if (GET_EP_CTX_STATE(ep_ctx) == EP_STATE_HALTED)
> -   return 1;
> +   return 1;
>  
> return 0;
>  }
> 
> Traces also showed thet endpoint doorbell was rang after th babble
> error, so
> we know that didn't help restarting the endpoint.
> 
> -Mathias
> 
-- 
All Rights Reversed.




Re: XHCI vs PCM2903B/PCM2904 part 2

2020-06-29 Thread Rik van Riel
On Mon, 2020-06-29 at 23:21 -0400, Rik van Riel wrote:

> > Could you add the code below and take new traces, it will show the
> > endpoint
> > state after the Babble error.
> 
> Hi Mathias,
> 
> I have finally rebooted into a kernel with your tracepoint.
> After a babble error, I get the following info in the trace.
> 
> [  556.716334] xhci_hcd :00:14.0: Babble error for slot 13 ep 8
> on
> endpoint
> 
>  28672.016 :0/0 xhci-hcd:xhci_handle_tx_event(info: 196609, info2:
> 12845096, deq: 69501877488, tx_info: 12845252)
>  34816.037 :0/0 xhci-hcd:xhci_handle_tx_event(info: 196609, info2:
> 12845096, deq: 69501877856, tx_info: 12845252)
>  38912.043 :0/0 xhci-hcd:xhci_handle_tx_event(info: 196609, info2:
> 12845096, deq: 69501870176, tx_info: 12845252)

OK, this is strange indeed.
info: 0x30001
info2: 0xc40028
tx_info: 0xc400c4

That suggests the device state is EP_STATE_DISABLED, but
we never got the error from the EP_STATE_DISABLED test near
the start of handle_tx_event(). If we had, the big switch
statement containing the code below would have been bypassed.

Unless I am mistaken, does that mean the endpoint context
(*ep_ctx) got modified while the code was in the middle of
handle_tx_event()?

What would cause that? A subsequent transfer to an endpoint
while it is in EP_STATE_HALTED, which the comment suggests 
is the expected endpoint state for a babble error?

> > diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-
> > ring.c
> > index 0fda0c0f4d31..373d89ef7275 100644
> > --- a/drivers/usb/host/xhci-ring.c
> > +++ b/drivers/usb/host/xhci-ring.c
> > @@ -2455,6 +2455,7 @@ static int handle_tx_event(struct xhci_hcd
> > *xhci,
> > case COMP_BABBLE_DETECTED_ERROR:
> > xhci_dbg(xhci, "Babble error for slot %u ep %u on
> > endpoint\n",
> >  slot_id, ep_index);
> > +   trace_xhci_handle_tx_event(ep_ctx);
> > status = -EOVERFLOW;
> > break;
> > /* Completion codes for endpoint error state */
> > diff --git a/drivers/usb/host/xhci-trace.h b/drivers/usb/host/xhci-
> > trace.h
> > index b19582b2a72c..5081df079f4a 100644
> > --- a/drivers/usb/host/xhci-trace.h
> > +++ b/drivers/usb/host/xhci-trace.h
> > @@ -360,6 +360,11 @@ DEFINE_EVENT(xhci_log_ep_ctx,
> > xhci_add_endpoint,
> > TP_ARGS(ctx)
> >  );
> >  
> > +DEFINE_EVENT(xhci_log_ep_ctx, xhci_handle_tx_event,
> > +   TP_PROTO(struct xhci_ep_ctx *ctx),
> > +   TP_ARGS(ctx)
> > +);
> > +
> >  DECLARE_EVENT_CLASS(xhci_log_slot_ctx,
> > TP_PROTO(struct xhci_slot_ctx *ctx),
> > TP_ARGS(ctx),
> > 
> > 
> > 
-- 
All Rights Reversed.




Re: XHCI vs PCM2903B/PCM2904 part 2

2020-06-29 Thread Rik van Riel
[keeping old context since it's been a month...]

On Mon, 2020-05-25 at 12:37 +0300, Mathias Nyman wrote:
> On 21.5.2020 6.45, Rik van Riel wrote:
> > On Wed, 2020-05-20 at 16:34 -0400, Alan Stern wrote:
> > > On Wed, May 20, 2020 at 03:21:44PM -0400, Rik van Riel wrote:
> > > > Interesting. That makes me really curious why things are
> > > > getting stuck, now...
> > > 
> > > This could be a bug in xhci-hcd.  Perhaps the controller's
> > > endpoint 
> > > state needs to be updated after one of these errors
> > > occurs.  Mathias 
> > > will know all about that.
> > 
> > I am seeing something potentially interesting in the
> > giant trace. First the final enqueue/dequeue before
> > the babble error:
> > 
> >   -0 [005] d.s. 776367.638233: xhci_inc_enq: ISOC
> > 33a6879e: enq 0x001014070420(0x00101407) deq
> > 0x001014070360(0x00101407) segs 2 stream 0 free_trbs
> > 497
> > bounce 196 cycle 1
> > 
> > The next reference to 0x001014070360 is the babble error,
> > and some info on the ISOC buffer itself:
> > 
> >   -0 [005] d.h. 776367.639187: xhci_handle_event:
> > EVENT: TRB 001014070360 status 'Babble Detected' len 196 slot
> > 15 ep
> > 9 type 'Transfer Event' flags e:C
> >   -0 [005] d.h. 776367.639195:
> > xhci_handle_transfer:
> > ISOC: Buffer 000e2676f400 length 196 TD size 0 intr 0 type
> > 'Isoch'
> > flags b:i:I:c:s:I:e:C
> > n
> > Immediately after the babble error, the next request is enqueued,
> > and the doorbell is rung:
> > 
> >   -0 [005] d.h. 776367.639196: xhci_inc_deq: ISOC
> > 33a6879e: enq 0x001014070420(0x00101407) deq
> > 0x001014070370(0x00101407) segs 2 stream 0 free_trbs
> > 498 bounce 196 cycle 1
> >   -0 [005] d.h. 776367.639197: xhci_urb_giveback:
> > ep4in-isoc: urb 72126553 pipe 135040 slot 15 length 196/196
> > sgs 0/0 stream 0 flags 0206
> >   -0 [005] d.h. 776367.639197: xhci_inc_deq:
> > EVENT 97f84b16: enq 0x0010170b5000(0x0010170b5000)
> > deq 0x0010170b5670(0x0010170b5000) segs 1 stream 0
> > free_trbs 254 bounce 0 cycle 1
> >   -0 [005] ..s. 776367.639212: xhci_urb_enqueue:
> > ep4in-isoc: urb 72126553 pipe 135040 slot 15 length 0/196
> > sgs 0/0 stream 0 flags 0206
> >   -0 [005] d.s. 776367.639214: xhci_queue_trb:
> > ISOC: Buffer 000e2676f400 length 196 TD size 0 intr 0 type
> > 'Isoch' flags b:i:I:c:s:I:e:c
> >   -0 [005] d.s. 776367.639214: xhci_inc_enq: ISOC
> > 33a6879e: enq 0x001014070430(0x00101407) deq
> > 0x001014070370(0x00101407) segs 2 stream 0 free_trbs
> > 497 bounce 196 cycle 1
> >   -0 [005] d.s. 776367.639215:
> > xhci_ring_ep_doorbell: Ring doorbell for Slot 15 ep4in
> > 
> > However, after that point, no more xhci_handle_transfer: ISOC
> > lines are seen in the log. The doorbell line above is the last
> > line in the log for ep4in.
> > 
> > Is this some area where USB3 and USB2 behave differently?
> 
> It acts as if the endpoint is no longer running.
> 
> If the endpoint would be halted then
> xhci_requires_manual_halt_cleanup() should
> reset the endpoints and it would show in the traces.
> 
> Could you add the code below and take new traces, it will show the
> endpoint
> state after the Babble error.

Hi Mathias,

I have finally rebooted into a kernel with your tracepoint.
After a babble error, I get the following info in the trace.

[  556.716334] xhci_hcd :00:14.0: Babble error for slot 13 ep 8 on
endpoint

 28672.016 :0/0 xhci-hcd:xhci_handle_tx_event(info: 196609, info2:
12845096, deq: 69501877488, tx_info: 12845252)
 34816.037 :0/0 xhci-hcd:xhci_handle_tx_event(info: 196609, info2:
12845096, deq: 69501877856, tx_info: 12845252)
 38912.043 :0/0 xhci-hcd:xhci_handle_tx_event(info: 196609, info2:
12845096, deq: 69501870176, tx_info: 12845252)

I hope this is useful :)
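
For reference, here is how I decode that "info" value (a minimal
user-space sketch; I am assuming "info" is the raw ep_info dword
printed by the xhci_log_ep_ctx event class, with the EP State in its
low bits as described in the xHCI spec, section 6.2.3):

#include <stdio.h>
#include <stdint.h>

/* Endpoint state lives in the low bits of ep_info (xHCI spec 6.2.3). */
#define EP_STATE_MASK     0x7
#define EP_STATE_DISABLED 0
#define EP_STATE_RUNNING  1
#define EP_STATE_HALTED   2
#define EP_STATE_STOPPED  3
#define EP_STATE_ERROR    4

int main(void)
{
        uint32_t ep_info = 196609;      /* "info:" value from the trace above */

        /* 196609 == 0x30001, so this prints "ep state: 1" (Running) */
        printf("ep state: %u\n", ep_info & EP_STATE_MASK);
        return 0;
}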

> diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-
> ring.c
> index 0fda0c0f4d31..373d89ef7275 100644
> --- a/drivers/usb/host/xhci-ring.c
> +++ b/drivers/usb/host/xhci-ring.c
> @@ -2455,6 +2455,7 @@ static int handle_tx_event(struct xhci_hcd
> *xhci,
>   case COMP_BABBLE_DETECTED_ERROR:
>   xhci_dbg(xhci, "Babble error for slot %u ep %u on
> endpoint\n",
>slot_id, ep_index);
> + trace_xhci_handle_tx_event(ep_ctx);
>

Re: XHCI vs PCM2903B/PCM2904 part 2

2020-05-20 Thread Rik van Riel
On Wed, 2020-05-20 at 16:34 -0400, Alan Stern wrote:
> On Wed, May 20, 2020 at 03:21:44PM -0400, Rik van Riel wrote:
> > 
> > Interesting. That makes me really curious why things are
> > getting stuck, now...
> 
> This could be a bug in xhci-hcd.  Perhaps the controller's endpoint 
> state needs to be updated after one of these errors occurs.  Mathias 
> will know all about that.

I am seeing something potentially interesting in the
giant trace. First the final enqueue/dequeue before
the babble error:

  -0 [005] d.s. 776367.638233: xhci_inc_enq: ISOC
33a6879e: enq 0x001014070420(0x00101407) deq
0x001014070360(0x00101407) segs 2 stream 0 free_trbs 497
bounce 196 cycle 1

The next reference to 0x001014070360 is the babble error,
and some info on the ISOC buffer itself:

  -0 [005] d.h. 776367.639187: xhci_handle_event:
EVENT: TRB 001014070360 status 'Babble Detected' len 196 slot 15 ep
9 type 'Transfer Event' flags e:C
  -0 [005] d.h. 776367.639195: xhci_handle_transfer:
ISOC: Buffer 000e2676f400 length 196 TD size 0 intr 0 type 'Isoch'
flags b:i:I:c:s:I:e:C

Immediately after the babble error, the next request is enqueued,
and the doorbell is rung:

  -0 [005] d.h. 776367.639196: xhci_inc_deq: ISOC 
33a6879e: enq 0x001014070420(0x00101407) deq 
0x001014070370(0x00101407) segs 2 stream 0 free_trbs 498 bounce 196 
cycle 1
  -0 [005] d.h. 776367.639197: xhci_urb_giveback: ep4in-isoc: 
urb 72126553 pipe 135040 slot 15 length 196/196 sgs 0/0 stream 0 flags 
0206
  -0 [005] d.h. 776367.639197: xhci_inc_deq: EVENT 
97f84b16: enq 0x0010170b5000(0x0010170b5000) deq 
0x0010170b5670(0x0010170b5000) segs 1 stream 0 free_trbs 254 bounce 0 
cycle 1
  -0 [005] ..s. 776367.639212: xhci_urb_enqueue: ep4in-isoc: 
urb 72126553 pipe 135040 slot 15 length 0/196 sgs 0/0 stream 0 flags 
0206
  -0 [005] d.s. 776367.639214: xhci_queue_trb: ISOC: Buffer 
000e2676f400 length 196 TD size 0 intr 0 type 'Isoch' flags b:i:I:c:s:I:e:c
  -0 [005] d.s. 776367.639214: xhci_inc_enq: ISOC 
33a6879e: enq 0x001014070430(0x00101407) deq 
0x001014070370(0x00101407) segs 2 stream 0 free_trbs 497 bounce 196 
cycle 1
  -0 [005] d.s. 776367.639215: xhci_ring_ep_doorbell: Ring 
doorbell for Slot 15 ep4in

However, after that point, no more xhci_handle_transfer: ISOC
lines are seen in the log. The doorbell line above is the last
line in the log for ep4in.

Is this some area where USB3 and USB2 behave differently?

dmesg: 
https://drive.google.com/open?id=1S2Qc8lroqA5-RMukuLBLWFGx10vEjG-i

usb trace, as requested by Mathias: 
https://drive.google.com/open?id=1cbLcOnAtQRW0Chgak6PNC0l4yJv__4uO

-- 
All Rights Reversed.




Re: XHCI vs PCM2903B/PCM2904 part 2

2020-05-20 Thread Rik van Riel
On Wed, 2020-05-20 at 16:50 +0300, Mathias Nyman wrote:
> On 20.5.2020 14.26, Rik van Riel wrote:
> > After a few more weeks of digging, I have come to the tentative
> > conclusion that either the XHCI driver, or the USB sound driver,
> > or both, fail to handle USB errors correctly.
> > 
> > I have some questions at the bottom, after a (brief-ish)
> > explanation
> > of exactly what seems to go wrong.
> > 
> > TL;DR: arecord from a misbehaving device can hang forever
> > after a USB error, due to poll on /dev/snd/timer never returning.
> > 
> > The details: under some mysterious circumstances, the PCM290x
> > family sound chips can send more data than expected during an
> > isochronous transfer, leading to a babble error. Those
> > circumstances seem to in part depend on the USB host controller
> > and/or the electrical environment, since the chips work just
> > fine for most people.
> > 
> > Receiving data past the end of the isochronous transfer window
> > scheduled for a device results in the XHCI controller throwing
> > a babble error, which moves the endpoint into halted state.
> > 
> > This is followed by the host controller software sending a
> > reset endpoint command, and moving the endpoint into stopped
> > state, as specified on pages 164-165 of the XHCI specification.
> 
> Note that isoch endpoints should generate buffer overrun instead of
> babble detect error on TD babble conditions. 
> See xHCI spec 6.4.5 additional note 115.

Maybe it should, but I'm hitting this printk in handle_tx_event:

        case COMP_BABBLE_DETECTED_ERROR:
                xhci_dbg(xhci, "Babble error for slot %u ep %u on endpoint\n",
                         slot_id, ep_index);
                status = -EOVERFLOW;
                break;

Tracing in the sound driver suggests the packet belongs to
the sound data device (not the sync device), because the URB
whose isochronous packet carries the -EOVERFLOW status is the
last one seen there.

> Or maybe a frame babble could halt an isoc endpoint, see xhci
> 4.10.2.4.1
> but then you should see a port error and port going to disabled
> state.
> 
> Any logs of this?
> 
> mount -t debugfs none /sys/kernel/debug
> echo 'module xhci_hcd =p' >/sys/kernel/debug/dynamic_debug/control
> echo 'module usbcore =p' >/sys/kernel/debug/dynamic_debug/control
> echo 81920 > /sys/kernel/debug/tracing/buffer_size_kb
> echo 1 > /sys/kernel/debug/tracing/events/xhci-hcd/enable
> < trigger the issue >
> Send output of dmesg
> Send content of /sys/kernel/debug/tracing/trace

dmesg: 
https://drive.google.com/open?id=1S2Qc8lroqA5-RMukuLBLWFGx10vEjG-i

usbtrace: 
https://drive.google.com/open?id=1cbLcOnAtQRW0Chgak6PNC0l4yJv__4uO

> > However, the USB sound driver seems to have no idea that this
> > error happened. The function retire_capture_urb looks at the
> > status of each isochronous frame, but seems to be under the
> > assumption that the sound device just keeps on running.
> > 
> > The function snd_complete_urb seems to only detect that the
> > device is not running if usb_submit_urb returns a failure.
> > 
> > err = usb_submit_urb(urb, GFP_ATOMIC);
> > if (err == 0)
> > return;
> > 
> > usb_audio_err(ep->chip, "cannot submit urb (err = %d)\n",
> > err);
> > 
> > if (ep->data_subs && ep->data_subs->pcm_substream) {
> > substream = ep->data_subs->pcm_substream;
> > snd_pcm_stop_xrun(substream);
> > }
> > 
> > However, the XHCI driver will happily submit an URB to a
> > stopped device. Looking at the call trace usb_submit_urb ->
> > xhci_urb_enqueue -> xhci_queue_isoc_tx_prepare -> prepare_ring,
> > you can see this code:
> > 
> > /* Make sure the endpoint has been added to xHC schedule */
> > switch (ep_state) {
> > ...
> > case EP_STATE_HALTED:
> > xhci_dbg(xhci, "WARN halted endpoint, queueing URB
> > anyway.\n");
> > case EP_STATE_STOPPED:
> > case EP_STATE_RUNNING:
> > break;
> > 
> > This leads me to a few questions:
> > - should retire_capture_urb call snd_pcm_stop_xrun,
> >   or another function like it, if it sees certain
> >   errors in the iso frame in the URB?
> > - should snd_complete_urb do something with these
> >   errors, too, in case they happen on the sync frames
> >   and not the data frames?
> > - does the XHCI code need to ring the doorbell when
> >   subm

Re: XHCI vs PCM2903B/PCM2904 part 2

2020-05-20 Thread Rik van Riel
On Wed, 2020-05-20 at 12:38 -0400, Alan Stern wrote:
> On Wed, May 20, 2020 at 07:26:57AM -0400, Rik van Riel wrote:
> > After a few more weeks of digging, I have come to the tentative
> > conclusion that either the XHCI driver, or the USB sound driver,
> > or both, fail to handle USB errors correctly.
> > 
> > I have some questions at the bottom, after a (brief-ish)
> > explanation
> > of exactly what seems to go wrong.
> > 
> > TL;DR: arecord from a misbehaving device can hang forever
> > after a USB error, due to poll on /dev/snd/timer never returning.
> > 
> > The details: under some mysterious circumstances, the PCM290x
> > family sound chips can send more data than expected during an
> > isochronous transfer, leading to a babble error. Those
> 
> Do these chips connect as USB-3 devices or as USB-2?  (I wouldn't
> expect 
> an audio device to use USB-3; it shouldn't need the higher
> bandwidth.)

USB-2

> In general, errors such as babble are not supposed to stop
> isochronous 
> endpoints.

However, it seems that they do. The URB never
gets an answer after snd_complete_urb resubmits
it with usb_submit_urb.

> > However, the USB sound driver seems to have no idea that this
> > error happened. The function retire_capture_urb looks at the
> > status of each isochronous frame, but seems to be under the
> > assumption that the sound device just keeps on running.
> 
> This is appropriate, for the reason mentioned above.

Having arecord get stuck forever does not seem like
the right behavior, though :)

> > This leads me to a few questions:
> > - should retire_capture_urb call snd_pcm_stop_xrun,
> >   or another function like it, if it sees certain
> >   errors in the iso frame in the URB?
> 
> No.  Isochronous endpoints are expected to encounter errors from time
> to 
> time; that is the nature of isochronous communications.  You're
> supposed 
> to ignore the errors (skip over any bad data) and keep going.

...

> The notion of "stopped state" is not part of USB-2.  As a result, it 
> should be handled entirely within the xhci-hcd driver.

Interesting. That makes me really curious why things are
getting stuck, now...

> > - how should the USB sound driver recover from these
> >   occasional and/or one-off errors? stop the sound
> >   stream, or try to reinitialize the device and start
> >   recording again?
> 
> As far as I know, it should do its best to continue (perhaps fill in 
> missing data with zeros).

That was my first intuition as well, given the documented
behavior of isochronous frames.

However, given that the device appears to stop sending
frames after that error (at least usbmon is not seeing
any), I am not sure what needs to happen in order to get
that behavior.
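
For reference, the kind of "skip the bad frames and keep going"
handling I understand that to mean would look roughly like the
sketch below (generic illustration only, not the actual
snd-usb-audio code; the helper name is mine):

#include <linux/usb.h>
#include <linux/string.h>

/*
 * Copy out the good isochronous frames and fill the bad ones with
 * silence, so capture keeps running across one-off errors.
 */
static void copy_iso_frames(struct urb *urb, u8 *dst)
{
        int i;

        for (i = 0; i < urb->number_of_packets; i++) {
                struct usb_iso_packet_descriptor *d = &urb->iso_frame_desc[i];
                u8 *src = (u8 *)urb->transfer_buffer + d->offset;

                if (d->status == 0 && d->actual_length == d->length)
                        memcpy(dst, src, d->length);
                else
                        memset(dst, 0, d->length);      /* zero-fill the gap */

                dst += d->length;
        }
}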

-- 
All Rights Reversed.




XHCI vs PCM2903B/PCM2904 part 2

2020-05-20 Thread Rik van Riel
After a few more weeks of digging, I have come to the tentative
conclusion that either the XHCI driver, or the USB sound driver,
or both, fail to handle USB errors correctly.

I have some questions at the bottom, after a (brief-ish) explanation
of exactly what seems to go wrong.

TL;DR: arecord from a misbehaving device can hang forever
after a USB error, due to poll on /dev/snd/timer never returning.

The details: under some mysterious circumstances, the PCM290x
family sound chips can send more data than expected during an
isochronous transfer, leading to a babble error. Those
circumstances seem to in part depend on the USB host controller
and/or the electrical environment, since the chips work just
fine for most people.

Receiving data past the end of the isochronous transfer window
scheduled for a device results in the XHCI controller throwing
a babble error, which moves the endpoint into halted state.

This is followed by the host controller software sending a
reset endpoint command, and moving the endpoint into stopped
state, as specified on pages 164-165 of the XHCI specification.

However, the USB sound driver seems to have no idea that this
error happened. The function retire_capture_urb looks at the
status of each isochronous frame, but seems to be under the
assumption that the sound device just keeps on running.

The function snd_complete_urb seems to only detect that the
device is not running if usb_submit_urb returns a failure.

        err = usb_submit_urb(urb, GFP_ATOMIC);
        if (err == 0)
                return;

        usb_audio_err(ep->chip, "cannot submit urb (err = %d)\n", err);

        if (ep->data_subs && ep->data_subs->pcm_substream) {
                substream = ep->data_subs->pcm_substream;
                snd_pcm_stop_xrun(substream);
        }
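
To make the first two questions near the bottom concrete, the kind of
per-frame check I am thinking of would be something like this
(hypothetical helper, for illustration only, not existing code):

#include <linux/usb.h>
#include <linux/errno.h>

/*
 * Does this completed isochronous URB contain a frame with an error
 * that looks fatal for the stream (like the -EOVERFLOW babble case
 * above)?
 */
static bool urb_saw_fatal_iso_error(const struct urb *urb)
{
        int i;

        for (i = 0; i < urb->number_of_packets; i++) {
                switch (urb->iso_frame_desc[i].status) {
                case -EOVERFLOW:        /* babble / buffer overrun */
                case -EPROTO:           /* low-level protocol error */
                case -EILSEQ:           /* CRC / bitstuff error */
                        return true;
                }
        }
        return false;
}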

However, the XHCI driver will happily submit an URB to a
stopped device. Looking at the call trace usb_submit_urb ->
xhci_urb_enqueue -> xhci_queue_isoc_tx_prepare -> prepare_ring,
you can see this code:

        /* Make sure the endpoint has been added to xHC schedule */
        switch (ep_state) {
        ...
        case EP_STATE_HALTED:
                xhci_dbg(xhci, "WARN halted endpoint, queueing URB anyway.\n");
        case EP_STATE_STOPPED:
        case EP_STATE_RUNNING:
                break;

This leads me to a few questions:
- should retire_capture_urb call snd_pcm_stop_xrun,
  or another function like it, if it sees certain
  errors in the iso frame in the URB?
- should snd_complete_urb do something with these
  errors, too, in case they happen on the sync frames
  and not the data frames?
- does the XHCI code need to ring the doorbell when
  submitting an URB to a stopped device, or is it
  always up to the higher-level driver to fully reset
  the device before it can do anything useful?
- if a device in stopped state does not do anything
  useful, should usb_submit_urb return an error?
- how should the USB sound driver recover from these
  occasional and/or one-off errors? stop the sound
  stream, or try to reinitialize the device and start
  recording again?

I am willing to write patches and can test with my
setup, but both the sound code and the USB code are
new to me so I would like to know what direction I
should go in :)

-- 
All Rights Reversed.




Re: Possibility of conflicting memory types in lazier TLB mode?

2020-05-15 Thread Rik van Riel
On Fri, 2020-05-15 at 16:50 +1000, Nicholas Piggin wrote:
> 
> But what about if there are (real, not speculative) stores in the
> store 
> queue still on the lazy thread from when it was switched, that have
> not 
> yet become coherent? The page is freed by another CPU and reallocated
> for something that maps it as nocache. Do you have a coherency
> problem 
> there?
> 
> Ensuring the store queue is drained when switching to lazy seems like
> it 
> would fix it, maybe context switch code does that already or you
> have 
> some other trick or reason it's not a problem. Am I way off base
> here?

On x86, all stores become globally visible in program order.

I suspect that means any pending stores in the queue become
visible to the rest of the system before the store to the
"current" cpu-local variable, and before the other writes
from the context switch code.

Is that too naive a way of preventing the scenario you
describe?

What am I overlooking?
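
To spell out the pattern I have in mind (an illustration only; C11
needs the explicit release/acquire pair, while the point above is
that plain stores and loads on x86 already provide this ordering):

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic int lazy_thread_data;    /* stands in for the lazy thread's last stores */
static _Atomic bool switched_away;      /* stands in for the context switch bookkeeping write */

/* Runs on the CPU switching away from the lazy thread. */
static void prev_cpu_finishes_switch(void)
{
        atomic_store_explicit(&lazy_thread_data, 42, memory_order_relaxed);
        atomic_store_explicit(&switched_away, true, memory_order_release);
}

/* Runs on the CPU that wants to free and reuse the page. */
static bool other_cpu_sees_data_once_switched(void)
{
        if (!atomic_load_explicit(&switched_away, memory_order_acquire))
                return false;

        /* The acquire pairs with the release: the data store is visible. */
        return atomic_load_explicit(&lazy_thread_data, memory_order_relaxed) == 42;
}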

-- 
All Rights Reversed.




Re: [PATCH] mm,thp: recheck each page before collapsing file THP

2019-10-18 Thread Rik van Riel
On Fri, 2019-10-18 at 16:34 +0300, Kirill A. Shutemov wrote:
> On Thu, Oct 17, 2019 at 10:08:32PM -0700, Song Liu wrote:
> > In collapse_file(), after locking the page, it is necessary to
> > recheck
> > that the page is up-to-date, clean, and pointing to the proper
> > mapping.
> > If any check fails, abort the collapse.
> > 
> > Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-
> > shmem) FS")
> > Cc: Kirill A. Shutemov 
> > Cc: Johannes Weiner 
> > Cc: Hugh Dickins 
> > Cc: William Kucharski 
> > Cc: Andrew Morton 
> > Signed-off-by: Song Liu 
> > ---
> >  mm/khugepaged.c | 8 
> >  1 file changed, 8 insertions(+)
> > 
> > diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> > index 0a1b4b484ac5..7da49b643c4d 100644
> > --- a/mm/khugepaged.c
> > +++ b/mm/khugepaged.c
> > @@ -1619,6 +1619,14 @@ static void collapse_file(struct mm_struct
> > *mm,
> > result = SCAN_PAGE_LOCK;
> > goto xa_locked;
> > }
> > +
> > +   /* double check the page is correct and clean */
> > +   if (unlikely(!PageUptodate(page)) ||
> > +   unlikely(PageDirty(page)) ||
> > +   unlikely(page->mapping != mapping)) {
> > +   result = SCAN_FAIL;
> > +   goto out_unlock;
> > +   }
> > }
> >  
> > /*
> 
> Hm. But why only for !is_shmem? Or I read it wrong?

It looks like the shmem code path has its own way of bailing
out when a page is !PageUptodate. Also, shmem can handle dirty
pages fine.

However, I suppose the shmem code might want to check for truncated
pages, which it does not currently appear to do. I guess doing
the trylock_page under the xarray lock may protect against truncate,
but that is subtle enough that at the very least it should be
documented.
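
For illustration, the kind of truncate check I mean would be roughly
the following (hypothetical helper, the name is mine, not a proposed
patch):

#include <linux/mm.h>
#include <linux/pagemap.h>

/*
 * Has @page been truncated out of @mapping at @index since we looked
 * it up?  Meant to be called with the page locked.
 */
static bool page_was_truncated(struct page *page,
                               struct address_space *mapping, pgoff_t index)
{
        return page->mapping != mapping || page->index != index;
}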




Re: [PATCH] mm,thp: recheck each page before collapsing file THP

2019-10-18 Thread Rik van Riel
On Thu, 2019-10-17 at 22:08 -0700, Song Liu wrote:
> In collapse_file(), after locking the page, it is necessary to
> recheck
> that the page is up-to-date, clean, and pointing to the proper
> mapping.
> If any check fails, abort the collapse.
> 
> Fixes: 99cb0dbd47a1 ("mm,thp: add read-only THP support for (non-
> shmem) FS")
> Cc: Kirill A. Shutemov 
> Cc: Johannes Weiner 
> Cc: Hugh Dickins 
> Cc: William Kucharski 
> Cc: Andrew Morton 
> Signed-off-by: Song Liu 

Acked-by: Rik van Riel 



Re: [PATCH v3 09/10] sched/fair: use load instead of runnable load in wakeup path

2019-10-07 Thread Rik van Riel
On Mon, 2019-10-07 at 17:27 +0200, Vincent Guittot wrote:
> On Mon, 7 Oct 2019 at 17:14, Rik van Riel  wrote:
> > On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> > > runnable load has been introduced to take into account the case
> > > where
> > > blocked load biases the wake up path, which may end up selecting an
> > > overloaded
> > > CPU with a large number of runnable tasks instead of an
> > > underutilized
> > > CPU with a huge blocked load.
> > > 
> > > The wake up path now starts to look for idle CPUs before
> > > comparing
> > > runnable load and it's worth aligning the wake up path with the
> > > load_balance.
> > > 
> > > Signed-off-by: Vincent Guittot 
> > 
> > On a single socket system, patches 9 & 10 have the
> > result of driving a woken up task (when wake_wide is
> > true) to the CPU core with the lowest blocked load,
> > even when there is an idle core the task could run on
> > right now.
> > 
> > With the whole series applied, I see a 1-2% regression
> > in CPU use due to that issue.
> > 
> > With only patches 1-8 applied, I see a 1% improvement in
> > CPU use for that same workload.
> 
> Thanks for testing.
> patches 8-9 have just replaced runnable load by blocked load and then
> removed the duplicated metrics in find_idlest_group.
> I'm preparing an additional patch that reworks  find_idlest_group()
> to
> behave similarly to find_busiest_group(). It gathers statistics as
> it already does, then classifies the groups and finally selects the
> idlest one. This should fix the problem that you mentioned above when
> it selects a group with lowest blocked load whereas there are idle
> cpus in another group with high blocked load.

That should do the trick!

> > Given that it looks like select_idle_sibling and
> > find_idlest_group_cpu do roughly the same thing, I
> > wonder if it is enough to simply add an additional
> > test to find_idlest_group to have it return the
> > LLC sg, if it is called on the LLC sd on a single
> > socket system.
> 
> That make sense to me
> 
> > That way find_idlest_group_cpu can still find an
> > idle core like it does today.
> > 
> > Does that seem like a reasonable thing?
> 
> That's worth testing

I'll give it a try.

Doing the full find_idlest_group heuristic
inside an LLC seems like it would be overkill,
anyway.

-- 
All Rights Reversed.




Re: [PATCH v3 09/10] sched/fair: use load instead of runnable load in wakeup path

2019-10-07 Thread Rik van Riel
On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> runnable load has been introduced to take into account the case where
> blocked load biases the wake up path, which may end up selecting an
> overloaded
> CPU with a large number of runnable tasks instead of an underutilized
> CPU with a huge blocked load.
> 
> The wake up path now starts to look for idle CPUs before comparing
> runnable load and it's worth aligning the wake up path with the
> load_balance.
> 
> Signed-off-by: Vincent Guittot 

On a single socket system, patches 9 & 10 have the
result of driving a woken up task (when wake_wide is
true) to the CPU core with the lowest blocked load,
even when there is an idle core the task could run on
right now.

With the whole series applied, I see a 1-2% regression
in CPU use due to that issue.

With only patches 1-8 applied, I see a 1% improvement in
CPU use for that same workload.

Given that it looks like select_idle_sibling and
find_idlest_group_cpu do roughly the same thing, I
wonder if it is enough to simply add an additional
test to find_idlest_group to have it return the
LLC sg, if it is called on the LLC sd on a single
socket system.

That way find_idlest_group_cpu can still find an
idle core like it does today.

Does that seem like a reasonable thing?

I can run tests with that :)

-- 
All Rights Reversed.




Re: [PATCH] mm/rmap.c: reuse mergeable anon_vma as parent when fork

2019-10-04 Thread Rik van Riel
On Sat, 2019-10-05 at 00:06 +0800, Wei Yang wrote:
> In function __anon_vma_prepare(), we will try to find anon_vma if it
> is
> possible to reuse it. While on fork, the logic is different.
> 
> Since commit 5beb49305251 ("mm: change anon_vma linking to fix
> multi-process server scalability issue"), function anon_vma_clone()
> tries to allocate new anon_vma for child process. But the logic here
> will allocate a new anon_vma for each vma, even when in the parent this
> vma is mergeable and shares the same anon_vma with its sibling. This
> may help the scalability issue, but it is not necessary to do so,
> especially after the interval tree is used.
> 
> Commit 7a3ef208e662 ("mm: prevent endless growth of anon_vma
> hierarchy")
> tries to reuse some anon_vmas by counting child anon_vmas and attached
> vmas. But for those mergeable anon_vmas, we can just reuse them without
> going through that logic.
> 
> After this change, a kernel build test shows 20% fewer anon_vma allocations.
> 
> Signed-off-by: Wei Yang 

Acked-by: Rik van Riel 

-- 
All Rights Reversed.




Re: [PATCH v3 04/10] sched/fair: rework load_balance

2019-09-29 Thread Rik van Riel
On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> 
> Also the load balance decisions have been consolidated in the 3
> functions
> below after removing the few bypasses and hacks of the current code:
> - update_sd_pick_busiest() select the busiest sched_group.
> - find_busiest_group() checks if there is an imbalance between local
> and
>   busiest group.
> - calculate_imbalance() decides what has to be moved.

I really like the direction this series is going.

However, I suppose I should run these patches for
a few days with some of our test workloads before
I send out an ack for this patch :)

-- 
All Rights Reversed.




Re: [PATCH v3 03/10] sched/fair: remove meaningless imbalance calculation

2019-09-27 Thread Rik van Riel
On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> clean up load_balance and remove meaningless calculation and fields
> before
> adding new algorithm.
> 
> Signed-off-by: Vincent Guittot 

Yay.

Acked-by: Rik van Riel 

-- 
All Rights Reversed.




Re: [PATCH v3 02/10] sched/fair: rename sum_nr_running to sum_h_nr_running

2019-09-27 Thread Rik van Riel
On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> Rename sum_nr_running to sum_h_nr_running because it effectively
> tracks
> cfs->h_nr_running so we can use sum_nr_running to track rq->nr_running
> when needed.
> 
> There is no functional changes.
> 
> Signed-off-by: Vincent Guittot 
> 
Acked-by: Rik van Riel 

-- 
All Rights Reversed.




Re: [PATCH v3 01/10] sched/fair: clean up asym packing

2019-09-27 Thread Rik van Riel
On Thu, 2019-09-19 at 09:33 +0200, Vincent Guittot wrote:
> Clean up asym packing to follow the default load balance behavior:
> - classify the group by creating a group_asym_packing field.
> - calculate the imbalance in calculate_imbalance() instead of
> bypassing it.
> 
> We don't need to test the same conditions twice anymore to detect asym
> packing
> and we consolidate the calculation of imbalance in
> calculate_imbalance().
> 
> There is no functional changes.
> 
> Signed-off-by: Vincent Guittot 

Acked-by: Rik van Riel 

-- 
All Rights Reversed.




[PATCH 06/15] sched,cfs: use explicit cfs_rq of parent se helper

2019-09-06 Thread Rik van Riel
Use an explicit "cfs_rq of parent sched_entity" helper in a few
strategic places, where cfs_rq_of(se) may no longer point at the
right runqueue once we flatten the hierarchical cgroup runqueues.

No functional change.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 17 +
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c1593611291..dfc116cb8750 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -276,6 +276,15 @@ static inline struct cfs_rq *group_cfs_rq(struct 
sched_entity *grp)
return grp->my_q;
 }
 
+/* runqueue owned by the parent entity; the root cfs_rq for a top level se */
+static inline struct cfs_rq *group_cfs_rq_of_parent(struct sched_entity *se)
+{
+   if (se->parent)
+   return group_cfs_rq(se->parent);
+
+   return cfs_rq_of(se);
+}
+
 static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
struct rq *rq = rq_of(cfs_rq);
@@ -3319,7 +3328,7 @@ static inline int propagate_entity_load_avg(struct 
sched_entity *se)
 
gcfs_rq->propagate = 0;
 
-   cfs_rq = cfs_rq_of(se);
+   cfs_rq = group_cfs_rq_of_parent(se);
 
add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
 
@@ -7796,7 +7805,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 
WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
-   cfs_rq = cfs_rq_of(se);
+   cfs_rq = group_cfs_rq_of_parent(se);
WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
@@ -7819,7 +7828,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 
 static unsigned long task_se_h_load(struct sched_entity *se)
 {
-   struct cfs_rq *cfs_rq = cfs_rq_of(se);
+   struct cfs_rq *cfs_rq = group_cfs_rq_of_parent(se);
 
update_cfs_rq_h_load(cfs_rq);
return div64_ul(se->avg.load_avg * cfs_rq->h_load,
@@ -10166,7 +10175,7 @@ static void task_tick_fair(struct rq *rq, struct 
task_struct *curr, int queued)
struct sched_entity *se = &curr->se;
 
for_each_sched_entity(se) {
-   cfs_rq = cfs_rq_of(se);
+   cfs_rq = group_cfs_rq_of_parent(se);
entity_tick(cfs_rq, se, queued);
}
 
-- 
2.20.1



[PATCH 08/15] sched,fair: refactor enqueue/dequeue_entity

2019-09-06 Thread Rik van Riel
Refactor enqueue_entity, dequeue_entity, and update_load_avg, in order
to split out the things we still want to happen at every level in the
cgroup hierarchy with a flat runqueue from the things we only need to
happen once.

No functional changes.

Signed-off-by: Rik van Riel 
---
 kernel/sched/fair.c | 64 +
 1 file changed, 42 insertions(+), 22 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 00d774833df5..3e294e9c9994 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3518,7 +3518,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, 
struct sched_entity *s
 #define DO_ATTACH  0x4
 
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
+static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 {
u64 now = cfs_rq_clock_pelt(cfs_rq);
int decayed;
@@ -3547,6 +3547,8 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, 
struct sched_entity *s
 
} else if (decayed && (flags & UPDATE_TG))
update_tg_load_avg(cfs_rq, 0);
+
+   return decayed;
 }
 
 #ifndef CONFIG_64BIT
@@ -3763,9 +3765,10 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
 #define SKIP_AGE_LOAD  0x0
 #define DO_ATTACH  0x0
 
-static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int not_used1)
+static inline bool update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int not_used1)
 {
cfs_rq_util_change(cfs_rq, 0);
+   return false;
 }
 
 static inline void remove_entity_load_avg(struct sched_entity *se) {}
@@ -3888,6 +3891,24 @@ static inline void check_schedstat_required(void)
  * CPU and an up-to-date min_vruntime on the destination CPU.
  */
 
+static bool
+enqueue_entity_groups(struct cfs_rq *cfs_rq, struct sched_entity *se, int 
flags)
+{
+   /*
+* When enqueuing a sched_entity, we must:
+*   - Update loads to have both entity and cfs_rq synced with now.
+*   - Add its load to cfs_rq->runnable_avg
+*   - For group_entity, update its weight to reflect the new share of
+* its group cfs_rq
+*   - Add its new weight to cfs_rq->load.weight
+*/
+   if (!update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH))
+   return false;
+
+   update_cfs_group(se);
+   return true;
+}
+
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -3912,16 +3933,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
 
-   /*
-* When enqueuing a sched_entity, we must:
-*   - Update loads to have both entity and cfs_rq synced with now.
-*   - Add its load to cfs_rq->runnable_avg
-*   - For group_entity, update its weight to reflect the new share of
-* its group cfs_rq
-*   - Add its new weight to cfs_rq->load.weight
-*/
-   update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
-   update_cfs_group(se);
enqueue_runnable_load_avg(cfs_rq, se);
account_entity_enqueue(cfs_rq, se);
 
@@ -3988,14 +3999,9 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static void
-dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+static bool
+dequeue_entity_groups(struct cfs_rq *cfs_rq, struct sched_entity *se, int 
flags)
 {
-   /*
-* Update run-time statistics of the 'current'.
-*/
-   update_curr(cfs_rq);
-
/*
 * When dequeuing a sched_entity, we must:
 *   - Update loads to have both entity and cfs_rq synced with now.
@@ -4004,7 +4010,21 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se, int flags)
 *   - For group entity, update its weight to reflect the new share
 * of its group cfs_rq.
 */
-   update_load_avg(cfs_rq, se, UPDATE_TG);
+   if (!update_load_avg(cfs_rq, se, UPDATE_TG))
+   return false;
+   update_cfs_group(se);
+
+   return true;
+}
+
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+   /*
+* Update run-time statistics of the 'current'.
+*/
+   update_curr(cfs_rq);
+
dequeue_runnable_load_avg(cfs_rq, se);
 
update_stats_dequeue(cfs_rq, se, flags);
@@ -4028,8 +4048,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
 
-   update_cfs_group(se);
-
/*
 * Now advance min_vruntime if @se was the entity h

[PATCH 05/15] sched,fair: remove cfs_rqs from leaf_cfs_rq_list bottom up

2019-09-06 Thread Rik van Riel
Reducing the overhead of the CPU controller is achieved by not walking
all the sched_entities every time a task is enqueued or dequeued.

One of the things being checked every single time is whether the cfs_rq
is on the rq->leaf_cfs_rq_list.

By only removing a cfs_rq from the list once it no longer has children
on the list, we can avoid walking the sched_entity hierarchy if the bottom
cfs_rq is on the list, once the runqueues have been flattened.

Signed-off-by: Rik van Riel 
Suggested-by: Vincent Guittot 
Acked-by: Vincent Guittot 
---
 kernel/sched/fair.c | 35 ++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af9458d8828f..1c1593611291 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -369,6 +369,39 @@ static inline void assert_list_leaf_cfs_rq(struct rq *rq)
SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
 }
 
+/*
+ * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
+ * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
+ * bottom-up, we only have to test whether the cfs_rq before us on the list
+ * is our child.
+ */
+static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
+{
+   struct cfs_rq *prev_cfs_rq;
+   struct list_head *prev;
+
+   prev = cfs_rq->leaf_cfs_rq_list.prev;
+   prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
+
+   return (prev_cfs_rq->tg->parent == cfs_rq->tg);
+}
+
+/*
+ * Remove a cfs_rq from the list if it has no children on the list.
+ * The scheduler iterates over the list regularly; if conditions for
+ * removal are still true, we'll get to this cfs_rq in the future.
+ */
+static inline void list_del_leaf_cfs_rq_bottom(struct cfs_rq *cfs_rq)
+{
+   if (!cfs_rq->on_list)
+   return;
+
+   if (child_cfs_rq_on_list(cfs_rq))
+   return;
+
+   list_del_leaf_cfs_rq(cfs_rq);
+}
+
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,\
@@ -7723,7 +7756,7 @@ static void update_blocked_averages(int cpu)
 * decayed cfs_rqs linger on the list.
 */
if (cfs_rq_is_decayed(cfs_rq))
-   list_del_leaf_cfs_rq(cfs_rq);
+   list_del_leaf_cfs_rq_bottom(cfs_rq);
 
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
-- 
2.20.1



[PATCH 01/15] sched: introduce task_se_h_load helper

2019-09-06 Thread Rik van Riel
Sometimes the hierarchical load of a sched_entity needs to be calculated.
Rename task_h_load to task_se_h_load, and directly pass a sched_entity to
that function.

Move the function declaration up above where it will be used later.

No functional changes.

Signed-off-by: Rik van Riel 
Reviewed-by: Josef Bacik 
---
 kernel/sched/fair.c | 28 ++--
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f5e528..812f7947aab2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -242,6 +242,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long 
weight, struct load_weight
 
 
 const struct sched_class fair_sched_class;
+static unsigned long task_se_h_load(struct sched_entity *se);
 
 /**
  * CFS operations on generic schedulable entities:
@@ -706,7 +707,6 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 #ifdef CONFIG_SMP
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
-static unsigned long task_h_load(struct task_struct *p);
 static unsigned long capacity_of(int cpu);
 
 /* Give new sched_entity start runnable values to heavy its load in infant 
time */
@@ -1668,7 +1668,7 @@ static void task_numa_compare(struct task_numa_env *env,
/*
 * In the overloaded case, try and keep the load balanced.
 */
-   load = task_h_load(env->p) - task_h_load(cur);
+   load = task_se_h_load(&env->p->se) - task_se_h_load(&cur->se);
if (!load)
goto assign;
 
@@ -1706,7 +1706,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
bool maymove = false;
int cpu;
 
-   load = task_h_load(env->p);
+   load = task_se_h_load(&env->p->se);
dst_load = env->dst_stats.load + load;
src_load = env->src_stats.load - load;
 
@@ -3389,7 +3389,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq 
*cfs_rq, long runnable_sum
  * avg. The immediate corollary is that all (fair) tasks must be attached, see
  * post_init_entity_util_avg().
  *
- * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ * cfs_rq->avg is used for task_se_h_load() and update_cfs_share() for example.
  *
  * Returns true if the load decayed or we removed load.
  *
@@ -3522,7 +3522,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, 
struct sched_entity *s
 
/*
 * Track task load average for carrying it to new CPU after migrated, 
and
-* track group sched_entity load average for task_h_load calc in 
migration
+* track group sched_entity load average for task_se_h_load calc in 
migration
 */
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cfs_rq, se);
@@ -3751,7 +3751,7 @@ static inline void update_misfit_status(struct 
task_struct *p, struct rq *rq)
return;
}
 
-   rq->misfit_task_load = task_h_load(p);
+   rq->misfit_task_load = task_se_h_load(&p->se);
 }
 
 #else /* CONFIG_SMP */
@@ -5739,7 +5739,7 @@ wake_affine_weight(struct sched_domain *sd, struct 
task_struct *p,
this_eff_load = target_load(this_cpu, sd->wake_idx);
 
if (sync) {
-   unsigned long current_load = task_h_load(current);
+   unsigned long current_load = task_se_h_load(&current->se);
 
if (current_load > this_eff_load)
return this_cpu;
@@ -5747,7 +5747,7 @@ wake_affine_weight(struct sched_domain *sd, struct 
task_struct *p,
this_eff_load -= current_load;
}
 
-   task_load = task_h_load(p);
+   task_load = task_se_h_load(&p->se);
 
this_eff_load += task_load;
if (sched_feat(WA_BIAS))
@@ -7600,7 +7600,7 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
 
-   load = task_h_load(p);
+   load = task_se_h_load(&p->se);
 
if (sched_feat(LB_MIN) && load < 16 && 
!env->sd->nr_balance_failed)
goto next;
@@ -7833,12 +7833,12 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
}
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long task_se_h_load(struct sched_entity *se)
 {
-   struct cfs_rq *cfs_rq = task_cfs_rq(p);
+   struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
update_cfs_rq_h_load(cfs_rq);
-   return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
+   return div64_ul(se->avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
 }
 #else
@@ -7865,9 +7865,9 @@ static inline void update_blocked_averages(int cpu)
rq_unlock_irqrestore(rq, &rf);
 }
 
-static unsigned long task_h_load(struct task_
