Re: [RFC PATCH 6/7] execmem: add support for cache of large ROX pages

2024-04-16 Thread Peter Zijlstra
On Mon, Apr 15, 2024 at 08:00:26PM +0300, Mike Rapoport wrote:
> On Mon, Apr 15, 2024 at 12:47:50PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 11, 2024 at 07:05:25PM +0300, Mike Rapoport wrote:
> > 
> > > To populate the cache, a writable large page is allocated from vmalloc 
> > > with
> > > VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
> > > ROX.
> > 
> > > +static void execmem_invalidate(void *ptr, size_t size, bool writable)
> > > +{
> > > + if (execmem_info->invalidate)
> > > + execmem_info->invalidate(ptr, size, writable);
> > > + else
> > > + memset(ptr, 0, size);
> > > +}
> > 
> > +static void execmem_invalidate(void *ptr, size_t size, bool writeable)
> > +{
> > +   /* fill memory with INT3 instructions */
> > +   if (writeable)
> > +   memset(ptr, 0xcc, size);
> > +   else
> > +   text_poke_set(ptr, 0xcc, size);
> > +}
> > 
> > Thing is, 0xcc (aka INT3_INSN_OPCODE) is not an invalid instruction.
> > It raises #BP not #UD.
> 
> Do you mean that _invalidate is a poor name choice or that it's necessary
> to use an instruction that raises #UD?

Poor naming, mostly. The #BP handler will still scream bloody murder if
the site is otherwise unclaimed.

It just isn't an invalid instruction.
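
To spell out the difference (illustrative sketch, not part of the series;
the helper name below is made up): a single-byte INT3 fill is what you want
for padding, because every byte decodes to INT3 and traps via #BP, whereas
UD2 -- the instruction that actually raises #UD -- is two bytes (0x0f 0x0b)
and cannot be produced by a plain byte fill. So the fix is really just in
the naming:

/*
 * Sketch only: same behaviour as the patch, renamed to say what it
 * does -- fill with trapping instructions, not "invalid" ones.
 */
static void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable)
{
	/* INT3_INSN_OPCODE == 0xcc; one byte, so any offset decodes to INT3 */
	if (writeable)
		memset(ptr, INT3_INSN_OPCODE, size);
	else
		text_poke_set(ptr, INT3_INSN_OPCODE, size);
}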



Re: [RFC PATCH 6/7] execmem: add support for cache of large ROX pages

2024-04-15 Thread Peter Zijlstra
On Thu, Apr 11, 2024 at 07:05:25PM +0300, Mike Rapoport wrote:

> To populate the cache, a writable large page is allocated from vmalloc with
> VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as
> ROX.

> +static void execmem_invalidate(void *ptr, size_t size, bool writable)
> +{
> + if (execmem_info->invalidate)
> + execmem_info->invalidate(ptr, size, writable);
> + else
> + memset(ptr, 0, size);
> +}

+static void execmem_invalidate(void *ptr, size_t size, bool writeable)
+{
+   /* fill memory with INT3 instructions */
+   if (writeable)
+   memset(ptr, 0xcc, size);
+   else
+   text_poke_set(ptr, 0xcc, size);
+}

Thing is, 0xcc (aka INT3_INSN_OPCODE) is not an invalid instruction.
It raises #BP not #UD.



Re: [RFC PATCH 5/7] x86/module: prepare module loading for ROX allocations of text

2024-04-15 Thread Peter Zijlstra
On Thu, Apr 11, 2024 at 07:05:24PM +0300, Mike Rapoport wrote:
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 45a280f2161c..b4d6868df573 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c

> @@ -504,17 +513,17 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
>    *   patch if feature is *NOT* present.
>    */
>   if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) {
> - optimize_nops_inplace(instr, a->instrlen);
> + optimize_nops_inplace(wr_instr, a->instrlen);
>   continue;
>   }
>  
> - DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d) flags: 0x%x",
> + DPRINTK(ALT, "feat: %d*32+%d, old: (%px (%px) len: %d), repl: (%px (%px), len: %d) flags: 0x%x",
>   a->cpuid >> 5,
>   a->cpuid & 0x1f,
> - instr, instr, a->instrlen,
> - replacement, a->replacementlen, a->flags);
> + instr, wr_instr, a->instrlen,
> + replacement, wr_replacement, a->replacementlen, a->flags);

I think this, and

>  
> - memcpy(insn_buff, replacement, a->replacementlen);
> + memcpy(insn_buff, wr_replacement, a->replacementlen);
>   insn_buff_sz = a->replacementlen;
>  
>   if (a->flags & ALT_FLAG_DIRECT_CALL) {
> @@ -528,11 +537,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start,
>  
>   apply_relocation(insn_buff, a->instrlen, instr, replacement, a->replacementlen);
>  
> - DUMP_BYTES(ALT, instr, a->instrlen, "%px:   old_insn: ", instr);
> + DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px:   old_insn: ", instr);

this, want to remain as is. 

>   DUMP_BYTES(ALT, replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
>   DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
>  
> - text_poke_early(instr, insn_buff, insn_buff_sz);
> + text_poke_early(wr_instr, insn_buff, insn_buff_sz);
>   }
>  
>   kasan_enable_current();

The rationale being that we then print an address that can be correlated
to the kernel image (provided one either kills kaslr or adjusts for it).



Re: [PATCH v4 07/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem

2024-04-15 Thread Peter Zijlstra
On Thu, Apr 11, 2024 at 07:00:43PM +0300, Mike Rapoport wrote:

> +static struct execmem_info execmem_info __ro_after_init = {
> + .ranges = {
> + [EXECMEM_DEFAULT] = {
> + .flags = EXECMEM_KASAN_SHADOW,
> + .alignment = MODULE_ALIGN,
> + },
> + },
> +};
>  
> +struct execmem_info __init *execmem_arch_setup(void)
>  {
> + unsigned long start, offset = 0;
>  
> + if (kaslr_enabled())
> + offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;
>  
> + start = MODULES_VADDR + offset;
> + execmem_info.ranges[EXECMEM_DEFAULT].start = start;
> + execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END;
> + execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
>  
> + return &execmem_info;
>  }

struct execmem_info __init *execmem_arch_setup(void)
{
unsigned long offset = 0;

if (kaslr_enabled())
offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE;

execmem_info = (struct execmem_info){
.ranges = {
[EXECMEM_DEFAULT] = {
.start = MODULES_VADDR + offset,
.end   = MODULES_END,
.pgprot= PAGE_KERNEL,
.flags = EXECMEM_KASAN_SHADOW,
.alignment = 1,
},
},
};

return &execmem_info;
}



Re: [PATCH v4 06/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem

2024-04-15 Thread Peter Zijlstra
On Thu, Apr 11, 2024 at 07:00:42PM +0300, Mike Rapoport wrote:
> +static struct execmem_info execmem_info __ro_after_init = {
> + .ranges = {
> + [EXECMEM_DEFAULT] = {
> + .start = MODULES_VADDR,
> + .end = MODULES_END,
> + .alignment = 1,
> + },
> + },
> +};
> +
> +struct execmem_info __init *execmem_arch_setup(void)
>  {
> + execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
> +
> + return &execmem_info;
>  }

> +static struct execmem_info execmem_info __ro_after_init = {
> + .ranges = {
> + [EXECMEM_DEFAULT] = {
> + .start = MODULES_VADDR,
> + .end = MODULES_END,
> + .pgprot = PAGE_KERNEL_EXEC,
> + .alignment = 1,
> + },
> + },
> +};
> +
> +struct execmem_info __init *execmem_arch_setup(void)
>  {
> + return &execmem_info;
>  }

> +static struct execmem_info execmem_info __ro_after_init = {
> + .ranges = {
> + [EXECMEM_DEFAULT] = {
> + .pgprot = PAGE_KERNEL_RWX,
> + .alignment = 1,
> + },
> + },
> +};
> +
> +struct execmem_info __init *execmem_arch_setup(void)
>  {
> + execmem_info.ranges[EXECMEM_DEFAULT].start = VMALLOC_START;
> + execmem_info.ranges[EXECMEM_DEFAULT].end = VMALLOC_END;
> +
> + return &execmem_info;
>  }

> +static struct execmem_info execmem_info __ro_after_init = {
> + .ranges = {
> + [EXECMEM_DEFAULT] = {
> + .pgprot = PAGE_KERNEL,
> + .alignment = 1,
> + },
> + },
> +};
> +
> +struct execmem_info __init *execmem_arch_setup(void)
>  {
> + execmem_info.ranges[EXECMEM_DEFAULT].start = MODULES_VADDR;
> + execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END;
> +
> + return &execmem_info;
>  }

> +static struct execmem_info execmem_info __ro_after_init = {
> + .ranges = {
> + [EXECMEM_DEFAULT] = {
>  #ifdef CONFIG_SPARC64
> + .start = MODULES_VADDR,
> + .end = MODULES_END,
>  #else
> + .start = VMALLOC_START,
> + .end = VMALLOC_END,
> +#endif
> + .alignment = 1,
> + },
> + },
> +};
> +
> +struct execmem_info __init *execmem_arch_setup(void)
>  {
> + execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL;
>  
> + return &execmem_info;
>  }

I'm amazed by the weird and inconsistent breakup of initializations.

What exactly is wrong with something like:

static struct execmem_info execmem_info __ro_after_init;

struct execmem_info __init *execmem_arch_setup(void)
{
execmem_info = (struct execmem_info){
.ranges = {
[EXECMEM_DEFAULT] = {
.start  = MODULES_VADDR,
.end= MODULES_END,
.pgprot = PAGE_KERNEL,
.alignment = 1,
},
},
};
return &execmem_info;
}




Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()

2024-04-15 Thread Peter Zijlstra
On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote:
> +/**
> + * enum execmem_type - types of executable memory ranges
> + *
> + * There are several subsystems that allocate executable memory.
> + * Architectures define different restrictions on placement,
> + * permissions, alignment and other parameters for memory that can be used
> + * by these subsystems.
> + * Types in this enum identify subsystems that allocate executable memory
> + * and let architectures define parameters for ranges suitable for
> + * allocations by each subsystem.
> + *
> + * @EXECMEM_DEFAULT: default parameters that would be used for types that
> + * are not explicitly defined.
> + * @EXECMEM_MODULE_TEXT: parameters for module text sections
> + * @EXECMEM_KPROBES: parameters for kprobes
> + * @EXECMEM_FTRACE: parameters for ftrace
> + * @EXECMEM_BPF: parameters for BPF
> + * @EXECMEM_TYPE_MAX:
> + */
> +enum execmem_type {
> + EXECMEM_DEFAULT,
> + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT,
> + EXECMEM_KPROBES,
> + EXECMEM_FTRACE,
> + EXECMEM_BPF,
> + EXECMEM_TYPE_MAX,
> +};

Can we please get a break-down of how all these types are actually
different from one another?

I'm thinking some platforms have a tiny immediate space (arm64 comes to
mind) and have less strict placement constraints for some of them?
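
To make that concrete, here is a purely hypothetical arch setup (not taken
from any patch in the series) where the distinction would actually matter:
module text has to sit in a small window that relative branches can reach,
while BPF/kprobes output can go anywhere in vmalloc space:

static struct execmem_info execmem_info __ro_after_init;

struct execmem_info __init *execmem_arch_setup(void)
{
	execmem_info = (struct execmem_info){
		.ranges = {
			[EXECMEM_DEFAULT] = {		/* modules: branch-reachable */
				.start	   = MODULES_VADDR,
				.end	   = MODULES_END,
				.pgprot	   = PAGE_KERNEL,
				.alignment = 1,
			},
			[EXECMEM_BPF] = {		/* JIT output: anywhere */
				.start	   = VMALLOC_START,
				.end	   = VMALLOC_END,
				.pgprot	   = PAGE_KERNEL,
				.alignment = 1,
			},
		},
	};

	return &execmem_info;
}

If no architecture actually needs per-type ranges like this, the extra enum
values are just noise -- which is rather the point of the question.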



Re: [RFC PATCH 0/4] perf: Correlating user process data to samples

2024-04-13 Thread Peter Zijlstra
On Fri, Apr 12, 2024 at 09:37:24AM -0700, Beau Belgrave wrote:

> > Anyway, since we typically run stuff from NMI context, accessing user
> > data is 'interesting'. As such I would really like to make this work
> > depend on the call-graph rework that pushes all the user access bits
> > into return-to-user.
> 
> Cool, I assume that's the SFRAME work? Are there pointers to work I
> could look at and think about what a rebase looks like? Or do you have
> someone in mind I should work with for this?

I've been offline for a little while and still need to catch up with
things myself.

Josh was working on that when I dropped off IIRC, I'm not entirely sure
where things are at currently (and there is no way I can ever hope to
process the backlog).

Anybody know where we are with that?



Re: [RFC PATCH 0/4] perf: Correlating user process data to samples

2024-04-12 Thread Peter Zijlstra


On Fri, Apr 12, 2024 at 12:17:28AM +, Beau Belgrave wrote:

> An ideal flow would look like this:
> User Task              Profile
> do_work();             sample() -> IP + No activity
> ...
> set_activity(123);
> ...
> do_work();             sample() -> IP + activity (123)
> ...
> set_activity(124);
> ...
> do_work();             sample() -> IP + activity (124)

This, start with this, because until I saw this, I was utterly confused
as to what the heck you were on about.

I started by thinking we already have TID in samples so you can already
associate back to user processes and got increasingly confused the
further I went.

What you seem to want to do however is have some task-state included so
you can see what the thread is doing.

Anyway, since we typically run stuff from NMI context, accessing user
data is 'interesting'. As such I would really like to make this work
depend on the call-graph rework that pushes all the user access bits
into return-to-user.
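
(For completeness, a rough untested sketch of the shape that deferral takes;
none of these names exist anywhere, they are purely illustrative. The PMI
itself never touches user memory: it bounces through irq_work, as perf
already does for wakeups, and the irq_work queues a task_work that does the
copy_from_user() on the way back to userspace.)

struct activity_state {
	struct callback_head	twork;	/* preallocated, e.g. per event */
	u64 __user		*uptr;	/* user address holding the activity id */
};

static void activity_fetch_twork(struct callback_head *head)
{
	struct activity_state *as = container_of(head, struct activity_state, twork);
	u64 id;

	/* process context, about to return to user: user access is fine */
	if (copy_from_user(&id, as->uptr, sizeof(id)))
		return;
	/* emit @id into the side-band / sample data here */
}

/* called from irq_work context, not from the NMI itself */
static void activity_fetch_queue(struct activity_state *as)
{
	init_task_work(&as->twork, activity_fetch_twork);
	if (task_work_add(current, &as->twork, TWA_RESUME))
		return;	/* task is exiting; drop the request */
}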



Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2023-11-22 Thread Peter Zijlstra
On Tue, Nov 21, 2023 at 02:17:21PM +0100, Tobias Huschle wrote:

> We applied both suggested patch options and ran the test again, so 
> 
> sched/eevdf: Fix vruntime adjustment on reweight
> sched/fair: Update min_vruntime for reweight_entity() correctly
> 
> and
> 
> sched/eevdf: Delay dequeue
> 
> Unfortunately, both variants do NOT fix the problem.
> The regression remains unchanged.

Thanks for testing.

> I will continue getting myself familiar with how cgroups are scheduled to dig 
> deeper here. If there are any other ideas, I'd be happy to use them as a 
> starting point for further analysis.
> 
> Would additional traces still be of interest? If so, I would be glad to
> provide them.

So, since it got bisected to the placement logic, but is a cgroup
related issue, I was thinking that 'Delay dequeue' might not cut it,
that only works for tasks, not the internal entities.

The below should also work for internal entities, but last time I poked
around with it I had some regressions elsewhere -- you know, fix one
type of situation, wreck another.

But still, could you please give it a go -- it applies cleanly to linus'
master and -rc2.
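
(Both knobs the patch adds are regular sched features, so they can also be
flipped at runtime through /sys/kernel/debug/sched/features --
NO_PLACE_SLEEPER / NO_GENTLE_SLEEPER -- to compare with and without on the
same boot.)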

---
Subject: sched/eevdf: Revenge of the Sith^WSleepers

For tasks that have received excess service (negative lag) allow them to
gain parity (zero lag) by sleeping.

Signed-off-by: Peter Zijlstra (Intel) 
---
 kernel/sched/fair.c     | 36 ++++++++++++++++++++++++++++++++++++
 kernel/sched/features.h |  6 ++++++
 2 files changed, 42 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d7a3c63a2171..b975e4b07a68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5110,6 +5110,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
 
 #endif /* CONFIG_SMP */
 
+static inline u64
+entity_vlag_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
+{
+   u64 now, vdelta;
+   s64 delta;
+
+   if (!(flags & ENQUEUE_WAKEUP))
+   return se->vlag;
+
+   if (flags & ENQUEUE_MIGRATED)
+   return 0;
+
+   now = rq_clock_task(rq_of(cfs_rq));
+   delta = now - se->exec_start;
+   if (delta < 0)
+   return se->vlag;
+
+   if (sched_feat(GENTLE_SLEEPER))
+   delta /= 2;
+
+   vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load);
+   if (vdelta < -se->vlag)
+   return se->vlag + vdelta;
+
+   return 0;
+}
+
 static void
 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
@@ -5133,6 +5160,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
lag = se->vlag;
 
+   /*
+* Allow tasks that have received too much service (negative
+* lag) to (re)gain parity (zero lag) by sleeping for the
+* equivalent duration. This ensures they will be readily
+* eligible.
+*/
+   if (sched_feat(PLACE_SLEEPER) && lag < 0)
+   lag = entity_vlag_sleeper(cfs_rq, se, flags);
+
/*
 * If we want to place a task and preserve lag, we have to
 * consider the effect of the new entity on the weighted
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3ddf84de430..722282d3ed07 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -7,6 +7,12 @@
 SCHED_FEAT(PLACE_LAG, true)
 SCHED_FEAT(PLACE_DEADLINE_INITIAL, true)
 SCHED_FEAT(RUN_TO_PARITY, true)
+/*
+ * Let sleepers earn back lag, but not more than 0-lag. GENTLE_SLEEPERS earn at
+ * half the speed.
+ */
+SCHED_FEAT(PLACE_SLEEPER, true)
+SCHED_FEAT(GENTLE_SLEEPER, true)
 
 /*
  * Prefer to schedule the task we woke last (assuming it failed



Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2023-11-20 Thread Peter Zijlstra
On Sat, Nov 18, 2023 at 01:14:32PM +0800, Abel Wu wrote:

> Hi Peter, I'm a little confused here. As we adopt placement strategy #1
> when PLACE_LAG is enabled, the lag of that entity needs to be preserved.
> Given that the weight doesn't change, we have:
> 
>   vl' = vl
> 
> But in fact it is scaled on placement:
> 
>   vl' = vl * W/(W + w)

(W+w)/W

> 
> Is this intended? 

The scaling, yes that's intended and the comment explains why. So now
you have me confused too :-)

Specifically, I want the lag after placement to be equal to the lag we
come in with. Since placement will affect avg_vruntime (adding one
element to the average changes the average etc..) the placement also
affects the lag as measured after placement.

Or rather, if you enqueue and dequeue, I want the lag to be preserved.
If you do not take placement into consideration, lag will dissipate real
quick.

> And to illustrate my understanding of strategy #1:

> @@ -5162,41 +5165,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
>* vl_i is given by:
>*
>*   V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i)
> -  *  = (W*V + w_i*(V - vl_i)) / (W + w_i)
> -  *  = (W*V + w_i*V - w_i*vl_i) / (W + w_i)
> -  *  = (V*(W + w_i) - w_i*l) / (W + w_i)
> -  *  = V - w_i*vl_i / (W + w_i)
> -  *
> -  * And the actual lag after adding an entity with vl_i is:
> -  *
> -  *   vl'_i = V' - v_i
> -  * = V - w_i*vl_i / (W + w_i) - (V - vl_i)
> -  * = vl_i - w_i*vl_i / (W + w_i)
> -  *
> -  * Which is strictly less than vl_i. So in order to preserve lag
> -  * we should inflate the lag before placement such that the
> -  * effective lag after placement comes out right.
> -  *
> -  * As such, invert the above relation for vl'_i to get the vl_i
> -  * we need to use such that the lag after placement is the lag
> -  * we computed before dequeue.
> +  *  = (W*V + w_i*(V' - vl_i)) / (W + w_i)
> +  *  = V - w_i*vl_i / W
>*
> -  *   vl'_i = vl_i - w_i*vl_i / (W + w_i)
> -  * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i)
> -  *
> -  *   (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i
> -  *   = W*vl_i
> -  *
> -  *   vl_i = (W + w_i)*vl'_i / W
>*/
>   load = cfs_rq->avg_load;
>   if (curr && curr->on_rq)
>   load += scale_load_down(curr->load.weight);
> -
> - lag *= load + scale_load_down(se->load.weight);
>   if (WARN_ON_ONCE(!load))
>   load = 1;
> - lag = div_s64(lag, load);
> +
> + vruntime -= div_s64(lag * scale_load_down(se->load.weight), load);
>   }
>   se->vruntime = vruntime - lag;


So you're proposing we do:

v = V - (lag * w) / (W + w) - lag

?

That can be written like:

v = V - (lag * w) / (W+w) - (lag * (W+w)) / (W+w)
  = V - (lag * (W+w) + lag * w) / (W+w)
  = V - (lag * (W+2w)) / (W+w)

And that turns into a mess AFAICT.


Let me repeat my earlier argument. Suppose v,w,l are the new element.
V,W are the old avg_vruntime and sum-weight.

Then: V = V*W / W, and by extension: V' = (V*W + v*w) / (W + w).

The new lag, after placement: 

l' = V' - v = (V*W + v*w) / (W+w) - v
= (V*W + v*w) / (W+w) - v * (W+w) / (W+w)
= (V*W + v*w -v*W - v*w) / (W+w)
= (V*W - v*W) / (W+w)
= W*(V-v) / (W+w)
= W/(W+w) * (V-v)

Substitute: v = V - (W+w)/W * l, my scaling thing, to obtain:

l' = W/(W+w) * (V - (V - (W+w)/W * l))
   = W/(W+w) * (V - V + (W+w)/W * l)
   = W/(W+w) * (W+w)/W * l
   = l

So by scaling, we've preserved lag across placement.
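
To put numbers on it (pure illustration): take W = 2, w = 1, V = 100 and an
incoming lag l = 6. Scaling gives v = V - (W+w)/W * l = 100 - 9 = 91, then
V' = (200 + 91)/3 = 97 and l' = V' - v = 6, exactly the lag we came in with.
Without the scaling, v = 94, V' = (200 + 94)/3 = 98 and l' = 4, i.e. a third
of the lag evaporates on every dequeue/enqueue cycle.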

That make sense?



Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2023-11-17 Thread Peter Zijlstra
On Fri, Nov 17, 2023 at 01:24:21PM +0100, Tobias Huschle wrote:
> On Fri, Nov 17, 2023 at 10:23:18AM +0100, Peter Zijlstra wrote:

> > kworkers are typically not in cgroups and are part of the root cgroup,
> > but what's a vhost and where does it live?
> 
> The qemu instances of the two KVM guests are placed into cgroups.
> The vhosts run within the context of these qemu instances (4 threads per 
> guest).
> So they are also put into those cgroups.
> 
> I'll answer the other questions you brought up as well, but I guess that one 
> is most critical: 
> 
> > 
> > After confirming both tasks are indeed in the same cgroup of course,
> > because if they're not, vruntime will be meaningless to compare and we
> > should look elsewhere.
> 
> In that case we probably have to go with elsewhere ... which is good to know.

Ah, so if this is a cgroup issue, it might be worth trying this patch
that we have in tip/sched/urgent.

I'll try and read the rest of the email a little later, gotta run
errands first.

---

commit eab03c23c2a162085b13200d7942fc5a00b5ccc8
Author: Abel Wu 
Date:   Tue Nov 7 17:05:07 2023 +0800

sched/eevdf: Fix vruntime adjustment on reweight

vruntime of the (on_rq && !0-lag) entity needs to be adjusted when
it gets re-weighted, and the calculations can be simplified based
on the fact that re-weight won't change the w-average of all the
entities. Please check the proofs in comments.

But adjusting vruntime can also cause position change in RB-tree
hence require re-queue to fix up which might be costly. This might
be avoided by deferring adjustment to the time the entity actually
leaves tree (dequeue/pick), but that will negatively affect task
selection and probably not good enough either.

Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling 
policy")
Signed-off-by: Abel Wu 
Signed-off-by: Peter Zijlstra (Intel) 
Link: 
https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.a...@bytedance.com

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2048138ce54b..025d90925bf6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3666,41 +3666,140 @@ static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
 #endif
 
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+  unsigned long weight)
+{
+   unsigned long old_weight = se->load.weight;
+   u64 avruntime = avg_vruntime(cfs_rq);
+   s64 vlag, vslice;
+
+   /*
+* VRUNTIME
+* 
+*
+* COROLLARY #1: The virtual runtime of the entity needs to be
+* adjusted if re-weight at !0-lag point.
+*
+* Proof: For contradiction assume this is not true, so we can
+* re-weight without changing vruntime at !0-lag point.
+*
+* Weight   VRuntime   Avg-VRuntime
+* beforew  vV
+*  afterw' v'   V'
+*
+* Since lag needs to be preserved through re-weight:
+*
+*  lag = (V - v)*w = (V'- v')*w', where v = v'
+*  ==> V' = (V - v)*w/w' + v   (1)
+*
+* Let W be the total weight of the entities before reweight,
+* since V' is the new weighted average of entities:
+*
+*  V' = (WV + w'v - wv) / (W + w' - w) (2)
+*
+* by using (1) & (2) we obtain:
+*
+*  (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+*  ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+*  ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+*  ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+*
+* Since we are doing at !0-lag point which means V != v, we
+* can simplify (3):
+*
+*  ==> W / (W + w' - w) = w / w'
+*  ==> Ww' = Ww + ww' - ww
+*  ==> W * (w' - w) = w * (w' - w)
+*  ==> W = w   (re-weight indicates w' != w)
+*
+* So the cfs_rq contains only one entity, hence vruntime of
+* the entity @v should always equal to the cfs_rq's weighted
+* average vruntime @V, which means we will always re-weight
+* at 0-lag point, thus breach assumption. Proof completed.
+*
+*
+* COROLLARY #2: Re-weight does NOT affect weighted average
+* vruntime of all the entities.
+*
+* Proof: According to corollary #1, Eq. (1) should be:
+*
+*  (V - v)*w = (V' - v')*w'
+*  ==>v' = V' - (V - v)*w/w'   (4)
+*
+* According to the weighted average formula, we have:
+*
+*  V' = (WV - wv + w'v') / (W - w + w')
+

Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2023-11-17 Thread Peter Zijlstra
On Fri, Nov 17, 2023 at 10:23:18AM +0100, Peter Zijlstra wrote:
> Now, IF this is the problem, I might have a patch that helps:
> 
>   
> https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf=119feac4fcc77001cd9bf199b25f08d232289a5c

And then I turn around and wipe the repository invalidating that link.

The sched/eevdf branch should be re-instated (with different SHA1), but
I'll include the patch below for reference.

---
Subject: sched/eevdf: Delay dequeue
From: Peter Zijlstra 
Date: Fri Sep 15 00:48:45 CEST 2023

For tasks that have negative-lag (have received 'excess' service), delay the
dequeue and keep them in the runnable tree until they're eligible again. Or
rather, keep them until they're selected again, since finding their eligibility
crossover point is expensive.

The effect is a bit like sleeper bonus, the tasks keep contending for service
until either they get a wakeup or until they're selected again and are really
dequeued.

This means that any actual dequeue happens with positive lag (serviced owed)
and are more readily ran when woken next.

Signed-off-by: Peter Zijlstra (Intel) 
---
 include/linux/sched.h   |1 
 kernel/sched/core.c |   88 +++-
 kernel/sched/fair.c |   11 ++
 kernel/sched/features.h |   11 ++
 kernel/sched/sched.h|3 +
 5 files changed, 97 insertions(+), 17 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -916,6 +916,7 @@ struct task_struct {
unsignedsched_reset_on_fork:1;
unsignedsched_contributes_to_load:1;
unsignedsched_migrated:1;
+   unsignedsched_delayed:1;
 
/* Force alignment to the next boundary: */
unsigned:0;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3856,12 +3856,23 @@ static int ttwu_runnable(struct task_str
 
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+   update_rq_clock(rq);
+   if (unlikely(p->sched_delayed)) {
+   p->sched_delayed = 0;
+   /* mustn't run a delayed task */
+   WARN_ON_ONCE(task_on_cpu(rq, p));
+   if (sched_feat(GENTLE_DELAY)) {
+   dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+   if (p->se.vlag > 0)
+   p->se.vlag = 0;
+   enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
+   }
+   }
if (!task_on_cpu(rq, p)) {
/*
 * When on_rq && !on_cpu the task is preempted, see if
 * it should preempt the task that is current now.
 */
-   update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
@@ -6565,6 +6576,24 @@ pick_next_task(struct rq *rq, struct tas
 # define SM_MASK_PREEMPT   SM_PREEMPT
 #endif
 
+static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state)
+{
+   p->sched_contributes_to_load =
+   (prev_state & TASK_UNINTERRUPTIBLE) &&
+   !(prev_state & TASK_NOLOAD) &&
+   !(prev_state & TASK_FROZEN);
+
+   if (p->sched_contributes_to_load)
+   rq->nr_uninterruptible++;
+
+   deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
+
+   if (p->in_iowait) {
+   atomic_inc(&rq->nr_iowait);
+   delayacct_blkio_start();
+   }
+}
+
 /*
  * __schedule() is the main scheduler function.
  *
@@ -6650,6 +6679,8 @@ static void __sched notrace __schedule(u
 
switch_count = &prev->nivcsw;
 
+   WARN_ON_ONCE(prev->sched_delayed);
+
/*
 * We must load prev->state once (task_struct::state is volatile), such
 * that we form a control dependency vs deactivate_task() below.
@@ -6659,14 +6690,6 @@ static void __sched notrace __schedule(u
if (signal_pending_state(prev_state, prev)) {
WRITE_ONCE(prev->__state, TASK_RUNNING);
} else {
-   prev->sched_contributes_to_load =
-   (prev_state & TASK_UNINTERRUPTIBLE) &&
-   !(prev_state & TASK_NOLOAD) &&
-   !(prev_state & TASK_FROZEN);
-
-   if (prev->sched_contributes_to_load)
-   rq->nr_uninterruptible++;
-
/*
 * __schedule() ttwu()

Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)

2023-11-17 Thread Peter Zijlstra


Your email is pretty badly mangled by wrapping, please try and
reconfigure your MUA, esp. the trace and debug output is unreadable.

On Thu, Nov 16, 2023 at 07:58:18PM +0100, Tobias Huschle wrote:

> The base scenario are two KVM guests running on an s390 LPAR. One guest
> hosts the uperf server, one the uperf client.
> With EEVDF we observe a regression of ~50% for a strburst test.
> For a more detailed description of the setup see the section TEST SUMMARY at
> the bottom.

Well, that's not good :/

> Short summary:
> The mentioned kworker has been scheduled to CPU 14 before the tracing was
> enabled.
> A vhost process is migrated onto CPU 14.
> The vruntimes of kworker and vhost differ significantly (86642125805 vs
> 4242563284 -> factor 20)

So bear with me, I know absolutely nothing about virt stuff. I suspect
there's cgroups involved because shiny or something.

kworkers are typically not in cgroups and are part of the root cgroup,
but what's a vhost and where does it live?

Also, what are their weights / nice values?

> The vhost process wants to wake up the kworker, therefore the kworker is
> placed onto the runqueue again and set to runnable.
> The vhost process continues to execute, waking up other vhost processes on
> other CPUs.
> 
> So far this behavior is not different to what we see on pre-EEVDF kernels.
> 
> On timestamp 576.162767, the vhost process triggers the last wake up of
> another vhost on another CPU.
> Until timestamp 576.171155, we see no other activity. Now, the vhost process
> ends its time slice.
> Then, vhost gets re-assigned new time slices 4 times and gets then migrated
> off to CPU 15.

So why does this vhost stay on the CPU if it doesn't have anything to
do? (I've not tried to make sense of the trace, that's just too
painful).

> This does not occur with older kernels.
> The kworker has to wait for the migration to happen in order to be able to
> execute again.
> This is due to the fact, that the vruntime of the kworker is significantly
> larger than the one of vhost.

That's, weird. Can you add a trace_printk() to update_entity_lag() and
have it print out the lag, limit and vlag (post clamping) values? And
also in place_entity() for the reverse process, lag pre and post scaling
or something.
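
Roughly like so (untested sketch against v6.6, just to show where the values
are wanted):

static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	s64 lag, limit;

	SCHED_WARN_ON(!se->on_rq);
	lag = avg_vruntime(cfs_rq) - se->vruntime;

	limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
	se->vlag = clamp(lag, -limit, limit);
	trace_printk("se=%px lag=%lld limit=%lld vlag=%lld\n",
		     se, lag, limit, se->vlag);
}

plus a similar trace_printk() on the lag value in place_entity(), before and
after the load scaling.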

After confirming both tasks are indeed in the same cgroup ofcourse,
because if they're not, vruntime will be meaningless to compare and we
should look elsewhere.

Also, what HZ and what preemption mode are you running? If kworker is
somehow vastly over-shooting its slice -- keeps running way past the
avg_vruntime, then it will build up a giant lag and you get what you
describe, next time it wakes up it gets placed far to the right (exactly
where it was when it 'finally' went to sleep, relatively speaking).

> We found some options which sound plausible but we are not sure if they are
> valid or not:
> 
> 1. The wake up path has a dependency on the vruntime metrics that now delays
> the execution of the kworker.
> 2. The previous commit af4cf40470c2 (sched/fair: Add cfs_rq::avg_vruntime)
> which updates the way cfs_rq->min_vruntime and
> cfs_rq->avg_runtime are set might have introduced an issue which is
> uncovered with the commit mentioned above.

Suppose you have a few tasks (of equal weight) on you virtual timeline
like so:

   ----+-------+-------+-------+-------+----
       ^               ^
       |               `avg_vruntime
       `-min_vruntime

Then the above would be more or less the relative placements of these
values. avg_vruntime is the weighted average of the various vruntimes
and is therefore always in the 'middle' of the tasks, and not somewhere
out-there.
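
(In the notation used elsewhere in this thread: V = \Sum w_i*v_i / \Sum w_i,
so adding or removing an entity moves V, which is what the placement code
has to compensate for.)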

min_vruntime is a monotonically increasing 'minimum' that's left-ish on
the tree (there's a few cases where a new task can be placed left of
min_vruntime and it's no longer actually the minimum, but whatever).

These values should be relatively close to one another, depending
ofcourse on the spread of the tasks. So I don't think this is causing
trouble.

Anyway, the big difference with lag based placement is that where
previously tasks (that do not migrate) retain their old vruntime and on
placing they get pulled forward to at least min_vruntime, so a task that
wildly overshoots, but then doesn't run for significant time can still
be overtaken and then when placed again be 'okay'.

Now OTOH, with lag-based placement,  we strictly preserve their relative
offset vs avg_vruntime. So if they were *far* too the right when they go
to sleep, they will again be there on placement.

Sleeping doesn't help them anymore.

Now, IF this is the problem, I might have a patch that helps:

  
https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf=119feac4fcc77001cd9bf199b25f08d232289a5c

That branch is based on v6.7-rc1 and then some, but I think it's
relatively easy to rebase the lot on v6.6 (which I'm assuming you're
on).

I'm a little conflicted on the patch, conceptually I like what it does,
but the 

Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS

2023-11-06 Thread Peter Zijlstra
On Mon, Nov 06, 2023 at 09:47:08PM +0900, Masami Hiramatsu wrote:
> On Mon, 6 Nov 2023 11:19:32 +0100
> Peter Zijlstra  wrote:
> 
> > On Mon, Nov 06, 2023 at 09:38:50AM +0900, Masami Hiramatsu wrote:
> > > On Sun, 5 Nov 2023 18:34:09 -0500
> > > Steven Rostedt  wrote:
> > > 
> > > > On Sun, 5 Nov 2023 18:33:01 -0500
> > > > Steven Rostedt  wrote:
> > > > 
> > > > > For x86_64, that would be:
> > > > > 
> > > > >   rdi, rsi, rdx, r8, r9, rsp
> > > > 
> > > > I missed rcx.
> > > 
> > > I would like to add rax to the list so that it can handle the return 
> > > value too. :)
> > 
> > So something like so?
> > 
> > 
> > diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
> > index 897cf02c20b1..71bfe27594a5 100644
> > --- a/arch/x86/include/asm/ftrace.h
> > +++ b/arch/x86/include/asm/ftrace.h
> > @@ -36,6 +36,10 @@ static inline unsigned long ftrace_call_adjust(unsigned 
> > long addr)
> >  
> >  #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
> >  struct ftrace_regs {
> > +   /*
> > +* Partial, filled with:
> > +*  rax, rcx, rdx, rdi, rsi, r8, r9, rsp
> 
> Don't we need rbp too? (for frame pointer)

/me goes stare at ftrace_64.S, and yes it appears it fills out rbp too.





Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS

2023-11-06 Thread Peter Zijlstra
On Mon, Nov 06, 2023 at 09:38:50AM +0900, Masami Hiramatsu wrote:
> On Sun, 5 Nov 2023 18:34:09 -0500
> Steven Rostedt  wrote:
> 
> > On Sun, 5 Nov 2023 18:33:01 -0500
> > Steven Rostedt  wrote:
> > 
> > > For x86_64, that would be:
> > > 
> > >   rdi, rsi, rdx, r8, r9, rsp
> > 
> > I missed rcx.
> 
> I would like to add rax to the list so that it can handle the return value 
> too. :)

So something like so?


diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index 897cf02c20b1..71bfe27594a5 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -36,6 +36,10 @@ static inline unsigned long ftrace_call_adjust(unsigned long 
addr)
 
 #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS
 struct ftrace_regs {
+   /*
+* Partial, filled with:
+*  rax, rcx, rdx, rdi, rsi, r8, r9, rsp
+*/
struct pt_regs  regs;
 };
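
For the record, the way a callback is then expected to consume it is roughly
this (illustrative only, not part of the patch):

static void my_ftrace_callback(unsigned long ip, unsigned long parent_ip,
			       struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (regs) {
		/* registered with FTRACE_OPS_FL_SAVE_REGS: full pt_regs */
	} else {
		/* only the partially filled registers above are valid */
		unsigned long arg0 = ftrace_regs_get_argument(fregs, 0);
		unsigned long sp   = ftrace_regs_get_stack_pointer(fregs);

		trace_printk("arg0=%lx sp=%lx\n", arg0, sp);
	}
}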



Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS

2023-11-05 Thread Peter Zijlstra
On Sun, Nov 05, 2023 at 02:11:30PM -0500, Steven Rostedt wrote:
> On Sun, 5 Nov 2023 18:25:36 +0100
> Peter Zijlstra  wrote:
> 
> > On Mon, Nov 06, 2023 at 01:11:21AM +0900, Masami Hiramatsu (Google) wrote:
> > > From: Masami Hiramatsu (Google) 
> > > 
> > > Support HAVE_FUNCTION_GRAPH_FREGS on x86-64, which saves ftrace_regs
> > > on the stack in ftrace_graph return trampoline so that the callbacks
> > > can access registers via ftrace_regs APIs.  
> > 
> > What is ftrace_regs ? If I look at arch/x86/include/asm/ftrace.h it's a
> > pointless wrapper around pt_regs.
> > 
> > Can we please remove the pointless wrappery and call it what it is?
> 
> A while back ago when I introduced FTRACE_WITH_ARGS, it would have all
> ftrace callbacks get a pt_regs, but it would be partially filled for
> those that did not specify the "REGS" flag when registering the
> callback. You and Thomas complained that it would be a bug to return
> pt_regs that was not full because something might read the non filled
> registers and think they were valid.
> 
> To solve this, I came up with ftrace_regs to only hold the registers
> that were required for function parameters (including the stack
> pointer). You could then call arch_ftrace_get_regs(ftrace_regs) and if
> this "wrapper" had all valid pt_regs registers, then it would return
> the pt_regs, otherwise it would return NULL, and you would need to use
> the ftrace_regs accessor calls to get the function registers. You and
> Thomas agreed with this.

Neither the changelog nor the code made it clear this was partial
anything. So this is still the partial thing?

Can we then pretty clearly clarify all that, and make it clear which regs
are in there? Because when I do 'vim -t ftrace_regs' it just gets me a
seemingly pointless wrapper struct, no elucidating comments nothingses.

> You even Acked the patch:
>
> commit 02a474ca266a47ea8f4d5a11f4ffa120f83730ad
> Author: Steven Rostedt (VMware) 
> Date:   Tue Oct 27 10:55:55 2020 -0400

You expect me to remember things from 3 years ago?



Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS

2023-11-05 Thread Peter Zijlstra
On Mon, Nov 06, 2023 at 01:11:21AM +0900, Masami Hiramatsu (Google) wrote:
> From: Masami Hiramatsu (Google) 
> 
> Support HAVE_FUNCTION_GRAPH_FREGS on x86-64, which saves ftrace_regs
> on the stack in ftrace_graph return trampoline so that the callbacks
> can access registers via ftrace_regs APIs.

What is ftrace_regs ? If I look at arch/x86/include/asm/ftrace.h it's a
pointless wrapper around pt_regs.

Can we please remove the pointless wrappery and call it what it is?



Re: [PATCH V6 5/7] cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically

2023-09-09 Thread Peter Zijlstra
On Fri, Sep 08, 2023 at 03:46:51PM +0800, Meng Li wrote:
> +static void amd_pstate_update_highest_perf(unsigned int cpu)
> +{
> + struct cpufreq_policy *policy;
> + struct amd_cpudata *cpudata;
> + u32 prev_high = 0, cur_high = 0;
> + u64 highest_perf;
> + int ret;
> +
> + if (!prefcore)
> + return;
> +
> + ret = amd_pstate_get_highest_perf(cpu, &highest_perf);
> + if (ret)
> + return;
> +
> + policy = cpufreq_cpu_get(cpu);
> + cpudata = policy->driver_data;
> + cur_high = highest_perf;
> + prev_high = READ_ONCE(cpudata->prefcore_ranking);
> +
> + if (prev_high != cur_high) {
> + WRITE_ONCE(cpudata->prefcore_ranking, cur_high);
> + sched_set_itmt_core_prio(cur_high, cpu);
> + }
> +
> + cpufreq_cpu_put(policy);
> +}

Idem -- I asked you to clarify the u32 vs int thing; nothing here.


Re: [PATCH V6 3/7] cpufreq: amd-pstate: Enable amd-pstate preferred core supporting.

2023-09-09 Thread Peter Zijlstra
On Fri, Sep 08, 2023 at 03:46:49PM +0800, Meng Li wrote:
> +static void amd_pstate_init_prefcore(void)
> +{
> + int cpu, ret;
> + u64 highest_perf;
> +
> + if (!prefcore)
> + return;
> +
> + for_each_online_cpu(cpu) {
> + ret = amd_pstate_get_highest_perf(cpu, &highest_perf);
> + if (ret)
> + break;
> +
> + sched_set_itmt_core_prio(highest_perf, cpu);
> +
> + /* check if CPPC preferred core feature is enabled*/
> + if (highest_perf == AMD_PSTATE_MAX_CPPC_PERF) {
> + pr_debug("AMD CPPC preferred core is unsupported!\n");
> + hw_prefcore = false;
> + prefcore = false;
> + return;
> + }
> + }
> +
> + /*
> +  * This code can be run during CPU online under the
> +  * CPU hotplug locks, so sched_set_amd_prefcore_support()
> +  * cannot be called from here.  Queue up a work item
> +  * to invoke it.
> +  */
> + schedule_work(_prefcore_work);
> +}

Brilliant, repost without addressing prior feedback..  :-(


Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()

2022-08-17 Thread Peter Zijlstra
On Tue, Aug 16, 2022 at 11:49:59AM -0700, Dan Williams wrote:

> What would have helped is if the secure-erase and unlock definition in
> the specification mandated that the device emit cache invalidations for
> everything it has mapped when it is erased. However, that has some
> holes, and it also makes me think there is a gap in the current region
> provisioning code. If I have device-A mapped at physical-address-X and then
> tear that down and instantiate device-B at that same physical address
> there needs to be CPU cache invalidation between those 2 events.

Can we pretty please get those holes fixed ASAP such that future
generations can avoid the WBINVD nonsense?



Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()

2022-08-17 Thread Peter Zijlstra
On Tue, Aug 16, 2022 at 10:42:03AM -0700, Dan Williams wrote:

> I also think this cache_flush_region() API wants a prominent comment
> clarifying the limited applicability of this API. I.e. that it is not
> for general purpose usage, not for VMs, and only for select bare metal
> scenarios that instantaneously invalidate wide swaths of memory.
> Otherwise, I can now see how this looks like a potentially scary
> expansion of the usage of wbinvd.

This; because adding a generic API like this makes it ripe for usage.
And this is absolutely the very last thing we want used.



Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()

2022-08-16 Thread Peter Zijlstra
On Mon, Aug 15, 2022 at 09:07:06AM -0700, Davidlohr Bueso wrote:
> diff --git a/arch/x86/include/asm/cacheflush.h 
> b/arch/x86/include/asm/cacheflush.h
> index b192d917a6d0..ce2ec9556093 100644
> --- a/arch/x86/include/asm/cacheflush.h
> +++ b/arch/x86/include/asm/cacheflush.h
> @@ -10,4 +10,7 @@
> 
>  void clflush_cache_range(void *addr, unsigned int size);
> 
> +#define flush_all_caches() \
> + do { wbinvd_on_all_cpus(); } while(0)
> +

This is horrific... we've done our utmost best to remove all WBINVD
usage and here you're adding it back in the most horrible form possible
?!?

Please don't do this, do *NOT* use WBINVD.
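
If a flush really is needed after an erase/unlock event, build it on the
range-based primitive that already exists and bound it to the region that
actually changed (sketch below; the wrapper name is made up):

/*
 * Sketch only: flush the lines of the region that was invalidated,
 * using the existing clflush_cache_range() helper, instead of nuking
 * every cache line in the machine with WBINVD.
 */
static void region_flush_after_invalidate(void *kaddr, unsigned int size)
{
	/* CLFLUSHOPT loop over the range: no IPIs, no system-wide stall */
	clflush_cache_range(kaddr, size);
}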



Re: [PATCH v3 2/8] cxl/acpi: Add root device lockdep validation

2022-04-25 Thread Peter Zijlstra
On Sat, Apr 23, 2022 at 10:27:52AM -0700, Dan Williams wrote:

> ...so I'm going to drop it and just add a comment about the
> expectations. As Peter said there's already a multitude of ways to
> cause false positive / negative results with lockdep so this is just
> one more area where one needs to be careful and understand the lock
> context they might be overriding.

One safe-guard might be to check the class you're overriding is indeed
__no_validate__, and WARN if not. Then the unconditional reset is
consistent.

Then, if/when, that WARN ever triggers you can revisit all this.
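
Something like this, say (untested sketch, the helper name is made up and
assumes CONFIG_PROVE_LOCKING; if dev->mutex can be held at this point, the
lock_set_class() variant applies instead, as elsewhere in this thread):

static inline void cxl_override_lock_class(struct device *dev,
					   struct lock_class_key *key)
{
	/* only ever override the novalidate class; scream otherwise */
	WARN_ON_ONCE(!lockdep_match_class(&dev->mutex, &__lockdep_no_validate__));
	lockdep_set_class(&dev->mutex, key);
}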



Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation

2022-04-15 Thread Peter Zijlstra
On Thu, Apr 14, 2022 at 12:43:31PM -0700, Dan Williams wrote:
> On Thu, Apr 14, 2022 at 12:34 PM Peter Zijlstra  wrote:
> >
> > On Thu, Apr 14, 2022 at 10:17:13AM -0700, Dan Williams wrote:
> >
> > > One more sanity check... So in driver subsystems there are cases where
> > > a device on busA hosts a topology on busB. When that happens there's a
> > > need to set the lock class late in a driver since busA knows nothing
> > > about the locking rules of busB.
> >
> > I'll pretend I know what you're talking about ;-)
> >
> > > Since the device has a longer lifetime than a driver when the driver
> > > exits it must set dev->mutex back to the novalidate class, otherwise
> > > it sets up a use after free of the static lock_class_key.
> >
> > I'm not following, static storage has infinite lifetime.
> 
> Not static storage in a driver module.
> 
> modprobe -r fancy_lockdep_using_driver.ko
> 
> Any use of device_lock() by the core on a device that a driver in this
> module was driving will de-reference a now invalid pointer into
> whatever memory was vmalloc'd for the module static data.

Ooh, modules (I always, conveniently, forget they exist). Yes, setting a
lock instance from the core kernel to a key that's inside a module and
then taking the module out will be 'interesting'.

Most likely you'll get a splat from lockdep when doing this.



Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation

2022-04-14 Thread Peter Zijlstra
On Thu, Apr 14, 2022 at 10:17:13AM -0700, Dan Williams wrote:

> One more sanity check... So in driver subsystems there are cases where
> a device on busA hosts a topology on busB. When that happens there's a
> need to set the lock class late in a driver since busA knows nothing
> about the locking rules of busB.

I'll pretend I know what you're talking about ;-)

> Since the device has a longer lifetime than a driver when the driver
> exits it must set dev->mutex back to the novalidate class, otherwise
> it sets up a use after free of the static lock_class_key.

I'm not following, static storage has infinite lifetime.

> I came up with this and it seems to work, just want to make sure I'm
> properly using the lock_set_class() API and it is ok to transition
> back and forth from the novalidate case:
> 
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 990b6670222e..32673e1a736d 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -405,6 +405,29 @@ struct cxl_nvdimm_bridge
> *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd);
>  #define __mock static
>  #endif
> 
> +#ifdef CONFIG_PROVE_LOCKING
> +static inline void cxl_lock_reset_class(void *_dev)
> +{
> +   struct device *dev = _dev;
> +
> +   lock_set_class(>mutex.dep_map, "__lockdep_no_validate__",
> +  &__lockdep_no_validate__, 0, _THIS_IP_);
> +}
> +
> +static inline int cxl_lock_set_class(struct device *dev, const char *name,
> +struct lock_class_key *key)
> +{
> +   lock_set_class(>mutex.dep_map, name, key, 0, _THIS_IP_);
> +   return devm_add_action_or_reset(dev, cxl_lock_reset_class, dev);
> +}
> +#else
> +static inline int cxl_lock_set_class(struct device *dev, const char *name,
> +struct lock_class_key *key)
> +{
> +   return 0;
> +}
> +#endif

Under the assumption that the lock is held (lock_set_class() will
actually barf if @lock isn't held) this should indeed work as expected
(although I think you got the @name part 'wrong', I think that's
canonically something like "&dev->mutex" or something).



Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation

2022-04-14 Thread Peter Zijlstra
On Wed, Apr 13, 2022 at 03:01:21PM -0700, Dan Williams wrote:

> > That's not an obvious conclusion; lockdep has lots of funny annotations,
> > subclasses is just one.
> >
> > I think the big new development in lockdep since that time with Alan
> > Stern is that lockdep now has support for dynamic keys; that is lock
> > keys in heap memory (as opposed to static storage).
> 
> Ah, I was not aware of that, that should allow for deep cleanups of
> this proposal.

> > If you want lockdep validation for one (or more) dev->mutex instances,
> > why not pull them out of the no_validate class and use the normal
> > locking?
> 
> Sounds perfect, just didn't know how to do that with my current
> understanding of how to communicate this to lockdep.
> 
> >
> > This is all quite insane.
> 
> Yes, certainly in comparison to your suggestion on the next patch.
> That looks much more sane, and even better I think it allows for
> optional lockdep validation without even needing to touch
> include/linux/device.h.

Right, so lockdep has:

 - classes, based off of lock_class_key address;

   * lock_class_key should be static storage; except now we also have:

   lockdep_{,un}register_key()

 which allows dynamic memory (aka. heap) to be used for classes,
 important to note that lockdep memory usage is still static storage
 because the memory allocators use locks too. So if you register too
 many dynamic keys, you'll run out of lockdep memory etc.. so be
 careful.

   * things like mutex_init() have a static lock_class_key per site
 and hence every lock initialized by the same code ends up in the
 same class by default.

   * can be trivially changed at any time, assuming the lock isn't held,
 using lockdep_set_class*() family.

 (extensively used all over the kernel, for example by the vfs to
  give each filesystem type their own locking order rules)

   * lockdep_set_no_validate_class() is a magical variant of
 lockdep_set_class() that sets a magical lock_class_key.

   * can be changed while held using lock_set_class()

 ( from a lockdep pov it unlocks the held stack,
   changes the class of your lock, and re-locks the
   held stack, now with a different class nesting ).

Be careful! It doesn't forget the old nesting order, so you
 can trivally generate cycles.

 - subclasses, basically distinct addresses inside above mentioned
   lock_class_key object, limited to 8. Normally used with
   *lock_nested() family of functions. Typically used to lock multiple
   instances of a single lock class where there is defined order between
   instances (see for instance: double_rq_lock()).

 - nest_lock; eg. mutex_lock_nest_lock(), which allows expressing things
   like: multiple locks of class A can be taken in any order, provided
   we hold lock B.

With many of these, it's possible to get it wrong and annotate real
deadlocks away, so be careful :-)
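
By way of illustration, all of the above in one (completely made-up)
example:

struct my_node {
	struct mutex lock;
};

static struct lock_class_key my_node_key;	/* static storage: fine as a class */

static void my_node_init(struct my_node *n)
{
	mutex_init(&n->lock);			/* default class: this init site */
	lockdep_set_class(&n->lock, &my_node_key);	/* ... or an explicit one */
}

static void lock_parent_then_child(struct my_node *parent, struct my_node *child)
{
	/* subclasses: a fixed parent -> child order within one class */
	mutex_lock_nested(&parent->lock, 0);
	mutex_lock_nested(&child->lock, 1);
	mutex_unlock(&child->lock);
	mutex_unlock(&parent->lock);
}

static void lock_all_children(struct my_node *parent, struct my_node **child, int n)
{
	int i;

	/* nest_lock: any order/number of children is fine while parent is held */
	mutex_lock(&parent->lock);
	for (i = 0; i < n; i++)
		mutex_lock_nest_lock(&child[i]->lock, &parent->lock);
	for (i = n - 1; i >= 0; i--)
		mutex_unlock(&child[i]->lock);
	mutex_unlock(&parent->lock);
}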



Re: [PATCH v2 03/12] cxl/core: Refactor a cxl_lock_class() out of cxl_nested_lock()

2022-04-13 Thread Peter Zijlstra


*Completely* untested (I wouldn't know where to begin and probably don't
have the hardware anyway), and known incomplete.

What's wrong with something like this then?

---
 drivers/cxl/core/pmem.c |  8 +
 drivers/cxl/core/port.c | 58 +---
 drivers/cxl/cxl.h   | 78 -
 3 files changed, 42 insertions(+), 102 deletions(-)

diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c
index 8de240c4d96b..aca0dd5eefeb 100644
--- a/drivers/cxl/core/pmem.c
+++ b/drivers/cxl/core/pmem.c
@@ -80,6 +80,8 @@ struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct 
cxl_nvdimm *cxl_nvd)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, CXL);
 
+static struct lock_class_key cxl_nvdimm_bridge_key;
+
 static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port)
 {
struct cxl_nvdimm_bridge *cxl_nvb;
@@ -104,6 +106,8 @@ static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port)
dev->bus = &cxl_bus_type;
dev->type = &cxl_nvdimm_bridge_type;
 
+   lockdep_set_class(&dev->mutex, &cxl_nvdimm_bridge_key);
+
return cxl_nvb;
 
 err:
@@ -214,6 +218,8 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev)
 }
 EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL);
 
+static struct lock_class_key cxl_nvdimm_key;
+
 static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd)
 {
struct cxl_nvdimm *cxl_nvd;
@@ -231,6 +237,8 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd)
dev->bus = &cxl_bus_type;
dev->type = &cxl_nvdimm_type;
 
+   lockdep_set_class(&dev->mutex, &cxl_nvdimm_key);
+
return cxl_nvd;
 }
 
diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
index 2ab1ba4499b3..cae88404612f 100644
--- a/drivers/cxl/core/port.c
+++ b/drivers/cxl/core/port.c
@@ -312,10 +312,10 @@ static void cxl_port_release(struct device *dev)
struct cxl_port *port = to_cxl_port(dev);
struct cxl_ep *ep, *_e;
 
-   cxl_device_lock(dev);
+   device_lock(dev);
list_for_each_entry_safe(ep, _e, &port->endpoints, list)
cxl_ep_release(ep);
-   cxl_device_unlock(dev);
+   device_unlock(dev);
ida_free(&cxl_port_ida, port->id);
kfree(port);
 }
@@ -391,6 +391,8 @@ static int devm_cxl_link_uport(struct device *host, struct cxl_port *port)
return devm_add_action_or_reset(host, cxl_unlink_uport, port);
 }
 
+static struct lock_class_key cxl_port_key;
+
 static struct cxl_port *cxl_port_alloc(struct device *uport,
   resource_size_t component_reg_phys,
   struct cxl_port *parent_port)
@@ -431,6 +433,8 @@ static struct cxl_port *cxl_port_alloc(struct device *uport,
dev->bus = &cxl_bus_type;
dev->type = &cxl_port_type;
 
+   lockdep_set_class(&dev->mutex, &cxl_port_key);
+
return port;
 
 err:
@@ -457,8 +461,10 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport,
if (IS_ERR(port))
return port;
 
-   if (parent_port)
+   if (parent_port) {
port->depth = parent_port->depth + 1;
+   lockdep_set_class_and_subclass(>dev->mutex, _port, 
port->depth);
+   }
dev = >dev;
if (is_cxl_memdev(uport))
rc = dev_set_name(dev, "endpoint%d", port->id);
@@ -554,7 +560,7 @@ static int match_root_child(struct device *dev, const void *match)
return 0;
 
port = to_cxl_port(dev);
-   cxl_device_lock(dev);
+   device_lock(dev);
list_for_each_entry(dport, &port->dports, list) {
iter = match;
while (iter) {
@@ -564,7 +570,7 @@ static int match_root_child(struct device *dev, const void *match)
}
}
 out:
-   cxl_device_unlock(dev);
+   device_unlock(dev);
 
return !!iter;
 }
@@ -623,13 +629,13 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new)
 static void cond_cxl_root_lock(struct cxl_port *port)
 {
if (is_cxl_root(port))
-   cxl_device_lock(&port->dev);
+   device_lock(&port->dev);
 }
 
 static void cond_cxl_root_unlock(struct cxl_port *port)
 {
if (is_cxl_root(port))
-   cxl_device_unlock(&port->dev);
+   device_unlock(&port->dev);
 }
 
 static void cxl_dport_remove(void *data)
@@ -736,15 +742,15 @@ static int add_ep(struct cxl_port *port, struct cxl_ep *new)
 {
struct cxl_ep *dup;
 
-   cxl_device_lock(&port->dev);
+   device_lock(&port->dev);
if (port->dead) {
-   cxl_device_unlock(&port->dev);
+   device_unlock(&port->dev);
return -ENXIO;
}
dup = find_ep(port, new->ep);
if (!dup)
list_add_tail(&new->list, &port->endpoints);
-   cxl_device_unlock(&port->dev);
+   device_unlock(&port->dev);
 
return dup ? -EEXIST : 0;
 }
@@ -854,12 +860,12 @@ static void delete_endpoint(void *data)

Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation

2022-04-13 Thread Peter Zijlstra
On Tue, Apr 12, 2022 at 11:01:38PM -0700, Dan Williams wrote:
> The device_lock() is hidden from lockdep by default because, for
> example, a device subsystem may do something like:
> 
> ---
> device_add(dev1);
> ...in driver core...
> device_lock(dev1);
> bus->probe(dev1); /* where bus->probe() calls driver1_probe() */
> 
> driver1_probe(struct device *dev)
> {
>   ...do some enumeration...
>   dev2->parent = dev;
>   /* this triggers probe under device_lock(dev2); */
>   device_add(dev2);
> }
> ---
> 
> To lockdep, that device_lock(dev2) looks like a deadlock because lockdep

Recursion, you're meaning to say it looks like same lock recursion.

> only sees lock classes, not individual lock instances. All device_lock()
> instances across the entire kernel are the same class. However, this is
> not a deadlock in practice because the locking is strictly hierarchical.
> I.e. device_lock(dev1) is held over device_lock(dev2), but never the
> reverse.

I have some very vague memories from a conversation with Alan Stern,
some maybe 10 years ago, where I think he was explaining to me this was
not in fact a simple hierarchy.

> In order for lockdep to be satisfied and see that it is
> hierarchical in practice the mutex_lock() call in device_lock() needs to
> be moved to mutex_lock_nested() where the @subclass argument to
> mutex_lock_nested() represents the nesting level, i.e.:

That's not an obvious conclusion; lockdep has lots of funny annotations,
subclasses is just one.

I think the big new development in lockdep since that time with Alan
Stern is that lockdep now has support for dynamic keys; that is lock
keys in heap memory (as opposed to static storage).

> s/device_lock(dev1)/mutex_lock_nested(>mutex, 1)/
> 
> s/device_lock(dev2)/mutex_lock_nested(>mutex, 2)/
> 
> Now, what if the internals of the device_lock() could be annotated with
> the right @subclass argument to call mutex_lock_nested()?
> 
> With device_set_lock_class() a subsystem can optionally add that
> metadata. The device_lock() still takes dev->mutex, but when
> dev->lock_class is >= 0 it additionally takes dev->lockdep_mutex with
> the proper nesting. Unlike dev->mutex, dev->lockdep_mutex is not marked
> lockdep_set_novalidate_class() and lockdep will become useful... at
> least for one subsystem at a time.
> 
> It is still the case that only one subsystem can be using lockdep with
> lockdep_mutex at a time because different subsystems will collide class
> numbers. You might say "well, how about subsystem1 gets class ids 0 to 9
> and subsystem2 gets class ids 10 to 20?". MAX_LOCKDEP_SUBCLASSES is 8,
> and 8 is just enough class ids for one subsystem of moderate complexity.

Again, that doesn't seem like an obvious suggestion at all. Why not give
each subsystem a different lock key?


> diff --git a/include/linux/device.h b/include/linux/device.h
> index af2576ace130..6083e757e804 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -402,6 +402,7 @@ struct dev_msi_info {
>   * @mutex:   Mutex to synchronize calls to its driver.
>   * @lockdep_mutex: An optional debug lock that a subsystem can use as a
>   *   peer lock to gain localized lockdep coverage of the device_lock.
> + * @lock_class: per-subsystem annotated device lock class
>   * @bus: Type of bus device is on.
>   * @driver:  Which driver has allocated this
>   * @platform_data: Platform data specific to the device.
> @@ -501,6 +502,7 @@ struct device {
>  dev_set_drvdata/dev_get_drvdata */
>  #ifdef CONFIG_PROVE_LOCKING
>   struct mutexlockdep_mutex;
> + int lock_class;
>  #endif
>   struct mutexmutex;  /* mutex to synchronize calls to
>* its driver.
> @@ -762,18 +764,100 @@ static inline bool dev_pm_test_driver_flags(struct 
> device *dev, u32 flags)
>   return !!(dev->power.driver_flags & flags);
>  }
>  
> +static inline void device_lock_assert(struct device *dev)
> +{
> + lockdep_assert_held(>mutex);
> +}
> +
>  #ifdef CONFIG_PROVE_LOCKING
>  static inline void device_lockdep_init(struct device *dev)
>  {
>   mutex_init(>lockdep_mutex);
> + dev->lock_class = -1;
>   lockdep_set_novalidate_class(>mutex);
>  }
> -#else
> +
> +static inline void device_lock(struct device *dev)
> +{
> + /*
> +  * For double-lock programming errors the kernel will hang
> +  * trying to acquire @dev->mutex before lockdep can report the
> +  * problem acquiring @dev->lockdep_mutex, so manually assert
> +  * before that hang.
> +  */
> + lockdep_assert_not_held(&dev->lockdep_mutex);
> +
> + mutex_lock(&dev->mutex);
> + if (dev->lock_class >= 0)
> + mutex_lock_nested(&dev->lockdep_mutex, dev->lock_class);
> +}
> +
> +static inline int device_lock_interruptible(struct device *dev)
> +{
> + int rc;
> +
> + lockdep_assert_not_held(&dev->lockdep_mutex);
> +
> + rc = 

Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-20 Thread Peter Zijlstra
On Wed, Apr 21, 2021 at 03:37:11AM +0900, Namhyung Kim wrote:
> On Tue, Apr 20, 2021 at 7:28 PM Peter Zijlstra  wrote:

> > You forgot NMI.
> 
> Thanks for your explanation.  Maybe I'm missing something but
> this event is basically for counting and doesn't allow sampling.
> Do you say it's affected by other sampling events?  Note that
> it's not reading from the PMU here, what it reads is a snapshot
> of last pmu->read(event) afaik.

Even !sampling events will trigger NMI to deal with short hardware
counters rolling over. But yes, !sampling can also be updated from NMI
by other events if they're in a group etc..

Basically, always assume NMI/PMI can happen.


Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 05:53:40PM +0100, Vincent Donnefort wrote:
> All good with that snippet on my end.
> 
> I wonder if balance_push() shouldn't use the cpu_of() accessor
> instead of rq->cpu.

That might be a personal quirk of mine, but for code that is under
CONFIG_SMP (as all balancing code must be) I tend to prefer the more
explicit rq->cpu usage. cpu_of() obviously also works.

> Otherwise,
> 
> + Reviewed-by: Vincent Donnefort 

Thanks!, now I get to write a Changelog :-)


Re: [PATCH] perf/x86: Fix integer overflow when left shifting an integer more than 32 bits

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 05:03:03PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 03:29:07PM +0100, Colin King wrote:
> > From: Colin Ian King 
> > 
> > The 64 bit value read from MSR_ARCH_PERFMON_FIXED_CTR_CTRL is being
> > bit-wise masked with the value (0x03 << i*4). However, the shifted value
> > is evaluated using 32 bit arithmetic, so will overflow when i > 8.
> > Fix this by making 0x03 a ULL so that the shift is performed using
> > 64 bit arithmetic.
> > 
> > Addresses-Coverity: ("Unintentional integer overflow")
> 
> Strange tag that, also inaccurate, wide shifts are UB and don't behave
> consistently.
> 
> As is, we've not had hardware with that many fixed counters, but yes,
> worth fixing I suppose.

Patch now reads:

---
Subject: perf/x86: Allow for 8
Date: Tue, 20 Apr 2021 15:29:07 +0100

From: Colin Ian King 

The 64 bit value read from MSR_ARCH_PERFMON_FIXED_CTR_CTRL is being
bit-wise masked with the value (0x03 << i*4). However, the shifted value
is evaluated using 32 bit arithmetic, so the shift is undefined behaviour when i >= 8. Fix this
by making 0x03 a ULL so that the shift is performed using 64 bit
arithmetic.

This makes the arithmetic internally consistent and prepares for the
day when hardware provides 8
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20210420142907.382417-1-colin.k...@canonical.com
---
 arch/x86/events/core.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -261,7 +261,7 @@ static bool check_hw_exists(void)
for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
if (fixed_counter_disabled(i))
continue;
-   if (val & (0x03 << i*4)) {
+   if (val & (0x03ULL << i*4)) {
bios_fail = 1;
val_fail = val;
reg_fail = reg;


Re: [PATCH] perf/x86: Fix integer overflow when left shifting an integer more than 32 bits

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 03:29:07PM +0100, Colin King wrote:
> From: Colin Ian King 
> 
> The 64 bit value read from MSR_ARCH_PERFMON_FIXED_CTR_CTRL is being
> bit-wise masked with the value (0x03 << i*4). However, the shifted value
> is evaluated using 32 bit arithmetic, so will overflow when i > 8.
> Fix this by making 0x03 a ULL so that the shift is performed using
> 64 bit arithmetic.
> 
> Addresses-Coverity: ("Unintentional integer overflow")

Strange tag that, also inaccurate, wide shifts are UB and don't behave
consistently.

As is, we've not had hardware with that many fixed counters, but yes,
worth fixing I suppose.

> Fixes: a5ebe0ba3dff ("perf/x86: Check all MSRs before passing hw check")
> Signed-off-by: Colin Ian King 
> ---
>  arch/x86/events/core.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
> index bafd93c54ffa..59c665c8c2e9 100644
> --- a/arch/x86/events/core.c
> +++ b/arch/x86/events/core.c
> @@ -261,7 +261,7 @@ static bool check_hw_exists(void)
>   for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
>   if (fixed_counter_disabled(i))
>   continue;
> - if (val & (0x03 << i*4)) {
> + if (val & (0x03ULL << i*4)) {
>   bios_fail = 1;
>   val_fail = val;
>   reg_fail = reg;
> -- 
> 2.30.2
> 


Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 04:39:04PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote:
> > On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:
> > 
> > > Found the issue:
> > > 
> > > $ cat hotplug/states:
> > > 219: sched:active
> > > 220: online
> > > 
> > > CPU0: 
> > > 
> > > $ echo 219 > hotplug/fail
> > > $ echo 0 > online
> > > 
> > > => cpu_active = 1 cpu_dying = 1
> > > 
> > > which means that later on, for another CPU hotunplug, in
> > > __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> > > CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> > > trying to migrate that task to CPU0.
> > > 
> > > The problem is that for a failure in sched:active, as "online" has no 
> > > callback,
> > > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying 
> > > bit would
> > > not be reset.
> > 
> > Urgh! Good find.

> I seem to have triggered the BUG() in select_fallback_rq() with your recipe.
> Have cpu0 fail on sched:active, then offline all other CPUs.
> 
> Now lemme add that patch.

(which obviously didn't actually build) seems to fix it.

---
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 838dcf238f92..e538518556f4 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
	bool			rollback;
	bool			single;
	bool			bringup;
+	int			cpu;
	struct hlist_node	*node;
	struct hlist_node	*last;
	enum cpuhp_state	cb_state;
@@ -160,9 +161,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum 
cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
 
-   if (cpu_dying(cpu) != !bringup)
-   set_cpu_dying(cpu, !bringup);
-
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
@@ -467,13 +465,16 @@ static inline enum cpuhp_state
 cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
 {
enum cpuhp_state prev_state = st->state;
+   bool bringup = st->state < target;
 
st->rollback = false;
st->last = NULL;
 
st->target = target;
st->single = false;
-   st->bringup = st->state < target;
+   st->bringup = bringup;
+   if (cpu_dying(st->cpu) != !bringup)
+   set_cpu_dying(st->cpu, !bringup);
 
return prev_state;
 }
@@ -481,6 +482,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum 
cpuhp_state target)
 static inline void
 cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
 {
+   bool bringup = !st->bringup;
+
st->target = prev_state;
 
/*
@@ -503,7 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum 
cpuhp_state prev_state)
st->state++;
}
 
-   st->bringup = !st->bringup;
+   st->bringup = bringup;
+   if (cpu_dying(st->cpu) != !bringup)
+   set_cpu_dying(st->cpu, !bringup);
 }
 
 /* Regular hotplug invocation of the AP hotplug thread */
@@ -693,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)
 
	init_completion(&st->done_up);
	init_completion(&st->done_down);
+   st->cpu = cpu;
 }
 
 static int cpuhp_should_run(unsigned int cpu)


Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote:
> On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:
> 
> > Found the issue:
> > 
> > $ cat hotplug/states:
> > 219: sched:active
> > 220: online
> > 
> > CPU0: 
> > 
> > $ echo 219 > hotplug/fail
> > $ echo 0 > online
> > 
> > => cpu_active = 1 cpu_dying = 1
> > 
> > which means that later on, for another CPU hotunplug, in
> > __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> > CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> > trying to migrate that task to CPU0.
> > 
> > The problem is that for a failure in sched:active, as "online" has no 
> > callback,
> > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit 
> > would
> > not be reset.
> 
> Urgh! Good find.
> 
> > Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better 
> > place to
> > switch the dying bit?
> 
> Yes, except now cpuhp_invoke_ap_callback() makes my head hurt, that runs
> the callbacks out of order. I _think_ we can ignore it, but 
> 
> Something like the below, let me see if I can reproduce and test.

I seem to have triggered the BUG() in select_fallback_rq() with your recipe.
Have cpu0 fail on sched:active, then offline all other CPUs.

Now lemme add that patch.


Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote:

> Found the issue:
> 
> $ cat hotplug/states:
> 219: sched:active
> 220: online
> 
> CPU0: 
> 
> $ echo 219 > hotplug/fail
> $ echo 0 > online
> 
> => cpu_active = 1 cpu_dying = 1
> 
> which means that later on, for another CPU hotunplug, in
> __balance_push_cpu_stop(), the fallback rq for a kthread can select that
> CPU0, but __migrate_task() would fail and we end-up in an infinite loop,
> trying to migrate that task to CPU0.
> 
> The problem is that for a failure in sched:active, as "online" has no 
> callback,
> there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit 
> would
> not be reset.

Urgh! Good find.

> Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better place 
> to
> switch the dying bit?

Yes, except now cpuhp_invoke_ap_callback() makes my head hurt, that runs
the callbacks out of order. I _think_ we can ignore it, but 

Something like the below, let me see if I can reproduce and test.

---
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 838dcf238f92..05272bae953d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,9 +160,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum 
cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
 
-   if (cpu_dying(cpu) != !bringup)
-   set_cpu_dying(cpu, !bringup);
-
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
@@ -467,13 +464,16 @@ static inline enum cpuhp_state
 cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
 {
enum cpuhp_state prev_state = st->state;
+   bool bringup = st->state < target;
 
st->rollback = false;
st->last = NULL;
 
st->target = target;
st->single = false;
-   st->bringup = st->state < target;
+   st->bringup = bringup;
+   if (cpu_dying(cpu) != !bringup)
+   set_cpu_dying(cpu, !bringup);
 
return prev_state;
 }
@@ -481,6 +481,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum 
cpuhp_state target)
 static inline void
 cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
 {
+   bool bringup = !st->bringup;
+
st->target = prev_state;
 
/*
@@ -503,7 +505,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum 
cpuhp_state prev_state)
st->state++;
}
 
-   st->bringup = !st->bringup;
+   st->bringup = bringup;
+   if (cpu_dying(cpu) != !bringup)
+   set_cpu_dying(cpu, !bringup);
 }
 
 /* Regular hotplug invocation of the AP hotplug thread */


Re: [PATCH] smp: add a best_effort version of smp_call_function_many()

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 12:41:08PM +0200, Luigi Rizzo wrote:
> On Tue, Apr 20, 2021 at 11:14 AM Peter Zijlstra  wrote:

> > We mostly try and avoid using this stuff wherever possible. Only when
> > no other choice is left do we send IPIs.
> >
> > NOHZ_FULL already relies on this and gets massively unhappy when a new
> > user comes and starts to spray IPIs.
> 
> I am curious, why is that -- is it because the new user is stealing
> the shared csd's in cfd_data (see below), or some other reason ?

The premise of NOHZ_FULL is that it will not be interrupted. There are
users who are working on a mode where any interruption will cause a
(fatal) signal.

> > So no; mostly we send an IPI because we _HAVE_ to, not because giggles.
> >
> > That said; there's still some places left where we can avoid sending
> > IPIs, but in all those cases correctness mandates we actually handle
> > things and not randomly not do anything.
> 
> My case too requires that the request is eventually handled, but with
> this non-blocking IPI the caller has a better option than blocking:
> it can either retry the multicast IPI at a later time if conditions allow,
> or it can post a dedicated CSD (with the advantage that being my
> requests idempotent, if the CSD is locked there is no need to retry
> because it means the handler has not started yet).
> 
> In fact, if we had the option to use dedicated CSDs for multicast IPI,
> we wouldn't even need to retry because we'd know that the posted CSD
> is for our call back and not someone else's.

What are you doing that CSD contention is such a problem?
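
For reference, the dedicated per-CPU csd variant discussed above would look
roughly like the below. An untested sketch: the names are invented, only
INIT_CSD() and smp_call_function_single_async() are the real interface, and
the latter returns -EBUSY while a previous request is still in flight:

static DEFINE_PER_CPU(call_single_data_t, gc_csd);

static void gc_func(void *info)
{
	/* idempotent work goes here */
}

static int __init gc_csd_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		INIT_CSD(&per_cpu(gc_csd, cpu), gc_func, NULL);
	return 0;
}
early_initcall(gc_csd_init);

static void gc_kick_cpu(int cpu)
{
	/*
	 * -EBUSY means the previous kick has not run yet; since the
	 * handler is idempotent there is nothing more to do.
	 */
	smp_call_function_single_async(cpu, &per_cpu(gc_csd, cpu));
}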


Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 01:34:40AM -0700, Stephane Eranian wrote:
> The sampling approach will certainly incur more overhead and be at
> risk of losing the ability to reconstruct the total counter
> per-cgroup, unless  you set the period for SW_CGROUP_SWITCHES to 1.
> But then, you run the risk of losing samples if the buffer is full or
> sampling is throttled.  In some scenarios, we believe the number of
> context switches between cgroup could be quite high (>> 1000/s).  And
> on top you would have to add the processing of the samples to extract
> the counts per cgroup. That would require a synthesis on cgroup on
> perf record and some post-processing on perf report. We are interested
> in using the data live to make some policy decisions, so a counting
> approach with perf stat will always be best.
> 
> The fundamental problem Namhyung is trying to solve is the following:
> 
> num_fds = num_cpus x num_events x num_cgroups
> 
> On an 256-CPU AMD server running 200 cgroups with 6 events/cgroup (as
> an example):
> 
> num_fds = 256 x 200 x 6 = 307,200 fds (with all the kernel memory
> associated with them).

So the PERCPU proposal would reduce that to 200 * 6 = 1200 fds, which is
a definite win.

> On each CPU, that implies: 200 x 6 = 1200
> events to schedule and 6 to find on each cgroup switch

Right, so here we could optimize; if we find the event-groups are
identical in composition we can probably frob something that swizzles
the counts around without touching the PMU. That would be very similar
to what we already have in perf_event_context_sched_out().

This gets a wee bit tricky when you consider cgroup hierarchy though;
suppose you have:

    R
   / \
  A   B
     / \
    C   D

And are monitoring both B and D, then you'll end up with 12 counters
active instead of the 6. I'm not sure how to make that go away. 'Don't
do that then' might be good enough.

> This does not scale for us:
>- run against the fd limit, but also memory consumption in the
>kernel per struct file, struct inode, struct perf_event 
>- number of events per-cpu is still also large
>- require event scheduling on cgroup switches, even with RB-tree
>improvements, still heavy
>- require event scheduling even if measuring the same events across
>all cgroups
> 
> One factor in that equation above needs to disappear. The one counter
> per file descriptor is respected with Nahmyung's patch because he is
> operating a plain per-cpu mode. What changes is just how and where the
> count is accumulated in perf_events. The resulting programming on the
> hardware is the same as before.

Yes, you're aggregating differently. And that's exactly the problem. The
aggregation is a variable one with fairly poor semantics. Suppose you
create a new cgroup, then you have to tear down and recreate the whole
thing, which is pretty crap.

Ftrace had a similar issue; where people wanted aggregation, and that
resulted in the event histogram, which, quite frankly, is a scary
monster that I've no intention of duplicating. That's half a programming
language implemented.

> As you point out, the difficulty is how to express the cgroups of
> interest and how to read the counts back.  I agree that the ioctl() is
> not ideal for the latter. For the former, if you do not want ioctl()
> then you would have to overload perf_event_open() with a vector of
> cgroup fd, for instance. As for the read, you could, as you suggest,
> use the read syscall if you want to read all the cgroups at once using
> a new read_format. I don't have a problem with that.  As for cgroup-id
> vs. cgroup-fd, I think you make a fair point about consistency with
> the existing approach. I don't have a problem with that either

So that is a problem of aggregation; which is basically a
programmability problem. You're asking for a variadic-fixed-function
now, but tomorrow someone else will come and want another one.



Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-20 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 06:49:09PM +0900, Namhyung Kim wrote:
> On Thu, Apr 15, 2021 at 11:51 PM Peter Zijlstra  wrote:
> > > +static void perf_update_cgroup_node(struct perf_event *event, struct 
> > > cgroup *cgrp)
> > > +{
> > > + u64 delta_count, delta_time_enabled, delta_time_running;
> > > + int i;
> > > +
> > > + if (event->cgrp_node_count == 0)
> > > + goto out;
> > > +
> > > + delta_count = local64_read(&event->count) - event->cgrp_node_count;

From here...

> > > + delta_time_enabled = event->total_time_enabled - 
> > > event->cgrp_node_time_enabled;
> > > + delta_time_running = event->total_time_running - 
> > > event->cgrp_node_time_running;
> > > +
> > > + /* account delta to all ancestor cgroups */
> > > + for (i = 0; i <= cgrp->level; i++) {
> > > + struct perf_cgroup_node *node;
> > > +
> > > + node = find_cgroup_node(event, cgrp->ancestor_ids[i]);
> > > + if (node) {
> > > + node->count += delta_count;
> > > + node->time_enabled += delta_time_enabled;
> > > + node->time_running += delta_time_running;
> > > + }
> > > + }

... till here, NMI could hit and increment event->count, which then
means that:

> > > +
> > > +out:
> > > + event->cgrp_node_count = local64_read(&event->count);

This load doesn't match the delta_count load and events will go missing.

Obviously correct solution is:

event->cgrp_node_count += delta_count;


> > > + event->cgrp_node_time_enabled = event->total_time_enabled;
> > > + event->cgrp_node_time_running = event->total_time_running;

And while total_time doesn't have that problem, consistency would then
have you do:

event->cgrp_node_time_foo += delta_time_foo;

> >
> > This is wrong; there's no guarantee these are the same values you read
> > at the begin, IOW you could be loosing events.
> 
> Could you please elaborate?

You forgot NMI.


Re: [syzbot] WARNING in kthread_is_per_cpu

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 10:43:43AM +0100, Valentin Schneider wrote:
> On 20/04/21 10:51, Peter Zijlstra wrote:

> > I think free_kthread_struct() is ok, because a task at that point in its
> > lifetime cannot be also doing exec().
> >
> 
> What if it's one of those kthreads created by directly invoking
> kernel_thread()? AFAICT right now it's only umh, and that one does execve()
> so it ends up stripped of PF_KTHREAD. It could however go through an error
> path, i.e. not call exec, and exit, giving us:
> 
>   put_task_struct(p)
>   `\
> free_task(p)
> `\
>   if (tsk->flags & PF_KTHREAD)
>   free_kthread_struct(tsk);
>   `\
> to_kthread(p)

I'm not following, at the point we hit free_task() it had better be dead
and p->flags had better be stable. Either it will, or will not, have
PF_KTHREAD.




Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-20 Thread Peter Zijlstra
On Tue, Apr 20, 2021 at 01:34:40AM -0700, Stephane Eranian wrote:
> The sampling approach will certainly incur more overhead and be at
> risk of losing the ability to
> reconstruct the total counter per-cgroup, unless  you set the period
> for SW_CGROUP_SWITCHES to
> 1. But then, you run the risk of losing samples if the buffer is full
> or sampling is throtlled.
> In some scenarios, we believe the number of context switches between
> cgroup could be quite high (>> 1000/s).
> And on top you would have to add the processing of the samples to
> extract the counts per cgroup. That would require
> a synthesis on cgroup on perf record and some post-processing on perf
> report. We are interested in using the data live
> to make some policy decisions, so a counting approach with perf stat
> will always be best.

Can you please configure your MUA to sanely (re)flow text? The above
random line-breaks are *so* painful to read.


Re: [PATCH] smp: add a best_effort version of smp_call_function_many()

2021-04-20 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 11:07:08PM +0200, Luigi Rizzo wrote:
> On Mon, Apr 19, 2021 at 9:17 PM Peter Zijlstra  wrote:
> >
> > On Mon, Apr 19, 2021 at 11:44:55AM -0700, Luigi Rizzo wrote:
> > > Regardless of the 'wait' argument, smp_call_function_many() must spin
> > > if any of the target CPUs have their csd busy waiting to be processed
> > > for a previous call. This may cause high tail latencies e.g. when some
> > > of the target CPUs are running functions that disable interrupts for a
> > > long time; getrusage() is one possible culprit.
> > >
> > > Here we introduce a variant, __smp_call_function_many(), that adds
> > > a third 'best_effort' mode to the two existing ones (nowait, wait).
> > > In best effort mode, the call will skip CPUs whose csd is busy, and if
> > > any CPU is skipped it returns -EBUSY and the set of busy in the mask.
> > > This allows the caller to decide how to proceed, e.g. it might retry at
> > > a later time, or use a private csd, etc..
> > >
> > > The new function is a compromise to avoid touching existing callers of
> > > smp_call_function_many(). If the feature is considered interesting, we
> > > could even replace the 'wait' argument with a ternary 'mode' in all
> > > smp_call_function_*() and derived methods.
> >
> > I don't see a user of this...
> 
> This is actually something for which I was looking for feedback:
> 
> my use case is similar to a periodic garbage collect request:
> the caller tells targets that it may be time to do some work,
> but it does not matter if the request is dropped because the
> caller knows who was busy and will reissue pending requests later.
> 
> I would expect something like the above could be useful e.g.
> in various kinds of resource manager.
> 
> However, a grep for on_each_cpu_*() and smp_call_function_*()
> mostly returns synchronous calls (wait=1).
> 
> Any possible candidates that people can think of ?

We mostly try and avoid using this stuff wherever possible. Only when
no other choice is left do we send IPIs.

NOHZ_FULL already relies on this and gets massively unhappy when a new
user comes and starts to spray IPIs.

So no; mostly we send an IPI because we _HAVE_ to, not because giggles.

That said; there's still some places left where we can avoid sending
IPIs, but in all those cases correctness mandates we actually handle
things and not randomly not do anything.

For example, look at arch/x86/kernel/alternative.c:text_poke_sync(). The
purpose of that is to ensure all CPUs observe modified *kernel* code.
Now, if that CPU is currently running userspace, it doesn't much care
kernel code is changed, however that does mean it needs to call
sync_core() upon entering kernel, *BEFORE* hitting any code that's
possibly modified (and self modifying code is everywhere today,
ironically also very much in the NOHZ_FULL entry paths).

So untangling all that should be possible, but is something that
requires quite a bit of care and doesn't benefit from anything like the
proposed.

Mostly it sounds like you shouldn't be using IPIs either.


Re: [syzbot] WARNING in kthread_is_per_cpu

2021-04-20 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 08:58:26PM +0100, Valentin Schneider wrote:

> Looks about right, IIUC the key being:
> 
>   p->flags & PF_KTHREAD + p->set_child_tid => the struct kthread is
>   persistent
> 
>   p->flags & PF_KTHREAD => you may or may not have a struct kthread (see
>   kernel/umh.c kernel_thread() uses). PF_KTHREAD isn't even guaranteed to
>   persist (begin_new_exec()), which seems to be what the syzbot hit.

Ack, that's nicely put.

> While we're at it, does free_kthread_struct() want the __to_kthread()
> treatment as well? The other to_kthread() callsites looked like they only
> made sense with a "proper" kthread anyway.

I think free_kthread_struct() is ok, because a task at that point in its
lifetime cannot be also doing exec().

kthread_func() is another 'fun' trainwreck waiting to happen -- luckily
the only caller uses current, still let me go fix it.

kthread_probe_data() relies on PF_WQ_WORKER implying PF_KTHREAD but
otherwise seems very fragile too.

Something like so then?

---
Subject: kthread: Fix PF_KTHREAD vs to_kthread() race
From: Peter Zijlstra 
Date: Tue Apr 20 10:18:17 CEST 2021

The kthread_is_per_cpu() construct relies on only being called on
PF_KTHREAD tasks (per the WARN in to_kthread). This gives rise to the
following usage pattern:

if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p))

However, as reported by syzcaller, this is broken. The scenario is:

CPU0CPU1 (running p)

(p->flags & PF_KTHREAD) // true

begin_new_exec()
  me->flags &= ~(PF_KTHREAD|...);
kthread_is_per_cpu(p)
  to_kthread(p)
WARN(!(p->flags & PF_KTHREAD) <-- *SPLAT*

Introduce __to_kthread() that omits the WARN and is sure to check both
values.

Use this to remove the problematic pattern for kthread_is_per_cpu()
and fix a number of other kthread_*() functions that have similar
issues but are currently not used in ways that would expose the
problem.

Notably kthread_func() is only ever called on 'current', while
kthread_probe_data() is only used for PF_WQ_WORKER, which implies the
task is from kthread_create*().

Fixes: ac687e6e8c26 ("kthread: Extract KTHREAD_IS_PER_CPU")
Signed-off-by: Peter Zijlstra (Intel) 
---
 kernel/kthread.c|   33 +++--
 kernel/sched/core.c |2 +-
 kernel/sched/fair.c |2 +-
 3 files changed, 29 insertions(+), 8 deletions(-)

--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -84,6 +84,25 @@ static inline struct kthread *to_kthread
return (__force void *)k->set_child_tid;
 }
 
+/*
+ * Variant of to_kthread() that doesn't assume @p is a kthread.
+ *
+ * Per construction; when:
+ *
+ *   (p->flags & PF_KTHREAD) && p->set_child_tid
+ *
+ * the task is both a kthread and struct kthread is persistent. However
+ * PF_KTHREAD on its own is not, kernel_thread() can exec() (See umh.c and
+ * begin_new_exec()).
+ */
+static inline struct kthread *__to_kthread(struct task_struct *p)
+{
+   void *kthread = (__force void *)p->set_child_tid;
+   if (kthread && !(p->flags & PF_KTHREAD))
+   kthread = NULL;
+   return kthread;
+}
+
 void free_kthread_struct(struct task_struct *k)
 {
struct kthread *kthread;
@@ -168,8 +187,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_shou
  */
 void *kthread_func(struct task_struct *task)
 {
-   if (task->flags & PF_KTHREAD)
-   return to_kthread(task)->threadfn;
+   struct kthread *kthread = __to_kthread(task);
+   if (kthread)
+   return kthread->threadfn;
return NULL;
 }
 EXPORT_SYMBOL_GPL(kthread_func);
@@ -199,10 +219,11 @@ EXPORT_SYMBOL_GPL(kthread_data);
  */
 void *kthread_probe_data(struct task_struct *task)
 {
-   struct kthread *kthread = to_kthread(task);
+   struct kthread *kthread = __to_kthread(task);
void *data = NULL;
 
-   copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
+   if (kthread)
+       copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
return data;
 }
 
@@ -514,9 +535,9 @@ void kthread_set_per_cpu(struct task_str
	set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
 }
 
-bool kthread_is_per_cpu(struct task_struct *k)
+bool kthread_is_per_cpu(struct task_struct *p)
 {
-   struct kthread *kthread = to_kthread(k);
+   struct kthread *kthread = __to_kthread(p);
if (!kthread)
return false;
 
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8505,7 +8505,7 @@ static void balance_push(struct rq *rq)
 * histerical raisins.
 */
if (rq->idle == push_task ||
-   ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+   kthread_is_per_cpu(push_task) ||

Re: [PATCH 09/15] x86/alternatives: Use C int3 selftest but disable KASAN

2021-04-20 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 08:26:36AM -0700, Sami Tolvanen wrote:
> On Sat, Apr 17, 2021 at 4:37 AM Peter Zijlstra  wrote:

> > The problem is that the call-site does not respect the regular calling
> > convention, so calling a regular C function is just asking for trouble,
> > which is why it ended up being asm, then we fully control the calling
> > convention.
> 
> Ack. The problem here is that we can't declare an extern static
> function in C. How would you feel about making int3_magic a global
> instead to match the C declaration?

Works for me.


Re: [PATCH] smp: add a best_effort version of smp_call_function_many()

2021-04-19 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 11:44:55AM -0700, Luigi Rizzo wrote:
> Regardless of the 'wait' argument, smp_call_function_many() must spin
> if any of the target CPUs have their csd busy waiting to be processed
> for a previous call. This may cause high tail latencies e.g. when some
> of the target CPUs are running functions that disable interrupts for a
> long time; getrusage() is one possible culprit.
> 
> Here we introduce a variant, __smp_call_function_many(), that adds
> a third 'best_effort' mode to the two existing ones (nowait, wait).
> In best effort mode, the call will skip CPUs whose csd is busy, and if
> any CPU is skipped it returns -EBUSY and the set of busy in the mask.
> This allows the caller to decide how to proceed, e.g. it might retry at
> a later time, or use a private csd, etc..
> 
> The new function is a compromise to avoid touching existing callers of
> smp_call_function_many(). If the feature is considered interesting, we
> could even replace the 'wait' argument with a ternary 'mode' in all
> smp_call_function_*() and derived methods.

I don't see a user of this... 


Re: [syzbot] WARNING in kthread_is_per_cpu

2021-04-19 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 12:31:22PM +0100, Valentin Schneider wrote:

>   if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p))
>  `\
>to_kthread(p);
> `\
>   WARN_ON(!(p->flags & PF_KTHREAD));
> 
> ... Huh?

Something like so perhaps?

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 1578973c5740..eeba40df61ac 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -78,6 +78,14 @@ static inline void set_kthread_struct(void *kthread)
current->set_child_tid = (__force void __user *)kthread;
 }
 
+static inline struct kthread *__to_kthread(struct task_struct *k)
+{
+   void *kthread = (__force void *)k->set_child_tid;
+   if (kthread && !(k->flags & PF_KTHREAD))
+   kthread = NULL;
+   return kthread;
+}
+
 static inline struct kthread *to_kthread(struct task_struct *k)
 {
WARN_ON(!(k->flags & PF_KTHREAD));
@@ -516,7 +524,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
 
 bool kthread_is_per_cpu(struct task_struct *k)
 {
-   struct kthread *kthread = to_kthread(k);
+   struct kthread *kthread = __to_kthread(k);
if (!kthread)
return false;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3384ea74cad4..dc6311bd6986 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7658,7 +7658,7 @@ static void balance_push(struct rq *rq)
 * histerical raisins.
 */
if (rq->idle == push_task ||
-   ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+   kthread_is_per_cpu(push_task) ||
is_migration_disabled(push_task)) {
 
/*


Re: [PATCH v7 09/28] mm: Create FolioFlags

2021-04-19 Thread Peter Zijlstra
On Fri, Apr 09, 2021 at 07:50:46PM +0100, Matthew Wilcox (Oracle) wrote:
> These new functions are the folio analogues of the PageFlags functions.
> If CONFIG_DEBUG_VM_PGFLAGS is enabled, we check the folio is not a tail
> page at every invocation.  Note that this will also catch the PagePoisoned
> case as a poisoned page has every bit set, which would include PageTail.
> 
> This saves 1727 bytes of text with the distro-derived config that
> I'm testing due to removing a double call to compound_head() in
> PageSwapCache().

I vote for dropping the Camels if we're going to rework all this.


Re: [PATCH] sched,fair: skip newidle_balance if a wakeup is pending

2021-04-19 Thread Peter Zijlstra
On Sun, Apr 18, 2021 at 10:17:51PM -0400, Rik van Riel wrote:
> The try_to_wake_up function has an optimization where it can queue
> a task for wakeup on its previous CPU, if the task is still in the
> middle of going to sleep inside schedule().
> 
> Once schedule() re-enables IRQs, the task will be woken up with an
> IPI, and placed back on the runqueue.
> 
> If we have such a wakeup pending, there is no need to search other
> CPUs for runnable tasks. Just skip (or bail out early from) newidle
> balancing, and run the just woken up task.
> 
> For a memcache like workload test, this reduces total CPU use by
> about 2%, proportionally split between user and system time,
> and p99 and p95 application response time by 2-3% on average.
> The schedstats run_delay number shows a similar improvement.

Nice.

> Signed-off-by: Rik van Riel 
> ---
>  kernel/sched/fair.c | 11 ++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 69680158963f..19a92c48939f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7163,6 +7163,14 @@ done: __maybe_unused;
>   if (!rf)
>   return NULL;
>  
> + /*
> +  * We have a woken up task pending here. No need to search for ones
> +  * elsewhere. This task will be enqueued the moment we unblock irqs
> +  * upon exiting the scheduler.
> +  */
> + if (rq->ttwu_pending)
> + return NULL;

As reported by the robot, that needs a CONFIG_SMP guard of sorts,
#ifdef might work I suppose.

>   new_tasks = newidle_balance(rq, rf);
>  
>   /*
> @@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct 
> rq_flags *rf)
>* Stop searching for tasks to pull if there are
>* now runnable tasks on this rq.
>*/
> - if (pulled_task || this_rq->nr_running > 0)
> + if (pulled_task || this_rq->nr_running > 0 ||
> + this_rq->ttwu_pending)

Either cino=(0:0 or just bust the limit and make it 84 chars.
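
Concretely, the guard mentioned above could look something like this;
untested, just to show the shape, and relying on rq->ttwu_pending only
existing on CONFIG_SMP builds:

#ifdef CONFIG_SMP
	/*
	 * We have a woken up task pending here. No need to search for ones
	 * elsewhere; it will be enqueued the moment we unblock irqs upon
	 * exiting the scheduler.
	 */
	if (rq->ttwu_pending)
		return NULL;
#endif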


Re: [PATCH] sched: Optimize housekeeping_cpumask in for_each_cpu_and

2021-04-19 Thread Peter Zijlstra
On Sat, Apr 17, 2021 at 11:01:37PM +0800, Yuan ZhaoXiong wrote:
> On a 128 cores AMD machine, there are 8 cores in nohz_full mode, and
> the others are used for housekeeping. When many housekeeping cpus are
> in idle state, we can observe huge time burn in the loop for searching
> nearest busy housekeeper cpu by ftrace.
> 
>9)   |  get_nohz_timer_target() {
>9)   |housekeeping_test_cpu() {
>9)   0.390 us|  housekeeping_get_mask.part.1();
>9)   0.561 us|}
>9)   0.090 us|__rcu_read_lock();
>9)   0.090 us|housekeeping_cpumask();
>9)   0.521 us|housekeeping_cpumask();
>9)   0.140 us|housekeeping_cpumask();
> 
>...
> 
>9)   0.500 us|housekeeping_cpumask();
>9)   |housekeeping_any_cpu() {
>9)   0.090 us|  housekeeping_get_mask.part.1();
>9)   0.100 us|  sched_numa_find_closest();
>9)   0.491 us|}
>9)   0.100 us|__rcu_read_unlock();
>9) + 76.163 us   |  }
> 
> for_each_cpu_and() is a macro, so in get_nohz_timer_target()
> function the
> for_each_cpu_and(i, sched_domain_span(sd),
> housekeeping_cpumask(HK_FLAG_TIMER))
> equals to below:
> for (i = -1; i = cpumask_next_and(i, sched_domain_span(sd),
> housekeeping_cpumask(HK_FLAG_TIMER)), i < nr_cpu_ids;)
> That will cause that housekeeping_cpumask() will be invoked many times.
> The housekeeping_cpumask() function returns a const value, so it is
> unnecessary to invoke it every time. This patch can minimize the worst
> searching time from ~76us to ~16us in my testing.
> 
> Similarly, the find_new_ilb() function has the same problem.

Would it not make sense to mark housekeeping_cpumask() __pure instead?
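
Either way, the immediate win is hoisting the invariant mask out of the
loop; roughly (an untested sketch of the get_nohz_timer_target() loop, the
exact loop body may differ):

	const struct cpumask *hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);

	for_each_domain(cpu, sd) {
		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i))
				return i;
		}
	}

while the __pure variant would only touch the declaration, letting the
compiler hoist the call itself:

	const struct cpumask *housekeeping_cpumask(enum hk_flags flags) __pure;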


Re: [PATCH 00/13] [RFC] Rust support

2021-04-19 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 11:02:12AM +0200, Paolo Bonzini wrote:
> > void writer(void)
> > {
> >  atomic_store_explicit(&seq, seq+1, memory_order_relaxed);
> >  atomic_thread_fence(memory_order_acquire);
> 
> This needs to be memory_order_release.  The only change in the resulting
> assembly is that "dmb ishld" becomes "dmb ish", which is not as good as the
> "dmb ishst" you get from smp_wmb() but not buggy either.

Yuck! And that is what requires the insides to be
atomic_store_explicit(), otherwise this fence doesn't have to affect
them.

I also don't see how this is better than seq_cst.

But yes, not broken, but also very much not optimal.


Re: [PATCH 0/9] sched: Core scheduling interfaces

2021-04-19 Thread Peter Zijlstra
On Sat, Apr 17, 2021 at 09:35:07PM -0400, Joel Fernandes wrote:
> On Tue, Apr 06, 2021 at 10:16:12AM -0400, Tejun Heo wrote:
> > Hello,
> > 
> > On Mon, Apr 05, 2021 at 02:46:09PM -0400, Joel Fernandes wrote:
> > > Yeah, its at http://lore.kernel.org/r/20200822030155.ga414...@google.com
> > > as mentioned above, let me know if you need any more details about
> > > usecase.
> > 
> > Except for the unspecified reason in usecase 4, I don't see why cgroup is in
> > the picture at all. This doesn't really have much to do with hierarchical
> > resource distribution. Besides, yes, you can use cgroup for logical
> > structuring and identification purposes but in those cases the interactions
> > and interface should be with the original subsystem while using cgroup IDs
> > or paths as parameters - see tracing and bpf for examples.
> 
> Personally for ChromeOS, we need only the per-task interface. Considering
> that the second argument of this prctl is a command, I don't see why we
> cannot add a new command PR_SCHED_CORE_CGROUP_SHARE to do what Tejun is
> saying (in the future).
> 
> In order to not block ChromeOS and other "per-task interface" usecases, I
> suggest we keep the CGroup interface for a later time (whether that's
> through prctl or the CGroups FS way which Tejun dislikes) and move forward
> with per-task interface only initially.

Josh, you being on the other Google team, the one that actually uses the
cgroup interface AFAIU, can you fight the good fight with TJ on this?

> Peter, any thoughts on this?

Adding CGROUP_SHARE is not sufficient to close the hole against CLEAR.
So we either then have to 'tweak' the meaning of CLEAR or replace it
entirely, neither seem attractive.


I'd love to make some progress on all this.


Re: [PATCH] preempt/dynamic: fix typo in macro conditional statement

2021-04-19 Thread Peter Zijlstra
On Sat, Apr 10, 2021 at 03:35:23PM +0800, Zhouyi Zhou wrote:
> commit 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched()
>  static call") tried to provide irqentry_exit_cond_resched() static call
> in irqentry_exit, but has a typo in macro conditional statement.
> 
> This patch fix this typo.
> 
> Signed-off-by: Zhouyi Zhou 

Fixes: 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() 
static call")

Thanks!


Re: [PATCH 00/13] [RFC] Rust support

2021-04-19 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 10:26:57AM +0200, Peter Zijlstra wrote:

> https://godbolt.org/z/85xoPxeE5

That wants _Atomic on the seq definition for clang.

> void writer(void)
> {
> atomic_store_explicit(&seq, seq+1, memory_order_relaxed);
> atomic_thread_fence(memory_order_acquire);
> 
> X = 1;
> Y = 2;
> 
> atomic_store_explicit(&seq, seq+1, memory_order_release);
> }
> 
> gives:
> 
> writer:
> adrp    x1, .LANCHOR0
> add x0, x1, :lo12:.LANCHOR0
> ldr w2, [x1, #:lo12:.LANCHOR0]
> add w2, w2, 1
> str w2, [x0]
> dmb ishld
> ldr w1, [x1, #:lo12:.LANCHOR0]
> mov w3, 1
> mov w2, 2
> stp w3, w2, [x0, 4]
> add w1, w1, w3
> stlr    w1, [x0]
> ret
> 
> Which, afaict, is completely buggered. What it seems to be doing is
> turning the seq load into a load-acquire, but what we really need is to
> make sure the seq store (increment) is ordered before the other stores.

Put differently, what you seem to want is store-acquire, but there ain't
no such thing.


Re: [PATCH 00/13] [RFC] Rust support

2021-04-19 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 09:53:06AM +0200, Paolo Bonzini wrote:
> On 19/04/21 09:32, Peter Zijlstra wrote:
> > On Sat, Apr 17, 2021 at 04:51:58PM +0200, Paolo Bonzini wrote:
> > > On 16/04/21 09:09, Peter Zijlstra wrote:
> > > > Well, the obvious example would be seqlocks. C11 can't do them
> > > 
> > > Sure it can.  C11 requires annotating with (the equivalent of) READ_ONCE 
> > > all
> > > reads of seqlock-protected fields, but the memory model supports seqlocks
> > > just fine.
> > 
> > How does that help?
> > 
> > IIRC there's two problems, one on each side the lock. On the write side
> > we have:
> > 
> > seq++;
> > smp_wmb();
> > X = r;
> > Y = r;
> > smp_wmb();
> > seq++;
> > 
> > Which C11 simply cannot do right because it doesn't have wmb.
> 
> It has atomic_thread_fence(memory_order_release), and
> atomic_thread_fence(memory_order_acquire) on the read side.

https://godbolt.org/z/85xoPxeE5

void writer(void)
{
atomic_store_explicit(&seq, seq+1, memory_order_relaxed);
atomic_thread_fence(memory_order_acquire);

X = 1;
Y = 2;

atomic_store_explicit(&seq, seq+1, memory_order_release);
}

gives:

writer:
adrp    x1, .LANCHOR0
add x0, x1, :lo12:.LANCHOR0
ldr w2, [x1, #:lo12:.LANCHOR0]
add w2, w2, 1
str w2, [x0]
dmb ishld
ldr w1, [x1, #:lo12:.LANCHOR0]
mov w3, 1
mov w2, 2
stp w3, w2, [x0, 4]
add w1, w1, w3
stlr    w1, [x0]
ret

Which, afaict, is completely buggered. What it seems to be doing is
turning the seq load into a load-acquire, but what we really need is to
make sure the seq store (increment) is ordered before the other stores.




Re: [PATCH v2 resubmit] sched: Warn on long periods of pending need_resched

2021-04-19 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 02:29:36PM -0700, Josh Don wrote:
> From: Paul Turner 
> 
> CPU scheduler marks need_resched flag to signal a schedule() on a
> particular CPU. But, schedule() may not happen immediately in cases
> where the current task is executing in the kernel mode (no
> preemption state) for extended periods of time.
> 
> This patch adds a warn_on if need_resched is pending for more than the
> time specified in sysctl resched_latency_warn_ms. If it goes off, it is
> likely that there is a missing cond_resched() somewhere. Monitoring is
> done via the tick and the accuracy is hence limited to jiffy scale. This
> also means that we won't trigger the warning if the tick is disabled.
> 
> This feature (LATENCY_WARN) is default disabled.
> 
> Signed-off-by: Paul Turner 
> Signed-off-by: Josh Don 
> Signed-off-by: Peter Zijlstra (Intel) 
> Link: https://lkml.kernel.org/r/20210323035706.572953-1-josh...@google.com

Thanks!


Re: [PATCH v2] sched: Warn on long periods of pending need_resched

2021-04-19 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 02:33:27PM -0700, Josh Don wrote:
> Yikes, sorry about that. I've squashed the fixup and sent the updated
> patch (with trimmed cc list): https://lkml.org/lkml/2021/4/16/1125

For future reference, please use: https://lkml.kernel.org/r/MSGID

Then I can search for MSGID in my local mailer (mutt: ~h) and instantly
find the actual message (now I'll search for all email from you and it
should obviously be easily found as well).


Re: [PATCH v5 04/19] perf: aux: Add CoreSight PMU buffer formats

2021-04-19 Thread Peter Zijlstra
On Mon, Mar 29, 2021 at 10:56:25AM -0600, Mathieu Poirier wrote:
> Hi Peter,
> 
> On Tue, Mar 23, 2021 at 12:06:32PM +, Suzuki K Poulose wrote:
> > CoreSight PMU supports aux-buffer for the ETM tracing. The trace
> > generated by the ETM (associated with individual CPUs, like Intel PT)
> > is captured by a separate IP (CoreSight TMC-ETR/ETF until now).
> > 
> > The TMC-ETR applies formatting of the raw ETM trace data, as it
> > can collect traces from multiple ETMs, with the TraceID to indicate
> > the source of a given trace packet.
> > 
> > Arm Trace Buffer Extension is new "sink" IP, attached to individual
> > CPUs and thus do not provide additional formatting, like TMC-ETR.
> > 
> > Additionally, a system could have both TRBE *and* TMC-ETR for
> > the trace collection. e.g, TMC-ETR could be used as a single
> > trace buffer to collect data from multiple ETMs to correlate
> > the traces from different CPUs. It is possible to have a
> > perf session where some events end up collecting the trace
> > in TMC-ETR while the others in TRBE. Thus we need a way
> > to identify the type of the trace for each AUX record.
> > 
> > Define the trace formats exported by the CoreSight PMU.
> > We don't define the flags following the "ETM" as this
> > information is available to the user when issuing
> > the session. What is missing is the additional
> > formatting applied by the "sink" which is decided
> > at the runtime and the user may not have a control on.
> > 
> > So we define :
> >  - CORESIGHT format (indicates the Frame format)
> >  - RAW format (indicates the format of the source)
> > 
> > The default value is CORESIGHT format for all the records
> > (i,e == 0). Add the RAW format for others that use
> > raw format.
> > 
> > Cc: Peter Zijlstra 
> > Cc: Mike Leach 
> > Cc: Mathieu Poirier 
> > Cc: Leo Yan 
> > Cc: Anshuman Khandual 
> > Reviewed-by: Mike Leach 
> > Reviewed-by: Mathieu Poirier 
> > Signed-off-by: Suzuki K Poulose 
> > ---
> >  include/uapi/linux/perf_event.h | 4 
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/include/uapi/linux/perf_event.h 
> > b/include/uapi/linux/perf_event.h
> > index f006eeab6f0e..63971eaef127 100644
> > --- a/include/uapi/linux/perf_event.h
> > +++ b/include/uapi/linux/perf_event.h
> > @@ -1162,6 +1162,10 @@ enum perf_callchain_context {
> >  #define PERF_AUX_FLAG_COLLISION0x08/* sample 
> > collided with another */
> >  #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00  /* PMU specific trace 
> > format type */
> >  
> > +/* CoreSight PMU AUX buffer formats */
> > +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT   0x /* Default for 
> > backward compatibility */
> > +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of 
> > the source */
> > +
> 
> Have you had time to review this patch?  Anything you'd like to see modified?

Ok I suppose..

Acked-by: Peter Zijlstra (Intel) 


Re: Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.

2021-04-19 Thread Peter Zijlstra
On Mon, Apr 19, 2021 at 09:46:26AM +0800, 周传高 wrote:
> 
> >On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote:
> >> eg:
> >> In the Android system, we usually add some processes to the whitelist.
> >> static task_t task_whitelist[] = {
> >>{"mdrt_thread", HUNG_TASK_WHITELIST},
> >>{"chre_kthread", HUNG_TASK_WHITELIST},
> >>{"scp_power_reset", HUNG_TASK_WHITELIST},
> >>{"ccci_fsm1", HUNG_TASK_WHITELIST},
> >>{"qos_ipi_recv", HUNG_TASK_WHITELIST},
> >>{NULL, 0},
> >> };
> >
> >What are these tasks doing that the hung-task detector fires on them?
> >Should you fix that instead?
> 
> These tasks are implemented by the SoC vendor, and normally they do
> not configure HUNG TASK, so we need to ignore these tasks if we use
> HUNG TASK. 

Then raise a bug against their crap software, don't try and work around
it in the kernel.

We're not going to upstream workarounds for crap vendor software.


Re: [PATCH 00/13] [RFC] Rust support

2021-04-19 Thread Peter Zijlstra
On Sat, Apr 17, 2021 at 04:51:58PM +0200, Paolo Bonzini wrote:
> On 16/04/21 09:09, Peter Zijlstra wrote:
> > Well, the obvious example would be seqlocks. C11 can't do them
> 
> Sure it can.  C11 requires annotating with (the equivalent of) READ_ONCE all
> reads of seqlock-protected fields, but the memory model supports seqlocks
> just fine.

How does that help?

IIRC there's two problems, one on each side the lock. On the write side
we have:

seq++;
smp_wmb();
X = r;
Y = r;
smp_wmb();
seq++;

Which C11 simply cannot do right because it doesn't have wmb. You end up
having to use seq_cst for the first wmb or make both X and Y (on top of
the last seq) a store-release, both options are sub-optimal.

On the read side you get:

do {
  s = seq;
  smp_rmb();
  r1 = X;
  r2 = Y;
  smp_rmb();
} while ((s&1) || seq != s);

And then you get into trouble with the last barrier: the first seq load
can be a load-acquire, which orders the loads of X and Y after it, but
those loads also need to happen before the second seq load, for which you
then need seq_cst, or you make X and Y load-acquire. Again, not optimal.

I have also seen *many* broken variants of it on the web. Some work on
x86 but are totally broken when you build them on LL/SC ARM64.
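
For reference, the kernel-side idiom hides those barriers behind the
seqcount API rather than open-coding them; roughly as below (writers still
need external serialization):

static seqcount_t seq = SEQCNT_ZERO(seq);
static int X, Y;

void writer(void)
{
	write_seqcount_begin(&seq);	/* seq++ (odd) + smp_wmb() */
	X = 1;
	Y = 2;
	write_seqcount_end(&seq);	/* smp_wmb() + seq++ (even again) */
}

void reader(int *r1, int *r2)
{
	unsigned int s;

	do {
		s = read_seqcount_begin(&seq);	/* waits for even seq, then smp_rmb() */
		*r1 = X;
		*r2 = Y;
	} while (read_seqcount_retry(&seq, s));	/* smp_rmb(), then recheck seq == s */
}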



Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.

2021-04-17 Thread Peter Zijlstra
On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote:
> eg:
> In the Android system, we usually add some processes to the whitelist.
> static task_t task_whitelist[] = {
>   {"mdrt_thread", HUNG_TASK_WHITELIST},
>   {"chre_kthread", HUNG_TASK_WHITELIST},
>   {"scp_power_reset", HUNG_TASK_WHITELIST},
>   {"ccci_fsm1", HUNG_TASK_WHITELIST},
>   {"qos_ipi_recv", HUNG_TASK_WHITELIST},
>   {NULL, 0},
> };

What are these tasks doing that the hung-task detector fires on them?
Should you fix that instead?


Re: [PATCH 00/13] [RFC] Rust support

2021-04-17 Thread Peter Zijlstra
On Sat, Apr 17, 2021 at 01:46:23PM +0200, Willy Tarreau wrote:
> For me the old trick of casting one side as long long still works:
> 
>   unsigned long long mul3264(unsigned int a, unsigned int b)
>   {
> return (unsigned long long)a * b;
>   }
> 
> i386:
>    :
>  0: 8b 44 24 08   mov0x8(%esp),%eax
>  4: f7 64 24 04   mull   0x4(%esp)
>  8: c3ret
> 
> x86_64:
>    :
>  0: 89 f8 mov%edi,%eax
>  2: 89 f7 mov%esi,%edi
>  4: 48 0f af c7   imul   %rdi,%rax
>  8: c3retq   
> 
> Or maybe you had something else in mind ?

Last time I tried it, the thing refused :/ which is how we ended up with
mul_u32_u32() in asm.
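
For context, mul_u32_u32() has a plain C fallback using exactly the cast
trick above, with an x86 override in asm so the compiler cannot pessimize
the widening multiply. Roughly, from memory (see include/linux/math64.h and
arch/x86/include/asm/div64.h for the real thing):

/* generic fallback, used when the arch does not provide its own */
#ifndef mul_u32_u32
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	return (u64)a * b;
}
#endif

/* x86 override: force a single widening mul */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	u32 high, low;

	asm ("mull %[b]" : "=a" (low), "=d" (high)
			 : [a] "a" (a), [b] "rm" (b) );

	return low | ((u64)high) << 32;
}
#define mul_u32_u32 mul_u32_u32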


Re: [tip:sched/core 23/23] kernel/sched/topology.c:17:2: error: 'sched_debug_verbose' undeclared; did you mean 'sched_debug_setup'?

2021-04-17 Thread Peter Zijlstra
On Sat, Apr 17, 2021 at 11:06:28AM +0200, Borislav Petkov wrote:
> On Sat, Apr 17, 2021 at 02:47:46PM +0800, kernel test robot wrote:
> >kernel/sched/topology.c: In function 'sched_debug_setup':
> > >> kernel/sched/topology.c:17:2: error: 'sched_debug_verbose' undeclared 
> > >> (first use in this function); did you mean 'sched_debug_setup'?
> >   17 |  sched_debug_verbose = true;
> 
> I guess it needs this:
> 
> ---
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 55232db745ac..916bb96ff9a6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2364,6 +2364,7 @@ extern struct sched_entity *__pick_last_entity(struct 
> cfs_rq *cfs_rq);
>  
>  #ifdef   CONFIG_SCHED_DEBUG
>  extern bool sched_debug_enabled;
> +extern bool sched_debug_verbose;
>  
>  extern void print_cfs_stats(struct seq_file *m, int cpu);
>  extern void print_rt_stats(struct seq_file *m, int cpu);
> 

Right, sorry for messing that up. Pushed out that commit with this
folded. If it gives any more trouble just kill the commit, it's the
topmost commit anyway.


[tip: sched/core] sched/debug: Rename the sched_debug parameter to sched_verbose

2021-04-17 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 9406415f46f6127fd31bb66f0260f7a61a8d2786
Gitweb:
https://git.kernel.org/tip/9406415f46f6127fd31bb66f0260f7a61a8d2786
Author:        Peter Zijlstra 
AuthorDate:    Thu, 15 Apr 2021 18:23:17 +02:00
Committer: Peter Zijlstra 
CommitterDate: Sat, 17 Apr 2021 13:22:44 +02:00

sched/debug: Rename the sched_debug parameter to sched_verbose

CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param
sched_debug and the /debug/sched/debug_enabled knobs control the
sched_debug_enabled variable, but what they really do is make
SCHED_DEBUG more verbose, so rename the lot.

Signed-off-by: Peter Zijlstra (Intel) 
---
 Documentation/admin-guide/kernel-parameters.txt |  2 +-
 Documentation/scheduler/sched-domains.rst   | 10 +-
 kernel/sched/debug.c|  4 ++--
 kernel/sched/sched.h|  2 +-
 kernel/sched/topology.c | 12 ++--
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 0454572..9e4c026 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4725,7 +4725,7 @@
 
sbni=   [NET] Granch SBNI12 leased line adapter
 
-   sched_debug [KNL] Enables verbose scheduler debug messages.
+   sched_verbose   [KNL] Enables verbose scheduler debug messages.
 
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
diff --git a/Documentation/scheduler/sched-domains.rst 
b/Documentation/scheduler/sched-domains.rst
index 8582fa5..14ea2f2 100644
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -74,8 +74,8 @@ for a given topology level by creating a 
sched_domain_topology_level array and
 calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
-tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
-knob. This enables an error checking parse of the sched domains which should
-catch most possible errors (described above). It also prints out the domain
-structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+forgot to tweak your cmdline, you can also flip the
+/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
+the sched domains which should catch most possible errors (described above). It
+also prints out the domain structure in a visual format.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index bf199d6..461342f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -275,7 +275,7 @@ static const struct file_operations sched_dynamic_fops = {
 
 #endif /* CONFIG_PREEMPT_DYNAMIC */
 
-__read_mostly bool sched_debug_enabled;
+__read_mostly bool sched_debug_verbose;
 
 static const struct seq_operations sched_debug_sops;
 
@@ -300,7 +300,7 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);
 
debugfs_create_file("features", 0644, debugfs_sched, NULL, 
&sched_feat_fops);
-   debugfs_create_bool("debug_enabled", 0644, debugfs_sched, 
&sched_debug_enabled);
+   debugfs_create_bool("verbose", 0644, debugfs_sched, 
&sched_debug_verbose);
 #ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, 
&sched_dynamic_fops);
 #endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 55232db..bde7248 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2363,7 +2363,7 @@ extern struct sched_entity *__pick_first_entity(struct 
cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
 #ifdef CONFIG_SCHED_DEBUG
-extern bool sched_debug_enabled;
+extern bool sched_debug_verbose;
 
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c343aed..55a0a24 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2;
 
 static int __init sched_debug_setup(char *str)
 {
-   sched_debug_enabled = true;
+   sched_debug_verbose = true;
 
return 0;
 }
-early_param("sched_debug", sched_debug_setup);
+early_param("sched_verbose", sched_debug_setup);
 
 static inline bool sched_debug(void)
 {
-   return sched_debug_enabled;
+   return sched_debug_verbose;
 }
 
 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags,

Re: [PATCH 09/15] x86/alternatives: Use C int3 selftest but disable KASAN

2021-04-17 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 01:38:38PM -0700, Sami Tolvanen wrote:
> From: Kees Cook 
> 
> Instead of using inline asm for the int3 selftest (which confuses the
> Clang's ThinLTO pass), this restores the C function but disables KASAN
> (and tracing for good measure) to keep the things simple and avoid
> unexpected side-effects. This attempts to keep the fix from commit
> ecc606103837 ("x86/alternatives: Fix int3_emulate_call() selftest stack
> corruption") without using inline asm.
> 
> Signed-off-by: Kees Cook 
> Signed-off-by: Sami Tolvanen 
> ---
>  arch/x86/kernel/alternative.c | 21 -
>  1 file changed, 4 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index 6974b5174495..669a23454c09 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -496,23 +496,10 @@ extern struct paravirt_patch_site 
> __start_parainstructions[],
>   *
>   * See entry_{32,64}.S for more details.
>   */
> -
> -/*
> - * We define the int3_magic() function in assembly to control the calling
> - * convention such that we can 'call' it from assembly.
> - */
> -
> -extern void int3_magic(unsigned int *ptr); /* defined in asm */
> -
> -asm (
> -".pushsection.init.text, \"ax\", @progbits\n"
> -".type   int3_magic, @function\n"
> -"int3_magic:\n"
> -"movl$1, (%" _ASM_ARG1 ")\n"
> -"ret\n"
> -".size   int3_magic, .-int3_magic\n"
> -".popsection\n"
> -);
> +static void __init __no_sanitize_address notrace int3_magic(unsigned int 
> *ptr)
> +{
> + *ptr = 1;
> +}

I really don't like this. The compiler is free to mess this up in all
sorts of ways.

The problem is that the call-site does not respect the regular calling
convention, so calling a regular C function is just asking for trouble.
That is why it ended up being asm: there we fully control the calling
convention.
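
For context, the "call" in question is synthesized by the #BP handler
rather than emitted by the compiler. A simplified sketch of that side
(checks trimmed, name of the notifier approximate) looks like:

/*
 * Simplified sketch: the selftest plants an INT3 and the #BP notifier
 * "calls" int3_magic() by emulating a CALL on the trapping pt_regs.
 * Nothing sets up caller-saved registers, stack alignment or a frame;
 * only the single pointer argument is live, which is why the callee
 * needs to be trivial asm with a known convention.
 */
static int notrace int3_selftest_notify(struct notifier_block *self,
					unsigned long val, void *data)
{
	struct die_args *args = data;

	if (val != DIE_INT3)
		return NOTIFY_DONE;

	int3_emulate_call(args->regs, (unsigned long)&int3_magic);
	return NOTIFY_STOP;
}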


Re: [PATCH 00/13] [RFC] Rust support

2021-04-17 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 07:08:29PM +0100, Matthew Wilcox wrote:
> On Fri, Apr 16, 2021 at 07:18:48PM +0200, Peter Zijlstra wrote:
> > On Fri, Apr 16, 2021 at 07:10:17PM +0200, Miguel Ojeda wrote:
> > 
> > > Of course, UB is only a subset of errors, but it is a major one, and
> > > particularly critical for privileged code.
> > 
> > I've seen relatively few UBSAN warnings that weren't due to UBSAN being
> > broken.
> 
> Lucky you.
> 
> 84c34df158cf215b0cd1475ab3b8e6f212f81f23
> 
> (i'd argue this is C being broken; promoting only as far as int, when
> assigning to an unsigned long is Bad, but until/unless either GCC fixes
> that or the language committee realises that being stuck in the 1970s
> is Bad, people are going to keep making this kind of mistake)

Well, I think the rules actually make sense: at the point in the syntax
tree where + happens, we have 'unsigned char' and 'int', so we promote
to 'int'. Subsequently the 'int' gets shifted and bad things happen.

The 'unsigned long' doesn't happen until quite a bit later.

Anyway, the rules are imo fairly clear and logical, but yes they can be
annoying. The really silly thing here is that << and >> have UB at all,
and I would love a -fwrapv style flag that simply defines it. Yes it
will generate worse code in some cases, but having the UB there is just
stupid.

That of course doesn't help your case here, it would simply misbehave
and not be UB.

Another thing the C rules cannot really express is a 32x32->64
multiplication; some (older) versions of GCC can be tricked into it, but
mostly it just doesn't want to do that sanely and the C rules are
absolutely no help there.
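
To make both pitfalls concrete, a small stand-alone example (illustrative
only, not taken from the commit referenced above):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned char hi = 0x80;
	uint32_t a = 0x10000, b = 0x10000;

	/*
	 * 'hi' is promoted to (signed) int before the shift, so hi << 24
	 * overflows int: undefined behaviour.  If it "works", the negative
	 * int then sign-extends when converted to the wider unsigned type.
	 */
	uint64_t bad  = hi << 24;		/* UB, typically 0xffffffff80000000 */
	uint64_t good = (uint64_t)hi << 24;	/* 0x80000000, as intended */

	/*
	 * Same story for 32x32->64: the product is computed in 32 bits and
	 * only then widened, so one operand must be cast up front.
	 */
	uint64_t p32 = a * b;			/* wraps to 0 */
	uint64_t p64 = (uint64_t)a * b;		/* 0x100000000 */

	printf("%llx %llx %llx %llx\n",
	       (unsigned long long)bad, (unsigned long long)good,
	       (unsigned long long)p32, (unsigned long long)p64);
	return 0;
}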


Re: [PATCH v5] sched/debug: Use sched_debug_lock to serialize use of cgroup_path[] only

2021-04-16 Thread Peter Zijlstra
On Thu, Apr 15, 2021 at 03:54:26PM -0400, Waiman Long wrote:
> The handling of sysrq key can be activated by echoing the key to
> /proc/sysrq-trigger or via the magic key sequence typed into a terminal
> that is connected to the system in some way (serial, USB or other mean).
> In the former case, the handling is done in a user context. In the
> latter case, it is likely to be in an interrupt context.
> 
> Currently in print_cpu() of kernel/sched/debug.c, sched_debug_lock is
> taken with interrupts disabled for the whole duration of the calls to
> print_*_stats() and print_rq(), which could last for quite some time
> if the information dump happens on the serial console.
> 
> If the system has many cpus and the sched_debug_lock is somehow busy
> (e.g. parallel sysrq-t), the system may hit a hard lockup panic
> depending on the actually serial console implementation of the
> system. For instance,
> 
...
> 
> The purpose of sched_debug_lock is to serialize the use of the global
> cgroup_path[] buffer in print_cpu(). The rest of the printk calls don't
> need serialization from sched_debug_lock.
> 
> Calling printk() with interrupt disabled can still be problematic if
> multiple instances are running. Allocating a stack buffer of PATH_MAX
> bytes is not feasible because of the limited size of the kernel stack.
> 
> The solution implemented in this patch is to allow only one caller at a
> time to use the full size group_path[], while other simultaneous callers
> will have to use shorter stack buffers with the possibility of path
> name truncation. A "..." suffix will be printed if truncation may have
> happened.  The cgroup path name is provided for informational purpose
> only, so occasional path name truncation should not be a big problem.
> 
> Fixes: efe25c2c7b3a ("sched: Reinstate group names in /proc/sched_debug")
> Suggested-by: Peter Zijlstra 
> Signed-off-by: Waiman Long 

Thanks!
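
For reference, the scheme described in the changelog boils down to
something like the following sketch (approximate names and locking, not
the actual kernel/sched/debug.c change):

/*
 * Sketch only: one caller at a time gets the PATH_MAX buffer; a
 * contending caller falls back to a small on-stack buffer and marks
 * possible truncation with "...".
 */
static DEFINE_SPINLOCK(sched_debug_lock);
static char group_path[PATH_MAX];

static void print_group_path(struct seq_file *m, struct task_group *tg)
{
	char buf[128];

	if (spin_trylock(&sched_debug_lock)) {
		cgroup_path(tg->css.cgroup, group_path, sizeof(group_path));
		seq_printf(m, " %s", group_path);
		spin_unlock(&sched_debug_lock);
	} else {
		/* cgroup_path() returns the untruncated length. */
		int len = cgroup_path(tg->css.cgroup, buf, sizeof(buf));

		seq_printf(m, " %s%s", buf,
			   len >= (int)sizeof(buf) ? "..." : "");
	}
}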


Re: [PATCH V2] sched,psi: fix the 'int' underflow for psi

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 11:11:04AM -0400, Johannes Weiner wrote:
> On Fri, Apr 16, 2021 at 08:32:16PM +0530, Charan Teja Reddy wrote:
> > psi_group_cpu->tasks, represented by the unsigned int, stores the number
> > of tasks that could be stalled on a psi resource(io/mem/cpu).
> > Decrementing these counters at zero leads to wrapping which further
> > leads to the psi_group_cpu->state_mask is being set with the respective
> > pressure state. This could result into the unnecessary time sampling for
> > the pressure state thus cause the spurious psi events. This can further
> > lead to wrong actions being taken at the user land based on these psi
> > events.
> > Though psi_bug is set under these conditions but that just for debug
> > purpose. Fix it by decrementing the ->tasks count only when it is
> > non-zero.
> > 
> > Signed-off-by: Charan Teja Reddy 
> 
> The subject should be
> 
>   psi: handle potential task count underflow bugs more gracefully

Done.

> since it's not fixing a known bug at this point. Otherwise,
> 
> Acked-by: Johannes Weiner 

Thanks!




Re: [PATCH 00/13] [RFC] Rust support

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 07:10:17PM +0200, Miguel Ojeda wrote:

> Of course, UB is only a subset of errors, but it is a major one, and
> particularly critical for privileged code.

I've seen relatively few UBSAN warnings that weren't due to UBSAN being
broken.


Re: [PATCH v2] locking/qrwlock: Fix ordering in queued_write_lock_slowpath

2021-04-16 Thread Peter Zijlstra



I've edited the thing to look like so. I'll go queue it for
locking/urgent.

---
Subject: locking/qrwlock: Fix ordering in queued_write_lock_slowpath()
From: Ali Saidi 
Date: Thu, 15 Apr 2021 17:27:11 +

From: Ali Saidi 

While this code is executed with the wait_lock held, a reader can
acquire the lock without holding wait_lock.  The writer side loops
checking the value with the atomic_cond_read_acquire(), but only truly
acquires the lock when the compare-and-exchange is completed
successfully which isn’t ordered. This exposes the window between the
acquire and the cmpxchg to an A-B-A problem which allows reads
following the lock acquisition to observe values speculatively before
the write lock is truly acquired.

We've seen a problem in epoll where the reader does a xchg while
holding the read lock, but the writer can see a value change out from
under it.

  Writer                               | Reader
  -------------------------------------|-------------------------------------
  ep_scan_ready_list()                 |
  |- write_lock_irq()                  |
     |- queued_write_lock_slowpath()   |
        |- atomic_cond_read_acquire()  |
                                       | read_lock_irqsave(&ep->lock, flags);
   --> (observes value before unlock)  |  chain_epi_lockless()
   |                                   |    epi->next = xchg(&ep->ovflist, epi);
   |                                   |  read_unlock_irqrestore(&ep->lock, flags);
   |                                   |
   | atomic_cmpxchg_relaxed()          |
   |-- READ_ONCE(ep->ovflist);         |

A core can order the read of the ovflist ahead of the
atomic_cmpxchg_relaxed(). Switching the cmpxchg to use acquire
semantics addresses this issue at which point the atomic_cond_read can
be switched to use relaxed semantics.

Fixes: b519b56e378ee ("locking/qrwlock: Use atomic_cond_read_acquire() when 
spinning in qrwlock")
Signed-off-by: Ali Saidi 
[peterz: use try_cmpxchg()]
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Steve Capper 
Acked-by: Will Deacon 
Tested-by: Steve Capper 
---
 kernel/locking/qrwlock.c |7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath)
  */
 void queued_write_lock_slowpath(struct qrwlock *lock)
 {
+   int cnts;
+
/* Put the writer into the wait queue */
	arch_spin_lock(&lock->wait_lock);
 
@@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct q
 
/* When no more readers or writers, set the locked flag */
do {
-		atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
-	} while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
-					_QW_LOCKED) != _QW_WAITING);
+		cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
+	} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
 unlock:
	arch_spin_unlock(&lock->wait_lock);
 }
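
For readers unfamiliar with the try_cmpxchg() form used above, its
semantics are roughly equivalent to this helper (a sketch, not the arch
implementation):

/*
 * Returns true on success; on failure it updates *old with the value
 * actually observed, so the retry loop does not need a separate reload.
 */
static inline bool my_try_cmpxchg_acquire(atomic_t *v, int *old, int new)
{
	int seen = atomic_cmpxchg_acquire(v, *old, new);

	if (seen == *old)
		return true;

	*old = seen;
	return false;
}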


Re: [tip: perf/core] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 03:01:48PM -, tip-bot2 for Kan Liang wrote:
> @@ -2331,6 +2367,9 @@ static void x86_pmu_event_unmapped(struct perf_event 
> *event, struct mm_struct *m
>   if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
>   return;
>  
> + if (x86_pmu.sched_task && event->hw.target)
> + perf_sched_cb_dec(event->ctx->pmu);
> +
>  	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
>   on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);
>  }

'perf test' on a kernel with CONFIG_DEBUG_PREEMPT=y gives:

[  244.439538] BUG: using smp_processor_id() in preemptible [] code: 
perf/1771
[  244.448144] caller is perf_sched_cb_dec+0xa/0x70
[  244.453314] CPU: 28 PID: 1771 Comm: perf Not tainted 
5.12.0-rc3-00026-gb04c0cddff6d #595
[  244.462347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS 
SE5C600.86B.02.02.0002.122320131210 12/23/2013
[  244.473804] Call Trace:
[  244.476535]  dump_stack+0x6d/0x89
[  244.480237]  check_preemption_disabled+0xc8/0xd0
[  244.485394]  perf_sched_cb_dec+0xa/0x70
[  244.489677]  x86_pmu_event_unmapped+0x35/0x60
[  244.494541]  perf_mmap_close+0x76/0x580
[  244.498833]  remove_vma+0x31/0x70
[  244.502535]  __do_munmap+0x2e8/0x4e0
[  244.506531]  __vm_munmap+0x7e/0x120
[  244.510429]  __x64_sys_munmap+0x28/0x30
[  244.514712]  do_syscall_64+0x33/0x80
[  244.518701]  entry_SYSCALL_64_after_hwframe+0x44/0xae


Obviously I tested that after I pushed it out :/



[tip: sched/core] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: b5c4477366fb5e6a2f0f38742c33acd666c07698
Gitweb:
https://git.kernel.org/tip/b5c4477366fb5e6a2f0f38742c33acd666c07698
Author:Peter Zijlstra 
AuthorDate:Thu, 21 Jan 2021 16:09:32 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00

sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

Use the new cpu_dying() state to simplify and fix the balance_push()
vs CPU hotplug rollback state.

Specifically, we currently rely on notifiers sched_cpu_dying() /
sched_cpu_activate() to terminate balance_push, however if the
cpu_down() fails when we're past sched_cpu_deactivate(), it should
terminate balance_push at that point and not wait until we hit
sched_cpu_activate().

Similarly, when cpu_up() fails and we're going back down, balance_push
should be active, where it currently is not.

So instead, make sure balance_push is enabled below SCHED_AP_ACTIVE
(when !cpu_active()), and gate its utility with cpu_dying().

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/yhgayef83vqhk...@hirez.programming.kicks-ass.net
---
 kernel/sched/core.c  | 26 +++---
 kernel/sched/sched.h |  1 -
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 95bd6ab..7d031da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1811,7 +1811,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, 
int cpu)
return cpu_online(cpu);
 
/* Regular kernel threads don't get to stay during offline. */
-   if (cpu_rq(cpu)->balance_push)
+   if (cpu_dying(cpu))
return false;
 
/* But are allowed during online. */
@@ -7638,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
 
 /*
  * Ensure we only run per-cpu kthreads once the CPU goes !active.
+ *
+ * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
+ * effective when the hotplug motion is down.
  */
 static void balance_push(struct rq *rq)
 {
@@ -7645,12 +7648,19 @@ static void balance_push(struct rq *rq)
 
	lockdep_assert_held(&rq->lock);
SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
/*
 * Ensure the thing is persistent until balance_push_set(.on = false);
 */
	rq->balance_callback = &balance_push_callback;
 
/*
+* Only active while going offline.
+*/
+   if (!cpu_dying(rq->cpu))
+   return;
+
+   /*
 * Both the cpu-hotplug and stop task are in this case and are
 * required to complete the hotplug process.
 *
@@ -7703,7 +7713,6 @@ static void balance_push_set(int cpu, bool on)
struct rq_flags rf;
 
	rq_lock_irqsave(rq, &rf);
-   rq->balance_push = on;
if (on) {
WARN_ON_ONCE(rq->balance_callback);
		rq->balance_callback = &balance_push_callback;
@@ -7828,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu)
struct rq_flags rf;
 
/*
-* Make sure that when the hotplug state machine does a roll-back
-* we clear balance_push. Ideally that would happen earlier...
+* Clear the balance_push callback and prepare to schedule
+* regular tasks.
 */
balance_push_set(cpu, false);
 
@@ -8014,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu)
}
	rq_unlock_irqrestore(rq, &rf);
 
-   /*
-* Now that the CPU is offline, make sure we're welcome
-* to new tasks once we come back up.
-*/
-   balance_push_set(cpu, false);
-
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
@@ -8204,7 +8207,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-   rq->balance_callback = NULL;
+		rq->balance_callback = &balance_push_callback;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -8251,6 +8254,7 @@ void __init sched_init(void)
 
 #ifdef CONFIG_SMP
idle_thread_set_boot_cpu();
+   balance_push_set(smp_processor_id(), false);
 #endif
init_sched_fair_class();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cbb0b01..7e7e936 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -983,7 +983,6 @@ struct rq {
unsigned long   cpu_capacity_orig;
 
struct callback_head*balance_callback;
-   unsigned char   balance_push;
 
unsigned char   nohz_idle_balance;
unsigned char   idle_balance;


[tip: sched/core] cpumask: Make cpu_{online,possible,present,active}() inline

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: b02a4fd8148f655095d9e3d6eddd8f0042bcc27c
Gitweb:
https://git.kernel.org/tip/b02a4fd8148f655095d9e3d6eddd8f0042bcc27c
Author:Peter Zijlstra 
AuthorDate:Mon, 25 Jan 2021 16:46:49 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00

cpumask: Make cpu_{online,possible,present,active}() inline

Prepare for addition of another mask. Primarily a code movement to
avoid having to create more #ifdef, but while there, convert
everything with an argument to an inline function.

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210310150109.045447...@infradead.org
---
 include/linux/cpumask.h | 97 +++-
 1 file changed, 66 insertions(+), 31 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 383684e..a584336 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -98,37 +98,6 @@ extern struct cpumask __cpu_active_mask;
 
 extern atomic_t __num_online_cpus;
 
-#if NR_CPUS > 1
-/**
- * num_online_cpus() - Read the number of online CPUs
- *
- * Despite the fact that __num_online_cpus is of type atomic_t, this
- * interface gives only a momentary snapshot and is not protected against
- * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
- * region.
- */
-static inline unsigned int num_online_cpus(void)
-{
-   return atomic_read(&__num_online_cpus);
-}
-#define num_possible_cpus()cpumask_weight(cpu_possible_mask)
-#define num_present_cpus() cpumask_weight(cpu_present_mask)
-#define num_active_cpus()  cpumask_weight(cpu_active_mask)
-#define cpu_online(cpu)cpumask_test_cpu((cpu), cpu_online_mask)
-#define cpu_possible(cpu)  cpumask_test_cpu((cpu), cpu_possible_mask)
-#define cpu_present(cpu)   cpumask_test_cpu((cpu), cpu_present_mask)
-#define cpu_active(cpu)cpumask_test_cpu((cpu), cpu_active_mask)
-#else
-#define num_online_cpus()  1U
-#define num_possible_cpus()1U
-#define num_present_cpus() 1U
-#define num_active_cpus()  1U
-#define cpu_online(cpu)((cpu) == 0)
-#define cpu_possible(cpu)  ((cpu) == 0)
-#define cpu_present(cpu)   ((cpu) == 0)
-#define cpu_active(cpu)((cpu) == 0)
-#endif
-
 extern cpumask_t cpus_booted_once_mask;
 
 static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
@@ -894,6 +863,72 @@ static inline const struct cpumask *get_cpu_mask(unsigned 
int cpu)
return to_cpumask(p);
 }
 
+#if NR_CPUS > 1
+/**
+ * num_online_cpus() - Read the number of online CPUs
+ *
+ * Despite the fact that __num_online_cpus is of type atomic_t, this
+ * interface gives only a momentary snapshot and is not protected against
+ * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
+ * region.
+ */
+static inline unsigned int num_online_cpus(void)
+{
+   return atomic_read(&__num_online_cpus);
+}
+#define num_possible_cpus()cpumask_weight(cpu_possible_mask)
+#define num_present_cpus() cpumask_weight(cpu_present_mask)
+#define num_active_cpus()  cpumask_weight(cpu_active_mask)
+
+static inline bool cpu_online(unsigned int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_online_mask);
+}
+
+static inline bool cpu_possible(unsigned int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_possible_mask);
+}
+
+static inline bool cpu_present(unsigned int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_present_mask);
+}
+
+static inline bool cpu_active(unsigned int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_active_mask);
+}
+
+#else
+
+#define num_online_cpus()  1U
+#define num_possible_cpus()1U
+#define num_present_cpus() 1U
+#define num_active_cpus()  1U
+
+static inline bool cpu_online(unsigned int cpu)
+{
+   return cpu == 0;
+}
+
+static inline bool cpu_possible(unsigned int cpu)
+{
+   return cpu == 0;
+}
+
+static inline bool cpu_present(unsigned int cpu)
+{
+   return cpu == 0;
+}
+
+static inline bool cpu_active(unsigned int cpu)
+{
+   return cpu == 0;
+}
+
+#endif /* NR_CPUS > 1 */
+
 #define cpu_is_offline(cpu)unlikely(!cpu_online(cpu))
 
 #if NR_CPUS <= BITS_PER_LONG


[tip: sched/core] cpumask: Introduce DYING mask

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: e40f74c535b8a0ecf3ef0388b51a34cdadb34fb5
Gitweb:
https://git.kernel.org/tip/e40f74c535b8a0ecf3ef0388b51a34cdadb34fb5
Author:Peter Zijlstra 
AuthorDate:Tue, 19 Jan 2021 18:43:45 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00

cpumask: Introduce DYING mask

Introduce a cpumask that indicates (for each CPU) what direction the
CPU hotplug is currently going. Notably, it tracks rollbacks. Eg. when
an up fails and we do a roll-back down, it will accurately reflect the
direction.

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210310150109.151441...@infradead.org
---
 include/linux/cpumask.h | 20 
 kernel/cpu.c|  6 ++
 2 files changed, 26 insertions(+)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index a584336..e6b948a 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -91,10 +91,12 @@ extern struct cpumask __cpu_possible_mask;
 extern struct cpumask __cpu_online_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_dying_mask;
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_dying_mask((const struct cpumask *)&__cpu_dying_mask)
 
 extern atomic_t __num_online_cpus;
 
@@ -826,6 +828,14 @@ set_cpu_active(unsigned int cpu, bool active)
cpumask_clear_cpu(cpu, &__cpu_active_mask);
 }
 
+static inline void
+set_cpu_dying(unsigned int cpu, bool dying)
+{
+   if (dying)
+   cpumask_set_cpu(cpu, &__cpu_dying_mask);
+   else
+   cpumask_clear_cpu(cpu, &__cpu_dying_mask);
+}
 
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
@@ -900,6 +910,11 @@ static inline bool cpu_active(unsigned int cpu)
return cpumask_test_cpu(cpu, cpu_active_mask);
 }
 
+static inline bool cpu_dying(unsigned int cpu)
+{
+   return cpumask_test_cpu(cpu, cpu_dying_mask);
+}
+
 #else
 
 #define num_online_cpus()  1U
@@ -927,6 +942,11 @@ static inline bool cpu_active(unsigned int cpu)
return cpu == 0;
 }
 
+static inline bool cpu_dying(unsigned int cpu)
+{
+   return false;
+}
+
 #endif /* NR_CPUS > 1 */
 
 #define cpu_is_offline(cpu)unlikely(!cpu_online(cpu))
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 23505d6..838dcf2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -160,6 +160,9 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum 
cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
 
+   if (cpu_dying(cpu) != !bringup)
+   set_cpu_dying(cpu, !bringup);
+
if (st->fail == state) {
st->fail = CPUHP_INVALID;
return -EAGAIN;
@@ -2512,6 +2515,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+struct cpumask __cpu_dying_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_dying_mask);
+
 atomic_t __num_online_cpus __read_mostly;
 EXPORT_SYMBOL(__num_online_cpus);
 


[tip: sched/core] sched: Move SCHED_DEBUG sysctl to debugfs

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 8a99b6833c884fa0e7919030d93fecedc69fc625
Gitweb:
https://git.kernel.org/tip/8a99b6833c884fa0e7919030d93fecedc69fc625
Author:Peter Zijlstra 
AuthorDate:Wed, 24 Mar 2021 11:43:21 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00

sched: Move SCHED_DEBUG sysctl to debugfs

Stop polluting sysctl with undocumented knobs that really are debug
only, move them all to /debug/sched/ along with the existing
/debug/sched_* files that already exist.

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Greg Kroah-Hartman 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.287610...@infradead.org
---
 include/linux/sched/sysctl.h |  8 +--
 kernel/sched/core.c  |  4 +-
 kernel/sched/debug.c | 74 +--
 kernel/sched/fair.c  |  9 +---
 kernel/sched/sched.h |  2 +-
 kernel/sysctl.c  | 96 +---
 6 files changed, 80 insertions(+), 113 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba8..0a3f346 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, 
int write,
 enum { sysctl_hung_task_timeout_secs = 0 };
 #endif
 
+extern unsigned int sysctl_sched_child_runs_first;
+
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_child_runs_first;
 
 enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END,
 };
-extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
+extern unsigned int sysctl_sched_tunable_scaling;
 
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
@@ -47,9 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
-
-int sched_proc_update_handler(struct ctl_table *table, int write,
-   void *buffer, size_t *length, loff_t *ppos);
 #endif
 
 /*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7d031da..bac30db 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5504,9 +5504,11 @@ static const struct file_operations sched_dynamic_fops = 
{
.release= single_release,
 };
 
+extern struct dentry *debugfs_sched;
+
 static __init int sched_init_debug_dynamic(void)
 {
-   debugfs_create_file("sched_preempt", 0644, NULL, NULL, 
_dynamic_fops);
+   debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, 
_dynamic_fops);
return 0;
 }
 late_initcall(sched_init_debug_dynamic);
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4b49cc2..2093b90 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -169,15 +169,81 @@ static const struct file_operations sched_feat_fops = {
.release= single_release,
 };
 
+#ifdef CONFIG_SMP
+
+static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
+  size_t cnt, loff_t *ppos)
+{
+   char buf[16];
+
+   if (cnt > 15)
+   cnt = 15;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+   return -EFAULT;
+
+	if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+   return -EINVAL;
+
+   if (sched_update_scaling())
+   return -EINVAL;
+
+   *ppos += cnt;
+   return cnt;
+}
+
+static int sched_scaling_show(struct seq_file *m, void *v)
+{
+   seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
+   return 0;
+}
+
+static int sched_scaling_open(struct inode *inode, struct file *filp)
+{
+   return single_open(filp, sched_scaling_show, NULL);
+}
+
+static const struct file_operations sched_scaling_fops = {
+   .open   = sched_scaling_open,
+   .write  = sched_scaling_write,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= single_release,
+};
+
+#endif /* SMP */
+
 __read_mostly bool sched_debug_enabled;
 
+struct dentry *debugfs_sched;
+
 static __init int sched_init_debug(void)
 {
-   debugfs_create_file("sched_features", 0644, NULL, NULL,
-   _feat_fops);
+   struct dentry __maybe_unused *numa;
 
-   debugfs_create_bool("sched_debug", 0644, NULL,
-   _debug_enabled);
+   debugfs_sched = debugfs_create_dir("sched", NULL);
+
+   debugfs_create_file("features", 0644, debugfs_sch

[tip: sched/core] sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 1d1c2509de4488cc58c924d0a6117c62de1d4f9c
Gitweb:
https://git.kernel.org/tip/1d1c2509de4488cc58c924d0a6117c62de1d4f9c
Author:Peter Zijlstra 
AuthorDate:Wed, 24 Mar 2021 19:47:43 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00

sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG

CONFIG_SCHEDSTATS does not depend on SCHED_DEBUG, it is inconsistent
to have the sysctl depend on it.

Suggested-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.161151...@infradead.org
---
 kernel/sysctl.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8042098..17f1cc9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1711,17 +1711,6 @@ static struct ctl_table kern_table[] = {
.mode   = 0644,
.proc_handler   = proc_dointvec,
},
-#ifdef CONFIG_SCHEDSTATS
-   {
-   .procname   = "sched_schedstats",
-   .data   = NULL,
-   .maxlen = sizeof(unsigned int),
-   .mode   = 0644,
-   .proc_handler   = sysctl_schedstats,
-   .extra1 = SYSCTL_ZERO,
-   .extra2 = SYSCTL_ONE,
-   },
-#endif /* CONFIG_SCHEDSTATS */
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
{
@@ -1755,6 +1744,17 @@ static struct ctl_table kern_table[] = {
},
 #endif /* CONFIG_NUMA_BALANCING */
 #endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHEDSTATS
+   {
+   .procname   = "sched_schedstats",
+   .data   = NULL,
+   .maxlen = sizeof(unsigned int),
+   .mode   = 0644,
+   .proc_handler   = sysctl_schedstats,
+   .extra1 = SYSCTL_ZERO,
+   .extra2 = SYSCTL_ONE,
+   },
+#endif /* CONFIG_SCHEDSTATS */
 #ifdef CONFIG_NUMA_BALANCING
{
.procname   = "numa_balancing",


[tip: sched/core] sched,preempt: Move preempt_dynamic to debug.c

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 1011dcce99f8026d48fdd7b9cc259e32a8b472be
Gitweb:
https://git.kernel.org/tip/1011dcce99f8026d48fdd7b9cc259e32a8b472be
Author:Peter Zijlstra 
AuthorDate:Thu, 25 Mar 2021 12:21:38 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00

sched,preempt: Move preempt_dynamic to debug.c

Move the #ifdef SCHED_DEBUG bits to kernel/sched/debug.c in order to
collect all the debugfs bits.

Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.353833...@infradead.org
---
 kernel/sched/core.c  | 77 +--
 kernel/sched/debug.c | 67 -
 kernel/sched/sched.h | 11 --
 3 files changed, 78 insertions(+), 77 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bac30db..e6c714b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5371,9 +5371,9 @@ enum {
preempt_dynamic_full,
 };
 
-static int preempt_dynamic_mode = preempt_dynamic_full;
+int preempt_dynamic_mode = preempt_dynamic_full;
 
-static int sched_dynamic_mode(const char *str)
+int sched_dynamic_mode(const char *str)
 {
if (!strcmp(str, "none"))
return preempt_dynamic_none;
@@ -5387,7 +5387,7 @@ static int sched_dynamic_mode(const char *str)
return -EINVAL;
 }
 
-static void sched_dynamic_update(int mode)
+void sched_dynamic_update(int mode)
 {
/*
 * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
@@ -5444,79 +5444,8 @@ static int __init setup_preempt_mode(char *str)
 }
 __setup("preempt=", setup_preempt_mode);
 
-#ifdef CONFIG_SCHED_DEBUG
-
-static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
-  size_t cnt, loff_t *ppos)
-{
-   char buf[16];
-   int mode;
-
-   if (cnt > 15)
-   cnt = 15;
-
-	if (copy_from_user(&buf, ubuf, cnt))
-   return -EFAULT;
-
-   buf[cnt] = 0;
-   mode = sched_dynamic_mode(strstrip(buf));
-   if (mode < 0)
-   return mode;
-
-   sched_dynamic_update(mode);
-
-   *ppos += cnt;
-
-   return cnt;
-}
-
-static int sched_dynamic_show(struct seq_file *m, void *v)
-{
-   static const char * preempt_modes[] = {
-   "none", "voluntary", "full"
-   };
-   int i;
-
-   for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
-   if (preempt_dynamic_mode == i)
-   seq_puts(m, "(");
-   seq_puts(m, preempt_modes[i]);
-   if (preempt_dynamic_mode == i)
-   seq_puts(m, ")");
-
-   seq_puts(m, " ");
-   }
-
-   seq_puts(m, "\n");
-   return 0;
-}
-
-static int sched_dynamic_open(struct inode *inode, struct file *filp)
-{
-   return single_open(filp, sched_dynamic_show, NULL);
-}
-
-static const struct file_operations sched_dynamic_fops = {
-   .open   = sched_dynamic_open,
-   .write  = sched_dynamic_write,
-   .read   = seq_read,
-   .llseek = seq_lseek,
-   .release= single_release,
-};
-
-extern struct dentry *debugfs_sched;
-
-static __init int sched_init_debug_dynamic(void)
-{
-   debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, 
_dynamic_fops);
-   return 0;
-}
-late_initcall(sched_init_debug_dynamic);
-
-#endif /* CONFIG_SCHED_DEBUG */
 #endif /* CONFIG_PREEMPT_DYNAMIC */
 
-
 /*
  * This is the entry point to schedule() from kernel preemption
  * off of irq context.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2093b90..bdd344f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -213,9 +213,71 @@ static const struct file_operations sched_scaling_fops = {
 
 #endif /* SMP */
 
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
+  size_t cnt, loff_t *ppos)
+{
+   char buf[16];
+   int mode;
+
+   if (cnt > 15)
+   cnt = 15;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+   return -EFAULT;
+
+   buf[cnt] = 0;
+   mode = sched_dynamic_mode(strstrip(buf));
+   if (mode < 0)
+   return mode;
+
+   sched_dynamic_update(mode);
+
+   *ppos += cnt;
+
+   return cnt;
+}
+
+static int sched_dynamic_show(struct seq_file *m, void *v)
+{
+   static const char * preempt_modes[] = {
+   "none", "voluntary", "full"
+   };
+   int i;
+
+   for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+   if (preempt_dynamic_mode == i)
+   seq_puts(m, "(");
+  

[tip: sched/core] sched: Don't make LATENCYTOP select SCHED_DEBUG

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: d86ba831656611872e4939b895503ddac63d8196
Gitweb:
https://git.kernel.org/tip/d86ba831656611872e4939b895503ddac63d8196
Author:Peter Zijlstra 
AuthorDate:Wed, 24 Mar 2021 19:48:34 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00

sched: Don't make LATENCYTOP select SCHED_DEBUG

SCHED_DEBUG is not in fact required for LATENCYTOP, don't select it.

Suggested-by: Mel Gorman 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.224578...@infradead.org
---
 lib/Kconfig.debug | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2779c29..5f98376 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1670,7 +1670,6 @@ config LATENCYTOP
select KALLSYMS_ALL
select STACKTRACE
select SCHEDSTATS
-   select SCHED_DEBUG
help
  Enable this option if you want to use the LatencyTOP tool
  to find out which userspace is blocking on what kernel operations.


[tip: sched/core] sched,fair: Alternative sched_slice()

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 0c2de3f054a59f15e01804b75a04355c48de628c
Gitweb:
https://git.kernel.org/tip/0c2de3f054a59f15e01804b75a04355c48de628c
Author:Peter Zijlstra 
AuthorDate:Thu, 25 Mar 2021 13:44:46 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00

sched,fair: Alternative sched_slice()

The current sched_slice() seems to have issues; there's two possible
things that could be improved:

 - the 'nr_running' used for __sched_period() is daft when cgroups are
   considered. Using the RQ wide h_nr_running seems like a much more
   consistent number.

 - (esp) cgroups can slice it real fine, which makes for easy
   over-scheduling, ensure min_gran is what the name says.

Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.611897...@infradead.org
---
 kernel/sched/fair.c | 12 +++-
 kernel/sched/features.h |  3 +++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3ea14c..49636a4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -687,7 +687,13 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-   u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
+   unsigned int nr_running = cfs_rq->nr_running;
+   u64 slice;
+
+   if (sched_feat(ALT_PERIOD))
+   nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
+
+   slice = __sched_period(nr_running + !se->on_rq);
 
for_each_sched_entity(se) {
struct load_weight *load;
@@ -704,6 +710,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
}
slice = __calc_delta(slice, se->load.weight, load);
}
+
+   if (sched_feat(BASE_SLICE))
+   slice = max(slice, (u64)sysctl_sched_min_granularity);
+
return slice;
 }
 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 422fa68..011c5ec 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -90,3 +90,6 @@ SCHED_FEAT(WA_BIAS, true)
  */
 SCHED_FEAT(UTIL_EST, true)
 SCHED_FEAT(UTIL_EST_FASTUP, true)
+
+SCHED_FEAT(ALT_PERIOD, true)
+SCHED_FEAT(BASE_SLICE, true)


[tip: sched/core] sched/debug: Rename the sched_debug parameter to sched_verbose

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: a1b93fc0377e73dd54f819a993f83291324bb54a
Gitweb:
https://git.kernel.org/tip/a1b93fc0377e73dd54f819a993f83291324bb54a
Author:Peter Zijlstra 
AuthorDate:Thu, 15 Apr 2021 18:23:17 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00

sched/debug: Rename the sched_debug parameter to sched_verbose

CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param
sched_debug and the /debug/sched/debug_enabled knobs control the
sched_debug_enabled variable, but what they really do is make
SCHED_DEBUG more verbose, so rename the lot.

Signed-off-by: Peter Zijlstra (Intel) 
---
 Documentation/admin-guide/kernel-parameters.txt |  2 +-
 Documentation/scheduler/sched-domains.rst   | 10 +-
 kernel/sched/debug.c|  4 ++--
 kernel/sched/topology.c | 12 ++--
 4 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 0454572..9e4c026 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4725,7 +4725,7 @@
 
sbni=   [NET] Granch SBNI12 leased line adapter
 
-   sched_debug [KNL] Enables verbose scheduler debug messages.
+   sched_verbose   [KNL] Enables verbose scheduler debug messages.
 
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
diff --git a/Documentation/scheduler/sched-domains.rst 
b/Documentation/scheduler/sched-domains.rst
index 8582fa5..14ea2f2 100644
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -74,8 +74,8 @@ for a given topology level by creating a 
sched_domain_topology_level array and
 calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
-tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
-knob. This enables an error checking parse of the sched domains which should
-catch most possible errors (described above). It also prints out the domain
-structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+forgot to tweak your cmdline, you can also flip the
+/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
+the sched domains which should catch most possible errors (described above). It
+also prints out the domain structure in a visual format.
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index bf199d6..461342f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -275,7 +275,7 @@ static const struct file_operations sched_dynamic_fops = {
 
 #endif /* CONFIG_PREEMPT_DYNAMIC */
 
-__read_mostly bool sched_debug_enabled;
+__read_mostly bool sched_debug_verbose;
 
 static const struct seq_operations sched_debug_sops;
 
@@ -300,7 +300,7 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);
 
debugfs_create_file("features", 0644, debugfs_sched, NULL, 
_feat_fops);
-   debugfs_create_bool("debug_enabled", 0644, debugfs_sched, 
_debug_enabled);
+   debugfs_create_bool("verbose", 0644, debugfs_sched, 
_debug_verbose);
 #ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, 
_dynamic_fops);
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c343aed..55a0a24 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2;
 
 static int __init sched_debug_setup(char *str)
 {
-   sched_debug_enabled = true;
+   sched_debug_verbose = true;
 
return 0;
 }
-early_param("sched_debug", sched_debug_setup);
+early_param("sched_verbose", sched_debug_setup);
 
 static inline bool sched_debug(void)
 {
-   return sched_debug_enabled;
+   return sched_debug_verbose;
 }
 
 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
@@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int 
cpu)
 {
int level = 0;
 
-   if (!sched_debug_enabled)
+   if (!sched_debug_verbose)
return;
 
if (!sd) {
@@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int 
cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 
-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
 # define sched_domain_debug(sd, cpu) do { } while (0)
 static inline bool sched_debug(void)
 {
@@ -2141,7 +2141,7 @@ build_sched_domains(const struct

[tip: sched/core] sched,debug: Convert sysctl sched_domains to debugfs

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 3b87f136f8fccddf7da016ab7d04bb3cf9b180f0
Gitweb:
https://git.kernel.org/tip/3b87f136f8fccddf7da016ab7d04bb3cf9b180f0
Author:Peter Zijlstra 
AuthorDate:Thu, 25 Mar 2021 11:31:20 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00

sched,debug: Convert sysctl sched_domains to debugfs

Stop polluting sysctl, move to debugfs for SCHED_DEBUG stuff.

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Dietmar Eggemann 
Reviewed-by: Valentin Schneider 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/YHgB/s4kcbq1i...@hirez.programming.kicks-ass.net
---
 kernel/sched/debug.c| 254 ---
 kernel/sched/sched.h|  10 +--
 kernel/sched/topology.c |   6 +-
 3 files changed, 59 insertions(+), 211 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index bdd344f..b25de7b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -299,6 +299,10 @@ static __init int sched_init_debug(void)
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, 
_scaling_fops);
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, 
_sched_migration_cost);
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, 
_sched_nr_migrate);
+
+	mutex_lock(&sched_domains_mutex);
+	update_sched_domain_debugfs();
+	mutex_unlock(&sched_domains_mutex);
 #endif
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -316,229 +320,88 @@ late_initcall(sched_init_debug);
 
 #ifdef CONFIG_SMP
 
-#ifdef CONFIG_SYSCTL
-
-static struct ctl_table sd_ctl_dir[] = {
-   {
-   .procname   = "sched_domain",
-   .mode   = 0555,
-   },
-   {}
-};
-
-static struct ctl_table sd_ctl_root[] = {
-   {
-   .procname   = "kernel",
-   .mode   = 0555,
-   .child  = sd_ctl_dir,
-   },
-   {}
-};
-
-static struct ctl_table *sd_alloc_ctl_entry(int n)
-{
-   struct ctl_table *entry =
-   kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
-
-   return entry;
-}
-
-static void sd_free_ctl_entry(struct ctl_table **tablep)
-{
-   struct ctl_table *entry;
-
-   /*
-* In the intermediate directories, both the child directory and
-* procname are dynamically allocated and could fail but the mode
-* will always be set. In the lowest directory the names are
-* static strings and all have proc handlers.
-*/
-   for (entry = *tablep; entry->mode; entry++) {
-   if (entry->child)
-			sd_free_ctl_entry(&entry->child);
-   if (entry->proc_handler == NULL)
-   kfree(entry->procname);
-   }
-
-   kfree(*tablep);
-   *tablep = NULL;
-}
-
-static void
-set_table_entry(struct ctl_table *entry,
-   const char *procname, void *data, int maxlen,
-   umode_t mode, proc_handler *proc_handler)
-{
-   entry->procname = procname;
-   entry->data = data;
-   entry->maxlen = maxlen;
-   entry->mode = mode;
-   entry->proc_handler = proc_handler;
-}
+static cpumask_var_t   sd_sysctl_cpus;
+static struct dentry   *sd_dentry;
 
-static int sd_ctl_doflags(struct ctl_table *table, int write,
- void *buffer, size_t *lenp, loff_t *ppos)
+static int sd_flags_show(struct seq_file *m, void *v)
 {
-   unsigned long flags = *(unsigned long *)table->data;
-   size_t data_size = 0;
-   size_t len = 0;
-   char *tmp, *buf;
+   unsigned long flags = *(unsigned int *)m->private;
int idx;
 
-   if (write)
-   return 0;
-
-	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
-   char *name = sd_flag_debug[idx].name;
-
-   /* Name plus whitespace */
-   data_size += strlen(name) + 1;
-   }
-
-   if (*ppos > data_size) {
-   *lenp = 0;
-   return 0;
-   }
-
-   buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
-   if (!buf)
-   return -ENOMEM;
-
	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
-   char *name = sd_flag_debug[idx].name;
-
-   len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
-   }
-
-   tmp = buf + *ppos;
-   len -= *ppos;
-
-   if (len > *lenp)
-   len = *lenp;
-   if (len)
-   memcpy(buffer, tmp, len);
-   if (len < *lenp) {
-   ((char *)buffer)[len] = '\n';
-   len++;
+   seq_puts(m, sd_flag_debug[idx].name);
+   seq_puts(m, " ");
}
-
-   *lenp = len;
-   *ppos += len;
-
-   kfree(buf);
+   seq_puts(m, "\n");
 
ret

[tip: sched/core] debugfs: Implement debugfs_create_str()

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: 9af0440ec86ebdab075e1b3d231f81fe7decb575
Gitweb:
https://git.kernel.org/tip/9af0440ec86ebdab075e1b3d231f81fe7decb575
Author:Peter Zijlstra 
AuthorDate:Thu, 25 Mar 2021 10:53:55 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00

debugfs: Implement debugfs_create_str()

Implement debugfs_create_str() to easily display names and such in
debugfs.

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Greg Kroah-Hartman 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.415407...@infradead.org
---
 fs/debugfs/file.c   | 91 -
 include/linux/debugfs.h | 17 +++-
 2 files changed, 108 insertions(+)

diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 686e0ad..9b78e9e 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -865,6 +865,97 @@ struct dentry *debugfs_create_bool(const char *name, 
umode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_bool);
 
+ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+   struct dentry *dentry = F_DENTRY(file);
+   char *str, *copy = NULL;
+   int copy_len, len;
+   ssize_t ret;
+
+   ret = debugfs_file_get(dentry);
+   if (unlikely(ret))
+   return ret;
+
+   str = *(char **)file->private_data;
+   len = strlen(str) + 1;
+   copy = kmalloc(len, GFP_KERNEL);
+   if (!copy) {
+   debugfs_file_put(dentry);
+   return -ENOMEM;
+   }
+
+   copy_len = strscpy(copy, str, len);
+   debugfs_file_put(dentry);
+   if (copy_len < 0) {
+   kfree(copy);
+   return copy_len;
+   }
+
+   copy[copy_len] = '\n';
+
+   ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len);
+   kfree(copy);
+
+   return ret;
+}
+
+static ssize_t debugfs_write_file_str(struct file *file, const char __user 
*user_buf,
+ size_t count, loff_t *ppos)
+{
+   /* This is really only for read-only strings */
+   return -EINVAL;
+}
+
+static const struct file_operations fops_str = {
+   .read = debugfs_read_file_str,
+   .write =debugfs_write_file_str,
+   .open = simple_open,
+   .llseek =   default_llseek,
+};
+
+static const struct file_operations fops_str_ro = {
+   .read = debugfs_read_file_str,
+   .open = simple_open,
+   .llseek =   default_llseek,
+};
+
+static const struct file_operations fops_str_wo = {
+   .write =debugfs_write_file_str,
+   .open = simple_open,
+   .llseek =   default_llseek,
+};
+
+/**
+ * debugfs_create_str - create a debugfs file that is used to read and write a 
string value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *  directory dentry if set.  If this parameter is %NULL, then the
+ *  file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value.  If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, ERR_PTR(-ERROR) will be
+ * returned.
+ *
+ * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
+ * be returned.
+ */
+void debugfs_create_str(const char *name, umode_t mode,
+   struct dentry *parent, char **value)
+{
+	debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str,
+				   &fops_str_ro, &fops_str_wo);
+}
+
 static ssize_t read_file_blob(struct file *file, char __user *user_buf,
  size_t count, loff_t *ppos)
 {
diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h
index d6c4cc9..1fdb434 100644
--- a/include/linux/debugfs.h
+++ b/include/linux/debugfs.h
@@ -128,6 +128,8 @@ void debugfs_create_atomic_t(const char *name, umode_t mode,
 struct dentry *parent, atomic_t *value);
 struct dentry *debugfs_create_bool(const char *name, umode_t mode,
  struct dentry *parent, bool *value);
+void debugfs_create_str(const char *name, umode_t mode,
+   struct dentry *parent, char **value);
 
 struct dentry *debugfs_create_blob
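
A minimal caller-side sketch of the new interface (directory and variable
names made up for illustration):

/* Expose a read-only string under /sys/kernel/debug/demo/status. */
static char *demo_status = "initialized";

static int __init demo_debugfs_init(void)
{
	struct dentry *dir = debugfs_create_dir("demo", NULL);

	debugfs_create_str("status", 0444, dir, &demo_status);
	return 0;
}
late_initcall(demo_debugfs_init);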

[tip: sched/core] sched: Move /proc/sched_debug to debugfs

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the sched/core branch of tip:

Commit-ID: d27e9ae2f244805bbdc730d85fba28685d2471e5
Gitweb:
https://git.kernel.org/tip/d27e9ae2f244805bbdc730d85fba28685d2471e5
Author:Peter Zijlstra 
AuthorDate:Thu, 25 Mar 2021 15:18:19 +01:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00

sched: Move /proc/sched_debug to debugfs

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Greg Kroah-Hartman 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/20210412102001.548833...@infradead.org
---
 kernel/sched/debug.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index b25de7b..bf199d6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -277,6 +277,20 @@ static const struct file_operations sched_dynamic_fops = {
 
 __read_mostly bool sched_debug_enabled;
 
+static const struct seq_operations sched_debug_sops;
+
+static int sched_debug_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &sched_debug_sops);
+}
+
+static const struct file_operations sched_debug_fops = {
+   .open   = sched_debug_open,
+   .read   = seq_read,
+   .llseek = seq_lseek,
+   .release= seq_release,
+};
+
 static struct dentry *debugfs_sched;
 
 static __init int sched_init_debug(void)
@@ -314,6 +328,8 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_size_mb", 0644, numa, 
_numa_balancing_scan_size);
 #endif
 
+   debugfs_create_file("debug", 0444, debugfs_sched, NULL, 
_debug_fops);
+
return 0;
 }
 late_initcall(sched_init_debug);
@@ -847,15 +863,6 @@ static const struct seq_operations sched_debug_sops = {
.show   = sched_debug_show,
 };
 
-static int __init init_sched_debug_procfs(void)
-{
-	if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
-   return -ENOMEM;
-   return 0;
-}
-
-__initcall(init_sched_debug_procfs);
-
 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
 #define __P(F) __PS(#F, F)
 #define   P(F) __PS(#F, p->F)


Re: [PATCH 00/13] [RFC] Rust support

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 05:04:41PM +0200, Miguel Ojeda wrote:
> Of course, we could propose something similar for C -- in fact, there
> was a recent discussion around this in the C committee triggered by my
> n2659 "Safety attributes for C" paper. 

http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2659.htm

That's just not making any damn sense whatsoever. That seems to be
about sprinkling abort() all over the place, which is just total
rubbish.


Re: liburcu: LTO breaking rcu_dereference on arm64 and possibly other architectures ?

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 10:52:16AM -0400, Mathieu Desnoyers wrote:
> Hi Paul, Will, Peter,
> 
> I noticed in this discussion https://lkml.org/lkml/2021/4/16/118 that LTO
> is able to break rcu_dereference. This seems to be taken care of by
> arch/arm64/include/asm/rwonce.h on arm64 in the Linux kernel tree.
> 
> In the liburcu user-space library, we have this comment near 
> rcu_dereference() in
> include/urcu/static/pointer.h:
> 
>  * The compiler memory barrier in CMM_LOAD_SHARED() ensures that 
> value-speculative
>  * optimizations (e.g. VSS: Value Speculation Scheduling) does not perform the
>  * data read before the pointer read by speculating the value of the pointer.
>  * Correct ordering is ensured because the pointer is read as a volatile 
> access.
>  * This acts as a global side-effect operation, which forbids reordering of
>  * dependent memory operations. Note that such concern about 
> dependency-breaking
>  * optimizations will eventually be taken care of by the 
> "memory_order_consume"
>  * addition to forthcoming C++ standard.
> 
> (note: CMM_LOAD_SHARED() is the equivalent of READ_ONCE(), but was introduced 
> in
> liburcu as a public API before READ_ONCE() existed in the Linux kernel)
> 
> Peter tells me the "memory_order_consume" is not something which can be used 
> today.
> Any information on its status at C/C++ standard levels and 
> implementation-wise ?
> 
> Pragmatically speaking, what should we change in liburcu to ensure we don't 
> generate
> broken code when LTO is enabled ? I suspect there are a few options here:
> 
> 1) Fail to build if LTO is enabled,
> 2) Generate slower code for rcu_dereference, either on all architectures or 
> only
>on weakly-ordered architectures,
> 3) Generate different code depending on whether LTO is enabled or not. AFAIU 
> this would only
>work if every compile unit is aware that it will end up being optimized 
> with LTO. Not sure
>how this could be done in the context of user-space.
> 4) [ Insert better idea here. ]
> 
> Thoughts ?

Using memory_order_acquire is safe; and is basically what Will did for
ARM64.

The problematic transformations are possible even without LTO, although
less likely due to less visibility, but everybody agrees they're
possible and allowed.

OTOH we do not have a positive sighting of it actually happening (I
think); we're all just being cautious and not willing to debug the
resulting wreckage if it does indeed happen.
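
Concretely, the acquire-based option for liburcu would look roughly like
this (a sketch using the GCC/Clang __atomic builtins, not the actual
liburcu code):

/*
 * rcu_dereference() backed by an acquire load instead of a volatile
 * load plus compiler barrier.  Stronger than memory_order_consume is
 * meant to be, but immune to dependency-breaking optimizations,
 * including under LTO.
 */
#define my_rcu_dereference(p)	__atomic_load_n(&(p), __ATOMIC_ACQUIRE)

struct foo { int a; };
static struct foo *gp;

static int reader(void)
{
	struct foo *f = my_rcu_dereference(gp);

	return f ? f->a : -1;
}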



Re: [PATCH v2] sched: Warn on long periods of pending need_resched

2021-04-16 Thread Peter Zijlstra
On Tue, Mar 30, 2021 at 03:44:12PM -0700, Josh Don wrote:
> Peter,
> 
> Since you've already pulled the need_resched warning patch into your
> tree, I'm including just the diff based on that patch (in response to
> Mel's comments) below. This should be squashed into the original
> patch.
> 

Sorry, I seem to have missed this. The patch came through completely
whitespace-mangled.

I've attached my latest copy of the patch and held it back for now,
please resubmit.

---
Subject: sched: Warn on long periods of pending need_resched
From: Paul Turner 
Date: Mon, 22 Mar 2021 20:57:06 -0700

From: Paul Turner 

CPU scheduler marks need_resched flag to signal a schedule() on a
particular CPU. But, schedule() may not happen immediately in cases
where the current task is executing in the kernel mode (no
preemption state) for extended periods of time.

This patch adds a warn_on if need_resched is pending for more than the
time specified in sysctl resched_latency_warn_ms. If it goes off, it is
likely that there is a missing cond_resched() somewhere. Monitoring is
done via the tick and the accuracy is hence limited to jiffy scale. This
also means that we won't trigger the warning if the tick is disabled.

This feature is disabled by default. It can be toggled on using the sysctl
resched_latency_warn_enabled.

Signed-off-by: Paul Turner 
Signed-off-by: Josh Don 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20210323035706.572953-1-josh...@google.com
---

 include/linux/sched/sysctl.h |3 +
 kernel/sched/core.c  |   75 ++-
 kernel/sched/debug.c |   13 +++
 kernel/sched/features.h  |2 +
 kernel/sched/sched.h |   10 +
 5 files changed, 102 insertions(+), 1 deletion(-)

--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancin
 #ifdef CONFIG_SCHED_DEBUG
 extern __read_mostly unsigned int sysctl_sched_migration_cost;
 extern __read_mostly unsigned int sysctl_sched_nr_migrate;
+
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
 #endif
 
 /*
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -58,7 +58,21 @@ const_debug unsigned int sysctl_sched_fe
 #include "features.h"
0;
 #undef SCHED_FEAT
-#endif
+
+/*
+ * Print a warning if need_resched is set for the given duration (if
+ * resched_latency_warn_enabled is set).
+ *
+ * If sysctl_resched_latency_warn_once is set, only one warning will be shown
+ * per boot.
+ *
+ * Resched latency will be ignored for the first resched_boot_quiet_sec, to
+ * reduce false alarms.
+ */
+int sysctl_resched_latency_warn_ms = 100;
+int sysctl_resched_latency_warn_once = 1;
+static const long resched_boot_quiet_sec = 600;
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -4527,6 +4541,56 @@ unsigned long long task_sched_runtime(st
return ns;
 }
 
+#ifdef CONFIG_SCHED_DEBUG
+static u64 resched_latency_check(struct rq *rq)
+{
+   int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
+   u64 need_resched_latency, now = rq_clock(rq);
+   static bool warned_once;
+
+   if (sysctl_resched_latency_warn_once && warned_once)
+   return 0;
+
+   if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2))
+   return 0;
+
+   /* Disable this warning for the first few mins after boot */
+   if (now < resched_boot_quiet_sec * NSEC_PER_SEC)
+   return 0;
+
+   if (!rq->last_seen_need_resched_ns) {
+   rq->last_seen_need_resched_ns = now;
+   rq->ticks_without_resched = 0;
+   return 0;
+   }
+
+   rq->ticks_without_resched++;
+   need_resched_latency = now - rq->last_seen_need_resched_ns;
+   if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
+   return 0;
+
+   warned_once = true;
+
+   return need_resched_latency;
+}
+
+static int __init setup_resched_latency_warn_ms(char *str)
+{
+   long val;
+
+   if ((kstrtol(str, 0, &val))) {
+   pr_warn("Unable to set resched_latency_warn_ms\n");
+   return 1;
+   }
+
+   sysctl_resched_latency_warn_ms = val;
+   return 1;
+}
+__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
+#else
+static inline u64 resched_latency_check(struct rq *rq) { return 0; }
+#endif /* CONFIG_SCHED_DEBUG */
+
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
@@ -4538,6 +4602,7 @@ void scheduler_tick(void)
struct task_struct *curr = rq->curr;
struct rq_flags rf;
unsigned long thermal_pressure;
+   u64 resched_latency;
 
arch_scale_freq_tick();
sched_clock_tick();
@@ -4548,10 +4613,15 @@ void

[tip: perf/core] perf: Rework perf_event_exit_event()

2021-04-16 Thread tip-bot2 for Peter Zijlstra
The following commit has been merged into the perf/core branch of tip:

Commit-ID: ef54c1a476aef7eef26fe13ea10dc090952c00f8
Gitweb:
https://git.kernel.org/tip/ef54c1a476aef7eef26fe13ea10dc090952c00f8
Author:Peter Zijlstra 
AuthorDate:Thu, 08 Apr 2021 12:35:56 +02:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 16 Apr 2021 16:32:40 +02:00

perf: Rework perf_event_exit_event()

Make perf_event_exit_event() more robust, such that we can use it from
other contexts. Specifically the upcoming remove_on_exec.

For this to work we need to address a few issues. Remove_on_exec will
not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to
disable event_function_call() and we thus have to use
perf_remove_from_context().

When using perf_remove_from_context(), there's two races to consider.
The first is against close(), where we can have concurrent tear-down
of the event. The second is against child_list iteration, which should
not find a half baked event.

To address this, teach perf_remove_from_context() to special case
!ctx->is_active and about DETACH_CHILD.

[ el...@google.com: fix racing parent/child exit in sync_child_event(). ]
Signed-off-by: Marco Elver 
Signed-off-by: Peter Zijlstra (Intel) 
Link: https://lkml.kernel.org/r/20210408103605.1676875-2-el...@google.com
---
 include/linux/perf_event.h |   1 +-
 kernel/events/core.c   | 142 
 2 files changed, 80 insertions(+), 63 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3f7f89e..3d478ab 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -607,6 +607,7 @@ struct swevent_hlist {
 #define PERF_ATTACH_TASK_DATA  0x08
 #define PERF_ATTACH_ITRACE 0x10
 #define PERF_ATTACH_SCHED_CB   0x20
+#define PERF_ATTACH_CHILD  0x40
 
 struct perf_cgroup;
 struct perf_buffer;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f079431..318ff7b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2205,6 +2205,26 @@ out:
perf_event__header_size(leader);
 }
 
+static void sync_child_event(struct perf_event *child_event);
+
+static void perf_child_detach(struct perf_event *event)
+{
+   struct perf_event *parent_event = event->parent;
+
+   if (!(event->attach_state & PERF_ATTACH_CHILD))
+   return;
+
+   event->attach_state &= ~PERF_ATTACH_CHILD;
+
+   if (WARN_ON_ONCE(!parent_event))
+   return;
+
+   lockdep_assert_held(&parent_event->child_mutex);
+
+   sync_child_event(event);
+   list_del_init(&event->child_list);
+}
+
 static bool is_orphaned_event(struct perf_event *event)
 {
return event->state == PERF_EVENT_STATE_DEAD;
@@ -2312,6 +2332,7 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP   0x01UL
+#define DETACH_CHILD   0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -2335,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event,
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
+   if (flags & DETACH_CHILD)
+   perf_child_detach(event);
list_del_event(event, ctx);
 
if (!ctx->nr_events && ctx->is_active) {
@@ -2363,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event 
*event, unsigned long fla
 
lockdep_assert_held(&ctx->mutex);
 
-   event_function_call(event, __perf_remove_from_context, (void *)flags);
-
/*
-* The above event_function_call() can NO-OP when it hits
-* TASK_TOMBSTONE. In that case we must already have been detached
-* from the context (by perf_event_exit_event()) but the grouping
-* might still be in-tact.
+* Because of perf_event_exit_task(), perf_remove_from_context() ought
+* to work in the face of TASK_TOMBSTONE, unlike every other
+* event_function_call() user.
 */
-   WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
-   if ((flags & DETACH_GROUP) &&
-   (event->attach_state & PERF_ATTACH_GROUP)) {
-   /*
-* Since in that case we cannot possibly be scheduled, simply
-* detach now.
-*/
-   raw_spin_lock_irq(&ctx->lock);
-   perf_group_detach(event);
+   raw_spin_lock_irq(&ctx->lock);
+   if (!ctx->is_active) {
+   __perf_remove_from_context(event, __get_cpu_context(ctx),
+  ctx, (void *)flags);
raw_spin_unlock_irq(&ctx->lock);
+   return;
}
+   raw_spin_unlock_irq(&ctx->lock);
+
+   event_function_call(event, __perf_remove_from_context, (void *)flags);
 }
 
 /*
@@ -12377,14 +12396,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int 
src_cpu, int dst_cpu)
 }
 EXPORT_SYMBO

Re: [PATCH 00/13] [RFC] Rust support

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 02:07:49PM +0100, Wedson Almeida Filho wrote:
> On Fri, Apr 16, 2021 at 01:24:23PM +0200, Peter Zijlstra wrote:

> >  int perf_event_task_enable(void)
> >  {
> > +   DEFINE_MUTEX_GUARD(event_mutex, &current->perf_event_mutex);
> 
> There is nothing in C forcing developers to actually use DEFINE_MUTEX_GUARD. 
> So
> someone may simply forget (or not know that they need) to lock
> current->perf_event_mutex and directly access some field protected by it. This
> is unlikely to happen when one first writes the code, but over time as 
> different
> people modify the code and invariants change, it is possible for this to 
> happen.
> 
> In Rust, this isn't possible: the data protected by a lock is only accessible
> when the lock is locked. So developers cannot accidentally make mistakes of 
> this
> kind. And since the enforcement happens at compile time, there is no runtime
> cost.
> 
> This, we believe, is fundamental to the discussion: we agree that many of 
> these
> idioms can be implemented in C (albeit in this case with a compiler 
> extension),
> but their use is optional, people can (and do) still make mistakes that lead 
> to
> vulnerabilities; Rust disallows classes of  mistakes by construction.

Does this also not prohibit constructs where modification must be done
while holding two locks, but reading can be done while holding either
lock?

That's a semi common scheme in the kernel, but not something that's
expressible by, for example, the Java sync keyword.
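
For concreteness, a hypothetical sketch of that scheme (made-up struct and
function names, kernel-style C): writers take both locks, so a reader
holding either one of them sees a stable value:

#include <linux/spinlock.h>

struct twolock_obj {
	spinlock_t a;
	spinlock_t b;
	int val;
};

/* modification requires both locks */
static void twolock_write(struct twolock_obj *o, int v)
{
	spin_lock(&o->a);
	spin_lock(&o->b);
	o->val = v;
	spin_unlock(&o->b);
	spin_unlock(&o->a);
}

/* reading requires only one of the two locks; 'b' is used here */
static int twolock_read(struct twolock_obj *o)
{
	int v;

	spin_lock(&o->b);
	v = o->val;
	spin_unlock(&o->b);
	return v;
}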

It also very much doesn't work for RCU, where modification must be done
under a lock, but access is done essentially lockless.

I would much rather have a language extension where we can associate
custom assertions with variable access, sorta like a sanitizer:

static inline void assert_foo_bar(struct foo *f)
{
lockdep_assert_held(&f->lock);
}

struct foo {
spinlock_t lock;
int bar __assert__(assert_foo_bar);
};

Such things can be optional and only enabled for debug builds on new
compilers.

> Another scenario: suppose within perf_event_task_enable you need to call a
> function that requires the mutex to be locked and that will unlock it for you 
> on
> error (or unconditionally, doesn't matter). How would you do that in C? In 
> Rust,
> there is a clean idiomatic way of transferring ownership of a guard (or any
> other object) such that the previous owner cannot continue to use it after
> ownership is transferred. Again, this is enforced at compile time. I'm happy 
> to
> provide a small example if that would help.

C does indeed not have the concept of ownership, unlike modern C++ I
think. But I would much rather see a C language extention for that than
go Rust.

This would mean a far more agressive push for newer C compilers than
we've ever done before, but at least it would all still be a single
language. Convertion to the new stuff can be done gradually and where
it makes sense and new extentions can be evaluated on performance impact
etc.



Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 09:19:08PM +0900, Namhyung Kim wrote:
> > Are you actually using mmap() to read? I had a proposal for FORMAT_GROUP
> > like thing for mmap(), but I never implemented that (didn't get the
> > enthusiastic response I thought it would). But yeah, there's nowhere
> > near enough space in there for PERCPU.
> 
> Recently there's a patch to do it with rdpmc which needs to mmap first.
> 
> https://lore.kernel.org/lkml/20210414155412.3697605-1-r...@kernel.org/

Yeah, I'm not sure about that, I've not looked at it recently. The thing
is though; for RDPMC to work, you *NEED* to be on the same CPU as the
counter.

The typical RDPMC use-case is self monitoring.


Re: [PATCH 04/13] Kbuild: Rust support

2021-04-16 Thread Peter Zijlstra
On Wed, Apr 14, 2021 at 08:45:55PM +0200, oj...@kernel.org wrote:
> diff --git a/scripts/Makefile.build b/scripts/Makefile.build
> index 1b6094a13034..3665c49c4dcf 100644
> --- a/scripts/Makefile.build
> +++ b/scripts/Makefile.build
> @@ -26,6 +26,7 @@ EXTRA_CPPFLAGS :=
>  EXTRA_LDFLAGS  :=
>  asflags-y  :=
>  ccflags-y  :=
> +rustcflags-y :=
>  cppflags-y :=
>  ldflags-y  :=
>  
> @@ -287,6 +288,24 @@ quiet_cmd_cc_lst_c = MKLST   $@
>  $(obj)/%.lst: $(src)/%.c FORCE
>   $(call if_changed_dep,cc_lst_c)
>  
> +# Compile Rust sources (.rs)
> +# ---
> +
> +rustc_cross_flags := --target=$(srctree)/arch/$(SRCARCH)/rust/target.json
> +
> +quiet_cmd_rustc_o_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@
> +  cmd_rustc_o_rs = \
> + RUST_MODFILE=$(modfile) \
> + $(RUSTC_OR_CLIPPY) $(rustc_flags) $(rustc_cross_flags) \
> + --extern alloc --extern kernel \
> + --crate-type rlib --out-dir $(obj) -L $(objtree)/rust/ \
> + --crate-name $(patsubst %.o,%,$(notdir $@)) $<; \
> + mv $(obj)/$(subst .o,,$(notdir $@)).d $(depfile); \
> + sed -i '/^\#/d' $(depfile)
> +
> +$(obj)/%.o: $(src)/%.rs FORCE
> + $(call if_changed_dep,rustc_o_rs)
> +
>  # Compile assembler sources (.S)
>  # ---
>  

So if I read all this right, rust compiles to .o and, like any other .o
file is then fed into objtool (for x86_64). Did you have any problems
with objtool? Does it generate correct ORC unwind information?

AFAICT rust has try/throw/catch exception handling (like
C++/Java/others) which is typically implemented with stack unwinding of
its own.

Does all that interact sanely?


Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 08:22:38PM +0900, Namhyung Kim wrote:
> On Fri, Apr 16, 2021 at 7:28 PM Peter Zijlstra  wrote:
> >
> > On Fri, Apr 16, 2021 at 11:29:30AM +0200, Peter Zijlstra wrote:
> >
> > > > So I think we've had proposals for being able to close fds in the past;
> > > > while preserving groups etc. We've always pushed back on that because of
> > > > the resource limit issue. By having each counter be a filedesc we get a
> > > > natural limit on the amount of resources you can consume. And in that
> > > > respect, having to use 400k fds is things working as designed.
> > > >
> > > > Anyway, there might be a way around this..
> >
> > So how about we flip the whole thing sideways, instead of doing one
> > event for multiple cgroups, do an event for multiple-cpus.
> >
> > Basically, allow:
> >
> > perf_event_open(.pid=fd, cpu=-1, .flag=PID_CGROUP);
> >
> > Which would have the kernel create nr_cpus events [the corollary is that
> > we'd probably also allow: (.pid=-1, cpu=-1) ].
> 
> Do you mean it'd have separate perf_events per cpu internally?
> From a cpu's perspective, there's nothing changed, right?
> Then it will have the same performance problem as of now.

Yes, but we'll not end up in ioctl() hell. The interface is sooo much
better. The performance thing just means we need to think harder.

I thought cgroup scheduling got a lot better with the work Ian did a
while back? What's the actual bottleneck now?

> > Output could be done by adding FORMAT_PERCPU, which takes the current
> > read() format and writes a copy for each CPU event. (p)read(v)() could
> > be used to explode or partially read that.
> 
> Yeah, I think it's good for read.  But what about mmap?
> I don't think we can use file offset since it's taken for auxtrace.
> Maybe we can simply disallow that..

Are you actually using mmap() to read? I had a proposal for FORMAT_GROUP
like thing for mmap(), but I never implemented that (didn't get the
enthusiastic response I thought it would). But yeah, there's nowhere
near enough space in there for PERCPU.

Not sure how to do that, these counters must not be sampling counters
because we can't be sharing a buffer from multiple CPUs, so data/aux
just isn't a concern. But it's weird to have them magically behave
differently.


Re: [PATCH 00/13] [RFC] Rust support

2021-04-16 Thread Peter Zijlstra
On Wed, Apr 14, 2021 at 08:45:51PM +0200, oj...@kernel.org wrote:
>   - Featureful language: sum types, pattern matching, generics,
> RAII, lifetimes, shared & exclusive references, modules &
> visibility, powerful hygienic and procedural macros...

IMO RAII is over-valued, but just in case you care, the below seems to
work just fine. No fancy new language needed, works today. Similarly you
can create refcount_t guards, or with a little more work full blown
smart_ptr crud.

---
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index e19323521f9c..f03a72dd8cea 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -197,4 +197,22 @@ extern void mutex_unlock(struct mutex *lock);
 
 extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
+struct mutex_guard {
+   struct mutex *mutex;
+};
+
+static inline struct mutex_guard mutex_guard_lock(struct mutex *mutex)
+{
+   mutex_lock(mutex);
+   return (struct mutex_guard){ .mutex = mutex, };
+}
+
+static inline void mutex_guard_unlock(struct mutex_guard *guard)
+{
+   mutex_unlock(guard->mutex);
+}
+
+#define DEFINE_MUTEX_GUARD(name, lock) \
+   struct mutex_guard __attribute__((__cleanup__(mutex_guard_unlock))) name = mutex_guard_lock(lock)
+
 #endif /* __LINUX_MUTEX_H */
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 8ee3249de2f0..603d197a83b8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5715,16 +5715,15 @@ static long perf_compat_ioctl(struct file *file, 
unsigned int cmd,
 
 int perf_event_task_enable(void)
 {
+   DEFINE_MUTEX_GUARD(event_mutex, &current->perf_event_mutex);
struct perf_event_context *ctx;
struct perf_event *event;
 
-   mutex_lock(&current->perf_event_mutex);
list_for_each_entry(event, &current->perf_event_list, owner_entry) {
ctx = perf_event_ctx_lock(event);
perf_event_for_each_child(event, _perf_event_enable);
perf_event_ctx_unlock(event, ctx);
}
-   mutex_unlock(&current->perf_event_mutex);
 
return 0;
 }
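
For completeness, the refcount_t guard mentioned above could be sketched
along the same lines; illustrative only, not part of the patch:

#include <linux/refcount.h>

struct refcount_guard {
	refcount_t *ref;
};

static inline struct refcount_guard refcount_guard_get(refcount_t *ref)
{
	refcount_inc(ref);
	return (struct refcount_guard){ .ref = ref, };
}

static inline void refcount_guard_put(struct refcount_guard *guard)
{
	refcount_dec(guard->ref);
}

#define DEFINE_REFCOUNT_GUARD(name, ref) \
	struct refcount_guard __attribute__((__cleanup__(refcount_guard_put))) name = refcount_guard_get(ref)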


Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 11:29:30AM +0200, Peter Zijlstra wrote:

> > So I think we've had proposals for being able to close fds in the past;
> > while preserving groups etc. We've always pushed back on that because of
> > the resource limit issue. By having each counter be a filedesc we get a
> > natural limit on the amount of resources you can consume. And in that
> > respect, having to use 400k fds is things working as designed.
> > 
> > Anyway, there might be a way around this..

So how about we flip the whole thing sideways, instead of doing one
event for multiple cgroups, do an event for multiple-cpus.

Basically, allow:

perf_event_open(.pid=fd, cpu=-1, .flag=PID_CGROUP);

Which would have the kernel create nr_cpus events [the corollary is that
we'd probably also allow: (.pid=-1, cpu=-1) ].

Output could be done by adding FORMAT_PERCPU, which takes the current
read() format and writes a copy for each CPU event. (p)read(v)() could
be used to explode or partially read that.

This gets rid of the nasty variadic nature of the
'get-me-these-n-cgroups'. While still getting rid of the n*m fd issue
you're facing.
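
Purely as illustration, userspace usage of this proposal might look like
the sketch below. PERF_FLAG_PID_CGROUP exists today but requires cpu >= 0,
and PERF_FORMAT_PERCPU does not exist at all; both are shown here with the
proposed semantics, so this is a sketch of the suggested interface, not
working code:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <unistd.h>

#define PERF_FORMAT_PERCPU	(1U << 5)	/* hypothetical read_format bit */

static int open_cgroup_event(const char *cgrp_path)
{
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_HARDWARE,
		.config		= PERF_COUNT_HW_CPU_CYCLES,
		.size		= sizeof(attr),
		.read_format	= PERF_FORMAT_PERCPU,	/* hypothetical */
	};
	int cgrp_fd = open(cgrp_path, O_RDONLY);

	/* proposed: cpu == -1 makes the kernel create nr_cpus events */
	return syscall(__NR_perf_event_open, &attr, cgrp_fd, -1, -1,
		       PERF_FLAG_PID_CGROUP);
}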


Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-16 Thread Peter Zijlstra


Duh.. this is a half-finished email I meant to save for later. Anyway,
I'll reply more.

On Fri, Apr 16, 2021 at 11:26:39AM +0200, Peter Zijlstra wrote:
> On Fri, Apr 16, 2021 at 08:48:12AM +0900, Namhyung Kim wrote:
> > On Thu, Apr 15, 2021 at 11:51 PM Peter Zijlstra  
> > wrote:
> > > On Tue, Apr 13, 2021 at 08:53:36AM -0700, Namhyung Kim wrote:
> 
> > > > cgroup event counting (i.e. perf stat).
> > > >
> > > >  * PERF_EVENT_IOC_ATTACH_CGROUP - it takes a buffer consists of a
> > > >  64-bit array to attach given cgroups.  The first element is a
> > > >  number of cgroups in the buffer, and the rest is a list of cgroup
> > > >  ids to add a cgroup info to the given event.
> > >
> > > WTH is a cgroup-id? The syscall takes a fd to the path, why have two
> > > different ways?
> > 
> > As you know, we already use cgroup-id for sampling.  Yeah we
> > can do it with the fd but one of the point in this patch is to reduce
> > the number of file descriptors. :)
> 
> Well, I found those patches again after I wrote that. But I'm still not
> sure what a cgroup-id is from userspace.
> 
> How does userspace get one given a cgroup? (I actually mounted cgroupfs
> in order to see if there's some new 'id' file to read, there is not)
> Does having the cgroup-id ensure the cgroup exists? Can the cgroup-id
> get re-used?
> 
> I really don't know what the thing is. I don't use cgroups, like ever,
> except when I'm forced to due to some regression or bugreport.
> 
> > Also, having cgroup-id is good to match with the result (from read)
> > as it contains the cgroup information.
> 
> What?
> 
> > > >  * PERF_EVENT_IOC_READ_CGROUP - it takes a buffer consists of a 64-bit
> > > >  array to get the event counter values.  The first element is size
> > > >  of the array in byte, and the second element is a cgroup id to
> > > >  read.  The rest is to save the counter value and timings.
> > >
> > > :-(
> > >
> > So basically you're doing a whole second cgroup interface, one that
> > > violates the one counter per file premise and lives off of ioctl()s.
> > 
> > Right, but I'm not sure that we really want a separate event for each
> > cgroup if underlying hardware events are all the same.
> 
> Sure, I see where you're coming from; I just don't much like where it
> got you :-)
> 
> > > *IF* we're going to do something like this, I feel we should explore the
> > > whole vector-per-fd concept before proceeding. Can we make it less yuck
> > > (less special ioctl() and more regular file ops. Can we apply the
> > > concept to more things?
> > 
> > Ideally it'd do without keeping file descriptors open.  Maybe we can make
> > the vector accept various types like vector-per-cgroup_id or so.
> 
> So I think we've had proposals for being able to close fds in the past;
> while preserving groups etc. We've always pushed back on that because of
> the resource limit issue. By having each counter be a filedesc we get a
> natural limit on the amount of resources you can consume. And in that
> respect, having to use 400k fds is things working as designed.
> 
> Anyway, there might be a way around this..
> 
> > > The second patch extends the ioctl() to be more read() like, instead of
> > > doing the sane things and extending read() by adding PERF_FORMAT_VECTOR
> > > or whatever. In fact, this whole second ioctl() doesn't make sense to
> > > have if we do indeed want to do vector-per-fd.
> > 
> > One of the upside of the ioctl() is that we can pass cgroup-id to read.
> > Probably we can keep the index in the vector and set the file offset
> > with it.  Or else just read the whole vector, and then it has a cgroup-id
> > in the output like PERF_FORMAT_CGROUP?
> > 
> > >
> > > Also, I suppose you can already fake this, by having a
> > > SW_CGROUP_SWITCHES (sorry, I thought I picked those up, done now) event
> > 
> > Thanks!
> > 
> > > with PERF_SAMPLE_READ|PERF_SAMPLE_CGROUP and PERF_FORMAT_GROUP in a
> > > group with a bunch of events. Then the buffer will fill with the values
> > > you use here.
> > 
> > Right, I'll do an experiment with it.
> > 
> > >
> > > Yes, I suppose it has higher overhead, but you get the data you want
> > > without having to do terrible things like this.
> > 
> > That's true.  And we don't need many things in the perf record like
> > synthesizing task/mmap info.  Also there's a risk we can miss some
> > samples for some reason.
> > 
> > Another concern is that it'd add huge slow down in the perf event
> > open as it creates a mixed sw/hw group.  The synchronized_rcu in
> > the move_cgroup path caused significant problems in my
> > environment as it adds up in proportion to the number of cpus.
> 
> Since when is perf_event_open() a performance concern? That thing is
> slow in all possible ways.


Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 08:48:12AM +0900, Namhyung Kim wrote:
> On Thu, Apr 15, 2021 at 11:51 PM Peter Zijlstra  wrote:
> > On Tue, Apr 13, 2021 at 08:53:36AM -0700, Namhyung Kim wrote:

> > > cgroup event counting (i.e. perf stat).
> > >
> > >  * PERF_EVENT_IOC_ATTACH_CGROUP - it takes a buffer consists of a
> > >  64-bit array to attach given cgroups.  The first element is a
> > >  number of cgroups in the buffer, and the rest is a list of cgroup
> > >  ids to add a cgroup info to the given event.
> >
> > WTH is a cgroup-id? The syscall takes a fd to the path, why have two
> > different ways?
> 
> As you know, we already use cgroup-id for sampling.  Yeah we
> can do it with the fd but one of the point in this patch is to reduce
> the number of file descriptors. :)

Well, I found those patches again after I wrote that. But I'm still not
sure what a cgroup-id is from userspace.

How does userspace get one given a cgroup? (I actually mounted cgroupfs
in order to see if there's some new 'id' file to read, there is not)
Does having the cgroup-id ensure the cgroup exists? Can the cgroup-id
get re-used?

I really don't know what the thing is. I don't use cgroups, like ever,
except when I'm forced to due to some regression or bugreport.

> Also, having cgroup-id is good to match with the result (from read)
> as it contains the cgroup information.

What?

> > >  * PERF_EVENT_IOC_READ_CGROUP - it takes a buffer consists of a 64-bit
> > >  array to get the event counter values.  The first element is size
> > >  of the array in byte, and the second element is a cgroup id to
> > >  read.  The rest is to save the counter value and timings.
> >
> > :-(
> >
> > So basically you're doing a whole second cgroup interface, one that
> > violates the one counter per file premise and lives off of ioctl()s.
> 
> Right, but I'm not sure that we really want a separate event for each
> cgroup if underlying hardware events are all the same.

Sure, I see where you're coming from; I just don't much like where it
got you :-)

> > *IF* we're going to do something like this, I feel we should explore the
> > whole vector-per-fd concept before proceeding. Can we make it less yuck
> > (less special ioctl() and more regular file ops. Can we apply the
> > concept to more things?
> 
> Ideally it'd do without keeping file descriptors open.  Maybe we can make
> the vector accept various types like vector-per-cgroup_id or so.

So I think we've had proposals for being able to close fds in the past;
while preserving groups etc. We've always pushed back on that because of
the resource limit issue. By having each counter be a filedesc we get a
natural limit on the amount of resources you can consume. And in that
respect, having to use 400k fds is things working as designed.

Anyway, there might be a way around this..

> > The second patch extends the ioctl() to be more read() like, instead of
> > doing the sane things and extending read() by adding PERF_FORMAT_VECTOR
> > or whatever. In fact, this whole second ioctl() doesn't make sense to
> > have if we do indeed want to do vector-per-fd.
> 
> One of the upside of the ioctl() is that we can pass cgroup-id to read.
> Probably we can keep the index in the vector and set the file offset
> with it.  Or else just read the whole vector, and then it has a cgroup-id
> in the output like PERF_FORMAT_CGROUP?
> 
> >
> > Also, I suppose you can already fake this, by having a
> > SW_CGROUP_SWITCHES (sorry, I thought I picked those up, done now) event
> 
> Thanks!
> 
> > with PERF_SAMPLE_READ|PERF_SAMPLE_CGROUP and PERF_FORMAT_GROUP in a
> > group with a bunch of events. Then the buffer will fill with the values
> > you use here.
> 
> Right, I'll do an experiment with it.
> 
> >
> > Yes, I suppose it has higher overhead, but you get the data you want
> > without having to do terrible things like this.
> 
> That's true.  And we don't need many things in the perf record like
> synthesizing task/mmap info.  Also there's a risk we can miss some
> samples for some reason.
> 
> Another concern is that it'd add huge slow down in the perf event
> open as it creates a mixed sw/hw group.  The synchronized_rcu in
> the move_cgroup path caused significant problems in my
> environment as it adds up in proportion to the number of cpus.

Since when is perf_event_open() a performance concern? That thing is
slow in all possible ways.


Re: [PATCH 00/13] [RFC] Rust support

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 06:02:33AM +0100, Wedson Almeida Filho wrote:
> On Fri, Apr 16, 2021 at 04:25:34AM +, Al Viro wrote:

> > And as one of those freaks I can tell
> > you where exactly I would like you to go and what I would like you to do
> > with implicit suggestions to start a browser when I need to read some
> > in-tree documentation.
> 
> I could be mistaken but you seem angry. Perhaps it wouldn't be a bad idea to
> read your own code of conduct, I don't think you need a browser for that 
> either.

Welcome to LKML. CoC does not forbid human emotions just yet. Deal with
it.


Re: [PATCH 00/13] [RFC] Rust support

2021-04-16 Thread Peter Zijlstra
On Fri, Apr 16, 2021 at 03:22:16AM +0100, Wedson Almeida Filho wrote:
> On Thu, Apr 15, 2021 at 08:58:16PM +0200, Peter Zijlstra wrote:
> > On Wed, Apr 14, 2021 at 08:45:51PM +0200, oj...@kernel.org wrote:
> > 
> > > Rust is a systems programming language that brings several key
> > > advantages over C in the context of the Linux kernel:
> > > 
> > >   - No undefined behavior in the safe subset (when unsafe code is
> > > sound), including memory safety and the absence of data races.
> > 
> > And yet I see not a single mention of the Rust Memory Model and how it
> > aligns (or not) with the LKMM. The C11 memory model for example is a
> > really poor fit for LKMM.
> 
> We don't intend to directly expose C data structures to Rust code (outside the
> kernel crate). Instead, we intend to provide wrappers that expose safe
> interfaces even though the implementation may use unsafe blocks. So we expect
> the vast majority of Rust code to just care about the Rust memory model.
> 
> We admittedly don't have a huge number of wrappers yet, but we do have enough 
> to
> implement most of Binder and so far it's been ok. We do intend to eventually
> cover other classes of drivers that may unveil unforeseen difficulties, we'll
> see.
> 
> If you have concerns that we might have overlooked, we'd be happy to hear 
> about
> them from you (or anyone else).

Well, the obvious example would be seqlocks. C11 can't do them. The not
sharing of data structures would avoid most of that, but will also cost
you in performance.

Similar thing for RCU; C11 can't optimally do that; it needs to make
rcu_dereference() a load-acquire [something ARM64 has already done in C
because the compiler might be too clever by half when doing LTO :-(].
But it's the compiler needing the acquire semantics, not the computer,
which is just bloody wrong.

And there's more sharp corners to be had. But yes, if you're not
actually sharing anything; and taking the performance hit that comes
with that, you might get away with it.
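
To make the seqlock point concrete, here is a simplified sketch of a
kernel-style seqcount read side (not the real <linux/seqlock.h>
implementation; READ_ONCE(), smp_rmb() and cpu_relax() are assumed from
kernel headers). The plain data reads done between "begin" and "retry"
race with the writer by design; C11 calls that a data race and thus
undefined behaviour, unless every protected field becomes an atomic:

typedef struct {
	unsigned int sequence;
} my_seqcount_t;

static inline unsigned int my_read_seqbegin(const my_seqcount_t *s)
{
	unsigned int seq;

	/* wait for a writer (odd sequence number) to finish */
	while ((seq = READ_ONCE(s->sequence)) & 1)
		cpu_relax();
	smp_rmb();
	return seq;
}

static inline bool my_read_seqretry(const my_seqcount_t *s, unsigned int seq)
{
	smp_rmb();
	return READ_ONCE(s->sequence) != seq;
}

/*
 * Usage:
 *	do {
 *		seq = my_read_seqbegin(&sc);
 *		copy = shared_data;	// plain loads, racy by design
 *	} while (my_read_seqretry(&sc, seq));
 */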

> > HTML is not a valid documentation format. Heck, markdown itself is
> > barely readable.
> 
> Are you stating [what you perceive as] a fact or just venting? If the former,
> would you mind enlightening us with some evidence?

I've yet to see a program that renders HTML (including all the cruft
often used in docs, which might include SVG graphics and whatnot) sanely
in ASCII. Lynx does not qualify, its output is atrocious crap.

Yes, lynx lets you read HTML in ASCII, but at the cost of bleeding
eyeballs and missing content.

Nothing beats a sane ASCII document with, where really needed, some
ASCII art.

Sadly the whole kernel documentation project is moving away from that as
well, which just means I'm back to working on an undocumented codebase.
This rst crap they adopted is unreadable garbage.

> > It is really *really* hard to read. It has all sorts of weird things,
> > like operators at the beginning after a line break:
> > 
> > if (foo
> > || bar)
> > 
> > which is just wrong. And it suffers from CamelCase, which is just about
> > the worst thing ever. Not even the C++ std libs have that (or had, back
> > when I still knew C++).
> > 
> > I also see:
> > 
> > if (foo) {
> > ...
> > }
> > 
> > and
> > 
> > if foo {
> > }
> > 
> > the latter, of course, being complete rubbish.
> 
> There are advantages to adopting the preferred style of a language (when one
> exists). We, of course, are not required to adopt it but I am of the opinion
> that we should have good reasons to diverge if that's our choice in the end.
> 
> "Not having parentheses around the if-clause expression is complete rubbish"
> doesn't sound like a good reason to me.

Of course it does; my internal lexer keeps screaming syntax error at me;
how am I going to understand code when I can't sanely read it?

The more you make it look like (Kernel) C, the easier it is for us C
people to actually read. My eyes have been reading C for almost 30 years
by now, they have a lexer built in the optical nerve; reading something
that looks vaguely like C but is definitely not C is an utterly painful
experience.

You're asking to join us, not the other way around. I'm fine in a world
without Rust.


Re: [PATCH 00/13] [RFC] Rust support

2021-04-15 Thread Peter Zijlstra
On Wed, Apr 14, 2021 at 08:45:51PM +0200, oj...@kernel.org wrote:

> Rust is a systems programming language that brings several key
> advantages over C in the context of the Linux kernel:
> 
>   - No undefined behavior in the safe subset (when unsafe code is
> sound), including memory safety and the absence of data races.

And yet I see not a single mention of the Rust Memory Model and how it
aligns (or not) with the LKMM. The C11 memory model for example is a
really poor fit for LKMM.

> ## Why not?
> 
> Rust also has disadvantages compared to C in the context of
> the Linux kernel:
> 
>   - The many years of effort in tooling for C around the kernel,
> including compiler plugins, sanitizers, Coccinelle, lockdep,
> sparse... However, this will likely improve if Rust usage in
> the kernel grows over time.

This; can we mercilessly break the .rs bits when refactoring? What
happens the moment we cannot boot x86_64 without Rust crap on?

We can ignore this as a future problem, but I think it's only fair to
discuss now. I really don't care for that future, and IMO adding this
Rust or any other second language is a fail.

> Thirdly, in Rust code bases, most documentation is written alongside
> the source code, in Markdown. We follow this convention, thus while
> we have a few general documents in `Documentation/rust/`, most of
> the actual documentation is in the source code itself.
> 
> In order to read this documentation easily, Rust provides a tool
> to generate HTML documentation, just like Sphinx/kernel-doc, but
> suited to Rust code bases and the language concepts.

HTML is not a valid documentation format. Heck, markdown itself is
barely readable.

> Moreover, as explained above, we are taking the chance to enforce
> some documentation guidelines. We are also enforcing automatic code
> formatting, a set of Clippy lints, etc. We decided to go with Rust's
> idiomatic style, i.e. keeping `rustfmt` defaults. For instance, this
> means 4 spaces are used for indentation, rather than a tab. We are
> happy to change that if needed -- we think what is important is
> keeping the formatting automated.

It is really *really* hard to read. It has all sorts of weird things,
like operators at the beginning after a line break:

if (foo
|| bar)

which is just wrong. And it suffers from CamelCase, which is just about
the worst thing ever. Not even the C++ std libs have that (or had, back
when I still knew C++).

I also see:

if (foo) {
...
}

and

if foo {
}

the latter, of course, being complete rubbish.

> Another important topic we would like feedback on is the Rust
> "native" documentation that is written alongside the code, as
> explained above. We have uploaded it here:
> 
> https://rust-for-linux.github.io/docs/kernel/
> 
> We like how this kind of generated documentation looks. Please take
> a look and let us know what you think!

I cannot view with less or vim. Therefore it looks not at all.

>   - Boqun Feng is working hard on the different options for
> threading abstractions and has reviewed most of the `sync` PRs.

Boqun, I know you're familiar with LKMM, can you please talk about how
Rust does things and how it interacts?


[PATCH] sched/debug: Rename the sched_debug parameter to sched_debug_verbose

2021-04-15 Thread Peter Zijlstra
On Mon, Apr 12, 2021 at 12:14:25PM +0200, Peter Zijlstra wrote:

> +	debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled);

How's this on top of the whole series?

---
Subject: sched/debug: Rename the sched_debug parameter to sched_debug_verbose
From: Peter Zijlstra 
Date: Thu Apr 15 18:23:17 CEST 2021

CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param
sched_debug and the /debug/sched/debug_enabled knobs control the
sched_debug_enabled variable, but what they really do is make
SCHED_DEBUG more verbose, so rename the lot.

Signed-off-by: Peter Zijlstra (Intel) 
---
 Documentation/admin-guide/kernel-parameters.txt |3 ++-
 Documentation/scheduler/sched-domains.rst   |   10 +-
 kernel/sched/debug.c|4 ++--
 kernel/sched/topology.c |   10 +-
 4 files changed, 14 insertions(+), 13 deletions(-)

--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4756,7 +4756,8 @@
 
sbni=   [NET] Granch SBNI12 leased line adapter
 
-   sched_debug [KNL] Enables verbose scheduler debug messages.
+   sched_debug_verbose
+   [KNL] Enables verbose scheduler debug messages.
 
schedstats= [KNL,X86] Enable or disable scheduled statistics.
Allowed values are enable and disable. This feature
--- a/Documentation/scheduler/sched-domains.rst
+++ b/Documentation/scheduler/sched-domains.rst
@@ -74,8 +74,8 @@ for a given topology level by creating a
 calling set_sched_topology() with this array as the parameter.
 
 The sched-domains debugging infrastructure can be enabled by enabling
-CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
-tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
-knob. This enables an error checking parse of the sched domains which should
-catch most possible errors (described above). It also prints out the domain
-structure in a visual format.
+CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
+forgot to tweak your cmdline, you can also flip the
+/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
+the sched domains which should catch most possible errors (described above). It
+also prints out the domain structure in a visual format.
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -275,7 +275,7 @@ static const struct file_operations sche
 
 #endif /* CONFIG_PREEMPT_DYNAMIC */
 
-__read_mostly bool sched_debug_enabled;
+__read_mostly bool sched_debug_verbose;
 
 static const struct seq_operations sched_debug_sops;
 
@@ -300,7 +300,7 @@ static __init int sched_init_debug(void)
debugfs_sched = debugfs_create_dir("sched", NULL);
 
debugfs_create_file("features", 0644, debugfs_sched, NULL, 
_feat_fops);
-   debugfs_create_bool("debug_enabled", 0644, debugfs_sched, 
_debug_enabled);
+   debugfs_create_bool("verbose", 0644, debugfs_sched, 
_debug_verbose);
 #ifdef CONFIG_PREEMPT_DYNAMIC
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, 
_dynamic_fops);
 #endif
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -14,7 +14,7 @@ static cpumask_var_t sched_domains_tmpma
 
 static int __init sched_debug_setup(char *str)
 {
-   sched_debug_enabled = true;
+   sched_debug_verbose = true;
 
return 0;
 }
@@ -22,7 +22,7 @@ early_param("sched_debug", sched_debug_s
 
 static inline bool sched_debug(void)
 {
-   return sched_debug_enabled;
+   return sched_debug_verbose;
 }
 
 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = 
#_name },
@@ -131,7 +131,7 @@ static void sched_domain_debug(struct sc
 {
int level = 0;
 
-   if (!sched_debug_enabled)
+   if (!sched_debug_verbose)
return;
 
if (!sd) {
@@ -152,7 +152,7 @@ static void sched_domain_debug(struct sc
 }
 #else /* !CONFIG_SCHED_DEBUG */
 
-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
 # define sched_domain_debug(sd, cpu) do { } while (0)
 static inline bool sched_debug(void)
 {
@@ -2141,7 +2141,7 @@ build_sched_domains(const struct cpumask
if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
-   if (rq && sched_debug_enabled) {
+   if (rq && sched_debug_verbose) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}


Re: [PATCH] locking/qrwlock: Fix ordering in queued_write_lock_slowpath

2021-04-15 Thread Peter Zijlstra
On Thu, Apr 15, 2021 at 04:28:21PM +0100, Will Deacon wrote:
> On Thu, Apr 15, 2021 at 05:03:58PM +0200, Peter Zijlstra wrote:
> > @@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
> >  
> > /* When no more readers or writers, set the locked flag */
> > do {
> > -   atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
> > -   } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
> > -   _QW_LOCKED) != _QW_WAITING);
> > +   cnt = atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
> 
> I think the issue is that >here< a concurrent reader in interrupt context
> can take the lock and release it again, but we could speculate reads from
> the critical section up over the later release and up before the control
> dependency here...

OK, so that was totally not clear from the original changelog.

> > +   } while (!atomic_try_cmpxchg_relaxed(&lock->cnts, &cnt, _QW_LOCKED));
> 
> ... and then this cmpxchg() will succeed, so our speculated stale reads
> could be used.
> 
> *HOWEVER*
> 
> Speculating a read should be fine in the face of a concurrent _reader_,
> so for this to be an issue it implies that the reader is also doing some
> (atomic?) updates.

So we're having something like:

CPU0CPU1

queue_write_lock_slowpath()
  atomic_cond_read_acquire() == _QW_WAITING

queued_read_lock_slowpath()
  atomic_cond_read_acquire()
  return;
   ,--> (before X's store)
   |X = y;
   |
   |queued_read_unlock()
   |  (void)atomic_sub_return_release()
   |
   |  atomic_cmpxchg_relaxed(.old = _QW_WAITING)
   |
`-- r = X;


Which, as Will said, is a cmpxchg ABA; however, for it to be ABA we have
to observe that unlock-store, and won't we then also have to observe the
whole critical section?

Ah, the issue is yet another load inside our own (CPU0's) critical
section, which isn't ordered against the cmpxchg_relaxed() and can be
issued before.

So yes, then making the cmpxchg an acquire, will force all our own loads
to happen later.


Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback

2021-04-15 Thread Peter Zijlstra
On Thu, Apr 15, 2021 at 03:32:11PM +0100, Valentin Schneider wrote:
> > So instead, make sure balance_push is enabled between
> > sched_cpu_deactivate() and sched_cpu_activate() (eg. when
> > !cpu_active()), and gate it's utility with cpu_dying().
> 
> I'd word that "is enabled below sched_cpu_activate()", since
> sched_cpu_deactivate() is now out of the picture.

I are confused (again!). Did you mean to say something like: 'is enabled
below SCHED_AP_ACTIVE' ?


Re: [PATCH] locking/qrwlock: Fix ordering in queued_write_lock_slowpath

2021-04-15 Thread Peter Zijlstra
On Thu, Apr 15, 2021 at 02:25:52PM +, Ali Saidi wrote:
> While this code is executed with the wait_lock held, a reader can
> acquire the lock without holding wait_lock.  The writer side loops
> checking the value with the atomic_cond_read_acquire(), but only truly
> acquires the lock when the compare-and-exchange is completed
> successfully which isn’t ordered. The other atomic operations from this
> point are release-ordered and thus reads after the lock acquisition can
> be completed before the lock is truly acquired which violates the
> guarantees the lock should be making.

Should be per who? We explicitly do not order the lock acquire store.
qspinlock has the exact same issue.

If you look in the git history surrounding spin_is_locked(), you'll find
'lovely' things.

Basically, if you're doing crazy things with spin_is_locked() you get to
keep the pieces.

> Fixes: b519b56e378ee ("locking/qrwlock: Use atomic_cond_read_acquire() when 
> spinning in qrwloc")
> Signed-off-by: Ali Saidi 
> Cc: sta...@vger.kernel.org
> ---
>  kernel/locking/qrwlock.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
> index 4786dd271b45..10770f6ac4d9 100644
> --- a/kernel/locking/qrwlock.c
> +++ b/kernel/locking/qrwlock.c
> @@ -73,8 +73,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
>  
>   /* When no more readers or writers, set the locked flag */
>   do {
> - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
> - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
> + atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING);
> + } while (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
>   _QW_LOCKED) != _QW_WAITING);
>  unlock:
>   arch_spin_unlock(&lock->wait_lock);

This doesn't make sense; there is no such thing as a store-acquire. What
you're doing here is moving the acquire from one load to the next, a
load we know will load the exact same value.

Also see Documentation/atomic_t.txt:

  {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE


If anything this code wants to be written like so.

---

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 4786dd271b45..22aeccc363ca 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
  */
 void queued_write_lock_slowpath(struct qrwlock *lock)
 {
+   u32 cnt;
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
 
@@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
 
/* When no more readers or writers, set the locked flag */
do {
-   atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
-   } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
-   _QW_LOCKED) != _QW_WAITING);
+   cnt = atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
+   } while (!atomic_try_cmpxchg_relaxed(&lock->cnts, &cnt, _QW_LOCKED));
 unlock:
arch_spin_unlock(&lock->wait_lock);
 }

