Re: [RFC PATCH 6/7] execmem: add support for cache of large ROX pages
On Mon, Apr 15, 2024 at 08:00:26PM +0300, Mike Rapoport wrote: > On Mon, Apr 15, 2024 at 12:47:50PM +0200, Peter Zijlstra wrote: > > On Thu, Apr 11, 2024 at 07:05:25PM +0300, Mike Rapoport wrote: > > > > > To populate the cache, a writable large page is allocated from vmalloc > > > with > > > VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as > > > ROX. > > > > > +static void execmem_invalidate(void *ptr, size_t size, bool writable) > > > +{ > > > + if (execmem_info->invalidate) > > > + execmem_info->invalidate(ptr, size, writable); > > > + else > > > + memset(ptr, 0, size); > > > +} > > > > +static void execmem_invalidate(void *ptr, size_t size, bool writeable) > > +{ > > + /* fill memory with INT3 instructions */ > > + if (writeable) > > + memset(ptr, 0xcc, size); > > + else > > + text_poke_set(ptr, 0xcc, size); > > +} > > > > Thing is, 0xcc (aka INT3_INSN_OPCODE) is not an invalid instruction. > > It raises #BP not #UD. > > Do you mean that _invalidate is a poor name choice or that it's necessary > to use an instruction that raises #UD? Poor naming, mostly. #BP handler will still scream bloody murder if the site is otherwise unclaimed. It just isn't an invalid instruction.
Re: [RFC PATCH 6/7] execmem: add support for cache of large ROX pages
On Thu, Apr 11, 2024 at 07:05:25PM +0300, Mike Rapoport wrote: > To populate the cache, a writable large page is allocated from vmalloc with > VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as > ROX. > +static void execmem_invalidate(void *ptr, size_t size, bool writable) > +{ > + if (execmem_info->invalidate) > + execmem_info->invalidate(ptr, size, writable); > + else > + memset(ptr, 0, size); > +} +static void execmem_invalidate(void *ptr, size_t size, bool writeable) +{ + /* fill memory with INT3 instructions */ + if (writeable) + memset(ptr, 0xcc, size); + else + text_poke_set(ptr, 0xcc, size); +} Thing is, 0xcc (aka INT3_INSN_OPCODE) is not an invalid instruction. It raises #BP not #UD.
Re: [RFC PATCH 5/7] x86/module: prepare module loading for ROX allocations of text
On Thu, Apr 11, 2024 at 07:05:24PM +0300, Mike Rapoport wrote: > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c > index 45a280f2161c..b4d6868df573 100644 > --- a/arch/x86/kernel/alternative.c > +++ b/arch/x86/kernel/alternative.c > @@ -504,17 +513,17 @@ void __init_or_module noinline > apply_alternatives(struct alt_instr *start, >* patch if feature is *NOT* present. >*/ > if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { > - optimize_nops_inplace(instr, a->instrlen); > + optimize_nops_inplace(wr_instr, a->instrlen); > continue; > } > > - DPRINTK(ALT, "feat: %d*32+%d, old: (%pS (%px) len: %d), repl: > (%px, len: %d) flags: 0x%x", > + DPRINTK(ALT, "feat: %d*32+%d, old: (%px (%px) len: %d), repl: > (%px (%px), len: %d) flags: 0x%x", > a->cpuid >> 5, > a->cpuid & 0x1f, > - instr, instr, a->instrlen, > - replacement, a->replacementlen, a->flags); > + instr, wr_instr, a->instrlen, > + replacement, wr_replacement, a->replacementlen, > a->flags); I think this, and > > - memcpy(insn_buff, replacement, a->replacementlen); > + memcpy(insn_buff, wr_replacement, a->replacementlen); > insn_buff_sz = a->replacementlen; > > if (a->flags & ALT_FLAG_DIRECT_CALL) { > @@ -528,11 +537,11 @@ void __init_or_module noinline > apply_alternatives(struct alt_instr *start, > > apply_relocation(insn_buff, a->instrlen, instr, replacement, > a->replacementlen); > > - DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); > + DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", > instr); this, want to remain as is. 
> DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: > rpl_insn: ", replacement); > DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", > instr); > > - text_poke_early(instr, insn_buff, insn_buff_sz); > + text_poke_early(wr_instr, insn_buff, insn_buff_sz); > } > > kasan_enable_current(); The rationale being that we then print an address that can be correlated to the kernel image (provided one either kills kaslr or adjusts for it).
Re: [PATCH v4 07/15] mm/execmem, arch: convert remaining overrides of module_alloc to execmem
On Thu, Apr 11, 2024 at 07:00:43PM +0300, Mike Rapoport wrote: > +static struct execmem_info execmem_info __ro_after_init = { > + .ranges = { > + [EXECMEM_DEFAULT] = { > + .flags = EXECMEM_KASAN_SHADOW, > + .alignment = MODULE_ALIGN, > + }, > + }, > +}; > > +struct execmem_info __init *execmem_arch_setup(void) > { > + unsigned long start, offset = 0; > > + if (kaslr_enabled()) > + offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; > > + start = MODULES_VADDR + offset; > + execmem_info.ranges[EXECMEM_DEFAULT].start = start; > + execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END; > + execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL; > > + return _info; > } struct execmem_info __init *execmem_arch_setup(void) { unsigned long offset = 0; if (kaslr_enabled()) offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_DEFAULT] = { .start = MODULES_VADDR + offset, .end = MODULES_END, .pgprot= PAGE_KERNEL, .flags = EXECMEM_KASAN_SHADOW, .alignment = 1, }, }, }; return _info; }
Re: [PATCH v4 06/15] mm/execmem, arch: convert simple overrides of module_alloc to execmem
On Thu, Apr 11, 2024 at 07:00:42PM +0300, Mike Rapoport wrote: > +static struct execmem_info execmem_info __ro_after_init = { > + .ranges = { > + [EXECMEM_DEFAULT] = { > + .start = MODULES_VADDR, > + .end = MODULES_END, > + .alignment = 1, > + }, > + }, > +}; > + > +struct execmem_info __init *execmem_arch_setup(void) > { > + execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL; > + > + return _info; > } > +static struct execmem_info execmem_info __ro_after_init = { > + .ranges = { > + [EXECMEM_DEFAULT] = { > + .start = MODULES_VADDR, > + .end = MODULES_END, > + .pgprot = PAGE_KERNEL_EXEC, > + .alignment = 1, > + }, > + }, > +}; > + > +struct execmem_info __init *execmem_arch_setup(void) > { > + return _info; > } > +static struct execmem_info execmem_info __ro_after_init = { > + .ranges = { > + [EXECMEM_DEFAULT] = { > + .pgprot = PAGE_KERNEL_RWX, > + .alignment = 1, > + }, > + }, > +}; > + > +struct execmem_info __init *execmem_arch_setup(void) > { > + execmem_info.ranges[EXECMEM_DEFAULT].start = VMALLOC_START; > + execmem_info.ranges[EXECMEM_DEFAULT].end = VMALLOC_END; > + > + return _info; > } > +static struct execmem_info execmem_info __ro_after_init = { > + .ranges = { > + [EXECMEM_DEFAULT] = { > + .pgprot = PAGE_KERNEL, > + .alignment = 1, > + }, > + }, > +}; > + > +struct execmem_info __init *execmem_arch_setup(void) > { > + execmem_info.ranges[EXECMEM_DEFAULT].start = MODULES_VADDR; > + execmem_info.ranges[EXECMEM_DEFAULT].end = MODULES_END; > + > + return _info; > } > +static struct execmem_info execmem_info __ro_after_init = { > + .ranges = { > + [EXECMEM_DEFAULT] = { > #ifdef CONFIG_SPARC64 > + .start = MODULES_VADDR, > + .end = MODULES_END, > #else > + .start = VMALLOC_START, > + .end = VMALLOC_END, > +#endif > + .alignment = 1, > + }, > + }, > +}; > + > +struct execmem_info __init *execmem_arch_setup(void) > { > + execmem_info.ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL; > > + return _info; > } I'm amazed by the weird and inconsistent breakup of 
initializations. What exactly is wrong with something like: static struct execmem_info execmem_info __ro_after_init; struct execmem_info __init *execmem_arch_setup(void) { execmem_info = (struct execmem_info){ .ranges = { [EXECMEM_DEFAULT] = { .start = MODULES_VADDR, .end= MODULES_END, .pgprot = PAGE_KERNEL, .alignment = 1, }, }, }; return _info; }
Re: [PATCH v4 05/15] mm: introduce execmem_alloc() and execmem_free()
On Thu, Apr 11, 2024 at 07:00:41PM +0300, Mike Rapoport wrote: > +/** > + * enum execmem_type - types of executable memory ranges > + * > + * There are several subsystems that allocate executable memory. > + * Architectures define different restrictions on placement, > + * permissions, alignment and other parameters for memory that can be used > + * by these subsystems. > + * Types in this enum identify subsystems that allocate executable memory > + * and let architectures define parameters for ranges suitable for > + * allocations by each subsystem. > + * > + * @EXECMEM_DEFAULT: default parameters that would be used for types that > + * are not explcitly defined. > + * @EXECMEM_MODULE_TEXT: parameters for module text sections > + * @EXECMEM_KPROBES: parameters for kprobes > + * @EXECMEM_FTRACE: parameters for ftrace > + * @EXECMEM_BPF: parameters for BPF > + * @EXECMEM_TYPE_MAX: > + */ > +enum execmem_type { > + EXECMEM_DEFAULT, > + EXECMEM_MODULE_TEXT = EXECMEM_DEFAULT, > + EXECMEM_KPROBES, > + EXECMEM_FTRACE, > + EXECMEM_BPF, > + EXECMEM_TYPE_MAX, > +}; Can we please get a break-down of how all these types are actually different from one another? I'm thinking some platforms have a tiny immediate space (arm64 comes to mind) and has less strict placement constraints for some of them?
Re: [RFC PATCH 0/4] perf: Correlating user process data to samples
On Fri, Apr 12, 2024 at 09:37:24AM -0700, Beau Belgrave wrote: > > Anyway, since we typically run stuff from NMI context, accessing user > > data is 'interesting'. As such I would really like to make this work > > depend on the call-graph rework that pushes all the user access bits > > into return-to-user. > > Cool, I assume that's the SFRAME work? Are there pointers to work I > could look at and think about what a rebase looks like? Or do you have > someone in mind I should work with for this? I've been offline for a little while and still need to catch up with things myself. Josh was working on that when I dropped off IIRC, I'm not entirely sure where things are at currently (and there is no way I can ever hope to process the backlog). Anybody know where we are with that?
Re: [RFC PATCH 0/4] perf: Correlating user process data to samples
On Fri, Apr 12, 2024 at 12:17:28AM +, Beau Belgrave wrote: > An idea flow would look like this: > User Task Profile > do_work();sample() -> IP + No activity > ... > set_activity(123); > ... > do_work();sample() -> IP + activity (123) > ... > set_activity(124); > ... > do_work();sample() -> IP + activity (124) This, start with this, because until I saw this, I was utterly confused as to what the heck you were on about. I started by thinking we already have TID in samples so you can already associate back to user processes and got increasingly confused the further I went. What you seem to want to do however is have some task-state included so you can see what the thread is doing. Anyway, since we typically run stuff from NMI context, accessing user data is 'interesting'. As such I would really like to make this work depend on the call-graph rework that pushes all the user access bits into return-to-user.
Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
On Tue, Nov 21, 2023 at 02:17:21PM +0100, Tobias Huschle wrote: > We applied both suggested patch options and ran the test again, so > > sched/eevdf: Fix vruntime adjustment on reweight > sched/fair: Update min_vruntime for reweight_entity() correctly > > and > > sched/eevdf: Delay dequeue > > Unfortunately, both variants do NOT fix the problem. > The regression remains unchanged. Thanks for testing. > I will continue getting myself familiar with how cgroups are scheduled to dig > deeper here. If there are any other ideas, I'd be happy to use them as a > starting point for further analysis. > > Would additional traces still be of interest? If so, I would be glad to > provide them. So, since it got bisected to the placement logic, but is a cgroup related issue, I was thinking that 'Delay dequeue' might not cut it, that only works for tasks, not the internal entities. The below should also work for internal entities, but last time I poked around with it I had some regressions elsewhere -- you know, fix one, wreck another type of situations on hand. But still, could you please give it a go -- it applies cleanly to linus' master and -rc2. --- Subject: sched/eevdf: Revenge of the Sith^WSleepers For tasks that have received excess service (negative lag) allow them to gain parity (zero lag) by sleeping. 
Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 36 kernel/sched/features.h | 6 ++ 2 files changed, 42 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d7a3c63a2171..b975e4b07a68 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5110,6 +5110,33 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ +static inline u64 +entity_vlag_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + u64 now, vdelta; + s64 delta; + + if (!(flags & ENQUEUE_WAKEUP)) + return se->vlag; + + if (flags & ENQUEUE_MIGRATED) + return 0; + + now = rq_clock_task(rq_of(cfs_rq)); + delta = now - se->exec_start; + if (delta < 0) + return se->vlag; + + if (sched_feat(GENTLE_SLEEPER)) + delta /= 2; + + vdelta = __calc_delta(delta, NICE_0_LOAD, _rq->load); + if (vdelta < -se->vlag) + return se->vlag + vdelta; + + return 0; +} + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -5133,6 +5160,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) lag = se->vlag; + /* +* Allow tasks that have received too much service (negative +* lag) to (re)gain parity (zero lag) by sleeping for the +* equivalent duration. This ensures they will be readily +* eligible. +*/ + if (sched_feat(PLACE_SLEEPER) && lag < 0) + lag = entity_vlag_sleeper(cfs_rq, se, flags); + /* * If we want to place a task and preserve lag, we have to * consider the effect of the new entity on the weighted diff --git a/kernel/sched/features.h b/kernel/sched/features.h index a3ddf84de430..722282d3ed07 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -7,6 +7,12 @@ SCHED_FEAT(PLACE_LAG, true) SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) SCHED_FEAT(RUN_TO_PARITY, true) +/* + * Let sleepers earn back lag, but not more than 0-lag. GENTLE_SLEEPERS earn at + * half the speed. 
+ */ +SCHED_FEAT(PLACE_SLEEPER, true) +SCHED_FEAT(GENTLE_SLEEPER, true) /* * Prefer to schedule the task we woke last (assuming it failed
Re: Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
On Sat, Nov 18, 2023 at 01:14:32PM +0800, Abel Wu wrote: > Hi Peter, I'm a little confused here. As we adopt placement strategy #1 > when PLACE_LAG is enabled, the lag of that entity needs to be preserved. > Given that the weight doesn't change, we have: > > vl' = vl > > But in fact it is scaled on placement: > > vl' = vl * W/(W + w) (W+w)/W > > Does this intended? The scaling, yes that's intended and the comment explains why. So now you have me confused too :-) Specifically, I want the lag after placement to be equal to the lag we come in with. Since placement will affect avg_vruntime (adding one element to the average changes the average etc..) the placement also affects the lag as measured after placement. Or rather, if you enqueue and dequeue, I want the lag to be preserved. If you do not take placement into consideration, lag will dissipate real quick. > And to illustrate my understanding of strategy #1: > @@ -5162,41 +5165,17 @@ place_entity(struct cfs_rq *cfs_rq, struct > sched_entity *se, int flags) >* vl_i is given by: >* >* V' = (\Sum w_j*v_j + w_i*v_i) / (W + w_i) > - * = (W*V + w_i*(V - vl_i)) / (W + w_i) > - * = (W*V + w_i*V - w_i*vl_i) / (W + w_i) > - * = (V*(W + w_i) - w_i*l) / (W + w_i) > - * = V - w_i*vl_i / (W + w_i) > - * > - * And the actual lag after adding an entity with vl_i is: > - * > - * vl'_i = V' - v_i > - * = V - w_i*vl_i / (W + w_i) - (V - vl_i) > - * = vl_i - w_i*vl_i / (W + w_i) > - * > - * Which is strictly less than vl_i. So in order to preserve lag > - * we should inflate the lag before placement such that the > - * effective lag after placement comes out right. > - * > - * As such, invert the above relation for vl'_i to get the vl_i > - * we need to use such that the lag after placement is the lag > - * we computed before dequeue. 
> + * = (W*V + w_i*(V' - vl_i)) / (W + w_i) > + * = V - w_i*vl_i / W >* > - * vl'_i = vl_i - w_i*vl_i / (W + w_i) > - * = ((W + w_i)*vl_i - w_i*vl_i) / (W + w_i) > - * > - * (W + w_i)*vl'_i = (W + w_i)*vl_i - w_i*vl_i > - * = W*vl_i > - * > - * vl_i = (W + w_i)*vl'_i / W >*/ > load = cfs_rq->avg_load; > if (curr && curr->on_rq) > load += scale_load_down(curr->load.weight); > - > - lag *= load + scale_load_down(se->load.weight); > if (WARN_ON_ONCE(!load)) > load = 1; > - lag = div_s64(lag, load); > + > + vruntime -= div_s64(lag * scale_load_down(se->load.weight), > load); > } > se->vruntime = vruntime - lag; So you're proposing we do: v = V - (lag * w) / (W + w) - lag ? That can be written like: v = V - (lag * w) / (W+w) - (lag * (W+w)) / (W+w) = V - (lag * (W+w) + lag * w) / (W+w) = V - (lag * (W+2w)) / (W+w) And that turns into a mess AFAICT. Let me repeat my earlier argument. Suppose v,w,l are the new element. V,W are the old avg_vruntime and sum-weight. Then: V = V*W / W, and by extention: V' = (V*W + v*w) / (W + w). The new lag, after placement: l' = V' - v = (V*W + v*w) / (W+w) - v = (V*W + v*w) / (W+w) - v * (W+w) / (W+v) = (V*W + v*w -v*W - v*w) / (W+w) = (V*W - v*W) / (W+w) = W*(V-v) / (W+w) = W/(W+w) * (V-v) Substitute: v = V - (W+w)/W * l, my scaling thing, to obtain: l' = W/(W+w) * (V - (V - (W+w)/W * l)) = W/(W+w) * (V - V + (W+w)/W * l) = W/(W+w) * (W+w)/W * l = l So by scaling, we've preserved lag across placement. That make sense?
Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
On Fri, Nov 17, 2023 at 01:24:21PM +0100, Tobias Huschle wrote: > On Fri, Nov 17, 2023 at 10:23:18AM +0100, Peter Zijlstra wrote: > > kworkers are typically not in cgroups and are part of the root cgroup, > > but what's a vhost and where does it live? > > The qemu instances of the two KVM guests are placed into cgroups. > The vhosts run within the context of these qemu instances (4 threads per > guest). > So they are also put into those cgroups. > > I'll answer the other questions you brought up as well, but I guess that one > is most critical: > > > > > After confirming both tasks are indeed in the same cgroup ofcourse, > > because if they're not, vruntime will be meaningless to compare and we > > should look elsewhere. > > In that case we probably have to go with elsewhere ... which is good to know. Ah, so if this is a cgroup issue, it might be worth trying this patch that we have in tip/sched/urgent. I'll try and read the rest of the email a little later, gotta run errands first. --- commit eab03c23c2a162085b13200d7942fc5a00b5ccc8 Author: Abel Wu Date: Tue Nov 7 17:05:07 2023 +0800 sched/eevdf: Fix vruntime adjustment on reweight vruntime of the (on_rq && !0-lag) entity needs to be adjusted when it gets re-weighted, and the calculations can be simplified based on the fact that re-weight won't change the w-average of all the entities. Please check the proofs in comments. But adjusting vruntime can also cause position change in RB-tree hence require re-queue to fix up which might be costly. This might be avoided by deferring adjustment to the time the entity actually leaves tree (dequeue/pick), but that will negatively affect task selection and probably not good enough either. 
Fixes: 147f3efaa241 ("sched/fair: Implement an EEVDF-like scheduling policy") Signed-off-by: Abel Wu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20231107090510.71322-2-wuyun.a...@bytedance.com diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2048138ce54b..025d90925bf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3666,41 +3666,140 @@ static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif +static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + unsigned long old_weight = se->load.weight; + u64 avruntime = avg_vruntime(cfs_rq); + s64 vlag, vslice; + + /* +* VRUNTIME +* +* +* COROLLARY #1: The virtual runtime of the entity needs to be +* adjusted if re-weight at !0-lag point. +* +* Proof: For contradiction assume this is not true, so we can +* re-weight without changing vruntime at !0-lag point. +* +* Weight VRuntime Avg-VRuntime +* beforew vV +* afterw' v' V' +* +* Since lag needs to be preserved through re-weight: +* +* lag = (V - v)*w = (V'- v')*w', where v = v' +* ==> V' = (V - v)*w/w' + v (1) +* +* Let W be the total weight of the entities before reweight, +* since V' is the new weighted average of entities: +* +* V' = (WV + w'v - wv) / (W + w' - w) (2) +* +* by using (1) & (2) we obtain: +* +* (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v +* ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v +* ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v +* ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) +* +* Since we are doing at !0-lag point which means V != v, we +* can simplify (3): +* +* ==> W / (W + w' - w) = w / w' +* ==> Ww' = Ww + ww' - ww +* ==> W * (w' - w) = w * (w' - w) +* ==> W = w (re-weight indicates w' != w) +* +* So the cfs_rq contains only one entity, hence vruntime of +* the entity @v should always equal to the cfs_rq's weighted +* average vruntime @V, which means we will always re-weight +* at 0-lag point, 
thus breach assumption. Proof completed. +* +* +* COROLLARY #2: Re-weight does NOT affect weighted average +* vruntime of all the entities. +* +* Proof: According to corollary #1, Eq. (1) should be: +* +* (V - v)*w = (V' - v')*w' +* ==>v' = V' - (V - v)*w/w' (4) +* +* According to the weighted average formula, we have: +* +* V' = (WV - wv + w'v') / (W - w + w') +
Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
On Fri, Nov 17, 2023 at 10:23:18AM +0100, Peter Zijlstra wrote: > Now, IF this is the problem, I might have a patch that helps: > > > https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf=119feac4fcc77001cd9bf199b25f08d232289a5c And then I turn around and wipe the repository invalidating that link. The sched/eevdf branch should be re-instated (with different SHA1), but I'll include the patch below for reference. --- Subject: sched/eevdf: Delay dequeue From: Peter Zijlstra Date: Fri Sep 15 00:48:45 CEST 2023 For tasks that have negative-lag (have received 'excess' service), delay the dequeue and keep them in the runnable tree until they're eligible again. Or rather, keep them until they're selected again, since finding their eligibility crossover point is expensive. The effect is a bit like sleeper bonus, the tasks keep contending for service until either they get a wakeup or until they're selected again and are really dequeued. This means that any actual dequeue happens with positive lag (serviced owed) and are more readily ran when woken next. 
Signed-off-by: Peter Zijlstra (Intel) --- include/linux/sched.h |1 kernel/sched/core.c | 88 +++- kernel/sched/fair.c | 11 ++ kernel/sched/features.h | 11 ++ kernel/sched/sched.h|3 + 5 files changed, 97 insertions(+), 17 deletions(-) --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -916,6 +916,7 @@ struct task_struct { unsignedsched_reset_on_fork:1; unsignedsched_contributes_to_load:1; unsignedsched_migrated:1; + unsignedsched_delayed:1; /* Force alignment to the next boundary: */ unsigned:0; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3856,12 +3856,23 @@ static int ttwu_runnable(struct task_str rq = __task_rq_lock(p, ); if (task_on_rq_queued(p)) { + update_rq_clock(rq); + if (unlikely(p->sched_delayed)) { + p->sched_delayed = 0; + /* mustn't run a delayed task */ + WARN_ON_ONCE(task_on_cpu(rq, p)); + if (sched_feat(GENTLE_DELAY)) { + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + if (p->se.vlag > 0) + p->se.vlag = 0; + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); + } + } if (!task_on_cpu(rq, p)) { /* * When on_rq && !on_cpu the task is preempted, see if * it should preempt the task that is current now. */ - update_rq_clock(rq); wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); @@ -6565,6 +6576,24 @@ pick_next_task(struct rq *rq, struct tas # define SM_MASK_PREEMPT SM_PREEMPT #endif +static void deschedule_task(struct rq *rq, struct task_struct *p, unsigned long prev_state) +{ + p->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev_state & TASK_FROZEN); + + if (p->sched_contributes_to_load) + rq->nr_uninterruptible++; + + deactivate_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); + + if (p->in_iowait) { + atomic_inc(>nr_iowait); + delayacct_blkio_start(); + } +} + /* * __schedule() is the main scheduler function. 
* @@ -6650,6 +6679,8 @@ static void __sched notrace __schedule(u switch_count = >nivcsw; + WARN_ON_ONCE(prev->sched_delayed); + /* * We must load prev->state once (task_struct::state is volatile), such * that we form a control dependency vs deactivate_task() below. @@ -6659,14 +6690,6 @@ static void __sched notrace __schedule(u if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNING); } else { - prev->sched_contributes_to_load = - (prev_state & TASK_UNINTERRUPTIBLE) && - !(prev_state & TASK_NOLOAD) && - !(prev_state & TASK_FROZEN); - - if (prev->sched_contributes_to_load) - rq->nr_uninterruptible++; - /* * __schedule() ttwu()
Re: EEVDF/vhost regression (bisected to 86bfbb7ce4f6 sched/fair: Add lag based placement)
Your email is pretty badly mangled by wrapping, please try and reconfigure your MUA, esp. the trace and debug output is unreadable. On Thu, Nov 16, 2023 at 07:58:18PM +0100, Tobias Huschle wrote: > The base scenario are two KVM guests running on an s390 LPAR. One guest > hosts the uperf server, one the uperf client. > With EEVDF we observe a regression of ~50% for a strburst test. > For a more detailed description of the setup see the section TEST SUMMARY at > the bottom. Well, that's not good :/ > Short summary: > The mentioned kworker has been scheduled to CPU 14 before the tracing was > enabled. > A vhost process is migrated onto CPU 14. > The vruntimes of kworker and vhost differ significantly (86642125805 vs > 4242563284 -> factor 20) So bear with me, I know absolutely nothing about virt stuff. I suspect there's cgroups involved because shiny or something. kworkers are typically not in cgroups and are part of the root cgroup, but what's a vhost and where does it live? Also, what are their weights / nice values? > The vhost process wants to wake up the kworker, therefore the kworker is > placed onto the runqueue again and set to runnable. > The vhost process continues to execute, waking up other vhost processes on > other CPUs. > > So far this behavior is not different to what we see on pre-EEVDF kernels. > > On timestamp 576.162767, the vhost process triggers the last wake up of > another vhost on another CPU. > Until timestamp 576.171155, we see no other activity. Now, the vhost process > ends its time slice. > Then, vhost gets re-assigned new time slices 4 times and gets then migrated > off to CPU 15. So why does this vhost stay on the CPU if it doesn't have anything to do? (I've not tried to make sense of the trace, that's just too painful). > This does not occur with older kernels. > The kworker has to wait for the migration to happen in order to be able to > execute again. 
> This is due to the fact, that the vruntime of the kworker is significantly > larger than the one of vhost. That's, weird. Can you add a trace_printk() to update_entity_lag() and have it print out the lag, limit and vlag (post clamping) values? And also in place_entity() for the reverse process, lag pre and post scaling or something. After confirming both tasks are indeed in the same cgroup ofcourse, because if they're not, vruntime will be meaningless to compare and we should look elsewhere. Also, what HZ and what preemption mode are you running? If kworker is somehow vastly over-shooting it's slice -- keeps running way past the avg_vruntime, then it will build up a giant lag and you get what you describe, next time it wakes up it gets placed far to the right (exactly where it was when it 'finally' went to sleep, relatively speaking). > We found some options which sound plausible but we are not sure if they are > valid or not: > > 1. The wake up path has a dependency on the vruntime metrics that now delays > the execution of the kworker. > 2. The previous commit af4cf40470c2 (sched/fair: Add cfs_rq::avg_vruntime) > which updates the way cfs_rq->min_vruntime and > cfs_rq->avg_runtime are set might have introduced an issue which is > uncovered with the commit mentioned above. Suppose you have a few tasks (of equal weight) on you virtual timeline like so: -+---+---+---+---+-- ^ ^ | `avg_vruntime `-min_vruntime Then the above would be more or less the relative placements of these values. avg_vruntime is the weighted average of the various vruntimes and is therefore always in the 'middle' of the tasks, and not somewhere out-there. min_vruntime is a monotonically increasing 'minimum' that's left-ish on the tree (there's a few cases where a new task can be placed left of min_vruntime and its no longer actuall the minimum, but whatever). These values should be relatively close to one another, depending ofcourse on the spread of the tasks. 
So I don't think this is causing trouble. Anyway, the big difference with lag based placement is that where previously tasks (that do not migrate) retain their old vruntime and on placing they get pulled forward to at least min_vruntime, so a task that wildly overshoots, but then doesn't run for significant time can still be overtaken and then when placed again be 'okay'. Now OTOH, with lag-based placement, we strictly preserve their relative offset vs avg_vruntime. So if they were *far* too the right when they go to sleep, they will again be there on placement. Sleeping doesn't help them anymore. Now, IF this is the problem, I might have a patch that helps: https://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git/commit/?h=sched/eevdf=119feac4fcc77001cd9bf199b25f08d232289a5c That branch is based on v6.7-rc1 and then some, but I think it's relatively easy to rebase the lot on v6.6 (which I'm assuming you're on). I'm a little conflicted on the patch, conceptually I like what it does, but the
Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS
On Mon, Nov 06, 2023 at 09:47:08PM +0900, Masami Hiramatsu wrote: > On Mon, 6 Nov 2023 11:19:32 +0100 > Peter Zijlstra wrote: > > > On Mon, Nov 06, 2023 at 09:38:50AM +0900, Masami Hiramatsu wrote: > > > On Sun, 5 Nov 2023 18:34:09 -0500 > > > Steven Rostedt wrote: > > > > > > > On Sun, 5 Nov 2023 18:33:01 -0500 > > > > Steven Rostedt wrote: > > > > > > > > > For x86_64, that would be: > > > > > > > > > > rdi, rsi, rdx, r8, r9, rsp > > > > > > > > I missed rcx. > > > > > > I would like to add rax to the list so that it can handle the return > > > value too. :) > > > > So something like so? > > > > > > diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h > > index 897cf02c20b1..71bfe27594a5 100644 > > --- a/arch/x86/include/asm/ftrace.h > > +++ b/arch/x86/include/asm/ftrace.h > > @@ -36,6 +36,10 @@ static inline unsigned long ftrace_call_adjust(unsigned > > long addr) > > > > #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS > > struct ftrace_regs { > > + /* > > +* Partial, filled with: > > +* rax, rcx, rdx, rdi, rsi, r8, r9, rsp > > Don't we need rbp too? (for frame pointer) /me goes stare at ftrace_64.S, and yes it appears it fills out rbp too.
Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS
On Mon, Nov 06, 2023 at 09:38:50AM +0900, Masami Hiramatsu wrote: > On Sun, 5 Nov 2023 18:34:09 -0500 > Steven Rostedt wrote: > > > On Sun, 5 Nov 2023 18:33:01 -0500 > > Steven Rostedt wrote: > > > > > For x86_64, that would be: > > > > > > rdi, rsi, rdx, r8, r9, rsp > > > > I missed rcx. > > I would like to add rax to the list so that it can handle the return value > too. :) So something like so? diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 897cf02c20b1..71bfe27594a5 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -36,6 +36,10 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr) #ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_regs { + /* +* Partial, filled with: +* rax, rcx, rdx, rdi, rsi, r8, r9, rsp +*/ struct pt_regs regs; };
Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS
On Sun, Nov 05, 2023 at 02:11:30PM -0500, Steven Rostedt wrote: > On Sun, 5 Nov 2023 18:25:36 +0100 > Peter Zijlstra wrote: > > > On Mon, Nov 06, 2023 at 01:11:21AM +0900, Masami Hiramatsu (Google) wrote: > > > From: Masami Hiramatsu (Google) > > > > > > Support HAVE_FUNCTION_GRAPH_FREGS on x86-64, which saves ftrace_regs > > > on the stack in ftrace_graph return trampoline so that the callbacks > > > can access registers via ftrace_regs APIs. > > > > What is ftrace_regs ? If I look at arch/x86/include/asm/ftrace.h it's a > > pointless wrapper around pt_regs. > > > > Can we please remove the pointless wrappery and call it what it is? > > A while back ago when I introduced FTRACE_WITH_ARGS, it would have all > ftrace callbacks get a pt_regs, but it would be partially filled for > those that did not specify the "REGS" flag when registering the > callback. You and Thomas complained that it would be a bug to return > pt_regs that was not full because something might read the non filled > registers and think they were valid. > > To solve this, I came up with ftrace_regs to only hold the registers > that were required for function parameters (including the stack > pointer). You could then call arch_ftrace_get_regs(ftrace_regs) and if > this "wrapper" had all valid pt_regs registers, then it would return > the pt_regs, otherwise it would return NULL, and you would need to use > the ftrace_regs accessor calls to get the function registers. You and > Thomas agreed with this. Changelog nor code made it clear this was partial anything. So this is still the partial thing? Can we then pretty clear clarify all that, and make it clear which regs are in there? Because when I do 'vim -t ftrace_regs' it just gets me a seemingly pointless wrapper struct, no elucidating comments nothingses. 
> You even Acked the patch: > > commit 02a474ca266a47ea8f4d5a11f4ffa120f83730ad > Author: Steven Rostedt (VMware) > Date: Tue Oct 27 10:55:55 2020 -0400 You expect me to remember things from 3 years ago?
Re: [RFC PATCH 24/32] x86/ftrace: Enable HAVE_FUNCTION_GRAPH_FREGS
On Mon, Nov 06, 2023 at 01:11:21AM +0900, Masami Hiramatsu (Google) wrote: > From: Masami Hiramatsu (Google) > > Support HAVE_FUNCTION_GRAPH_FREGS on x86-64, which saves ftrace_regs > on the stack in ftrace_graph return trampoline so that the callbacks > can access registers via ftrace_regs APIs. What is ftrace_regs ? If I look at arch/x86/include/asm/ftrace.h it's a pointless wrapper around pt_regs. Can we please remove the pointless wrappery and call it what it is?
Re: [PATCH V6 5/7] cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically
On Fri, Sep 08, 2023 at 03:46:51PM +0800, Meng Li wrote: > +static void amd_pstate_update_highest_perf(unsigned int cpu) > +{ > + struct cpufreq_policy *policy; > + struct amd_cpudata *cpudata; > + u32 prev_high = 0, cur_high = 0; > + u64 highest_perf; > + int ret; > + > + if (!prefcore) > + return; > + > + ret = amd_pstate_get_highest_perf(cpu, _perf); > + if (ret) > + return; > + > + policy = cpufreq_cpu_get(cpu); > + cpudata = policy->driver_data; > + cur_high = highest_perf; > + prev_high = READ_ONCE(cpudata->prefcore_ranking); > + > + if (prev_high != cur_high) { > + WRITE_ONCE(cpudata->prefcore_ranking, cur_high); > + sched_set_itmt_core_prio(cur_high, cpu); > + } > + > + cpufreq_cpu_put(policy); > +} Idem -- I told to clarify the u32 vs int thing, nothing here.
Re: [PATCH V6 3/7] cpufreq: amd-pstate: Enable amd-pstate preferred core supporting.
On Fri, Sep 08, 2023 at 03:46:49PM +0800, Meng Li wrote: > +static void amd_pstate_init_prefcore(void) > +{ > + int cpu, ret; > + u64 highest_perf; > + > + if (!prefcore) > + return; > + > + for_each_online_cpu(cpu) { > + ret = amd_pstate_get_highest_perf(cpu, _perf); > + if (ret) > + break; > + > + sched_set_itmt_core_prio(highest_perf, cpu); > + > + /* check if CPPC preferred core feature is enabled*/ > + if (highest_perf == AMD_PSTATE_MAX_CPPC_PERF) { > + pr_debug("AMD CPPC preferred core is unsupported!\n"); > + hw_prefcore = false; > + prefcore = false; > + return; > + } > + } > + > + /* > + * This code can be run during CPU online under the > + * CPU hotplug locks, so sched_set_amd_prefcore_support() > + * cannot be called from here. Queue up a work item > + * to invoke it. > + */ > + schedule_work(_prefcore_work); > +} Brilliant, repost without addressing prior feedback.. :-(
Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()
On Tue, Aug 16, 2022 at 11:49:59AM -0700, Dan Williams wrote: > What would have helped is if the secure-erase and unlock definition in > the specification mandated that the device emit cache invalidations for > everything it has mapped when it is erased. However, that has some > holes, and it also makes me think there is a gap in the current region > provisioning code. If I have device-A mapped at physical-address-X and then > tear that down and instantiate device-B at that same physical address > there needs to be CPU cache invalidation between those 2 events. Can we pretty please get those holes fixed ASAP such that future generations can avoid the WBINVD nonsense?
Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()
On Tue, Aug 16, 2022 at 10:42:03AM -0700, Dan Williams wrote: > I also think this cache_flush_region() API wants a prominent comment > clarifying the limited applicability of this API. I.e. that it is not > for general purpose usage, not for VMs, and only for select bare metal > scenarios that instantaneously invalidate wide swaths of memory. > Otherwise, I can now see how this looks like a potentially scary > expansion of the usage of wbinvd. This; because adding a generic API like this makes it ripe for usage. And this is absolutely the very last thing we want used.
Re: [PATCH] arch/cacheflush: Introduce flush_all_caches()
On Mon, Aug 15, 2022 at 09:07:06AM -0700, Davidlohr Bueso wrote: > diff --git a/arch/x86/include/asm/cacheflush.h > b/arch/x86/include/asm/cacheflush.h > index b192d917a6d0..ce2ec9556093 100644 > --- a/arch/x86/include/asm/cacheflush.h > +++ b/arch/x86/include/asm/cacheflush.h > @@ -10,4 +10,7 @@ > > void clflush_cache_range(void *addr, unsigned int size); > > +#define flush_all_caches() \ > + do { wbinvd_on_all_cpus(); } while(0) > + This is horrific... we've done our utmost best to remove all WBINVD usage and here you're adding it back in the most horrible form possible ?!? Please don't do this, do *NOT* use WBINVD.
Re: [PATCH v3 2/8] cxl/acpi: Add root device lockdep validation
On Sat, Apr 23, 2022 at 10:27:52AM -0700, Dan Williams wrote: > ...so I'm going to drop it and just add a comment about the > expectations. As Peter said there's already a multitude of ways to > cause false positive / negative results with lockdep so this is just > one more area where one needs to be careful and understand the lock > context they might be overriding. One safe-guard might be to check the class you're overriding is indeed __no_validate__, and WARN if not. Then the unconditional reset is consistent. Then, if/when, that WARN ever triggers you can revisit all this.
Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation
On Thu, Apr 14, 2022 at 12:43:31PM -0700, Dan Williams wrote: > On Thu, Apr 14, 2022 at 12:34 PM Peter Zijlstra wrote: > > > > On Thu, Apr 14, 2022 at 10:17:13AM -0700, Dan Williams wrote: > > > > > One more sanity check... So in driver subsystems there are cases where > > > a device on busA hosts a topology on busB. When that happens there's a > > > need to set the lock class late in a driver since busA knows nothing > > > about the locking rules of busB. > > > > I'll pretend I konw what you're talking about ;-) > > > > > Since the device has a longer lifetime than a driver when the driver > > > exits it must set dev->mutex back to the novalidate class, otherwise > > > it sets up a use after free of the static lock_class_key. > > > > I'm not following, static storage has infinite lifetime. > > Not static storage in a driver module. > > modprobe -r fancy_lockdep_using_driver.ko > > Any use of device_lock() by the core on a device that a driver in this > module was driving will de-reference a now invalid pointer into > whatever memory was vmalloc'd for the module static data. Ooh, modules (I always, conveniently, forget they exist). Yes, setting a lock instance from the core kernel to a key that's inside a module and then taking the module out will be 'interesting'. Most likely you'll get a splat from lockdep when doing this.
Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation
On Thu, Apr 14, 2022 at 10:17:13AM -0700, Dan Williams wrote: > One more sanity check... So in driver subsystems there are cases where > a device on busA hosts a topology on busB. When that happens there's a > need to set the lock class late in a driver since busA knows nothing > about the locking rules of busB. I'll pretend I konw what you're talking about ;-) > Since the device has a longer lifetime than a driver when the driver > exits it must set dev->mutex back to the novalidate class, otherwise > it sets up a use after free of the static lock_class_key. I'm not following, static storage has infinite lifetime. > I came up with this and it seems to work, just want to make sure I'm > properly using the lock_set_class() API and it is ok to transition > back and forth from the novalidate case: > > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h > index 990b6670222e..32673e1a736d 100644 > --- a/drivers/cxl/cxl.h > +++ b/drivers/cxl/cxl.h > @@ -405,6 +405,29 @@ struct cxl_nvdimm_bridge > *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd); > #define __mock static > #endif > > +#ifdef CONFIG_PROVE_LOCKING > +static inline void cxl_lock_reset_class(void *_dev) > +{ > + struct device *dev = _dev; > + > + lock_set_class(>mutex.dep_map, "__lockdep_no_validate__", > + &__lockdep_no_validate__, 0, _THIS_IP_); > +} > + > +static inline int cxl_lock_set_class(struct device *dev, const char *name, > +struct lock_class_key *key) > +{ > + lock_set_class(>mutex.dep_map, name, key, 0, _THIS_IP_); > + return devm_add_action_or_reset(dev, cxl_lock_reset_class, dev); > +} > +#else > +static inline int cxl_lock_set_class(struct device *dev, const char *name, > +struct lock_class_key *key) > +{ > + return 0; > +} > +#endif Under the assumption that the lock is held (lock_set_class() will actually barf if @lock isn't held) this should indeed work as expected (although I think you got the @name part 'wrong', I think that's canonically something like ">mutex" or something).
Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation
On Wed, Apr 13, 2022 at 03:01:21PM -0700, Dan Williams wrote: > > That's not an obvious conclusion; lockdep has lots of funny annotations, > > subclasses is just one. > > > > I think the big new development in lockdep since that time with Alan > > Stern is that lockdep now has support for dynamic keys; that is lock > > keys in heap memory (as opposed to static storage). > > Ah, I was not aware of that, that should allow for deep cleanups of > this proposal. > > If you want lockdep validation for one (or more) dev->mutex instances, > > why not pull them out of the no_validate class and use the normal > > locking? > > Sounds perfect, just didn't know how to do that with my current > understanding of how to communicate this to lockdep. > > > > > This is all quite insane. > > Yes, certainly in comparison to your suggestion on the next patch. > That looks much more sane, and even better I think it allows for > optional lockdep validation without even needing to touch > include/linux/device.h. Right, so lockdep has: - classes, based off of lock_class_key address; * lock_class_key should be static storage; except now we also have: lockdep_{,un}register_key() which allows dynamic memory (aka. heap) to be used for classes, important to note that lockdep memory usage is still static storage because the memory allocators use locks too. So if you register too many dynamic keys, you'll run out of lockdep memory etc.. so be careful. * things like mutex_init() have a static lock_class_key per site and hence every lock initialized by the same code ends up in the same class by default. * can be trivially changed at any time, assuming the lock isn't held, using lockdep_set_class*() family. (extensively used all over the kernel, for example by the vfs to give each filesystem type their own locking order rules) * lockdep_set_no_validate_class() is a magical variant of lockdep_set_class() that sets a magical lock_class_key. 
* can be changed while held using lock_set_class() ( from a lockdep pov it unlocks the held stack, changes the class of your lock, and re-locks the held stack, now with a different class nesting ). Be careful! It doesn't forget the old nesting order, so you can trivially generate cycles. - subclasses, basically distinct addresses inside above mentioned lock_class_key object, limited to 8. Normally used with *lock_nested() family of functions. Typically used to lock multiple instances of a single lock class where there is defined order between instances (see for instance: double_rq_lock()). - nest_lock; eg. mutex_lock_nest_lock(), which allows expressing things like: multiple locks of class A can be taken in any order, provided we hold lock B. With many of these, it's possible to get it wrong and annotate real deadlocks away, so be careful :-)
Re: [PATCH v2 03/12] cxl/core: Refactor a cxl_lock_class() out of cxl_nested_lock()
*Completely* untested (I wouldn't know where to begin and probably odn't have the hardware anyway), and known incomplete. What's wrong with something like this then? --- drivers/cxl/core/pmem.c | 8 + drivers/cxl/core/port.c | 58 +--- drivers/cxl/cxl.h | 78 - 3 files changed, 42 insertions(+), 102 deletions(-) diff --git a/drivers/cxl/core/pmem.c b/drivers/cxl/core/pmem.c index 8de240c4d96b..aca0dd5eefeb 100644 --- a/drivers/cxl/core/pmem.c +++ b/drivers/cxl/core/pmem.c @@ -80,6 +80,8 @@ struct cxl_nvdimm_bridge *cxl_find_nvdimm_bridge(struct cxl_nvdimm *cxl_nvd) } EXPORT_SYMBOL_NS_GPL(cxl_find_nvdimm_bridge, CXL); +static lock_class_key cxl_nvdimm_bridge_key; + static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port) { struct cxl_nvdimm_bridge *cxl_nvb; @@ -104,6 +106,8 @@ static struct cxl_nvdimm_bridge *cxl_nvdimm_bridge_alloc(struct cxl_port *port) dev->bus = _bus_type; dev->type = _nvdimm_bridge_type; + lockdep_set_class(>mutex, _nvdimm_bridge_key); + return cxl_nvb; err: @@ -214,6 +218,8 @@ struct cxl_nvdimm *to_cxl_nvdimm(struct device *dev) } EXPORT_SYMBOL_NS_GPL(to_cxl_nvdimm, CXL); +static struct lock_class_key cxl_nvdimm_key; + static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd) { struct cxl_nvdimm *cxl_nvd; @@ -231,6 +237,8 @@ static struct cxl_nvdimm *cxl_nvdimm_alloc(struct cxl_memdev *cxlmd) dev->bus = _bus_type; dev->type = _nvdimm_type; + lockdep_set_class(>mutex, _nvdimm_key); + return cxl_nvd; } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 2ab1ba4499b3..cae88404612f 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -312,10 +312,10 @@ static void cxl_port_release(struct device *dev) struct cxl_port *port = to_cxl_port(dev); struct cxl_ep *ep, *_e; - cxl_device_lock(dev); + device_lock(dev); list_for_each_entry_safe(ep, _e, >endpoints, list) cxl_ep_release(ep); - cxl_device_unlock(dev); + device_unlock(dev); ida_free(_port_ida, port->id); kfree(port); } @@ -391,6 
+391,8 @@ static int devm_cxl_link_uport(struct device *host, struct cxl_port *port) return devm_add_action_or_reset(host, cxl_unlink_uport, port); } +static struct lock_class_key cxl_port_key; + static struct cxl_port *cxl_port_alloc(struct device *uport, resource_size_t component_reg_phys, struct cxl_port *parent_port) @@ -431,6 +433,8 @@ static struct cxl_port *cxl_port_alloc(struct device *uport, dev->bus = _bus_type; dev->type = _port_type; + lockdep_set_class(>mutex, _port_key); + return port; err: @@ -457,8 +461,10 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, if (IS_ERR(port)) return port; - if (parent_port) + if (parent_port) { port->depth = parent_port->depth + 1; + lockdep_set_class_and_subclass(>dev->mutex, _port, port->depth); + } dev = >dev; if (is_cxl_memdev(uport)) rc = dev_set_name(dev, "endpoint%d", port->id); @@ -554,7 +560,7 @@ static int match_root_child(struct device *dev, const void *match) return 0; port = to_cxl_port(dev); - cxl_device_lock(dev); + device_lock(dev); list_for_each_entry(dport, >dports, list) { iter = match; while (iter) { @@ -564,7 +570,7 @@ static int match_root_child(struct device *dev, const void *match) } } out: - cxl_device_unlock(dev); + device_unlock(dev); return !!iter; } @@ -623,13 +629,13 @@ static int add_dport(struct cxl_port *port, struct cxl_dport *new) static void cond_cxl_root_lock(struct cxl_port *port) { if (is_cxl_root(port)) - cxl_device_lock(>dev); + device_lock(>dev); } static void cond_cxl_root_unlock(struct cxl_port *port) { if (is_cxl_root(port)) - cxl_device_unlock(>dev); + device_unlock(>dev); } static void cxl_dport_remove(void *data) @@ -736,15 +742,15 @@ static int add_ep(struct cxl_port *port, struct cxl_ep *new) { struct cxl_ep *dup; - cxl_device_lock(>dev); + device_lock(>dev); if (port->dead) { - cxl_device_unlock(>dev); + device_unlock(>dev); return -ENXIO; } dup = find_ep(port, new->ep); if (!dup) list_add_tail(>list, >endpoints); - 
cxl_device_unlock(>dev); + device_unlock(>dev); return dup ? -EEXIST : 0; } @@ -854,12 +860,12 @@ static void delete_endpoint(void *data)
Re: [PATCH v2 02/12] device-core: Add dev->lock_class to enable device_lock() lockdep validation
On Tue, Apr 12, 2022 at 11:01:38PM -0700, Dan Williams wrote: > The device_lock() is hidden from lockdep by default because, for > example, a device subsystem may do something like: > > --- > device_add(dev1); > ...in driver core... > device_lock(dev1); > bus->probe(dev1); /* where bus->probe() calls driver1_probe() */ > > driver1_probe(struct device *dev) > { > ...do some enumeration... > dev2->parent = dev; > /* this triggers probe under device_lock(dev2); */ > device_add(dev2); > } > --- > > To lockdep, that device_lock(dev2) looks like a deadlock because lockdep Recursion, you're meaning to say it looks like same lock recursion. > only sees lock classes, not individual lock instances. All device_lock() > instances across the entire kernel are the same class. However, this is > not a deadlock in practice because the locking is strictly hierarchical. > I.e. device_lock(dev1) is held over device_lock(dev2), but never the > reverse. I have some very vague memories from a conversation with Alan Stern, some maybe 10 years ago, where I think he was explaining to me this was not in fact a simple hierarchy. > In order for lockdep to be satisfied and see that it is > hierarchical in practice the mutex_lock() call in device_lock() needs to > be moved to mutex_lock_nested() where the @subclass argument to > mutex_lock_nested() represents the nesting level, i.e.: That's not an obvious conclusion; lockdep has lots of funny annotations, subclasses is just one. I think the big new development in lockdep since that time with Alan Stern is that lockdep now has support for dynamic keys; that is lock keys in heap memory (as opposed to static storage). > s/device_lock(dev1)/mutex_lock_nested(>mutex, 1)/ > > s/device_lock(dev2)/mutex_lock_nested(>mutex, 2)/ > > Now, what if the internals of the device_lock() could be annotated with > the right @subclass argument to call mutex_lock_nested()? > > With device_set_lock_class() a subsystem can optionally add that > metadata. 
The device_lock() still takes dev->mutex, but when > dev->lock_class is >= 0 it additionally takes dev->lockdep_mutex with > the proper nesting. Unlike dev->mutex, dev->lockdep_mutex is not marked > lockdep_set_novalidate_class() and lockdep will become useful... at > least for one subsystem at a time. > > It is still the case that only one subsystem can be using lockdep with > lockdep_mutex at a time because different subsystems will collide class > numbers. You might say "well, how about subsystem1 gets class ids 0 to 9 > and subsystem2 gets class ids 10 to 20?". MAX_LOCKDEP_SUBCLASSES is 8, > and 8 is just enough class ids for one subsystem of moderate complexity. Again, that doesn't seem like an obvious suggestion at all. Why not give each subsystem a different lock key? > diff --git a/include/linux/device.h b/include/linux/device.h > index af2576ace130..6083e757e804 100644 > --- a/include/linux/device.h > +++ b/include/linux/device.h > @@ -402,6 +402,7 @@ struct dev_msi_info { > * @mutex: Mutex to synchronize calls to its driver. > * @lockdep_mutex: An optional debug lock that a subsystem can use as a > * peer lock to gain localized lockdep coverage of the device_lock. > + * @lock_class: per-subsystem annotated device lock class > * @bus: Type of bus device is on. > * @driver: Which driver has allocated this > * @platform_data: Platform data specific to the device. > @@ -501,6 +502,7 @@ struct device { > dev_set_drvdata/dev_get_drvdata */ > #ifdef CONFIG_PROVE_LOCKING > struct mutexlockdep_mutex; > + int lock_class; > #endif > struct mutexmutex; /* mutex to synchronize calls to >* its driver. 
> @@ -762,18 +764,100 @@ static inline bool dev_pm_test_driver_flags(struct > device *dev, u32 flags) > return !!(dev->power.driver_flags & flags); > } > > +static inline void device_lock_assert(struct device *dev) > +{ > + lockdep_assert_held(>mutex); > +} > + > #ifdef CONFIG_PROVE_LOCKING > static inline void device_lockdep_init(struct device *dev) > { > mutex_init(>lockdep_mutex); > + dev->lock_class = -1; > lockdep_set_novalidate_class(>mutex); > } > -#else > + > +static inline void device_lock(struct device *dev) > +{ > + /* > + * For double-lock programming errors the kernel will hang > + * trying to acquire @dev->mutex before lockdep can report the > + * problem acquiring @dev->lockdep_mutex, so manually assert > + * before that hang. > + */ > + lockdep_assert_not_held(>lockdep_mutex); > + > + mutex_lock(>mutex); > + if (dev->lock_class >= 0) > + mutex_lock_nested(>lockdep_mutex, dev->lock_class); > +} > + > +static inline int device_lock_interruptible(struct device *dev) > +{ > + int rc; > + > + lockdep_assert_not_held(>lockdep_mutex); > + > + rc =
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Wed, Apr 21, 2021 at 03:37:11AM +0900, Namhyung Kim wrote: > On Tue, Apr 20, 2021 at 7:28 PM Peter Zijlstra wrote: > > You forgot NMI. > > Thanks for your explanation. Maybe I'm missing something but > this event is basically for counting and doesn't allow sampling. > Do you say it's affected by other sampling events? Note that > it's not reading from the PMU here, what it reads is a snapshot > of last pmu->read(event) afaik. Even !sampling events will trigger NMI to deal with short hardware counters rolling over. But yes, !sampling can also be updated from NMI by other events if they're in a group etc.. Basically, always assume NMI/PMI can happen.
Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
On Tue, Apr 20, 2021 at 05:53:40PM +0100, Vincent Donnefort wrote: > All good with that snippet on my end. > > I wonder if balance_push() shouldn't use the cpu_of() accessor > instead of rq->cpu. That might be a personal quirk of mine, but for code that is under CONFIG_SMP (as all balancing code must be) I tend to prefer the more explicit rq->cpu usage. cpu_of() obviously also works. > Otherwise, > > + Reviewed-by: Vincent Donnefort Thanks!, now I get to write a Changelog :-)
Re: [PATCH] perf/x86: Fix integer overflow when left shifting an integer more than 32 bits
On Tue, Apr 20, 2021 at 05:03:03PM +0200, Peter Zijlstra wrote: > On Tue, Apr 20, 2021 at 03:29:07PM +0100, Colin King wrote: > > From: Colin Ian King > > > > The 64 bit value read from MSR_ARCH_PERFMON_FIXED_CTR_CTRL is being > > bit-wise masked with the value (0x03 << i*4). However, the shifted value > > is evaluated using 32 bit arithmetic, so will overflow when i > 8. > > Fix this by making 0x03 a ULL so that the shift is performed using > > 64 bit arithmetic. > > > > Addresses-Coverity: ("Unintentional integer overflow") > > Strange tag that, also inaccurate, wide shifts are UB and don't behave > consistently. > > As is, we've not had hardware with that many fixed counters, but yes, > worth fixing I suppose. Patch now reads: --- Subject: perf/x86: Allow for 8 Date: Tue, 20 Apr 2021 15:29:07 +0100 From: Colin Ian King The 64 bit value read from MSR_ARCH_PERFMON_FIXED_CTR_CTRL is being bit-wise masked with the value (0x03 << i*4). However, the shifted value is evaluated using 32 bit arithmetic, so will UB when i > 8. Fix this by making 0x03 a ULL so that the shift is performed using 64 bit arithmetic. This makes the arithmetic internally consistent and preparers for the day when hardware provides 8 Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210420142907.382417-1-colin.k...@canonical.com --- arch/x86/events/core.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -261,7 +261,7 @@ static bool check_hw_exists(void) for (i = 0; i < x86_pmu.num_counters_fixed; i++) { if (fixed_counter_disabled(i)) continue; - if (val & (0x03 << i*4)) { + if (val & (0x03ULL << i*4)) { bios_fail = 1; val_fail = val; reg_fail = reg;
Re: [PATCH] perf/x86: Fix integer overflow when left shifting an integer more than 32 bits
On Tue, Apr 20, 2021 at 03:29:07PM +0100, Colin King wrote: > From: Colin Ian King > > The 64 bit value read from MSR_ARCH_PERFMON_FIXED_CTR_CTRL is being > bit-wise masked with the value (0x03 << i*4). However, the shifted value > is evaluated using 32 bit arithmetic, so will overflow when i > 8. > Fix this by making 0x03 a ULL so that the shift is performed using > 64 bit arithmetic. > > Addresses-Coverity: ("Unintentional integer overflow") Strange tag that, also inaccurate, wide shifts are UB and don't behave consistently. As is, we've not had hardware with that many fixed counters, but yes, worth fixing I suppose. > Fixes: a5ebe0ba3dff ("perf/x86: Check all MSRs before passing hw check") > Signed-off-by: Colin Ian King > --- > arch/x86/events/core.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c > index bafd93c54ffa..59c665c8c2e9 100644 > --- a/arch/x86/events/core.c > +++ b/arch/x86/events/core.c > @@ -261,7 +261,7 @@ static bool check_hw_exists(void) > for (i = 0; i < x86_pmu.num_counters_fixed; i++) { > if (fixed_counter_disabled(i)) > continue; > - if (val & (0x03 << i*4)) { > + if (val & (0x03ULL << i*4)) { > bios_fail = 1; > val_fail = val; > reg_fail = reg; > -- > 2.30.2 >
Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
On Tue, Apr 20, 2021 at 04:39:04PM +0200, Peter Zijlstra wrote: > On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote: > > On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote: > > > > > Found the issue: > > > > > > $ cat hotplug/states: > > > 219: sched:active > > > 220: online > > > > > > CPU0: > > > > > > $ echo 219 > hotplug/fail > > > $ echo 0 > online > > > > > > => cpu_active = 1 cpu_dying = 1 > > > > > > which means that later on, for another CPU hotunplug, in > > > __balance_push_cpu_stop(), the fallback rq for a kthread can select that > > > CPU0, but __migrate_task() would fail and we end-up in an infinite loop, > > > trying to migrate that task to CPU0. > > > > > > The problem is that for a failure in sched:active, as "online" has no > > > callback, > > > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying > > > bit would > > > not be reset. > > > > Urgh! Good find. > I seem to have triggered the BUG() in select_fallback_rq() with your recipe. > Have cpu0 fail on sched:active, then offline all other CPUs. > > Now lemme add that patch. (which obviously didn't actually build) seems to fix it. 
--- diff --git a/kernel/cpu.c b/kernel/cpu.c index 838dcf238f92..e538518556f4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -63,6 +63,7 @@ struct cpuhp_cpu_state { boolrollback; boolsingle; boolbringup; + int cpu; struct hlist_node *node; struct hlist_node *last; enum cpuhp_statecb_state; @@ -160,9 +161,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; - if (cpu_dying(cpu) != !bringup) - set_cpu_dying(cpu, !bringup); - if (st->fail == state) { st->fail = CPUHP_INVALID; return -EAGAIN; @@ -467,13 +465,16 @@ static inline enum cpuhp_state cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; + bool bringup = st->state < target; st->rollback = false; st->last = NULL; st->target = target; st->single = false; - st->bringup = st->state < target; + st->bringup = bringup; + if (cpu_dying(st->cpu) != !bringup) + set_cpu_dying(st->cpu, !bringup); return prev_state; } @@ -481,6 +482,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) static inline void cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) { + bool bringup = !st->bringup; + st->target = prev_state; /* @@ -503,7 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) st->state++; } - st->bringup = !st->bringup; + st->bringup = bringup; + if (cpu_dying(st->cpu) != !bringup) + set_cpu_dying(st->cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */ @@ -693,6 +698,7 @@ static void cpuhp_create(unsigned int cpu) init_completion(>done_up); init_completion(>done_down); + st->cpu = cpu; } static int cpuhp_should_run(unsigned int cpu)
Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
On Tue, Apr 20, 2021 at 04:20:56PM +0200, Peter Zijlstra wrote: > On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote: > > > Found the issue: > > > > $ cat hotplug/states: > > 219: sched:active > > 220: online > > > > CPU0: > > > > $ echo 219 > hotplug/fail > > $ echo 0 > online > > > > => cpu_active = 1 cpu_dying = 1 > > > > which means that later on, for another CPU hotunplug, in > > __balance_push_cpu_stop(), the fallback rq for a kthread can select that > > CPU0, but __migrate_task() would fail and we end-up in an infinite loop, > > trying to migrate that task to CPU0. > > > > The problem is that for a failure in sched:active, as "online" has no > > callback, > > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit > > would > > not be reset. > > Urgh! Good find. > > > Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better > > place to > > switch the dying bit? > > Yes, except now cpuhp_invoke_ap_callback() makes my head hurt, that runs > the callbacks out of order. I _think_ we can ignore it, but > > Something like the below, let me see if I can reproduce and test. I seem to have triggered the BUG() in select_fallback_rq() with your recipe. Have cpu0 fail on sched:active, then offline all other CPUs. Now lemme add that patch.
Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
On Tue, Apr 20, 2021 at 10:46:33AM +0100, Vincent Donnefort wrote: > Found the issue: > > $ cat hotplug/states: > 219: sched:active > 220: online > > CPU0: > > $ echo 219 > hotplug/fail > $ echo 0 > online > > => cpu_active = 1 cpu_dying = 1 > > which means that later on, for another CPU hotunplug, in > __balance_push_cpu_stop(), the fallback rq for a kthread can select that > CPU0, but __migrate_task() would fail and we end-up in an infinite loop, > trying to migrate that task to CPU0. > > The problem is that for a failure in sched:active, as "online" has no > callback, > there will be no call to cpuhp_invoke_callback(). Hence, the cpu_dying bit > would > not be reset. Urgh! Good find. > Maybe cpuhp_reset_state() and cpuhp_set_state() would then be a better place > to > switch the dying bit? Yes, except now cpuhp_invoke_ap_callback() makes my head hurt, that runs the callbacks out of order. I _think_ we can ignore it, but Something like the below, let me see if I can reproduce and test. 
--- diff --git a/kernel/cpu.c b/kernel/cpu.c index 838dcf238f92..05272bae953d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -160,9 +160,6 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; - if (cpu_dying(cpu) != !bringup) - set_cpu_dying(cpu, !bringup); - if (st->fail == state) { st->fail = CPUHP_INVALID; return -EAGAIN; @@ -467,13 +464,16 @@ static inline enum cpuhp_state cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) { enum cpuhp_state prev_state = st->state; + bool bringup = st->state < target; st->rollback = false; st->last = NULL; st->target = target; st->single = false; - st->bringup = st->state < target; + st->bringup = bringup; + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); return prev_state; } @@ -481,6 +481,8 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target) static inline void cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) { + bool bringup = !st->bringup; + st->target = prev_state; /* @@ -503,7 +505,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state) st->state++; } - st->bringup = !st->bringup; + st->bringup = bringup; + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); } /* Regular hotplug invocation of the AP hotplug thread */
Re: [PATCH] smp: add a best_effort version of smp_call_function_many()
On Tue, Apr 20, 2021 at 12:41:08PM +0200, Luigi Rizzo wrote: > On Tue, Apr 20, 2021 at 11:14 AM Peter Zijlstra wrote: > > We mostly try and avoid using this stuff wherever possible. Only when > > no other choice is left do we send IPIs. > > > > NOHZ_FULL already relies on this and gets massively unhappy when a new > > user comes and starts to spray IPIs. > > I am curious, why is that -- is it because the new user is stealing > the shared csd's in cfd_data (see below), or some other reason ? The premise of NOHZ_FULL is that it will not be interrupted. There are users who are working on a mode where any interruption will cause a (fatal) signal. > > So no; mostly we send an IPI because we _HAVE_ to, not because giggles. > > > > That said; there's still some places left where we can avoid sending > > IPIs, but in all those cases correctness mandates we actually handle > > things and not randomly not do anything. > > My case too requires that the request is eventually handled, but with > this non-blocking IPI the caller has a better option than blocking: > it can either retry the multicast IPI at a later time if conditions allow, > or it can post a dedicated CSD (with the advantage that being my > requests idempotent, if the CSD is locked there is no need to retry > because it means the handler has not started yet). > > In fact, if we had the option to use dedicated CSDs for multicast IPI, > we wouldn't even need to retry because we'd know that the posted CSD > is for our call back and not someone else's. What are you doing that CSD contention is such a problem?
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Tue, Apr 20, 2021 at 01:34:40AM -0700, Stephane Eranian wrote: > The sampling approach will certainly incur more overhead and be at > risk of losing the ability to reconstruct the total counter > per-cgroup, unless you set the period for SW_CGROUP_SWITCHES to 1. > But then, you run the risk of losing samples if the buffer is full or > sampling is throtlled. In some scenarios, we believe the number of > context switches between cgroup could be quite high (>> 1000/s). And > on top you would have to add the processing of the samples to extract > the counts per cgroup. That would require a synthesis on cgroup on > perf record and some post-processing on perf report. We are interested > in using the data live to make some policy decisions, so a counting > approach with perf stat will always be best. > > The fundamental problem Namhyung is trying to solve is the following: > > num_fds = num_cpus x num_events x num_cgroups > > On an 256-CPU AMD server running 200 cgroups with 6 events/cgroup (as > an example): > > num_fds = 256 x 200 x 6 = 307,200 fds (with all the kernel memory > associated with them). So the PERCPU proposal would reduce that to 200 * 6 = 1200 fds, which is a definite win. > On each CPU, that implies: 200 x 6 = 1200 > events to schedule and 6 to find on each cgroup switch Right, so here we could optimize; if we find the event-groups are identical in composition we can probably frob something that swizzles the counts around without touching the PMU. That would be very similar to what we already have in perf_event_context_sched_out(). This gets a wee bit tricky when you consider cgroup hierarchy though; suppose you have: R / \ A B / \ C D And are monitoring both B and D, then you'll end up with 12 counters active instead of the 6. I'm not sure how to make that go away. 'Don't do that then' might be good enough. 
> This does not scale for us: >- run against the fd limit, but also memory consumption in the >kernel per struct file, struct inode, struct perf_event >- number of events per-cpu is still also large >- require event scheduling on cgroup switches, even with RB-tree >improvements, still heavy >- require event scheduling even if measuring the same events across >all cgroups > > One factor in that equation above needs to disappear. The one counter > per file descriptor is respected with Nahmyung's patch because he is > operating a plain per-cpu mode. What changes is just how and where the > count is accumulated in perf_events. The resulting programming on the > hardware is the same as before. Yes, you're aggregating differently. And that's exactly the problem. The aggregation is a variable one with fairly poor semantics. Suppose you create a new cgroup, then you have to tear down and recreate the whole thing, which is pretty crap. Ftrace had a similar issue; where people wanted aggregation, and that resulted in the event histogram, which, quite frankla,y is a scary monster that I've no intention of duplicating. That's half a programming language implemented. > As you point out, the difficulty is how to express the cgroups of > interest and how to read the counts back. I agree that the ioctl() is > not ideal for the latter. For the former, if you do not want ioctl() > then you would have to overload perf_event_open() with a vector of > cgroup fd, for instance. As for the read, you could, as you suggest, > use the read syscall if you want to read all the cgroups at once using > a new read_format. I don't have a problem with that. As for cgroup-id > vs. cgroup-fd, I think you make a fair point about consistency with > the existing approach. I don't have a problem with that either So that is a problem of aggregation; which is basically a programmability problem. You're asking for a variadic-fixed-function now, but tomorrow someone else will come and want another one.
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Fri, Apr 16, 2021 at 06:49:09PM +0900, Namhyung Kim wrote: > On Thu, Apr 15, 2021 at 11:51 PM Peter Zijlstra wrote: > > > +static void perf_update_cgroup_node(struct perf_event *event, struct > > > cgroup *cgrp) > > > +{ > > > + u64 delta_count, delta_time_enabled, delta_time_running; > > > + int i; > > > + > > > + if (event->cgrp_node_count == 0) > > > + goto out; > > > + > > > + delta_count = local64_read(&event->count) - event->cgrp_node_count; From here... > > > + delta_time_enabled = event->total_time_enabled - > > > event->cgrp_node_time_enabled; > > > + delta_time_running = event->total_time_running - > > > event->cgrp_node_time_running; > > > + > > > + /* account delta to all ancestor cgroups */ > > > + for (i = 0; i <= cgrp->level; i++) { > > > + struct perf_cgroup_node *node; > > > + > > > + node = find_cgroup_node(event, cgrp->ancestor_ids[i]); > > > + if (node) { > > > + node->count += delta_count; > > > + node->time_enabled += delta_time_enabled; > > > + node->time_running += delta_time_running; > > > + } > > > + } ... till here, NMI could hit and increment event->count, which then means that: > > > + > > > +out: > > > + event->cgrp_node_count = local64_read(&event->count); This load doesn't match the delta_count load and events will go missing. Obviously correct solution is: event->cgrp_node_count += delta_count; > > > + event->cgrp_node_time_enabled = event->total_time_enabled; > > > + event->cgrp_node_time_running = event->total_time_running; And while total_time doesn't have that problem, consistency would then have you do: event->cgrp_node_time_foo += delta_time_foo; > > > > This is wrong; there's no guarantee these are the same values you read > > at the begin, IOW you could be loosing events. > > Could you please elaborate? You forgot NMI.
Re: [syzbot] WARNING in kthread_is_per_cpu
On Tue, Apr 20, 2021 at 10:43:43AM +0100, Valentin Schneider wrote: > On 20/04/21 10:51, Peter Zijlstra wrote: > > I think free_kthread_struct() is ok, because a task at that point in its > > lifetime cannot be also doing exec(). > > > > What if it's one of those kthreads created by directly invoking > kernel_thread()? AFAICT right now it's only umh, and that one does execve() > so it ends up stripped of PF_KTHREAD. It could however go through an error > path, i.e. not call exec, and exit, giving us: > > put_task_struct(p) > `\ > free_task(p) > `\ > if (tsk->flags & PF_KTHREAD) > free_kthread_struct(tsk); > `\ > to_kthread(p) I'm not following, at the point we hit free_task() it had better be dead and p->flags had better be stable. Either it will, or will not, have PF_KTHREAD.
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Tue, Apr 20, 2021 at 01:34:40AM -0700, Stephane Eranian wrote: > The sampling approach will certainly incur more overhead and be at > risk of losing the ability to > reconstruct the total counter per-cgroup, unless you set the period > for SW_CGROUP_SWITCHES to > 1. But then, you run the risk of losing samples if the buffer is full > or sampling is throtlled. > In some scenarios, we believe the number of context switches between > cgroup could be quite high (>> 1000/s). > And on top you would have to add the processing of the samples to > extract the counts per cgroup. That would require > a synthesis on cgroup on perf record and some post-processing on perf > report. We are interested in using the data live > to make some policy decisions, so a counting approach with perf stat > will always be best. Can you please configure your MUA to sanely (re)flow text? The above random line-breaks are *so* painful to read.
Re: [PATCH] smp: add a best_effort version of smp_call_function_many()
On Mon, Apr 19, 2021 at 11:07:08PM +0200, Luigi Rizzo wrote: > On Mon, Apr 19, 2021 at 9:17 PM Peter Zijlstra wrote: > > > > On Mon, Apr 19, 2021 at 11:44:55AM -0700, Luigi Rizzo wrote: > > > Regardless of the 'wait' argument, smp_call_function_many() must spin > > > if any of the target CPUs have their csd busy waiting to be processed > > > for a previous call. This may cause high tail latencies e.g. when some > > > of the target CPUs are running functions that disable interrupts for a > > > long time; getrusage() is one possible culprit. > > > > > > Here we introduce a variant, __smp_call_function_many(), that adds > > > a third 'best_effort' mode to the two existing ones (nowait, wait). > > > In best effort mode, the call will skip CPUs whose csd is busy, and if > > > any CPU is skipped it returns -EBUSY and the set of busy in the mask. > > > This allows the caller to decide how to proceed, e.g. it might retry at > > > a later time, or use a private csd, etc.. > > > > > > The new function is a compromise to avoid touching existing callers of > > > smp_call_function_many(). If the feature is considered interesting, we > > > could even replace the 'wait' argument with a ternary 'mode' in all > > > smp_call_function_*() and derived methods. > > > > I don't see a user of this... > > This is actually something for which I was looking for feedback: > > my use case is similar to a periodic garbage collect request: > the caller tells targets that it may be time to do some work, > but it does not matter if the request is dropped because the > caller knows who was busy and will reissue pending requests later. > > I would expect something like the above could be useful e.g. > in various kinds of resource manager. > > However, a grep for on_each_cpu_*() and smp_call_function_*() > mostly returns synchronous calls (wait=1). > > Any possible candidates that people can think of ? We mostly try and avoid using this stuff wherever possible. 
Only when no other choice is left do we send IPIs. NOHZ_FULL already relies on this and gets massively unhappy when a new user comes and starts to spray IPIs. So no; mostly we send an IPI because we _HAVE_ to, not because giggles. That said; there's still some places left where we can avoid sending IPIs, but in all those cases correctness mandates we actually handle things and not randomly not do anything. For example, look at arch/x86/kernel/alternative.c:text_poke_sync(). The purpose of that is to ensure all CPUs observe modified *kernel* code. Now, if that CPU is currently running userspace, it doesn't much care kernel code is changed, however that does mean it needs to call sync_core() upon entering kernel, *BEFORE* hitting any code that's possibly modified (and self modifying code is everywhere today, ironically also very much in the NOHZ_FULL entry paths). So untangling all that should be possible, but is something that requires quite a bit of care and doesn't benefit from anything like the proposed. Mostly it sounds like you shouldn't be using IPIs either.
Re: [syzbot] WARNING in kthread_is_per_cpu
On Mon, Apr 19, 2021 at 08:58:26PM +0100, Valentin Schneider wrote: > Looks about right, IIUC the key being: > > p->flags & PF_KTHREAD + p->set_child_tid => the struct kthread is > persistent > > p->flags & PF_KTHREAD => you may or may not have a struct kthread (see > kernel/umh.c kernel_thread() uses). PF_KTHREAD isn't even guaranteed to > persist (begin_new_exec()), which seems to be what the syzbot hit. Ack, that's nicely put. > While we're at it, does free_kthread_struct() want the __to_kthread() > treatment as well? The other to_kthread() callsites looked like they only > made sense with a "proper" kthread anyway. I think free_kthread_struct() is ok, because a task at that point in its lifetime cannot be also doing exec(). kthread_func() is another 'fun' trainwreck waiting to happen -- luckily the only caller uses current, still let me go fix it. kthread_probe_data() relies on PF_WQ_WORKER implying PF_KTHREAD but otherwise seems very fragile too. Something like so then? --- Subject: kthread: Fix PF_KTHREAD vs to_kthread() race From: Peter Zijlstra Date: Tue Apr 20 10:18:17 CEST 2021 The kthread_is_per_cpu() construct relies on only being called on PF_KTHREAD tasks (per the WARN in to_kthread). This gives rise to the following usage pattern: if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p)) However, as reported by syzcaller, this is broken. The scenario is: CPU0CPU1 (running p) (p->flags & PF_KTHREAD) // true begin_new_exec() me->flags &= ~(PF_KTHREAD|...); kthread_is_per_cpu(p) to_kthread(p) WARN(!(p->flags & PF_KTHREAD) <-- *SPLAT* Introduce __to_kthread() that omits the WARN and is sure to check both values. Use this to remove the problematic pattern for kthread_is_per_cpu() and fix a number of other kthread_*() functions that have similar issues but are currently not used in ways that would expose the problem. 
Notably kthread_func() is only ever called on 'current', while kthread_probe_data() is only used for PF_WQ_WORKER, which implies the task is from kthread_create*(). Fixes: ac687e6e8c26 ("kthread: Extract KTHREAD_IS_PER_CPU") Signed-off-by: Peter Zijlstra (Intel) --- kernel/kthread.c| 33 +++-- kernel/sched/core.c |2 +- kernel/sched/fair.c |2 +- 3 files changed, 29 insertions(+), 8 deletions(-) --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -84,6 +84,25 @@ static inline struct kthread *to_kthread return (__force void *)k->set_child_tid; } +/* + * Variant of to_kthread() that doesn't assume @p is a kthread. + * + * Per construction; when: + * + * (p->flags & PF_KTHREAD) && p->set_child_tid + * + * the task is both a kthread and struct kthread is persistent. However + * PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and + * begin_new_exec()). + */ +static inline struct kthread *__to_kthread(struct task_struct *p) +{ + void *kthread = (__force void *)p->set_child_tid; + if (kthread && !(p->flags & PF_KTHREAD)) + kthread = NULL; + return kthread; +} + void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; @@ -168,8 +187,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_shou */ void *kthread_func(struct task_struct *task) { - if (task->flags & PF_KTHREAD) - return to_kthread(task)->threadfn; + struct kthread *kthread = __to_kthread(task); + if (kthread) + return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); @@ -199,10 +219,11 @@ EXPORT_SYMBOL_GPL(kthread_data); */ void *kthread_probe_data(struct task_struct *task) { - struct kthread *kthread = to_kthread(task); + struct kthread *kthread = __to_kthread(task); void *data = NULL; - copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); + if (kthread) + copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } @@ -514,9 +535,9 @@ void kthread_set_per_cpu(struct task_str set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } -bool kthread_is_per_cpu(struct task_struct *k) +bool
kthread_is_per_cpu(struct task_struct *p) { - struct kthread *kthread = to_kthread(k); + struct kthread *kthread = __to_kthread(p); if (!kthread) return false; --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8505,7 +8505,7 @@ static void balance_push(struct rq *rq) * histerical raisins. */ if (rq->idle == push_task || - ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || + kthread_is_per_cpu(push_task) ||
Re: [PATCH 09/15] x86/alternatives: Use C int3 selftest but disable KASAN
On Mon, Apr 19, 2021 at 08:26:36AM -0700, Sami Tolvanen wrote: > On Sat, Apr 17, 2021 at 4:37 AM Peter Zijlstra wrote: > > The problem is that the call-site does not respect the regular calling > > convention, so calling a regular C function is just asking for trouble, > > which is why it ended up being asm, then we fully control the calling > > convention. > > Ack. The problem here is that we can't declare an extern static > function in C. How would you feel about making int3_magic a global > instead to match the C declaration? Works for me.
Re: [PATCH] smp: add a best_effort version of smp_call_function_many()
On Mon, Apr 19, 2021 at 11:44:55AM -0700, Luigi Rizzo wrote: > Regardless of the 'wait' argument, smp_call_function_many() must spin > if any of the target CPUs have their csd busy waiting to be processed > for a previous call. This may cause high tail latencies e.g. when some > of the target CPUs are running functions that disable interrupts for a > long time; getrusage() is one possible culprit. > > Here we introduce a variant, __smp_call_function_many(), that adds > a third 'best_effort' mode to the two existing ones (nowait, wait). > In best effort mode, the call will skip CPUs whose csd is busy, and if > any CPU is skipped it returns -EBUSY and the set of busy in the mask. > This allows the caller to decide how to proceed, e.g. it might retry at > a later time, or use a private csd, etc.. > > The new function is a compromise to avoid touching existing callers of > smp_call_function_many(). If the feature is considered interesting, we > could even replace the 'wait' argument with a ternary 'mode' in all > smp_call_function_*() and derived methods. I don't see a user of this...
Re: [syzbot] WARNING in kthread_is_per_cpu
On Mon, Apr 19, 2021 at 12:31:22PM +0100, Valentin Schneider wrote: > if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p)) > `\ >to_kthread(p); > `\ > WARN_ON(!(p->flags & PF_KTHREAD)); > > ... Huh? Something like so perhaps? diff --git a/kernel/kthread.c b/kernel/kthread.c index 1578973c5740..eeba40df61ac 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -78,6 +78,14 @@ static inline void set_kthread_struct(void *kthread) current->set_child_tid = (__force void __user *)kthread; } +static inline struct kthread *__to_kthread(struct task_struct *k) +{ + void *kthread = (__force void *)k->set_child_tid; + if (kthread && !(k->flags & PF_KTHREAD)) + kthread = NULL; + return kthread; +} + static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); @@ -516,7 +524,7 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu) bool kthread_is_per_cpu(struct task_struct *k) { - struct kthread *kthread = to_kthread(k); + struct kthread *kthread = __to_kthread(k); if (!kthread) return false; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3384ea74cad4..dc6311bd6986 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7658,7 +7658,7 @@ static void balance_push(struct rq *rq) * histerical raisins. */ if (rq->idle == push_task || - ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || + kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { /*
Re: [PATCH v7 09/28] mm: Create FolioFlags
On Fri, Apr 09, 2021 at 07:50:46PM +0100, Matthew Wilcox (Oracle) wrote: > These new functions are the folio analogues of the PageFlags functions. > If CONFIG_DEBUG_VM_PGFLAGS is enabled, we check the folio is not a tail > page at every invocation. Note that this will also catch the PagePoisoned > case as a poisoned page has every bit set, which would include PageTail. > > This saves 1727 bytes of text with the distro-derived config that > I'm testing due to removing a double call to compound_head() in > PageSwapCache(). I vote for dropping the Camels if we're going to rework all this.
Re: [PATCH] sched,fair: skip newidle_balance if a wakeup is pending
On Sun, Apr 18, 2021 at 10:17:51PM -0400, Rik van Riel wrote: > The try_to_wake_up function has an optimization where it can queue > a task for wakeup on its previous CPU, if the task is still in the > middle of going to sleep inside schedule(). > > Once schedule() re-enables IRQs, the task will be woken up with an > IPI, and placed back on the runqueue. > > If we have such a wakeup pending, there is no need to search other > CPUs for runnable tasks. Just skip (or bail out early from) newidle > balancing, and run the just woken up task. > > For a memcache like workload test, this reduces total CPU use by > about 2%, proportionally split between user and system time, > and p99 and p95 application response time by 2-3% on average. > The schedstats run_delay number shows a similar improvement. Nice. > Signed-off-by: Rik van Riel > --- > kernel/sched/fair.c | 11 ++- > 1 file changed, 10 insertions(+), 1 deletion(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 69680158963f..19a92c48939f 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -7163,6 +7163,14 @@ done: __maybe_unused; > if (!rf) > return NULL; > > + /* > + * We have a woken up task pending here. No need to search for ones > + * elsewhere. This task will be enqueued the moment we unblock irqs > + * upon exiting the scheduler. > + */ > + if (rq->ttwu_pending) > + return NULL; As reported by the robot, that needs an CONFIG_SMP guard of sorts, #ifdef might work I suppose. > new_tasks = newidle_balance(rq, rf); > > /* > @@ -10661,7 +10669,8 @@ static int newidle_balance(struct rq *this_rq, struct > rq_flags *rf) >* Stop searching for tasks to pull if there are >* now runnable tasks on this rq. >*/ > - if (pulled_task || this_rq->nr_running > 0) > + if (pulled_task || this_rq->nr_running > 0 || > + this_rq->ttwu_pending) Either cino=(0:0 or just bust the limit and make it 84 chars.
Re: [PATCH] sched: Optimize housekeeping_cpumask in for_each_cpu_and
On Sat, Apr 17, 2021 at 11:01:37PM +0800, Yuan ZhaoXiong wrote: > On a 128 cores AMD machine, there are 8 cores in nohz_full mode, and > the others are used for housekeeping. When many housekeeping cpus are > in idle state, we can observe huge time burn in the loop for searching > nearest busy housekeeper cpu by ftrace. > >9) | get_nohz_timer_target() { >9) |housekeeping_test_cpu() { >9) 0.390 us| housekeeping_get_mask.part.1(); >9) 0.561 us|} >9) 0.090 us|__rcu_read_lock(); >9) 0.090 us|housekeeping_cpumask(); >9) 0.521 us|housekeeping_cpumask(); >9) 0.140 us|housekeeping_cpumask(); > >... > >9) 0.500 us|housekeeping_cpumask(); >9) |housekeeping_any_cpu() { >9) 0.090 us| housekeeping_get_mask.part.1(); >9) 0.100 us| sched_numa_find_closest(); >9) 0.491 us|} >9) 0.100 us|__rcu_read_unlock(); >9) + 76.163 us | } > > for_each_cpu_and() is a micro function, so in get_nohz_timer_target() > function the > for_each_cpu_and(i, sched_domain_span(sd), > housekeeping_cpumask(HK_FLAG_TIMER)) > equals to below: > for (i = -1; i = cpumask_next_and(i, sched_domain_span(sd), > housekeeping_cpumask(HK_FLAG_TIMER)), i < nr_cpu_ids;) > That will cause that housekeeping_cpumask() will be invoked many times. > The housekeeping_cpumask() function returns a const value, so it is > unnecessary to invoke it every time. This patch can minimize the worst > searching time from ~76us to ~16us in my testing. > > Similarly, the find_new_ilb() function has the same problem. Would it not make sense to mark housekeeping_cpumask() __pure instead?
Re: [PATCH 00/13] [RFC] Rust support
On Mon, Apr 19, 2021 at 11:02:12AM +0200, Paolo Bonzini wrote: > > void writer(void) > > { > > atomic_store_explicit(, seq+1, memory_order_relaxed); > > atomic_thread_fence(memory_order_acquire); > > This needs to be memory_order_release. The only change in the resulting > assembly is that "dmb ishld" becomes "dmb ish", which is not as good as the > "dmb ishst" you get from smp_wmb() but not buggy either. Yuck! And that is what requires the insides to be atomic_store_explicit(), otherwise this fence doesn't have to affect them. I also don't see how this is better than seq_cst. But yes, not broken, but also very much not optimal.
Re: [PATCH 0/9] sched: Core scheduling interfaces
On Sat, Apr 17, 2021 at 09:35:07PM -0400, Joel Fernandes wrote: > On Tue, Apr 06, 2021 at 10:16:12AM -0400, Tejun Heo wrote: > > Hello, > > > > On Mon, Apr 05, 2021 at 02:46:09PM -0400, Joel Fernandes wrote: > > > Yeah, its at http://lore.kernel.org/r/20200822030155.ga414...@google.com > > > as mentioned above, let me know if you need any more details about > > > usecase. > > > > Except for the unspecified reason in usecase 4, I don't see why cgroup is in > > the picture at all. This doesn't really have much to do with hierarchical > > resource distribution. Besides, yes, you can use cgroup for logical > > structuring and identificaiton purposes but in those cases the interactions > > and interface should be with the original subsystem while using cgroup IDs > > or paths as parameters - see tracing and bpf for examples. > > Personally for ChromeOS, we need only the per-task interface. Considering > that the second argument of this prctl is a command, I don't see why we > cannot add a new command PR_SCHED_CORE_CGROUP_SHARE to do what Tejun is > saying (in the future). > > In order to not block ChromeOS and other "per-task interface" usecases, I > suggest we keep the CGroup interface for a later time (whether that's > through prctl or the CGroups FS way which Tejun dislikes) and move forward > with per-task interface only initially. Josh, you being on the other Google team, the one that actually uses the cgroup interface AFAIU, can you fight the good fight with TJ on this? > Peter, any thoughts on this? Adding CGROUP_SHARE is not sufficient to close the hole against CLEAR. So we either then have to 'tweak' the meaning of CLEAR or replace it entirely, neither seem attractive. I'd love to make some progress on all this.
Re: [PATCH] preempt/dynamic: fix typo in macro conditional statement
On Sat, Apr 10, 2021 at 03:35:23PM +0800, Zhouyi Zhou wrote: > commit 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() > static call") tried to provide irqentry_exit_cond_resched() static call > in irqentry_exit, but has a typo in macro conditional statement. > > This patch fix this typo. > > Signed-off-by: Zhouyi Zhou Fixes: 40607ee97e4e ("preempt/dynamic: Provide irqentry_exit_cond_resched() static call") Thanks!
Re: [PATCH 00/13] [RFC] Rust support
On Mon, Apr 19, 2021 at 10:26:57AM +0200, Peter Zijlstra wrote: > https://godbolt.org/z/85xoPxeE5 That wants _Atomic on the seq definition for clang. > void writer(void) > { > atomic_store_explicit(&seq, seq+1, memory_order_relaxed); > atomic_thread_fence(memory_order_acquire); > > X = 1; > Y = 2; > > atomic_store_explicit(&seq, seq+1, memory_order_release); > } > > gives: > > writer: > adrpx1, .LANCHOR0 > add x0, x1, :lo12:.LANCHOR0 > ldr w2, [x1, #:lo12:.LANCHOR0] > add w2, w2, 1 > str w2, [x0] > dmb ishld > ldr w1, [x1, #:lo12:.LANCHOR0] > mov w3, 1 > mov w2, 2 > stp w3, w2, [x0, 4] > add w1, w1, w3 > stlrw1, [x0] > ret > > Which, afaict, is completely buggered. What it seems to be doing is > turning the seq load into a load-acquire, but what we really need is to > make sure the seq store (increment) is ordered before the other stores. Put differently, what you seem to want is store-acquire, but there ain't no such thing.
Re: [PATCH 00/13] [RFC] Rust support
On Mon, Apr 19, 2021 at 09:53:06AM +0200, Paolo Bonzini wrote: > On 19/04/21 09:32, Peter Zijlstra wrote: > > On Sat, Apr 17, 2021 at 04:51:58PM +0200, Paolo Bonzini wrote: > > > On 16/04/21 09:09, Peter Zijlstra wrote: > > > > Well, the obvious example would be seqlocks. C11 can't do them > > > > > > Sure it can. C11 requires annotating with (the equivalent of) READ_ONCE > > > all > > > reads of seqlock-protected fields, but the memory model supports seqlocks > > > just fine. > > > > How does that help? > > > > IIRC there's two problems, one on each side the lock. On the write side > > we have: > > > > seq++; > > smp_wmb(); > > X = r; > > Y = r; > > smp_wmb(); > > seq++; > > > > Which C11 simply cannot do right because it does't have wmb. > > It has atomic_thread_fence(memory_order_release), and > atomic_thread_fence(memory_order_acquire) on the read side. https://godbolt.org/z/85xoPxeE5 void writer(void) { atomic_store_explicit(&seq, seq+1, memory_order_relaxed); atomic_thread_fence(memory_order_acquire); X = 1; Y = 2; atomic_store_explicit(&seq, seq+1, memory_order_release); } gives: writer: adrpx1, .LANCHOR0 add x0, x1, :lo12:.LANCHOR0 ldr w2, [x1, #:lo12:.LANCHOR0] add w2, w2, 1 str w2, [x0] dmb ishld ldr w1, [x1, #:lo12:.LANCHOR0] mov w3, 1 mov w2, 2 stp w3, w2, [x0, 4] add w1, w1, w3 stlrw1, [x0] ret Which, afaict, is completely buggered. What it seems to be doing is turning the seq load into a load-acquire, but what we really need is to make sure the seq store (increment) is ordered before the other stores.
Re: [PATCH v2 resubmit] sched: Warn on long periods of pending need_resched
On Fri, Apr 16, 2021 at 02:29:36PM -0700, Josh Don wrote: > From: Paul Turner > > CPU scheduler marks need_resched flag to signal a schedule() on a > particular CPU. But, schedule() may not happen immediately in cases > where the current task is executing in the kernel mode (no > preemption state) for extended periods of time. > > This patch adds a warn_on if need_resched is pending for more than the > time specified in sysctl resched_latency_warn_ms. If it goes off, it is > likely that there is a missing cond_resched() somewhere. Monitoring is > done via the tick and the accuracy is hence limited to jiffy scale. This > also means that we won't trigger the warning if the tick is disabled. > > This feature (LATENCY_WARN) is default disabled. > > Signed-off-by: Paul Turner > Signed-off-by: Josh Don > Signed-off-by: Peter Zijlstra (Intel) > Link: https://lkml.kernel.org/r/20210323035706.572953-1-josh...@google.com Thanks!
Re: [PATCH v2] sched: Warn on long periods of pending need_resched
On Fri, Apr 16, 2021 at 02:33:27PM -0700, Josh Don wrote: > Yikes, sorry about that. I've squashed the fixup and sent the updated > patch (with trimmed cc list): https://lkml.org/lkml/2021/4/16/1125 For future reference, please use: https://lkml.kernel.org/r/MSGID Then I can search for MSGID in my local mailer (mutt: ~h) and instantly find the actual message (now I'll search for all email from you and it should obviously be easily found as well).
Re: [PATCH v5 04/19] perf: aux: Add CoreSight PMU buffer formats
On Mon, Mar 29, 2021 at 10:56:25AM -0600, Mathieu Poirier wrote: > Hi Peter, > > On Tue, Mar 23, 2021 at 12:06:32PM +, Suzuki K Poulose wrote: > > CoreSight PMU supports aux-buffer for the ETM tracing. The trace > > generated by the ETM (associated with individual CPUs, like Intel PT) > > is captured by a separate IP (CoreSight TMC-ETR/ETF until now). > > > > The TMC-ETR applies formatting of the raw ETM trace data, as it > > can collect traces from multiple ETMs, with the TraceID to indicate > > the source of a given trace packet. > > > > Arm Trace Buffer Extension is new "sink" IP, attached to individual > > CPUs and thus do not provide additional formatting, like TMC-ETR. > > > > Additionally, a system could have both TRBE *and* TMC-ETR for > > the trace collection. e.g, TMC-ETR could be used as a single > > trace buffer to collect data from multiple ETMs to correlate > > the traces from different CPUs. It is possible to have a > > perf session where some events end up collecting the trace > > in TMC-ETR while the others in TRBE. Thus we need a way > > to identify the type of the trace for each AUX record. > > > > Define the trace formats exported by the CoreSight PMU. > > We don't define the flags following the "ETM" as this > > information is available to the user when issuing > > the session. What is missing is the additional > > formatting applied by the "sink" which is decided > > at the runtime and the user may not have a control on. > > > > So we define : > > - CORESIGHT format (indicates the Frame format) > > - RAW format (indicates the format of the source) > > > > The default value is CORESIGHT format for all the records > > (i,e == 0). Add the RAW format for others that use > > raw format. 
> > > > Cc: Peter Zijlstra > > Cc: Mike Leach > > Cc: Mathieu Poirier > > Cc: Leo Yan > > Cc: Anshuman Khandual > > Reviewed-by: Mike Leach > > Reviewed-by: Mathieu Poirier > > Signed-off-by: Suzuki K Poulose > > --- > > include/uapi/linux/perf_event.h | 4 > > 1 file changed, 4 insertions(+) > > > > diff --git a/include/uapi/linux/perf_event.h > > b/include/uapi/linux/perf_event.h > > index f006eeab6f0e..63971eaef127 100644 > > --- a/include/uapi/linux/perf_event.h > > +++ b/include/uapi/linux/perf_event.h > > @@ -1162,6 +1162,10 @@ enum perf_callchain_context { > > #define PERF_AUX_FLAG_COLLISION0x08/* sample > > collided with another */ > > #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace > > format type */ > > > > +/* CoreSight PMU AUX buffer formats */ > > +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x /* Default for > > backward compatibility */ > > +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of > > the source */ > > + > > Have you had time to review this patch? Anything you'd like to see modified? Ok I suppose.. Acked-by: Peter Zijlstra (Intel)
Re: Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
On Mon, Apr 19, 2021 at 09:46:26AM +0800, 周传高 wrote: > > >On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote: > >> eg: > >> In Android system, we usually add some processes to the whitelist. > >> static task_t task_whitelist[] = { > >>{"mdrt_thread", HUNG_TASK_WHITELIST}, > >>{"chre_kthread", HUNG_TASK_WHITELIST}, > >>{"scp_power_reset", HUNG_TASK_WHITELIST}, > >>{"ccci_fsm1", HUNG_TASK_WHITELIST}, > >>{"qos_ipi_recv", HUNG_TASK_WHITELIST}, > >>{NULL, 0}, > >> }; > > > >What are these tasks doing that the hung-task detector fires on them? > >Should you fix that instead? > > These tasks are implemented by the SoC vendor, and normally they do > not configure HUNG TASK, so we need to ignore these tasks if we use > HUNG TASK. Then raise a bug against their crap software, don't try and work around it in the kernel. We're not going to upstream workarounds for crap vendor software.
Re: [PATCH 00/13] [RFC] Rust support
On Sat, Apr 17, 2021 at 04:51:58PM +0200, Paolo Bonzini wrote: > On 16/04/21 09:09, Peter Zijlstra wrote: > > Well, the obvious example would be seqlocks. C11 can't do them > > Sure it can. C11 requires annotating with (the equivalent of) READ_ONCE all > reads of seqlock-protected fields, but the memory model supports seqlocks > just fine. How does that help? IIRC there's two problems, one on each side of the lock. On the write side we have: seq++; smp_wmb(); X = r; Y = r; smp_wmb(); seq++; Which C11 simply cannot do right because it doesn't have wmb. You end up having to use seq_cst for the first wmb or make both X and Y (on top of the last seq) a store-release, both options are sub-optimal. On the read side you get: do { s = seq; smp_rmb(); r1 = X; r2 = Y; smp_rmb(); } while ((s&1) || seq != s); And then you get into trouble with the last barrier, so the first seq load can be load-acquire, after which the loads of X, Y come after, but you need them to happen before the second seq load, for which you then need seq_cst, or make X and Y load-acquire. Again, not optimal. I have also seen *many* broken variants of it on the web. Some work on x86 but are totally broken when you build them on LL/SC ARM64.
Re: [PATCH] kernel/hung_task: Add a whitelist and blacklist mechanism.
On Sat, Apr 17, 2021 at 07:13:01AM -0700, zhouchuangao wrote: > eg: > In Android system, we usually add some processes to the whitelist. > static task_t task_whitelist[] = { > {"mdrt_thread", HUNG_TASK_WHITELIST}, > {"chre_kthread", HUNG_TASK_WHITELIST}, > {"scp_power_reset", HUNG_TASK_WHITELIST}, > {"ccci_fsm1", HUNG_TASK_WHITELIST}, > {"qos_ipi_recv", HUNG_TASK_WHITELIST}, > {NULL, 0}, > }; What are these tasks doing that the hung-task detector fires on them? Should you fix that instead?
Re: [PATCH 00/13] [RFC] Rust support
On Sat, Apr 17, 2021 at 01:46:23PM +0200, Willy Tarreau wrote: > For me the old trick of casting one side as long long still works: > > unsigned long long mul3264(unsigned int a, unsigned int b) > { > return (unsigned long long)a * b; > } > > i386: > : > 0: 8b 44 24 08 mov0x8(%esp),%eax > 4: f7 64 24 04 mull 0x4(%esp) > 8: c3ret > > x86_64: > : > 0: 89 f8 mov%edi,%eax > 2: 89 f7 mov%esi,%edi > 4: 48 0f af c7 imul %rdi,%rax > 8: c3retq > > Or maybe you had something else in mind ? Last time I tried it, the thing refused :/ which is how we ended up with mul_u32_u32() in asm.
Re: [tip:sched/core 23/23] kernel/sched/topology.c:17:2: error: 'sched_debug_verbose' undeclared; did you mean 'sched_debug_setup'?
On Sat, Apr 17, 2021 at 11:06:28AM +0200, Borislav Petkov wrote: > On Sat, Apr 17, 2021 at 02:47:46PM +0800, kernel test robot wrote: > >kernel/sched/topology.c: In function 'sched_debug_setup': > > >> kernel/sched/topology.c:17:2: error: 'sched_debug_verbose' undeclared > > >> (first use in this function); did you mean 'sched_debug_setup'? > > 17 | sched_debug_verbose = true; > > I guess it needs this: > > --- > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index 55232db745ac..916bb96ff9a6 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -2364,6 +2364,7 @@ extern struct sched_entity *__pick_last_entity(struct > cfs_rq *cfs_rq); > > #ifdef CONFIG_SCHED_DEBUG > extern bool sched_debug_enabled; > +extern bool sched_debug_verbose; > > extern void print_cfs_stats(struct seq_file *m, int cpu); > extern void print_rt_stats(struct seq_file *m, int cpu); > Right, sorry for messing that up. Pushed out that commit with this folded. If it gives any more trouble just kill the commit, it's the topmost commit anyway.
[tip: sched/core] sched/debug: Rename the sched_debug parameter to sched_verbose
The following commit has been merged into the sched/core branch of tip: Commit-ID: 9406415f46f6127fd31bb66f0260f7a61a8d2786 Gitweb: https://git.kernel.org/tip/9406415f46f6127fd31bb66f0260f7a61a8d2786 Author:Peter Zijlstra AuthorDate:Thu, 15 Apr 2021 18:23:17 +02:00 Committer: Peter Zijlstra CommitterDate: Sat, 17 Apr 2021 13:22:44 +02:00 sched/debug: Rename the sched_debug parameter to sched_verbose CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param sched_debug and the /debug/sched/debug_enabled knobs control the sched_debug_enabled variable, but what they really do is make SCHED_DEBUG more verbose, so rename the lot. Signed-off-by: Peter Zijlstra (Intel) --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/scheduler/sched-domains.rst | 10 +- kernel/sched/debug.c| 4 ++-- kernel/sched/sched.h| 2 +- kernel/sched/topology.c | 12 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0454572..9e4c026 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4725,7 +4725,7 @@ sbni= [NET] Granch SBNI12 leased line adapter - sched_debug [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 8582fa5..14ea2f2 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -74,8 +74,8 @@ for a given topology level by creating a sched_domain_topology_level array and calling set_sched_topology() with this array as the parameter. 
The sched-domains debugging infrastructure can be enabled by enabling -CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to -tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug -knob. This enables an error checking parse of the sched domains which should -catch most possible errors (described above). It also prints out the domain -structure in a visual format. +CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you +forgot to tweak your cmdline, you can also flip the +/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of +the sched domains which should catch most possible errors (described above). It +also prints out the domain structure in a visual format. diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bf199d6..461342f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -275,7 +275,7 @@ static const struct file_operations sched_dynamic_fops = { #endif /* CONFIG_PREEMPT_DYNAMIC */ -__read_mostly bool sched_debug_enabled; +__read_mostly bool sched_debug_verbose; static const struct seq_operations sched_debug_sops; @@ -300,7 +300,7 @@ static __init int sched_init_debug(void) debugfs_sched = debugfs_create_dir("sched", NULL); debugfs_create_file("features", 0644, debugfs_sched, NULL, _feat_fops); - debugfs_create_bool("debug_enabled", 0644, debugfs_sched, _debug_enabled); + debugfs_create_bool("verbose", 0644, debugfs_sched, _debug_verbose); #ifdef CONFIG_PREEMPT_DYNAMIC debugfs_create_file("preempt", 0644, debugfs_sched, NULL, _dynamic_fops); #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 55232db..bde7248 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2363,7 +2363,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq); extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq); #ifdef CONFIG_SCHED_DEBUG -extern bool sched_debug_enabled; +extern bool sched_debug_verbose; extern 
void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c343aed..55a0a24 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2; static int __init sched_debug_setup(char *str) { - sched_debug_enabled = true; + sched_debug_verbose = true; return 0; } -early_param("sched_debug", sched_debug_setup); +early_param("sched_verbose", sched_debug_setup); static inline bool sched_debug(void) { - return sched_debug_enabled; + return sched_debug_verbose; } #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags,
Re: [PATCH 09/15] x86/alternatives: Use C int3 selftest but disable KASAN
On Fri, Apr 16, 2021 at 01:38:38PM -0700, Sami Tolvanen wrote: > From: Kees Cook > > Instead of using inline asm for the int3 selftest (which confuses the > Clang's ThinLTO pass), this restores the C function but disables KASAN > (and tracing for good measure) to keep the things simple and avoid > unexpected side-effects. This attempts to keep the fix from commit > ecc606103837 ("x86/alternatives: Fix int3_emulate_call() selftest stack > corruption") without using inline asm. > > Signed-off-by: Kees Cook > Signed-off-by: Sami Tolvanen > --- > arch/x86/kernel/alternative.c | 21 - > 1 file changed, 4 insertions(+), 17 deletions(-) > > diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c > index 6974b5174495..669a23454c09 100644 > --- a/arch/x86/kernel/alternative.c > +++ b/arch/x86/kernel/alternative.c > @@ -496,23 +496,10 @@ extern struct paravirt_patch_site > __start_parainstructions[], > * > * See entry_{32,64}.S for more details. > */ > - > -/* > - * We define the int3_magic() function in assembly to control the calling > - * convention such that we can 'call' it from assembly. > - */ > - > -extern void int3_magic(unsigned int *ptr); /* defined in asm */ > - > -asm ( > -".pushsection.init.text, \"ax\", @progbits\n" > -".type int3_magic, @function\n" > -"int3_magic:\n" > -"movl$1, (%" _ASM_ARG1 ")\n" > -"ret\n" > -".size int3_magic, .-int3_magic\n" > -".popsection\n" > -); > +static void __init __no_sanitize_address notrace int3_magic(unsigned int > *ptr) > +{ > + *ptr = 1; > +} I really don't like this. the compiler is free to mess this up in all sorts of ways. The problem is that the call-site does not respect the regular calling convention, so calling a regular C function is just asking for trouble, which is why it ended up being asm, then we fully control the calling convention.
Re: [PATCH 00/13] [RFC] Rust support
On Fri, Apr 16, 2021 at 07:08:29PM +0100, Matthew Wilcox wrote: > On Fri, Apr 16, 2021 at 07:18:48PM +0200, Peter Zijlstra wrote: > > On Fri, Apr 16, 2021 at 07:10:17PM +0200, Miguel Ojeda wrote: > > > > > Of course, UB is only a subset of errors, but it is a major one, and > > > particularly critical for privileged code. > > > > I've seen relatively few UBSAN warnings that weren't due to UBSAN being > > broken. > > Lucky you. > > 84c34df158cf215b0cd1475ab3b8e6f212f81f23 > > (i'd argue this is C being broken; promoting only as far as int, when > assigning to an unsigned long is Bad, but until/unless either GCC fixes > that or the language committee realises that being stuck in the 1970s > is Bad, people are going to keep making this kind of mistake) Well, I think the rules actually make sense, at the point in the syntax tree where + happens, we have 'unsigned char' and 'int', so at that point we promote to 'int'. Subsequently 'int' gets shifted and bad things happen. The 'unsigned long' doesn't happen until quite a bit later. Anyway, the rules are imo fairly clear and logical, but yes they can be annoying. The really silly thing here is that << and >> have UB at all, and I would love a -fwrapv style flag that simply defines it. Yes it will generate worse code in some cases, but having the UB there is just stupid. That of course doesn't help your case here, it would simply misbehave and not be UB. Another thing the C rules cannot really express is a 32x32->64 multiplication, some (older) versions of GCC can be tricked into it, but mostly it just doesn't want to do that sanely and the C rules are absolutely no help there.
Re: [PATCH v5] sched/debug: Use sched_debug_lock to serialize use of cgroup_path[] only
On Thu, Apr 15, 2021 at 03:54:26PM -0400, Waiman Long wrote: > The handling of sysrq key can be activated by echoing the key to > /proc/sysrq-trigger or via the magic key sequence typed into a terminal > that is connected to the system in some way (serial, USB or other mean). > In the former case, the handling is done in a user context. In the > latter case, it is likely to be in an interrupt context. > > Currently in print_cpu() of kernel/sched/debug.c, sched_debug_lock is > taken with interrupt disabled for the whole duration of the calls to > print_*_stats() and print_rq() which could last for the quite some time > if the information dump happens on the serial console. > > If the system has many cpus and the sched_debug_lock is somehow busy > (e.g. parallel sysrq-t), the system may hit a hard lockup panic > depending on the actually serial console implementation of the > system. For instance, > ... > > The purpose of sched_debug_lock is to serialize the use of the global > cgroup_path[] buffer in print_cpu(). The rests of the printk calls don't > need serialization from sched_debug_lock. > > Calling printk() with interrupt disabled can still be problematic if > multiple instances are running. Allocating a stack buffer of PATH_MAX > bytes is not feasible because of the limited size of the kernel stack. > > The solution implemented in this patch is to allow only one caller at a > time to use the full size group_path[], while other simultaneous callers > will have to use shorter stack buffers with the possibility of path > name truncation. A "..." suffix will be printed if truncation may have > happened. The cgroup path name is provided for informational purpose > only, so occasional path name truncation should not be a big problem. > > Fixes: efe25c2c7b3a ("sched: Reinstate group names in /proc/sched_debug") > Suggested-by: Peter Zijlstra > Signed-off-by: Waiman Long Thanks!
Re: [PATCH V2] sched,psi: fix the 'int' underflow for psi
On Fri, Apr 16, 2021 at 11:11:04AM -0400, Johannes Weiner wrote: > On Fri, Apr 16, 2021 at 08:32:16PM +0530, Charan Teja Reddy wrote: > > psi_group_cpu->tasks, represented by the unsigned int, stores the number > > of tasks that could be stalled on a psi resource(io/mem/cpu). > > Decrementing these counters at zero leads to wrapping which further > > leads to the psi_group_cpu->state_mask is being set with the respective > > pressure state. This could result into the unnecessary time sampling for > > the pressure state thus cause the spurious psi events. This can further > > lead to wrong actions being taken at the user land based on these psi > > events. > > Though psi_bug is set under these conditions but that just for debug > > purpose. Fix it by decrementing the ->tasks count only when it is > > non-zero. > > > > Signed-off-by: Charan Teja Reddy > > The subject should be > > psi: handle potential task count underflow bugs more gracefully Done. > since it's not fixing a known bug at this point. Otherwise, > > Acked-by: Johannes Weiner Thanks!
Re: [PATCH 00/13] [RFC] Rust support
On Fri, Apr 16, 2021 at 07:10:17PM +0200, Miguel Ojeda wrote: > Of course, UB is only a subset of errors, but it is a major one, and > particularly critical for privileged code. I've seen relatively few UBSAN warnings that weren't due to UBSAN being broken.
Re: [PATCH v2] locking/qrwlock: Fix ordering in queued_write_lock_slowpath
I've edited the thing to look like so. I'll go queue it for locking/urgent. --- Subject: locking/qrwlock: Fix ordering in queued_write_lock_slowpath() From: Ali Saidi Date: Thu, 15 Apr 2021 17:27:11 +0000 From: Ali Saidi While this code is executed with the wait_lock held, a reader can acquire the lock without holding wait_lock. The writer side loops checking the value with the atomic_cond_read_acquire(), but only truly acquires the lock when the compare-and-exchange is completed successfully which isn’t ordered. This exposes the window between the acquire and the cmpxchg to an A-B-A problem which allows reads following the lock acquisition to observe values speculatively before the write lock is truly acquired. We've seen a problem in epoll where the reader does a xchg while holding the read lock, but the writer can see a value change out from under it. Writer| Reader ep_scan_ready_list() | |- write_lock_irq() | |- queued_write_lock_slowpath() | |- atomic_cond_read_acquire() | | read_lock_irqsave(&ep->lock, flags); --> (observes value before unlock) | chain_epi_lockless() | |epi->next = xchg(&ep->ovflist, epi); | | read_unlock_irqrestore(&ep->lock, flags); | | | atomic_cmpxchg_relaxed() | |-- READ_ONCE(ep->ovflist);| A core can order the read of the ovflist ahead of the atomic_cmpxchg_relaxed(). Switching the cmpxchg to use acquire semantics addresses this issue at which point the atomic_cond_read can be switched to use relaxed semantics.
Fixes: b519b56e378ee ("locking/qrwlock: Use atomic_cond_read_acquire() when spinning in qrwlock") Signed-off-by: Ali Saidi [peterz: use try_cmpxchg()] Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steve Capper Acked-by: Will Deacon Tested-by: Steve Capper --- kernel/locking/qrwlock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath) */ void queued_write_lock_slowpath(struct qrwlock *lock) { + int cnts; + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct q /* When no more readers or writers, set the locked flag */ do { - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, - _QW_LOCKED) != _QW_WAITING); + cnts = atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING); + } while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); }
Re: [tip: perf/core] perf/x86: Reset the dirty counter to prevent the leak for an RDPMC task
On Fri, Apr 16, 2021 at 03:01:48PM -, tip-bot2 for Kan Liang wrote: > @@ -2331,6 +2367,9 @@ static void x86_pmu_event_unmapped(struct perf_event > *event, struct mm_struct *m > if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) > return; > > + if (x86_pmu.sched_task && event->hw.target) > + perf_sched_cb_dec(event->ctx->pmu); > + > if (atomic_dec_and_test(>context.perf_rdpmc_allowed)) > on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); > } 'perf test' on a kernel with CONFIG_DEBUG_PREEMPT=y gives: [ 244.439538] BUG: using smp_processor_id() in preemptible [] code: perf/1771 [ 244.448144] caller is perf_sched_cb_dec+0xa/0x70 [ 244.453314] CPU: 28 PID: 1771 Comm: perf Not tainted 5.12.0-rc3-00026-gb04c0cddff6d #595 [ 244.462347] Hardware name: Intel Corporation S2600GZ/S2600GZ, BIOS SE5C600.86B.02.02.0002.122320131210 12/23/2013 [ 244.473804] Call Trace: [ 244.476535] dump_stack+0x6d/0x89 [ 244.480237] check_preemption_disabled+0xc8/0xd0 [ 244.485394] perf_sched_cb_dec+0xa/0x70 [ 244.489677] x86_pmu_event_unmapped+0x35/0x60 [ 244.494541] perf_mmap_close+0x76/0x580 [ 244.498833] remove_vma+0x31/0x70 [ 244.502535] __do_munmap+0x2e8/0x4e0 [ 244.506531] __vm_munmap+0x7e/0x120 [ 244.510429] __x64_sys_munmap+0x28/0x30 [ 244.514712] do_syscall_64+0x33/0x80 [ 244.518701] entry_SYSCALL_64_after_hwframe+0x44/0xae Obviously I tested that after I pushed it out :/
[tip: sched/core] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
The following commit has been merged into the sched/core branch of tip: Commit-ID: b5c4477366fb5e6a2f0f38742c33acd666c07698 Gitweb: https://git.kernel.org/tip/b5c4477366fb5e6a2f0f38742c33acd666c07698 Author:Peter Zijlstra AuthorDate:Thu, 21 Jan 2021 16:09:32 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00 sched: Use cpu_dying() to fix balance_push vs hotplug-rollback Use the new cpu_dying() state to simplify and fix the balance_push() vs CPU hotplug rollback state. Specifically, we currently rely on notifiers sched_cpu_dying() / sched_cpu_activate() to terminate balance_push, however if the cpu_down() fails when we're past sched_cpu_deactivate(), it should terminate balance_push at that point and not wait until we hit sched_cpu_activate(). Similarly, when cpu_up() fails and we're going back down, balance_push should be active, where it currently is not. So instead, make sure balance_push is enabled below SCHED_AP_ACTIVE (when !cpu_active()), and gate it's utility with cpu_dying(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/yhgayef83vqhk...@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 26 +++--- kernel/sched/sched.h | 1 - 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95bd6ab..7d031da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1811,7 +1811,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) return cpu_online(cpu); /* Regular kernel threads don't get to stay during offline. */ - if (cpu_rq(cpu)->balance_push) + if (cpu_dying(cpu)) return false; /* But are allowed during online. */ @@ -7638,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work); /* * Ensure we only run per-cpu kthreads once the CPU goes !active. + * + * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only + * effective when the hotplug motion is down. 
*/ static void balance_push(struct rq *rq) { @@ -7645,12 +7648,19 @@ static void balance_push(struct rq *rq) lockdep_assert_held(>lock); SCHED_WARN_ON(rq->cpu != smp_processor_id()); + /* * Ensure the thing is persistent until balance_push_set(.on = false); */ rq->balance_callback = _push_callback; /* +* Only active while going offline. +*/ + if (!cpu_dying(rq->cpu)) + return; + + /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. * @@ -7703,7 +7713,6 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, ); - rq->balance_push = on; if (on) { WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = _push_callback; @@ -7828,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu) struct rq_flags rf; /* -* Make sure that when the hotplug state machine does a roll-back -* we clear balance_push. Ideally that would happen earlier... +* Clear the balance_push callback and prepare to schedule +* regular tasks. */ balance_push_set(cpu, false); @@ -8014,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu) } rq_unlock_irqrestore(rq, ); - /* -* Now that the CPU is offline, make sure we're welcome -* to new tasks once we come back up. 
-*/ - balance_push_set(cpu, false); - calc_load_migrate(rq); update_max_interval(); hrtick_clear(rq); @@ -8204,7 +8207,7 @@ void __init sched_init(void) rq->sd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; - rq->balance_callback = NULL; + rq->balance_callback = _push_callback; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; @@ -8251,6 +8254,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); + balance_push_set(smp_processor_id(), false); #endif init_sched_fair_class(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cbb0b01..7e7e936 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -983,7 +983,6 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head*balance_callback; - unsigned char balance_push; unsigned char nohz_idle_balance; unsigned char idle_balance;
[tip: sched/core] cpumask: Make cpu_{online,possible,present,active}() inline
The following commit has been merged into the sched/core branch of tip: Commit-ID: b02a4fd8148f655095d9e3d6eddd8f0042bcc27c Gitweb: https://git.kernel.org/tip/b02a4fd8148f655095d9e3d6eddd8f0042bcc27c Author:Peter Zijlstra AuthorDate:Mon, 25 Jan 2021 16:46:49 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00 cpumask: Make cpu_{online,possible,present,active}() inline Prepare for addition of another mask. Primarily a code movement to avoid having to create more #ifdef, but while there, convert everything with an argument to an inline function. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210310150109.045447...@infradead.org --- include/linux/cpumask.h | 97 +++- 1 file changed, 66 insertions(+), 31 deletions(-) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 383684e..a584336 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -98,37 +98,6 @@ extern struct cpumask __cpu_active_mask; extern atomic_t __num_online_cpus; -#if NR_CPUS > 1 -/** - * num_online_cpus() - Read the number of online CPUs - * - * Despite the fact that __num_online_cpus is of type atomic_t, this - * interface gives only a momentary snapshot and is not protected against - * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held - * region. 
- */ -static inline unsigned int num_online_cpus(void) -{ - return atomic_read(&__num_online_cpus); -} -#define num_possible_cpus()cpumask_weight(cpu_possible_mask) -#define num_present_cpus() cpumask_weight(cpu_present_mask) -#define num_active_cpus() cpumask_weight(cpu_active_mask) -#define cpu_online(cpu)cpumask_test_cpu((cpu), cpu_online_mask) -#define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) -#define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) -#define cpu_active(cpu)cpumask_test_cpu((cpu), cpu_active_mask) -#else -#define num_online_cpus() 1U -#define num_possible_cpus()1U -#define num_present_cpus() 1U -#define num_active_cpus() 1U -#define cpu_online(cpu)((cpu) == 0) -#define cpu_possible(cpu) ((cpu) == 0) -#define cpu_present(cpu) ((cpu) == 0) -#define cpu_active(cpu)((cpu) == 0) -#endif - extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) @@ -894,6 +863,72 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu) return to_cpumask(p); } +#if NR_CPUS > 1 +/** + * num_online_cpus() - Read the number of online CPUs + * + * Despite the fact that __num_online_cpus is of type atomic_t, this + * interface gives only a momentary snapshot and is not protected against + * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held + * region. 
+ */ +static inline unsigned int num_online_cpus(void) +{ + return atomic_read(&__num_online_cpus); +} +#define num_possible_cpus()cpumask_weight(cpu_possible_mask) +#define num_present_cpus() cpumask_weight(cpu_present_mask) +#define num_active_cpus() cpumask_weight(cpu_active_mask) + +static inline bool cpu_online(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_online_mask); +} + +static inline bool cpu_possible(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_possible_mask); +} + +static inline bool cpu_present(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_present_mask); +} + +static inline bool cpu_active(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_active_mask); +} + +#else + +#define num_online_cpus() 1U +#define num_possible_cpus()1U +#define num_present_cpus() 1U +#define num_active_cpus() 1U + +static inline bool cpu_online(unsigned int cpu) +{ + return cpu == 0; +} + +static inline bool cpu_possible(unsigned int cpu) +{ + return cpu == 0; +} + +static inline bool cpu_present(unsigned int cpu) +{ + return cpu == 0; +} + +static inline bool cpu_active(unsigned int cpu) +{ + return cpu == 0; +} + +#endif /* NR_CPUS > 1 */ + #define cpu_is_offline(cpu)unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG
[tip: sched/core] cpumask: Introduce DYING mask
The following commit has been merged into the sched/core branch of tip: Commit-ID: e40f74c535b8a0ecf3ef0388b51a34cdadb34fb5 Gitweb: https://git.kernel.org/tip/e40f74c535b8a0ecf3ef0388b51a34cdadb34fb5 Author:Peter Zijlstra AuthorDate:Tue, 19 Jan 2021 18:43:45 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:32 +02:00 cpumask: Introduce DYING mask Introduce a cpumask that indicates (for each CPU) what direction the CPU hotplug is currently going. Notably, it tracks rollbacks. Eg. when an up fails and we do a roll-back down, it will accurately reflect the direction. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210310150109.151441...@infradead.org --- include/linux/cpumask.h | 20 kernel/cpu.c| 6 ++ 2 files changed, 26 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index a584336..e6b948a 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -91,10 +91,12 @@ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; +extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) +#define cpu_dying_mask((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; @@ -826,6 +828,14 @@ set_cpu_active(unsigned int cpu, bool active) cpumask_clear_cpu(cpu, &__cpu_active_mask); } +static inline void +set_cpu_dying(unsigned int cpu, bool dying) +{ + if (dying) + cpumask_set_cpu(cpu, &__cpu_dying_mask); + else + cpumask_clear_cpu(cpu, &__cpu_dying_mask); +} /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * @@ -900,6 +910,11 @@ static inline bool 
cpu_active(unsigned int cpu) return cpumask_test_cpu(cpu, cpu_active_mask); } +static inline bool cpu_dying(unsigned int cpu) +{ + return cpumask_test_cpu(cpu, cpu_dying_mask); +} + #else #define num_online_cpus() 1U @@ -927,6 +942,11 @@ static inline bool cpu_active(unsigned int cpu) return cpu == 0; } +static inline bool cpu_dying(unsigned int cpu) +{ + return false; +} + #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu)unlikely(!cpu_online(cpu)) diff --git a/kernel/cpu.c b/kernel/cpu.c index 23505d6..838dcf2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -160,6 +160,9 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state, int (*cb)(unsigned int cpu); int ret, cnt; + if (cpu_dying(cpu) != !bringup) + set_cpu_dying(cpu, !bringup); + if (st->fail == state) { st->fail = CPUHP_INVALID; return -EAGAIN; @@ -2512,6 +2515,9 @@ EXPORT_SYMBOL(__cpu_present_mask); struct cpumask __cpu_active_mask __read_mostly; EXPORT_SYMBOL(__cpu_active_mask); +struct cpumask __cpu_dying_mask __read_mostly; +EXPORT_SYMBOL(__cpu_dying_mask); + atomic_t __num_online_cpus __read_mostly; EXPORT_SYMBOL(__num_online_cpus);
[tip: sched/core] sched: Move SCHED_DEBUG sysctl to debugfs
The following commit has been merged into the sched/core branch of tip: Commit-ID: 8a99b6833c884fa0e7919030d93fecedc69fc625 Gitweb: https://git.kernel.org/tip/8a99b6833c884fa0e7919030d93fecedc69fc625 Author:Peter Zijlstra AuthorDate:Wed, 24 Mar 2021 11:43:21 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00 sched: Move SCHED_DEBUG sysctl to debugfs Stop polluting sysctl with undocumented knobs that really are debug only, move them all to /debug/sched/ along with the existing /debug/sched_* files that already exist. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.287610...@infradead.org --- include/linux/sched/sysctl.h | 8 +-- kernel/sched/core.c | 4 +- kernel/sched/debug.c | 74 +-- kernel/sched/fair.c | 9 +--- kernel/sched/sched.h | 2 +- kernel/sysctl.c | 96 +--- 6 files changed, 80 insertions(+), 113 deletions(-) diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 3c31ba8..0a3f346 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, enum { sysctl_hung_task_timeout_secs = 0 }; #endif +extern unsigned int sysctl_sched_child_runs_first; + extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; -extern unsigned int sysctl_sched_child_runs_first; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, @@ -37,7 +38,7 @@ enum sched_tunable_scaling { SCHED_TUNABLESCALING_LINEAR, SCHED_TUNABLESCALING_END, }; -extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_sched_tunable_scaling; extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; @@ -47,9 +48,6 @@ extern unsigned int sysctl_numa_balancing_scan_size; #ifdef 
CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; - -int sched_proc_update_handler(struct ctl_table *table, int write, - void *buffer, size_t *length, loff_t *ppos); #endif /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d031da..bac30db 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5504,9 +5504,11 @@ static const struct file_operations sched_dynamic_fops = { .release= single_release, }; +extern struct dentry *debugfs_sched; + static __init int sched_init_debug_dynamic(void) { - debugfs_create_file("sched_preempt", 0644, NULL, NULL, _dynamic_fops); + debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, _dynamic_fops); return 0; } late_initcall(sched_init_debug_dynamic); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4b49cc2..2093b90 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -169,15 +169,81 @@ static const struct file_operations sched_feat_fops = { .release= single_release, }; +#ifdef CONFIG_SMP + +static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(, ubuf, cnt)) + return -EFAULT; + + if (kstrtouint(buf, 10, _sched_tunable_scaling)) + return -EINVAL; + + if (sched_update_scaling()) + return -EINVAL; + + *ppos += cnt; + return cnt; +} + +static int sched_scaling_show(struct seq_file *m, void *v) +{ + seq_printf(m, "%d\n", sysctl_sched_tunable_scaling); + return 0; +} + +static int sched_scaling_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_scaling_show, NULL); +} + +static const struct file_operations sched_scaling_fops = { + .open = sched_scaling_open, + .write = sched_scaling_write, + .read = seq_read, + .llseek = seq_lseek, + .release= single_release, +}; + +#endif /* SMP */ + __read_mostly bool sched_debug_enabled; +struct dentry 
*debugfs_sched; + static __init int sched_init_debug(void) { - debugfs_create_file("sched_features", 0644, NULL, NULL, - _feat_fops); + struct dentry __maybe_unused *numa; - debugfs_create_bool("sched_debug", 0644, NULL, - _debug_enabled); + debugfs_sched = debugfs_create_dir("sched", NULL); + + debugfs_create_file("features", 0644, debugfs_sch
[tip: sched/core] sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
The following commit has been merged into the sched/core branch of tip: Commit-ID: 1d1c2509de4488cc58c924d0a6117c62de1d4f9c Gitweb: https://git.kernel.org/tip/1d1c2509de4488cc58c924d0a6117c62de1d4f9c Author:Peter Zijlstra AuthorDate:Wed, 24 Mar 2021 19:47:43 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00 sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG CONFIG_SCHEDSTATS does not depend on SCHED_DEBUG, it is inconsistent to have the sysctl depend on it. Suggested-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.161151...@infradead.org --- kernel/sysctl.c | 22 +++--- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8042098..17f1cc9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1711,17 +1711,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_SCHEDSTATS - { - .procname = "sched_schedstats", - .data = NULL, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = sysctl_schedstats, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -1755,6 +1744,17 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_NUMA_BALANCING { .procname = "numa_balancing",
[tip: sched/core] sched,preempt: Move preempt_dynamic to debug.c
The following commit has been merged into the sched/core branch of tip: Commit-ID: 1011dcce99f8026d48fdd7b9cc259e32a8b472be Gitweb: https://git.kernel.org/tip/1011dcce99f8026d48fdd7b9cc259e32a8b472be Author:Peter Zijlstra AuthorDate:Thu, 25 Mar 2021 12:21:38 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00 sched,preempt: Move preempt_dynamic to debug.c Move the #ifdef SCHED_DEBUG bits to kernel/sched/debug.c in order to collect all the debugfs bits. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.353833...@infradead.org --- kernel/sched/core.c | 77 +-- kernel/sched/debug.c | 67 - kernel/sched/sched.h | 11 -- 3 files changed, 78 insertions(+), 77 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bac30db..e6c714b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5371,9 +5371,9 @@ enum { preempt_dynamic_full, }; -static int preempt_dynamic_mode = preempt_dynamic_full; +int preempt_dynamic_mode = preempt_dynamic_full; -static int sched_dynamic_mode(const char *str) +int sched_dynamic_mode(const char *str) { if (!strcmp(str, "none")) return preempt_dynamic_none; @@ -5387,7 +5387,7 @@ static int sched_dynamic_mode(const char *str) return -EINVAL; } -static void sched_dynamic_update(int mode) +void sched_dynamic_update(int mode) { /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in @@ -5444,79 +5444,8 @@ static int __init setup_preempt_mode(char *str) } __setup("preempt=", setup_preempt_mode); -#ifdef CONFIG_SCHED_DEBUG - -static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[16]; - int mode; - - if (cnt > 15) - cnt = 15; - - if (copy_from_user(, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - mode = sched_dynamic_mode(strstrip(buf)); - if (mode < 0) - return mode; - - sched_dynamic_update(mode); - - *ppos += cnt; - - return cnt; -} - -static int 
sched_dynamic_show(struct seq_file *m, void *v) -{ - static const char * preempt_modes[] = { - "none", "voluntary", "full" - }; - int i; - - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { - if (preempt_dynamic_mode == i) - seq_puts(m, "("); - seq_puts(m, preempt_modes[i]); - if (preempt_dynamic_mode == i) - seq_puts(m, ")"); - - seq_puts(m, " "); - } - - seq_puts(m, "\n"); - return 0; -} - -static int sched_dynamic_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_dynamic_show, NULL); -} - -static const struct file_operations sched_dynamic_fops = { - .open = sched_dynamic_open, - .write = sched_dynamic_write, - .read = seq_read, - .llseek = seq_lseek, - .release= single_release, -}; - -extern struct dentry *debugfs_sched; - -static __init int sched_init_debug_dynamic(void) -{ - debugfs_create_file("sched_preempt", 0644, debugfs_sched, NULL, _dynamic_fops); - return 0; -} -late_initcall(sched_init_debug_dynamic); - -#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_PREEMPT_DYNAMIC */ - /* * This is the entry point to schedule() from kernel preemption * off of irq context. 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2093b90..bdd344f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -213,9 +213,71 @@ static const struct file_operations sched_scaling_fops = { #endif /* SMP */ +#ifdef CONFIG_PREEMPT_DYNAMIC + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); +
[tip: sched/core] sched: Don't make LATENCYTOP select SCHED_DEBUG
The following commit has been merged into the sched/core branch of tip: Commit-ID: d86ba831656611872e4939b895503ddac63d8196 Gitweb: https://git.kernel.org/tip/d86ba831656611872e4939b895503ddac63d8196 Author:Peter Zijlstra AuthorDate:Wed, 24 Mar 2021 19:48:34 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:33 +02:00 sched: Don't make LATENCYTOP select SCHED_DEBUG SCHED_DEBUG is not in fact required for LATENCYTOP, don't select it. Suggested-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.224578...@infradead.org --- lib/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2779c29..5f98376 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1670,7 +1670,6 @@ config LATENCYTOP select KALLSYMS_ALL select STACKTRACE select SCHEDSTATS - select SCHED_DEBUG help Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations.
[tip: sched/core] sched,fair: Alternative sched_slice()
The following commit has been merged into the sched/core branch of tip: Commit-ID: 0c2de3f054a59f15e01804b75a04355c48de628c Gitweb: https://git.kernel.org/tip/0c2de3f054a59f15e01804b75a04355c48de628c Author:Peter Zijlstra AuthorDate:Thu, 25 Mar 2021 13:44:46 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00 sched,fair: Alternative sched_slice() The current sched_slice() seems to have issues; there's two possible things that could be improved: - the 'nr_running' used for __sched_period() is daft when cgroups are considered. Using the RQ wide h_nr_running seems like a much more consistent number. - (esp) cgroups can slice it real fine, which makes for easy over-scheduling, ensure min_gran is what the name says. Signed-off-by: Peter Zijlstra (Intel) Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.611897...@infradead.org --- kernel/sched/fair.c | 12 +++- kernel/sched/features.h | 3 +++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b3ea14c..49636a4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -687,7 +687,13 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + unsigned int nr_running = cfs_rq->nr_running; + u64 slice; + + if (sched_feat(ALT_PERIOD)) + nr_running = rq_of(cfs_rq)->cfs.h_nr_running; + + slice = __sched_period(nr_running + !se->on_rq); for_each_sched_entity(se) { struct load_weight *load; @@ -704,6 +710,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) } slice = __calc_delta(slice, se->load.weight, load); } + + if (sched_feat(BASE_SLICE)) + slice = max(slice, (u64)sysctl_sched_min_granularity); + return slice; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 422fa68..011c5ec 100644 --- a/kernel/sched/features.h +++ 
b/kernel/sched/features.h @@ -90,3 +90,6 @@ SCHED_FEAT(WA_BIAS, true) */ SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(UTIL_EST_FASTUP, true) + +SCHED_FEAT(ALT_PERIOD, true) +SCHED_FEAT(BASE_SLICE, true)
[tip: sched/core] sched/debug: Rename the sched_debug parameter to sched_verbose
The following commit has been merged into the sched/core branch of tip: Commit-ID: a1b93fc0377e73dd54f819a993f83291324bb54a Gitweb: https://git.kernel.org/tip/a1b93fc0377e73dd54f819a993f83291324bb54a Author:Peter Zijlstra AuthorDate:Thu, 15 Apr 2021 18:23:17 +02:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00 sched/debug: Rename the sched_debug parameter to sched_verbose CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param sched_debug and the /debug/sched/debug_enabled knobs control the sched_debug_enabled variable, but what they really do is make SCHED_DEBUG more verbose, so rename the lot. Signed-off-by: Peter Zijlstra (Intel) --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/scheduler/sched-domains.rst | 10 +- kernel/sched/debug.c| 4 ++-- kernel/sched/topology.c | 12 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 0454572..9e4c026 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4725,7 +4725,7 @@ sbni= [NET] Granch SBNI12 leased line adapter - sched_debug [KNL] Enables verbose scheduler debug messages. + sched_verbose [KNL] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature diff --git a/Documentation/scheduler/sched-domains.rst b/Documentation/scheduler/sched-domains.rst index 8582fa5..14ea2f2 100644 --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -74,8 +74,8 @@ for a given topology level by creating a sched_domain_topology_level array and calling set_sched_topology() with this array as the parameter. The sched-domains debugging infrastructure can be enabled by enabling -CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. 
If you forgot to -tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug -knob. This enables an error checking parse of the sched domains which should -catch most possible errors (described above). It also prints out the domain -structure in a visual format. +CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you +forgot to tweak your cmdline, you can also flip the +/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of +the sched domains which should catch most possible errors (described above). It +also prints out the domain structure in a visual format. diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bf199d6..461342f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -275,7 +275,7 @@ static const struct file_operations sched_dynamic_fops = { #endif /* CONFIG_PREEMPT_DYNAMIC */ -__read_mostly bool sched_debug_enabled; +__read_mostly bool sched_debug_verbose; static const struct seq_operations sched_debug_sops; @@ -300,7 +300,7 @@ static __init int sched_init_debug(void) debugfs_sched = debugfs_create_dir("sched", NULL); debugfs_create_file("features", 0644, debugfs_sched, NULL, _feat_fops); - debugfs_create_bool("debug_enabled", 0644, debugfs_sched, _debug_enabled); + debugfs_create_bool("verbose", 0644, debugfs_sched, _debug_verbose); #ifdef CONFIG_PREEMPT_DYNAMIC debugfs_create_file("preempt", 0644, debugfs_sched, NULL, _dynamic_fops); #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index c343aed..55a0a24 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2; static int __init sched_debug_setup(char *str) { - sched_debug_enabled = true; + sched_debug_verbose = true; return 0; } -early_param("sched_debug", sched_debug_setup); +early_param("sched_verbose", sched_debug_setup); static inline bool sched_debug(void) { - return sched_debug_enabled; + return sched_debug_verbose; } 
#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, @@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_debug_enabled) + if (!sched_debug_verbose) return; if (!sd) { @@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ -# define sched_debug_enabled 0 +# define sched_debug_verbose 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -2141,7 +2141,7 @@ build_sched_domains(const struct
[tip: sched/core] sched,debug: Convert sysctl sched_domains to debugfs
The following commit has been merged into the sched/core branch of tip: Commit-ID: 3b87f136f8fccddf7da016ab7d04bb3cf9b180f0 Gitweb: https://git.kernel.org/tip/3b87f136f8fccddf7da016ab7d04bb3cf9b180f0 Author:Peter Zijlstra AuthorDate:Thu, 25 Mar 2021 11:31:20 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00 sched,debug: Convert sysctl sched_domains to debugfs Stop polluting sysctl, move to debugfs for SCHED_DEBUG stuff. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/YHgB/s4kcbq1i...@hirez.programming.kicks-ass.net --- kernel/sched/debug.c| 254 --- kernel/sched/sched.h| 10 +-- kernel/sched/topology.c | 6 +- 3 files changed, 59 insertions(+), 211 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index bdd344f..b25de7b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -299,6 +299,10 @@ static __init int sched_init_debug(void) debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, _scaling_fops); debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, _sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, _sched_nr_migrate); + + mutex_lock(_domains_mutex); + update_sched_domain_debugfs(); + mutex_unlock(_domains_mutex); #endif #ifdef CONFIG_NUMA_BALANCING @@ -316,229 +320,88 @@ late_initcall(sched_init_debug); #ifdef CONFIG_SMP -#ifdef CONFIG_SYSCTL - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* -* In the intermediate 
directories, both the child directory and -* procname are dynamically allocated and could fail but the mode -* will always be set. In the lowest directory the names are -* static strings and all have proc handlers. -*/ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(>child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; -} +static cpumask_var_t sd_sysctl_cpus; +static struct dentry *sd_dentry; -static int sd_ctl_doflags(struct ctl_table *table, int write, - void *buffer, size_t *lenp, loff_t *ppos) +static int sd_flags_show(struct seq_file *m, void *v) { - unsigned long flags = *(unsigned long *)table->data; - size_t data_size = 0; - size_t len = 0; - char *tmp, *buf; + unsigned long flags = *(unsigned int *)m->private; int idx; - if (write) - return 0; - - for_each_set_bit(idx, , __SD_FLAG_CNT) { - char *name = sd_flag_debug[idx].name; - - /* Name plus whitespace */ - data_size += strlen(name) + 1; - } - - if (*ppos > data_size) { - *lenp = 0; - return 0; - } - - buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); - if (!buf) - return -ENOMEM; - for_each_set_bit(idx, , __SD_FLAG_CNT) { - char *name = sd_flag_debug[idx].name; - - len += snprintf(buf + len, strlen(name) + 2, "%s ", name); - } - - tmp = buf + *ppos; - len -= *ppos; - - if (len > *lenp) - len = *lenp; - if (len) - memcpy(buffer, tmp, len); - if (len < *lenp) { - ((char *)buffer)[len] = '\n'; - len++; + seq_puts(m, sd_flag_debug[idx].name); + seq_puts(m, " "); } - - *lenp = len; - *ppos += len; - - kfree(buf); + seq_puts(m, "\n"); ret
[tip: sched/core] debugfs: Implement debugfs_create_str()
The following commit has been merged into the sched/core branch of tip: Commit-ID: 9af0440ec86ebdab075e1b3d231f81fe7decb575 Gitweb: https://git.kernel.org/tip/9af0440ec86ebdab075e1b3d231f81fe7decb575 Author:Peter Zijlstra AuthorDate:Thu, 25 Mar 2021 10:53:55 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:34 +02:00 debugfs: Implement debugfs_create_str() Implement debugfs_create_str() to easily display names and such in debugfs. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.415407...@infradead.org --- fs/debugfs/file.c | 91 - include/linux/debugfs.h | 17 +++- 2 files changed, 108 insertions(+) diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 686e0ad..9b78e9e 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -865,6 +865,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_bool); +ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = F_DENTRY(file); + char *str, *copy = NULL; + int copy_len, len; + ssize_t ret; + + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + + str = *(char **)file->private_data; + len = strlen(str) + 1; + copy = kmalloc(len, GFP_KERNEL); + if (!copy) { + debugfs_file_put(dentry); + return -ENOMEM; + } + + copy_len = strscpy(copy, str, len); + debugfs_file_put(dentry); + if (copy_len < 0) { + kfree(copy); + return copy_len; + } + + copy[copy_len] = '\n'; + + ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len); + kfree(copy); + + return ret; +} + +static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* This is really only for read-only strings */ + return -EINVAL; +} + +static const struct file_operations fops_str = { + .read = debugfs_read_file_str, + .write 
=debugfs_write_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_str_ro = { + .read = debugfs_read_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_str_wo = { + .write =debugfs_write_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_str - create a debugfs file that is used to read and write a string value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be + * returned. + * + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will + * be returned. 
+ */ +void debugfs_create_str(const char *name, umode_t mode, + struct dentry *parent, char **value) +{ + debugfs_create_mode_unsafe(name, mode, parent, value, _str, + _str_ro, _str_wo); +} + static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index d6c4cc9..1fdb434 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -128,6 +128,8 @@ void debugfs_create_atomic_t(const char *name, umode_t mode, struct dentry *parent, atomic_t *value); struct dentry *debugfs_create_bool(const char *name, umode_t mode, struct dentry *parent, bool *value); +void debugfs_create_str(const char *name, umode_t mode, + struct dentry *parent, char **value); struct dentry *debugfs_create_blob
[tip: sched/core] sched: Move /proc/sched_debug to debugfs
The following commit has been merged into the sched/core branch of tip: Commit-ID: d27e9ae2f244805bbdc730d85fba28685d2471e5 Gitweb: https://git.kernel.org/tip/d27e9ae2f244805bbdc730d85fba28685d2471e5 Author:Peter Zijlstra AuthorDate:Thu, 25 Mar 2021 15:18:19 +01:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 17:06:35 +02:00 sched: Move /proc/sched_debug to debugfs Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210412102001.548833...@infradead.org --- kernel/sched/debug.c | 25 - 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index b25de7b..bf199d6 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -277,6 +277,20 @@ static const struct file_operations sched_dynamic_fops = { __read_mostly bool sched_debug_enabled; +static const struct seq_operations sched_debug_sops; + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, _debug_sops); +} + +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release= seq_release, +}; + static struct dentry *debugfs_sched; static __init int sched_init_debug(void) @@ -314,6 +328,8 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_size_mb", 0644, numa, _numa_balancing_scan_size); #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, _debug_fops); + return 0; } late_initcall(sched_init_debug); @@ -847,15 +863,6 @@ static const struct seq_operations sched_debug_sops = { .show = sched_debug_show, }; -static int __init init_sched_debug_procfs(void) -{ - if (!proc_create_seq("sched_debug", 0444, NULL, _debug_sops)) - return -ENOMEM; - return 0; -} - -__initcall(init_sched_debug_procfs); - #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) #define __P(F) __PS(#F, F) #define P(F) __PS(#F, p->F)
Re: [PATCH 00/13] [RFC] Rust support
On Fri, Apr 16, 2021 at 05:04:41PM +0200, Miguel Ojeda wrote: > Of course, we could propose something similar for C -- in fact, there > was a recent discussion around this in the C committee triggered by my > n2659 "Safety attributes for C" paper. http://www.open-std.org/jtc1/sc22/wg14/www/docs/n2659.htm That's just not making any damn sense whatsoever. That seems to be about sprinkling abort() all over the place, which is just total rubbish.
Re: liburcu: LTO breaking rcu_dereference on arm64 and possibly other architectures ?
On Fri, Apr 16, 2021 at 10:52:16AM -0400, Mathieu Desnoyers wrote: > Hi Paul, Will, Peter, > > I noticed in this discussion https://lkml.org/lkml/2021/4/16/118 that LTO > is able to break rcu_dereference. This seems to be taken care of by > arch/arm64/include/asm/rwonce.h on arm64 in the Linux kernel tree. > > In the liburcu user-space library, we have this comment near > rcu_dereference() in > include/urcu/static/pointer.h: > > * The compiler memory barrier in CMM_LOAD_SHARED() ensures that > value-speculative > * optimizations (e.g. VSS: Value Speculation Scheduling) does not perform the > * data read before the pointer read by speculating the value of the pointer. > * Correct ordering is ensured because the pointer is read as a volatile > access. > * This acts as a global side-effect operation, which forbids reordering of > * dependent memory operations. Note that such concern about > dependency-breaking > * optimizations will eventually be taken care of by the > "memory_order_consume" > * addition to forthcoming C++ standard. > > (note: CMM_LOAD_SHARED() is the equivalent of READ_ONCE(), but was introduced > in > liburcu as a public API before READ_ONCE() existed in the Linux kernel) > > Peter tells me the "memory_order_consume" is not something which can be used > today. > Any information on its status at C/C++ standard levels and > implementation-wise ? > > Pragmatically speaking, what should we change in liburcu to ensure we don't > generate > broken code when LTO is enabled ? I suspect there are a few options here: > > 1) Fail to build if LTO is enabled, > 2) Generate slower code for rcu_dereference, either on all architectures or > only >on weakly-ordered architectures, > 3) Generate different code depending on whether LTO is enabled or not. AFAIU > this would only >work if every compile unit is aware that it will end up being optimized > with LTO. Not sure >how this could be done in the context of user-space. > 4) [ Insert better idea here. 
] > > Thoughts ? Using memory_order_acquire is safe; and is basically what Will did for ARM64. The problematic transformations are possible even without LTO, although less likely due to less visibility, but everybody agrees they're possible and allowed. OTOH we do not have a positive sighting of it actually happening (I think), we're all just being cautious and not willing to debug the resulting wreckage if it does indeed happen.
Re: [PATCH v2] sched: Warn on long periods of pending need_resched
On Tue, Mar 30, 2021 at 03:44:12PM -0700, Josh Don wrote: > Peter, > > Since you've already pulled the need_resched warning patch into your > tree, I'm including just the diff based on that patch (in response to > Mel's comments) below. This should be squashed into the original > patch. > Sorry, I seem to have missed this. The patch is completely whitespace mangled through. I've attached my latest copy of the patch and held it back for now, please resubmit. --- Subject: sched: Warn on long periods of pending need_resched From: Paul Turner Date: Mon, 22 Mar 2021 20:57:06 -0700 From: Paul Turner CPU scheduler marks need_resched flag to signal a schedule() on a particular CPU. But, schedule() may not happen immediately in cases where the current task is executing in the kernel mode (no preemption state) for extended periods of time. This patch adds a warn_on if need_resched is pending for more than the time specified in sysctl resched_latency_warn_ms. If it goes off, it is likely that there is a missing cond_resched() somewhere. Monitoring is done via the tick and the accuracy is hence limited to jiffy scale. This also means that we won't trigger the warning if the tick is disabled. This feature is default disabled. It can be toggled on using sysctl resched_latency_warn_enabled. 
Signed-off-by: Paul Turner Signed-off-by: Josh Don Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210323035706.572953-1-josh...@google.com --- include/linux/sched/sysctl.h |3 + kernel/sched/core.c | 75 ++- kernel/sched/debug.c | 13 +++ kernel/sched/features.h |2 + kernel/sched/sched.h | 10 + 5 files changed, 102 insertions(+), 1 deletion(-) --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -48,6 +48,9 @@ extern unsigned int sysctl_numa_balancin #ifdef CONFIG_SCHED_DEBUG extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; + +extern int sysctl_resched_latency_warn_ms; +extern int sysctl_resched_latency_warn_once; #endif /* --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -58,7 +58,21 @@ const_debug unsigned int sysctl_sched_fe #include "features.h" 0; #undef SCHED_FEAT -#endif + +/* + * Print a warning if need_resched is set for the given duration (if + * resched_latency_warn_enabled is set). + * + * If sysctl_resched_latency_warn_once is set, only one warning will be shown + * per boot. + * + * Resched latency will be ignored for the first resched_boot_quiet_sec, to + * reduce false alarms. + */ +int sysctl_resched_latency_warn_ms = 100; +int sysctl_resched_latency_warn_once = 1; +static const long resched_boot_quiet_sec = 600; +#endif /* CONFIG_SCHED_DEBUG */ /* * Number of tasks to iterate in a single balance run. 
@@ -4527,6 +4541,56 @@ unsigned long long task_sched_runtime(st return ns; } +#ifdef CONFIG_SCHED_DEBUG +static u64 resched_latency_check(struct rq *rq) +{ + int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms); + u64 need_resched_latency, now = rq_clock(rq); + static bool warned_once; + + if (sysctl_resched_latency_warn_once && warned_once) + return 0; + + if (!need_resched() || WARN_ON_ONCE(latency_warn_ms < 2)) + return 0; + + /* Disable this warning for the first few mins after boot */ + if (now < resched_boot_quiet_sec * NSEC_PER_SEC) + return 0; + + if (!rq->last_seen_need_resched_ns) { + rq->last_seen_need_resched_ns = now; + rq->ticks_without_resched = 0; + return 0; + } + + rq->ticks_without_resched++; + need_resched_latency = now - rq->last_seen_need_resched_ns; + if (need_resched_latency <= latency_warn_ms * NSEC_PER_MSEC) + return 0; + + warned_once = true; + + return need_resched_latency; +} + +static int __init setup_resched_latency_warn_ms(char *str) +{ + long val; + + if ((kstrtol(str, 0, ))) { + pr_warn("Unable to set resched_latency_warn_ms\n"); + return 1; + } + + sysctl_resched_latency_warn_ms = val; + return 1; +} +__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms); +#else +static inline u64 resched_latency_check(struct rq *rq) { return 0; } +#endif /* CONFIG_SCHED_DEBUG */ + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -4538,6 +4602,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; struct rq_flags rf; unsigned long thermal_pressure; + u64 resched_latency; arch_scale_freq_tick(); sched_clock_tick(); @@ -4548,10 +4613,15 @@ void
[tip: perf/core] perf: Rework perf_event_exit_event()
The following commit has been merged into the perf/core branch of tip: Commit-ID: ef54c1a476aef7eef26fe13ea10dc090952c00f8 Gitweb: https://git.kernel.org/tip/ef54c1a476aef7eef26fe13ea10dc090952c00f8 Author:Peter Zijlstra AuthorDate:Thu, 08 Apr 2021 12:35:56 +02:00 Committer: Peter Zijlstra CommitterDate: Fri, 16 Apr 2021 16:32:40 +02:00 perf: Rework perf_event_exit_event() Make perf_event_exit_event() more robust, such that we can use it from other contexts. Specifically the up and coming remove_on_exec. For this to work we need to address a few issues. Remove_on_exec will not destroy the entire context, so we cannot rely on TASK_TOMBSTONE to disable event_function_call() and we thus have to use perf_remove_from_context(). When using perf_remove_from_context(), there's two races to consider. The first is against close(), where we can have concurrent tear-down of the event. The second is against child_list iteration, which should not find a half baked event. To address this, teach perf_remove_from_context() to special case !ctx->is_active and about DETACH_CHILD. [ el...@google.com: fix racing parent/child exit in sync_child_event(). 
] Signed-off-by: Marco Elver Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210408103605.1676875-2-el...@google.com --- include/linux/perf_event.h | 1 +- kernel/events/core.c | 142 2 files changed, 80 insertions(+), 63 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 3f7f89e..3d478ab 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -607,6 +607,7 @@ struct swevent_hlist { #define PERF_ATTACH_TASK_DATA 0x08 #define PERF_ATTACH_ITRACE 0x10 #define PERF_ATTACH_SCHED_CB 0x20 +#define PERF_ATTACH_CHILD 0x40 struct perf_cgroup; struct perf_buffer; diff --git a/kernel/events/core.c b/kernel/events/core.c index f079431..318ff7b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2205,6 +2205,26 @@ out: perf_event__header_size(leader); } +static void sync_child_event(struct perf_event *child_event); + +static void perf_child_detach(struct perf_event *event) +{ + struct perf_event *parent_event = event->parent; + + if (!(event->attach_state & PERF_ATTACH_CHILD)) + return; + + event->attach_state &= ~PERF_ATTACH_CHILD; + + if (WARN_ON_ONCE(!parent_event)) + return; + + lockdep_assert_held(_event->child_mutex); + + sync_child_event(event); + list_del_init(>child_list); +} + static bool is_orphaned_event(struct perf_event *event) { return event->state == PERF_EVENT_STATE_DEAD; @@ -2312,6 +2332,7 @@ group_sched_out(struct perf_event *group_event, } #define DETACH_GROUP 0x01UL +#define DETACH_CHILD 0x02UL /* * Cross CPU call to remove a performance event @@ -2335,6 +2356,8 @@ __perf_remove_from_context(struct perf_event *event, event_sched_out(event, cpuctx, ctx); if (flags & DETACH_GROUP) perf_group_detach(event); + if (flags & DETACH_CHILD) + perf_child_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && ctx->is_active) { @@ -2363,25 +2386,21 @@ static void perf_remove_from_context(struct perf_event *event, unsigned long fla lockdep_assert_held(>mutex); - 
event_function_call(event, __perf_remove_from_context, (void *)flags); - /* -* The above event_function_call() can NO-OP when it hits -* TASK_TOMBSTONE. In that case we must already have been detached -* from the context (by perf_event_exit_event()) but the grouping -* might still be in-tact. +* Because of perf_event_exit_task(), perf_remove_from_context() ought +* to work in the face of TASK_TOMBSTONE, unlike every other +* event_function_call() user. */ - WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); - if ((flags & DETACH_GROUP) && - (event->attach_state & PERF_ATTACH_GROUP)) { - /* -* Since in that case we cannot possibly be scheduled, simply -* detach now. -*/ - raw_spin_lock_irq(>lock); - perf_group_detach(event); + raw_spin_lock_irq(>lock); + if (!ctx->is_active) { + __perf_remove_from_context(event, __get_cpu_context(ctx), + ctx, (void *)flags); raw_spin_unlock_irq(>lock); + return; } + raw_spin_unlock_irq(>lock); + + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* @@ -12377,14 +12396,17 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) } EXPORT_SYMBO
Re: [PATCH 00/13] [RFC] Rust support
On Fri, Apr 16, 2021 at 02:07:49PM +0100, Wedson Almeida Filho wrote: > On Fri, Apr 16, 2021 at 01:24:23PM +0200, Peter Zijlstra wrote: > > int perf_event_task_enable(void) > > { > > + DEFINE_MUTEX_GUARD(event_mutex, >perf_event_mutex); > > There is nothing in C forcing developers to actually use DEFINE_MUTEX_GUARD. > So > someone may simply forget (or not know that they need) to lock > current->perf_event_mutex and directly access some field protected by it. This > is unlikely to happen when one first writes the code, but over time as > different > people modify the code and invariants change, it is possible for this to > happen. > > In Rust, this isn't possible: the data protected by a lock is only accessible > when the lock is locked. So developers cannot accidentally make mistakes of > this > kind. And since the enforcement happens at compile time, there is no runtime > cost. > > This, we believe, is fundamental to the discussion: we agree that many of > these > idioms can be implemented in C (albeit in this case with a compiler > extension), > but their use is optional, people can (and do) still make mistakes that lead > to > vulnerabilities; Rust disallows classes of mistakes by construction. Does this also not prohibit constructs where modification must be done while holding two locks, but reading can be done while holding either lock? That's a semi common scheme in the kernel, but not something that's expressible by, for example, the Java sync keyword. It also very much doesn't work for RCU, where modification must be done under a lock, but access is done essentially lockless. I would much rather have a language extention where we can associate custom assertions with variable access, sorta like a sanitizer: static inline void assert_foo_bar(struct foo *f) { lockdep_assert_held(>lock); } struct foo { spinlock_t lock; int bar __assert__(assert_foo_bar); }; Such things can be optional and only enabled for debug builds on new compilers. 
> Another scenario: suppose within perf_event_task_enable you need to call a > function that requires the mutex to be locked and that will unlock it for you > on > error (or unconditionally, doesn't matter). How would you do that in C? In > Rust, > there is a clean idiomatic way of transferring ownership of a guard (or any > other object) such that the previous owner cannot continue to use it after > ownership is transferred. Again, this is enforced at compile time. I'm happy > to > provide a small example if that would help. C does indeed not have the concept of ownership, unlike modern C++ I think. But I would much rather see a C language extension for that than go Rust. This would mean a far more aggressive push for newer C compilers than we've ever done before, but at least it would all still be a single language. Conversion to the new stuff can be done gradually and where it makes sense and new extensions can be evaluated on performance impact etc.
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Fri, Apr 16, 2021 at 09:19:08PM +0900, Namhyung Kim wrote: > > Are you actually using mmap() to read? I had a proposal for FORMAT_GROUP > > like thing for mmap(), but I never implemented that (didn't get the > > enthousiatic response I thought it would). But yeah, there's nowhere > > near enough space in there for PERCPU. > > Recently there's a patch to do it with rdpmc which needs to mmap first. > > https://lore.kernel.org/lkml/20210414155412.3697605-1-r...@kernel.org/ Yeah, I'm not sure about that, I've not looked at it recently. The thing is though; for RDPMC to work, you *NEED* to be on the same CPU as the counter. The typical RDPMC use-case is self monitoring.
Re: [PATCH 04/13] Kbuild: Rust support
On Wed, Apr 14, 2021 at 08:45:55PM +0200, oj...@kernel.org wrote: > diff --git a/scripts/Makefile.build b/scripts/Makefile.build > index 1b6094a13034..3665c49c4dcf 100644 > --- a/scripts/Makefile.build > +++ b/scripts/Makefile.build > @@ -26,6 +26,7 @@ EXTRA_CPPFLAGS := > EXTRA_LDFLAGS := > asflags-y := > ccflags-y := > +rustcflags-y := > cppflags-y := > ldflags-y := > > @@ -287,6 +288,24 @@ quiet_cmd_cc_lst_c = MKLST $@ > $(obj)/%.lst: $(src)/%.c FORCE > $(call if_changed_dep,cc_lst_c) > > +# Compile Rust sources (.rs) > +# --- > + > +rustc_cross_flags := --target=$(srctree)/arch/$(SRCARCH)/rust/target.json > + > +quiet_cmd_rustc_o_rs = $(RUSTC_OR_CLIPPY_QUIET) $(quiet_modtag) $@ > + cmd_rustc_o_rs = \ > + RUST_MODFILE=$(modfile) \ > + $(RUSTC_OR_CLIPPY) $(rustc_flags) $(rustc_cross_flags) \ > + --extern alloc --extern kernel \ > + --crate-type rlib --out-dir $(obj) -L $(objtree)/rust/ \ > + --crate-name $(patsubst %.o,%,$(notdir $@)) $<; \ > + mv $(obj)/$(subst .o,,$(notdir $@)).d $(depfile); \ > + sed -i '/^\#/d' $(depfile) > + > +$(obj)/%.o: $(src)/%.rs FORCE > + $(call if_changed_dep,rustc_o_rs) > + > # Compile assembler sources (.S) > # --- > So if I read all this right, rust compiles to .o and, like any other .o file is then fed into objtool (for x86_64). Did you have any problems with objtool? Does it generate correct ORC unwind information? AFAICT rust has try/throw/catch exception handling (like C++/Java/others) which is typically implemented with stack unwinding of its own. Does all that interact sanely?
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Fri, Apr 16, 2021 at 08:22:38PM +0900, Namhyung Kim wrote: > On Fri, Apr 16, 2021 at 7:28 PM Peter Zijlstra wrote: > > > > On Fri, Apr 16, 2021 at 11:29:30AM +0200, Peter Zijlstra wrote: > > > > > > So I think we've had proposals for being able to close fds in the past; > > > > while preserving groups etc. We've always pushed back on that because of > > > > the resource limit issue. By having each counter be a filedesc we get a > > > > natural limit on the amount of resources you can consume. And in that > > > > respect, having to use 400k fds is things working as designed. > > > > > > > > Anyway, there might be a way around this.. > > > > So how about we flip the whole thing sideways, instead of doing one > > event for multiple cgroups, do an event for multiple-cpus. > > > > Basically, allow: > > > > perf_event_open(.pid=fd, cpu=-1, .flag=PID_CGROUP); > > > > Which would have the kernel create nr_cpus events [the corrolary is that > > we'd probably also allow: (.pid=-1, cpu=-1) ]. > > Do you mean it'd have separate perf_events per cpu internally? > From a cpu's perspective, there's nothing changed, right? > Then it will have the same performance problem as of now. Yes, but we'll not end up in ioctl() hell. The interface is sooo much better. The performance thing just means we need to think harder. I thought cgroup scheduling got a lot better with the work Ian did a while back? What's the actual bottleneck now? > > Output could be done by adding FORMAT_PERCPU, which takes the current > > read() format and writes a copy for each CPU event. (p)read(v)() could > > be used to explode or partial read that. > > Yeah, I think it's good for read. But what about mmap? > I don't think we can use file offset since it's taken for auxtrace. > Maybe we can simply disallow that.. Are you actually using mmap() to read? I had a proposal for FORMAT_GROUP like thing for mmap(), but I never implemented that (didn't get the enthousiatic response I thought it would). 
But yeah, there's nowhere near enough space in there for PERCPU. Not sure how to do that, these counters must not be sampling counters because we can't be sharing a buffer from multiple CPUs, so data/aux just isn't a concern. But it's weird to have them magically behave differently.
Re: [PATCH 00/13] [RFC] Rust support
On Wed, Apr 14, 2021 at 08:45:51PM +0200, oj...@kernel.org wrote: > - Featureful language: sum types, pattern matching, generics, > RAII, lifetimes, shared & exclusive references, modules & > visibility, powerful hygienic and procedural macros... IMO RAII is over-valued, but just in case you care, the below seems to work just fine. No fancy new language needed, works today. Similarly you can create refcount_t guards, or with a little more work full blown smart_ptr crud. --- diff --git a/include/linux/mutex.h b/include/linux/mutex.h index e19323521f9c..f03a72dd8cea 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -197,4 +197,22 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); +struct mutex_guard { + struct mutex *mutex; +}; + +static inline struct mutex_guard mutex_guard_lock(struct mutex *mutex) +{ + mutex_lock(mutex); + return (struct mutex_guard){ .mutex = mutex, }; +} + +static inline void mutex_guard_unlock(struct mutex_guard *guard) +{ + mutex_unlock(guard->mutex); +} + +#define DEFINE_MUTEX_GUARD(name, lock) \ + struct mutex_guard __attribute__((__cleanup__(mutex_guard_unlock))) name = mutex_guard_lock(lock) + #endif /* __LINUX_MUTEX_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 8ee3249de2f0..603d197a83b8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5715,16 +5715,15 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd, int perf_event_task_enable(void) { + DEFINE_MUTEX_GUARD(event_mutex, >perf_event_mutex); struct perf_event_context *ctx; struct perf_event *event; - mutex_lock(>perf_event_mutex); list_for_each_entry(event, >perf_event_list, owner_entry) { ctx = perf_event_ctx_lock(event); perf_event_for_each_child(event, _perf_event_enable); perf_event_ctx_unlock(event, ctx); } - mutex_unlock(>perf_event_mutex); return 0; }
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Fri, Apr 16, 2021 at 11:29:30AM +0200, Peter Zijlstra wrote: > > So I think we've had proposals for being able to close fds in the past; > > while preserving groups etc. We've always pushed back on that because of > > the resource limit issue. By having each counter be a filedesc we get a > > natural limit on the amount of resources you can consume. And in that > > respect, having to use 400k fds is things working as designed. > > > > Anyway, there might be a way around this.. So how about we flip the whole thing sideways, instead of doing one event for multiple cgroups, do an event for multiple-cpus. Basically, allow: perf_event_open(.pid=fd, cpu=-1, .flag=PID_CGROUP); Which would have the kernel create nr_cpus events [the corollary is that we'd probably also allow: (.pid=-1, cpu=-1) ]. Output could be done by adding FORMAT_PERCPU, which takes the current read() format and writes a copy for each CPU event. (p)read(v)() could be used to explode or partial read that. This gets rid of the nasty variadic nature of the 'get-me-these-n-cgroups'. While still getting rid of the n*m fd issue you're facing.
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
Duh.. this is a half-finished email I meant to save for later. Anyway, I'll reply more. On Fri, Apr 16, 2021 at 11:26:39AM +0200, Peter Zijlstra wrote: > On Fri, Apr 16, 2021 at 08:48:12AM +0900, Namhyung Kim wrote: > > On Thu, Apr 15, 2021 at 11:51 PM Peter Zijlstra > > wrote: > > > On Tue, Apr 13, 2021 at 08:53:36AM -0700, Namhyung Kim wrote: > > > > > cgroup event counting (i.e. perf stat). > > > > > > > > * PERF_EVENT_IOC_ATTACH_CGROUP - it takes a buffer consists of a > > > > 64-bit array to attach given cgroups. The first element is a > > > > number of cgroups in the buffer, and the rest is a list of cgroup > > > > ids to add a cgroup info to the given event. > > > > > > WTH is a cgroup-id? The syscall takes a fd to the path, why have two > > > different ways? > > > > As you know, we already use cgroup-id for sampling. Yeah we > > can do it with the fd but one of the point in this patch is to reduce > > the number of file descriptors. :) > > Well, I found those patches again after I wrote that. But I'm still not > sure what a cgroup-id is from userspace. > > How does userspace get one given a cgroup? (I actually mounted cgroupfs > in order to see if there's some new 'id' file to read, there is not) > Does having the cgroup-id ensure the cgroup exists? Can the cgroup-id > get re-used? > > I really don't konw what the thing is. I don't use cgroups, like ever, > except when I'm forced to due to some regression or bugreport. > > > Also, having cgroup-id is good to match with the result (from read) > > as it contains the cgroup information. > > What? > > > > > * PERF_EVENT_IOC_READ_CGROUP - it takes a buffer consists of a 64-bit > > > > array to get the event counter values. The first element is size > > > > of the array in byte, and the second element is a cgroup id to > > > > read. The rest is to save the counter value and timings. 
> > > > > > :-( > > > > > > So basically you're doing a whole seconds cgroup interface, one that > > > violates the one counter per file premise and lives off of ioctl()s. > > > > Right, but I'm not sure that we really want a separate event for each > > cgroup if underlying hardware events are all the same. > > Sure, I see where you're coming from; I just don't much like where it > got you :-) > > > > *IF* we're going to do something like this, I feel we should explore the > > > whole vector-per-fd concept before proceeding. Can we make it less yuck > > > (less special ioctl() and more regular file ops. Can we apply the > > > concept to more things? > > > > Ideally it'd do without keeping file descriptors open. Maybe we can make > > the vector accept various types like vector-per-cgroup_id or so. > > So I think we've had proposals for being able to close fds in the past; > while preserving groups etc. We've always pushed back on that because of > the resource limit issue. By having each counter be a filedesc we get a > natural limit on the amount of resources you can consume. And in that > respect, having to use 400k fds is things working as designed. > > Anyway, there might be a way around this.. > > > > The second patch extends the ioctl() to be more read() like, instead of > > > doing the sane things and extending read() by adding PERF_FORMAT_VECTOR > > > or whatever. In fact, this whole second ioctl() doesn't make sense to > > > have if we do indeed want to do vector-per-fd. > > > > One of the upside of the ioctl() is that we can pass cgroup-id to read. > > Probably we can keep the index in the vector and set the file offset > > with it. Or else just read the whole vector, and then it has a cgroup-id > > in the output like PERF_FORMAT_CGROUP? > > > > > > > > Also, I suppose you can already fake this, by having a > > > SW_CGROUP_SWITCHES (sorry, I though I picked those up, done now) event > > > > Thanks! 
> > > > > with PERF_SAMPLE_READ|PERF_SAMPLE_CGROUP and PERF_FORMAT_GROUP in a > > > group with a bunch of events. Then the buffer will fill with the values > > > you use here. > > > > Right, I'll do an experiment with it. > > > > > > > > Yes, I suppose it has higher overhead, but you get the data you want > > > without having to do terrible things like this. > > > > That's true. And we don't need many things in the perf record like > > synthesizing task/mmap info. Also there's a risk we can miss some > > samples for some reason. > > > > Another concern is that it'd add huge slow down in the perf event > > open as it creates a mixed sw/hw group. The synchronized_rcu in > > the move_cgroup path caused significant problems in my > > environment as it adds up in proportion to the number of cpus. > > Since when is perf_event_open() a performance concern? That thing is > slow in all possible ways.
Re: [PATCH v3 1/2] perf/core: Share an event with multiple cgroups
On Fri, Apr 16, 2021 at 08:48:12AM +0900, Namhyung Kim wrote: > On Thu, Apr 15, 2021 at 11:51 PM Peter Zijlstra wrote: > > On Tue, Apr 13, 2021 at 08:53:36AM -0700, Namhyung Kim wrote: > > > cgroup event counting (i.e. perf stat). > > > > > > * PERF_EVENT_IOC_ATTACH_CGROUP - it takes a buffer consists of a > > > 64-bit array to attach given cgroups. The first element is a > > > number of cgroups in the buffer, and the rest is a list of cgroup > > > ids to add a cgroup info to the given event. > > > > WTH is a cgroup-id? The syscall takes a fd to the path, why have two > > different ways? > > As you know, we already use cgroup-id for sampling. Yeah we > can do it with the fd but one of the point in this patch is to reduce > the number of file descriptors. :) Well, I found those patches again after I wrote that. But I'm still not sure what a cgroup-id is from userspace. How does userspace get one given a cgroup? (I actually mounted cgroupfs in order to see if there's some new 'id' file to read, there is not) Does having the cgroup-id ensure the cgroup exists? Can the cgroup-id get re-used? I really don't konw what the thing is. I don't use cgroups, like ever, except when I'm forced to due to some regression or bugreport. > Also, having cgroup-id is good to match with the result (from read) > as it contains the cgroup information. What? > > > * PERF_EVENT_IOC_READ_CGROUP - it takes a buffer consists of a 64-bit > > > array to get the event counter values. The first element is size > > > of the array in byte, and the second element is a cgroup id to > > > read. The rest is to save the counter value and timings. > > > > :-( > > > > So basically you're doing a whole seconds cgroup interface, one that > > violates the one counter per file premise and lives off of ioctl()s. > > Right, but I'm not sure that we really want a separate event for each > cgroup if underlying hardware events are all the same. 
Sure, I see where you're coming from; I just don't much like where it got you :-) > > *IF* we're going to do something like this, I feel we should explore the > > whole vector-per-fd concept before proceeding. Can we make it less yuck > > (less special ioctl() and more regular file ops. Can we apply the > > concept to more things? > > Ideally it'd do without keeping file descriptors open. Maybe we can make > the vector accept various types like vector-per-cgroup_id or so. So I think we've had proposals for being able to close fds in the past; while preserving groups etc. We've always pushed back on that because of the resource limit issue. By having each counter be a filedesc we get a natural limit on the amount of resources you can consume. And in that respect, having to use 400k fds is things working as designed. Anyway, there might be a way around this.. > > The second patch extends the ioctl() to be more read() like, instead of > > doing the sane things and extending read() by adding PERF_FORMAT_VECTOR > > or whatever. In fact, this whole second ioctl() doesn't make sense to > > have if we do indeed want to do vector-per-fd. > > One of the upside of the ioctl() is that we can pass cgroup-id to read. > Probably we can keep the index in the vector and set the file offset > with it. Or else just read the whole vector, and then it has a cgroup-id > in the output like PERF_FORMAT_CGROUP? > > > > > Also, I suppose you can already fake this, by having a > > SW_CGROUP_SWITCHES (sorry, I though I picked those up, done now) event > > Thanks! > > > with PERF_SAMPLE_READ|PERF_SAMPLE_CGROUP and PERF_FORMAT_GROUP in a > > group with a bunch of events. Then the buffer will fill with the values > > you use here. > > Right, I'll do an experiment with it. > > > > > Yes, I suppose it has higher overhead, but you get the data you want > > without having to do terrible things like this. > > That's true. 
And we don't need many things in the perf record like > synthesizing task/mmap info. Also there's a risk we can miss some > samples for some reason. > > Another concern is that it'd add huge slow down in the perf event > open as it creates a mixed sw/hw group. The synchronized_rcu in > the move_cgroup path caused significant problems in my > environment as it adds up in proportion to the number of cpus. Since when is perf_event_open() a performance concern? That thing is slow in all possible ways.
Re: [PATCH 00/13] [RFC] Rust support
On Fri, Apr 16, 2021 at 06:02:33AM +0100, Wedson Almeida Filho wrote: > On Fri, Apr 16, 2021 at 04:25:34AM +, Al Viro wrote: > > And as one of those freaks I can tell > > you where exactly I would like you to go and what I would like you to do > > with implicit suggestions to start a browser when I need to read some > > in-tree documentation. > > I could be mistaken but you seem angry. Perhaps it wouldn't be a bad idea to > read your own code of conduct, I don't think you need a browser for that > either. Welcome to LKML. CoC does not forbid human emotions just yet. Deal with it.
Re: [PATCH 00/13] [RFC] Rust support
On Fri, Apr 16, 2021 at 03:22:16AM +0100, Wedson Almeida Filho wrote: > On Thu, Apr 15, 2021 at 08:58:16PM +0200, Peter Zijlstra wrote: > > On Wed, Apr 14, 2021 at 08:45:51PM +0200, oj...@kernel.org wrote: > > > > > Rust is a systems programming language that brings several key > > > advantages over C in the context of the Linux kernel: > > > > > > - No undefined behavior in the safe subset (when unsafe code is > > > sound), including memory safety and the absence of data races. > > > > And yet I see not a single mention of the Rust Memory Model and how it > > aligns (or not) with the LKMM. The C11 memory model for example is a > > really poor fit for LKMM. > > We don't intend to directly expose C data structures to Rust code (outside the > kernel crate). Instead, we intend to provide wrappers that expose safe > interfaces even though the implementation may use unsafe blocks. So we expect > the vast majority of Rust code to just care about the Rust memory model. > > We admittedly don't have a huge number of wrappers yet, but we do have enough > to > implement most of Binder and so far it's been ok. We do intend to eventually > cover other classes of drivers that may unveil unforeseen difficulties, we'll > see. > > If you have concerns that we might have overlooked, we'd be happy to hear > about > them from you (or anyone else). Well, the obvious example would be seqlocks. C11 can't do them. The not sharing of data structures would avoid most of that, but will also cost you in performance. Simlar thing for RCU; C11 can't optimally do that; it needs to make rcu_dereference() a load-acquire [something ARM64 has already done in C because the compiler might be too clever by half when doing LTO :-(]. But it's the compiler needing the acquire semantics, not the computer, which is just bloody wrong. And there's more sharp corners to be had. But yes, if you're not actually sharing anything; and taking the performance hit that comes with that, you might get away with it. 
> > HTML is not a valid documentation format. Heck, markdown itself is > > barely readable. > > Are you stating [what you perceive as] a fact or just venting? If the former, > would you mind enlightening us with some evidence? I've yet to see a program that renders HTML (including all the cruft often used in docs, which might include SVG graphics and whatnot) sanely in ASCII. Lynx does not qualify, its output is atrocious crap. Yes, lynx lets you read HTML in ASCII, but at the cost of bleeding eyeballs and missing content. Nothing beats a sane ASCII document with possibly, where really needed, some ASCII art. Sadly the whole kernel documentation project is moving away from that as well, which just means I'm back to working on an undocumented codebase. This rst crap they adopted is unreadable garbage. > > It is really *really* hard to read. It has all sorts of weird things, > > like operators at the beginning after a line break: > > > > if (foo > > || bar) > > > > which is just wrong. And it suffers from CamelCase, which is just about > > the worst thing ever. Not even the C++ std libs have that (or had, back > > when I still did know C++). > > > > I also see: > > > > if (foo) { > > ... > > } > > > > and > > > > if foo { > > } > > > > the latter, of course, being complete rubbish. > > There are advantages to adopting the preferred style of a language (when one > exists). We, of course, are not required to adopt it but I am of the opinion > that we should have good reasons to diverge if that's our choice in the end. > > "Not having parentheses around the if-clause expression is complete rubbish" > doesn't sound like a good reason to me. Of course it does; my internal lexer keeps screaming syntax error at me; how am I going to understand code when I can't sanely read it? The more you make it look like (Kernel) C, the easier it is for us C people to actually read.
My eyes have been reading C for almost 30 years by now, they have a lexer built in the optical nerve; reading something that looks vaguely like C but is definitely not C is an utterly painful experience. You're asking to join us, not the other way around. I'm fine in a world without Rust.
Re: [PATCH 00/13] [RFC] Rust support
On Wed, Apr 14, 2021 at 08:45:51PM +0200, oj...@kernel.org wrote: > Rust is a systems programming language that brings several key > advantages over C in the context of the Linux kernel: > > - No undefined behavior in the safe subset (when unsafe code is > sound), including memory safety and the absence of data races. And yet I see not a single mention of the Rust Memory Model and how it aligns (or not) with the LKMM. The C11 memory model for example is a really poor fit for LKMM. > ## Why not? > > Rust also has disadvantages compared to C in the context of > the Linux kernel: > > - The many years of effort in tooling for C around the kernel, > including compiler plugins, sanitizers, Coccinelle, lockdep, > sparse... However, this will likely improve if Rust usage in > the kernel grows over time. This; can we mercilessly break the .rs bits when refactoring? What happens the moment we cannot boot x86_64 without Rust crap on? We can ignore this as a future problem, but I think it's only fair to discuss now. I really don't care for that future, and IMO adding this Rust or any other second language is a fail. > Thirdly, in Rust code bases, most documentation is written alongside > the source code, in Markdown. We follow this convention, thus while > we have a few general documents in `Documentation/rust/`, most of > the actual documentation is in the source code itself. > > In order to read this documentation easily, Rust provides a tool > to generate HTML documentation, just like Sphinx/kernel-doc, but > suited to Rust code bases and the language concepts. HTML is not a valid documentation format. Heck, markdown itself is barely readable. > Moreover, as explained above, we are taking the chance to enforce > some documentation guidelines. We are also enforcing automatic code > formatting, a set of Clippy lints, etc. We decided to go with Rust's > idiomatic style, i.e. keeping `rustfmt` defaults. 
For instance, this > means 4 spaces are used for indentation, rather than a tab. We are > happy to change that if needed -- we think what is important is > keeping the formatting automated. It is really *really* hard to read. It has all sorts of weird things, like operators at the beginning after a line break: if (foo || bar) which is just wrong. And it suffers from CamelCase, which is just about the worst thing ever. Not even the C++ std libs have that (or had, back when I still did know C++). I also see: if (foo) { ... } and if foo { } the latter, of course, being complete rubbish. > Another important topic we would like feedback on is the Rust > "native" documentation that is written alongside the code, as > explained above. We have uploaded it here: > > https://rust-for-linux.github.io/docs/kernel/ > > We like how this kind of generated documentation looks. Please take > a look and let us know what you think! I cannot view with less or vim. Therefore it looks not at all. > - Boqun Feng is working hard on the different options for > threading abstractions and has reviewed most of the `sync` PRs. Boqun, I know you're familiar with LKMM, can you please talk about how Rust does things and how it interacts?
[PATCH] sched/debug: Rename the sched_debug parameter to sched_debug_verbose
On Mon, Apr 12, 2021 at 12:14:25PM +0200, Peter Zijlstra wrote: > + debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled); How's this on top of the whole series? --- Subject: sched/debug: Rename the sched_debug parameter to sched_debug_verbose From: Peter Zijlstra Date: Thu Apr 15 18:23:17 CEST 2021 CONFIG_SCHED_DEBUG is the build-time Kconfig knob, the boot param sched_debug and the /debug/sched/debug_enabled knobs control the sched_debug_enabled variable, but what they really do is make SCHED_DEBUG more verbose, so rename the lot. Signed-off-by: Peter Zijlstra (Intel) --- Documentation/admin-guide/kernel-parameters.txt |3 ++- Documentation/scheduler/sched-domains.rst | 10 +- kernel/sched/debug.c|4 ++-- kernel/sched/topology.c | 10 +- 4 files changed, 14 insertions(+), 13 deletions(-) --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4756,7 +4756,8 @@ sbni= [NET] Granch SBNI12 leased line adapter - sched_debug [KNL] Enables verbose scheduler debug messages. + sched_debug_verbose + [KNL] Enables verbose scheduler debug messages. schedstats= [KNL,X86] Enable or disable scheduled statistics. Allowed values are enable and disable. This feature --- a/Documentation/scheduler/sched-domains.rst +++ b/Documentation/scheduler/sched-domains.rst @@ -74,8 +74,8 @@ for a given topology level by creating a calling set_sched_topology() with this array as the parameter. The sched-domains debugging infrastructure can be enabled by enabling -CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to -tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug -knob. This enables an error checking parse of the sched domains which should -catch most possible errors (described above). It also prints out the domain -structure in a visual format. +CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline.
If you +forgot to tweak your cmdline, you can also flip the +/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of +the sched domains which should catch most possible errors (described above). It +also prints out the domain structure in a visual format. --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -275,7 +275,7 @@ static const struct file_operations sched_dynamic_fops #endif /* CONFIG_PREEMPT_DYNAMIC */ -__read_mostly bool sched_debug_enabled; +__read_mostly bool sched_debug_verbose; static const struct seq_operations sched_debug_sops; @@ -300,7 +300,7 @@ static __init int sched_init_debug(void) debugfs_sched = debugfs_create_dir("sched", NULL); debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops); - debugfs_create_bool("debug_enabled", 0644, debugfs_sched, &sched_debug_enabled); + debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose); #ifdef CONFIG_PREEMPT_DYNAMIC debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); #endif --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -14,7 +14,7 @@ static cpumask_var_t sched_domains_tmpmask; static int __init sched_debug_setup(char *str) { - sched_debug_enabled = true; + sched_debug_verbose = true; return 0; } @@ -22,7 +22,7 @@ early_param("sched_debug", sched_debug_setup); static inline bool sched_debug(void) { - return sched_debug_enabled; + return sched_debug_verbose; } #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, @@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) { int level = 0; - if (!sched_debug_enabled) + if (!sched_debug_verbose) return; if (!sd) { @@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } #else /* !CONFIG_SCHED_DEBUG */ -# define sched_debug_enabled 0 +# define sched_debug_verbose 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -2141,7 +2141,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) if (has_asym)
static_branch_inc_cpuslocked(&sched_asym_cpucapacity); - if (rq && sched_debug_enabled) { + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); }
Re: [PATCH] locking/qrwlock: Fix ordering in queued_write_lock_slowpath
On Thu, Apr 15, 2021 at 04:28:21PM +0100, Will Deacon wrote: > On Thu, Apr 15, 2021 at 05:03:58PM +0200, Peter Zijlstra wrote: > > @@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) > > > > /* When no more readers or writers, set the locked flag */ > > do { > > - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); > > - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, > > - _QW_LOCKED) != _QW_WAITING); > > + cnt = atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); > I think the issue is that >here< a concurrent reader in interrupt context > can take the lock and release it again, but we could speculate reads from > the critical section up over the later release and up before the control > dependency here... OK, so that was totally not clear from the original changelog. > > + } while (!atomic_try_cmpxchg_relaxed(&lock->cnts, &cnt, _QW_LOCKED)); > ... and then this cmpxchg() will succeed, so our speculated stale reads > could be used. > > *HOWEVER* > > Speculating a read should be fine in the face of a concurrent _reader_, > so for this to be an issue it implies that the reader is also doing some > (atomic?) updates. So we're having something like: CPU0 CPU1 queue_write_lock_slowpath() atomic_cond_read_acquire() == _QW_WAITING queued_read_lock_slowpath() atomic_cond_read_acquire() return; ,--> (before X's store) |X = y; | |queued_read_unlock() | (void)atomic_sub_return_release() | | atomic_cmpxchg_relaxed(.old = _QW_WAITING) | `-- r = X; Which as Will said is a cmpxchg ABA, however for it to be ABA, we have to observe that unlock-store, and won't we then also have to observe the whole critical section? Ah, the issue is yet another load inside our own (CPU0)'s critical section. which isn't ordered against the cmpxchg_relaxed() and can be issued before. So yes, then making the cmpxchg an acquire, will force all our own loads to happen later.
Re: [PATCH 3/3] sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
On Thu, Apr 15, 2021 at 03:32:11PM +0100, Valentin Schneider wrote: > > So instead, make sure balance_push is enabled between > > sched_cpu_deactivate() and sched_cpu_activate() (eg. when > > !cpu_active()), and gate it's utility with cpu_dying(). > > I'd word that "is enabled below sched_cpu_activate()", since > sched_cpu_deactivate() is now out of the picture. I are confused (again!). Did you mean to say something like: 'is enabled below SCHED_AP_ACTIVE' ?
Re: [PATCH] locking/qrwlock: Fix ordering in queued_write_lock_slowpath
On Thu, Apr 15, 2021 at 02:25:52PM +0000, Ali Saidi wrote: > While this code is executed with the wait_lock held, a reader can > acquire the lock without holding wait_lock. The writer side loops > checking the value with the atomic_cond_read_acquire(), but only truly > acquires the lock when the compare-and-exchange is completed > successfully which isn’t ordered. The other atomic operations from this > point are release-ordered and thus reads after the lock acquisition can > be completed before the lock is truly acquired which violates the > guarantees the lock should be making. Should be per who? We explicitly do not order the lock acquire store. qspinlock has the exact same issue. If you look in the git history surrounding spin_is_locked(), you'll find 'lovely' things. Basically, if you're doing crazy things with spin_is_locked() you get to keep the pieces. > Fixes: b519b56e378ee ("locking/qrwlock: Use atomic_cond_read_acquire() when > spinning in qrwlock") > Signed-off-by: Ali Saidi > Cc: sta...@vger.kernel.org > --- > kernel/locking/qrwlock.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c > index 4786dd271b45..10770f6ac4d9 100644 > --- a/kernel/locking/qrwlock.c > +++ b/kernel/locking/qrwlock.c > @@ -73,8 +73,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) > > /* When no more readers or writers, set the locked flag */ > do { > - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); > - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, > + atomic_cond_read_relaxed(&lock->cnts, VAL == _QW_WAITING); > + } while (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING, > _QW_LOCKED) != _QW_WAITING); > unlock: > arch_spin_unlock(&lock->wait_lock); This doesn't make sense, there is no such thing as a store-acquire. What you're doing here is moving the acquire from one load to the next. A load we know will load the exact same value.
Also see Documentation/atomic_t.txt: {}_acquire: the R of the RMW (or atomic_read) is an ACQUIRE If anything this code wants to be written like so. --- diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 4786dd271b45..22aeccc363ca 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -60,6 +60,8 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); */ void queued_write_lock_slowpath(struct qrwlock *lock) { + u32 cnt; + /* Put the writer into the wait queue */ arch_spin_lock(&lock->wait_lock); @@ -73,9 +75,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock) /* When no more readers or writers, set the locked flag */ do { - atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); - } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING, - _QW_LOCKED) != _QW_WAITING); + cnt = atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING); + } while (!atomic_try_cmpxchg_relaxed(&lock->cnts, &cnt, _QW_LOCKED)); unlock: arch_spin_unlock(&lock->wait_lock); }