Re: [PATCH] powerpc/perf: Remove sched_task function defined for thread-imc
On Fri, 18 May 2018 at 17:06, Anju T Sudhakar wrote: > > Call trace observed while running perf-fuzzer: > > [ 329.228068] CPU: 43 PID: 9088 Comm: perf_fuzzer Not tainted > 4.13.0-32-generic #35~lp1746225 > [ 329.228070] task: c03f776ac900 task.stack: c03f77728000 > [ 329.228071] NIP: c0299b70 LR: c02a4534 CTR: > c029bb80 > [ 329.228073] REGS: c03f7772b760 TRAP: 0700 Not tainted > (4.13.0-32-generic) > [ 329.228073] MSR: 9282b033 > [ 329.228079] CR: 24008822 XER: > [ 329.228080] CFAR: c0299a70 SOFTE: 0 > GPR00: c02a4534 c03f7772b9e0 c1606200 c03fef858908 > GPR04: c03f776ac900 0001 003fee73 > GPR08: c11220d8 0002 > GPR12: c029bb80 c7a3d900 > GPR16: > GPR20: c03f776ad090 c0c71354 > GPR24: c03fef716780 003fee73 c03fe69d4200 c03f776ad330 > GPR28: c11220d8 0001 c14c6108 c03fef858900 > [ 329.228098] NIP [c0299b70] perf_pmu_sched_task+0x170/0x180 > [ 329.228100] LR [c02a4534] __perf_event_task_sched_in+0xc4/0x230 > [ 329.228101] Call Trace: > [ 329.228102] [c03f7772b9e0] [c02a0678] > perf_iterate_sb+0x158/0x2a0 (unreliable) > [ 329.228105] [c03f7772ba30] [c02a4534] > __perf_event_task_sched_in+0xc4/0x230 > [ 329.228107] [c03f7772bab0] [c01396dc] > finish_task_switch+0x21c/0x310 > [ 329.228109] [c03f7772bb60] [c0c71354] __schedule+0x304/0xb80 > [ 329.228111] [c03f7772bc40] [c0c71c10] schedule+0x40/0xc0 > [ 329.228113] [c03f7772bc60] [c01033f4] do_wait+0x254/0x2e0 > [ 329.228115] [c03f7772bcd0] [c0104ac0] kernel_wait4+0xa0/0x1a0 > [ 329.228117] [c03f7772bd70] [c0104c24] SyS_wait4+0x64/0xc0 > [ 329.228121] [c03f7772be30] [c000b184] system_call+0x58/0x6c > [ 329.228121] Instruction dump: > [ 329.228123] 3beafea0 7faa4800 409eff18 e8010060 eb610028 ebc10040 7c0803a6 > 38210050 > [ 329.228127] eb81ffe0 eba1ffe8 ebe1fff8 4e800020 <0fe0> 4bbc > 6000 6042 > [ 329.228131] ---[ end trace 8c46856d314c1811 ]--- > [ 375.755943] hrtimer: interrupt took 31601 ns > > > The context switch call-backs for thread-imc are defined in sched_task > function. 
> So when thread-imc events are grouped with software pmu events, > perf_pmu_sched_task hits the WARN_ON_ONCE condition, since software PMUs are > assumed not to have a sched_task defined. > > Patch to move the thread_imc enable/disable opal call back from sched_task to > event_[add/del] function > > Signed-off-by: Anju T Sudhakar I reproduced the WARN when running -rc7 on a Power9 box. I haven't seen it since applying this patch. Tested-by: Joel Stanley Cheers, Joel
Re: [PATCH v4 5/6] powerpc: Add show_user_instructions()
Le 01/08/2018 à 23:33, Murilo Opsfelder Araujo a écrit : show_user_instructions() is a slightly modified version of show_instructions() that allows userspace instruction dump. This will be useful within show_signal_msg() to dump userspace instructions of the faulty location. Here is a sample of what show_user_instructions() outputs: pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 The current->comm and current->pid printed can serve as a glue that links the instructions dump to its originator, allowing messages to be interleaved in the logs. Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/include/asm/stacktrace.h | 13 + arch/powerpc/kernel/process.c | 40 +++ 2 files changed, 53 insertions(+) create mode 100644 arch/powerpc/include/asm/stacktrace.h diff --git a/arch/powerpc/include/asm/stacktrace.h b/arch/powerpc/include/asm/stacktrace.h new file mode 100644 index ..6149b53b3bc8 --- /dev/null +++ b/arch/powerpc/include/asm/stacktrace.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Stack trace functions. + * + * Copyright 2018, Murilo Opsfelder Araujo, IBM Corporation. 
+ */ + +#ifndef _ASM_POWERPC_STACKTRACE_H +#define _ASM_POWERPC_STACKTRACE_H + +void show_user_instructions(struct pt_regs *regs); + +#endif /* _ASM_POWERPC_STACKTRACE_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e9533b4d2f08..364645ac732c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1299,6 +1299,46 @@ static void show_instructions(struct pt_regs *regs) pr_cont("\n"); } +void show_user_instructions(struct pt_regs *regs) +{ + int i; + const char *prefix = KERN_INFO "%s[%d]: code: "; + unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * + sizeof(int)); + + printk(prefix, current->comm, current->pid); Why not use pr_info() and remove KERN_INFO from *prefix ? + + for (i = 0; i < instructions_to_print; i++) { + int instr; + + if (!(i % 8) && (i > 0)) { + pr_cont("\n"); + printk(prefix, current->comm, current->pid); + } + +#if !defined(CONFIG_BOOKE) + /* If executing with the IMMU off, adjust pc rather +* than print . +*/ + if (!(regs->msr & MSR_IR)) + pc = (unsigned long)phys_to_virt(pc); Shouldn't this be done outside of the loop, only once ? Christophe +#endif + + if (probe_kernel_address((unsigned int __user *)pc, instr)) { + pr_cont(" "); + } else { + if (regs->nip == pc) + pr_cont("<%08x> ", instr); + else + pr_cont("%08x ", instr); + } + + pc += sizeof(int); + } + + pr_cont("\n"); +} + struct regbit { unsigned long bit; const char *name;
Re: [PATCH v6 5/8] powerpc/pseries: flush SLB contents on SLB MCE errors.
On 08/01/2018 11:28 AM, Nicholas Piggin wrote: > On Wed, 04 Jul 2018 23:28:21 +0530 > Mahesh J Salgaonkar wrote: > >> From: Mahesh Salgaonkar >> >> On pseries, as of today system crashes if we get a machine check >> exceptions due to SLB errors. These are soft errors and can be fixed by >> flushing the SLBs so the kernel can continue to function instead of >> system crash. We do this in real mode before turning on MMU. Otherwise >> we would run into nested machine checks. This patch now fetches the >> rtas error log in real mode and flushes the SLBs on SLB errors. >> >> Signed-off-by: Mahesh Salgaonkar >> --- >> arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 >> arch/powerpc/include/asm/machdep.h|1 >> arch/powerpc/kernel/exceptions-64s.S | 42 + >> arch/powerpc/kernel/mce.c | 16 +++- >> arch/powerpc/mm/slb.c |6 +++ >> arch/powerpc/platforms/pseries/pseries.h |1 >> arch/powerpc/platforms/pseries/ras.c | 51 >> + >> arch/powerpc/platforms/pseries/setup.c|1 >> 8 files changed, 116 insertions(+), 3 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h >> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h >> index 50ed64fba4ae..cc00a7088cf3 100644 >> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h >> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h >> @@ -487,6 +487,7 @@ extern void hpte_init_native(void); >> >> extern void slb_initialize(void); >> extern void slb_flush_and_rebolt(void); >> +extern void slb_flush_and_rebolt_realmode(void); >> >> extern void slb_vmalloc_update(void); >> extern void slb_set_size(u16 size); >> diff --git a/arch/powerpc/include/asm/machdep.h >> b/arch/powerpc/include/asm/machdep.h >> index ffe7c71e1132..fe447e0d4140 100644 >> --- a/arch/powerpc/include/asm/machdep.h >> +++ b/arch/powerpc/include/asm/machdep.h >> @@ -108,6 +108,7 @@ struct machdep_calls { >> >> /* Early exception handlers called in realmode */ >> int (*hmi_exception_early)(struct pt_regs *regs); >> +int (*machine_check_early)(struct pt_regs *regs); 
>> >> /* Called during machine check exception to retrive fixup address. */ >> bool(*mce_check_early_recovery)(struct pt_regs *regs); >> diff --git a/arch/powerpc/kernel/exceptions-64s.S >> b/arch/powerpc/kernel/exceptions-64s.S >> index f283958129f2..0038596b7906 100644 >> --- a/arch/powerpc/kernel/exceptions-64s.S >> +++ b/arch/powerpc/kernel/exceptions-64s.S >> @@ -332,6 +332,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries) >> machine_check_fwnmi: >> SET_SCRATCH0(r13) /* save r13 */ >> EXCEPTION_PROLOG_0(PACA_EXMC) >> +BEGIN_FTR_SECTION >> +b machine_check_pSeries_early >> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) >> machine_check_pSeries_0: >> EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) >> /* >> @@ -343,6 +346,45 @@ machine_check_pSeries_0: >> >> TRAMP_KVM_SKIP(PACA_EXMC, 0x200) >> >> +TRAMP_REAL_BEGIN(machine_check_pSeries_early) >> +BEGIN_FTR_SECTION >> +EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) >> +mr r10,r1 /* Save r1 */ >> +ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ >> +subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/ >> +mfspr r11,SPRN_SRR0 /* Save SRR0 */ >> +mfspr r12,SPRN_SRR1 /* Save SRR1 */ >> +EXCEPTION_PROLOG_COMMON_1() >> +EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) >> +EXCEPTION_PROLOG_COMMON_3(0x200) >> +addir3,r1,STACK_FRAME_OVERHEAD >> +BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */ >> + >> +/* Move original SRR0 and SRR1 into the respective regs */ >> +ld r9,_MSR(r1) >> +mtspr SPRN_SRR1,r9 >> +ld r3,_NIP(r1) >> +mtspr SPRN_SRR0,r3 >> +ld r9,_CTR(r1) >> +mtctr r9 >> +ld r9,_XER(r1) >> +mtxer r9 >> +ld r9,_LINK(r1) >> +mtlrr9 >> +REST_GPR(0, r1) >> +REST_8GPRS(2, r1) >> +REST_GPR(10, r1) >> +ld r11,_CCR(r1) >> +mtcrr11 >> +REST_GPR(11, r1) >> +REST_2GPRS(12, r1) >> +/* restore original r1. 
*/ >> +ld r1,GPR1(r1) >> +SET_SCRATCH0(r13) /* save r13 */ >> +EXCEPTION_PROLOG_0(PACA_EXMC) >> +b machine_check_pSeries_0 >> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) >> + >> EXC_COMMON_BEGIN(machine_check_common) >> /* >> * Machine check is different because we use a different >> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c >> index efdd16a79075..221271c96a57 100644 >> --- a/arch/powerpc/kernel/mce.c >> +++ b/arch/powerpc/kernel/mce.c >> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs) >> { >> long handled = 0; >> >> -__this_cpu_inc(irq_stat.mce_exceptions); >> +/* >> +
[RFC PATCH 3/3] cpuidle/powernv: Conditionally save-restore sprs using opal
From: Abhishek Goel If a state has "opal-supported" compat flag in device-tree, an opal call needs to be made during the entry and exit of the stop state. This patch passes a hint to the power9_idle_stop and power9_offline_stop. This patch moves the saving and restoring of sprs for P9 cpuidle from kernel to opal. This patch still uses existing code to detect first thread in core. In an attempt to make the powernv idle code backward compatible, and to some extent forward compatible, add support for pre-stop entry and post-stop exit actions in OPAL. If a kernel knows about this opal call, then just a firmware supporting newer hardware is required, instead of waiting for kernel updates. Signed-off-by: Abhishek Goel Signed-off-by: Akshay Adiga --- arch/powerpc/include/asm/cpuidle.h| 1 + arch/powerpc/include/asm/opal-api.h | 4 +- arch/powerpc/include/asm/opal.h | 3 + arch/powerpc/include/asm/paca.h | 5 +- arch/powerpc/include/asm/processor.h | 6 +- arch/powerpc/kernel/asm-offsets.c | 3 + arch/powerpc/kernel/idle_book3s.S | 55 ++- arch/powerpc/platforms/powernv/idle.c | 41 ++ .../powerpc/platforms/powernv/opal-wrappers.S | 2 + arch/powerpc/xmon/xmon.c | 14 ++--- 10 files changed, 109 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index b965066560cc..2fb2324d15fc 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -96,6 +96,7 @@ struct pnv_idle_states_t { u64 psscr_val; u64 psscr_mask; u32 flags; + bool req_opal_call; enum idle_state_type_t type; bool valid; }; diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 3bab299eda49..6792a737bc9a 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -208,7 +208,9 @@ #define OPAL_SENSOR_READ_U64 162 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR 164 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR 165 -#define OPAL_LAST 165 +#define OPAL_IDLE_SAVE 168 +#define 
OPAL_IDLE_RESTORE 169 +#define OPAL_LAST 169 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index e1b2910c6e81..12d57aeacde2 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -356,6 +356,9 @@ extern void opal_kmsg_init(void); extern int opal_event_request(unsigned int opal_event_nr); +extern int opal_cpuidle_save(u64 *stop_sprs, int scope, u64 psscr); +extern int opal_cpuidle_restore(u64 *stop_sprs, int scope, u64 psscr, u64 srr1); + struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr, unsigned long vmalloc_size); void opal_free_sg_list(struct opal_sg_list *sg); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6d34bd71139d..586059594443 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -195,11 +195,14 @@ struct paca_struct { /* The PSSCR value that the kernel requested before going to stop */ u64 requested_psscr; + u64 wakeup_psscr; + u8 req_opal_call; /* -* Save area for additional SPRs that need to be +* Save area for SPRs that need to be * saved/restored during cpuidle stop. 
*/ struct stop_sprs stop_sprs; + u64 *opal_stop_sprs; #endif #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 34f572056add..9f9fb1f11dd6 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -513,8 +513,10 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/ extern void power7_idle_type(unsigned long type); -extern unsigned long power9_idle_stop(unsigned long psscr_val); -extern unsigned long power9_offline_stop(unsigned long psscr_val); +extern unsigned long power9_idle_stop(unsigned long psscr_val, + bool opal_enabled); +extern unsigned long power9_offline_stop(unsigned long psscr_val, +bool opal_enabled); extern void power9_idle_type(int index); extern void flush_instruction_cache(void); diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 262c44a90ea1..740ae068ec74 100644
[RFC PATCH 2/3] powernv/cpuidle: Pass pointers instead of values to stop loop
Passing pointer to the pnv_idle_state instead of psscr value and mask. This helps us to pass more information to the stop loop. This will help to figure out the method to enter/exit idle state. Signed-off-by: Akshay Adiga --- arch/powerpc/include/asm/processor.h | 3 +- arch/powerpc/platforms/powernv/idle.c | 58 --- drivers/cpuidle/cpuidle-powernv.c | 16 +++- 3 files changed, 32 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 5debe337ea9d..34f572056add 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -515,8 +515,7 @@ extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc extern void power7_idle_type(unsigned long type); extern unsigned long power9_idle_stop(unsigned long psscr_val); extern unsigned long power9_offline_stop(unsigned long psscr_val); -extern void power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask); +extern void power9_idle_type(int index); extern void flush_instruction_cache(void); extern void hard_reset_now(void); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 93accece92e3..a6ef9b68e27b 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -43,8 +43,7 @@ int nr_pnv_idle_states; * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. */ -static u64 pnv_default_stop_val; -static u64 pnv_default_stop_mask; +struct pnv_idle_states_t *pnv_default_state; static bool default_stop_found; static int parse_dt_v1(struct device_node *np); @@ -70,9 +69,7 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE; * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. 
*/ -static u64 pnv_deepest_stop_psscr_val; -static u64 pnv_deepest_stop_psscr_mask; -static u64 pnv_deepest_stop_flag; +static struct pnv_idle_states_t *pnv_deepest_state; static bool deepest_stop_found; static int pnv_save_sprs_for_deep_states(void) @@ -92,7 +89,7 @@ static int pnv_save_sprs_for_deep_states(void) uint64_t hid5_val = mfspr(SPRN_HID5); uint64_t hmeer_val = mfspr(SPRN_HMEER); uint64_t msr_val = MSR_IDLE; - uint64_t psscr_val = pnv_deepest_stop_psscr_val; + uint64_t psscr_val = pnv_deepest_state->psscr_val; for_each_present_cpu(cpu) { uint64_t pir = get_hard_smp_processor_id(cpu); @@ -218,18 +215,15 @@ static void pnv_alloc_idle_core_states(void) pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n"); if (cpu_has_feature(CPU_FTR_ARCH_300) && - (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) { + (pnv_deepest_state->flags & OPAL_PM_LOSE_FULL_CONTEXT)) { /* * Use the default stop state for CPU-Hotplug * if available. */ if (default_stop_found) { - pnv_deepest_stop_psscr_val = - pnv_default_stop_val; - pnv_deepest_stop_psscr_mask = - pnv_default_stop_mask; + pnv_deepest_state = pnv_default_state; pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", - pnv_deepest_stop_psscr_val); + pnv_deepest_state->psscr_val); } else { /* Fallback to snooze loop for CPU-Hotplug */ deepest_stop_found = false; pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); @@ -365,20 +359,20 @@ void power7_idle(void) power7_idle_type(PNV_THREAD_NAP); } -static unsigned long __power9_idle_type(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask) +static unsigned long __power9_idle_type(struct pnv_idle_states_t *state) { - unsigned long psscr; + unsigned long psscr, stop_psscr_mask, stop_psscr_val; unsigned long srr1; + stop_psscr_mask = state->psscr_mask; + stop_psscr_val = state->psscr_val; if (!prep_irq_for_idle_irqsoff()) return 0; psscr = mfspr(SPRN_PSSCR); psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; - 
__ppc64_runlatch_off(); - srr1 = power9_idle_stop(psscr); + srr1 = power9_idle_stop(psscr, state->opal_supported); __ppc64_runlatch_on(); fini_irq_for_idle_irqsoff(); @@ -386,12 +380,11 @@ static unsigned long __power9_idle_type(unsigned long stop_psscr_val, return
[RFC PATCH 1/3] cpuidle/powernv: Add support for states with ibm, cpuidle-state-v1
This patch adds support for new device-tree format for idle state description. Previously if an older kernel runs on a newer firmware, it may enable all available states irrespective of its capability of handling it. New device tree format adds a compatible flag, so that only kernel which has the capability to handle the version of stop state will enable it. Older kernel will still see stop0 and stop0_lite in older format and we will deprecate it after some time. 1) Idea is to bump up the version in firmware if we find a bug or regression in stop states. A fix will be provided in linux which would now know about the bumped up version of stop states, whereas kernel without fixes would ignore the states. 2) Slowly deprecate cpuidle/cpuhotplug threshold which is hard-coded into cpuidle-powernv driver. Instead use compatible strings to indicate if idle state is suitable for cpuidle and hotplug. New idle state device tree format : power-mgt { ... ibm,enabled-stop-levels = <0xec00>; ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>; ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>; ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>; ibm,cpu-idle-state-flags = <0x10 0x101000>; ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>; ibm,idle-states { stop4 { flags = <0x207000>; compatible = "ibm,state-v1", "cpuidle", "opal-supported"; psscr-mask = <0x0 0x3003ff>; handle = <0x102>; latency-ns = <0x186a0>; residency-ns = <0x989680>; psscr = <0x0 0x300374>; }; ... stop11 { ... compatible = "ibm,state-v1", "cpuoffline", "opal-supported"; ... 
}; }; compatible strings : "cpuidle" : indicates it should be used by cpuidle-driver "cpuoffline" : indicates it should be used by hotplug driver "ibm,state-v1" : kernel checks if it knows about this version "opal-supported" : indicates kernel can fall back to use opal for stop-transitions Signed-off-by: Akshay Adiga --- arch/powerpc/include/asm/cpuidle.h| 11 ++ arch/powerpc/platforms/powernv/idle.c | 139 +- drivers/cpuidle/cpuidle-powernv.c | 50 + 3 files changed, 175 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 43e5f31fe64d..b965066560cc 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -79,17 +79,28 @@ struct stop_sprs { u64 mmcra; }; +enum idle_state_type_t { + CPUIDLE_TYPE, + CPUOFFLINE_TYPE +}; + + +#define POWERNV_THRESHOLD_LATENCY_NS 20 +#define PNV_VER_NAME_LEN32 #define PNV_IDLE_NAME_LEN16 struct pnv_idle_states_t { char name[PNV_IDLE_NAME_LEN]; + char version[PNV_VER_NAME_LEN]; u32 latency_ns; u32 residency_ns; u64 psscr_val; u64 psscr_mask; u32 flags; + enum idle_state_type_t type; bool valid; }; + extern struct pnv_idle_states_t *pnv_idle_states; extern int nr_pnv_idle_states; extern u32 pnv_fastsleep_workaround_at_entry[]; diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 7cf71b3e03a1..93accece92e3 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -47,6 +47,19 @@ static u64 pnv_default_stop_val; static u64 pnv_default_stop_mask; static bool default_stop_found; +static int parse_dt_v1(struct device_node *np); +struct stop_version_t { + const char name[PNV_VER_NAME_LEN]; + int (*parser_fn)(struct device_node *np); +}; +struct stop_version_t known_versions[] = { + { + .name = "ibm,state-v1", + .parser_fn = parse_dt_v1, + } +}; +const int nr_known_versions = 1; + /* * First deep stop state. 
Used to figure out when to save/restore * hypervisor context. @@ -659,8 +672,14 @@ static int __init pnv_power9_idle_init(void) state->valid = false; report_invalid_psscr_val(state->psscr_val, err); continue; + } else { + state->valid = true; } + /* +* We pick state with highest residency. We dont care if +* its a cpuidle state or a cpuoffline state. +*/ if (max_residency_ns
[RFC PATCH 0/3] New device-tree format and Opal based idle save-restore
Previously if an older kernel runs on a newer firmware, it may enable all available states irrespective of its capability of handling it. New device tree format adds a compatible flag, so that only kernel which has the capability to handle the version of stop state will enable it. Older kernel will still see stop0 and stop0_lite in older format and we will deprecate it after some time. 1) Idea is to bump up the version string in firmware if we find a bug or regression in stop states. A fix will be provided in linux which would now know about the bumped up version of stop states, whereas kernel without fixes would ignore the states. 2) Slowly deprecate cpuidle/cpuhotplug threshold which is hard-coded into cpuidle-powernv driver. Instead use compatible strings to indicate if idle state is suitable for cpuidle and hotplug. New idle state device tree format : power-mgt { ... ibm,enabled-stop-levels = <0xec00>; ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>; ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>; ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>; ibm,cpu-idle-state-flags = <0x10 0x101000>; ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>; ibm,idle-states { stop4 { flags = <0x207000>; compatible = "ibm,state-v1", "cpuidle", "opal-supported"; psscr-mask = <0x0 0x3003ff>; handle = <0x102>; latency-ns = <0x186a0>; residency-ns = <0x989680>; psscr = <0x0 0x300374>; }; ... stop11 { ... compatible = "ibm,state-v1", "cpuoffline", "opal-supported"; ... 
}; }; Skiboot patch-set for device-tree is posted here : https://patchwork.ozlabs.org/project/skiboot/list/?series=58934 High-level parsing algorithm : Say Known version string = "ibm,state-v1" for each stop state node in device tree: if (compatible has known version string) kernel takes care of stop-transitions else if (compatible has "opal-supported") OPAL takes care of stop-transitions else Skip All deeper states When a state does not have both version support and opal support, It's possible to exit from a shallower state. Hence skipping all deeper states. How does it work ? --- Consider a case that stop4 has a bug. We take the following steps to mitigate the problem. 1) Change compatible string for stop4 in OPAL to "ibm-state-v2" and remove "opal-supported". Ship the new firmware. The kernel ignores stop4 and all deeper states. But we will still have shallower states. Prevents from completely disabling stop states. 2) Implement workaround in OPAL and add "opal-supported". Ship the new firmware. The kernel uses opal for stop-transitions, which has the workaround implemented. We get stop4 and deeper states working without kernel changes and backports. (and considerably less time) 3) Implement workaround in kernel and add "ibm-state-v2" as a known version. The kernel will now be able to handle stop4 and deeper states.
Also includes Abhishek's RFC which was posted there : https://patchwork.ozlabs.org/patch/947568/ This patch-set is on top of mpe-next Abhishek Goel (1): cpuidle/powernv: Conditionally save-restore sprs using opal Akshay Adiga (2): cpuidle/powernv: Add support for states with ibm,cpuidle-state-v1 powernv/cpuidle: Pass pointers instead of values to stop loop arch/powerpc/include/asm/cpuidle.h| 12 + arch/powerpc/include/asm/opal-api.h | 4 +- arch/powerpc/include/asm/opal.h | 3 + arch/powerpc/include/asm/paca.h | 5 +- arch/powerpc/include/asm/processor.h | 9 +- arch/powerpc/kernel/asm-offsets.c | 3 + arch/powerpc/kernel/idle_book3s.S | 55 - arch/powerpc/platforms/powernv/idle.c | 214 +++--- .../powerpc/platforms/powernv/opal-wrappers.S | 2 + arch/powerpc/xmon/xmon.c | 14 +- drivers/cpuidle/cpuidle-powernv.c | 66 +++--- 11 files changed, 304 insertions(+), 83 deletions(-) -- 2.18.0.rc2.85.g1fb9df7
Re: linux-next: manual merge of the powerpc tree with the m68k tree
Hi all, [forgot the conflict resolution ...] On Thu, 2 Aug 2018 09:27:20 +1000 Stephen Rothwell wrote: > > Today's linux-next merge of the powerpc tree got a conflict in: > > arch/m68k/mac/misc.c > > between commit: > > 5b9bfb8ec467 ("m68k: mac: Use time64_t in RTC handling") > > from the m68k tree and commit: > > ebd722275f9c ("macintosh/via-pmu: Replace via-pmu68k driver with via-pmu > driver") > > from the powerpc tree. > > I fixed it up (see below) and can carry the fix as necessary. This > is now fixed as far as linux-next is concerned, but any non trivial > conflicts should be mentioned to your upstream maintainer when your tree > is submitted for merging. You may also want to consider cooperating > with the maintainer of the conflicting tree to minimise any particularly > complex conflicts. -- Cheers, Stephen Rothwell diff --cc arch/m68k/mac/misc.c index 19e9d8eef1f2,28090a44fa09..3534aa6a4dc2 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@@ -90,11 -85,11 +90,11 @@@ static void cuda_write_pram(int offset } #endif /* CONFIG_ADB_CUDA */ - #ifdef CONFIG_ADB_PMU68K + #ifdef CONFIG_ADB_PMU -static long pmu_read_time(void) +static time64_t pmu_read_time(void) { struct adb_request req; - long time; + time64_t time; if (pmu_request(, NULL, 1, PMU_READ_RTC) < 0) return 0; pgpf0oTdwPhhP.pgp Description: OpenPGP digital signature
linux-next: manual merge of the powerpc tree with the m68k tree
Hi all, Today's linux-next merge of the powerpc tree got a conflict in: arch/m68k/mac/misc.c between commit: 5b9bfb8ec467 ("m68k: mac: Use time64_t in RTC handling") from the m68k tree and commit: ebd722275f9c ("macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver") from the powerpc tree. I fixed it up (see below) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. -- Cheers, Stephen Rothwell pgpZEHofoHrtx.pgp Description: OpenPGP digital signature
Re: [RFC 0/4] Virtio uses DMA API for all devices
On Wed, Aug 01, 2018 at 10:05:35AM +0100, Will Deacon wrote: > Hi Christoph, > > On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote: > > On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote: > > > On arm/arm64, the problem we have is that legacy virtio devices on the > > > MMIO > > > transport (so definitely not PCI) have historically been advertised by > > > qemu > > > as not being cache coherent, but because the virtio core has bypassed DMA > > > ops then everything has happened to work. If we blindly enable the arch > > > DMA > > > ops, > > > > No one is suggesting that as far as I can tell. > > Apologies: it's me that wants the DMA ops enabled to handle legacy devices > behind an IOMMU, but see below. > > > > we'll plumb in the non-coherent ops and start getting data corruption, > > > so we do need a way to quirk virtio as being "always coherent" if we want > > > to > > > use the DMA ops (which we do, because our emulation platforms have an > > > IOMMU > > > for all virtio devices). > > > > From all that I've gather so far: no you do not want that. We really > > need to figure out virtio "dma" interacts with the host / device. > > > > If you look at the current iommu spec it does talk of physical address > > with a little careveout for VIRTIO_F_IOMMU_PLATFORM. > > That's true, although that doesn't exist in the legacy virtio spec, and we > have an existing emulation platform which puts legacy virtio devices behind > an IOMMU. Currently, Linux is unable to boot on this platform unless the > IOMMU is configured as bypass. If we can use the coherent IOMMU DMA ops, > then it works perfectly. > > > So between that and our discussion in this thread and its previous > > iterations I think we need to stick to the current always physical, > > bypass system dma ops mode of virtio operation as the default. > > As above -- that means we hang during boot because we get stuck trying to > bring up a virtio-block device whose DMA is aborted by the IOMMU. 
The easy > answer is "just upgrade to latest virtio and advertise the presence of the > IOMMU". I'm pushing for that in future platforms, but it seems a shame not > to support the current platform, especially given that other systems do have > hacks in mainline to get virtio working. > > > We just need to figure out how to deal with devices that deviate > > from the default. One things is that VIRTIO_F_IOMMU_PLATFORM really > > should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu > > dma tweaks (offsets, cache flushing), which seems well in spirit of > > the original design. The other issue is VIRTIO_F_IO_BARRIER > > which is very vaguely defined, and which needs a better definition. > > And last but not least we'll need some text explaining the challenges > > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER > > is what would basically cover them, but a good description including > > an explanation of why these matter. > > I agree that this makes sense for future revisions of virtio (or perhaps > it can just be a clarification to virtio 1.0), but we're still left in the > dark with legacy devices and it would be nice to have them work on the > systems which currently exist, even if it's a legacy-only hack in the arch > code. > > Will Myself I'm sympathetic to this use-case and I see more uses to this than just legacy support. But more work is required IMHO. Will post tomorrow though - it's late here ... -- MST
Re: [RFC 0/4] Virtio uses DMA API for all devices
On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote: > On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote: > > On arm/arm64, the problem we have is that legacy virtio devices on the MMIO > > transport (so definitely not PCI) have historically been advertised by qemu > > as not being cache coherent, but because the virtio core has bypassed DMA > > ops then everything has happened to work. If we blindly enable the arch DMA > > ops, > > No one is suggesting that as far as I can tell. > > > we'll plumb in the non-coherent ops and start getting data corruption, > > so we do need a way to quirk virtio as being "always coherent" if we want to > > use the DMA ops (which we do, because our emulation platforms have an IOMMU > > for all virtio devices). > > >From all that I've gather so far: no you do not want that. We really > need to figure out virtio "dma" interacts with the host / device. > > If you look at the current iommu spec it does talk of physical address > with a little careveout for VIRTIO_F_IOMMU_PLATFORM. > > So between that and our discussion in this thread and its previous > iterations I think we need to stick to the current always physical, > bypass system dma ops mode of virtio operation as the default. > > We just need to figure out how to deal with devices that deviate > from the default. One things is that VIRTIO_F_IOMMU_PLATFORM really > should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu > dma tweaks (offsets, cache flushing), which seems well in spirit of > the original design. Well I wouldn't say that. VIRTIO_F_IOMMU_PLATFORM is for guest programmable protection which is designed for things like userspace drivers but still very much which a CPU doing the accesses. I think VIRTIO_F_IO_BARRIER needs to be extended to VIRTIO_F_PLATFORM_DMA. > The other issue is VIRTIO_F_IO_BARRIER > which is very vaguely defined, and which needs a better definition. 
> And last but not least we'll need some text explaining the challenges > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER > is what would basically cover them, but a good description including > an explanation of why these matter. I think VIRTIO_F_IOMMU_PLATFORM + VIRTIO_F_PLATFORM_DMA but yea. -- MST
Re: [RFC 0/4] Virtio uses DMA API for all devices
On Tue, Jul 31, 2018 at 03:36:22PM -0500, Benjamin Herrenschmidt wrote: > On Tue, 2018-07-31 at 10:30 -0700, Christoph Hellwig wrote: > > > However the question people raise is that DMA API is already full of > > > arch-specific tricks the likes of which are outlined in your post linked > > > above. How is this one much worse? > > > > None of these warts is visible to the driver, they are all handled in > > the architecture (possibly on a per-bus basis). > > > > So for virtio we really need to decide if it has one set of behavior > > as specified in the virtio spec, or if it behaves exactly as if it > > was on a PCI bus, or in fact probably both as you lined up. But no > > magic arch specific behavior inbetween. > > The only arch specific behaviour is needed in the case where it doesn't > behave like PCI. In this case, the PCI DMA ops are not suitable, but in > our secure VMs, we still need to make it use swiotlb in order to bounce > through non-secure pages. > > It would be nice if "real PCI" was the default I think you are mixing "real PCI" which isn't coded up yet and IOMMU bypass which is. IOMMU bypass will maybe with time become unnecessary since it seems that one can just program an IOMMU in a bypass mode instead. It's hard to blame you since right now if you disable IOMMU bypass you get a real PCI mode. But they are distinct and to allow people to enable IOMMU by default we will need to teach someone (virtio or DMA API) about this mode that does follow translation and protection rules in the IOMMU but runs on a CPU and so does not need cache flushes and whatnot. OTOH real PCI mode as opposed to default hypervisor mode does not perform as well when what you actually have is a hypervisor. So we'll likely have a mix of these two modes for a while. > but it's not, VMs are > created in "legacy" mode all the times and we don't know at VM creation > time whether it will become a secure VM or not. 
The way our secure VMs > work is that they start as a normal VM, load a secure "payload" and > call the Ultravisor to "become" secure. > > So we're in a bit of a bind here. We need that one-liner optional arch > hook to make virtio use swiotlb in that "IOMMU bypass" case. > > Ben. And just to make sure I understand, on your platform DMA APIs do include some of the cache flushing tricks and this is why you don't want to declare iommu support in the hypervisor? -- MST
[PATCH 19/21] perf tools: Allow overriding MAX_NR_CPUS at compile time
From: Christophe Leroy After update of kernel, the perf tool doesn't run anymore on my 32MB RAM powerpc board, but still runs on a 128MB RAM board: ~# strace perf execve("/usr/sbin/perf", ["perf"], [/* 12 vars */]) = -1 ENOMEM (Cannot allocate memory) --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} --- +++ killed by SIGSEGV +++ Segmentation fault objdump -x shows that .bss section has a huge size of 24Mbytes: 27 .bss 016baca8 101cebb8 101cebb8 001cd988 2**3 With especially the following objects having quite big size: 10205f80 l O .bss 0014 runtime_cycles_stats 10345f80 l O .bss 0014 runtime_stalled_cycles_front_stats 10485f80 l O .bss 0014 runtime_stalled_cycles_back_stats 105c5f80 l O .bss 0014 runtime_branches_stats 10705f80 l O .bss 0014 runtime_cacherefs_stats 10845f80 l O .bss 0014 runtime_l1_dcache_stats 10985f80 l O .bss 0014 runtime_l1_icache_stats 10ac5f80 l O .bss 0014 runtime_ll_cache_stats 10c05f80 l O .bss 0014 runtime_itlb_cache_stats 10d45f80 l O .bss 0014 runtime_dtlb_cache_stats 10e85f80 l O .bss 0014 runtime_cycles_in_tx_stats 10fc5f80 l O .bss 0014 runtime_transaction_stats 11105f80 l O .bss 0014 runtime_elision_stats 11245f80 l O .bss 0014 runtime_topdown_total_slots 11385f80 l O .bss 0014 runtime_topdown_slots_retired 114c5f80 l O .bss 0014 runtime_topdown_slots_issued 11605f80 l O .bss 0014 runtime_topdown_fetch_bubbles 11745f80 l O .bss 0014 runtime_topdown_recovery_bubbles This is due to commit 4d255766d28b1 ("perf: Bump max number of cpus to 1024"), because many tables are sized with MAX_NR_CPUS This patch gives the opportunity to redefine MAX_NR_CPUS via $ make EXTRA_CFLAGS=-DMAX_NR_CPUS=1 Signed-off-by: Christophe Leroy Cc: Alexander Shishkin Cc: Peter Zijlstra Cc: linuxppc-dev@lists.ozlabs.org Link: http://lkml.kernel.org/r/20170922112043.8349468...@po15668-vm-win7.idsi0.si.c-s.fr Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/perf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/perf.h 
b/tools/perf/perf.h index d215714f48df..21bf7f5a3cf5 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -25,7 +25,9 @@ static inline unsigned long long rdclock(void) return ts.tv_sec * 10ULL + ts.tv_nsec; } +#ifndef MAX_NR_CPUS #define MAX_NR_CPUS1024 +#endif extern const char *input_name; extern bool perf_host, perf_guest; -- 2.14.4
[GIT PULL 00/21] perf/core improvements and fixes
Hi Ingo, Please consider pulling, contains a recently merged tip/perf/urgent, - Arnaldo Test results at the end of this message, as usual. The following changes since commit c2586cfbb905939b79b49a9121fb0a59a5668fd6: Merge remote-tracking branch 'tip/perf/urgent' into perf/core (2018-07-31 09:55:45 -0300) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git tags/perf-core-for-mingo-4.19-20180801 for you to fetch changes up to b912885ab75c7c8aa841c615108afd755d0b97f8: perf trace: Do not require --no-syscalls to suppress strace like output (2018-08-01 16:20:28 -0300) perf/core improvements and fixes: perf trace: (Arnaldo Carvalho de Melo) - Do not require --no-syscalls to suppress strace like output, i.e. # perf trace -e sched:*switch will show just sched:sched_switch events, not strace-like formatted syscall events, use --syscalls to get the previous behaviour. If instead: # perf trace is used, i.e. no events specified, then --syscalls is implied and system wide strace like formatting will be applied to all syscalls. The behaviour when just a syscall subset is used with '-e' is unchanged: # perf trace -e *sleep,sched:*switch will work as before: just the 'nanosleep' syscall will be strace-like formatted plus the sched:sched_switch tracepoint event, system wide. - Allow string table generators to use a default header dir, allowing use of them without parameters to see the table it generates on stdout, e.g.: $ tools/perf/trace/beauty/kvm_ioctl.sh static const char *kvm_ioctl_cmds[] = { [0x00] = "GET_API_VERSION", [0x01] = "CREATE_VM", [0x02] = "GET_MSR_INDEX_LIST", [0x03] = "CHECK_EXTENSION", [0xe0] = "CREATE_DEVICE", [0xe1] = "SET_DEVICE_ATTR", [0xe2] = "GET_DEVICE_ATTR", [0xe3] = "HAS_DEVICE_ATTR", }; $ See 'ls tools/perf/trace/beauty/*.sh' to see the available string table generators. - Add a generator for IPPROTO_ socket's protocol constants. 
perf record: (Kan Liang) - Fix error out while applying initial delay and using LBR, due to the use of a PERF_TYPE_SOFTWARE/PERF_COUNT_SW_DUMMY event to track PERF_RECORD_MMAP events while waiting for the initial delay. Such events fail when configured asking PERF_SAMPLE_BRANCH_STACK in perf_event_attr.sample_type. perf c2c: (Jiri Olsa) - Fix report crash for empty browser, when processing a perf.data file without events of interest, either because not asked for in 'perf record' or because the workload didn't triggered such events. perf list: (Michael Petlan) - Align metric group description format with PMU event description. perf tests: (Sandipan Das) - Fix indexing when invoking subtests, which caused BPF tests to get results for the next test in the list, with the last one reporting a failure. eBPF: - Fix installation directory for header files included from eBPF proggies, avoiding clashing with relative paths used to build other software projects such as glibc. (Thomas Richter) - Show better message when failing to load an object. (Arnaldo Carvalho de Melo) General: (Christophe Leroy) - Allow overriding MAX_NR_CPUS at compile time, to make the tooling usable in systems with less memory, in time this has to be changed to properly allocate based on _NPROCESSORS_ONLN. Architecture specific: - Update arm64's ThunderX2 implementation defined pmu core events (Ganapatrao Kulkarni) - Fix complex event name parsing in 'perf test' for PowerPC, where the 'umask' event modifier isn't present. (Sandipan Das) CoreSight ARM hardware tracing: (Leo Yan) - Fix start tracing packet handling. - Support dummy address value for CS_ETM_TRACE_ON packet. - Generate branch sample when receiving a CS_ETM_TRACE_ON packet. - Generate branch sample for CS_ETM_TRACE_ON packet. 
Signed-off-by: Arnaldo Carvalho de Melo Arnaldo Carvalho de Melo (9): perf trace beauty: Default header_dir to cwd to work without parms tools include uapi: Grab a copy of linux/in.h perf beauty: Add a generator for IPPROTO_ socket's protocol constants perf trace beauty: Do not print NULL strarray entries perf trace beauty: Add beautifiers for 'socket''s 'protocol' arg perf trace: Beautify the AF_INET & AF_INET6 'socket' syscall 'protocol' args perf bpf: Show better message when failing to load an object perf bpf: Include uapi/linux/bpf.h from the 'perf trace' script's bpf.h perf trace: Do not require --no-syscalls to suppress strace like output Christophe Leroy (1): perf tools: Allow overriding MAX_NR_CPUS at compile time Ganapatrao Kulkarni (1): perf vendor events arm64: Update Thu
[PATCH v4 4/6] powerpc/traps: Print VMA for unhandled signals
This adds VMA address in the message printed for unhandled signals, similarly to what other architectures, like x86, print. Before this patch, a page fault looked like: pandafault[61470]: unhandled signal 11 at 17d0 nip 161c lr 7fff8d185100 code 2 After this patch, a page fault looks like: pandafault[6303]: segfault 11 at 17d0 nip 161c lr 7fff93c55100 code 2 in pandafault[1000+1] Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/kernel/traps.c | 21 +++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 4503e22f6ba5..bcefbb1ee771 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -96,6 +96,19 @@ EXPORT_SYMBOL(__debugger_fault_handler); #define TM_DEBUG(x...) do { } while(0) #endif +static const char *signame(int signr) +{ + switch (signr) { + case SIGBUS:return "bus error"; + case SIGFPE:return "floating point exception"; + case SIGILL:return "illegal instruction"; + case SIGSEGV: return "segfault"; + case SIGTRAP: return "unhandled trap"; + } + + return "unknown signal"; +} + /* * Trap & Exception support */ @@ -317,9 +330,13 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code, if (!show_unhandled_signals_ratelimited()) return; - pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x\n", - current->comm, current->pid, signr, + pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x", + current->comm, current->pid, signame(signr), signr, addr, regs->nip, regs->link, code); + + print_vma_addr(KERN_CONT " in ", regs->nip); + + pr_cont("\n"); } void _exception_pkey(int signr, struct pt_regs *regs, int code, -- 2.17.1
[PATCH v4 5/6] powerpc: Add show_user_instructions()
show_user_instructions() is a slightly modified version of show_instructions() that allows userspace instruction dump. This will be useful within show_signal_msg() to dump userspace instructions of the faulty location. Here is a sample of what show_user_instructions() outputs: pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 The current->comm and current->pid printed can serve as a glue that links the instructions dump to its originator, allowing messages to be interleaved in the logs. Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/include/asm/stacktrace.h | 13 + arch/powerpc/kernel/process.c | 40 +++ 2 files changed, 53 insertions(+) create mode 100644 arch/powerpc/include/asm/stacktrace.h diff --git a/arch/powerpc/include/asm/stacktrace.h b/arch/powerpc/include/asm/stacktrace.h new file mode 100644 index ..6149b53b3bc8 --- /dev/null +++ b/arch/powerpc/include/asm/stacktrace.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Stack trace functions. + * + * Copyright 2018, Murilo Opsfelder Araujo, IBM Corporation. 
+ */ + +#ifndef _ASM_POWERPC_STACKTRACE_H +#define _ASM_POWERPC_STACKTRACE_H + +void show_user_instructions(struct pt_regs *regs); + +#endif /* _ASM_POWERPC_STACKTRACE_H */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e9533b4d2f08..364645ac732c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1299,6 +1299,46 @@ static void show_instructions(struct pt_regs *regs) pr_cont("\n"); } +void show_user_instructions(struct pt_regs *regs) +{ + int i; + const char *prefix = KERN_INFO "%s[%d]: code: "; + unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * + sizeof(int)); + + printk(prefix, current->comm, current->pid); + + for (i = 0; i < instructions_to_print; i++) { + int instr; + + if (!(i % 8) && (i > 0)) { + pr_cont("\n"); + printk(prefix, current->comm, current->pid); + } + +#if !defined(CONFIG_BOOKE) + /* If executing with the IMMU off, adjust pc rather +* than print . +*/ + if (!(regs->msr & MSR_IR)) + pc = (unsigned long)phys_to_virt(pc); +#endif + + if (probe_kernel_address((unsigned int __user *)pc, instr)) { + pr_cont(" "); + } else { + if (regs->nip == pc) + pr_cont("<%08x> ", instr); + else + pr_cont("%08x ", instr); + } + + pc += sizeof(int); + } + + pr_cont("\n"); +} + struct regbit { unsigned long bit; const char *name; -- 2.17.1
[PATCH v4 6/6] powerpc/traps: Show instructions on exceptions
Call show_user_instructions() in arch/powerpc/kernel/traps.c to dump instructions at faulty location, useful to debugging. Before this patch, an unhandled signal message looked like: pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 code 2 in pandafault[1000+1] After this patch, it looks like: pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 code 2 in pandafault[1000+1] pandafault[10524]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe pandafault[10524]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/kernel/traps.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bcefbb1ee771..8494b0ff4904 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -70,6 +70,7 @@ #include #include #include +#include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) int (*__debugger)(struct pt_regs *regs) __read_mostly; @@ -337,6 +338,8 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code, print_vma_addr(KERN_CONT " in ", regs->nip); pr_cont("\n"); + + show_user_instructions(regs); } void _exception_pkey(int signr, struct pt_regs *regs, int code, -- 2.17.1
[PATCH v4 3/6] powerpc/traps: Use %lx format in show_signal_msg()
Use %lx format to print registers. This avoids having two different formats and avoids checking for MSR_64BIT, improving readability of the function. Even though we could have used %px, which is functionally equivalent to %lx as per Documentation/core-api/printk-formats.rst, it is not semantically correct because the data printed are not pointers. And using %px requires casting data to (void *). Besides that, %lx matches the format used in show_regs(). Before this patch: pandafault[4808]: unhandled signal 11 at 1718 nip 1574 lr 7fff935e7a6c code 2 After this patch: pandafault[4732]: unhandled signal 11 at 1718 nip 1574 lr 7fff86697a6c code 2 Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/kernel/traps.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 367534b41617..4503e22f6ba5 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -311,20 +311,15 @@ static bool show_unhandled_signals_ratelimited(void) static void show_signal_msg(int signr, struct pt_regs *regs, int code, unsigned long addr) { - const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \ - "at %08lx nip %08lx lr %08lx code %x\n"; - const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ - "at %016lx nip %016lx lr %016lx code %x\n"; - if (!unhandled_signal(current, signr)) return; if (!show_unhandled_signals_ratelimited()) return; - printk(regs->msr & MSR_64BIT ? fmt64 : fmt32, - current->comm, current->pid, signr, - addr, regs->nip, regs->link, code); + pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x\n", + current->comm, current->pid, signr, + addr, regs->nip, regs->link, code); } void _exception_pkey(int signr, struct pt_regs *regs, int code, -- 2.17.1
[PATCH v4 2/6] powerpc/traps: Use an explicit ratelimit state for show_signal_msg()
Replace printk_ratelimited() by printk() and a default rate limit burst to limit displaying unhandled signals messages. This will allow us to call print_vma_addr() in a future patch, which does not work with printk_ratelimited(). Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/kernel/traps.c | 21 - 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index cbd3dc365193..367534b41617 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -301,6 +301,13 @@ void user_single_step_siginfo(struct task_struct *tsk, info->si_addr = (void __user *)regs->nip; } +static bool show_unhandled_signals_ratelimited(void) +{ + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + return show_unhandled_signals && __ratelimit(); +} + static void show_signal_msg(int signr, struct pt_regs *regs, int code, unsigned long addr) { @@ -309,11 +316,15 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code, const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ "at %016lx nip %016lx lr %016lx code %x\n"; - if (show_unhandled_signals && unhandled_signal(current, signr)) { - printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, - current->comm, current->pid, signr, - addr, regs->nip, regs->link, code); - } + if (!unhandled_signal(current, signr)) + return; + + if (!show_unhandled_signals_ratelimited()) + return; + + printk(regs->msr & MSR_64BIT ? fmt64 : fmt32, + current->comm, current->pid, signr, + addr, regs->nip, regs->link, code); } void _exception_pkey(int signr, struct pt_regs *regs, int code, -- 2.17.1
[PATCH v4 1/6] powerpc/traps: Print unhandled signals in a separate function
Isolate the logic of printing unhandled signals out of _exception_pkey(). No functional change, only code rearrangement. Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/kernel/traps.c | 26 -- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 0e17dcb48720..cbd3dc365193 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -301,26 +301,32 @@ void user_single_step_siginfo(struct task_struct *tsk, info->si_addr = (void __user *)regs->nip; } +static void show_signal_msg(int signr, struct pt_regs *regs, int code, + unsigned long addr) +{ + const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %08lx nip %08lx lr %08lx code %x\n"; + const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ + "at %016lx nip %016lx lr %016lx code %x\n"; + + if (show_unhandled_signals && unhandled_signal(current, signr)) { + printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, + current->comm, current->pid, signr, + addr, regs->nip, regs->link, code); + } +} void _exception_pkey(int signr, struct pt_regs *regs, int code, - unsigned long addr, int key) +unsigned long addr, int key) { siginfo_t info; - const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \ - "at %08lx nip %08lx lr %08lx code %x\n"; - const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \ - "at %016lx nip %016lx lr %016lx code %x\n"; if (!user_mode(regs)) { die("Exception in kernel mode", regs, signr); return; } - if (show_unhandled_signals && unhandled_signal(current, signr)) { - printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32, - current->comm, current->pid, signr, - addr, regs->nip, regs->link, code); - } + show_signal_msg(signr, regs, code, addr); if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs)) local_irq_enable(); -- 2.17.1
[PATCH v4 0/6] powerpc: Modernize unhandled signals message
Hi, everyone. This series was inspired by the need to modernize and display more informative messages about unhandled signals. The "unhandled signal NN" is not very informative. We thought it would be helpful adding a human-readable message describing what the signal number means, printing the VMA address, and dumping the instructions. Before this series: pandafault32[4724]: unhandled signal 11 at 15e4 nip 1444 lr 0fe31ef4 code 2 pandafault64[4725]: unhandled signal 11 at 1718 nip 1574 lr 7fff7faa7a6c code 2 After this series: pandafault32[4753]: segfault (11) at 15e4 nip 1444 lr fe31ef4 code 2 in pandafault32[1000+1] pandafault32[4753]: code: 4b3c 6000 6042 4b30 9421ffd0 93e1002c 7c3f0b78 3d201000 pandafault32[4753]: code: 392905e4 913f0008 813f0008 39400048 <9949> 3920 7d234b78 397f0030 pandafault64[4754]: segfault (11) at 1718 nip 1574 lr 7fffb0007a6c code 2 in pandafault64[1000+1] pandafault64[4754]: code: e8010010 7c0803a6 4bfffef4 4bfffef0 fbe1fff8 f821ffb1 7c3f0b78 3d22fffe pandafault64[4754]: code: 39298818 f93f0030 e93f0030 39400048 <9949> 3920 7d234b78 383f0050 Link to v3: https://lore.kernel.org/lkml/20180731145020.14009-1-muri...@linux.ibm.com/ v3..v4: - Added new show_user_instructions() based on the existing show_instructions() - Updated commit messages - Replaced signals names table with a tiny function that returns a literal string for each signal number Cheers! Murilo Opsfelder Araujo (6): powerpc/traps: Print unhandled signals in a separate function powerpc/traps: Use an explicit ratelimit state for show_signal_msg() powerpc/traps: Use %lx format in show_signal_msg() powerpc/traps: Print VMA for unhandled signals powerpc: Add show_user_instructions() powerpc/traps: Show instructions on exceptions arch/powerpc/include/asm/stacktrace.h | 13 +++ arch/powerpc/kernel/process.c | 40 + arch/powerpc/kernel/traps.c | 52 +-- 3 files changed, 95 insertions(+), 10 deletions(-) create mode 100644 arch/powerpc/include/asm/stacktrace.h -- 2.17.1
Re: phandle_cache vs of_detach_node (was Re: [PATCH] powerpc/mobility: Fix node detach/rename problem)
On 08/01/2018 09:26 AM, Michael Ellerman wrote: > Frank Rowand writes: >> On 07/31/18 07:17, Rob Herring wrote: >>> On Tue, Jul 31, 2018 at 12:34 AM Michael Ellerman >>> wrote: Hi Rob/Frank, I think we might have a problem with the phandle_cache not interacting well with of_detach_node(): >>> >>> Probably needs a similar fix as this commit did for overlays: >>> >>> commit b9952b5218added5577e4a3443969bc20884cea9 >>> Author: Frank Rowand >>> Date: Thu Jul 12 14:00:07 2018 -0700 >>> >>> of: overlay: update phandle cache on overlay apply and remove >>> >>> A comment in the review of the patch adding the phandle cache said that >>> the cache would have to be updated when modules are applied and removed. >>> This patch implements the cache updates. >>> >>> Fixes: 0b3ce78e90fc ("of: cache phandle nodes to reduce cost of >>> of_find_node_by_phandle()") >>> Reported-by: Alan Tull >>> Suggested-by: Alan Tull >>> Signed-off-by: Frank Rowand >>> Signed-off-by: Rob Herring >> >> Agreed. Sorry about missing the of_detach_node() case. >> >> >>> Really what we need here is an "invalidate phandle" function rather >>> than free and re-allocate the whole damn cache. >> >> The big hammer approach was chosen to avoid the race conditions that >> would otherwise occur. OF does not have a locking strategy that >> would be able to protect against the races. >> >> We could maybe implement a slightly smaller hammer by (1) disabling >> the cache, (2) invalidate a phandle entry in the cache, (3) re-enable >> the cache. That is an off the cuff thought - I would have to look >> a little bit more carefully to make sure it would work. >> >> But I don't see a need to add the complexity of the smaller hammer >> or the bigger hammer of proper locking _unless_ we start seeing that >> the cache is being freed and re-allocated frequently. For overlays >> I don't expect the high frequency because it happens on a per overlay >> removal basis (not per node removal basis). 
For of_detach_node() the >> event _is_ on a per node removal basis. Michael, do you expect node >> removals to be a frequent event with low latency being important? If >> so, a rough guess on what the frequency would be? > > No in practice we expect them to be fairly rare and in the thousands at > most I think. > > TBH you could just disable the phandle_cache on the first detach and > that would be fine by me. I don't think we've ever noticed phandle > lookups being much of a problem for us on our hardware. I ran a couple of migrations with the calls to of_free_phandle_cache() ... of_populate_phandle_cache() bracketing the body of arch/powerpc/platforms/pseries/mobility.c:post_mobility_fixup() with a patch like, diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index e245a88..efc9442 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -22,6 +22,9 @@ #include #include "pseries.h" +extern int of_free_phandle_cache(void); +extern void of_populate_phandle_cache(void); + static struct kobject *mobility_kobj; struct update_props_workarea { @@ -343,6 +346,8 @@ void post_mobility_fixup(void) rc = rtas_call(activate_fw_token, 0, 1, NULL); } while (rtas_busy_delay(rc)); + of_free_phandle_cache(); + if (rc) printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc); @@ -354,6 +359,8 @@ void post_mobility_fixup(void) /* Possibly switch to a new RFI flush type */ pseries_setup_rfi_flush(); + of_populate_phandle_cache(); + return; } and did not observe the warnings/crashes that have been plaguing me. We will need to move their prototypes out of drivers/of/of_private.h to include/linux/of.h, though. > > cheers > Michael -- Michael W. Bringmann Linux Technology Center IBM Corporation Tie-Line 363-5196 External: (512) 286-5196 Cell: (512) 466-0650 m...@linux.vnet.ibm.com
Re: [PATCH] powerpc/pasemi: Search for PCI root bus by compatible property
On Wed, Aug 1, 2018 at 7:36 AM, Michael Ellerman wrote: > Christian Zigotzky writes: > >> Just for info: I tested it on my Nemo board today and it works. > > Awesome thanks. > > That's probably sufficient to merge it, and if it breaks anything we can > always revert it. Should be fine, all known boards have the properties in question, and I doubt anyone has anything but Nemo and Electra/Chitra boards out there. It's a virtual root bus, so all boards have it irrespective of what PCIe is brought out. (I should hook up my test system and get it into the CI cycle again, maybe this fall). Acked-by: Olof Johansson -Olof
Re: [RFC PATCH kernel 0/5] powerpc/P9/vfio: Pass through NVIDIA Tesla V100
On Wed, 1 Aug 2018 18:37:35 +1000 Alexey Kardashevskiy wrote: > On 01/08/2018 00:29, Alex Williamson wrote: > > On Tue, 31 Jul 2018 14:03:35 +1000 > > Alexey Kardashevskiy wrote: > > > >> On 31/07/2018 02:29, Alex Williamson wrote: > >>> On Mon, 30 Jul 2018 18:58:49 +1000 > >>> Alexey Kardashevskiy wrote: > After some local discussions, it was pointed out that force disabling > nvlinks won't bring us much as for an nvlink to work, both sides need to > enable it so malicious guests cannot penetrate good ones (or a host) > unless a good guest enabled the link but won't happen with a well > behaving guest. And if two guests became malicious, then can still only > harm each other, and so can they via other ways such network. This is > different from PCIe as once PCIe link is unavoidably enabled, a well > behaving device cannot firewall itself from peers as it is up to the > upstream bridge(s) now to decide the routing; with nvlink2, a GPU still > has means to protect itself, just like a guest can run "firewalld" for > network. > > Although it would be a nice feature to have an extra barrier between > GPUs, is inability to block the links in hypervisor still a blocker for > V100 pass through? > >>> > >>> How is the NVLink configured by the guest, is it 'on'/'off' or are > >>> specific routes configured? > >> > >> The GPU-GPU links need not to be blocked and need to be enabled > >> (==trained) by a driver in the guest. There are no routes between GPUs > >> in NVLink fabric, these are direct links, it is just a switch on each > >> side, both switches need to be on for a link to work. > > > > Ok, but there is at least the possibility of multiple direct links per > > GPU, the very first diagram I find of NVlink shows 8 interconnected > > GPUs: > > > > https://www.nvidia.com/en-us/data-center/nvlink/ > > Out design is like the left part of the picture but it is just a detail. 
Unless we can specifically identify a direct link vs a mesh link, we shouldn't be making assumptions about the degree of interconnect. > > So if each switch enables one direct, point to point link, how does the > > guest know which links to open for which peer device? > > It uses PCI config space on GPUs to discover the topology. So do we need to virtualize this config space if we're going to virtualize the topology? > > And of course > > since we can't see the spec, a security audit is at best hearsay :-\ > > Yup, the exact discovery protocol is hidden. It could be reverse engineered... > >> The GPU-CPU links - the GPU bit is the same switch, the CPU NVlink state > >> is controlled via the emulated PCI bridges which I pass through together > >> with the GPU. > > > > So there's a special emulated switch, is that how the guest knows which > > GPUs it can enable NVLinks to? > > Since it only has PCI config space (there is nothing relevant in the > device tree at all), I assume (double checking with the NVIDIA folks > now) the guest driver enables them all, tests which pair works and > disables the ones which do not. This gives a malicious guest a tiny > window of opportunity to break into a good guest. Hm :-/ Let's not minimize that window, that seems like a prime candidate for an exploit. > >>> If the former, then isn't a non-malicious > >>> guest still susceptible to a malicious guest? > >> > >> A non-malicious guest needs to turn its switch on for a link to a GPU > >> which belongs to a malicious guest. > > > > Actual security, or obfuscation, will we ever know... > If the latter, how is > >>> routing configured by the guest given that the guest view of the > >>> topology doesn't match physical hardware? Are these routes > >>> deconfigured by device reset? Are they part of the save/restore > >>> state? Thanks, > > > > Still curious what happens to these routes on reset. Can a later user > > of a GPU inherit a device where the links are already enabled? 
Thanks, > > I am told that the GPU reset disables links. As a side effect, we get an > HMI (a hardware fault which reset the host machine) when trying > accessing the GPU RAM which indicates that the link is down as the > memory is only accessible via the nvlink. We have special fencing code > in our host firmware (skiboot) to fence this memory on PCI reset so > reading from it returns zeroes instead of HMIs. What sort of reset is required for this? Typically we rely on secondary bus reset for GPUs, but it would be a problem if GPUs were to start implementing FLR and nobody had a spec to learn that FLR maybe didn't disable the link. The better approach to me still seems to be virtualizing these NVLink config registers to an extent that the user can only enabling links where they have ownership of both ends of the connection. Thanks, Alex
Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()
Hi, Christophe. On Wed, Aug 01, 2018 at 08:41:15AM +0200, Christophe LEROY wrote: > > > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : > > Remove "Instruction dump:" line by adding a prefix to display current->comm > > and current->pid, along with the instructions dump. > > > > The prefix can serve as a glue that links the instructions dump to its > > originator, allowing messages to be interleaved in the logs. > > > > Before this patch, a page fault looked like: > > > >pandafault[10524]: segfault (11) at 17d0 nip 161c lr > > 7fffbd295100 code 2 in pandafault[1000+1] > >Instruction dump: > >4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe > >392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 > > > > After this patch, it looks like: > > > >pandafault[10850]: segfault (11) at 17d0 nip 161c lr > > 7fff9f3e5100 code 2 in pandafault[1000+1] > >pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 > > f821ffc1 7c3f0b78 3d22fffe > >pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> > > 3920 7d234b78 383f0040 > > > > Signed-off-by: Murilo Opsfelder Araujo > > Does the script scripts/decode_stacktrace.sh also works with this format > change ? I've got more feedback from Michael about this. A better approach would be having a new show_user_instructions(), a slightly modified version of show_instructions(), that can be called from within show_signal_msg(). Since _exception_pkey() dies if the exception is in kernel mode, we'll be safe to call the new show_user_instructions(), without interfering in scripts/decode_stacktrace.sh. Cheers Murilo
Re: [PATCH] powerpc/4xx: Fix error return path in ppc4xx_msi_probe()
Looks good, Reviewed-by: Christoph Hellwig
Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
Hi, Segher. On Wed, Aug 01, 2018 at 02:49:03AM -0500, Segher Boessenkool wrote: > On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote: > > On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote: > > > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : > > > I would suggest to instead use a function like this: > > > > > > static const char *signame(int signr) > > > { > > > if (signr == SIGBUS) > > > return "bus error"; > > > if (signr == SIGFPE) > > > return "floating point exception"; > > > if (signr == SIGILL) > > > return "illegal instruction"; > > > if (signr == SIGILL) > > > return "segfault"; > > > if (signr == SIGTRAP) > > > return "unhandled trap"; > > > return "unknown signal"; > > > } > > > > trivia: > > > > Unless the if tests are ordered most to least likely, > > perhaps it would be better to use a switch/case and > > let the compiler decide. > > That would also show there are two entries for SIGILL (here and in the > original patch), one of them very wrong. Good catch. I'll take care of that in my next respin. > Check the table with psignal or something? Nice suggestion. Thanks! Cheers Murilo
Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote: > On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote: > > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : > > > This adds a human-readable name in the unhandled signal message. > > > Before this patch, a page fault looked like: > > >pandafault[6303]: unhandled signal 11 at 17d0 nip 161c lr > > > 7fff93c55100 code 2 in pandafault[1000+1] > > > After this patch, a page fault looks like: > > >pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr > > > 7fffb63e5100 code 2 in pandafault[13a2a+1] > ]] > > > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c > [] > > > @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler); > > > #define TM_DEBUG(x...) do { } while(0) > > > #endif > > > > > > +static const char *signames[SIGRTMIN + 1] = { > > > + "UNKNOWN", > > > + "SIGHUP", // 1 > > > + "SIGINT", // 2 > [] > > I don't think is is worth having that full table when we only use a few > > of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/) > > > > I would suggest to instead use a function like this: > > > > static const char *signame(int signr) > > { > > if (signr == SIGBUS) > > return "bus error"; > > if (signr == SIGFPE) > > return "floating point exception"; > > if (signr == SIGILL) > > return "illegal instruction"; > > if (signr == SIGILL) > > return "segfault"; > > if (signr == SIGTRAP) > > return "unhandled trap"; > > return "unknown signal"; > > } > > trivia: > > Unless the if tests are ordered most to least likely, > perhaps it would be better to use a switch/case and > let the compiler decide. > > switch (signr) { > case SIGBUS:return "bus error"; > case SIGFPE:return "floating point exception"; > case SIGILL:return "illegal instruction"; > case SIGSEGV: return "segfault"; > case SIGTRAP: return "unhandled trap"; > } > return "unknown signal"; > } > Hi, Joe, Christophe. That's a nice enhancement. I'll do that in my next respin. 
Cheers Murilo
Re: [PATCH] perf tools: allow overriding MAX_NR_CPUS at compile time
Em Wed, Aug 01, 2018 at 11:37:30AM +0200, Christophe LEROY escreveu: > > > Le 03/05/2018 à 15:40, Arnaldo Carvalho de Melo a écrit : > > Em Fri, Sep 22, 2017 at 01:20:43PM +0200, Christophe Leroy escreveu: > > > After update of kernel, perf tool doesn't run anymore on my > > > 32MB RAM powerpc board, but still runs on a 128MB RAM board: > > > > Cleaning up my inbox, found this one, simple enough, still applies, > > applied. > > Did you finally apply it ? I can't see it in linux-next. Will it be merged > into 4.19 ? Sure, applied it finally, - Arnaldo
Re: [PATCH] powerpc/pasemi: Search for PCI root bus by compatible property
Christian Zigotzky writes: > Just for info: I tested it on my Nemo board today and it works. Awesome thanks. That's probably sufficient to merge it, and if it breaks anything we can always revert it. cheers > On 31 July 2018 at 2:04PM, Michael Ellerman wrote: >> Michael Ellerman writes: >>> Darren Stevens writes: >>> Pasemi arch code finds the root of the PCI-e bus by searching the device-tree for a node called 'pxp'. But the root bus has a compatible property of 'pasemi,rootbus' so search for that instead. Signed-off-by: Darren Stevens --- This works on the Amigaone X1000, I don't know if this method of finding the pci bus was there because of earlier firmwares. >>> Does anyone have another pasemi board they can test this on? >>> >>> The last time I plugged mine in it popped the power supply and took out >>> power to half the office :) - I haven't had a chance to try it since. >> I actually remembered I have a device tree lying around from an electra. >> >> It has: >> >>[I] home:pxp@0,8000(7)(I)> lsprop name compatible >>name "pxp" >>compatible "pasemi,rootbus" >> "pa-pxp" >> >> >> So it looks like the patch would work fine on it at least. 
>> >> cheers >> diff --git a/arch/powerpc/platforms/pasemi/pci.c b/arch/powerpc/platforms/pasemi/pci.c index c7c8607..be62380 100644 --- a/arch/powerpc/platforms/pasemi/pci.c +++ b/arch/powerpc/platforms/pasemi/pci.c @@ -216,6 +216,7 @@ static int __init pas_add_bridge(struct device_node *dev) void __init pas_pci_init(void) { struct device_node *np, *root; + int res; root = of_find_node_by_path("/"); if (!root) { @@ -226,11 +227,11 @@ void __init pas_pci_init(void) pci_set_flags(PCI_SCAN_ALL_PCIE_DEVS); - for (np = NULL; (np = of_get_next_child(root, np)) != NULL;) - if (np->name && !strcmp(np->name, "pxp") && !pas_add_bridge(np)) - of_node_get(np); - - of_node_put(root); + np = of_find_compatible_node(root, NULL, "pasemi,rootbus"); + if (np) { + res = pas_add_bridge(np); + of_node_put(np); + } } void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset)
Re: [PATCH] powerpc/mobility: Fix node detach/rename problem
Tyrel Datwyler writes: > On 07/30/2018 11:42 PM, Michael Ellerman wrote: >> Tyrel Datwyler writes: >>> On 07/29/2018 06:11 AM, Michael Bringmann wrote: During LPAR migration, the content of the device tree/sysfs may be updated including deletion and replacement of nodes in the tree. When nodes are added to the internal node structures, they are appended in FIFO order to a list of nodes maintained by the OF code APIs. When nodes are removed from the device tree, they are marked OF_DETACHED, but not actually deleted from the system to allow for pointers cached elsewhere in the kernel. The order and content of the entries in the list of nodes is not altered, though. During LPAR migration some common nodes are deleted and re-added e.g. "ibm,platform-facilities". If a node is re-added to the OF node lists, the of_attach_node function checks to make sure that the name + ibm,phandle of the to-be-added data is unique. As the previous copy of a re-added node is not modified beyond the addition of a bit flag, the code (1) finds the old copy, (2) prints a WARNING notice to the console, (3) renames the to-be-added node to avoid filename collisions within a directory, and (3) adds entries to the sysfs/kernfs. >>> >>> So, this patch actually just band aids over the real problem. This is >>> a long standing problem with several PFO drivers leaking references. >>> The issue here is that, during the device tree update that follows a >>> migration. the update of the ibm,platform-facilities node and friends >>> below are always deleted and re-added on the destination lpar and >>> subsequently the leaked references prevent the devices nodes from >>> every actually being properly cleaned up after detach. Thus, leading >>> to the issue you are observing. > > So, it was the end of the day, and I kind of glossed over the issue > Michael was trying to address with an issue that I remembered that had > been around for awhile. 
Sure, I know that feeling :) >> Leaking references shouldn't affect the node being detached from the >> tree though. > > No, but it does prevent it from being freed from sysfs which leads to > the sysfs entry renaming that happens when another node with the same > name is attached. OK. >> See of_detach_node() calling __of_detach_node(), none of that depends on >> the refcount. >> >> It's only the actual freeing of the node, in of_node_release() that is >> prevented by leaked reference counts. > > Right, but if we did refcount correctly there is the new problem that > the node is freed and now the phandle cache points at freed memory in > the case were no invalidation is done. Yeah that's bad. I guess no one is supposed to lookup that phandle anymore, but it's super fragile. >> So I agree we need to do a better job with the reference counting, but I >> don't see how it is causing the problem here > > Now in regards to the phandle caching somehow I missed that code going > into OF earlier this year. That should have had at least some > discussion from our side based on the fact that it is built on dtc > compiler assumption that there are a set number of phandles that are > allocated starting at 1..n such that they are monotonically > increasing. That has a nice fixed size with O(1) lookup time. Phyp > doesn't guarantee any sort of niceness with nicely ordered phandles. > From what I've seen there are a subset of phandles that decrease from > (-1) monotonically, and then there are a bunch of random values for > cpus and IOAs. Thinking on it might not be that big of a deal as we > just end up in the case where we have a phandle collision one makes it > into the cache and is optimized while the other doesn't. On another > note, they clearly hit a similar issue during overlay attach and > remove, and as Rob pointed out their solution to handle it is a full > cache invalidation followed by rescanning the whole tree to rebuild > it. 
Seeing as our dynamic lifecycle is node by node this definitely > adds some overhead. Yeah I also should have noticed it going in, but despite being subscribed to the devicetree list I didn't spot it in the flood. AIUI the 1..n assumption is not about correctness but if our phandles violate that we may not be getting any benefit. Anyway yeah lots of things to look at tomorrow :) cheers
Re: phandle_cache vs of_detach_node (was Re: [PATCH] powerpc/mobility: Fix node detach/rename problem)
Frank Rowand writes: > On 07/31/18 07:17, Rob Herring wrote: >> On Tue, Jul 31, 2018 at 12:34 AM Michael Ellerman >> wrote: >>> >>> Hi Rob/Frank, >>> >>> I think we might have a problem with the phandle_cache not interacting >>> well with of_detach_node(): >> >> Probably needs a similar fix as this commit did for overlays: >> >> commit b9952b5218added5577e4a3443969bc20884cea9 >> Author: Frank Rowand >> Date: Thu Jul 12 14:00:07 2018 -0700 >> >> of: overlay: update phandle cache on overlay apply and remove >> >> A comment in the review of the patch adding the phandle cache said that >> the cache would have to be updated when modules are applied and removed. >> This patch implements the cache updates. >> >> Fixes: 0b3ce78e90fc ("of: cache phandle nodes to reduce cost of >> of_find_node_by_phandle()") >> Reported-by: Alan Tull >> Suggested-by: Alan Tull >> Signed-off-by: Frank Rowand >> Signed-off-by: Rob Herring > > Agreed. Sorry about missing the of_detach_node() case. > > >> Really what we need here is an "invalidate phandle" function rather >> than free and re-allocate the whole damn cache. > > The big hammer approach was chosen to avoid the race conditions that > would otherwise occur. OF does not have a locking strategy that > would be able to protect against the races. > > We could maybe implement a slightly smaller hammer by (1) disabling > the cache, (2) invalidate a phandle entry in the cache, (3) re-enable > the cache. That is an off the cuff thought - I would have to look > a little bit more carefully to make sure it would work. > > But I don't see a need to add the complexity of the smaller hammer > or the bigger hammer of proper locking _unless_ we start seeing that > the cache is being freed and re-allocated frequently. For overlays > I don't expect the high frequency because it happens on a per overlay > removal basis (not per node removal basis). For of_detach_node() the > event _is_ on a per node removal basis. 
Michael, do you expect node > removals to be a frequent event with low latency being important? If > so, a rough guess on what the frequency would be? No in practice we expect them to be fairly rare and in the thousands at most I think. TBH you could just disable the phandle_cache on the first detach and that would be fine by me. I don't think we've ever noticed phandle lookups being much of a problem for us on our hardware. cheers
Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()
Christophe LEROY writes: > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : >> Remove "Instruction dump:" line by adding a prefix to display current->comm >> and current->pid, along with the instructions dump. >> >> The prefix can serve as a glue that links the instructions dump to its >> originator, allowing messages to be interleaved in the logs. >> >> Before this patch, a page fault looked like: >> >>pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 >> code 2 in pandafault[1000+1] >>Instruction dump: >>4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe >>392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 >> >> After this patch, it looks like: >> >>pandafault[10850]: segfault (11) at 17d0 nip 161c lr 7fff9f3e5100 >> code 2 in pandafault[1000+1] >>pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 >> f821ffc1 7c3f0b78 3d22fffe >>pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> >> 3920 7d234b78 383f0040 >> >> Signed-off-by: Murilo Opsfelder Araujo > > Does the script scripts/decode_stacktrace.sh also works with this format > change ? Did it work before? I've never used it. Would be great if it did work though. cheers
Re: [PATCH v2 2/2] powerpc/pseries: Wait for completion of hotplug events during PRRN handling
John Allen writes: > On Mon, Jul 23, 2018 at 11:41:24PM +1000, Michael Ellerman wrote: >>John Allen writes: >> >>> While handling PRRN events, the time to handle the actual hotplug events >>> dwarfs the time it takes to perform the device tree updates and queue the >>> hotplug events. In the case that PRRN events are being queued continuously, >>> hotplug events have been observed to be queued faster than the kernel can >>> actually handle them. This patch avoids the problem by waiting for a >>> hotplug request to complete before queueing more hotplug events. Have you tested this patch in isolation, ie. not with patch 1? >>So do we need the hotplug work queue at all? Can we just call >>handle_dlpar_errorlog() directly? >> >>Or are we using the work queue to serialise things? And if so would a >>mutex be better? > > Right, the workqueue is meant to serialize all hotplug events and it > gets used for more than just PRRN events. I believe the motivation for > using the workqueue over a mutex is that KVM guests initiate hotplug > events through the hotplug interrupt and can queue fairly large requests > meaning that in this scenario, waiting for a lock would block interrupts > for a while. OK, but that just means that path needs to schedule work to run later. > Using the workqueue allows us to serialize hotplug events > from different sources in the same way without worrying about the > context in which the event is generated. A lock would be so much simpler. It looks like we have three callers of queue_hotplug_event(), the dlpar code, the mobility code and the ras interrupt. The dlpar code already waits synchronously: init_completion(_done); queue_hotplug_event(hp_elog, _done, ); wait_for_completion(_done); You're changing mobility to do the same (this patch), leaving only the ras interrupt that actually queues work and returns. So it really seems like a mutex would do the trick, and the ras interrupt would be the only case that needs to schedule work for later. cheers
Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
On 2018-08-01 05:54 AM, Christoph Hellwig wrote: On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote: Platforms with a PCI bus will be offered the RapidIO menu since they may want support for a RapidIO PCI device. Platforms without a PCI bus that might include a RapidIO IP block will need to "select HAS_RAPIDIO" in the platform-/machine-specific "config ARCH_*" Kconfig entry. Tested that kernel builds for arm64 with RapidIO subsystem and switch drivers enabled, also that the modules load successfully on a custom Aarch64 Qemu model. As said before, please include it from drivers/Kconfig so that _all_ architectures supporting PCI (or other Rapidio attachments) get it and not some arbitrary selection of architectures. As was replied earlier, this is not a random selection of architectures but only ones that implement support for RapidIO as system bus. If other architectures choose to adopt RapidIO we will include them as well. On some platforms RapidIO can be the only system bus available replacing PCI/PCIe or RapidIO can coexist with PCIe. As it is done now, RapidIO is configured in "Bus Options" (x86/PPC) or "Bus Support" (ARMs) sub-menu and from system configuration option it should be kept this way. Current location of RAPIDIO configuration option is familiar to users of PowerPC and x86 platforms, and is similarly available in some ARM manufacturers' kernel code trees. drivers/Kconfig will be used for configuring drivers for peripheral RapidIO devices if/when such device drivers are published.
Re: [PATCH v2 1/2] powerpc/pseries: Avoid blocking rtas polling handling multiple PRRN events
Hi John, I'm still not sure about this one. John Allen writes: > On Mon, Jul 23, 2018 at 11:27:56PM +1000, Michael Ellerman wrote: >>Hi John, >> >>I'm a bit puzzled by this one. >> >>John Allen writes: >>> When a PRRN event is being handled and another PRRN event comes in, the >>> second event will block rtas polling waiting on the first to complete, >>> preventing any further rtas events from being handled. This can be >>> especially problematic in case that PRRN events are continuously being >>> queued in which case rtas polling gets indefinitely blocked completely. >>> >>> This patch introduces a mutex that prevents any subsequent PRRN events from >>> running while there is a prrn event being handled, allowing rtas polling to >>> continue normally. >>> >>> Signed-off-by: John Allen >>> --- >>> v2: >>> -Unlock prrn_lock when PRRN operations are complete, not after handler is >>>scheduled. >>> -Remove call to flush_work, the previous broken method of serializing >>>PRRN events. >>> --- >>> arch/powerpc/kernel/rtasd.c | 10 +++--- >>> 1 file changed, 7 insertions(+), 3 deletions(-) >>> >>> diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c >>> index 44d66c33d59d..845fc5aec178 100644 >>> --- a/arch/powerpc/kernel/rtasd.c >>> +++ b/arch/powerpc/kernel/rtasd.c >>> @@ -284,15 +286,17 @@ static void prrn_work_fn(struct work_struct *work) >>> */ >>> pseries_devicetree_update(-prrn_update_scope); >>> numa_update_cpu_topology(false); >>> + mutex_unlock(_lock); >>> } >>> >>> static DECLARE_WORK(prrn_work, prrn_work_fn); >>> >>> static void prrn_schedule_update(u32 scope) >>> { >>> - flush_work(_work); >> >>This seems like it's actually the core of the change. Previously we were >>basically blocking on the flush before continuing. > > The idea here is to replace the blocking flush_work with a non-blocking > mutex. So rather than waiting on the running PRRN event to complete, we > bail out since a PRRN event is already running. 
OK, but why is it OK to bail out? The firmware sent you an error log asking you to do something, with a scope value that has some meaning, and now you're just going to drop that on the floor? Maybe it is OK to just drop these events? Or maybe you're saying that because the system is crashing under the load of too many events it's OK to drop the events in this case. > The situation this is > meant to address is flooding the workqueue with PRRN events, which like > the situation in patch 2/2, these can be queued up faster than they can > actually be handled. I'm not really sure why this is a problem though. The current code synchronously processes the events, so there should only ever be one in flight. I guess the issue is that each one can queue multiple events on the hotplug work queue? But still, we have terabytes of RAM, we should be able to queue a lot of events before it becomes a problem. So what exactly is getting flooded, what's the symptom? If the queuing of the hotplug events is the problem, then why don't we stop doing that? We could just process them synchronously from the PRRN update, that would naturally throttle them. cheers
Re: [PATCH v5 00/11] hugetlb: Factorize hugetlb architecture primitives
On 07/31/2018 10:06 PM, Luiz Capitulino wrote: On Tue, 31 Jul 2018 06:01:44 + Alexandre Ghiti wrote: [CC linux-mm for inclusion in -mm tree] In order to reduce copy/paste of functions across architectures and then make riscv hugetlb port (and future ports) simpler and smaller, this patchset intends to factorize the numerous hugetlb primitives that are defined across all the architectures. [...] 15 files changed, 139 insertions(+), 382 deletions(-) I imagine you're mostly interested in non-x86 review at this point, but as this series is looking amazing: Reviewed-by: Luiz Capitulino It's always good to have another feedback :) Thanks for your review Luiz, Alex
Re: [PATCH] powerpc/powernv/opal: Use standard interrupts property when available
Benjamin Herrenschmidt writes: > diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c > b/arch/powerpc/platforms/powernv/opal-irqchip.c > index 9d1b8c0aaf93..46785eaf625d 100644 > --- a/arch/powerpc/platforms/powernv/opal-irqchip.c > +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c > @@ -216,67 +214,91 @@ int __init opal_event_init(void) ... > > /* Install interrupt handlers */ > for (i = 0; i < opal_irq_count; i++) { > - unsigned int virq; > - char *name; > + struct resource *r = _irqs[i]; > + const char *name; > > - /* Get hardware and virtual IRQ */ > - virq = irq_create_mapping(NULL, irqs[i]); > - if (!virq) { > - pr_warn("Failed to map irq 0x%x\n", irqs[i]); > - continue; > - } > - > - if (names[i] && strlen(names[i])) > - name = kasprintf(GFP_KERNEL, "opal-%s", names[i]); > + /* Prefix name */ > + if (r->name) > + name = kasprintf(GFP_KERNEL, "opal-%s", r->name); > else > name = kasprintf(GFP_KERNEL, "opal"); I'm getting: root@p85:/proc/irq# find . -name '*opal*' ... ./63/opal-ipmi ./227/opal- ./228/opal- ./231/opal- ./232/opal- ./247/opal- ./248/opal- ./483/opal- ./484/opal- ./487/opal- ./488/opal- ./500/opal- ./510/opal- ./511/opal- ./512/opal- I fixed it with: diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 41c3303157f7..a2d067925140 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -275,7 +275,7 @@ int __init opal_event_init(void) const char *name; /* Prefix name */ - if (r->name) + if (r->name && strlen(r->name)) name = kasprintf(GFP_KERNEL, "opal-%s", r->name); else name = kasprintf(GFP_KERNEL, "opal"); cheers
Re: Infinite looping observed in __offline_pages
On Wed 01-08-18 21:09:39, Michael Ellerman wrote: > Michal Hocko writes: > > On Wed 25-07-18 13:11:15, John Allen wrote: > > [...] > >> Does a failure in do_migrate_range indicate that the range is unmigratable > >> and the loop in __offline_pages should terminate and goto failed_removal? > >> Or > >> should we allow a certain number of retrys before we > >> give up on migrating the range? > > > > Unfortunatelly not. Migration code doesn't tell a difference between > > ephemeral and permanent failures. > > What's to stop an ephemeral failure happening repeatedly? If there is a short term pin on the page that prevents the migration then the holder of the pin should release it and the next retry will succeed the migration. If the page gets freed on the way then it will not be reallocated because they are isolated already. I can only see complete OOM to be the reason to fail allocation of the target place as the migration failure and that is highly unlikely and sooner or later trigger the oom killer and release some memory. The biggest problem here is that we cannot tell ephemeral and long term pins... -- Michal Hocko SUSE Labs
Re: Infinite looping observed in __offline_pages
Michal Hocko writes: > On Wed 25-07-18 13:11:15, John Allen wrote: > [...] >> Does a failure in do_migrate_range indicate that the range is unmigratable >> and the loop in __offline_pages should terminate and goto failed_removal? Or >> should we allow a certain number of retrys before we >> give up on migrating the range? > > Unfortunatelly not. Migration code doesn't tell a difference between > ephemeral and permanent failures. What's to stop an ephemeral failure happening repeatedly? cheers
RE: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization
Hi Richard, > -Original Message- > From: Richard Cochran [mailto:richardcoch...@gmail.com] > Sent: Wednesday, August 1, 2018 2:15 PM > To: Y.b. Lu > Cc: net...@vger.kernel.org; Madalin-cristian Bucur > ; Rob Herring ; Shawn Guo > ; David S . Miller ; > devicet...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; > linux-arm-ker...@lists.infradead.org; linux-ker...@vger.kernel.org > Subject: Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for > initialization > > On Wed, Aug 01, 2018 at 04:36:40AM +, Y.b. Lu wrote: > > > Could I add a function to calculate a set of default register values > > to initialize ptp timer when dts method failed to get required > > properties in driver? > > Yes, it would be ideal if the driver can pick correct values automatically. > > However, the frequency on the FIPER outputs can't be configured > automatically, and we don't have an API for the user to choose this. [Y.b. Lu] Thanks a lot. Just let me send out the v2 patch for your reviewing. > > > I think this will be useful. The ptp timer on new platforms (you may > > see two dts patches in this patchset. Many platforms will be > > affected.) will work without these dts properties. If user want > > specific setting, they can set dts properties. > > Sure. > > Thanks, > Richard
[v2, 3/3] ptp_qoriq: support automatic configuration for ptp timer
This patch is to support automatic configuration for ptp timer. If required ptp dts properties are not provided, driver could try to calculate a set of default configurations to initialize the ptp timer. This makes the driver work for many boards which don't have the required ptp dts properties in current kernel. Also the users could set dts properties by themselves according to their requirement. Signed-off-by: Yangbo Lu --- Changes for v2: - Dropped module_param. --- drivers/ptp/ptp_qoriq.c | 111 +++- include/linux/fsl/ptp_qoriq.h |6 ++- 2 files changed, 113 insertions(+), 4 deletions(-) diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c index a14c317..095c185 100644 --- a/drivers/ptp/ptp_qoriq.c +++ b/drivers/ptp/ptp_qoriq.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -317,6 +318,105 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp, .enable = ptp_qoriq_enable, }; +/** + * qoriq_ptp_nominal_freq - calculate nominal frequency according to + * reference clock frequency + * + * @clk_src: reference clock frequency + * + * The nominal frequency is the desired clock frequency. + * It should be less than the reference clock frequency. + * It should be a factor of 1000MHz. + * + * Return the nominal frequency + */ +static u32 qoriq_ptp_nominal_freq(u32 clk_src) +{ + u32 remainder = 0; + + clk_src /= 100; + remainder = clk_src % 100; + if (remainder) { + clk_src -= remainder; + clk_src += 100; + } + + do { + clk_src -= 100; + + } while (1000 % clk_src); + + return clk_src * 100; +} + +/** + * qoriq_ptp_auto_config - calculate a set of default configurations + * + * @qoriq_ptp: pointer to qoriq_ptp + * @node: pointer to device_node + * + * If below dts properties are not provided, this function will be + * called to calculate a set of default configurations for them. 
+ * "fsl,tclk-period" + * "fsl,tmr-prsc" + * "fsl,tmr-add" + * "fsl,tmr-fiper1" + * "fsl,tmr-fiper2" + * "fsl,max-adj" + * + * Return 0 if success + */ +static int qoriq_ptp_auto_config(struct qoriq_ptp *qoriq_ptp, +struct device_node *node) +{ + struct clk *clk; + u64 freq_comp; + u64 max_adj; + u32 nominal_freq; + u32 clk_src = 0; + + qoriq_ptp->cksel = DEFAULT_CKSEL; + + clk = of_clk_get(node, 0); + if (!IS_ERR(clk)) { + clk_src = clk_get_rate(clk); + clk_put(clk); + } + + if (clk_src <= 1UL) { + pr_err("error reference clock value, or lower than 100MHz\n"); + return -EINVAL; + } + + nominal_freq = qoriq_ptp_nominal_freq(clk_src); + if (!nominal_freq) + return -EINVAL; + + qoriq_ptp->tclk_period = 10UL / nominal_freq; + qoriq_ptp->tmr_prsc = DEFAULT_TMR_PRSC; + + /* Calculate initial frequency compensation value for TMR_ADD register. +* freq_comp = ceil(2^32 / freq_ratio) +* freq_ratio = reference_clock_freq / nominal_freq +*/ + freq_comp = ((u64)1 << 32) * nominal_freq; + if (do_div(freq_comp, clk_src)) + freq_comp++; + + qoriq_ptp->tmr_add = freq_comp; + qoriq_ptp->tmr_fiper1 = DEFAULT_FIPER1_PERIOD - qoriq_ptp->tclk_period; + qoriq_ptp->tmr_fiper2 = DEFAULT_FIPER2_PERIOD - qoriq_ptp->tclk_period; + + /* max_adj = 10 * (freq_ratio - 1.0) - 1 +* freq_ratio = reference_clock_freq / nominal_freq +*/ + max_adj = 10ULL * (clk_src - nominal_freq); + max_adj = max_adj / nominal_freq - 1; + qoriq_ptp->caps.max_adj = max_adj; + + return 0; +} + static int qoriq_ptp_probe(struct platform_device *dev) { struct device_node *node = dev->dev.of_node; @@ -332,7 +432,7 @@ static int qoriq_ptp_probe(struct platform_device *dev) if (!qoriq_ptp) goto no_memory; - err = -ENODEV; + err = -EINVAL; qoriq_ptp->caps = ptp_qoriq_caps; @@ -351,10 +451,14 @@ static int qoriq_ptp_probe(struct platform_device *dev) "fsl,tmr-fiper2", _ptp->tmr_fiper2) || of_property_read_u32(node, "fsl,max-adj", _ptp->caps.max_adj)) { - pr_err("device tree node missing required elements\n"); - goto no_node; 
+ pr_warn("device tree node missing required elements, try automatic configuration\n"); + + if (qoriq_ptp_auto_config(qoriq_ptp, node)) + goto no_config; } + err = -ENODEV; + qoriq_ptp->irq = platform_get_irq(dev, 0); if (qoriq_ptp->irq < 0) { @@ -436,6 +540,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
[v2, 1/3] arm64: dts: fsl: add clocks property for fman ptp timer node
This patch is to add clocks property for fman ptp timer node. Signed-off-by: Yangbo Lu --- Changes for v2: - None. --- arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi b/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi index a56a408..4664c33 100644 --- a/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi +++ b/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi @@ -80,4 +80,5 @@ ptp_timer0: ptp-timer@1afe000 { compatible = "fsl,fman-ptp-timer"; reg = <0x0 0x1afe000 0x0 0x1000>; interrupts = ; + clocks = <&clockgen 3 0>; }; -- 1.7.1
[v2, 2/3] powerpc/mpc85xx: add clocks property for fman ptp timer node
This patch is to add clocks property for fman ptp timer node. Signed-off-by: Yangbo Lu --- Changes for v2: - None. --- arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi |1 + arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi |1 + arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi |1 + arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi |1 + arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi |1 + 5 files changed, 5 insertions(+), 0 deletions(-) diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi index 6b124f7..9b6cf91 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi @@ -100,4 +100,5 @@ ptp_timer0: ptp-timer@4fe000 { compatible = "fsl,fman-ptp-timer"; reg = <0x4fe000 0x1000>; interrupts = <96 2 0 0>; + clocks = <&clockgen 3 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi index b80aaf5..e95c11f 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi @@ -100,4 +100,5 @@ ptp_timer1: ptp-timer@5fe000 { compatible = "fsl,fman-ptp-timer"; reg = <0x5fe000 0x1000>; interrupts = <97 2 0 0>; + clocks = <&clockgen 3 1>; }; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi index d3720fd..d62b36c 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi @@ -105,4 +105,5 @@ ptp_timer0: ptp-timer@4fe000 { compatible = "fsl,fman-ptp-timer"; reg = <0x4fe000 0x1000>; interrupts = <96 2 0 0>; + clocks = <&clockgen 3 0>; }; diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi index ae34c20..3102324 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi @@ -105,4 +105,5 @@ ptp_timer1: ptp-timer@5fe000 { compatible = "fsl,fman-ptp-timer"; reg = <0x5fe000 0x1000>; interrupts = <97 2 0 0>; + clocks = <&clockgen 3 1>; }; diff --git 
a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi index 02f2755..c90702b 100644 --- a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi +++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi @@ -93,4 +93,5 @@ ptp_timer0: ptp-timer@4fe000 { compatible = "fsl,fman-ptp-timer"; reg = <0x4fe000 0x1000>; interrupts = <96 2 0 0>; + clocks = <&clockgen 3 0>; }; -- 1.7.1
Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig
On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote: > Platforms with a PCI bus will be offered the RapidIO menu since they may > be want support for a RapidIO PCI device. Platforms without a PCI bus > that might include a RapidIO IP block will need to "select HAS_RAPIDIO" > in the platform-/machine-specific "config ARCH_*" Kconfig entry. > > Tested that kernel builds for arm64 with RapidIO subsystem and > switch drivers enabled, also that the modules load successfully > on a custom Aarch64 Qemu model. As said before, please include it from drivers/Kconfig so that _all_ architectures supporting PCI (or other Rapidio attachements) get it and not some arbitrary selection of architectures.
Re: [PATCH] perf tools: allow overriding MAX_NR_CPUS at compile time
Le 03/05/2018 à 15:40, Arnaldo Carvalho de Melo a écrit : Em Fri, Sep 22, 2017 at 01:20:43PM +0200, Christophe Leroy escreveu: After update of kernel, perf tool doesn't run anymore on my 32MB RAM powerpc board, but still runs on a 128MB RAM board: Cleaning up my inbox, found this one, simple enough, still applies, applied. Did you finally apply it ? I can't see it in linux-next. Will it be merged into 4.19 ? Thanks Christophe These all needs to be dynamicly allocated, but still, with this one can get a functioning tool, apply it. - Arnaldo ~# strace perf execve("/usr/sbin/perf", ["perf"], [/* 12 vars */]) = -1 ENOMEM (Cannot allocate memory) --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} --- +++ killed by SIGSEGV +++ Segmentation fault objdump -x shows that .bss section has a huge size of 24Mbytes: 27 .bss 016baca8 101cebb8 101cebb8 001cd988 2**3 With especially the following objects having quite big size 10205f80 l O .bss 0014 runtime_cycles_stats 10345f80 l O .bss 0014 runtime_stalled_cycles_front_stats 10485f80 l O .bss 0014 runtime_stalled_cycles_back_stats 105c5f80 l O .bss 0014 runtime_branches_stats 10705f80 l O .bss 0014 runtime_cacherefs_stats 10845f80 l O .bss 0014 runtime_l1_dcache_stats 10985f80 l O .bss 0014 runtime_l1_icache_stats 10ac5f80 l O .bss 0014 runtime_ll_cache_stats 10c05f80 l O .bss 0014 runtime_itlb_cache_stats 10d45f80 l O .bss 0014 runtime_dtlb_cache_stats 10e85f80 l O .bss 0014 runtime_cycles_in_tx_stats 10fc5f80 l O .bss 0014 runtime_transaction_stats 11105f80 l O .bss 0014 runtime_elision_stats 11245f80 l O .bss 0014 runtime_topdown_total_slots 11385f80 l O .bss 0014 runtime_topdown_slots_retired 114c5f80 l O .bss 0014 runtime_topdown_slots_issued 11605f80 l O .bss 0014 runtime_topdown_fetch_bubbles 11745f80 l O .bss 0014 runtime_topdown_recovery_bubbles This is due to commit 4d255766d28b1 ("perf: Bump max number of cpus to 1024"), because many tables are sized with MAX_NR_CPUS This patch gives the opportunity to 
redefine MAX_NR_CPUS via make EXTRA_CFLAGS=-DMAX_NR_CPUS=1 Signed-off-by: Christophe Leroy --- tools/perf/perf.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/perf.h b/tools/perf/perf.h index dc442ba21bf6..a9db563da0a9 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -23,7 +23,9 @@ static inline unsigned long long rdclock(void) return ts.tv_sec * 1000000000ULL + ts.tv_nsec; } +#ifndef MAX_NR_CPUS #define MAX_NR_CPUS 1024 +#endif extern const char *input_name; extern bool perf_host, perf_guest; -- 2.13.3
Re: [PATCH] powerpc/64s/radix: Fix missing global invalidations when removing copro
Frederic Barrat writes: > With the optimizations for TLB invalidation from commit 0cef77c7798a > ("powerpc/64s/radix: flush remote CPUs out of single-threaded > mm_cpumask"), the scope of a TLBI (global vs. local) can now be > influenced by the value of the 'copros' counter of the memory context. > > When calling mm_context_remove_copro(), the 'copros' counter is > decremented first before flushing. It may have the unintended side > effect of sending local TLBIs when we explicitly need global > invalidations in this case. Thus breaking any nMMU user in a bad and > unpredictable way. > > Fix it by flushing first, before updating the 'copros' counter, so > that invalidations will be global. > > Fixes: 0cef77c7798a ("powerpc/64s/radix: flush remote CPUs out of > single-threaded mm_cpumask") > Signed-off-by: Frederic Barrat > --- Tested-by: Vaibhav Jain -- Vaibhav Jain Linux Technology Center, IBM India Pvt. Ltd.
Re: [RFC 0/4] Virtio uses DMA API for all devices
Hi Christoph, On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote: > On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote: > > On arm/arm64, the problem we have is that legacy virtio devices on the MMIO > > transport (so definitely not PCI) have historically been advertised by qemu > > as not being cache coherent, but because the virtio core has bypassed DMA > > ops then everything has happened to work. If we blindly enable the arch DMA > > ops, > > No one is suggesting that as far as I can tell. Apologies: it's me that wants the DMA ops enabled to handle legacy devices behind an IOMMU, but see below. > > we'll plumb in the non-coherent ops and start getting data corruption, > > so we do need a way to quirk virtio as being "always coherent" if we want to > > use the DMA ops (which we do, because our emulation platforms have an IOMMU > > for all virtio devices). > > From all that I've gather so far: no you do not want that. We really > need to figure out virtio "dma" interacts with the host / device. > > If you look at the current iommu spec it does talk of physical address > with a little careveout for VIRTIO_F_IOMMU_PLATFORM. That's true, although that doesn't exist in the legacy virtio spec, and we have an existing emulation platform which puts legacy virtio devices behind an IOMMU. Currently, Linux is unable to boot on this platform unless the IOMMU is configured as bypass. If we can use the coherent IOMMU DMA ops, then it works perfectly. > So between that and our discussion in this thread and its previous > iterations I think we need to stick to the current always physical, > bypass system dma ops mode of virtio operation as the default. As above -- that means we hang during boot because we get stuck trying to bring up a virtio-block device whose DMA is aborted by the IOMMU. The easy answer is "just upgrade to latest virtio and advertise the presence of the IOMMU". 
I'm pushing for that in future platforms, but it seems a shame not to support the current platform, especially given that other systems do have hacks in mainline to get virtio working. > We just need to figure out how to deal with devices that deviate > from the default. One things is that VIRTIO_F_IOMMU_PLATFORM really > should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu > dma tweaks (offsets, cache flushing), which seems well in spirit of > the original design. The other issue is VIRTIO_F_IO_BARRIER > which is very vaguely defined, and which needs a better definition. > And last but not least we'll need some text explaining the challenges > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER > is what would basically cover them, but a good description including > an explanation of why these matter. I agree that this makes sense for future revisions of virtio (or perhaps it can just be a clarification to virtio 1.0), but we're still left in the dark with legacy devices and it would be nice to have them work on the systems which currently exist, even if it's a legacy-only hack in the arch code. Will
Re: [RFC 0/4] Virtio uses DMA API for all devices
On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote: > On arm/arm64, the problem we have is that legacy virtio devices on the MMIO > transport (so definitely not PCI) have historically been advertised by qemu > as not being cache coherent, but because the virtio core has bypassed DMA > ops then everything has happened to work. If we blindly enable the arch DMA > ops, No one is suggesting that as far as I can tell. > we'll plumb in the non-coherent ops and start getting data corruption, > so we do need a way to quirk virtio as being "always coherent" if we want to > use the DMA ops (which we do, because our emulation platforms have an IOMMU > for all virtio devices). >From all that I've gather so far: no you do not want that. We really need to figure out virtio "dma" interacts with the host / device. If you look at the current iommu spec it does talk of physical address with a little careveout for VIRTIO_F_IOMMU_PLATFORM. So between that and our discussion in this thread and its previous iterations I think we need to stick to the current always physical, bypass system dma ops mode of virtio operation as the default. We just need to figure out how to deal with devices that deviate from the default. One things is that VIRTIO_F_IOMMU_PLATFORM really should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu dma tweaks (offsets, cache flushing), which seems well in spirit of the original design. The other issue is VIRTIO_F_IO_BARRIER which is very vaguely defined, and which needs a better definition. And last but not least we'll need some text explaining the challenges of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER is what would basically cover them, but a good description including an explanation of why these matter.
[PATCH v8 4/4] selftests/powerpc: update strlen() test to test the new assembly function for PPC32
This patch adds a test for testing the new assembly strlen() for PPC32 Signed-off-by: Christophe Leroy --- v8: removed defines in ppc_asm.h that were added in v6 (not used anymore since v7) ; added missing link to strlen_32.S v7: reduced the scope to PPC32 v6: added additional necessary defines in ppc_asm.h v5: no change v4: new tools/testing/selftests/powerpc/stringloops/Makefile| 5 - tools/testing/selftests/powerpc/stringloops/asm/cache.h | 1 + tools/testing/selftests/powerpc/stringloops/strlen_32.S | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/cache.h create mode 12 tools/testing/selftests/powerpc/stringloops/strlen_32.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 779b644461c4..9e510de2c07d 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -13,9 +13,12 @@ $(OUTPUT)/memcmp_32: CFLAGS += -m32 $(OUTPUT)/strlen: strlen.c string.o $(OUTPUT)/string.o: string.c +$(OUTPUT)/strlen_32: strlen.c +$(OUTPUT)/strlen_32: CFLAGS += -m32 + ASFLAGS = $(CFLAGS) -TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen +TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen strlen_32 include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/stringloops/asm/cache.h b/tools/testing/selftests/powerpc/stringloops/asm/cache.h new file mode 100644 index ..8a2840831122 --- /dev/null +++ b/tools/testing/selftests/powerpc/stringloops/asm/cache.h @@ -0,0 +1 @@ +#defineIFETCH_ALIGN_BYTES 4 diff --git a/tools/testing/selftests/powerpc/stringloops/strlen_32.S b/tools/testing/selftests/powerpc/stringloops/strlen_32.S new file mode 12 index ..72b13731b24c --- /dev/null +++ b/tools/testing/selftests/powerpc/stringloops/strlen_32.S @@ -0,0 +1 @@ +../../../../../arch/powerpc/lib/strlen_32.S \ No newline at end of file -- 2.13.3
[PATCH v8 3/4] powerpc/lib: implement strlen() in assembly for PPC32
The generic implementation of strlen() reads strings byte per byte. This patch implements strlen() in assembly based on a read of entire words, in the same spirit as what some other arches and glibc do. On a 8xx the time spent in strlen is reduced by 3/4 for long strings. strlen() selftest on an 8xx provides the following values: Before the patch (ie with the generic strlen() in lib/string.c): len 256 : time = 1.195055 len 016 : time = 0.083745 len 008 : time = 0.046828 len 004 : time = 0.028390 After the patch: len 256 : time = 0.272185 ==> 78% improvment len 016 : time = 0.040632 ==> 51% improvment len 008 : time = 0.033060 ==> 29% improvment len 004 : time = 0.029149 ==> 2% degradation On a 832x: Before the patch: len 256 : time = 0.236125 len 016 : time = 0.018136 len 008 : time = 0.011000 len 004 : time = 0.007229 After the patch: len 256 : time = 0.094950 ==> 60% improvment len 016 : time = 0.013357 ==> 26% improvment len 008 : time = 0.010586 ==> 4% improvment len 004 : time = 0.008784 Signed-off-by: Christophe Leroy --- Changes in v8: - No change Changes in v7: - Reduced the scope to PPC32 - Modified the missalignment handling to be branchless and loopless Changes in v6: - Reworked for having branchless conclusion Changes in v5: - Fixed for PPC64 LITTLE ENDIAN Changes in v4: - Added alignment of the loop - doing the andc only if still not 0 as it happends only for bytes above 0x7f which is pretty rare in a string Changes in v3: - Made it common to PPC32 and PPC64 Changes in v2: - Moved handling of unaligned strings outside of the main path as it is very unlikely. - Removed the verification of the fourth byte in case none of the three first ones are NUL. 
arch/powerpc/include/asm/string.h | 2 + arch/powerpc/lib/Makefile | 2 +- arch/powerpc/lib/strlen_32.S | 78 +++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/lib/strlen_32.S diff --git a/arch/powerpc/include/asm/string.h b/arch/powerpc/include/asm/string.h index 9b8cedf618f4..1647de15a31e 100644 --- a/arch/powerpc/include/asm/string.h +++ b/arch/powerpc/include/asm/string.h @@ -50,6 +50,8 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n) return __memset64(p, v, n * 8); } #else +#define __HAVE_ARCH_STRLEN + extern void *memset16(uint16_t *, uint16_t, __kernel_size_t); #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index d0ca13ad8231..670286808928 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE) obj-y += string.o alloc.o code-patching.o feature-fixups.o -obj-$(CONFIG_PPC32)+= div64.o copy_32.o crtsavres.o +obj-$(CONFIG_PPC32)+= div64.o copy_32.o crtsavres.o strlen_32.o # See corresponding test in arch/powerpc/Makefile # 64-bit linker creates .sfpr on demand for final link (vmlinux), diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S new file mode 100644 index ..0a8d3f64d493 --- /dev/null +++ b/arch/powerpc/lib/strlen_32.S @@ -0,0 +1,78 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * strlen() for PPC32 + * + * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information. + * + * Inspired from glibc implementation + */ +#include +#include +#include + + .text + +/* + * Algorithm: + * + * 1) Given a word 'x', we can test to see if it contains any 0 bytes + *by subtracting 0x01010101, and seeing if any of the high bits of each + *byte changed from 0 to 1. This works because the least significant + *0 byte must have had no incoming carry (otherwise it's not the least + *significant), so it is 0x00 - 0x01 == 0xff. 
For all other + *byte values, either they have the high bit set initially, or when + *1 is subtracted you get a value in the range 0x00-0x7f, none of which + *have their high bit set. The expression here is + *(x - 0x01010101) & ~x & 0x80808080), which gives 0x when + *there were no 0x00 bytes in the word. You get 0x80 in bytes that + *match, but possibly false 0x80 matches in the next more significant + *byte to a true match due to carries. For little-endian this is + *of no consequence since the least significant match is the one + *we're interested in, but big-endian needs method 2 to find which + *byte matches. + * 2) Given a word 'x', we can test to see _which_ byte was zero by + *calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080). + *This produces 0x80 in each byte that was zero, and 0x00 in all + *the other bytes. The '| ~0x80808080' clears the low 7 bits in each + *byte, and the '| x' part ensures that bytes with the high bit set + *produce 0x00. The addition will carry into the high bit of each byte + *iff that byte had one of its low 7 bits set. We can then just
[PATCH v8 2/4] selftests/powerpc: Add test for strlen()
This patch adds a test for strlen() string.c contains a copy of strlen() from lib/string.c The test first tests the correctness of strlen() by comparing the result with libc strlen(). It tests all cases of alignment. It them tests the duration of an aligned strlen() on a 4 bytes string, on a 16 bytes string and on a 256 bytes string. Signed-off-by: Christophe Leroy --- v8: no change v7: no change v6: refactorised the benchmark test v5: no change v4: new .../testing/selftests/powerpc/stringloops/Makefile | 5 +- .../testing/selftests/powerpc/stringloops/string.c | 36 ++ .../testing/selftests/powerpc/stringloops/strlen.c | 127 + 3 files changed, 167 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/stringloops/string.c create mode 100644 tools/testing/selftests/powerpc/stringloops/strlen.c diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index 3862256c2b7d..779b644461c4 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -10,9 +10,12 @@ $(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec $(OUTPUT)/memcmp_32: memcmp.c $(OUTPUT)/memcmp_32: CFLAGS += -m32 +$(OUTPUT)/strlen: strlen.c string.o +$(OUTPUT)/string.o: string.c + ASFLAGS = $(CFLAGS) -TEST_GEN_PROGS := memcmp_32 memcmp_64 +TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/stringloops/string.c b/tools/testing/selftests/powerpc/stringloops/string.c new file mode 100644 index ..d05200481017 --- /dev/null +++ b/tools/testing/selftests/powerpc/stringloops/string.c @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * linux/lib/string.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * stupid library routines.. The optimized versions should generally be found + * as inline code in + * + * These are buggy as well.. 
+ * + * * Fri Jun 25 1999, Ingo Oeser + * - Added strsep() which will replace strtok() soon (because strsep() is + *reentrant and should be faster). Use only strsep() in new code, please. + * + * * Sat Feb 09 2002, Jason Thomas , + *Matthew Hawkins + * - Kissed strtok() goodbye + */ + +#include + +/** + * strlen - Find the length of a string + * @s: The string to be sized + */ +size_t test_strlen(const char *s) +{ + const char *sc; + + for (sc = s; *sc != '\0'; ++sc) + /* nothing */; + return sc - s; +} diff --git a/tools/testing/selftests/powerpc/stringloops/strlen.c b/tools/testing/selftests/powerpc/stringloops/strlen.c new file mode 100644 index ..9055ebc484d0 --- /dev/null +++ b/tools/testing/selftests/powerpc/stringloops/strlen.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include "utils.h" + +#define SIZE 256 +#define ITERATIONS 1000 +#define ITERATIONS_BENCH 10 + +int test_strlen(const void *s); + +/* test all offsets and lengths */ +static void test_one(char *s) +{ + unsigned long offset; + + for (offset = 0; offset < SIZE; offset++) { + int x, y; + unsigned long i; + + y = strlen(s + offset); + x = test_strlen(s + offset); + + if (x != y) { + printf("strlen() returned %d, should have returned %d (%p offset %ld)\n", x, y, s, offset); + + for (i = offset; i < SIZE; i++) + printf("%02x ", s[i]); + printf("\n"); + } + } +} + +static void bench_test(char *s) +{ + struct timespec ts_start, ts_end; + int i; + + clock_gettime(CLOCK_MONOTONIC, _start); + + for (i = 0; i < ITERATIONS_BENCH; i++) + test_strlen(s); + + clock_gettime(CLOCK_MONOTONIC, _end); + + printf("len %3.3d : time = %.6f\n", test_strlen(s), ts_end.tv_sec - ts_start.tv_sec + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9); +} + +static int testcase(void) +{ + char *s; + unsigned long i; + + s = memalign(128, SIZE); + if (!s) { + perror("memalign"); + exit(1); + } + + srandom(1); + + memset(s, 0, SIZE); + for (i = 0; i < SIZE; i++) { + char c; + + do { 
+ c = random() & 0x7f; + } while (!c); + s[i] = c; + test_one(s); + } + + for (i = 0; i < ITERATIONS; i++) { + unsigned long j; + + for (j = 0; j < SIZE; j++) { + char c; + + do { + c = random() & 0x7f; + } while (!c); + s[j] = c; + } + for (j = 0; j < sizeof(long); j++) { +
[PATCH v8 1/4] selftests/powerpc: add test for 32 bits memcmp
This patch renames memcmp test to memcmp_64 and adds a memcmp_32 test for testing the 32 bits version of memcmp() Signed-off-by: Christophe Leroy --- v8: rebased on latest powerpc/merge v7: no change v6: no change v5: no change v4: new tools/testing/selftests/powerpc/stringloops/Makefile| 14 +++--- tools/testing/selftests/powerpc/stringloops/memcmp_32.S | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) create mode 12 tools/testing/selftests/powerpc/stringloops/memcmp_32.S diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile b/tools/testing/selftests/powerpc/stringloops/Makefile index c60c6172dd3c..3862256c2b7d 100644 --- a/tools/testing/selftests/powerpc/stringloops/Makefile +++ b/tools/testing/selftests/powerpc/stringloops/Makefile @@ -1,10 +1,18 @@ # SPDX-License-Identifier: GPL-2.0 # The loops are all 64-bit code -CFLAGS += -m64 -maltivec CFLAGS += -I$(CURDIR) -TEST_GEN_PROGS := memcmp -EXTRA_SOURCES := memcmp_64.S ../harness.c +EXTRA_SOURCES := ../harness.c + +$(OUTPUT)/memcmp_64: memcmp.c +$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec + +$(OUTPUT)/memcmp_32: memcmp.c +$(OUTPUT)/memcmp_32: CFLAGS += -m32 + +ASFLAGS = $(CFLAGS) + +TEST_GEN_PROGS := memcmp_32 memcmp_64 include ../../lib.mk diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_32.S b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S new file mode 12 index ..056f2b3af789 --- /dev/null +++ b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S @@ -0,0 +1 @@ +../../../../../arch/powerpc/lib/memcmp_32.S \ No newline at end of file -- 2.13.3
Re: [RFC PATCH kernel 0/5] powerpc/P9/vfio: Pass through NVIDIA Tesla V100
On 01/08/2018 00:29, Alex Williamson wrote: > On Tue, 31 Jul 2018 14:03:35 +1000 > Alexey Kardashevskiy wrote: > >> On 31/07/2018 02:29, Alex Williamson wrote: >>> On Mon, 30 Jul 2018 18:58:49 +1000 >>> Alexey Kardashevskiy wrote: After some local discussions, it was pointed out that force disabling nvlinks won't bring us much as for an nvlink to work, both sides need to enable it so malicious guests cannot penetrate good ones (or a host) unless a good guest enabled the link but won't happen with a well behaving guest. And if two guests became malicious, then can still only harm each other, and so can they via other ways such network. This is different from PCIe as once PCIe link is unavoidably enabled, a well behaving device cannot firewall itself from peers as it is up to the upstream bridge(s) now to decide the routing; with nvlink2, a GPU still has means to protect itself, just like a guest can run "firewalld" for network. Although it would be a nice feature to have an extra barrier between GPUs, is inability to block the links in hypervisor still a blocker for V100 pass through? >>> >>> How is the NVLink configured by the guest, is it 'on'/'off' or are >>> specific routes configured? >> >> The GPU-GPU links need not to be blocked and need to be enabled >> (==trained) by a driver in the guest. There are no routes between GPUs >> in NVLink fabric, these are direct links, it is just a switch on each >> side, both switches need to be on for a link to work. > > Ok, but there is at least the possibility of multiple direct links per > GPU, the very first diagram I find of NVlink shows 8 interconnected > GPUs: > > https://www.nvidia.com/en-us/data-center/nvlink/ Out design is like the left part of the picture but it is just a detail. > So if each switch enables one direct, point to point link, how does the > guest know which links to open for which peer device? It uses PCI config space on GPUs to discover the topology. 
> And of course > since we can't see the spec, a security audit is at best hearsay :-\ Yup, the exact discovery protocol is hidden. >> The GPU-CPU links - the GPU bit is the same switch, the CPU NVlink state >> is controlled via the emulated PCI bridges which I pass through together >> with the GPU. > > So there's a special emulated switch, is that how the guest knows which > GPUs it can enable NVLinks to? Since it only has PCI config space (there is nothing relevant in the device tree at all), I assume (double checking with the NVIDIA folks now) the guest driver enables them all, tests which pair works and disables the ones which do not. This gives a malicious guest a tiny window of opportunity to break into a good guest. Hm :-/ >>> If the former, then isn't a non-malicious >>> guest still susceptible to a malicious guest? >> >> A non-malicious guest needs to turn its switch on for a link to a GPU >> which belongs to a malicious guest. > > Actual security, or obfuscation, will we ever know... If the latter, how is >>> routing configured by the guest given that the guest view of the >>> topology doesn't match physical hardware? Are these routes >>> deconfigured by device reset? Are they part of the save/restore >>> state? Thanks, > > Still curious what happens to these routes on reset. Can a later user > of a GPU inherit a device where the links are already enabled? Thanks, I am told that the GPU reset disables links. As a side effect, we get an HMI (a hardware fault which reset the host machine) when trying accessing the GPU RAM which indicates that the link is down as the memory is only accessible via the nvlink. We have special fencing code in our host firmware (skiboot) to fence this memory on PCI reset so reading from it returns zeroes instead of HMIs. -- Alexey
Re: [RFC 0/4] Virtio uses DMA API for all devices
On Tue, Jul 31, 2018 at 03:36:22PM -0500, Benjamin Herrenschmidt wrote: > On Tue, 2018-07-31 at 10:30 -0700, Christoph Hellwig wrote: > > > However the question people raise is that DMA API is already full of > > > arch-specific tricks the likes of which are outlined in your post linked > > > above. How is this one much worse? > > > > None of these warts is visible to the driver, they are all handled in > > the architecture (possibly on a per-bus basis). > > > > So for virtio we really need to decide if it has one set of behavior > > as specified in the virtio spec, or if it behaves exactly as if it > > was on a PCI bus, or in fact probably both as you lined up. But no > > magic arch specific behavior inbetween. > > The only arch specific behaviour is needed in the case where it doesn't > behave like PCI. In this case, the PCI DMA ops are not suitable, but in > our secure VMs, we still need to make it use swiotlb in order to bounce > through non-secure pages. On arm/arm64, the problem we have is that legacy virtio devices on the MMIO transport (so definitely not PCI) have historically been advertised by qemu as not being cache coherent, but because the virtio core has bypassed DMA ops then everything has happened to work. If we blindly enable the arch DMA ops, we'll plumb in the non-coherent ops and start getting data corruption, so we do need a way to quirk virtio as being "always coherent" if we want to use the DMA ops (which we do, because our emulation platforms have an IOMMU for all virtio devices). Will
Re: [next-20180727][qla2xxx][BUG] WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280
Hi Abdul On 08/01/2018 02:33 PM, Abdul Haleem wrote: > # mkfs -t ext4 /dev/mapper/mpatha > mke2fs 1.43.1 (08-Jun-2016) > Found a dos partition table in /dev/mapper/mpatha > Proceed anyway? (y,n) y > Discarding device blocks: > qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002. > qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002. > qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002. > qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002. > WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 > scsi_end_request+0x250/0x280 ... > NIP [c0690080] scsi_end_request+0x250/0x280 > LR [c068fe80] scsi_end_request+0x50/0x280 > Call Trace: > [c0027d39b600] [c068fe80] scsi_end_request+0x50/0x280 (unreliable) > [c0027d39b660] [c06904ac] scsi_io_completion+0x29c/0x7d0 > [c0027d39b710] [c06848e4] scsi_finish_command+0x104/0x1c0 > [c0027d39b790] [c068f148] scsi_softirq_done+0x198/0x1f0 > [c0027d39b820] [c04f2b80] blk_mq_complete_request+0x130/0x1d0 > [c0027d39b860] [c068d27c] scsi_mq_done+0x2c/0xe0 > [c0027d39b890] [d4291080] qla2xxx_qpair_sp_compl+0xa8/0x140 > [qla2xxx] > [c0027d39b900] [d42cc9d0] > qla2x00_process_completed_request+0x68/0x140 [qla2xxx] > [ cut here ] > kernel BUG at block/blk-core.c:3196! blk_finish_request BUG_ON(blk_queued_rq(req)) We are also suffering a similar issue on qla2xxx, the BUG_ON in blk_finish_request is triggered while there are lots of command aborted. The root cause should be qla2xxx driver still invoke scsi_done for an aborted command and cause race between requeue path and normal complete path. Add Himanshu Madhani from qlogic team. It seems that they are working on this. Thanks Jianchao
Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote: > On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote: > > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : > > I would suggest to instead use a function like this: > > > > static const char *signame(int signr) > > { > > if (signr == SIGBUS) > > return "bus error"; > > if (signr == SIGFPE) > > return "floating point exception"; > > if (signr == SIGILL) > > return "illegal instruction"; > > if (signr == SIGILL) > > return "segfault"; > > if (signr == SIGTRAP) > > return "unhandled trap"; > > return "unknown signal"; > > } > > trivia: > > Unless the if tests are ordered most to least likely, > perhaps it would be better to use a switch/case and > let the compiler decide. That would also show there are two entries for SIGILL (here and in the original patch), one of them very wrong. Check the table with psignal or something? Segher
Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote: > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : > > This adds a human-readable name in the unhandled signal message. > > Before this patch, a page fault looked like: > >pandafault[6303]: unhandled signal 11 at 17d0 nip 161c lr > > 7fff93c55100 code 2 in pandafault[1000+1] > > After this patch, a page fault looks like: > >pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr > > 7fffb63e5100 code 2 in pandafault[13a2a+1] ]] > > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c [] > > @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler); > > #define TM_DEBUG(x...) do { } while(0) > > #endif > > > > +static const char *signames[SIGRTMIN + 1] = { > > + "UNKNOWN", > > + "SIGHUP", // 1 > > + "SIGINT", // 2 [] > I don't think is is worth having that full table when we only use a few > of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/) > > I would suggest to instead use a function like this: > > static const char *signame(int signr) > { > if (signr == SIGBUS) > return "bus error"; > if (signr == SIGFPE) > return "floating point exception"; > if (signr == SIGILL) > return "illegal instruction"; > if (signr == SIGILL) > return "segfault"; > if (signr == SIGTRAP) > return "unhandled trap"; > return "unknown signal"; > } trivia: Unless the if tests are ordered most to least likely, perhaps it would be better to use a switch/case and let the compiler decide. switch (signr) { case SIGBUS:return "bus error"; case SIGFPE:return "floating point exception"; case SIGILL:return "illegal instruction"; case SIGSEGV: return "segfault"; case SIGTRAP: return "unhandled trap"; } return "unknown signal"; }
Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()
Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : Remove "Instruction dump:" line by adding a prefix to display current->comm and current->pid, along with the instructions dump. The prefix can serve as a glue that links the instructions dump to its originator, allowing messages to be interleaved in the logs. Before this patch, a page fault looked like: pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 code 2 in pandafault[1000+1] Instruction dump: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe 392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 After this patch, it looks like: pandafault[10850]: segfault (11) at 17d0 nip 161c lr 7fff9f3e5100 code 2 in pandafault[1000+1] pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040 Signed-off-by: Murilo Opsfelder Araujo Does the script scripts/decode_stacktrace.sh also works with this format change ? --- arch/powerpc/kernel/process.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e78799a8855a..d12143e7d8f9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1265,16 +1265,19 @@ static int instructions_to_print = 16; void show_instructions(struct pt_regs *regs) { int i; + const char *prefix = KERN_INFO "%s[%d]: code: "; unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 * sizeof(int)); - printk("Instruction dump:"); + printk(prefix, current->comm, current->pid); for (i = 0; i < instructions_to_print; i++) { int instr; - if (!(i % 8)) + if (!(i % 8) && (i > 0)) { pr_cont("\n"); + printk(prefix, current->comm, current->pid); + } #if !defined(CONFIG_BOOKE) /* If executing with the IMMU off, adjust pc rather
Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals
Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit : This adds a human-readable name in the unhandled signal message. Before this patch, a page fault looked like: pandafault[6303]: unhandled signal 11 at 17d0 nip 161c lr 7fff93c55100 code 2 in pandafault[1000+1] After this patch, a page fault looks like: pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr 7fffb63e5100 code 2 in pandafault[13a2a+1] Signed-off-by: Murilo Opsfelder Araujo --- arch/powerpc/kernel/traps.c | 39 +++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 1c4f06fca370..e71f12bca146 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler); #define TM_DEBUG(x...) do { } while(0) #endif +static const char *signames[SIGRTMIN + 1] = { + "UNKNOWN", + "SIGHUP", // 1 + "SIGINT", // 2 + "SIGQUIT",// 3 + "SIGILL", // 4 + "unhandled trap", // 5 = SIGTRAP + "SIGABRT",// 6 = SIGIOT + "bus error", // 7 = SIGBUS + "floating point exception", // 8 = SIGFPE + "illegal instruction",// 9 = SIGILL + "SIGUSR1",// 10 + "segfault", // 11 = SIGSEGV + "SIGUSR2",// 12 + "SIGPIPE",// 13 + "SIGALRM",// 14 + "SIGTERM",// 15 + "SIGSTKFLT", // 16 + "SIGCHLD",// 17 + "SIGCONT",// 18 + "SIGSTOP",// 19 + "SIGTSTP",// 20 + "SIGTTIN",// 21 + "SIGTTOU",// 22 + "SIGURG", // 23 + "SIGXCPU",// 24 + "SIGXFSZ",// 25 + "SIGVTALRM", // 26 + "SIGPROF",// 27 + "SIGWINCH", // 28 + "SIGIO", // 29 = SIGPOLL = SIGLOST + "SIGPWR", // 30 + "SIGSYS", // 31 = SIGUNUSED +}; I don't think is is worth having that full table when we only use a few of them. 
(As discussed in v1 https://patchwork.ozlabs.org/patch/948802/) I would suggest to instead use a function like this: static const char *signame(int signr) { if (signr == SIGBUS) return "bus error"; if (signr == SIGFPE) return "floating point exception"; if (signr == SIGILL) return "illegal instruction"; if (signr == SIGILL) return "segfault"; if (signr == SIGTRAP) return "unhandled trap"; return "unknown signal"; } Christophe + /* * Trap & Exception support */ @@ -314,8 +349,8 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code, if (!unhandled_signal(current, signr)) return; - pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x", - current->comm, current->pid, signr, + pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x", + current->comm, current->pid, signames[signr], signr, addr, regs->nip, regs->link, code); print_vma_addr(KERN_CONT " in ", regs->nip);
[next-20180727][qla2xxx][BUG] WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280
Greetings, linux-next kernel crashes when creating file system on scsi disks using mkfs.ext4 (Discarding device blocks) Machine: Power 8 Power VM LPAR kernel : 4.18.0-rc6-next-20180727 adapter : Fibre Channel [0c04]: QLogic Corp. ISP2532-based 8Gb Fibre Channel to PCI Express HBA [1077:2532] Test: mkfs.ext4 /dev/mapper/mpatha Above command triggers WARN_ON_ONCE at line: 0xc0690080 is in scsi_end_request (drivers/scsi/scsi_lib.c:691) 686 687 if (blk_queue_add_random(q)) 688 add_disk_randomness(req->rq_disk); 689 690 if (!blk_rq_is_scsi(req)) { 691 WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); 692 cmd->flags &= ~SCMD_INITIALIZED; 693 destroy_rcu_head(&cmd->rcu); 694 } 695 followed by kernel crash # mkfs -t ext4 /dev/mapper/mpatha mke2fs 1.43.1 (08-Jun-2016) Found a dos partition table in /dev/mapper/mpatha Proceed anyway? (y,n) y Discarding device blocks: qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002. qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002. qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 -- 1 2002. qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 -- 1 2002.
WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280 Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) tun(E) kvm_pr(E) kvm(E) ip6t_rpfilter(E) ip6t_REJECT(E) nf_reject_ipv6(E) ipt_REJECT(E) nf_reject_ipv4(E) xt_conntrack(E) ip_set(E) nfnetlink(E) ebtable_nat(E) ebtable_broute(E) bridge(E) stp(E) llc(E) ip6table_raw(E) ip6table_nat(E) nf_nat_ipv6(E) ip6table_security(E) ip6table_mangle(E) iptable_raw(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) iptable_security(E) iptable_mangle(E) ebtable_filter(E) ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) ip_tables(E) dm_service_time(E) xts(E) vmx_crypto(E) pseries_rng(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E) grace(E) sunrpc(E) dm_multipath(E) dm_mod(E) sch_fq_codel(E) ext4(E) mbcache(E) jbd2(E) fscrypto(E) sr_mod(E) cdrom(E) sd_mod(E) ibmvscsi(E) scsi_transport_srp(E) ibmveth(E) qla2xxx(E) nvme_fc(E) nvme_fabrics(E) nvme_core(E) scsi_transport_fc(E) tg3(E) CPU: 12 PID: 511 Comm: kworker/12:2 Tainted: GE 4.18.0-rc6-next-20180727-autotest-autotest #1 Workqueue: qla2xxx_wq qla25xx_free_rsp_que [qla2xxx] NIP: c0690080 LR: c068fe80 CTR: d5050788 REGS: c0027d39b380 TRAP: 0700 Tainted: GE (4.18.0-rc6-next-20180727-autotest-autotest) MSR: 8282b033 CR: 24002824 XER: 2000 CFAR: c068feb8 IRQMASK: 1 GPR00: c068fe80 c0027d39b600 c113de00 GPR04: 0001 GPR08: c0027dce3000 0001 d4330708 GPR12: 2200 cec57600 c012cf78 c00289561b40 GPR16: 00044000 4007 GPR20: c0027dce3860 c00280db9c60 GPR24: 0002 c153567c c002712a1cc8 GPR28: c002712a1cc8 c0027e217000 c002708efe00 NIP [c0690080] scsi_end_request+0x250/0x280 LR [c068fe80] scsi_end_request+0x50/0x280 Call Trace: [c0027d39b600] [c068fe80] scsi_end_request+0x50/0x280 (unreliable) [c0027d39b660] [c06904ac] scsi_io_completion+0x29c/0x7d0 [c0027d39b710] [c06848e4] scsi_finish_command+0x104/0x1c0 [c0027d39b790] [c068f148] scsi_softirq_done+0x198/0x1f0 [c0027d39b820] [c04f2b80] 
blk_mq_complete_request+0x130/0x1d0 [c0027d39b860] [c068d27c] scsi_mq_done+0x2c/0xe0 [c0027d39b890] [d4291080] qla2xxx_qpair_sp_compl+0xa8/0x140 [qla2xxx] [c0027d39b900] [d42cc9d0] qla2x00_process_completed_request+0x68/0x140 [qla2xxx] [ cut here ] kernel BUG at block/blk-core.c:3196! [c0027d39b970] [d42cd4ac] qla2x00_status_entry.isra.7+0x4f4/0x1750 [qla2xxx] Oops: Exception in kernel mode, sig: 5 [#1] [c0027d39bb00] [d42cfab0] qla24xx_process_response_queue+0x7d8/0xba0 [qla2xxx] LE SMP NR_CPUS=2048 [c0027d39bc40] [d42f8c48] qla25xx_free_rsp_que+0x1f0/0x270 [qla2xxx] NUMA pSeries [c0027d39bc80] [c012478c] process_one_work+0x24c/0x500 Modules linked in: xt_CHECKSUM(E) [c0027d39bd20] [c0124acc] worker_thread+0x8c/0x590 ipt_MASQUERADE(E) [c0027d39bdc0] [c012d0c8] kthread+0x158/0x1a0 tun(E) [c0027d39be30] [c000b65c] ret_from_kernel_thread+0x5c/0x80 kvm_pr(E) Instruction dump: kvm(E) 7f23cb78 ip6t_rpfilter(E) 4bed1d15 ip6t_REJECT(E) 6000 3c62003f nf_reject_ipv6(E) e8637888 ipt_REJECT(E) 7f24cb78
Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization
On Wed, Aug 01, 2018 at 04:36:40AM +, Y.b. Lu wrote: > Could I add a function to calculate a set of default register values > to initialize ptp timer when dts method failed to get required > properties in driver? Yes, it would be ideal if the driver can pick correct values automatically. However, the frequency on the FIPER outputs can't be configured automatically, and we don't have an API for the user to choose this. > I think this will be useful. The ptp timer on new platforms (you may > see two dts patches in this patchset. Many platforms will be > affected.) will work without these dts properties. If users want > specific settings, they can set dts properties. Sure. Thanks, Richard
Re: [PATCH v6 8/8] powernv/pseries: consolidate code for mce early handling.
On Mon, 9 Jul 2018 18:02:39 +0200 Michal Suchánek wrote: > On Fri, 6 Jul 2018 19:40:24 +1000 > Nicholas Piggin wrote: > > > On Wed, 04 Jul 2018 23:30:12 +0530 > > Mahesh J Salgaonkar wrote: > > > > > From: Mahesh Salgaonkar > > > > > > Now that other platforms also implements real mode mce handler, > > > lets consolidate the code by sharing existing powernv machine check > > > early code. Rename machine_check_powernv_early to > > > machine_check_common_early and reuse the code. > > > > > > Signed-off-by: Mahesh Salgaonkar > > > --- > > > arch/powerpc/kernel/exceptions-64s.S | 56 > > > +++--- 1 file changed, 11 > > > insertions(+), 45 deletions(-) > > > > > > diff --git a/arch/powerpc/kernel/exceptions-64s.S > > > b/arch/powerpc/kernel/exceptions-64s.S index > > > 0038596b7906..3e877ec55d50 100644 --- > > > a/arch/powerpc/kernel/exceptions-64s.S +++ > > > b/arch/powerpc/kernel/exceptions-64s.S @@ -243,14 +243,13 @@ > > > EXC_REAL_BEGIN(machine_check, 0x200, 0x100) > > > SET_SCRATCH0(r13) /* save r13 */ > > > EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION > > > - b machine_check_powernv_early > > > + b machine_check_common_early > > > FTR_SECTION_ELSE > > > b machine_check_pSeries_0 > > > ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) > > > EXC_REAL_END(machine_check, 0x200, 0x100) > > > EXC_VIRT_NONE(0x4200, 0x100) > > > -TRAMP_REAL_BEGIN(machine_check_powernv_early) > > > -BEGIN_FTR_SECTION > > > +TRAMP_REAL_BEGIN(machine_check_common_early) > > > EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) > > > /* > > >* Register contents: > > > @@ -306,7 +305,9 @@ BEGIN_FTR_SECTION > > > /* Save r9 through r13 from EXMC save area to stack frame. 
> > > */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) > > > mfmsr r11 /* get MSR value */ > > > +BEGIN_FTR_SECTION > > > ori r11,r11,MSR_ME /* turn on ME bit > > > */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) > > > ori r11,r11,MSR_RI /* turn on RI bit > > > */ LOAD_HANDLER(r12, machine_check_handle_early) > > > 1: mtspr SPRN_SRR0,r12 > > > @@ -325,7 +326,6 @@ BEGIN_FTR_SECTION > > > andcr11,r11,r10 /* Turn off MSR_ME > > > */ b 1b > > > b . /* prevent speculative execution */ > > > -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) > > > > > > TRAMP_REAL_BEGIN(machine_check_pSeries) > > > .globl machine_check_fwnmi > > > @@ -333,7 +333,7 @@ machine_check_fwnmi: > > > SET_SCRATCH0(r13) /* save r13 */ > > > EXCEPTION_PROLOG_0(PACA_EXMC) > > > BEGIN_FTR_SECTION > > > - b machine_check_pSeries_early > > > + b machine_check_common_early > > > END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) > > > machine_check_pSeries_0: > > > EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) > > > @@ -346,45 +346,6 @@ machine_check_pSeries_0: > > > > > > TRAMP_KVM_SKIP(PACA_EXMC, 0x200) > > > > > > -TRAMP_REAL_BEGIN(machine_check_pSeries_early) > > > -BEGIN_FTR_SECTION > > > - EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) > > > - mr r10,r1 /* Save r1 */ > > > - ld r1,PACAMCEMERGSP(r13) /* Use MC emergency > > > stack */ > > > - subir1,r1,INT_FRAME_SIZE/* alloc stack > > > frame */ > > > - mfspr r11,SPRN_SRR0 /* Save SRR0 */ > > > - mfspr r12,SPRN_SRR1 /* Save SRR1 */ > > > - EXCEPTION_PROLOG_COMMON_1() > > > - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) > > > - EXCEPTION_PROLOG_COMMON_3(0x200) > > > - addir3,r1,STACK_FRAME_OVERHEAD > > > - BRANCH_LINK_TO_FAR(machine_check_early) /* Function call > > > ABI */ - > > > - /* Move original SRR0 and SRR1 into the respective regs */ > > > - ld r9,_MSR(r1) > > > - mtspr SPRN_SRR1,r9 > > > - ld r3,_NIP(r1) > > > - mtspr SPRN_SRR0,r3 > > > - ld r9,_CTR(r1) > > > - mtctr r9 > > > - ld r9,_XER(r1) > > > - mtxer r9 > > > - ld r9,_LINK(r1) > > > - mtlrr9 > > > - REST_GPR(0, r1) > > > - 
REST_8GPRS(2, r1) > > > - REST_GPR(10, r1) > > > - ld r11,_CCR(r1) > > > - mtcrr11 > > > - REST_GPR(11, r1) > > > - REST_2GPRS(12, r1) > > > - /* restore original r1. */ > > > - ld r1,GPR1(r1) > > > - SET_SCRATCH0(r13) /* save r13 */ > > > - EXCEPTION_PROLOG_0(PACA_EXMC) > > > - b machine_check_pSeries_0 > > > -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) > > > - > > > EXC_COMMON_BEGIN(machine_check_common) > > > /* > > >* Machine check is different because we use a different > > > @@ -483,6 +444,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early) > > > bl machine_check_early > > > std r3,RESULT(r1) /* Save result */ > > > ld r12,_MSR(r1) > > > +BEGIN_FTR_SECTION > > > + bne 9f /* pSeries: continue > > > to V mode. */ +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) > > > > Should this be "b 9f" ? Although... > > > > > > > > #ifdef CONFIG_PPC_P7_NAP > > > /* > > > @@ -564,7 +528,9 @@
Re: [PATCH v6 5/8] powerpc/pseries: flush SLB contents on SLB MCE errors.
On Wed, 04 Jul 2018 23:28:21 +0530 Mahesh J Salgaonkar wrote: > From: Mahesh Salgaonkar > > On pseries, as of today system crashes if we get a machine check > exceptions due to SLB errors. These are soft errors and can be fixed by > flushing the SLBs so the kernel can continue to function instead of > system crash. We do this in real mode before turning on MMU. Otherwise > we would run into nested machine checks. This patch now fetches the > rtas error log in real mode and flushes the SLBs on SLB errors. > > Signed-off-by: Mahesh Salgaonkar > --- > arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 > arch/powerpc/include/asm/machdep.h|1 > arch/powerpc/kernel/exceptions-64s.S | 42 + > arch/powerpc/kernel/mce.c | 16 +++- > arch/powerpc/mm/slb.c |6 +++ > arch/powerpc/platforms/pseries/pseries.h |1 > arch/powerpc/platforms/pseries/ras.c | 51 > + > arch/powerpc/platforms/pseries/setup.c|1 > 8 files changed, 116 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h > b/arch/powerpc/include/asm/book3s/64/mmu-hash.h > index 50ed64fba4ae..cc00a7088cf3 100644 > --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h > +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h > @@ -487,6 +487,7 @@ extern void hpte_init_native(void); > > extern void slb_initialize(void); > extern void slb_flush_and_rebolt(void); > +extern void slb_flush_and_rebolt_realmode(void); > > extern void slb_vmalloc_update(void); > extern void slb_set_size(u16 size); > diff --git a/arch/powerpc/include/asm/machdep.h > b/arch/powerpc/include/asm/machdep.h > index ffe7c71e1132..fe447e0d4140 100644 > --- a/arch/powerpc/include/asm/machdep.h > +++ b/arch/powerpc/include/asm/machdep.h > @@ -108,6 +108,7 @@ struct machdep_calls { > > /* Early exception handlers called in realmode */ > int (*hmi_exception_early)(struct pt_regs *regs); > + int (*machine_check_early)(struct pt_regs *regs); > > /* Called during machine check exception to retrive fixup address. 
*/ > bool(*mce_check_early_recovery)(struct pt_regs *regs); > diff --git a/arch/powerpc/kernel/exceptions-64s.S > b/arch/powerpc/kernel/exceptions-64s.S > index f283958129f2..0038596b7906 100644 > --- a/arch/powerpc/kernel/exceptions-64s.S > +++ b/arch/powerpc/kernel/exceptions-64s.S > @@ -332,6 +332,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries) > machine_check_fwnmi: > SET_SCRATCH0(r13) /* save r13 */ > EXCEPTION_PROLOG_0(PACA_EXMC) > +BEGIN_FTR_SECTION > + b machine_check_pSeries_early > +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) > machine_check_pSeries_0: > EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200) > /* > @@ -343,6 +346,45 @@ machine_check_pSeries_0: > > TRAMP_KVM_SKIP(PACA_EXMC, 0x200) > > +TRAMP_REAL_BEGIN(machine_check_pSeries_early) > +BEGIN_FTR_SECTION > + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) > + mr r10,r1 /* Save r1 */ > + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency stack */ > + subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/ > + mfspr r11,SPRN_SRR0 /* Save SRR0 */ > + mfspr r12,SPRN_SRR1 /* Save SRR1 */ > + EXCEPTION_PROLOG_COMMON_1() > + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) > + EXCEPTION_PROLOG_COMMON_3(0x200) > + addir3,r1,STACK_FRAME_OVERHEAD > + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */ > + > + /* Move original SRR0 and SRR1 into the respective regs */ > + ld r9,_MSR(r1) > + mtspr SPRN_SRR1,r9 > + ld r3,_NIP(r1) > + mtspr SPRN_SRR0,r3 > + ld r9,_CTR(r1) > + mtctr r9 > + ld r9,_XER(r1) > + mtxer r9 > + ld r9,_LINK(r1) > + mtlrr9 > + REST_GPR(0, r1) > + REST_8GPRS(2, r1) > + REST_GPR(10, r1) > + ld r11,_CCR(r1) > + mtcrr11 > + REST_GPR(11, r1) > + REST_2GPRS(12, r1) > + /* restore original r1. 
*/ > + ld r1,GPR1(r1) > + SET_SCRATCH0(r13) /* save r13 */ > + EXCEPTION_PROLOG_0(PACA_EXMC) > + b machine_check_pSeries_0 > +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE) > + > EXC_COMMON_BEGIN(machine_check_common) > /* >* Machine check is different because we use a different > diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c > index efdd16a79075..221271c96a57 100644 > --- a/arch/powerpc/kernel/mce.c > +++ b/arch/powerpc/kernel/mce.c > @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs) > { > long handled = 0; > > - __this_cpu_inc(irq_stat.mce_exceptions); > + /* > + * For pSeries we count mce when we go into virtual mode machine > + * check handler. Hence skip it. Also, We can't
Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.
On Thu, 12 Jul 2018 15:41:13 +0200 Michal Suchánek wrote: > On Tue, 3 Jul 2018 08:08:14 +1000 > "Nicholas Piggin" wrote: > > > On Mon, 02 Jul 2018 11:17:06 +0530 > > Mahesh J Salgaonkar wrote: > > > > > From: Mahesh Salgaonkar > > > > > > On pseries, as of today system crashes if we get a machine check > > > exceptions due to SLB errors. These are soft errors and can be > > > fixed by flushing the SLBs so the kernel can continue to function > > > instead of system crash. We do this in real mode before turning on > > > MMU. Otherwise we would run into nested machine checks. This patch > > > now fetches the rtas error log in real mode and flushes the SLBs on > > > SLB errors. > > > > > > Signed-off-by: Mahesh Salgaonkar > > > --- > > > arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 > > > arch/powerpc/include/asm/machdep.h|1 > > > arch/powerpc/kernel/exceptions-64s.S | 42 > > > + arch/powerpc/kernel/mce.c > > > | 16 +++- arch/powerpc/mm/slb.c | > > > 6 +++ arch/powerpc/platforms/powernv/opal.c |1 > > > arch/powerpc/platforms/pseries/pseries.h |1 > > > arch/powerpc/platforms/pseries/ras.c | 51 > > > + > > > arch/powerpc/platforms/pseries/setup.c|1 9 files > > > changed, 116 insertions(+), 4 deletions(-) > > > > > > > +TRAMP_REAL_BEGIN(machine_check_pSeries_early) > > > +BEGIN_FTR_SECTION > > > + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) > > > + mr r10,r1 /* Save r1 */ > > > + ld r1,PACAMCEMERGSP(r13) /* Use MC emergency > > > stack */ > > > + subir1,r1,INT_FRAME_SIZE/* alloc stack > > > frame */ > > > + mfspr r11,SPRN_SRR0 /* Save SRR0 */ > > > + mfspr r12,SPRN_SRR1 /* Save SRR1 */ > > > + EXCEPTION_PROLOG_COMMON_1() > > > + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC) > > > + EXCEPTION_PROLOG_COMMON_3(0x200) > > > + addir3,r1,STACK_FRAME_OVERHEAD > > > + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call > > > ABI */ > > > > Is there any reason you can't use the existing > > machine_check_powernv_early code to do all this? 
> > > > > diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c > > > index efdd16a79075..221271c96a57 100644 > > > --- a/arch/powerpc/kernel/mce.c > > > +++ b/arch/powerpc/kernel/mce.c > > > @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs) > > > { > > > long handled = 0; > > > > > > - __this_cpu_inc(irq_stat.mce_exceptions); > > > + /* > > > + * For pSeries we count mce when we go into virtual mode > > > machine > > > + * check handler. Hence skip it. Also, We can't access per > > > cpu > > > + * variables in real mode for LPAR. > > > + */ > > > + if (early_cpu_has_feature(CPU_FTR_HVMODE)) > > > + __this_cpu_inc(irq_stat.mce_exceptions); > > > > > > - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) > > > + /* > > > + * See if platform is capable of handling machine check. > > > + * Otherwise fallthrough and allow CPU to handle this > > > machine check. > > > + */ > > > + if (ppc_md.machine_check_early) > > > + handled = ppc_md.machine_check_early(regs); > > > + else if (cur_cpu_spec && cur_cpu_spec->machine_check_early) > > > handled = > > > cur_cpu_spec->machine_check_early(regs); > > > > Would be good to add a powernv ppc_md handler which does the > > cur_cpu_spec->machine_check_early() call now that other platforms are > > calling this code. Because those aren't valid as a fallback call, but > > specific to powernv. > > > > Something like this (untested)? Sorry, some emails fell through the cracks. Yes exactly like this would be good. If you can add a quick changelog and SOB, and Reviewed-by: Nicholas Piggin Thanks, Nick > > Subject: [PATCH] powerpc/powernv: define platform MCE handler. 
> > --- > arch/powerpc/kernel/mce.c | 3 --- > arch/powerpc/platforms/powernv/setup.c | 11 +++ > 2 files changed, 11 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c > index 221271c96a57..ae17d8aa60c4 100644 > --- a/arch/powerpc/kernel/mce.c > +++ b/arch/powerpc/kernel/mce.c > @@ -498,12 +498,9 @@ long machine_check_early(struct pt_regs *regs) > > /* >* See if platform is capable of handling machine check. > - * Otherwise fallthrough and allow CPU to handle this machine check. >*/ > if (ppc_md.machine_check_early) > handled = ppc_md.machine_check_early(regs); > - else if (cur_cpu_spec && cur_cpu_spec->machine_check_early) > - handled = cur_cpu_spec->machine_check_early(regs); > return handled; > } > > diff --git a/arch/powerpc/platforms/powernv/setup.c > b/arch/powerpc/platforms/powernv/setup.c > index f96df0a25d05..b74c93bc2e55 100644 > --- a/arch/powerpc/platforms/powernv/setup.c > +++
[PATCH] powernv: opal-sensor-groups: Add attributes to disable/enable sensors
This patch provides support to disable and enable plaform specific sensor groups like performance, utilization and frequency which are currently not supported in hwmon. Signed-off-by: Shilpasri G Bhat --- This patch is based on https://git.kernel.org/powerpc/c/04baaf28f40c68c35a413cd9d0db71 .../ABI/testing/sysfs-firmware-opal-sensor-groups | 33 .../powerpc/platforms/powernv/opal-sensor-groups.c | 99 +++--- 2 files changed, 100 insertions(+), 32 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups diff --git a/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups new file mode 100644 index 000..a236686 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups @@ -0,0 +1,33 @@ +What: /sys/firmware/opal/sensor_groups +Date: August 2017 +Contact: Linux for PowerPC mailing list +Description: Sensor groups directory for POWER9 powernv servers + + Each folder in this directory contains a sensor group + which are classified based on type of the sensor + like power, temperature, frequency, current, etc. They + can also indicate the group of sensors belonging to + different owners like CSM, Profiler, Job-Scheduler + +What: /sys/firmware/opal/sensor_groups//clear +Date: August 2017 +Contact: Linux for PowerPC mailing list +Description: Sysfs file to clear the min-max of all the sensors + belonging to the group. + + Writing 1 to this file will clear the minimum and + maximum values of all the sensors in the group. + In POWER9, the min-max of a sensor is the historical minimum + and maximum value of the sensor cached by OCC. + +What: /sys/firmware/opal/sensor_groups//enable +Date: August 2018 +Contact: Linux for PowerPC mailing list +Description: Sysfs file to enable/disable the sensor-group + + Writing 0 to this file will disable all the sensors + belonging to the group and writing 1 will enable the + sensors. 
+ In POWER9, by default all the sensor-groups are enabled and + will be copied to main memory by OCC. This file can be used + to increase the update frequency of selective sensor-groups. diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c b/arch/powerpc/platforms/powernv/opal-sensor-groups.c index f7d04b6..8bf6c86 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c +++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c @@ -24,6 +24,8 @@ struct sg_attr { u32 handle; struct kobj_attribute attr; + u32 opal_no; + bool enable; }; static struct sensor_group { @@ -60,34 +62,17 @@ int sensor_group_enable(u32 handle, bool enable) } EXPORT_SYMBOL_GPL(sensor_group_enable); -static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) +static int sensor_group_clear(u32 handle) { - struct sg_attr *sattr = container_of(attr, struct sg_attr, attr); struct opal_msg msg; - u32 data; - int ret, token; - - ret = kstrtoint(buf, 0, ); - if (ret) - return ret; - - if (data != 1) - return -EINVAL; + int token, ret; token = opal_async_get_token_interruptible(); - if (token < 0) { - pr_devel("Failed to get token\n"); + if (token < 0) return token; - } - - ret = mutex_lock_interruptible(_mutex); - if (ret) - goto out_token; - ret = opal_sensor_group_clear(sattr->handle, token); - switch (ret) { - case OPAL_ASYNC_COMPLETION: + ret = opal_sensor_group_clear(handle, token); + if (ret == OPAL_ASYNC_COMPLETION) { ret = opal_async_wait_response(token, ); if (ret) { pr_devel("Failed to wait for the async response\n"); @@ -95,20 +80,57 @@ static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr, goto out; } ret = opal_error_code(opal_get_async_rc(msg)); - if (!ret) - ret = count; + } else { + ret = opal_error_code(ret); + } + +out: + opal_async_release_token(token); + return ret; +} + +static ssize_t sgroup_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct sg_attr *sattr = 
container_of(attr, struct sg_attr, attr); + + return sprintf(buf, "%u\n", sattr->enable); +} + +static ssize_t sgroup_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf,