Re: [PATCH] powerpc/perf: Remove sched_task function defined for thread-imc

2018-08-01 Thread Joel Stanley
On Fri, 18 May 2018 at 17:06, Anju T Sudhakar  wrote:
>
> Call trace observed while running perf-fuzzer:
>
> [  329.228068] CPU: 43 PID: 9088 Comm: perf_fuzzer Not tainted 
> 4.13.0-32-generic #35~lp1746225
> [  329.228070] task: c03f776ac900 task.stack: c03f77728000
> [  329.228071] NIP: c0299b70 LR: c02a4534 CTR: 
> c029bb80
> [  329.228073] REGS: c03f7772b760 TRAP: 0700   Not tainted  
> (4.13.0-32-generic)
> [  329.228073] MSR: 9282b033 
> [  329.228079]   CR: 24008822  XER: 
> [  329.228080] CFAR: c0299a70 SOFTE: 0
> GPR00: c02a4534 c03f7772b9e0 c1606200 c03fef858908
> GPR04: c03f776ac900 0001  003fee73
> GPR08:   c11220d8 0002
> GPR12: c029bb80 c7a3d900  
> GPR16:    
> GPR20:   c03f776ad090 c0c71354
> GPR24: c03fef716780 003fee73 c03fe69d4200 c03f776ad330
> GPR28: c11220d8 0001 c14c6108 c03fef858900
> [  329.228098] NIP [c0299b70] perf_pmu_sched_task+0x170/0x180
> [  329.228100] LR [c02a4534] __perf_event_task_sched_in+0xc4/0x230
> [  329.228101] Call Trace:
> [  329.228102] [c03f7772b9e0] [c02a0678] 
> perf_iterate_sb+0x158/0x2a0 (unreliable)
> [  329.228105] [c03f7772ba30] [c02a4534] 
> __perf_event_task_sched_in+0xc4/0x230
> [  329.228107] [c03f7772bab0] [c01396dc] 
> finish_task_switch+0x21c/0x310
> [  329.228109] [c03f7772bb60] [c0c71354] __schedule+0x304/0xb80
> [  329.228111] [c03f7772bc40] [c0c71c10] schedule+0x40/0xc0
> [  329.228113] [c03f7772bc60] [c01033f4] do_wait+0x254/0x2e0
> [  329.228115] [c03f7772bcd0] [c0104ac0] kernel_wait4+0xa0/0x1a0
> [  329.228117] [c03f7772bd70] [c0104c24] SyS_wait4+0x64/0xc0
> [  329.228121] [c03f7772be30] [c000b184] system_call+0x58/0x6c
> [  329.228121] Instruction dump:
> [  329.228123] 3beafea0 7faa4800 409eff18 e8010060 eb610028 ebc10040 7c0803a6 
> 38210050
> [  329.228127] eb81ffe0 eba1ffe8 ebe1fff8 4e800020 <0fe0> 4bbc 
> 6000 6042
> [  329.228131] ---[ end trace 8c46856d314c1811 ]---
> [  375.755943] hrtimer: interrupt took 31601 ns
>
>
> The context switch call-backs for thread-imc are defined in sched_task 
> function.
> So when thread-imc events are grouped with software pmu events,
> perf_pmu_sched_task hits the WARN_ON_ONCE condition, since software PMUs are
> assumed not to have a sched_task defined.
>
> Patch to move the thread_imc enable/disable opal call back from sched_task to
> event_[add/del] function
>
> Signed-off-by: Anju T Sudhakar 

I reproduced the WARN when running -rc7 on a Power9 box. I haven't
seen it since applying this patch.

Tested-by: Joel Stanley 

Cheers,

Joel


Re: [PATCH v4 5/6] powerpc: Add show_user_instructions()

2018-08-01 Thread Christophe LEROY




Le 01/08/2018 à 23:33, Murilo Opsfelder Araujo a écrit :

show_user_instructions() is a slightly modified version of
show_instructions() that allows userspace instruction dump.

This will be useful within show_signal_msg() to dump userspace
instructions of the faulty location.

Here is a sample of what show_user_instructions() outputs:

   pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 
f821ffc1 7c3f0b78 3d22fffe
   pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 
3920 7d234b78 383f0040

The current->comm and current->pid printed can serve as a glue that
links the instructions dump to its originator, allowing messages to be
interleaved in the logs.

Signed-off-by: Murilo Opsfelder Araujo 
---
  arch/powerpc/include/asm/stacktrace.h | 13 +
  arch/powerpc/kernel/process.c | 40 +++
  2 files changed, 53 insertions(+)
  create mode 100644 arch/powerpc/include/asm/stacktrace.h

diff --git a/arch/powerpc/include/asm/stacktrace.h 
b/arch/powerpc/include/asm/stacktrace.h
new file mode 100644
index ..6149b53b3bc8
--- /dev/null
+++ b/arch/powerpc/include/asm/stacktrace.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Stack trace functions.
+ *
+ * Copyright 2018, Murilo Opsfelder Araujo, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_STACKTRACE_H
+#define _ASM_POWERPC_STACKTRACE_H
+
+void show_user_instructions(struct pt_regs *regs);
+
+#endif /* _ASM_POWERPC_STACKTRACE_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e9533b4d2f08..364645ac732c 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1299,6 +1299,46 @@ static void show_instructions(struct pt_regs *regs)
pr_cont("\n");
  }
  
+void show_user_instructions(struct pt_regs *regs)

+{
+   int i;
+   const char *prefix = KERN_INFO "%s[%d]: code: ";
+   unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
+   sizeof(int));
+
+   printk(prefix, current->comm, current->pid);


Why not use pr_info() and remove KERN_INFO from *prefix ?


+
+   for (i = 0; i < instructions_to_print; i++) {
+   int instr;
+
+   if (!(i % 8) && (i > 0)) {
+   pr_cont("\n");
+   printk(prefix, current->comm, current->pid);
+   }
+
+#if !defined(CONFIG_BOOKE)
+   /* If executing with the IMMU off, adjust pc rather
+* than print .
+*/
+   if (!(regs->msr & MSR_IR))
+   pc = (unsigned long)phys_to_virt(pc);


Shouldn't this be done outside of the loop, only once ?

Christophe


+#endif
+
+   if (probe_kernel_address((unsigned int __user *)pc, instr)) {
+   pr_cont(" ");
+   } else {
+   if (regs->nip == pc)
+   pr_cont("<%08x> ", instr);
+   else
+   pr_cont("%08x ", instr);
+   }
+
+   pc += sizeof(int);
+   }
+
+   pr_cont("\n");
+}
+
  struct regbit {
unsigned long bit;
const char *name;



Re: [PATCH v6 5/8] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-08-01 Thread Mahesh Jagannath Salgaonkar
On 08/01/2018 11:28 AM, Nicholas Piggin wrote:
> On Wed, 04 Jul 2018 23:28:21 +0530
> Mahesh J Salgaonkar  wrote:
> 
>> From: Mahesh Salgaonkar 
>>
>> On pseries, as of today system crashes if we get a machine check
>> exceptions due to SLB errors. These are soft errors and can be fixed by
>> flushing the SLBs so the kernel can continue to function instead of
>> system crash. We do this in real mode before turning on MMU. Otherwise
>> we would run into nested machine checks. This patch now fetches the
>> rtas error log in real mode and flushes the SLBs on SLB errors.
>>
>> Signed-off-by: Mahesh Salgaonkar 
>> ---
>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
>>  arch/powerpc/include/asm/machdep.h|1 
>>  arch/powerpc/kernel/exceptions-64s.S  |   42 +
>>  arch/powerpc/kernel/mce.c |   16 +++-
>>  arch/powerpc/mm/slb.c |6 +++
>>  arch/powerpc/platforms/pseries/pseries.h  |1 
>>  arch/powerpc/platforms/pseries/ras.c  |   51 
>> +
>>  arch/powerpc/platforms/pseries/setup.c|1 
>>  8 files changed, 116 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
>> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> index 50ed64fba4ae..cc00a7088cf3 100644
>> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
>> @@ -487,6 +487,7 @@ extern void hpte_init_native(void);
>>  
>>  extern void slb_initialize(void);
>>  extern void slb_flush_and_rebolt(void);
>> +extern void slb_flush_and_rebolt_realmode(void);
>>  
>>  extern void slb_vmalloc_update(void);
>>  extern void slb_set_size(u16 size);
>> diff --git a/arch/powerpc/include/asm/machdep.h 
>> b/arch/powerpc/include/asm/machdep.h
>> index ffe7c71e1132..fe447e0d4140 100644
>> --- a/arch/powerpc/include/asm/machdep.h
>> +++ b/arch/powerpc/include/asm/machdep.h
>> @@ -108,6 +108,7 @@ struct machdep_calls {
>>  
>>  /* Early exception handlers called in realmode */
>>  int (*hmi_exception_early)(struct pt_regs *regs);
>> +int (*machine_check_early)(struct pt_regs *regs);
>>  
>>  /* Called during machine check exception to retrive fixup address. */
>>  bool(*mce_check_early_recovery)(struct pt_regs *regs);
>> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
>> b/arch/powerpc/kernel/exceptions-64s.S
>> index f283958129f2..0038596b7906 100644
>> --- a/arch/powerpc/kernel/exceptions-64s.S
>> +++ b/arch/powerpc/kernel/exceptions-64s.S
>> @@ -332,6 +332,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
>>  machine_check_fwnmi:
>>  SET_SCRATCH0(r13)   /* save r13 */
>>  EXCEPTION_PROLOG_0(PACA_EXMC)
>> +BEGIN_FTR_SECTION
>> +b   machine_check_pSeries_early
>> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>>  machine_check_pSeries_0:
>>  EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
>>  /*
>> @@ -343,6 +346,45 @@ machine_check_pSeries_0:
>>  
>>  TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
>>  
>> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
>> +BEGIN_FTR_SECTION
>> +EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
>> +mr  r10,r1  /* Save r1 */
>> +ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
>> +subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/
>> +mfspr   r11,SPRN_SRR0   /* Save SRR0 */
>> +mfspr   r12,SPRN_SRR1   /* Save SRR1 */
>> +EXCEPTION_PROLOG_COMMON_1()
>> +EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
>> +EXCEPTION_PROLOG_COMMON_3(0x200)
>> +addir3,r1,STACK_FRAME_OVERHEAD
>> +BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
>> +
>> +/* Move original SRR0 and SRR1 into the respective regs */
>> +ld  r9,_MSR(r1)
>> +mtspr   SPRN_SRR1,r9
>> +ld  r3,_NIP(r1)
>> +mtspr   SPRN_SRR0,r3
>> +ld  r9,_CTR(r1)
>> +mtctr   r9
>> +ld  r9,_XER(r1)
>> +mtxer   r9
>> +ld  r9,_LINK(r1)
>> +mtlrr9
>> +REST_GPR(0, r1)
>> +REST_8GPRS(2, r1)
>> +REST_GPR(10, r1)
>> +ld  r11,_CCR(r1)
>> +mtcrr11
>> +REST_GPR(11, r1)
>> +REST_2GPRS(12, r1)
>> +/* restore original r1. */
>> +ld  r1,GPR1(r1)
>> +SET_SCRATCH0(r13)   /* save r13 */
>> +EXCEPTION_PROLOG_0(PACA_EXMC)
>> +b   machine_check_pSeries_0
>> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>> +
>>  EXC_COMMON_BEGIN(machine_check_common)
>>  /*
>>   * Machine check is different because we use a different
>> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
>> index efdd16a79075..221271c96a57 100644
>> --- a/arch/powerpc/kernel/mce.c
>> +++ b/arch/powerpc/kernel/mce.c
>> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
>>  {
>>  long handled = 0;
>>  
>> -__this_cpu_inc(irq_stat.mce_exceptions);
>> +/*
>> + 

[RFC PATCH 3/3] cpuidle/powernv: Conditionally save-restore sprs using opal

2018-08-01 Thread Akshay Adiga
From: Abhishek Goel 

If a state has "opal-supported" compat flag in device-tree, an opal call
needs to be made during the entry and exit of the stop state. This patch
passes a hint to the power9_idle_stop and power9_offline_stop.

This patch moves the saving and restoring of sprs for P9 cpuidle
from kernel to opal. This patch still uses existing code to detect
first thread in core.
In an attempt to make the powernv idle code backward compatible,
and to some extent forward compatible, add support for pre-stop entry
and post-stop exit actions in OPAL. If a kernel knows about this
opal call, then just a firmware supporting newer hardware is required,
instead of waiting for kernel updates.

Signed-off-by: Abhishek Goel 
Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h|  1 +
 arch/powerpc/include/asm/opal-api.h   |  4 +-
 arch/powerpc/include/asm/opal.h   |  3 +
 arch/powerpc/include/asm/paca.h   |  5 +-
 arch/powerpc/include/asm/processor.h  |  6 +-
 arch/powerpc/kernel/asm-offsets.c |  3 +
 arch/powerpc/kernel/idle_book3s.S | 55 ++-
 arch/powerpc/platforms/powernv/idle.c | 41 ++
 .../powerpc/platforms/powernv/opal-wrappers.S |  2 +
 arch/powerpc/xmon/xmon.c  | 14 ++---
 10 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index b965066560cc..2fb2324d15fc 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -96,6 +96,7 @@ struct pnv_idle_states_t {
u64 psscr_val;
u64 psscr_mask;
u32 flags;
+   bool req_opal_call;
enum idle_state_type_t type;
bool valid;
 };
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 3bab299eda49..6792a737bc9a 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -208,7 +208,9 @@
 #define OPAL_SENSOR_READ_U64   162
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
-#define OPAL_LAST  165
+#define OPAL_IDLE_SAVE 168
+#define OPAL_IDLE_RESTORE  169
+#define OPAL_LAST  169
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index e1b2910c6e81..12d57aeacde2 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -356,6 +356,9 @@ extern void opal_kmsg_init(void);
 
 extern int opal_event_request(unsigned int opal_event_nr);
 
+extern int opal_cpuidle_save(u64 *stop_sprs, int scope, u64 psscr);
+extern int opal_cpuidle_restore(u64 *stop_sprs, int scope, u64 psscr, u64 
srr1);
+
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,
 unsigned long vmalloc_size);
 void opal_free_sg_list(struct opal_sg_list *sg);
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 6d34bd71139d..586059594443 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -195,11 +195,14 @@ struct paca_struct {
/* The PSSCR value that the kernel requested before going to stop */
u64 requested_psscr;
 
+   u64 wakeup_psscr;
+   u8 req_opal_call;
/*
-* Save area for additional SPRs that need to be
+* Save area for SPRs that need to be
 * saved/restored during cpuidle stop.
 */
struct stop_sprs stop_sprs;
+   u64 *opal_stop_sprs;
 #endif
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 34f572056add..9f9fb1f11dd6 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -513,8 +513,10 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, 
IDLE_POWERSAVE_OFF};
 extern int powersave_nap;  /* set if nap mode can be used in idle loop */
 extern unsigned long power7_idle_insn(unsigned long type); /* 
PNV_THREAD_NAP/etc*/
 extern void power7_idle_type(unsigned long type);
-extern unsigned long power9_idle_stop(unsigned long psscr_val);
-extern unsigned long power9_offline_stop(unsigned long psscr_val);
+extern unsigned long power9_idle_stop(unsigned long psscr_val,
+ bool opal_enabled);
+extern unsigned long power9_offline_stop(unsigned long psscr_val,
+bool opal_enabled);
 extern void power9_idle_type(int index);
 
 extern void flush_instruction_cache(void);
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 262c44a90ea1..740ae068ec74 100644

[RFC PATCH 2/3] powernv/cpuidle: Pass pointers instead of values to stop loop

2018-08-01 Thread Akshay Adiga
Passing pointer to the pnv_idle_state instead of psscr value and mask.
This helps us to pass more information to the stop loop. This will help to
figure out the method to enter/exit idle state.

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/processor.h  |  3 +-
 arch/powerpc/platforms/powernv/idle.c | 58 ---
 drivers/cpuidle/cpuidle-powernv.c | 16 +++-
 3 files changed, 32 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index 5debe337ea9d..34f572056add 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -515,8 +515,7 @@ extern unsigned long power7_idle_insn(unsigned long type); 
/* PNV_THREAD_NAP/etc
 extern void power7_idle_type(unsigned long type);
 extern unsigned long power9_idle_stop(unsigned long psscr_val);
 extern unsigned long power9_offline_stop(unsigned long psscr_val);
-extern void power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask);
+extern void power9_idle_type(int index);
 
 extern void flush_instruction_cache(void);
 extern void hard_reset_now(void);
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 93accece92e3..a6ef9b68e27b 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -43,8 +43,7 @@ int nr_pnv_idle_states;
  * The default stop state that will be used by ppc_md.power_save
  * function on platforms that support stop instruction.
  */
-static u64 pnv_default_stop_val;
-static u64 pnv_default_stop_mask;
+struct pnv_idle_states_t *pnv_default_state;
 static bool default_stop_found;
 
 static int parse_dt_v1(struct device_node *np);
@@ -70,9 +69,7 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  * psscr value and mask of the deepest stop idle state.
  * Used when a cpu is offlined.
  */
-static u64 pnv_deepest_stop_psscr_val;
-static u64 pnv_deepest_stop_psscr_mask;
-static u64 pnv_deepest_stop_flag;
+static struct pnv_idle_states_t *pnv_deepest_state;
 static bool deepest_stop_found;
 
 static int pnv_save_sprs_for_deep_states(void)
@@ -92,7 +89,7 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t hid5_val = mfspr(SPRN_HID5);
uint64_t hmeer_val = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
-   uint64_t psscr_val = pnv_deepest_stop_psscr_val;
+   uint64_t psscr_val = pnv_deepest_state->psscr_val;
 
for_each_present_cpu(cpu) {
uint64_t pir = get_hard_smp_processor_id(cpu);
@@ -218,18 +215,15 @@ static void pnv_alloc_idle_core_states(void)
pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug 
affected\n");
 
if (cpu_has_feature(CPU_FTR_ARCH_300) &&
-   (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+   (pnv_deepest_state->flags & OPAL_PM_LOSE_FULL_CONTEXT)) {
/*
 * Use the default stop state for CPU-Hotplug
 * if available.
 */
if (default_stop_found) {
-   pnv_deepest_stop_psscr_val =
-   pnv_default_stop_val;
-   pnv_deepest_stop_psscr_mask =
-   pnv_default_stop_mask;
+   pnv_deepest_state = pnv_default_state;
pr_warn("cpuidle-powernv: Offlined CPUs will 
stop with psscr = 0x%016llx\n",
-   pnv_deepest_stop_psscr_val);
+   pnv_deepest_state->psscr_val);
} else { /* Fallback to snooze loop for CPU-Hotplug */
deepest_stop_found = false;
pr_warn("cpuidle-powernv: Offlined CPUs will 
busy wait\n");
@@ -365,20 +359,20 @@ void power7_idle(void)
power7_idle_type(PNV_THREAD_NAP);
 }
 
-static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
- unsigned long stop_psscr_mask)
+static unsigned long __power9_idle_type(struct pnv_idle_states_t *state)
 {
-   unsigned long psscr;
+   unsigned long psscr, stop_psscr_mask, stop_psscr_val;
unsigned long srr1;
 
+   stop_psscr_mask = state->psscr_mask;
+   stop_psscr_val = state->psscr_val;
if (!prep_irq_for_idle_irqsoff())
return 0;
 
psscr = mfspr(SPRN_PSSCR);
psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
-
__ppc64_runlatch_off();
-   srr1 = power9_idle_stop(psscr);
+   srr1 = power9_idle_stop(psscr, state->opal_supported);
__ppc64_runlatch_on();
 
fini_irq_for_idle_irqsoff();
@@ -386,12 +380,11 @@ static unsigned long __power9_idle_type(unsigned long 
stop_psscr_val,
return 

[RFC PATCH 1/3] cpuidle/powernv: Add support for states with ibm, cpuidle-state-v1

2018-08-01 Thread Akshay Adiga
This patch adds support for new device-tree format for idle state
description.

Previously if a older kernel runs on a newer firmware, it may enable
all available states irrespective of its capability of handling it.
New device tree format adds a compatible flag, so that only kernel
which has the capability to handle the version of stop state will enable
it.

Older kernel will still see stop0 and stop0_lite in older format and we
will depricate it after some time.

1) Idea is to bump up the version in firmware if we find a bug or
regression in stop states. A fix will be provided in linux which would
now know about the bumped up version of stop states, where as kernel
without fixes would ignore the states.

2) Slowly deprecate cpuidle /cpuhotplug threshold which is hard-coded
into cpuidle-powernv driver. Instead use compatible strings to indicate
if idle state is suitable for cpuidle and hotplug.

New idle state device tree format :
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "ibm,state-v1",
  "cpuidle",
  "opal-supported";
 psscr-mask = <0x0 0x3003ff>;
 handle = <0x102>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
  };
...
stop11 {
 ...
 compatible = "ibm,state-v1",
  "cpuoffline",
  "opal-supported";
 ...
  };
 };

compatible strings :
"cpuidle" : indicates it should be used by cpuidle-driver
"cpuoffline" : indicates it should be used by hotplug driver
"ibm,state-v1" : kernel checks if it knows about this version
"opal-supported" : indicates kernel can fall back to use opal
   for stop-transitions

Signed-off-by: Akshay Adiga 
---
 arch/powerpc/include/asm/cpuidle.h|  11 ++
 arch/powerpc/platforms/powernv/idle.c | 139 +-
 drivers/cpuidle/cpuidle-powernv.c |  50 +
 3 files changed, 175 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/cpuidle.h 
b/arch/powerpc/include/asm/cpuidle.h
index 43e5f31fe64d..b965066560cc 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -79,17 +79,28 @@ struct stop_sprs {
u64 mmcra;
 };
 
+enum idle_state_type_t {
+   CPUIDLE_TYPE,
+   CPUOFFLINE_TYPE
+};
+
+
+#define POWERNV_THRESHOLD_LATENCY_NS 20
+#define PNV_VER_NAME_LEN32
 #define PNV_IDLE_NAME_LEN16
 struct pnv_idle_states_t {
char name[PNV_IDLE_NAME_LEN];
+   char version[PNV_VER_NAME_LEN];
u32 latency_ns;
u32 residency_ns;
u64 psscr_val;
u64 psscr_mask;
u32 flags;
+   enum idle_state_type_t type;
bool valid;
 };
 
+
 extern struct pnv_idle_states_t *pnv_idle_states;
 extern int nr_pnv_idle_states;
 extern u32 pnv_fastsleep_workaround_at_entry[];
diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 7cf71b3e03a1..93accece92e3 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -47,6 +47,19 @@ static u64 pnv_default_stop_val;
 static u64 pnv_default_stop_mask;
 static bool default_stop_found;
 
+static int parse_dt_v1(struct device_node *np);
+struct stop_version_t {
+   const char name[PNV_VER_NAME_LEN];
+   int (*parser_fn)(struct device_node *np);
+};
+struct stop_version_t known_versions[] = {
+   {
+   .name =  "ibm,state-v1",
+   .parser_fn = parse_dt_v1,
+   }
+};
+const int nr_known_versions = 1;
+
 /*
  * First deep stop state. Used to figure out when to save/restore
  * hypervisor context.
@@ -659,8 +672,14 @@ static int __init pnv_power9_idle_init(void)
state->valid = false;
report_invalid_psscr_val(state->psscr_val, err);
continue;
+   } else {
+   state->valid = true;
}
 
+   /*
+* We pick state with highest residency. We dont care if
+* its a cpuidle state or a cpuoffline state.
+*/
if (max_residency_ns 

[RFC PATCH 0/3] New device-tree format and Opal based idle save-restore

2018-08-01 Thread Akshay Adiga
Previously if a older kernel runs on a newer firmware, it may enable
all available states irrespective of its capability of handling it.
New device tree format adds a compatible flag, so that only kernel
which has the capability to handle the version of stop state will enable
it.

Older kernel will still see stop0 and stop0_lite in older format and we
will depricate it after some time.

1) Idea is to bump up the version string in firmware if we find a bug or
regression in stop states. A fix will be provided in linux which would
now know about the bumped up version of stop states, where as kernel
without fixes would ignore the states.

2) Slowly deprecate cpuidle/cpuhotplug threshold which is hard-coded
into cpuidle-powernv driver. Instead use compatible strings to indicate
if idle state is suitable for cpuidle and hotplug.

New idle state device tree format :
   power-mgt {
...
 ibm,enabled-stop-levels = <0xec00>;
 ibm,cpu-idle-state-psscr-mask = <0x0 0x3003ff 0x0 0x3003ff>;
 ibm,cpu-idle-state-latencies-ns = <0x3e8 0x7d0>;
 ibm,cpu-idle-state-psscr = <0x0 0x330 0x0 0x300330>;
 ibm,cpu-idle-state-flags = <0x10 0x101000>;
 ibm,cpu-idle-state-residency-ns = <0x2710 0x4e20>;
 ibm,idle-states {
 stop4 {
 flags = <0x207000>;
 compatible = "ibm,state-v1",
  "cpuidle",
  "opal-supported";
 psscr-mask = <0x0 0x3003ff>;
 handle = <0x102>;
 latency-ns = <0x186a0>;
 residency-ns = <0x989680>;
 psscr = <0x0 0x300374>;
  };
...
stop11 {
 ...
 compatible = "ibm,state-v1",
  "cpuoffline",
  "opal-supported";
 ...
  };
 };

Skiboot patch-set for device-tree is posted here :
https://patchwork.ozlabs.org/project/skiboot/list/?series=58934

High-level parsing algorithm :

Say Known version string = "ibm,state-v1"

for each stop state node in device tree:
if (compatible has known version string)
kernel takes care of stop-transitions
else if (compatible has "opal-supported")
OPAL takes care of stop-transitions
else
Skip All deeper states

When a state does not have both version support and opal support,
Its possible to exit from a shallower state. Hence skipping all
deeper states.

How does it work ?
---

Consider a case that stop4 has a bug. We take the following steps to
mitigate the problem.

1) Change compatible string for stop4 in OPAL to "ibm-state-v2" and
remove "opal-supported". ship the new firmware.
The kernel ignores stop4 and all deeper states. But we will still have
shallower states. Prevents from completely disabling stop states.

2) Implement workaround in OPAL and add "opal-supported". Ship new firmware
The kernel uses opal for stop-transtion , which has workaround implemented.
We get stop4 and deeper states working without kernel changes and backports.
(and considerably less time)

3) Implement workaround in kernel and add "ibm-state-v2" as known versions
The kernel will now be able to handle stop4 and deeper states.

Also includes Abhishek's RFC which was posted there :
 https://patchwork.ozlabs.org/patch/947568/

This patch-set is on top of mpe-next



Abhishek Goel (1):
  cpuidle/powernv: Conditionally save-restore sprs using opal

Akshay Adiga (2):
  cpuidle/powernv: Add support for states with ibm,cpuidle-state-v1
  powernv/cpuidle: Pass pointers instead of values to stop loop

 arch/powerpc/include/asm/cpuidle.h|  12 +
 arch/powerpc/include/asm/opal-api.h   |   4 +-
 arch/powerpc/include/asm/opal.h   |   3 +
 arch/powerpc/include/asm/paca.h   |   5 +-
 arch/powerpc/include/asm/processor.h  |   9 +-
 arch/powerpc/kernel/asm-offsets.c |   3 +
 arch/powerpc/kernel/idle_book3s.S |  55 -
 arch/powerpc/platforms/powernv/idle.c | 214 +++---
 .../powerpc/platforms/powernv/opal-wrappers.S |   2 +
 arch/powerpc/xmon/xmon.c  |  14 +-
 drivers/cpuidle/cpuidle-powernv.c |  66 +++---
 11 files changed, 304 insertions(+), 83 deletions(-)

-- 
2.18.0.rc2.85.g1fb9df7



Re: linux-next: manual merge of the powerpc tree with the m68k tree

2018-08-01 Thread Stephen Rothwell
Hi all,

[forgot the conflict resolution ...]

On Thu, 2 Aug 2018 09:27:20 +1000 Stephen Rothwell  
wrote:
>
> Today's linux-next merge of the powerpc tree got a conflict in:
> 
>   arch/m68k/mac/misc.c
> 
> between commit:
> 
>   5b9bfb8ec467 ("m68k: mac: Use time64_t in RTC handling")
> 
> from the m68k tree and commit:
> 
>   ebd722275f9c ("macintosh/via-pmu: Replace via-pmu68k driver with via-pmu 
> driver")
> 
> from the powerpc tree.
> 
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/m68k/mac/misc.c
index 19e9d8eef1f2,28090a44fa09..3534aa6a4dc2
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@@ -90,11 -85,11 +90,11 @@@ static void cuda_write_pram(int offset
  }
  #endif /* CONFIG_ADB_CUDA */
  
- #ifdef CONFIG_ADB_PMU68K
+ #ifdef CONFIG_ADB_PMU
 -static long pmu_read_time(void)
 +static time64_t pmu_read_time(void)
  {
struct adb_request req;
 -  long time;
 +  time64_t time;
  
if (pmu_request(, NULL, 1, PMU_READ_RTC) < 0)
return 0;


pgpf0oTdwPhhP.pgp
Description: OpenPGP digital signature


linux-next: manual merge of the powerpc tree with the m68k tree

2018-08-01 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the powerpc tree got a conflict in:

  arch/m68k/mac/misc.c

between commit:

  5b9bfb8ec467 ("m68k: mac: Use time64_t in RTC handling")

from the m68k tree and commit:

  ebd722275f9c ("macintosh/via-pmu: Replace via-pmu68k driver with via-pmu 
driver")

from the powerpc tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.



-- 
Cheers,
Stephen Rothwell


pgpZEHofoHrtx.pgp
Description: OpenPGP digital signature


Re: [RFC 0/4] Virtio uses DMA API for all devices

2018-08-01 Thread Michael S. Tsirkin
On Wed, Aug 01, 2018 at 10:05:35AM +0100, Will Deacon wrote:
> Hi Christoph,
> 
> On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote:
> > On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote:
> > > On arm/arm64, the problem we have is that legacy virtio devices on the 
> > > MMIO
> > > transport (so definitely not PCI) have historically been advertised by 
> > > qemu
> > > as not being cache coherent, but because the virtio core has bypassed DMA
> > > ops then everything has happened to work. If we blindly enable the arch 
> > > DMA
> > > ops,
> > 
> > No one is suggesting that as far as I can tell.
> 
> Apologies: it's me that wants the DMA ops enabled to handle legacy devices
> behind an IOMMU, but see below.
> 
> > > we'll plumb in the non-coherent ops and start getting data corruption,
> > > so we do need a way to quirk virtio as being "always coherent" if we want 
> > > to
> > > use the DMA ops (which we do, because our emulation platforms have an 
> > > IOMMU
> > > for all virtio devices).
> > 
> > From all that I've gather so far: no you do not want that.  We really
> > need to figure out virtio "dma" interacts with the host / device.
> > 
> > If you look at the current iommu spec it does talk of physical address
> > with a little careveout for VIRTIO_F_IOMMU_PLATFORM.
> 
> That's true, although that doesn't exist in the legacy virtio spec, and we
> have an existing emulation platform which puts legacy virtio devices behind
> an IOMMU. Currently, Linux is unable to boot on this platform unless the
> IOMMU is configured as bypass. If we can use the coherent IOMMU DMA ops,
> then it works perfectly.
> 
> > So between that and our discussion in this thread and its previous
> > iterations I think we need to stick to the current always physical,
> > bypass system dma ops mode of virtio operation as the default.
> 
> As above -- that means we hang during boot because we get stuck trying to
> bring up a virtio-block device whose DMA is aborted by the IOMMU. The easy
> answer is "just upgrade to latest virtio and advertise the presence of the
> IOMMU". I'm pushing for that in future platforms, but it seems a shame not
> to support the current platform, especially given that other systems do have
> hacks in mainline to get virtio working.
> 
> > We just need to figure out how to deal with devices that deviate
> > from the default.  One things is that VIRTIO_F_IOMMU_PLATFORM really
> > should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
> > dma tweaks (offsets, cache flushing), which seems well in spirit of
> > the original design.  The other issue is VIRTIO_F_IO_BARRIER
> > which is very vaguely defined, and which needs a better definition.
> > And last but not least we'll need some text explaining the challenges
> > of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> > is what would basically cover them, but a good description including
> > an explanation of why these matter.
> 
> I agree that this makes sense for future revisions of virtio (or perhaps
> it can just be a clarification to virtio 1.0), but we're still left in the
> dark with legacy devices and it would be nice to have them work on the
> systems which currently exist, even if it's a legacy-only hack in the arch
> code.
> 
> Will


Myself I'm sympathetic to this use-case and I see more uses to this
than just legacy support. But more work is required IMHO.
Will post tomorrow though - it's late here ...

-- 
MST


Re: [RFC 0/4] Virtio uses DMA API for all devices

2018-08-01 Thread Michael S. Tsirkin
On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote:
> On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote:
> > On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
> > transport (so definitely not PCI) have historically been advertised by qemu
> > as not being cache coherent, but because the virtio core has bypassed DMA
> > ops then everything has happened to work. If we blindly enable the arch DMA
> > ops,
> 
> No one is suggesting that as far as I can tell.
> 
> > we'll plumb in the non-coherent ops and start getting data corruption,
> > so we do need a way to quirk virtio as being "always coherent" if we want to
> > use the DMA ops (which we do, because our emulation platforms have an IOMMU
> > for all virtio devices).
> 
> >From all that I've gather so far: no you do not want that.  We really
> need to figure out virtio "dma" interacts with the host / device.
> 
> If you look at the current iommu spec it does talk of physical address
> with a little careveout for VIRTIO_F_IOMMU_PLATFORM.
> 
> So between that and our discussion in this thread and its previous
> iterations I think we need to stick to the current always physical,
> bypass system dma ops mode of virtio operation as the default.
> 
> We just need to figure out how to deal with devices that deviate
> from the default.  One things is that VIRTIO_F_IOMMU_PLATFORM really
> should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
> dma tweaks (offsets, cache flushing), which seems well in spirit of
> the original design.

Well I wouldn't say that. VIRTIO_F_IOMMU_PLATFORM is for guest
programmable protection which is designed for things like userspace
drivers but still very much which a CPU doing the accesses. I think
VIRTIO_F_IO_BARRIER needs to be extended to VIRTIO_F_PLATFORM_DMA.

>  The other issue is VIRTIO_F_IO_BARRIER
> which is very vaguely defined, and which needs a better definition.
> And last but not least we'll need some text explaining the challenges
> of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> is what would basically cover them, but a good description including
> an explanation of why these matter.

I think VIRTIO_F_IOMMU_PLATFORM + VIRTIO_F_PLATFORM_DMA but yea.

-- 
MST


Re: [RFC 0/4] Virtio uses DMA API for all devices

2018-08-01 Thread Michael S. Tsirkin
On Tue, Jul 31, 2018 at 03:36:22PM -0500, Benjamin Herrenschmidt wrote:
> On Tue, 2018-07-31 at 10:30 -0700, Christoph Hellwig wrote:
> > > However the question people raise is that DMA API is already full of
> > > arch-specific tricks the likes of which are outlined in your post linked
> > > above. How is this one much worse?
> > 
> > None of these warts is visible to the driver, they are all handled in
> > the architecture (possibly on a per-bus basis).
> > 
> > So for virtio we really need to decide if it has one set of behavior
> > as specified in the virtio spec, or if it behaves exactly as if it
> > was on a PCI bus, or in fact probably both as you lined up.  But no
> > magic arch specific behavior inbetween.
> 
> The only arch specific behaviour is needed in the case where it doesn't
> behave like PCI. In this case, the PCI DMA ops are not suitable, but in
> our secure VMs, we still need to make it use swiotlb in order to bounce
> through non-secure pages.
> 
> It would be nice if "real PCI" was the default

I think you are mixing "real PCI" which isn't coded up yet and IOMMU
bypass which is. IOMMU bypass will maybe with time become unnecessary
since it seems that one can just program an IOMMU in a bypass mode
instead.

It's hard to blame you since right now if you disable IOMMU bypass
you get a real PCI mode. But they are distinct and to allow people
to enable IOMMU by default we will need to teach someone
(virtio or DMA API) about this mode that does follow
translation and protection rules in the IOMMU but runs
on a CPU and so does not need cache flushes and whatnot.

OTOH real PCI mode as opposed to default hypervisor mode does not perform as
well when what you actually have is a hypervisor.

So we'll likely have a mix of these two modes for a while.

> but it's not, VMs are
> created in "legacy" mode all the times and we don't know at VM creation
> time whether it will become a secure VM or not. The way our secure VMs
> work is that they start as a normal VM, load a secure "payload" and
> call the Ultravisor to "become" secure.
> 
> So we're in a bit of a bind here. We need that one-liner optional arch
> hook to make virtio use swiotlb in that "IOMMU bypass" case.
> 
> Ben.

And just to make sure I understand, on your platform DMA APIs do include
some of the cache flushing tricks and this is why you don't want to
declare iommu support in the hypervisor?

-- 
MST


[PATCH 19/21] perf tools: Allow overriding MAX_NR_CPUS at compile time

2018-08-01 Thread Arnaldo Carvalho de Melo
From: Christophe Leroy 

After update of kernel, the perf tool doesn't run anymore on my 32MB RAM
powerpc board, but still runs on a 128MB RAM board:

  ~# strace perf
  execve("/usr/sbin/perf", ["perf"], [/* 12 vars */]) = -1 ENOMEM (Cannot 
allocate memory)
  --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} ---
  +++ killed by SIGSEGV +++
  Segmentation fault

objdump -x shows that .bss section has a huge size of 24Mbytes:

 27 .bss  016baca8  101cebb8  101cebb8  001cd988  2**3

With especially the following objects having quite big size:

  10205f80 l O .bss 0014 runtime_cycles_stats
  10345f80 l O .bss 0014 runtime_stalled_cycles_front_stats
  10485f80 l O .bss 0014 runtime_stalled_cycles_back_stats
  105c5f80 l O .bss 0014 runtime_branches_stats
  10705f80 l O .bss 0014 runtime_cacherefs_stats
  10845f80 l O .bss 0014 runtime_l1_dcache_stats
  10985f80 l O .bss 0014 runtime_l1_icache_stats
  10ac5f80 l O .bss 0014 runtime_ll_cache_stats
  10c05f80 l O .bss 0014 runtime_itlb_cache_stats
  10d45f80 l O .bss 0014 runtime_dtlb_cache_stats
  10e85f80 l O .bss 0014 runtime_cycles_in_tx_stats
  10fc5f80 l O .bss 0014 runtime_transaction_stats
  11105f80 l O .bss 0014 runtime_elision_stats
  11245f80 l O .bss 0014 runtime_topdown_total_slots
  11385f80 l O .bss 0014 runtime_topdown_slots_retired
  114c5f80 l O .bss 0014 runtime_topdown_slots_issued
  11605f80 l O .bss 0014 runtime_topdown_fetch_bubbles
  11745f80 l O .bss 0014 runtime_topdown_recovery_bubbles

This is due to commit 4d255766d28b1 ("perf: Bump max number of cpus
to 1024"), because many tables are sized with MAX_NR_CPUS

This patch gives the opportunity to redefine MAX_NR_CPUS via

  $ make EXTRA_CFLAGS=-DMAX_NR_CPUS=1

Signed-off-by: Christophe Leroy 
Cc: Alexander Shishkin 
Cc: Peter Zijlstra 
Cc: linuxppc-dev@lists.ozlabs.org
Link: 
http://lkml.kernel.org/r/20170922112043.8349468...@po15668-vm-win7.idsi0.si.c-s.fr
Signed-off-by: Arnaldo Carvalho de Melo 
---
 tools/perf/perf.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index d215714f48df..21bf7f5a3cf5 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -25,7 +25,9 @@ static inline unsigned long long rdclock(void)
return ts.tv_sec * 10ULL + ts.tv_nsec;
 }
 
+#ifndef MAX_NR_CPUS
 #define MAX_NR_CPUS1024
+#endif
 
 extern const char *input_name;
 extern bool perf_host, perf_guest;
-- 
2.14.4



[GIT PULL 00/21] perf/core improvements and fixes

2018-08-01 Thread Arnaldo Carvalho de Melo
Hi Ingo,

Please consider pulling, contains a recently merged
tip/perf/urgent,

- Arnaldo

Test results at the end of this message, as usual.

The following changes since commit c2586cfbb905939b79b49a9121fb0a59a5668fd6:

  Merge remote-tracking branch 'tip/perf/urgent' into perf/core (2018-07-31 
09:55:45 -0300)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git 
tags/perf-core-for-mingo-4.19-20180801

for you to fetch changes up to b912885ab75c7c8aa841c615108afd755d0b97f8:

  perf trace: Do not require --no-syscalls to suppress strace like output 
(2018-08-01 16:20:28 -0300)


perf/core improvements and fixes:

perf trace: (Arnaldo Carvalho de Melo)

- Do not require --no-syscalls to suppress strace like output, i.e.

 # perf trace -e sched:*switch

  will show just sched:sched_switch events, not strace-like formatted
  syscall events, use --syscalls to get the previous behaviour.

  If instead:

 # perf trace

  is used, i.e. no events specified, then --syscalls is implied and
  system wide strace like formatting will be applied to all syscalls.

  The behaviour when just a syscall subset is used with '-e' is unchanged:

 # perf trace -e *sleep,sched:*switch

  will work as before: just the 'nanosleep' syscall will be strace-like
  formatted plus the sched:sched_switch tracepoint event, system wide.

- Allow string table generators to use a default header dir, allowing
  use of them without parameters to see the table it generates on
  stdout, e.g.:

$ tools/perf/trace/beauty/kvm_ioctl.sh
static const char *kvm_ioctl_cmds[] = {
[0x00] = "GET_API_VERSION",
[0x01] = "CREATE_VM",
[0x02] = "GET_MSR_INDEX_LIST",
[0x03] = "CHECK_EXTENSION",

[0xe0] = "CREATE_DEVICE",
[0xe1] = "SET_DEVICE_ATTR",
[0xe2] = "GET_DEVICE_ATTR",
[0xe3] = "HAS_DEVICE_ATTR",
};
$

  See 'ls tools/perf/trace/beauty/*.sh' to see the available string
  table generators.

- Add a generator for IPPROTO_ socket's protocol constants.

perf record: (Kan Liang)

- Fix error out while applying initial delay and using LBR, due to
  the use of a PERF_TYPE_SOFTWARE/PERF_COUNT_SW_DUMMY event to track
  PERF_RECORD_MMAP events while waiting for the initial delay. Such
  events fail when configured asking PERF_SAMPLE_BRANCH_STACK in
  perf_event_attr.sample_type.

perf c2c: (Jiri Olsa)

- Fix report crash for empty browser, when processing a perf.data file
  without events of interest, either because not asked for in
  'perf record' or because the workload didn't triggered such events.

perf list: (Michael Petlan)

- Align metric group description format with PMU event description.

perf tests: (Sandipan Das)

- Fix indexing when invoking subtests, which caused BPF tests to
  get results for the next test in the list, with the last one
  reporting a failure.

eBPF:

- Fix installation directory for header files included from eBPF proggies,
  avoiding clashing with relative paths used to build other software projects
  such as glibc. (Thomas Richter)

- Show better message when failing to load an object. (Arnaldo Carvalho de Melo)

General: (Christophe Leroy)

- Allow overriding MAX_NR_CPUS at compile time, to make the tooling
  usable in systems with less memory, in time this has to be changed
  to properly allocate based on _NPROCESSORS_ONLN.

Architecture specific:

- Update arm64's ThunderX2 implementation defined pmu core events (Ganapatrao 
Kulkarni)

- Fix complex event name parsing in 'perf test' for PowerPC, where the 'umask' 
event
  modifier isn't present. (Sandipan Das)

CoreSight ARM hardware tracing: (Leo Yan)

- Fix start tracing packet handling.

- Support dummy address value for CS_ETM_TRACE_ON packet.

- Generate branch sample when receiving a CS_ETM_TRACE_ON packet.

- Generate branch sample for CS_ETM_TRACE_ON packet.

Signed-off-by: Arnaldo Carvalho de Melo 


Arnaldo Carvalho de Melo (9):
  perf trace beauty: Default header_dir to cwd to work without parms
  tools include uapi: Grab a copy of linux/in.h
  perf beauty: Add a generator for IPPROTO_ socket's protocol constants
  perf trace beauty: Do not print NULL strarray entries
  perf trace beauty: Add beautifiers for 'socket''s 'protocol' arg
  perf trace: Beautify the AF_INET & AF_INET6 'socket' syscall 'protocol' 
args
  perf bpf: Show better message when failing to load an object
  perf bpf: Include uapi/linux/bpf.h from the 'perf trace' script's bpf.h
  perf trace: Do not require --no-syscalls to suppress strace like output

Christophe Leroy (1):
  perf tools: Allow overriding MAX_NR_CPUS at compile time

Ganapatrao Kulkarni (1):
  perf vendor events arm64: Update Thu

[PATCH v4 4/6] powerpc/traps: Print VMA for unhandled signals

2018-08-01 Thread Murilo Opsfelder Araujo
This adds VMA address in the message printed for unhandled signals,
similarly to what other architectures, like x86, print.

Before this patch, a page fault looked like:

  pandafault[61470]: unhandled signal 11 at 17d0 nip 161c lr 
7fff8d185100 code 2

After this patch, a page fault looks like:

  pandafault[6303]: segfault 11 at 17d0 nip 161c lr 7fff93c55100 code 2 
in pandafault[1000+1]

Signed-off-by: Murilo Opsfelder Araujo 
---
 arch/powerpc/kernel/traps.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 4503e22f6ba5..bcefbb1ee771 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -96,6 +96,19 @@ EXPORT_SYMBOL(__debugger_fault_handler);
 #define TM_DEBUG(x...) do { } while(0)
 #endif
 
+static const char *signame(int signr)
+{
+   switch (signr) {
+   case SIGBUS:return "bus error";
+   case SIGFPE:return "floating point exception";
+   case SIGILL:return "illegal instruction";
+   case SIGSEGV:   return "segfault";
+   case SIGTRAP:   return "unhandled trap";
+   }
+
+   return "unknown signal";
+}
+
 /*
  * Trap & Exception support
  */
@@ -317,9 +330,13 @@ static void show_signal_msg(int signr, struct pt_regs 
*regs, int code,
if (!show_unhandled_signals_ratelimited())
return;
 
-   pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x\n",
-   current->comm, current->pid, signr,
+   pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
+   current->comm, current->pid, signame(signr), signr,
addr, regs->nip, regs->link, code);
+
+   print_vma_addr(KERN_CONT " in ", regs->nip);
+
+   pr_cont("\n");
 }
 
 void _exception_pkey(int signr, struct pt_regs *regs, int code,
-- 
2.17.1



[PATCH v4 5/6] powerpc: Add show_user_instructions()

2018-08-01 Thread Murilo Opsfelder Araujo
show_user_instructions() is a slightly modified version of
show_instructions() that allows userspace instruction dump.

This will be useful within show_signal_msg() to dump userspace
instructions of the faulty location.

Here is a sample of what show_user_instructions() outputs:

  pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 
f821ffc1 7c3f0b78 3d22fffe
  pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 
3920 7d234b78 383f0040

The current->comm and current->pid printed can serve as a glue that
links the instructions dump to its originator, allowing messages to be
interleaved in the logs.

Signed-off-by: Murilo Opsfelder Araujo 
---
 arch/powerpc/include/asm/stacktrace.h | 13 +
 arch/powerpc/kernel/process.c | 40 +++
 2 files changed, 53 insertions(+)
 create mode 100644 arch/powerpc/include/asm/stacktrace.h

diff --git a/arch/powerpc/include/asm/stacktrace.h 
b/arch/powerpc/include/asm/stacktrace.h
new file mode 100644
index ..6149b53b3bc8
--- /dev/null
+++ b/arch/powerpc/include/asm/stacktrace.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Stack trace functions.
+ *
+ * Copyright 2018, Murilo Opsfelder Araujo, IBM Corporation.
+ */
+
+#ifndef _ASM_POWERPC_STACKTRACE_H
+#define _ASM_POWERPC_STACKTRACE_H
+
+void show_user_instructions(struct pt_regs *regs);
+
+#endif /* _ASM_POWERPC_STACKTRACE_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e9533b4d2f08..364645ac732c 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1299,6 +1299,46 @@ static void show_instructions(struct pt_regs *regs)
pr_cont("\n");
 }
 
+void show_user_instructions(struct pt_regs *regs)
+{
+   int i;
+   const char *prefix = KERN_INFO "%s[%d]: code: ";
+   unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
+   sizeof(int));
+
+   printk(prefix, current->comm, current->pid);
+
+   for (i = 0; i < instructions_to_print; i++) {
+   int instr;
+
+   if (!(i % 8) && (i > 0)) {
+   pr_cont("\n");
+   printk(prefix, current->comm, current->pid);
+   }
+
+#if !defined(CONFIG_BOOKE)
+   /* If executing with the IMMU off, adjust pc rather
+* than print .
+*/
+   if (!(regs->msr & MSR_IR))
+   pc = (unsigned long)phys_to_virt(pc);
+#endif
+
+   if (probe_kernel_address((unsigned int __user *)pc, instr)) {
+   pr_cont(" ");
+   } else {
+   if (regs->nip == pc)
+   pr_cont("<%08x> ", instr);
+   else
+   pr_cont("%08x ", instr);
+   }
+
+   pc += sizeof(int);
+   }
+
+   pr_cont("\n");
+}
+
 struct regbit {
unsigned long bit;
const char *name;
-- 
2.17.1



[PATCH v4 6/6] powerpc/traps: Show instructions on exceptions

2018-08-01 Thread Murilo Opsfelder Araujo
Call show_user_instructions() in arch/powerpc/kernel/traps.c to dump
instructions at faulty location, useful to debugging.

Before this patch, an unhandled signal message looked like:

  pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 
code 2 in pandafault[1000+1]

After this patch, it looks like:

  pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 
code 2 in pandafault[1000+1]
  pandafault[10524]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 
f821ffc1 7c3f0b78 3d22fffe
  pandafault[10524]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 
3920 7d234b78 383f0040

Signed-off-by: Murilo Opsfelder Araujo 
---
 arch/powerpc/kernel/traps.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index bcefbb1ee771..8494b0ff4904 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -70,6 +70,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
 int (*__debugger)(struct pt_regs *regs) __read_mostly;
@@ -337,6 +338,8 @@ static void show_signal_msg(int signr, struct pt_regs 
*regs, int code,
print_vma_addr(KERN_CONT " in ", regs->nip);
 
pr_cont("\n");
+
+   show_user_instructions(regs);
 }
 
 void _exception_pkey(int signr, struct pt_regs *regs, int code,
-- 
2.17.1



[PATCH v4 3/6] powerpc/traps: Use %lx format in show_signal_msg()

2018-08-01 Thread Murilo Opsfelder Araujo
Use %lx format to print registers.  This avoids having two different
formats and avoids checking for MSR_64BIT, improving readability of the
function.

Even though we could have used %px, which is functionally equivalent to %lx
as per Documentation/core-api/printk-formats.rst, it is not semantically
correct because the data printed are not pointers.  And using %px requires
casting data to (void *).

Besides that, %lx matches the format used in show_regs().

Before this patch:

  pandafault[4808]: unhandled signal 11 at 1718 nip 
1574 lr 7fff935e7a6c code 2

After this patch:

  pandafault[4732]: unhandled signal 11 at 1718 nip 1574 lr 
7fff86697a6c code 2

Signed-off-by: Murilo Opsfelder Araujo 
---
 arch/powerpc/kernel/traps.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 367534b41617..4503e22f6ba5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -311,20 +311,15 @@ static bool show_unhandled_signals_ratelimited(void)
 static void show_signal_msg(int signr, struct pt_regs *regs, int code,
unsigned long addr)
 {
-   const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \
-   "at %08lx nip %08lx lr %08lx code %x\n";
-   const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \
-   "at %016lx nip %016lx lr %016lx code %x\n";
-
if (!unhandled_signal(current, signr))
return;
 
if (!show_unhandled_signals_ratelimited())
return;
 
-   printk(regs->msr & MSR_64BIT ? fmt64 : fmt32,
-  current->comm, current->pid, signr,
-  addr, regs->nip, regs->link, code);
+   pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x\n",
+   current->comm, current->pid, signr,
+   addr, regs->nip, regs->link, code);
 }
 
 void _exception_pkey(int signr, struct pt_regs *regs, int code,
-- 
2.17.1



[PATCH v4 2/6] powerpc/traps: Use an explicit ratelimit state for show_signal_msg()

2018-08-01 Thread Murilo Opsfelder Araujo
Replace printk_ratelimited() by printk() and a default rate limit
burst to limit displaying unhandled signals messages.

This will allow us to call print_vma_addr() in a future patch, which
does not work with printk_ratelimited().

Signed-off-by: Murilo Opsfelder Araujo 
---
 arch/powerpc/kernel/traps.c | 21 -
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index cbd3dc365193..367534b41617 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -301,6 +301,13 @@ void user_single_step_siginfo(struct task_struct *tsk,
info->si_addr = (void __user *)regs->nip;
 }
 
+static bool show_unhandled_signals_ratelimited(void)
+{
+   static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+   return show_unhandled_signals && __ratelimit();
+}
+
 static void show_signal_msg(int signr, struct pt_regs *regs, int code,
unsigned long addr)
 {
@@ -309,11 +316,15 @@ static void show_signal_msg(int signr, struct pt_regs 
*regs, int code,
const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \
"at %016lx nip %016lx lr %016lx code %x\n";
 
-   if (show_unhandled_signals && unhandled_signal(current, signr)) {
-   printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
-  current->comm, current->pid, signr,
-  addr, regs->nip, regs->link, code);
-   }
+   if (!unhandled_signal(current, signr))
+   return;
+
+   if (!show_unhandled_signals_ratelimited())
+   return;
+
+   printk(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+  current->comm, current->pid, signr,
+  addr, regs->nip, regs->link, code);
 }
 
 void _exception_pkey(int signr, struct pt_regs *regs, int code,
-- 
2.17.1



[PATCH v4 1/6] powerpc/traps: Print unhandled signals in a separate function

2018-08-01 Thread Murilo Opsfelder Araujo
Isolate the logic of printing unhandled signals out of _exception_pkey().
No functional change, only code rearrangement.

Signed-off-by: Murilo Opsfelder Araujo 
---
 arch/powerpc/kernel/traps.c | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 0e17dcb48720..cbd3dc365193 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -301,26 +301,32 @@ void user_single_step_siginfo(struct task_struct *tsk,
info->si_addr = (void __user *)regs->nip;
 }
 
+static void show_signal_msg(int signr, struct pt_regs *regs, int code,
+   unsigned long addr)
+{
+   const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \
+   "at %08lx nip %08lx lr %08lx code %x\n";
+   const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \
+   "at %016lx nip %016lx lr %016lx code %x\n";
+
+   if (show_unhandled_signals && unhandled_signal(current, signr)) {
+   printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
+  current->comm, current->pid, signr,
+  addr, regs->nip, regs->link, code);
+   }
+}
 
 void _exception_pkey(int signr, struct pt_regs *regs, int code,
-   unsigned long addr, int key)
+unsigned long addr, int key)
 {
siginfo_t info;
-   const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \
-   "at %08lx nip %08lx lr %08lx code %x\n";
-   const char fmt64[] = KERN_INFO "%s[%d]: unhandled signal %d " \
-   "at %016lx nip %016lx lr %016lx code %x\n";
 
if (!user_mode(regs)) {
die("Exception in kernel mode", regs, signr);
return;
}
 
-   if (show_unhandled_signals && unhandled_signal(current, signr)) {
-   printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
-  current->comm, current->pid, signr,
-  addr, regs->nip, regs->link, code);
-   }
+   show_signal_msg(signr, regs, code, addr);
 
if (arch_irqs_disabled() && !arch_irq_disabled_regs(regs))
local_irq_enable();
-- 
2.17.1



[PATCH v4 0/6] powerpc: Modernize unhandled signals message

2018-08-01 Thread Murilo Opsfelder Araujo
Hi, everyone.

This series was inspired by the need to modernize and display more
informative messages about unhandled signals.

The "unhandled signal NN" is not very informative.  We thought it would
be helpful adding a human-readable message describing what the signal
number means, printing the VMA address, and dumping the instructions.

Before this series:

  pandafault32[4724]: unhandled signal 11 at 15e4 nip 1444 lr 0fe31ef4 
code 2

  pandafault64[4725]: unhandled signal 11 at 1718 nip 
1574 lr 7fff7faa7a6c code 2

After this series:

  pandafault32[4753]: segfault (11) at 15e4 nip 1444 lr fe31ef4 code 2 
in pandafault32[1000+1]
  pandafault32[4753]: code: 4b3c 6000 6042 4b30 9421ffd0 
93e1002c 7c3f0b78 3d201000
  pandafault32[4753]: code: 392905e4 913f0008 813f0008 39400048 <9949> 
3920 7d234b78 397f0030

  pandafault64[4754]: segfault (11) at 1718 nip 1574 lr 7fffb0007a6c 
code 2 in pandafault64[1000+1]
  pandafault64[4754]: code: e8010010 7c0803a6 4bfffef4 4bfffef0 fbe1fff8 
f821ffb1 7c3f0b78 3d22fffe
  pandafault64[4754]: code: 39298818 f93f0030 e93f0030 39400048 <9949> 
3920 7d234b78 383f0050

Link to v3:

  https://lore.kernel.org/lkml/20180731145020.14009-1-muri...@linux.ibm.com/

v3..v4:

  - Added new show_user_instructions() based on the existing
show_instructions()
  - Updated commit messages
  - Replaced signals names table with a tiny function that returns a
literal string for each signal number

Cheers!

Murilo Opsfelder Araujo (6):
  powerpc/traps: Print unhandled signals in a separate function
  powerpc/traps: Use an explicit ratelimit state for show_signal_msg()
  powerpc/traps: Use %lx format in show_signal_msg()
  powerpc/traps: Print VMA for unhandled signals
  powerpc: Add show_user_instructions()
  powerpc/traps: Show instructions on exceptions

 arch/powerpc/include/asm/stacktrace.h | 13 +++
 arch/powerpc/kernel/process.c | 40 +
 arch/powerpc/kernel/traps.c   | 52 +--
 3 files changed, 95 insertions(+), 10 deletions(-)
 create mode 100644 arch/powerpc/include/asm/stacktrace.h

-- 
2.17.1



Re: phandle_cache vs of_detach_node (was Re: [PATCH] powerpc/mobility: Fix node detach/rename problem)

2018-08-01 Thread Michael Bringmann
On 08/01/2018 09:26 AM, Michael Ellerman wrote:
> Frank Rowand  writes:
>> On 07/31/18 07:17, Rob Herring wrote:
>>> On Tue, Jul 31, 2018 at 12:34 AM Michael Ellerman  
>>> wrote:

 Hi Rob/Frank,

 I think we might have a problem with the phandle_cache not interacting
 well with of_detach_node():
>>>
>>> Probably needs a similar fix as this commit did for overlays:
>>>
>>> commit b9952b5218added5577e4a3443969bc20884cea9
>>> Author: Frank Rowand 
>>> Date:   Thu Jul 12 14:00:07 2018 -0700
>>>
>>> of: overlay: update phandle cache on overlay apply and remove
>>>
>>> A comment in the review of the patch adding the phandle cache said that
>>> the cache would have to be updated when modules are applied and removed.
>>> This patch implements the cache updates.
>>>
>>> Fixes: 0b3ce78e90fc ("of: cache phandle nodes to reduce cost of
>>> of_find_node_by_phandle()")
>>> Reported-by: Alan Tull 
>>> Suggested-by: Alan Tull 
>>> Signed-off-by: Frank Rowand 
>>> Signed-off-by: Rob Herring 
>>
>> Agreed.  Sorry about missing the of_detach_node() case.
>>
>>
>>> Really what we need here is an "invalidate phandle" function rather
>>> than free and re-allocate the whole damn cache.
>>
>> The big hammer approach was chosen to avoid the race conditions that
>> would otherwise occur.  OF does not have a locking strategy that
>> would be able to protect against the races.
>>
>> We could maybe implement a slightly smaller hammer by (1) disabling
>> the cache, (2) invalidate a phandle entry in the cache, (3) re-enable
>> the cache.  That is an off the cuff thought - I would have to look
>> a little bit more carefully to make sure it would work.
>>
>> But I don't see a need to add the complexity of the smaller hammer
>> or the bigger hammer of proper locking _unless_ we start seeing that
>> the cache is being freed and re-allocated frequently.  For overlays
>> I don't expect the high frequency because it happens on a per overlay
>> removal basis (not per node removal basis).  For of_detach_node() the
>> event _is_ on a per node removal basis.  Michael, do you expect node
>> removals to be a frequent event with low latency being important?  If
>> so, a rough guess on what the frequency would be?
> 
> No in practice we expect them to be fairly rare and in the thousands at
> most I think.
> 
> TBH you could just disable the phandle_cache on the first detach and
> that would be fine by me. I don't think we've ever noticed phandle
> lookups being much of a problem for us on our hardware.

I ran a couple of migrations with the calls to

of_free_phandle_cache() ... of_populate_phandle_cache()

bracketing the body of

arch/powerpc/platforms/pseries/mobility.c:post_mobility_fixup()

with a patch like,

diff --git a/arch/powerpc/platforms/pseries/mobility.c 
b/arch/powerpc/platforms/pseries/mobility.c
index e245a88..efc9442 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -22,6 +22,9 @@
 #include 
 #include "pseries.h"

+extern int of_free_phandle_cache(void);
+extern void of_populate_phandle_cache(void);
+
 static struct kobject *mobility_kobj;

 struct update_props_workarea {
@@ -343,6 +346,8 @@ void post_mobility_fixup(void)
rc = rtas_call(activate_fw_token, 0, 1, NULL);
} while (rtas_busy_delay(rc));

+   of_free_phandle_cache();
+
if (rc)
printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc);

@@ -354,6 +359,8 @@ void post_mobility_fixup(void)
/* Possibly switch to a new RFI flush type */
pseries_setup_rfi_flush();

+   of_populate_phandle_cache();
+
return;
 }

and did not observe the warnings/crashes that have been
plaguing me.  We will need to move their prototypes out
of drivers/of/of_private.h to include/linux/of.h, though.

> 
> cheers
> 

Michael
 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



Re: [PATCH] powerpc/pasemi: Seach for PCI root bus by compatible property

2018-08-01 Thread Olof Johansson
On Wed, Aug 1, 2018 at 7:36 AM, Michael Ellerman  wrote:
> Christian Zigotzky  writes:
>
>> Just for info: I tested it on my Nemo board today and it works.
>
> Awesome thanks.
>
> That's probably sufficient to merge it, and if it breaks anything we can
> always revert it.

Should be fine, all known boards have the properties in question, and
I doubt anyone has anything but Nemo and Electra/Chitra boards out
there.

It's a virtual root bus, so all boards have it irrespective of what
PCIe is brought out.

(I should hook up my test system and get it into the CI cycle again,
maybe this fall).


Acked-by: Olof Johansson 


-Olof


Re: [RFC PATCH kernel 0/5] powerpc/P9/vfio: Pass through NVIDIA Tesla V100

2018-08-01 Thread Alex Williamson
On Wed, 1 Aug 2018 18:37:35 +1000
Alexey Kardashevskiy  wrote:

> On 01/08/2018 00:29, Alex Williamson wrote:
> > On Tue, 31 Jul 2018 14:03:35 +1000
> > Alexey Kardashevskiy  wrote:
> >   
> >> On 31/07/2018 02:29, Alex Williamson wrote:  
> >>> On Mon, 30 Jul 2018 18:58:49 +1000
> >>> Alexey Kardashevskiy  wrote:  
>  After some local discussions, it was pointed out that force disabling
>  nvlinks won't bring us much as for an nvlink to work, both sides need to
>  enable it so malicious guests cannot penetrate good ones (or a host)
>  unless a good guest enabled the link but won't happen with a well
>  behaving guest. And if two guests became malicious, then can still only
>  harm each other, and so can they via other ways such network. This is
>  different from PCIe as once PCIe link is unavoidably enabled, a well
>  behaving device cannot firewall itself from peers as it is up to the
>  upstream bridge(s) now to decide the routing; with nvlink2, a GPU still
>  has means to protect itself, just like a guest can run "firewalld" for
>  network.
> 
>  Although it would be a nice feature to have an extra barrier between
>  GPUs, is inability to block the links in hypervisor still a blocker for
>  V100 pass through?
> >>>
> >>> How is the NVLink configured by the guest, is it 'on'/'off' or are
> >>> specific routes configured? 
> >>
> >> The GPU-GPU links need not to be blocked and need to be enabled
> >> (==trained) by a driver in the guest. There are no routes between GPUs
> >> in NVLink fabric, these are direct links, it is just a switch on each
> >> side, both switches need to be on for a link to work.  
> > 
> > Ok, but there is at least the possibility of multiple direct links per
> > GPU, the very first diagram I find of NVlink shows 8 interconnected
> > GPUs:
> > 
> > https://www.nvidia.com/en-us/data-center/nvlink/  
> 
> Out design is like the left part of the picture but it is just a detail.

Unless we can specifically identify a direct link vs a mesh link, we
shouldn't be making assumptions about the degree of interconnect.
 
> > So if each switch enables one direct, point to point link, how does the
> > guest know which links to open for which peer device?  
> 
> It uses PCI config space on GPUs to discover the topology.

So do we need to virtualize this config space if we're going to
virtualize the topology?

> > And of course
> > since we can't see the spec, a security audit is at best hearsay :-\  
> 
> Yup, the exact discovery protocol is hidden.

It could be reverse engineered...

> >> The GPU-CPU links - the GPU bit is the same switch, the CPU NVlink state
> >> is controlled via the emulated PCI bridges which I pass through together
> >> with the GPU.  
> > 
> > So there's a special emulated switch, is that how the guest knows which
> > GPUs it can enable NVLinks to?  
> 
> Since it only has PCI config space (there is nothing relevant in the
> device tree at all), I assume (double checking with the NVIDIA folks
> now) the guest driver enables them all, tests which pair works and
> disables the ones which do not. This gives a malicious guest a tiny
> window of opportunity to break into a good guest. Hm :-/

Let's not minimize that window, that seems like a prime candidate for
an exploit.

> >>> If the former, then isn't a non-malicious
> >>> guest still susceptible to a malicious guest?
> >>
> >> A non-malicious guest needs to turn its switch on for a link to a GPU
> >> which belongs to a malicious guest.  
> > 
> > Actual security, or obfuscation, will we ever know...  
>  If the latter, how is  
> >>> routing configured by the guest given that the guest view of the
> >>> topology doesn't match physical hardware?  Are these routes
> >>> deconfigured by device reset?  Are they part of the save/restore
> >>> state?  Thanks,
> > 
> > Still curious what happens to these routes on reset.  Can a later user
> > of a GPU inherit a device where the links are already enabled?  Thanks,  
> 
> I am told that the GPU reset disables links. As a side effect, we get an
> HMI (a hardware fault which reset the host machine) when trying
> accessing the GPU RAM which indicates that the link is down as the
> memory is only accessible via the nvlink. We have special fencing code
> in our host firmware (skiboot) to fence this memory on PCI reset so
> reading from it returns zeroes instead of HMIs.

What sort of reset is required for this?  Typically we rely on
secondary bus reset for GPUs, but it would be a problem if GPUs were to
start implementing FLR and nobody had a spec to learn that FLR maybe
didn't disable the link.  The better approach to me still seems to be
virtualizing these NVLink config registers to an extent that the user
can only enabling links where they have ownership of both ends of the
connection.  Thanks,

Alex


Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()

2018-08-01 Thread Murilo Opsfelder Araujo
Hi, Christophe.

On Wed, Aug 01, 2018 at 08:41:15AM +0200, Christophe LEROY wrote:
>
>
> Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > Remove "Instruction dump:" line by adding a prefix to display current->comm
> > and current->pid, along with the instructions dump.
> >
> > The prefix can serve as a glue that links the instructions dump to its
> > originator, allowing messages to be interleaved in the logs.
> >
> > Before this patch, a page fault looked like:
> >
> >pandafault[10524]: segfault (11) at 17d0 nip 161c lr 
> > 7fffbd295100 code 2 in pandafault[1000+1]
> >Instruction dump:
> >4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe
> >392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040
> >
> > After this patch, it looks like:
> >
> >pandafault[10850]: segfault (11) at 17d0 nip 161c lr 
> > 7fff9f3e5100 code 2 in pandafault[1000+1]
> >pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 
> > f821ffc1 7c3f0b78 3d22fffe
> >pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 
> > 3920 7d234b78 383f0040
> >
> > Signed-off-by: Murilo Opsfelder Araujo 
>
> Does the script scripts/decode_stacktrace.sh also works with this format
> change ?

I've got more feedback from Michael about this.  A better approach would be
having a new show_user_instructions(), a slightly modified version of
show_instructions(), that can be called from within show_signal_msg().

Since _exception_pkey() dies if the exception is in kernel mode, we'll be
safe to call the new show_user_instructions(), without interfering in
scripts/decode_stacktrace.sh.

Cheers
Murilo



Re: [PATCH] powerpc/4xx: Fix error return path in ppc4xx_msi_probe()

2018-08-01 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals

2018-08-01 Thread Murilo Opsfelder Araujo
Hi, Segher.

On Wed, Aug 01, 2018 at 02:49:03AM -0500, Segher Boessenkool wrote:
> On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote:
> > On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote:
> > > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > > I would suggest to instead use a function like this:
> > >
> > > static const char *signame(int signr)
> > > {
> > >   if (signr == SIGBUS)
> > >   return "bus error";
> > >   if (signr == SIGFPE)
> > >   return "floating point exception";
> > >   if (signr == SIGILL)
> > >   return "illegal instruction";
> > >   if (signr == SIGILL)
> > >   return "segfault";
> > >   if (signr == SIGTRAP)
> > >   return "unhandled trap";
> > >   return "unknown signal";
> > > }
> >
> > trivia:
> >
> > Unless the if tests are ordered most to least likely,
> > perhaps it would be better to use a switch/case and
> > let the compiler decide.
>
> That would also show there are two entries for SIGILL (here and in the
> original patch), one of them very wrong.

Good catch.  I'll take care of that in my next respin.

> Check the table with psignal or something?

Nice suggestion.  Thanks!

Cheers
Murilo



Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals

2018-08-01 Thread Murilo Opsfelder Araujo
On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote:
> On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote:
> > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > > This adds a human-readable name in the unhandled signal message.
> > > Before this patch, a page fault looked like:
> > >pandafault[6303]: unhandled signal 11 at 17d0 nip 161c lr 
> > > 7fff93c55100 code 2 in pandafault[1000+1]
> > > After this patch, a page fault looks like:
> > >pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr 
> > > 7fffb63e5100 code 2 in pandafault[13a2a+1]
> ]]
> > > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> []
> > > @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler);
> > >   #define TM_DEBUG(x...) do { } while(0)
> > >   #endif
> > >
> > > +static const char *signames[SIGRTMIN + 1] = {
> > > + "UNKNOWN",
> > > + "SIGHUP",   // 1
> > > + "SIGINT",   // 2
> []
> > I don't think is is worth having that full table when we only use a few
> > of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/)
> >
> > I would suggest to instead use a function like this:
> >
> > static const char *signame(int signr)
> > {
> > if (signr == SIGBUS)
> > return "bus error";
> > if (signr == SIGFPE)
> > return "floating point exception";
> > if (signr == SIGILL)
> > return "illegal instruction";
> > if (signr == SIGILL)
> > return "segfault";
> > if (signr == SIGTRAP)
> > return "unhandled trap";
> > return "unknown signal";
> > }
>
> trivia:
>
> Unless the if tests are ordered most to least likely,
> perhaps it would be better to use a switch/case and
> let the compiler decide.
>
>   switch (signr) {
>   case SIGBUS:return "bus error";
>   case SIGFPE:return "floating point exception";
>   case SIGILL:return "illegal instruction";
>   case SIGSEGV:   return "segfault";
>   case SIGTRAP:   return "unhandled trap";
>   }
>   return "unknown signal";
> }
>

Hi, Joe, Christophe.

That's a nice enhancement.  I'll do that in my next respin.

Cheers
Murilo



Re: [PATCH] perf tools: allow overriding MAX_NR_CPUS at compile time

2018-08-01 Thread Arnaldo Carvalho de Melo
Em Wed, Aug 01, 2018 at 11:37:30AM +0200, Christophe LEROY escreveu:
> 
> 
> Le 03/05/2018 à 15:40, Arnaldo Carvalho de Melo a écrit :
> > Em Fri, Sep 22, 2017 at 01:20:43PM +0200, Christophe Leroy escreveu:
> > > After update of kernel, perf tool doesn't run anymore on my
> > > 32MB RAM powerpc board, but still runs on a 128MB RAM board:
> > 
> > Cleaning up my inbox, found this one, simple enough, still applies,
> > applied.
> 
> Did you finally apply it ? I can't see it in linux-next. Will it be merged
> into 4.19 ?

Sure, applied it finally, 

- Arnaldo


Re: [PATCH] powerpc/pasemi: Seach for PCI root bus by compatible property

2018-08-01 Thread Michael Ellerman
Christian Zigotzky  writes:

> Just for info: I tested it on my Nemo board today and it works.

Awesome thanks.

That's probably sufficient to merge it, and if it breaks anything we can
always revert it.

cheers

> On 31 July 2018 at 2:04PM, Michael Ellerman wrote:
>> Michael Ellerman  writes:
>>> Darren Stevens  writes:
>>>
 Pasemi arch code finds the root of the PCI-e bus by searching the
 device-tree for a node called 'pxp'. But the root bus has a
 compatible property of 'pasemi,rootbus' so search for that instead.

 Signed-off-by: Darren Stevens 
 ---

 This works on the Amigaone X1000, I don't know if this method of
 finding the pci bus was there bcause of earlier firmwares.
>>> Does anyone have another pasemi board they can test this on?
>>>
>>> The last time I plugged mine in it popped the power supply and took out
>>> power to half the office :) - I haven't had a chance to try it since.
>> I actually I remembered I have a device tree lying around from an electra.
>>
>> It has:
>>
>>[I] home:pxp@0,8000(7)(I)> lsprop name compatible
>>name "pxp"
>>compatible   "pasemi,rootbus"
>> "pa-pxp"
>>
>>
>> So it looks like the patch would work fine on it at least.
>>
>> cheers
>>
 diff --git a/arch/powerpc/platforms/pasemi/pci.c 
 b/arch/powerpc/platforms/pasemi/pci.c
 index c7c8607..be62380 100644
 --- a/arch/powerpc/platforms/pasemi/pci.c
 +++ b/arch/powerpc/platforms/pasemi/pci.c
 @@ -216,6 +216,7 @@ static int __init pas_add_bridge(struct device_node 
 *dev)
   void __init pas_pci_init(void)
   {
  struct device_node *np, *root;
 +   int res;
   
  root = of_find_node_by_path("/");
  if (!root) {
 @@ -226,11 +227,11 @@ void __init pas_pci_init(void)
   
  pci_set_flags(PCI_SCAN_ALL_PCIE_DEVS);
   
 -   for (np = NULL; (np = of_get_next_child(root, np)) != NULL;)
 -   if (np->name && !strcmp(np->name, "pxp") && !pas_add_bridge(np))
 -   of_node_get(np);
 -
 -   of_node_put(root);
 +   np = of_find_compatible_node(root, NULL, "pasemi,rootbus");
 +   if (np) {
 +   res = pas_add_bridge(np);
 +   of_node_put(np);
 +   }
   }
   
   void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset)


Re: [PATCH] powerpc/mobility: Fix node detach/rename problem

2018-08-01 Thread Michael Ellerman
Tyrel Datwyler  writes:
> On 07/30/2018 11:42 PM, Michael Ellerman wrote:
>> Tyrel Datwyler  writes:
>>> On 07/29/2018 06:11 AM, Michael Bringmann wrote:
 During LPAR migration, the content of the device tree/sysfs may
 be updated including deletion and replacement of nodes in the
 tree.  When nodes are added to the internal node structures, they
 are appended in FIFO order to a list of nodes maintained by the
 OF code APIs.  When nodes are removed from the device tree, they
 are marked OF_DETACHED, but not actually deleted from the system
 to allow for pointers cached elsewhere in the kernel.  The order
 and content of the entries in the list of nodes is not altered,
 though.

 During LPAR migration some common nodes are deleted and re-added
 e.g. "ibm,platform-facilities".  If a node is re-added to the OF
 node lists, the of_attach_node function checks to make sure that
 the name + ibm,phandle of the to-be-added data is unique.  As the
 previous copy of a re-added node is not modified beyond the addition
 of a bit flag, the code (1) finds the old copy, (2) prints a WARNING
 notice to the console, (3) renames the to-be-added node to avoid
 filename collisions within a directory, and (3) adds entries to
 the sysfs/kernfs.
>>>
>>> So, this patch actually just band aids over the real problem. This is
>>> a long standing problem with several PFO drivers leaking references.
>>> The issue here is that, during the device tree update that follows a
>>> migration. the update of the ibm,platform-facilities node and friends
>>> below are always deleted and re-added on the destination lpar and
>>> subsequently the leaked references prevent the devices nodes from
>>> every actually being properly cleaned up after detach. Thus, leading
>>> to the issue you are observing.
>
> So, it was the end of the day, and I kind of glossed over the issue
> Michael was trying to address with an issue that I remembered that had
> been around for awhile.

Sure, I know that feeling :)

>> Leaking references shouldn't affect the node being detached from the
>> tree though.
>
> No, but it does prevent it from being freed from sysfs which leads to
> the sysfs entry renaming that happens when another node with the same
> name is attached.

OK.

>> See of_detach_node() calling __of_detach_node(), none of that depends on
>> the refcount.
>> 
>> It's only the actual freeing of the node, in of_node_release() that is
>> prevented by leaked reference counts.
>
> Right, but if we did refcount correctly there is the new problem that
> the node is freed and now the phandle cache points at freed memory in
> the case were no invalidation is done.

Yeah that's bad.

I guess no one is supposed to lookup that phandle anymore, but it's
super fragile.

>> So I agree we need to do a better job with the reference counting, but I
>> don't see how it is causing the problem here
>
> Now in regards to the phandle caching somehow I missed that code going
> into OF earlier this year. That should have had at least some
> discussion from our side based on the fact that it is built on dtc
> compiler assumption that there are a set number of phandles that are
> allocated starting at 1..n such that they are monotonically
> increasing. That has a nice fixed size with O(1) lookup time. Phyp
> doesn't guarantee any sort of niceness with nicely ordered phandles.
> From what I've seen there are a subset of phandles that decrease from
> (-1) monotonically, and then there are a bunch of random values for
> cpus and IOAs. Thinking on it might not be that big of a deal as we
> just end up in the case where we have a phandle collision one makes it
> into the cache and is optimized while the other doesn't. On another
> note, they clearly hit a similar issue during overlay attach and
> remove, and as Rob pointed out their solution to handle it is a full
> cache invalidation followed by rescanning the whole tree to rebuild
> it. Seeing as our dynamic lifecycle is node by node this definitely
> adds some overhead.

Yeah I also should have noticed it going in, but despite being
subscribed to the devicetree list I didn't spot it in the flood.

AIUI the 1..n assumption is not about correctness but if our phandles
violate that we may not be getting any benefit.

Anyway yeah lots of things to look at tomorrow :)

cheers



Re: phandle_cache vs of_detach_node (was Re: [PATCH] powerpc/mobility: Fix node detach/rename problem)

2018-08-01 Thread Michael Ellerman
Frank Rowand  writes:
> On 07/31/18 07:17, Rob Herring wrote:
>> On Tue, Jul 31, 2018 at 12:34 AM Michael Ellerman  
>> wrote:
>>>
>>> Hi Rob/Frank,
>>>
>>> I think we might have a problem with the phandle_cache not interacting
>>> well with of_detach_node():
>> 
>> Probably needs a similar fix as this commit did for overlays:
>> 
>> commit b9952b5218added5577e4a3443969bc20884cea9
>> Author: Frank Rowand 
>> Date:   Thu Jul 12 14:00:07 2018 -0700
>> 
>> of: overlay: update phandle cache on overlay apply and remove
>> 
>> A comment in the review of the patch adding the phandle cache said that
>> the cache would have to be updated when modules are applied and removed.
>> This patch implements the cache updates.
>> 
>> Fixes: 0b3ce78e90fc ("of: cache phandle nodes to reduce cost of
>> of_find_node_by_phandle()")
>> Reported-by: Alan Tull 
>> Suggested-by: Alan Tull 
>> Signed-off-by: Frank Rowand 
>> Signed-off-by: Rob Herring 
>
> Agreed.  Sorry about missing the of_detach_node() case.
>
>
>> Really what we need here is an "invalidate phandle" function rather
>> than free and re-allocate the whole damn cache.
>
> The big hammer approach was chosen to avoid the race conditions that
> would otherwise occur.  OF does not have a locking strategy that
> would be able to protect against the races.
>
> We could maybe implement a slightly smaller hammer by (1) disabling
> the cache, (2) invalidate a phandle entry in the cache, (3) re-enable
> the cache.  That is an off the cuff thought - I would have to look
> a little bit more carefully to make sure it would work.
>
> But I don't see a need to add the complexity of the smaller hammer
> or the bigger hammer of proper locking _unless_ we start seeing that
> the cache is being freed and re-allocated frequently.  For overlays
> I don't expect the high frequency because it happens on a per overlay
> removal basis (not per node removal basis).  For of_detach_node() the
> event _is_ on a per node removal basis.  Michael, do you expect node
> removals to be a frequent event with low latency being important?  If
> so, a rough guess on what the frequency would be?

No in practice we expect them to be fairly rare and in the thousands at
most I think.

TBH you could just disable the phandle_cache on the first detach and
that would be fine by me. I don't think we've ever noticed phandle
lookups being much of a problem for us on our hardware.

cheers


Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()

2018-08-01 Thread Michael Ellerman
Christophe LEROY  writes:

> Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
>> Remove "Instruction dump:" line by adding a prefix to display current->comm
>> and current->pid, along with the instructions dump.
>> 
>> The prefix can serve as a glue that links the instructions dump to its
>> originator, allowing messages to be interleaved in the logs.
>> 
>> Before this patch, a page fault looked like:
>> 
>>pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 
>> code 2 in pandafault[1000+1]
>>Instruction dump:
>>4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe
>>392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040
>> 
>> After this patch, it looks like:
>> 
>>pandafault[10850]: segfault (11) at 17d0 nip 161c lr 7fff9f3e5100 
>> code 2 in pandafault[1000+1]
>>pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 
>> f821ffc1 7c3f0b78 3d22fffe
>>pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 
>> 3920 7d234b78 383f0040
>> 
>> Signed-off-by: Murilo Opsfelder Araujo 
>
> Does the script scripts/decode_stacktrace.sh also works with this format 
> change ?

Did it work before? I've never used it.

Would be great if it did work though.

cheers


Re: [PATCH v2 2/2] powerpc/pseries: Wait for completion of hotplug events during PRRN handling

2018-08-01 Thread Michael Ellerman
John Allen  writes:

> On Mon, Jul 23, 2018 at 11:41:24PM +1000, Michael Ellerman wrote:
>>John Allen  writes:
>>
>>> While handling PRRN events, the time to handle the actual hotplug events
>>> dwarfs the time it takes to perform the device tree updates and queue the
>>> hotplug events. In the case that PRRN events are being queued continuously,
>>> hotplug events have been observed to be queued faster than the kernel can
>>> actually handle them. This patch avoids the problem by waiting for a
>>> hotplug request to complete before queueing more hotplug events.

Have you tested this patch in isolation, ie. not with patch 1?

>>So do we need the hotplug work queue at all? Can we just call
>>handle_dlpar_errorlog() directly?
>>
>>Or are we using the work queue to serialise things? And if so would a
>>mutex be better?
>
> Right, the workqueue is meant to serialize all hotplug events and it 
> gets used for more than just PRRN events. I believe the motivation for 
> using the workqueue over a mutex is that KVM guests initiate hotplug 
> events through the hotplug interrupt and can queue fairly large requests 
> meaning that in this scenario, waiting for a lock would block interrupts
> for a while.

OK, but that just means that path needs to schedule work to run later.

> Using the workqueue allows us to serialize hotplug events 
> from different sources in the same way without worrying about the 
> context in which the event is generated.

A lock would be so much simpler.

It looks like we have three callers of queue_hotplug_event(), the dlpar
code, the mobility code and the ras interrupt.

The dlpar code already waits synchronously:

  init_completion(_done);
  queue_hotplug_event(hp_elog, _done, );
  wait_for_completion(_done);

You're changing mobility to do the same (this patch), leaving only the
ras interrupt that actually queues work and returns.


So it really seems like a mutex would do the trick, and the ras
interrupt would be the only case that needs to schedule work for later.

cheers



Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig

2018-08-01 Thread Alex Bounine

On 2018-08-01 05:54 AM, Christoph Hellwig wrote:

On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote:

Platforms with a PCI bus will be offered the RapidIO menu since they may
be want support for a RapidIO PCI device. Platforms without a PCI bus
that might include a RapidIO IP block will need to "select HAS_RAPIDIO"
in the platform-/machine-specific "config ARCH_*" Kconfig entry.

Tested that kernel builds for arm64 with RapidIO subsystem and
switch drivers enabled, also that the modules load successfully
on a custom Aarch64 Qemu model.


As said before, please include it from drivers/Kconfig so that _all_
architectures supporting PCI (or other Rapidio attachements) get it
and not some arbitrary selection of architectures.

As it was replied earlier this is not a random selection of 
architectures but only ones that implement support for RapidIO as system 
bus. If other architectures choose to adopt RapidIO we will include them 
as well.


On some platforms RapidIO can be the only system bus available replacing 
PCI/PCIe or RapidIO can coexist with PCIe.


As it is done now, RapidIO is configured in "Bus Options" (x86/PPC) or 
"Bus Support" (ARMs) sub-menu and from system configuration option it 
should be kept this way.


Current location of RAPIDIO configuration option is familiar to users of 
PowerPC and x86 platforms, and is similarly available in some ARM 
manufacturers kernel code trees.


drivers/Kconfig will be used for configuring drivers for peripheral 
RapidIO devices if/when such device drivers will be published.





Re: [PATCH v2 1/2] powerpc/pseries: Avoid blocking rtas polling handling multiple PRRN events

2018-08-01 Thread Michael Ellerman
Hi John,

I'm still not sure about this one.

John Allen  writes:
> On Mon, Jul 23, 2018 at 11:27:56PM +1000, Michael Ellerman wrote:
>>Hi John,
>>
>>I'm a bit puzzled by this one.
>>
>>John Allen  writes:
>>> When a PRRN event is being handled and another PRRN event comes in, the
>>> second event will block rtas polling waiting on the first to complete,
>>> preventing any further rtas events from being handled. This can be
>>> especially problematic in case that PRRN events are continuously being
>>> queued in which case rtas polling gets indefinitely blocked completely.
>>>
>>> This patch introduces a mutex that prevents any subsequent PRRN events from
>>> running while there is a prrn event being handled, allowing rtas polling to
>>> continue normally.
>>>
>>> Signed-off-by: John Allen 
>>> ---
>>> v2:
>>>   -Unlock prrn_lock when PRRN operations are complete, not after handler is
>>>scheduled.
>>>   -Remove call to flush_work, the previous broken method of serializing
>>>PRRN events.
>>> ---
>>>  arch/powerpc/kernel/rtasd.c | 10 +++---
>>>  1 file changed, 7 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
>>> index 44d66c33d59d..845fc5aec178 100644
>>> --- a/arch/powerpc/kernel/rtasd.c
>>> +++ b/arch/powerpc/kernel/rtasd.c
>>> @@ -284,15 +286,17 @@ static void prrn_work_fn(struct work_struct *work)
>>>  */
>>> pseries_devicetree_update(-prrn_update_scope);
>>> numa_update_cpu_topology(false);
>>> +   mutex_unlock(_lock);
>>>  }
>>>
>>>  static DECLARE_WORK(prrn_work, prrn_work_fn);
>>>
>>>  static void prrn_schedule_update(u32 scope)
>>>  {
>>> -   flush_work(_work);
>>
>>This seems like it's actually the core of the change. Previously we were
>>basically blocking on the flush before continuing.
>
> The idea here is to replace the blocking flush_work with a non-blocking 
> mutex. So rather than waiting on the running PRRN event to complete, we 
> bail out since a PRRN event is already running.

OK, but why is it OK to bail out?

The firmware sent you an error log asking you to do something, with a
scope value that has some meaning, and now you're just going to drop
that on the floor?

Maybe it is OK to just drop these events? Or maybe you're saying that
because the system is crashing under the load of too many events it's OK
to drop the events in this case.

> The situation this is 
> meant to address is flooding the workqueue with PRRN events, which like 
> the situation in patch 2/2, these can be queued up faster than they can 
> actually be handled.

I'm not really sure why this is a problem though.

The current code synchronously processes the events, so there should
only ever be one in flight.

I guess the issue is that each one can queue multiple events on the
hotplug work queue?

But still, we have terabytes of RAM, we should be able to queue a lot
of events before it becomes a problem.

So what exactly is getting flooded, what's the symptom?

If the queuing of the hotplug events is the problem, then why don't we
stop doing that? We could just process them synchronously from the PRRN
update, that would naturally throttle them.

cheers


Re: [PATCH v5 00/11] hugetlb: Factorize hugetlb architecture primitives

2018-08-01 Thread Alexandre Ghiti

On 07/31/2018 10:06 PM, Luiz Capitulino wrote:

On Tue, 31 Jul 2018 06:01:44 +
Alexandre Ghiti  wrote:


[CC linux-mm for inclusion in -mm tree]

In order to reduce copy/paste of functions across architectures and then
make riscv hugetlb port (and future ports) simpler and smaller, this
patchset intends to factorize the numerous hugetlb primitives that are
defined across all the architectures.

[...]


  15 files changed, 139 insertions(+), 382 deletions(-)

I imagine you're mostly interested in non-x86 review at this point, but
as this series is looking amazing:

Reviewed-by: Luiz Capitulino 


It's always good to have another feedback :)
Thanks for your review Luiz,

Alex


Re: [PATCH] powerpc/powernv/opal: Use standard interrupts property when available

2018-08-01 Thread Michael Ellerman
Benjamin Herrenschmidt  writes:

> diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c 
> b/arch/powerpc/platforms/powernv/opal-irqchip.c
> index 9d1b8c0aaf93..46785eaf625d 100644
> --- a/arch/powerpc/platforms/powernv/opal-irqchip.c
> +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
> @@ -216,67 +214,91 @@ int __init opal_event_init(void)
...
>  
>   /* Install interrupt handlers */
>   for (i = 0; i < opal_irq_count; i++) {
> - unsigned int virq;
> - char *name;
> + struct resource *r = _irqs[i];
> + const char *name;
>  
> - /* Get hardware and virtual IRQ */
> - virq = irq_create_mapping(NULL, irqs[i]);
> - if (!virq) {
> - pr_warn("Failed to map irq 0x%x\n", irqs[i]);
> - continue;
> - }
> -
> - if (names[i] && strlen(names[i]))
> - name = kasprintf(GFP_KERNEL, "opal-%s", names[i]);
> + /* Prefix name */
> + if (r->name)
> + name = kasprintf(GFP_KERNEL, "opal-%s", r->name);
>   else
>   name = kasprintf(GFP_KERNEL, "opal");

I'm getting:

root@p85:/proc/irq# find . -name '*opal*'
...
./63/opal-ipmi
./227/opal-
./228/opal-
./231/opal-
./232/opal-
./247/opal-
./248/opal-
./483/opal-
./484/opal-
./487/opal-
./488/opal-
./500/opal-
./510/opal-
./511/opal-
./512/opal-


I fixed it with:

diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c 
b/arch/powerpc/platforms/powernv/opal-irqchip.c
index 41c3303157f7..a2d067925140 100644
--- a/arch/powerpc/platforms/powernv/opal-irqchip.c
+++ b/arch/powerpc/platforms/powernv/opal-irqchip.c
@@ -275,7 +275,7 @@ int __init opal_event_init(void)
const char *name;
 
/* Prefix name */
-   if (r->name)
+   if (r->name && strlen(r->name))
name = kasprintf(GFP_KERNEL, "opal-%s", r->name);
else
name = kasprintf(GFP_KERNEL, "opal");


cheers


Re: Infinite looping observed in __offline_pages

2018-08-01 Thread Michal Hocko
On Wed 01-08-18 21:09:39, Michael Ellerman wrote:
> Michal Hocko  writes:
> > On Wed 25-07-18 13:11:15, John Allen wrote:
> > [...]
> >> Does a failure in do_migrate_range indicate that the range is unmigratable
> >> and the loop in __offline_pages should terminate and goto failed_removal? 
> >> Or
> >> should we allow a certain number of retrys before we
> >> give up on migrating the range?
> >
> > Unfortunatelly not. Migration code doesn't tell a difference between
> > ephemeral and permanent failures.
> 
> What's to stop an ephemeral failure happening repeatedly?

If there is a short term pin on the page that prevents the migration
then the holder of the pin should realease it and the next retry will
succeed the migration. If the page gets freed on the way then it will
not be reallocated because they are isolated already. I can only see
complete OOM to be the reason to fail allocation of the target place
as the migration failure and that is highly unlikely and sooner or later
trigger the oom killer and release some memory.

The biggest problem here is that we cannot tell ephemeral and long term
pins...
-- 
Michal Hocko
SUSE Labs


Re: Infinite looping observed in __offline_pages

2018-08-01 Thread Michael Ellerman
Michal Hocko  writes:
> On Wed 25-07-18 13:11:15, John Allen wrote:
> [...]
>> Does a failure in do_migrate_range indicate that the range is unmigratable
>> and the loop in __offline_pages should terminate and goto failed_removal? Or
>> should we allow a certain number of retrys before we
>> give up on migrating the range?
>
> Unfortunatelly not. Migration code doesn't tell a difference between
> ephemeral and permanent failures.

What's to stop an ephemeral failure happening repeatedly?

cheers


RE: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization

2018-08-01 Thread Y.b. Lu
Hi Richard,

> -Original Message-
> From: Richard Cochran [mailto:richardcoch...@gmail.com]
> Sent: Wednesday, August 1, 2018 2:15 PM
> To: Y.b. Lu 
> Cc: net...@vger.kernel.org; Madalin-cristian Bucur
> ; Rob Herring ; Shawn Guo
> ; David S . Miller ;
> devicet...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org;
> linux-arm-ker...@lists.infradead.org; linux-ker...@vger.kernel.org
> Subject: Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for
> initialization
> 
> On Wed, Aug 01, 2018 at 04:36:40AM +, Y.b. Lu wrote:
> 
> > Could I add a function to calculate a set of default register values
> > to initialize ptp timer when dts method failed to get required
> > properties in driver?
> 
> Yes, it would be ideal if the driver can pick correct values automatically.
> 
> However, the frequency on the FIPER outputs can't be configured
> automatically, and we don't have an API for the user to choose this.

[Y.b. Lu] Thanks a lot. Just let me send out the v2 patch for your reviewing.

> 
> > I think this will be useful. The ptp timer on new platforms (you may
> > see two dts patches in this patchset. Many platforms will be
> > affected.) will work without these dts properties. If user want
> > specific setting, they can set dts properties.
> 
> Sure.
> 
> Thanks,
> Richard


[v2, 3/3] ptp_qoriq: support automatic configuration for ptp timer

2018-08-01 Thread Yangbo Lu
This patch is to support automatic configuration for ptp timer.
If required ptp dts properties are not provided, driver could
try to calculate a set of default configurations to initialize
the ptp timer. This makes the driver work for many boards which
don't have the required ptp dts properties in current kernel.
Also the users could set dts properties by themselves according
to their requirement.

Signed-off-by: Yangbo Lu 
---
Changes for v2:
- Dropped module_param.
---
 drivers/ptp/ptp_qoriq.c   |  111 +++-
 include/linux/fsl/ptp_qoriq.h |6 ++-
 2 files changed, 113 insertions(+), 4 deletions(-)

diff --git a/drivers/ptp/ptp_qoriq.c b/drivers/ptp/ptp_qoriq.c
index a14c317..095c185 100644
--- a/drivers/ptp/ptp_qoriq.c
+++ b/drivers/ptp/ptp_qoriq.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -317,6 +318,105 @@ static int ptp_qoriq_enable(struct ptp_clock_info *ptp,
.enable = ptp_qoriq_enable,
 };
 
+/**
+ * qoriq_ptp_nominal_freq - calculate nominal frequency according to
+ * reference clock frequency
+ *
+ * @clk_src: reference clock frequency
+ *
+ * The nominal frequency is the desired clock frequency.
+ * It should be less than the reference clock frequency.
+ * It should be a factor of 1000MHz.
+ *
+ * Return the nominal frequency
+ */
+static u32 qoriq_ptp_nominal_freq(u32 clk_src)
+{
+   u32 remainder = 0;
+
+   clk_src /= 100;
+   remainder = clk_src % 100;
+   if (remainder) {
+   clk_src -= remainder;
+   clk_src += 100;
+   }
+
+   do {
+   clk_src -= 100;
+
+   } while (1000 % clk_src);
+
+   return clk_src * 100;
+}
+
+/**
+ * qoriq_ptp_auto_config - calculate a set of default configurations
+ *
+ * @qoriq_ptp: pointer to qoriq_ptp
+ * @node: pointer to device_node
+ *
+ * If below dts properties are not provided, this function will be
+ * called to calculate a set of default configurations for them.
+ *   "fsl,tclk-period"
+ *   "fsl,tmr-prsc"
+ *   "fsl,tmr-add"
+ *   "fsl,tmr-fiper1"
+ *   "fsl,tmr-fiper2"
+ *   "fsl,max-adj"
+ *
+ * Return 0 if success
+ */
+static int qoriq_ptp_auto_config(struct qoriq_ptp *qoriq_ptp,
+struct device_node *node)
+{
+   struct clk *clk;
+   u64 freq_comp;
+   u64 max_adj;
+   u32 nominal_freq;
+   u32 clk_src = 0;
+
+   qoriq_ptp->cksel = DEFAULT_CKSEL;
+
+   clk = of_clk_get(node, 0);
+   if (!IS_ERR(clk)) {
+   clk_src = clk_get_rate(clk);
+   clk_put(clk);
+   }
+
+   if (clk_src <= 1UL) {
+   pr_err("error reference clock value, or lower than 100MHz\n");
+   return -EINVAL;
+   }
+
+   nominal_freq = qoriq_ptp_nominal_freq(clk_src);
+   if (!nominal_freq)
+   return -EINVAL;
+
+   qoriq_ptp->tclk_period = 10UL / nominal_freq;
+   qoriq_ptp->tmr_prsc = DEFAULT_TMR_PRSC;
+
+   /* Calculate initial frequency compensation value for TMR_ADD register.
+* freq_comp = ceil(2^32 / freq_ratio)
+* freq_ratio = reference_clock_freq / nominal_freq
+*/
+   freq_comp = ((u64)1 << 32) * nominal_freq;
+   if (do_div(freq_comp, clk_src))
+   freq_comp++;
+
+   qoriq_ptp->tmr_add = freq_comp;
+   qoriq_ptp->tmr_fiper1 = DEFAULT_FIPER1_PERIOD - qoriq_ptp->tclk_period;
+   qoriq_ptp->tmr_fiper2 = DEFAULT_FIPER2_PERIOD - qoriq_ptp->tclk_period;
+
+   /* max_adj = 10 * (freq_ratio - 1.0) - 1
+* freq_ratio = reference_clock_freq / nominal_freq
+*/
+   max_adj = 10ULL * (clk_src - nominal_freq);
+   max_adj = max_adj / nominal_freq - 1;
+   qoriq_ptp->caps.max_adj = max_adj;
+
+   return 0;
+}
+
 static int qoriq_ptp_probe(struct platform_device *dev)
 {
struct device_node *node = dev->dev.of_node;
@@ -332,7 +432,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)
if (!qoriq_ptp)
goto no_memory;
 
-   err = -ENODEV;
+   err = -EINVAL;
 
qoriq_ptp->caps = ptp_qoriq_caps;
 
@@ -351,10 +451,14 @@ static int qoriq_ptp_probe(struct platform_device *dev)
 "fsl,tmr-fiper2", _ptp->tmr_fiper2) ||
of_property_read_u32(node,
 "fsl,max-adj", _ptp->caps.max_adj)) {
-   pr_err("device tree node missing required elements\n");
-   goto no_node;
+   pr_warn("device tree node missing required elements, try 
automatic configuration\n");
+
+   if (qoriq_ptp_auto_config(qoriq_ptp, node))
+   goto no_config;
}
 
+   err = -ENODEV;
+
qoriq_ptp->irq = platform_get_irq(dev, 0);
 
if (qoriq_ptp->irq < 0) {
@@ -436,6 +540,7 @@ static int qoriq_ptp_probe(struct platform_device *dev)

[v2, 1/3] arm64: dts: fsl: add clocks property for fman ptp timer node

2018-08-01 Thread Yangbo Lu
This patch is to add clocks property for fman ptp timer node.

Signed-off-by: Yangbo Lu 
---
Changes for v2:
- None.
---
 arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi 
b/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi
index a56a408..4664c33 100644
--- a/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi
+++ b/arch/arm64/boot/dts/freescale/qoriq-fman3-0.dtsi
@@ -80,4 +80,5 @@ ptp_timer0: ptp-timer@1afe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x0 0x1afe000 0x0 0x1000>;
interrupts = ;
+   clocks = < 3 0>;
 };
-- 
1.7.1



[v2, 2/3] powerpc/mpc85xx: add clocks property for fman ptp timer node

2018-08-01 Thread Yangbo Lu
This patch is to add clocks property for fman ptp timer node.

Signed-off-by: Yangbo Lu 
---
Changes for v2:
- None.
---
 arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi   |1 +
 arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi   |1 +
 arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi  |1 +
 arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi  |1 +
 arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi |1 +
 5 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi 
b/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi
index 6b124f7..9b6cf91 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman-0.dtsi
@@ -100,4 +100,5 @@ ptp_timer0: ptp-timer@4fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x4fe000 0x1000>;
interrupts = <96 2 0 0>;
+   clocks = < 3 0>;
 };
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi 
b/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi
index b80aaf5..e95c11f 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman-1.dtsi
@@ -100,4 +100,5 @@ ptp_timer1: ptp-timer@5fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x5fe000 0x1000>;
interrupts = <97 2 0 0>;
+   clocks = < 3 1>;
 };
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi 
b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi
index d3720fd..d62b36c 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-0.dtsi
@@ -105,4 +105,5 @@ ptp_timer0: ptp-timer@4fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x4fe000 0x1000>;
interrupts = <96 2 0 0>;
+   clocks = < 3 0>;
 };
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi 
b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi
index ae34c20..3102324 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3-1.dtsi
@@ -105,4 +105,5 @@ ptp_timer1: ptp-timer@5fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x5fe000 0x1000>;
interrupts = <97 2 0 0>;
+   clocks = < 3 1>;
 };
diff --git a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi 
b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi
index 02f2755..c90702b 100644
--- a/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi
+++ b/arch/powerpc/boot/dts/fsl/qoriq-fman3l-0.dtsi
@@ -93,4 +93,5 @@ ptp_timer0: ptp-timer@4fe000 {
compatible = "fsl,fman-ptp-timer";
reg = <0x4fe000 0x1000>;
interrupts = <96 2 0 0>;
+   clocks = < 3 0>;
 };
-- 
1.7.1



Re: [RESEND PATCH 6/6] arm64: enable RapidIO menu in Kconfig

2018-08-01 Thread Christoph Hellwig
On Tue, Jul 31, 2018 at 10:29:54AM -0400, Alexei Colin wrote:
> Platforms with a PCI bus will be offered the RapidIO menu since they may
> be want support for a RapidIO PCI device. Platforms without a PCI bus
> that might include a RapidIO IP block will need to "select HAS_RAPIDIO"
> in the platform-/machine-specific "config ARCH_*" Kconfig entry.
> 
> Tested that kernel builds for arm64 with RapidIO subsystem and
> switch drivers enabled, also that the modules load successfully
> on a custom Aarch64 Qemu model.

As said before, please include it from drivers/Kconfig so that _all_
architectures supporting PCI (or other Rapidio attachements) get it
and not some arbitrary selection of architectures.


Re: [PATCH] perf tools: allow overriding MAX_NR_CPUS at compile time

2018-08-01 Thread Christophe LEROY




Le 03/05/2018 à 15:40, Arnaldo Carvalho de Melo a écrit :

Em Fri, Sep 22, 2017 at 01:20:43PM +0200, Christophe Leroy escreveu:

After update of kernel, perf tool doesn't run anymore on my
32MB RAM powerpc board, but still runs on a 128MB RAM board:


Cleaning up my inbox, found this one, simple enough, still applies,
applied.


Did you finally apply it ? I can't see it in linux-next. Will it be 
merged into 4.19 ?


Thanks
Christophe



These all needs to be dynamicly allocated, but still, with this one can
get a functioning tool, apply it.

- Arnaldo
  

~# strace perf
execve("/usr/sbin/perf", ["perf"], [/* 12 vars */]) = -1 ENOMEM (Cannot 
allocate memory)
--- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=0} ---
+++ killed by SIGSEGV +++
Segmentation fault

objdump -x shows that .bss section has a huge size of 24Mbytes:

  27 .bss  016baca8  101cebb8  101cebb8  001cd988  2**3

With especially the following objects having quite big size

10205f80 l O .bss   0014  runtime_cycles_stats
10345f80 l O .bss   0014  runtime_stalled_cycles_front_stats
10485f80 l O .bss   0014  runtime_stalled_cycles_back_stats
105c5f80 l O .bss   0014  runtime_branches_stats
10705f80 l O .bss   0014  runtime_cacherefs_stats
10845f80 l O .bss   0014  runtime_l1_dcache_stats
10985f80 l O .bss   0014  runtime_l1_icache_stats
10ac5f80 l O .bss   0014  runtime_ll_cache_stats
10c05f80 l O .bss   0014  runtime_itlb_cache_stats
10d45f80 l O .bss   0014  runtime_dtlb_cache_stats
10e85f80 l O .bss   0014  runtime_cycles_in_tx_stats
10fc5f80 l O .bss   0014  runtime_transaction_stats
11105f80 l O .bss   0014  runtime_elision_stats
11245f80 l O .bss   0014  runtime_topdown_total_slots
11385f80 l O .bss   0014  runtime_topdown_slots_retired
114c5f80 l O .bss   0014  runtime_topdown_slots_issued
11605f80 l O .bss   0014  runtime_topdown_fetch_bubbles
11745f80 l O .bss   0014  runtime_topdown_recovery_bubbles

This is due to commit 4d255766d28b1 ("perf: Bump max number of cpus
to 1024"), because many tables are sized with MAX_NR_CPUS

This patch gives the opportunity to redefine MAX_NR_CPUS via

make EXTRA_CFLAGS=-DMAX_NR_CPUS=1

Signed-off-by: Christophe Leroy 
---
  tools/perf/perf.h | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index dc442ba21bf6..a9db563da0a9 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -23,7 +23,9 @@ static inline unsigned long long rdclock(void)
return ts.tv_sec * 10ULL + ts.tv_nsec;
  }
  
+#ifndef MAX_NR_CPUS

  #define MAX_NR_CPUS   1024
+#endif
  
  extern const char *input_name;

  extern bool perf_host, perf_guest;
--
2.13.3


Re: [PATCH] powerpc/64s/radix: Fix missing global invalidations when removing copro

2018-08-01 Thread Vaibhav Jain
Frederic Barrat  writes:

> With the optimizations for TLB invalidation from commit 0cef77c7798a
> ("powerpc/64s/radix: flush remote CPUs out of single-threaded
> mm_cpumask"), the scope of a TLBI (global vs. local) can now be
> influenced by the value of the 'copros' counter of the memory context.
>
> When calling mm_context_remove_copro(), the 'copros' counter is
> decremented first before flushing. It may have the unintended side
> effect of sending local TLBIs when we explicitly need global
> invalidations in this case. Thus breaking any nMMU user in a bad and
> unpredictable way.
>
> Fix it by flushing first, before updating the 'copros' counter, so
> that invalidations will be global.
>
> Fixes: 0cef77c7798a ("powerpc/64s/radix: flush remote CPUs out of 
> single-threaded mm_cpumask")
> Signed-off-by: Frederic Barrat 
> ---
Tested-by: Vaibhav Jain 

-- 
Vaibhav Jain 
Linux Technology Center, IBM India Pvt. Ltd.



Re: [RFC 0/4] Virtio uses DMA API for all devices

2018-08-01 Thread Will Deacon
Hi Christoph,

On Wed, Aug 01, 2018 at 01:36:39AM -0700, Christoph Hellwig wrote:
> On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote:
> > On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
> > transport (so definitely not PCI) have historically been advertised by qemu
> > as not being cache coherent, but because the virtio core has bypassed DMA
> > ops then everything has happened to work. If we blindly enable the arch DMA
> > ops,
> 
> No one is suggesting that as far as I can tell.

Apologies: it's me that wants the DMA ops enabled to handle legacy devices
behind an IOMMU, but see below.

> > we'll plumb in the non-coherent ops and start getting data corruption,
> > so we do need a way to quirk virtio as being "always coherent" if we want to
> > use the DMA ops (which we do, because our emulation platforms have an IOMMU
> > for all virtio devices).
> 
> From all that I've gather so far: no you do not want that.  We really
> need to figure out virtio "dma" interacts with the host / device.
> 
> If you look at the current iommu spec it does talk of physical address
> with a little careveout for VIRTIO_F_IOMMU_PLATFORM.

That's true, although that doesn't exist in the legacy virtio spec, and we
have an existing emulation platform which puts legacy virtio devices behind
an IOMMU. Currently, Linux is unable to boot on this platform unless the
IOMMU is configured as bypass. If we can use the coherent IOMMU DMA ops,
then it works perfectly.

> So between that and our discussion in this thread and its previous
> iterations I think we need to stick to the current always physical,
> bypass system dma ops mode of virtio operation as the default.

As above -- that means we hang during boot because we get stuck trying to
bring up a virtio-block device whose DMA is aborted by the IOMMU. The easy
answer is "just upgrade to latest virtio and advertise the presence of the
IOMMU". I'm pushing for that in future platforms, but it seems a shame not
to support the current platform, especially given that other systems do have
hacks in mainline to get virtio working.

> We just need to figure out how to deal with devices that deviate
> from the default.  One things is that VIRTIO_F_IOMMU_PLATFORM really
> should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
> dma tweaks (offsets, cache flushing), which seems well in spirit of
> the original design.  The other issue is VIRTIO_F_IO_BARRIER
> which is very vaguely defined, and which needs a better definition.
> And last but not least we'll need some text explaining the challenges
> of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
> is what would basically cover them, but a good description including
> an explanation of why these matter.

I agree that this makes sense for future revisions of virtio (or perhaps
it can just be a clarification to virtio 1.0), but we're still left in the
dark with legacy devices and it would be nice to have them work on the
systems which currently exist, even if it's a legacy-only hack in the arch
code.

Will


Re: [RFC 0/4] Virtio uses DMA API for all devices

2018-08-01 Thread Christoph Hellwig
On Wed, Aug 01, 2018 at 09:16:38AM +0100, Will Deacon wrote:
> On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
> transport (so definitely not PCI) have historically been advertised by qemu
> as not being cache coherent, but because the virtio core has bypassed DMA
> ops then everything has happened to work. If we blindly enable the arch DMA
> ops,

No one is suggesting that as far as I can tell.

> we'll plumb in the non-coherent ops and start getting data corruption,
> so we do need a way to quirk virtio as being "always coherent" if we want to
> use the DMA ops (which we do, because our emulation platforms have an IOMMU
> for all virtio devices).

>From all that I've gather so far: no you do not want that.  We really
need to figure out virtio "dma" interacts with the host / device.

If you look at the current iommu spec it does talk of physical address
with a little careveout for VIRTIO_F_IOMMU_PLATFORM.

So between that and our discussion in this thread and its previous
iterations I think we need to stick to the current always physical,
bypass system dma ops mode of virtio operation as the default.

We just need to figure out how to deal with devices that deviate
from the default.  One things is that VIRTIO_F_IOMMU_PLATFORM really
should become VIRTIO_F_PLATFORM_DMA to cover the cases of non-iommu
dma tweaks (offsets, cache flushing), which seems well in spirit of
the original design.  The other issue is VIRTIO_F_IO_BARRIER
which is very vaguely defined, and which needs a better definition.
And last but not least we'll need some text explaining the challenges
of hardware devices - I think VIRTIO_F_PLATFORM_DMA + VIRTIO_F_IO_BARRIER
is what would basically cover them, but a good description including
an explanation of why these matter.


[PATCH v8 4/4] selftests/powerpc: update strlen() test to test the new assembly function for PPC32

2018-08-01 Thread Christophe Leroy
This patch adds a test for testing the new assembly strlen() for PPC32

Signed-off-by: Christophe Leroy 
---
 v8: removed defines in ppc_asm.h that were added in v6 (not used anymore since 
v7) ; added missing link to strlen_32.S
 v7: reduced the scope to PPC32
 v6: added additional necessary defines in ppc_asm.h
 v5: no change
 v4: new

 tools/testing/selftests/powerpc/stringloops/Makefile| 5 -
 tools/testing/selftests/powerpc/stringloops/asm/cache.h | 1 +
 tools/testing/selftests/powerpc/stringloops/strlen_32.S | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/asm/cache.h
 create mode 12 tools/testing/selftests/powerpc/stringloops/strlen_32.S

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
b/tools/testing/selftests/powerpc/stringloops/Makefile
index 779b644461c4..9e510de2c07d 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -13,9 +13,12 @@ $(OUTPUT)/memcmp_32: CFLAGS += -m32
 $(OUTPUT)/strlen: strlen.c string.o
 $(OUTPUT)/string.o: string.c
 
+$(OUTPUT)/strlen_32: strlen.c
+$(OUTPUT)/strlen_32: CFLAGS += -m32
+
 ASFLAGS = $(CFLAGS)
 
-TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen
+TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen strlen_32
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/stringloops/asm/cache.h 
b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
new file mode 100644
index ..8a2840831122
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/asm/cache.h
@@ -0,0 +1 @@
+#defineIFETCH_ALIGN_BYTES 4
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen_32.S 
b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
new file mode 12
index ..72b13731b24c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/strlen_32.S
\ No newline at end of file
-- 
2.13.3



[PATCH v8 3/4] powerpc/lib: implement strlen() in assembly for PPC32

2018-08-01 Thread Christophe Leroy
The generic implementation of strlen() reads strings byte per byte.

This patch implements strlen() in assembly based on a read of entire
words, in the same spirit as what some other arches and glibc do.

On a 8xx the time spent in strlen is reduced by 3/4 for long strings.

strlen() selftest on an 8xx provides the following values:

Before the patch (ie with the generic strlen() in lib/string.c):

len 256 : time = 1.195055
len 016 : time = 0.083745
len 008 : time = 0.046828
len 004 : time = 0.028390

After the patch:

len 256 : time = 0.272185 ==> 78% improvment
len 016 : time = 0.040632 ==> 51% improvment
len 008 : time = 0.033060 ==> 29% improvment
len 004 : time = 0.029149 ==> 2% degradation

On a 832x:

Before the patch:

len 256 : time = 0.236125
len 016 : time = 0.018136
len 008 : time = 0.011000
len 004 : time = 0.007229

After the patch:

len 256 : time = 0.094950 ==> 60% improvment
len 016 : time = 0.013357 ==> 26% improvment
len 008 : time = 0.010586 ==> 4% improvment
len 004 : time = 0.008784

Signed-off-by: Christophe Leroy 
---
Changes in v8:
 - No change

Changes in v7:
 - Reduced the scope to PPC32
 - Modified the missalignment handling to be branchless and loopless

Changes in v6:
 - Reworked for having branchless conclusion

Changes in v5:
 - Fixed for PPC64 LITTLE ENDIAN

Changes in v4:
 - Added alignment of the loop
 - doing the andc only if still not 0 as it happends only for bytes above 0x7f 
which is pretty rare in a string

Changes in v3:
 - Made it common to PPC32 and PPC64

Changes in v2:
 - Moved handling of unaligned strings outside of the main path as it is very 
unlikely.
 - Removed the verification of the fourth byte in case none of the three first 
ones are NUL.

 arch/powerpc/include/asm/string.h |  2 +
 arch/powerpc/lib/Makefile |  2 +-
 arch/powerpc/lib/strlen_32.S  | 78 +++
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/strlen_32.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9b8cedf618f4..1647de15a31e 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -50,6 +50,8 @@ static inline void *memset64(uint64_t *p, uint64_t v, 
__kernel_size_t n)
return __memset64(p, v, n * 8);
 }
 #else
+#define __HAVE_ARCH_STRLEN
+
 extern void *memset16(uint16_t *, uint16_t, __kernel_size_t);
 #endif
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index d0ca13ad8231..670286808928 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -12,7 +12,7 @@ CFLAGS_REMOVE_feature-fixups.o = $(CC_FLAGS_FTRACE)
 
 obj-y += string.o alloc.o code-patching.o feature-fixups.o
 
-obj-$(CONFIG_PPC32)+= div64.o copy_32.o crtsavres.o
+obj-$(CONFIG_PPC32)+= div64.o copy_32.o crtsavres.o strlen_32.o
 
 # See corresponding test in arch/powerpc/Makefile
 # 64-bit linker creates .sfpr on demand for final link (vmlinux),
diff --git a/arch/powerpc/lib/strlen_32.S b/arch/powerpc/lib/strlen_32.S
new file mode 100644
index ..0a8d3f64d493
--- /dev/null
+++ b/arch/powerpc/lib/strlen_32.S
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * strlen() for PPC32
+ *
+ * Copyright (C) 2018 Christophe Leroy CS Systemes d'Information.
+ *
+ * Inspired from glibc implementation
+ */
+#include 
+#include 
+#include 
+
+   .text
+
+/*
+ * Algorithm:
+ *
+ * 1) Given a word 'x', we can test to see if it contains any 0 bytes
+ *by subtracting 0x01010101, and seeing if any of the high bits of each
+ *byte changed from 0 to 1. This works because the least significant
+ *0 byte must have had no incoming carry (otherwise it's not the least
+ *significant), so it is 0x00 - 0x01 == 0xff. For all other
+ *byte values, either they have the high bit set initially, or when
+ *1 is subtracted you get a value in the range 0x00-0x7f, none of which
+ *have their high bit set. The expression here is
+ *(x - 0x01010101) & ~x & 0x80808080), which gives 0x when
+ *there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+ *match, but possibly false 0x80 matches in the next more significant
+ *byte to a true match due to carries.  For little-endian this is
+ *of no consequence since the least significant match is the one
+ *we're interested in, but big-endian needs method 2 to find which
+ *byte matches.
+ * 2) Given a word 'x', we can test to see _which_ byte was zero by
+ *calculating ~(((x & ~0x80808080) - 0x80808080 - 1) | x | ~0x80808080).
+ *This produces 0x80 in each byte that was zero, and 0x00 in all
+ *the other bytes. The '| ~0x80808080' clears the low 7 bits in each
+ *byte, and the '| x' part ensures that bytes with the high bit set
+ *produce 0x00. The addition will carry into the high bit of each byte
+ *iff that byte had one of its low 7 bits set. We can then just 

[PATCH v8 2/4] selftests/powerpc: Add test for strlen()

2018-08-01 Thread Christophe Leroy
This patch adds a test for strlen()

string.c contains a copy of strlen() from lib/string.c

The test first tests the correctness of strlen() by comparing
the result with libc strlen(). It tests all cases of alignment.

It them tests the duration of an aligned strlen() on a 4 bytes string,
on a 16 bytes string and on a 256 bytes string.

Signed-off-by: Christophe Leroy 
---
 v8: no change
 v7: no change
 v6: refactorised the benchmark test
 v5: no change
 v4: new

 .../testing/selftests/powerpc/stringloops/Makefile |   5 +-
 .../testing/selftests/powerpc/stringloops/string.c |  36 ++
 .../testing/selftests/powerpc/stringloops/strlen.c | 127 +
 3 files changed, 167 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/stringloops/string.c
 create mode 100644 tools/testing/selftests/powerpc/stringloops/strlen.c

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
b/tools/testing/selftests/powerpc/stringloops/Makefile
index 3862256c2b7d..779b644461c4 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -10,9 +10,12 @@ $(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
 $(OUTPUT)/memcmp_32: memcmp.c
 $(OUTPUT)/memcmp_32: CFLAGS += -m32
 
+$(OUTPUT)/strlen: strlen.c string.o
+$(OUTPUT)/string.o: string.c
+
 ASFLAGS = $(CFLAGS)
 
-TEST_GEN_PROGS := memcmp_32 memcmp_64
+TEST_GEN_PROGS := memcmp_32 memcmp_64 strlen
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/stringloops/string.c 
b/tools/testing/selftests/powerpc/stringloops/string.c
new file mode 100644
index ..d05200481017
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/string.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  linux/lib/string.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * stupid library routines.. The optimized versions should generally be found
+ * as inline code in 
+ *
+ * These are buggy as well..
+ *
+ * * Fri Jun 25 1999, Ingo Oeser 
+ * -  Added strsep() which will replace strtok() soon (because strsep() is
+ *reentrant and should be faster). Use only strsep() in new code, please.
+ *
+ * * Sat Feb 09 2002, Jason Thomas ,
+ *Matthew Hawkins 
+ * -  Kissed strtok() goodbye
+ */
+
+#include 
+
+/**
+ * strlen - Find the length of a string
+ * @s: The string to be sized
+ */
+size_t test_strlen(const char *s)
+{
+   const char *sc;
+
+   for (sc = s; *sc != '\0'; ++sc)
+   /* nothing */;
+   return sc - s;
+}
diff --git a/tools/testing/selftests/powerpc/stringloops/strlen.c 
b/tools/testing/selftests/powerpc/stringloops/strlen.c
new file mode 100644
index ..9055ebc484d0
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/strlen.c
@@ -0,0 +1,127 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include "utils.h"
+
+#define SIZE 256
+#define ITERATIONS 1000
+#define ITERATIONS_BENCH 10
+
+int test_strlen(const void *s);
+
+/* test all offsets and lengths */
+static void test_one(char *s)
+{
+   unsigned long offset;
+
+   for (offset = 0; offset < SIZE; offset++) {
+   int x, y;
+   unsigned long i;
+
+   y = strlen(s + offset);
+   x = test_strlen(s + offset);
+
+   if (x != y) {
+   printf("strlen() returned %d, should have returned %d 
(%p offset %ld)\n", x, y, s, offset);
+
+   for (i = offset; i < SIZE; i++)
+   printf("%02x ", s[i]);
+   printf("\n");
+   }
+   }
+}
+
+static void bench_test(char *s)
+{
+   struct timespec ts_start, ts_end;
+   int i;
+
+   clock_gettime(CLOCK_MONOTONIC, _start);
+
+   for (i = 0; i < ITERATIONS_BENCH; i++)
+   test_strlen(s);
+
+   clock_gettime(CLOCK_MONOTONIC, _end);
+
+   printf("len %3.3d : time = %.6f\n", test_strlen(s), ts_end.tv_sec - 
ts_start.tv_sec + (ts_end.tv_nsec - ts_start.tv_nsec) / 1e9);
+}
+
+static int testcase(void)
+{
+   char *s;
+   unsigned long i;
+
+   s = memalign(128, SIZE);
+   if (!s) {
+   perror("memalign");
+   exit(1);
+   }
+
+   srandom(1);
+
+   memset(s, 0, SIZE);
+   for (i = 0; i < SIZE; i++) {
+   char c;
+
+   do {
+   c = random() & 0x7f;
+   } while (!c);
+   s[i] = c;
+   test_one(s);
+   }
+
+   for (i = 0; i < ITERATIONS; i++) {
+   unsigned long j;
+
+   for (j = 0; j < SIZE; j++) {
+   char c;
+
+   do {
+   c = random() & 0x7f;
+   } while (!c);
+   s[j] = c;
+   }
+   for (j = 0; j < sizeof(long); j++) {
+   

[PATCH v8 1/4] selftests/powerpc: add test for 32 bits memcmp

2018-08-01 Thread Christophe Leroy
This patch renames memcmp test to memcmp_64 and adds
a memcmp_32 test for testing the 32 bits version of memcmp()

Signed-off-by: Christophe Leroy 
---
 v8: rebased on latest powerpc/merge
 v7: no change
 v6: no change
 v5: no change
 v4: new

 tools/testing/selftests/powerpc/stringloops/Makefile| 14 +++---
 tools/testing/selftests/powerpc/stringloops/memcmp_32.S |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)
 create mode 12 tools/testing/selftests/powerpc/stringloops/memcmp_32.S

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
b/tools/testing/selftests/powerpc/stringloops/Makefile
index c60c6172dd3c..3862256c2b7d 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -1,10 +1,18 @@
 # SPDX-License-Identifier: GPL-2.0
 # The loops are all 64-bit code
-CFLAGS += -m64 -maltivec
 CFLAGS += -I$(CURDIR)
 
-TEST_GEN_PROGS := memcmp
-EXTRA_SOURCES := memcmp_64.S ../harness.c
+EXTRA_SOURCES := ../harness.c
+
+$(OUTPUT)/memcmp_64: memcmp.c
+$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
+
+$(OUTPUT)/memcmp_32: memcmp.c
+$(OUTPUT)/memcmp_32: CFLAGS += -m32
+
+ASFLAGS = $(CFLAGS)
+
+TEST_GEN_PROGS := memcmp_32 memcmp_64
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp_32.S 
b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S
new file mode 12
index ..056f2b3af789
--- /dev/null
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp_32.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcmp_32.S
\ No newline at end of file
-- 
2.13.3



Re: [RFC PATCH kernel 0/5] powerpc/P9/vfio: Pass through NVIDIA Tesla V100

2018-08-01 Thread Alexey Kardashevskiy



On 01/08/2018 00:29, Alex Williamson wrote:
> On Tue, 31 Jul 2018 14:03:35 +1000
> Alexey Kardashevskiy  wrote:
> 
>> On 31/07/2018 02:29, Alex Williamson wrote:
>>> On Mon, 30 Jul 2018 18:58:49 +1000
>>> Alexey Kardashevskiy  wrote:
 After some local discussions, it was pointed out that force disabling
 nvlinks won't bring us much as for an nvlink to work, both sides need to
 enable it so malicious guests cannot penetrate good ones (or a host)
 unless a good guest enabled the link but won't happen with a well
 behaving guest. And if two guests became malicious, then can still only
 harm each other, and so can they via other ways such network. This is
 different from PCIe as once PCIe link is unavoidably enabled, a well
 behaving device cannot firewall itself from peers as it is up to the
 upstream bridge(s) now to decide the routing; with nvlink2, a GPU still
 has means to protect itself, just like a guest can run "firewalld" for
 network.

 Although it would be a nice feature to have an extra barrier between
 GPUs, is inability to block the links in hypervisor still a blocker for
 V100 pass through?  
>>>
>>> How is the NVLink configured by the guest, is it 'on'/'off' or are
>>> specific routes configured?   
>>
>> The GPU-GPU links need not to be blocked and need to be enabled
>> (==trained) by a driver in the guest. There are no routes between GPUs
>> in NVLink fabric, these are direct links, it is just a switch on each
>> side, both switches need to be on for a link to work.
> 
> Ok, but there is at least the possibility of multiple direct links per
> GPU, the very first diagram I find of NVlink shows 8 interconnected
> GPUs:
> 
> https://www.nvidia.com/en-us/data-center/nvlink/

Out design is like the left part of the picture but it is just a detail.

> So if each switch enables one direct, point to point link, how does the
> guest know which links to open for which peer device?

It uses PCI config space on GPUs to discover the topology.

> And of course
> since we can't see the spec, a security audit is at best hearsay :-\

Yup, the exact discovery protocol is hidden.


>> The GPU-CPU links - the GPU bit is the same switch, the CPU NVlink state
>> is controlled via the emulated PCI bridges which I pass through together
>> with the GPU.
> 
> So there's a special emulated switch, is that how the guest knows which
> GPUs it can enable NVLinks to?

Since it only has PCI config space (there is nothing relevant in the
device tree at all), I assume (double checking with the NVIDIA folks
now) the guest driver enables them all, tests which pair works and
disables the ones which do not. This gives a malicious guest a tiny
window of opportunity to break into a good guest. Hm :-/


>>> If the former, then isn't a non-malicious
>>> guest still susceptible to a malicious guest?  
>>
>> A non-malicious guest needs to turn its switch on for a link to a GPU
>> which belongs to a malicious guest.
> 
> Actual security, or obfuscation, will we ever know...
 If the latter, how is
>>> routing configured by the guest given that the guest view of the
>>> topology doesn't match physical hardware?  Are these routes
>>> deconfigured by device reset?  Are they part of the save/restore
>>> state?  Thanks,  
> 
> Still curious what happens to these routes on reset.  Can a later user
> of a GPU inherit a device where the links are already enabled?  Thanks,

I am told that the GPU reset disables links. As a side effect, we get an
HMI (a hardware fault which reset the host machine) when trying
accessing the GPU RAM which indicates that the link is down as the
memory is only accessible via the nvlink. We have special fencing code
in our host firmware (skiboot) to fence this memory on PCI reset so
reading from it returns zeroes instead of HMIs.



-- 
Alexey


Re: [RFC 0/4] Virtio uses DMA API for all devices

2018-08-01 Thread Will Deacon
On Tue, Jul 31, 2018 at 03:36:22PM -0500, Benjamin Herrenschmidt wrote:
> On Tue, 2018-07-31 at 10:30 -0700, Christoph Hellwig wrote:
> > > However the question people raise is that DMA API is already full of
> > > arch-specific tricks the likes of which are outlined in your post linked
> > > above. How is this one much worse?
> > 
> > None of these warts is visible to the driver, they are all handled in
> > the architecture (possibly on a per-bus basis).
> > 
> > So for virtio we really need to decide if it has one set of behavior
> > as specified in the virtio spec, or if it behaves exactly as if it
> > was on a PCI bus, or in fact probably both as you lined up.  But no
> > magic arch specific behavior inbetween.
> 
> The only arch specific behaviour is needed in the case where it doesn't
> behave like PCI. In this case, the PCI DMA ops are not suitable, but in
> our secure VMs, we still need to make it use swiotlb in order to bounce
> through non-secure pages.

On arm/arm64, the problem we have is that legacy virtio devices on the MMIO
transport (so definitely not PCI) have historically been advertised by qemu
as not being cache coherent, but because the virtio core has bypassed DMA
ops then everything has happened to work. If we blindly enable the arch DMA
ops, we'll plumb in the non-coherent ops and start getting data corruption,
so we do need a way to quirk virtio as being "always coherent" if we want to
use the DMA ops (which we do, because our emulation platforms have an IOMMU
for all virtio devices).

Will


Re: [next-20180727][qla2xxx][BUG] WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280

2018-08-01 Thread jianchao.wang
Hi Abdul

On 08/01/2018 02:33 PM, Abdul Haleem wrote:
> # mkfs -t ext4 /dev/mapper/mpatha
> mke2fs 1.43.1 (08-Jun-2016)
> Found a dos partition table in /dev/mapper/mpatha
> Proceed anyway? (y,n) y
> Discarding device blocks: 
> qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 --  1 2002.
> qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 --  1 2002.
> qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 --  1 2002.
> qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 --  1 2002.
> WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 
> scsi_end_request+0x250/0x280
...
> NIP [c0690080] scsi_end_request+0x250/0x280
> LR [c068fe80] scsi_end_request+0x50/0x280
> Call Trace:
> [c0027d39b600] [c068fe80] scsi_end_request+0x50/0x280 (unreliable)
> [c0027d39b660] [c06904ac] scsi_io_completion+0x29c/0x7d0
> [c0027d39b710] [c06848e4] scsi_finish_command+0x104/0x1c0
> [c0027d39b790] [c068f148] scsi_softirq_done+0x198/0x1f0
> [c0027d39b820] [c04f2b80] blk_mq_complete_request+0x130/0x1d0
> [c0027d39b860] [c068d27c] scsi_mq_done+0x2c/0xe0
> [c0027d39b890] [d4291080] qla2xxx_qpair_sp_compl+0xa8/0x140 
> [qla2xxx]
> [c0027d39b900] [d42cc9d0] 
> qla2x00_process_completed_request+0x68/0x140 [qla2xxx]
> [ cut here ]
> kernel BUG at block/blk-core.c:3196!

blk_finish_request

BUG_ON(blk_queued_rq(req))

We are also suffering a similar issue on qla2xxx,
the BUG_ON in blk_finish_request is triggered while there are lots of command 
aborted.
The root cause should be qla2xxx driver still invoke scsi_done for an aborted 
command
and cause race between requeue path and normal complete path.

Add Himanshu Madhani from qlogic team.
It seems that they are working on this.

Thanks
Jianchao



Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals

2018-08-01 Thread Segher Boessenkool
On Wed, Aug 01, 2018 at 12:03:50AM -0700, Joe Perches wrote:
> On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote:
> > Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > I would suggest to instead use a function like this:
> > 
> > static const char *signame(int signr)
> > {
> > if (signr == SIGBUS)
> > return "bus error";
> > if (signr == SIGFPE)
> > return "floating point exception";
> > if (signr == SIGILL)
> > return "illegal instruction";
> > if (signr == SIGILL)
> > return "segfault";
> > if (signr == SIGTRAP)
> > return "unhandled trap";
> > return "unknown signal";
> > }
> 
> trivia:
> 
> Unless the if tests are ordered most to least likely,
> perhaps it would be better to use a switch/case and
> let the compiler decide.

That would also show there are two entries for SIGILL (here and in the
original patch), one of them very wrong.

Check the table with psignal or something?


Segher


Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals

2018-08-01 Thread Joe Perches
On Wed, 2018-08-01 at 08:37 +0200, Christophe LEROY wrote:
> Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :
> > This adds a human-readable name in the unhandled signal message.
> > Before this patch, a page fault looked like:
> >pandafault[6303]: unhandled signal 11 at 17d0 nip 161c lr 
> > 7fff93c55100 code 2 in pandafault[1000+1]
> > After this patch, a page fault looks like:
> >pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr 
> > 7fffb63e5100 code 2 in pandafault[13a2a+1]
]]
> > diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
[]
> > @@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler);
> >   #define TM_DEBUG(x...) do { } while(0)
> >   #endif
> >   
> > +static const char *signames[SIGRTMIN + 1] = {
> > +   "UNKNOWN",
> > +   "SIGHUP",   // 1
> > +   "SIGINT",   // 2
[]
> I don't think is is worth having that full table when we only use a few 
> of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/)
> 
> I would suggest to instead use a function like this:
> 
> static const char *signame(int signr)
> {
>   if (signr == SIGBUS)
>   return "bus error";
>   if (signr == SIGFPE)
>   return "floating point exception";
>   if (signr == SIGILL)
>   return "illegal instruction";
>   if (signr == SIGILL)
>   return "segfault";
>   if (signr == SIGTRAP)
>   return "unhandled trap";
>   return "unknown signal";
> }

trivia:

Unless the if tests are ordered most to least likely,
perhaps it would be better to use a switch/case and
let the compiler decide.

switch (signr) {
case SIGBUS:return "bus error";
case SIGFPE:return "floating point exception";
case SIGILL:return "illegal instruction";
case SIGSEGV:   return "segfault";
case SIGTRAP:   return "unhandled trap";
}
return "unknown signal";
}


Re: [PATCH v3 9/9] powerpc/traps: Add line prefix in show_instructions()

2018-08-01 Thread Christophe LEROY




Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :

Remove "Instruction dump:" line by adding a prefix to display current->comm
and current->pid, along with the instructions dump.

The prefix can serve as a glue that links the instructions dump to its
originator, allowing messages to be interleaved in the logs.

Before this patch, a page fault looked like:

   pandafault[10524]: segfault (11) at 17d0 nip 161c lr 7fffbd295100 
code 2 in pandafault[1000+1]
   Instruction dump:
   4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 f821ffc1 7c3f0b78 3d22fffe
   392988d0 f93f0020 e93f0020 39400048 <9949> 3920 7d234b78 383f0040

After this patch, it looks like:

   pandafault[10850]: segfault (11) at 17d0 nip 161c lr 7fff9f3e5100 
code 2 in pandafault[1000+1]
   pandafault[10850]: code: 4bfffeec 4bfffee8 3c401002 38427f00 fbe1fff8 
f821ffc1 7c3f0b78 3d22fffe
   pandafault[10850]: code: 392988d0 f93f0020 e93f0020 39400048 <9949> 
3920 7d234b78 383f0040

Signed-off-by: Murilo Opsfelder Araujo 


Does the script scripts/decode_stacktrace.sh also works with this format 
change ?



---
  arch/powerpc/kernel/process.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e78799a8855a..d12143e7d8f9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1265,16 +1265,19 @@ static int instructions_to_print = 16;
  void show_instructions(struct pt_regs *regs)
  {
int i;
+   const char *prefix = KERN_INFO "%s[%d]: code: ";
unsigned long pc = regs->nip - (instructions_to_print * 3 / 4 *
sizeof(int));
  
-	printk("Instruction dump:");

+   printk(prefix, current->comm, current->pid);
  
  	for (i = 0; i < instructions_to_print; i++) {

int instr;
  
-		if (!(i % 8))

+   if (!(i % 8) && (i > 0)) {
pr_cont("\n");
+   printk(prefix, current->comm, current->pid);
+   }
  
  #if !defined(CONFIG_BOOKE)

/* If executing with the IMMU off, adjust pc rather



Re: [PATCH v3 5/9] powerpc/traps: Print signal name for unhandled signals

2018-08-01 Thread Christophe LEROY




Le 31/07/2018 à 16:50, Murilo Opsfelder Araujo a écrit :

This adds a human-readable name in the unhandled signal message.

Before this patch, a page fault looked like:

   pandafault[6303]: unhandled signal 11 at 17d0 nip 161c lr 
7fff93c55100 code 2 in pandafault[1000+1]

After this patch, a page fault looks like:

   pandafault[6352]: segfault (11) at 13a2a09f8 nip 13a2a086c lr 7fffb63e5100 
code 2 in pandafault[13a2a+1]

Signed-off-by: Murilo Opsfelder Araujo 
---
  arch/powerpc/kernel/traps.c | 39 +++--
  1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 1c4f06fca370..e71f12bca146 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -96,6 +96,41 @@ EXPORT_SYMBOL(__debugger_fault_handler);
  #define TM_DEBUG(x...) do { } while(0)
  #endif
  
+static const char *signames[SIGRTMIN + 1] = {

+   "UNKNOWN",
+   "SIGHUP", // 1
+   "SIGINT", // 2
+   "SIGQUIT",// 3
+   "SIGILL", // 4
+   "unhandled trap", // 5 = SIGTRAP
+   "SIGABRT",// 6 = SIGIOT
+   "bus error",  // 7 = SIGBUS
+   "floating point exception",   // 8 = SIGFPE
+   "illegal instruction",// 9 = SIGILL
+   "SIGUSR1",// 10
+   "segfault",   // 11 = SIGSEGV
+   "SIGUSR2",// 12
+   "SIGPIPE",// 13
+   "SIGALRM",// 14
+   "SIGTERM",// 15
+   "SIGSTKFLT",  // 16
+   "SIGCHLD",// 17
+   "SIGCONT",// 18
+   "SIGSTOP",// 19
+   "SIGTSTP",// 20
+   "SIGTTIN",// 21
+   "SIGTTOU",// 22
+   "SIGURG", // 23
+   "SIGXCPU",// 24
+   "SIGXFSZ",// 25
+   "SIGVTALRM",  // 26
+   "SIGPROF",// 27
+   "SIGWINCH",   // 28
+   "SIGIO",  // 29 = SIGPOLL = SIGLOST
+   "SIGPWR", // 30
+   "SIGSYS", // 31 = SIGUNUSED
+};


I don't think is is worth having that full table when we only use a few 
of them. (As discussed in v1 https://patchwork.ozlabs.org/patch/948802/)


I would suggest to instead use a function like this:

static const char *signame(int signr)
{
if (signr == SIGBUS)
return "bus error";
if (signr == SIGFPE)
return "floating point exception";
if (signr == SIGILL)
return "illegal instruction";
if (signr == SIGILL)
return "segfault";
if (signr == SIGTRAP)
return "unhandled trap";
return "unknown signal";
}

Christophe


+
  /*
   * Trap & Exception support
   */
@@ -314,8 +349,8 @@ static void show_signal_msg(int signr, struct pt_regs 
*regs, int code,
if (!unhandled_signal(current, signr))
return;
  
-	pr_info("%s[%d]: unhandled signal %d at %lx nip %lx lr %lx code %x",

-   current->comm, current->pid, signr,
+   pr_info("%s[%d]: %s (%d) at %lx nip %lx lr %lx code %x",
+   current->comm, current->pid, signames[signr], signr,
addr, regs->nip, regs->link, code);
  
  	print_vma_addr(KERN_CONT " in ", regs->nip);




[next-20180727][qla2xxx][BUG] WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 scsi_end_request+0x250/0x280

2018-08-01 Thread Abdul Haleem
Greeting's

linux-next kernel crashes when creating file system on scsi disks using
mkfs.ext4 (Discarding device blocks) 

Machine: Power 8 Power VM LPAR
kernel : 4.18.0-rc6-next-20180727
adapter : Fibre Channel [0c04]: QLogic Corp. ISP2532-based 8Gb Fibre Channel to 
PCI Express HBA [1077:2532]
Test:  mkfs.ext4 /dev/mapper/maptha

Above command triggers WARN_ON_ONCE at line:
0xc0690080 is in scsi_end_request (drivers/scsi/scsi_lib.c:691)
686 
687 if (blk_queue_add_random(q))
688 add_disk_randomness(req->rq_disk);
689 
690 if (!blk_rq_is_scsi(req)) {
691 WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED));
692 cmd->flags &= ~SCMD_INITIALIZED;
693 destroy_rcu_head(>rcu);
694 }
695 

followed by kernel crash

# mkfs -t ext4 /dev/mapper/mpatha
mke2fs 1.43.1 (08-Jun-2016)
Found a dos partition table in /dev/mapper/mpatha
Proceed anyway? (y,n) y
Discarding device blocks: 
qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 --  1 2002.
qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 --  1 2002.
qla2xxx [0106:a0:00.1]-801c:2: Abort command issued nexus=2:1:0 --  1 2002.
qla2xxx [0106:a0:00.0]-801c:0: Abort command issued nexus=0:1:0 --  1 2002.
WARNING: CPU: 12 PID: 511 at drivers/scsi/scsi_lib.c:691 
scsi_end_request+0x250/0x280
Modules linked in: xt_CHECKSUM(E) ipt_MASQUERADE(E) tun(E) kvm_pr(E)
kvm(E) ip6t_rpfilter(E) ip6t_REJECT(E) nf_reject_ipv6(E) ipt_REJECT(E)
nf_reject_ipv4(E) xt_conntrack(E) ip_set(E) nfnetlink(E) ebtable_nat(E)
ebtable_broute(E) bridge(E) stp(E) llc(E) ip6table_raw(E)
ip6table_nat(E) nf_nat_ipv6(E) ip6table_security(E) ip6table_mangle(E)
iptable_raw(E) iptable_nat(E) nf_nat_ipv4(E) nf_nat(E) nf_conntrack(E)
nf_defrag_ipv6(E) nf_defrag_ipv4(E) libcrc32c(E) iptable_security(E)
iptable_mangle(E) ebtable_filter(E) ebtables(E) ip6table_filter(E)
ip6_tables(E) iptable_filter(E) ip_tables(E) dm_service_time(E) xts(E)
vmx_crypto(E) pseries_rng(E) nfsd(E) auth_rpcgss(E) nfs_acl(E) lockd(E)
grace(E) sunrpc(E) dm_multipath(E) dm_mod(E) sch_fq_codel(E) ext4(E)
mbcache(E) jbd2(E) fscrypto(E) sr_mod(E)
 cdrom(E) sd_mod(E) ibmvscsi(E) scsi_transport_srp(E) ibmveth(E)
qla2xxx(E) nvme_fc(E) nvme_fabrics(E) nvme_core(E) scsi_transport_fc(E)
tg3(E)
CPU: 12 PID: 511 Comm: kworker/12:2 Tainted: GE 
4.18.0-rc6-next-20180727-autotest-autotest #1
Workqueue: qla2xxx_wq qla25xx_free_rsp_que [qla2xxx]
NIP:  c0690080 LR: c068fe80 CTR: d5050788
REGS: c0027d39b380 TRAP: 0700   Tainted: GE  
(4.18.0-rc6-next-20180727-autotest-autotest)
MSR:  8282b033   CR: 24002824  XER: 
2000
CFAR: c068feb8 IRQMASK: 1 
GPR00: c068fe80 c0027d39b600 c113de00  
GPR04:    0001 
GPR08: c0027dce3000 0001  d4330708 
GPR12: 2200 cec57600 c012cf78 c00289561b40 
GPR16:   00044000 4007 
GPR20: c0027dce3860   c00280db9c60 
GPR24: 0002 c153567c c002712a1cc8  
GPR28:  c002712a1cc8 c0027e217000 c002708efe00 
NIP [c0690080] scsi_end_request+0x250/0x280
LR [c068fe80] scsi_end_request+0x50/0x280
Call Trace:
[c0027d39b600] [c068fe80] scsi_end_request+0x50/0x280 (unreliable)
[c0027d39b660] [c06904ac] scsi_io_completion+0x29c/0x7d0
[c0027d39b710] [c06848e4] scsi_finish_command+0x104/0x1c0
[c0027d39b790] [c068f148] scsi_softirq_done+0x198/0x1f0
[c0027d39b820] [c04f2b80] blk_mq_complete_request+0x130/0x1d0
[c0027d39b860] [c068d27c] scsi_mq_done+0x2c/0xe0
[c0027d39b890] [d4291080] qla2xxx_qpair_sp_compl+0xa8/0x140 
[qla2xxx]
[c0027d39b900] [d42cc9d0] 
qla2x00_process_completed_request+0x68/0x140 [qla2xxx]
[ cut here ]
kernel BUG at block/blk-core.c:3196!
[c0027d39b970] [d42cd4ac] qla2x00_status_entry.isra.7+0x4f4/0x1750 
[qla2xxx]
Oops: Exception in kernel mode, sig: 5 [#1]
[c0027d39bb00] [d42cfab0] 
qla24xx_process_response_queue+0x7d8/0xba0 [qla2xxx]
LE SMP NR_CPUS=2048 
[c0027d39bc40] [d42f8c48] qla25xx_free_rsp_que+0x1f0/0x270 [qla2xxx]
NUMA pSeries
[c0027d39bc80] [c012478c] process_one_work+0x24c/0x500
Modules linked in: xt_CHECKSUM(E)
[c0027d39bd20] [c0124acc] worker_thread+0x8c/0x590
 ipt_MASQUERADE(E)
[c0027d39bdc0] [c012d0c8] kthread+0x158/0x1a0
 tun(E)
[c0027d39be30] [c000b65c] ret_from_kernel_thread+0x5c/0x80
 kvm_pr(E)
Instruction dump:
 kvm(E)
7f23cb78 
 ip6t_rpfilter(E)
4bed1d15 
 ip6t_REJECT(E)
6000 3c62003f 
 nf_reject_ipv6(E)
e8637888 
 ipt_REJECT(E)
7f24cb78 
 

Re: [PATCH 3/3] ptp_qoriq: convert to use module parameters for initialization

2018-08-01 Thread Richard Cochran
On Wed, Aug 01, 2018 at 04:36:40AM +, Y.b. Lu wrote:

> Could I add a function to calculate a set of default register values
> to initialize ptp timer when dts method failed to get required
> properties in driver?

Yes, it would be ideal if the driver can pick correct values
automatically.

However, the frequency on the FIPER outputs can't be configured
automatically, and we don't have an API for the user to choose this.

> I think this will be useful. The ptp timer on new platforms (you may
> see two dts patches in this patchset. Many platforms will be
> affected.) will work without these dts properties. If user want
> specific setting, they can set dts properties.

Sure.

Thanks,
Richard


Re: [PATCH v6 8/8] powernv/pseries: consolidate code for mce early handling.

2018-08-01 Thread Nicholas Piggin
On Mon, 9 Jul 2018 18:02:39 +0200
Michal Suchánek  wrote:

> On Fri, 6 Jul 2018 19:40:24 +1000
> Nicholas Piggin  wrote:
> 
> > On Wed, 04 Jul 2018 23:30:12 +0530
> > Mahesh J Salgaonkar  wrote:
> >   
> > > From: Mahesh Salgaonkar 
> > > 
> > > Now that other platforms also implements real mode mce handler,
> > > lets consolidate the code by sharing existing powernv machine check
> > > early code. Rename machine_check_powernv_early to
> > > machine_check_common_early and reuse the code.
> > > 
> > > Signed-off-by: Mahesh Salgaonkar 
> > > ---
> > >  arch/powerpc/kernel/exceptions-64s.S |   56
> > > +++--- 1 file changed, 11
> > > insertions(+), 45 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/kernel/exceptions-64s.S
> > > b/arch/powerpc/kernel/exceptions-64s.S index
> > > 0038596b7906..3e877ec55d50 100644 ---
> > > a/arch/powerpc/kernel/exceptions-64s.S +++
> > > b/arch/powerpc/kernel/exceptions-64s.S @@ -243,14 +243,13 @@
> > > EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
> > > SET_SCRATCH0(r13) /* save r13 */
> > > EXCEPTION_PROLOG_0(PACA_EXMC) BEGIN_FTR_SECTION
> > > - b   machine_check_powernv_early
> > > + b   machine_check_common_early
> > >  FTR_SECTION_ELSE
> > >   b   machine_check_pSeries_0
> > >  ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
> > >  EXC_REAL_END(machine_check, 0x200, 0x100)
> > >  EXC_VIRT_NONE(0x4200, 0x100)
> > > -TRAMP_REAL_BEGIN(machine_check_powernv_early)
> > > -BEGIN_FTR_SECTION
> > > +TRAMP_REAL_BEGIN(machine_check_common_early)
> > >   EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > >   /*
> > >* Register contents:
> > > @@ -306,7 +305,9 @@ BEGIN_FTR_SECTION
> > >   /* Save r9 through r13 from EXMC save area to stack frame.
> > > */ EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > >   mfmsr   r11 /* get MSR value */
> > > +BEGIN_FTR_SECTION
> > >   ori r11,r11,MSR_ME  /* turn on ME bit
> > > */ +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> > >   ori r11,r11,MSR_RI  /* turn on RI bit
> > > */ LOAD_HANDLER(r12, machine_check_handle_early)
> > >  1:   mtspr   SPRN_SRR0,r12
> > > @@ -325,7 +326,6 @@ BEGIN_FTR_SECTION
> > >   andcr11,r11,r10 /* Turn off MSR_ME
> > > */ b  1b
> > >   b   .   /* prevent speculative execution */
> > > -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> > >  
> > >  TRAMP_REAL_BEGIN(machine_check_pSeries)
> > >   .globl machine_check_fwnmi
> > > @@ -333,7 +333,7 @@ machine_check_fwnmi:
> > >   SET_SCRATCH0(r13)   /* save r13 */
> > >   EXCEPTION_PROLOG_0(PACA_EXMC)
> > >  BEGIN_FTR_SECTION
> > > - b   machine_check_pSeries_early
> > > + b   machine_check_common_early
> > >  END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> > >  machine_check_pSeries_0:
> > >   EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
> > > @@ -346,45 +346,6 @@ machine_check_pSeries_0:
> > >  
> > >  TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
> > >  
> > > -TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> > > -BEGIN_FTR_SECTION
> > > - EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > > - mr  r10,r1  /* Save r1 */
> > > - ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency
> > > stack */
> > > - subir1,r1,INT_FRAME_SIZE/* alloc stack
> > > frame */
> > > - mfspr   r11,SPRN_SRR0   /* Save SRR0 */
> > > - mfspr   r12,SPRN_SRR1   /* Save SRR1 */
> > > - EXCEPTION_PROLOG_COMMON_1()
> > > - EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > > - EXCEPTION_PROLOG_COMMON_3(0x200)
> > > - addir3,r1,STACK_FRAME_OVERHEAD
> > > - BRANCH_LINK_TO_FAR(machine_check_early) /* Function call
> > > ABI */ -
> > > - /* Move original SRR0 and SRR1 into the respective regs */
> > > - ld  r9,_MSR(r1)
> > > - mtspr   SPRN_SRR1,r9
> > > - ld  r3,_NIP(r1)
> > > - mtspr   SPRN_SRR0,r3
> > > - ld  r9,_CTR(r1)
> > > - mtctr   r9
> > > - ld  r9,_XER(r1)
> > > - mtxer   r9
> > > - ld  r9,_LINK(r1)
> > > - mtlrr9
> > > - REST_GPR(0, r1)
> > > - REST_8GPRS(2, r1)
> > > - REST_GPR(10, r1)
> > > - ld  r11,_CCR(r1)
> > > - mtcrr11
> > > - REST_GPR(11, r1)
> > > - REST_2GPRS(12, r1)
> > > - /* restore original r1. */
> > > - ld  r1,GPR1(r1)
> > > - SET_SCRATCH0(r13)   /* save r13 */
> > > - EXCEPTION_PROLOG_0(PACA_EXMC)
> > > - b   machine_check_pSeries_0
> > > -END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> > > -
> > >  EXC_COMMON_BEGIN(machine_check_common)
> > >   /*
> > >* Machine check is different because we use a different
> > > @@ -483,6 +444,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
> > >   bl  machine_check_early
> > >   std r3,RESULT(r1)   /* Save result */
> > >   ld  r12,_MSR(r1)
> > > +BEGIN_FTR_SECTION
> > > + bne 9f  /* pSeries: continue
> > > to V mode. */ +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> > 
> > Should this be "b 9f" ? Although...
> >   
> > >  
> > >  #ifdef   CONFIG_PPC_P7_NAP
> > >   /*
> > > @@ -564,7 +528,9 @@ 

Re: [PATCH v6 5/8] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-08-01 Thread Nicholas Piggin
On Wed, 04 Jul 2018 23:28:21 +0530
Mahesh J Salgaonkar  wrote:

> From: Mahesh Salgaonkar 
> 
> On pseries, as of today system crashes if we get a machine check
> exceptions due to SLB errors. These are soft errors and can be fixed by
> flushing the SLBs so the kernel can continue to function instead of
> system crash. We do this in real mode before turning on MMU. Otherwise
> we would run into nested machine checks. This patch now fetches the
> rtas error log in real mode and flushes the SLBs on SLB errors.
> 
> Signed-off-by: Mahesh Salgaonkar 
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
>  arch/powerpc/include/asm/machdep.h|1 
>  arch/powerpc/kernel/exceptions-64s.S  |   42 +
>  arch/powerpc/kernel/mce.c |   16 +++-
>  arch/powerpc/mm/slb.c |6 +++
>  arch/powerpc/platforms/pseries/pseries.h  |1 
>  arch/powerpc/platforms/pseries/ras.c  |   51 
> +
>  arch/powerpc/platforms/pseries/setup.c|1 
>  8 files changed, 116 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index 50ed64fba4ae..cc00a7088cf3 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -487,6 +487,7 @@ extern void hpte_init_native(void);
>  
>  extern void slb_initialize(void);
>  extern void slb_flush_and_rebolt(void);
> +extern void slb_flush_and_rebolt_realmode(void);
>  
>  extern void slb_vmalloc_update(void);
>  extern void slb_set_size(u16 size);
> diff --git a/arch/powerpc/include/asm/machdep.h 
> b/arch/powerpc/include/asm/machdep.h
> index ffe7c71e1132..fe447e0d4140 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -108,6 +108,7 @@ struct machdep_calls {
>  
>   /* Early exception handlers called in realmode */
>   int (*hmi_exception_early)(struct pt_regs *regs);
> + int (*machine_check_early)(struct pt_regs *regs);
>  
>   /* Called during machine check exception to retrive fixup address. */
>   bool(*mce_check_early_recovery)(struct pt_regs *regs);
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> index f283958129f2..0038596b7906 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -332,6 +332,9 @@ TRAMP_REAL_BEGIN(machine_check_pSeries)
>  machine_check_fwnmi:
>   SET_SCRATCH0(r13)   /* save r13 */
>   EXCEPTION_PROLOG_0(PACA_EXMC)
> +BEGIN_FTR_SECTION
> + b   machine_check_pSeries_early
> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
>  machine_check_pSeries_0:
>   EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
>   /*
> @@ -343,6 +346,45 @@ machine_check_pSeries_0:
>  
>  TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
>  
> +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> +BEGIN_FTR_SECTION
> + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> + mr  r10,r1  /* Save r1 */
> + ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency stack */
> + subir1,r1,INT_FRAME_SIZE/* alloc stack frame*/
> + mfspr   r11,SPRN_SRR0   /* Save SRR0 */
> + mfspr   r12,SPRN_SRR1   /* Save SRR1 */
> + EXCEPTION_PROLOG_COMMON_1()
> + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> + EXCEPTION_PROLOG_COMMON_3(0x200)
> + addir3,r1,STACK_FRAME_OVERHEAD
> + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call ABI */
> +
> + /* Move original SRR0 and SRR1 into the respective regs */
> + ld  r9,_MSR(r1)
> + mtspr   SPRN_SRR1,r9
> + ld  r3,_NIP(r1)
> + mtspr   SPRN_SRR0,r3
> + ld  r9,_CTR(r1)
> + mtctr   r9
> + ld  r9,_XER(r1)
> + mtxer   r9
> + ld  r9,_LINK(r1)
> + mtlrr9
> + REST_GPR(0, r1)
> + REST_8GPRS(2, r1)
> + REST_GPR(10, r1)
> + ld  r11,_CCR(r1)
> + mtcrr11
> + REST_GPR(11, r1)
> + REST_2GPRS(12, r1)
> + /* restore original r1. */
> + ld  r1,GPR1(r1)
> + SET_SCRATCH0(r13)   /* save r13 */
> + EXCEPTION_PROLOG_0(PACA_EXMC)
> + b   machine_check_pSeries_0
> +END_FTR_SECTION_IFCLR(CPU_FTR_HVMODE)
> +
>  EXC_COMMON_BEGIN(machine_check_common)
>   /*
>* Machine check is different because we use a different
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index efdd16a79075..221271c96a57 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
>  {
>   long handled = 0;
>  
> - __this_cpu_inc(irq_stat.mce_exceptions);
> + /*
> +  * For pSeries we count mce when we go into virtual mode machine
> +  * check handler. Hence skip it. Also, We can't 

Re: [PATCH v5 5/7] powerpc/pseries: flush SLB contents on SLB MCE errors.

2018-08-01 Thread Nicholas Piggin
On Thu, 12 Jul 2018 15:41:13 +0200
Michal Suchánek  wrote:

> On Tue, 3 Jul 2018 08:08:14 +1000
> "Nicholas Piggin"  wrote:
> 
> > On Mon, 02 Jul 2018 11:17:06 +0530
> > Mahesh J Salgaonkar  wrote:
> >   
> > > From: Mahesh Salgaonkar 
> > > 
> > > On pseries, as of today system crashes if we get a machine check
> > > exceptions due to SLB errors. These are soft errors and can be
> > > fixed by flushing the SLBs so the kernel can continue to function
> > > instead of system crash. We do this in real mode before turning on
> > > MMU. Otherwise we would run into nested machine checks. This patch
> > > now fetches the rtas error log in real mode and flushes the SLBs on
> > > SLB errors.
> > > 
> > > Signed-off-by: Mahesh Salgaonkar 
> > > ---
> > >  arch/powerpc/include/asm/book3s/64/mmu-hash.h |1 
> > >  arch/powerpc/include/asm/machdep.h|1 
> > >  arch/powerpc/kernel/exceptions-64s.S  |   42
> > > + arch/powerpc/kernel/mce.c
> > > |   16 +++- arch/powerpc/mm/slb.c |
> > > 6 +++ arch/powerpc/platforms/powernv/opal.c |1 
> > >  arch/powerpc/platforms/pseries/pseries.h  |1 
> > >  arch/powerpc/platforms/pseries/ras.c  |   51
> > > +
> > > arch/powerpc/platforms/pseries/setup.c|1 9 files
> > > changed, 116 insertions(+), 4 deletions(-)   
> > 
> >   
> > > +TRAMP_REAL_BEGIN(machine_check_pSeries_early)
> > > +BEGIN_FTR_SECTION
> > > + EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
> > > + mr  r10,r1  /* Save r1 */
> > > + ld  r1,PACAMCEMERGSP(r13)   /* Use MC emergency
> > > stack */
> > > + subir1,r1,INT_FRAME_SIZE/* alloc stack
> > > frame */
> > > + mfspr   r11,SPRN_SRR0   /* Save SRR0 */
> > > + mfspr   r12,SPRN_SRR1   /* Save SRR1 */
> > > + EXCEPTION_PROLOG_COMMON_1()
> > > + EXCEPTION_PROLOG_COMMON_2(PACA_EXMC)
> > > + EXCEPTION_PROLOG_COMMON_3(0x200)
> > > + addir3,r1,STACK_FRAME_OVERHEAD
> > > + BRANCH_LINK_TO_FAR(machine_check_early) /* Function call
> > > ABI */
> > 
> > Is there any reason you can't use the existing
> > machine_check_powernv_early code to do all this?
> >   
> > > diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> > > index efdd16a79075..221271c96a57 100644
> > > --- a/arch/powerpc/kernel/mce.c
> > > +++ b/arch/powerpc/kernel/mce.c
> > > @@ -488,9 +488,21 @@ long machine_check_early(struct pt_regs *regs)
> > >  {
> > >   long handled = 0;
> > >  
> > > - __this_cpu_inc(irq_stat.mce_exceptions);
> > > + /*
> > > +  * For pSeries we count mce when we go into virtual mode
> > > machine
> > > +  * check handler. Hence skip it. Also, We can't access per
> > > cpu
> > > +  * variables in real mode for LPAR.
> > > +  */
> > > + if (early_cpu_has_feature(CPU_FTR_HVMODE))
> > > + __this_cpu_inc(irq_stat.mce_exceptions);
> > >  
> > > - if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> > > + /*
> > > +  * See if platform is capable of handling machine check.
> > > +  * Otherwise fallthrough and allow CPU to handle this
> > > machine check.
> > > +  */
> > > + if (ppc_md.machine_check_early)
> > > + handled = ppc_md.machine_check_early(regs);
> > > + else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> > >   handled =
> > > cur_cpu_spec->machine_check_early(regs);
> > 
> > Would be good to add a powernv ppc_md handler which does the
> > cur_cpu_spec->machine_check_early() call now that other platforms are
> > calling this code. Because those aren't valid as a fallback call, but
> > specific to powernv.
> >   
> 
> Something like this (untested)?

Sorry, some emails fell through the cracks. Yes exactly like this would
be good. If you can add a quick changelog and SOB, and
Reviewed-by: Nicholas Piggin 

Thanks,
Nick

> 
> Subject: [PATCH] powerpc/powernv: define platform MCE handler.
> 
> ---
>  arch/powerpc/kernel/mce.c  |  3 ---
>  arch/powerpc/platforms/powernv/setup.c | 11 +++
>  2 files changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index 221271c96a57..ae17d8aa60c4 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -498,12 +498,9 @@ long machine_check_early(struct pt_regs *regs)
>  
>   /*
>* See if platform is capable of handling machine check.
> -  * Otherwise fallthrough and allow CPU to handle this machine check.
>*/
>   if (ppc_md.machine_check_early)
>   handled = ppc_md.machine_check_early(regs);
> - else if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
> - handled = cur_cpu_spec->machine_check_early(regs);
>   return handled;
>  }
>  
> diff --git a/arch/powerpc/platforms/powernv/setup.c 
> b/arch/powerpc/platforms/powernv/setup.c
> index f96df0a25d05..b74c93bc2e55 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ 

[PATCH] powernv: opal-sensor-groups: Add attributes to disable/enable sensors

2018-08-01 Thread Shilpasri G Bhat
This patch provides support to disable and enable plaform specific
sensor groups like performance, utilization and frequency which are
currently not supported in hwmon.

Signed-off-by: Shilpasri G Bhat 
---
This patch is based on 
https://git.kernel.org/powerpc/c/04baaf28f40c68c35a413cd9d0db71

 .../ABI/testing/sysfs-firmware-opal-sensor-groups  | 33 
 .../powerpc/platforms/powernv/opal-sensor-groups.c | 99 +++---
 2 files changed, 100 insertions(+), 32 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups

diff --git a/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups 
b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups
new file mode 100644
index 000..a236686
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-firmware-opal-sensor-groups
@@ -0,0 +1,33 @@
+What:  /sys/firmware/opal/sensor_groups
+Date:  August 2017
+Contact:   Linux for PowerPC mailing list 
+Description:   Sensor groups directory for POWER9 powernv servers
+
+   Each folder in this directory contains a sensor group
+   which are classified based on type of the sensor
+   like power, temperature, frequency, current, etc. They
+   can also indicate the group of sensors belonging to
+   different owners like CSM, Profiler, Job-Scheduler
+
+What:  /sys/firmware/opal/sensor_groups//clear
+Date:  August 2017
+Contact:   Linux for PowerPC mailing list 
+Description:   Sysfs file to clear the min-max of all the sensors
+   belonging to the group.
+
+   Writing 1 to this file will clear the minimum and
+   maximum values of all the sensors in the group.
+   In POWER9, the min-max of a sensor is the historical minimum
+   and maximum value of the sensor cached by OCC.
+
+What:  /sys/firmware/opal/sensor_groups//enable
+Date:  August 2018
+Contact:   Linux for PowerPC mailing list 
+Description:   Sysfs file to enable/disable the sensor-group
+
+   Writing 0 to this file will disable all the sensors
+   belonging to the group and writing 1 will enable the
+   sensors.
+   In POWER9, by default all the sensor-groups are enabled and
+   will be copied to main memory by OCC. This file can be used
+   to increase the update frequency of selective sensor-groups.
diff --git a/arch/powerpc/platforms/powernv/opal-sensor-groups.c 
b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
index f7d04b6..8bf6c86 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor-groups.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor-groups.c
@@ -24,6 +24,8 @@
 struct sg_attr {
u32 handle;
struct kobj_attribute attr;
+   u32 opal_no;
+   bool enable;
 };
 
 static struct sensor_group {
@@ -60,34 +62,17 @@ int sensor_group_enable(u32 handle, bool enable)
 }
 EXPORT_SYMBOL_GPL(sensor_group_enable);
 
-static ssize_t sg_store(struct kobject *kobj, struct kobj_attribute *attr,
-   const char *buf, size_t count)
+static int sensor_group_clear(u32 handle)
 {
-   struct sg_attr *sattr = container_of(attr, struct sg_attr, attr);
struct opal_msg msg;
-   u32 data;
-   int ret, token;
-
-   ret = kstrtoint(buf, 0, );
-   if (ret)
-   return ret;
-
-   if (data != 1)
-   return -EINVAL;
+   int token, ret;
 
token = opal_async_get_token_interruptible();
-   if (token < 0) {
-   pr_devel("Failed to get token\n");
+   if (token < 0)
return token;
-   }
-
-   ret = mutex_lock_interruptible(_mutex);
-   if (ret)
-   goto out_token;
 
-   ret = opal_sensor_group_clear(sattr->handle, token);
-   switch (ret) {
-   case OPAL_ASYNC_COMPLETION:
+   ret = opal_sensor_group_clear(handle, token);
+   if (ret == OPAL_ASYNC_COMPLETION) {
ret = opal_async_wait_response(token, );
if (ret) {
pr_devel("Failed to wait for the async response\n");
@@ -95,20 +80,57 @@ static ssize_t sg_store(struct kobject *kobj, struct 
kobj_attribute *attr,
goto out;
}
ret = opal_error_code(opal_get_async_rc(msg));
-   if (!ret)
-   ret = count;
+   } else {
+   ret = opal_error_code(ret);
+   }
+
+out:
+   opal_async_release_token(token);
+   return ret;
+}
+
+static ssize_t sgroup_show(struct kobject *kobj, struct kobj_attribute *attr,
+  char *buf)
+{
+   struct sg_attr *sattr = container_of(attr, struct sg_attr, attr);
+
+   return sprintf(buf, "%u\n", sattr->enable);
+}
+
+static ssize_t sgroup_store(struct kobject *kobj, struct kobj_attribute *attr,
+   const char *buf,