[PATCH kernel] KVM: PPC: Enable in-kernel TCE handlers for PR KVM

2017-10-10 Thread Alexey Kardashevskiy
The handlers have supported PR KVM from day one; however, the PR KVM
enable/disable hcall handler missed these two.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/kvm/book3s_pr_papr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_pr_papr.c 
b/arch/powerpc/kvm/book3s_pr_papr.c
index 8a4205fa774f..dae3be5ff42b 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
case H_PROTECT:
case H_BULK_REMOVE:
case H_PUT_TCE:
+   case H_PUT_TCE_INDIRECT:
+   case H_STUFF_TCE:
case H_CEDE:
case H_LOGICAL_CI_LOAD:
case H_LOGICAL_CI_STORE:
-- 
2.11.0
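
In-kernel handling of an hcall only takes effect once userspace also enables
it through the KVM_CAP_PPC_ENABLE_HCALL capability, which consults
kvmppc_hcall_impl_pr() above. A minimal userspace sketch of that step
(illustrative only, error handling omitted) is:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Ask KVM to handle the given sPAPR hcall in the kernel for this VM. */
static int enable_in_kernel_hcall(int vm_fd, unsigned long hcall)
{
	struct kvm_enable_cap cap = {
		.cap  = KVM_CAP_PPC_ENABLE_HCALL,
		.args = { hcall, 1 },	/* args[1] = 1 enables, 0 disables */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}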



[PATCH kernel] KVM: PPC: Protect kvmppc_gpa_to_ua() with srcu

2017-10-10 Thread Alexey Kardashevskiy
kvmppc_gpa_to_ua() accesses the KVM memory slot array via
srcu_dereference_check(), and this produces RCU warnings like the one below.

This extends the existing srcu_read_lock/unlock section to cover the
kvmppc_gpa_to_ua() call as well.

We did not hit this before because the lock is not needed for the realmode
handlers and hash guests use the realmode path all the time; radix guests,
however, are always redirected to the virtual mode handlers, hence the
warning.

[   68.253798] ./include/linux/kvm_host.h:575 suspicious 
rcu_dereference_check() usage!
[   68.253799]
   other info that might help us debug this:

[   68.253802]
   rcu_scheduler_active = 2, debug_locks = 1
[   68.253804] 1 lock held by qemu-system-ppc/6413:
[   68.253806]  #0:  (&vcpu->mutex){+.+.}, at: [] 
vcpu_load+0x3c/0xc0 [kvm]
[   68.253826]
   stack backtrace:
[   68.253830] CPU: 92 PID: 6413 Comm: qemu-system-ppc Tainted: GW  
 4.14.0-rc3-00553-g432dcba58e9c-dirty #72
[   68.253833] Call Trace:
[   68.253839] [c00fd3d9f790] [c0b7fcc8] dump_stack+0xe8/0x160 
(unreliable)
[   68.253845] [c00fd3d9f7d0] [c01924c0] 
lockdep_rcu_suspicious+0x110/0x180
[   68.253851] [c00fd3d9f850] [c00e825c] 
kvmppc_gpa_to_ua+0x26c/0x2b0
[   68.253858] [c00fd3d9f8b0] [c0080e3e1984] 
kvmppc_h_put_tce+0x12c/0x2a0 [kvm]

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/kvm/book3s_64_vio.c | 23 ++-
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 8f2da8bba737..4dffa611376d 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -478,28 +478,30 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned 
long liobn,
return ret;
 
dir = iommu_tce_direction(tce);
+
+   idx = srcu_read_lock(&vcpu->kvm->srcu);
+
if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
-   tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
-   return H_PARAMETER;
+   tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
+   ret = H_PARAMETER;
+   goto unlock_exit;
+   }
 
entry = ioba >> stt->page_shift;
 
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
-   if (dir == DMA_NONE) {
+   if (dir == DMA_NONE)
ret = kvmppc_tce_iommu_unmap(vcpu->kvm,
stit->tbl, entry);
-   } else {
-   idx = srcu_read_lock(&vcpu->kvm->srcu);
+   else
ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl,
entry, ua, dir);
-   srcu_read_unlock(&vcpu->kvm->srcu, idx);
-   }
 
if (ret == H_SUCCESS)
continue;
 
if (ret == H_TOO_HARD)
-   return ret;
+   goto unlock_exit;
 
WARN_ON_ONCE(1);
kvmppc_clear_tce(stit->tbl, entry);
@@ -507,7 +509,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long 
liobn,
 
kvmppc_tce_put(stt, entry, tce);
 
-   return H_SUCCESS;
+unlock_exit:
+   srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+   return ret;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
 
-- 
2.11.0
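
The idiom the patch applies is the standard SRCU read-side critical section
that must surround any KVM memslot lookup; a minimal sketch of that pattern
(illustrative only, not taken from the patch) is:

#include <linux/kvm_host.h>

static bool example_memslot_lookup(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;
	int idx;

	idx = srcu_read_lock(&kvm->srcu);	/* memslots are protected by kvm->srcu */
	slot = gfn_to_memslot(kvm, gfn);	/* same rule covers kvmppc_gpa_to_ua() */
	srcu_read_unlock(&kvm->srcu, idx);

	return slot != NULL;
}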



Re: [PATCH] powerpc/modules: Use WARN_ON() in stub_for_addr()

2017-10-10 Thread Kamalesh Babulal
On Wednesday 11 October 2017 09:42 AM, Michael Ellerman wrote:
> Kamalesh Babulal  writes:
> 
>> Use WARN_ON(), while running out of stubs in stub_for_addr()
>> and abort loading of the module instead of BUG_ON().
> 
> Thanks. This looks good in principle. Have you actually tested it to
> make sure we do in fact gracefully fail the module load?
> 

Thanks for the review. I tested with a slightly hacked-up version of this patch:

+   if (!strncmp(me->name, "live", 4))
+   j = 100;
+   for (i = 0; stub_func_addr(stubs[i].funcdata); i+=j) {
+   if (WARN_ON(i >= num_stubs))
+   return 0;

and it fails gracefully.

# modprobe livepatch-sample
modprobe: ERROR: could not insert 'livepatch_sample': Unknown symbol in module, 
or unknown parameter (see dmesg)

# echo $?
1

# dmesg 
[ cut here ]
WARNING: CPU: 2 PID: 2836 at arch/powerpc/kernel/module_64.c:526 
apply_relocate_add+0x71c/0xb00

-- 
cheers,
Kamalesh.



Re: [PATCH] powerpc/perf: Fix IMC initialization crash

2017-10-10 Thread Madhavan Srinivasan



On Wednesday 11 October 2017 09:41 AM, Michael Ellerman wrote:

Anju T Sudhakar  writes:


Call trace observed with latest firmware, and upstream kernel.

[   14.499938] NIP [c00f318c] init_imc_pmu+0x8c/0xcf0
[   14.499973] LR [c00f33f8] init_imc_pmu+0x2f8/0xcf0
[   14.57] Call Trace:
[   14.500027] [c03fed18f710] [c00f33c8] init_imc_pmu+0x2c8/0xcf0 
(unreliable)
[   14.500080] [c03fed18f800] [c00b5ec0] 
opal_imc_counters_probe+0x300/0x400
[   14.500132] [c03fed18f900] [c0807ef4] 
platform_drv_probe+0x64/0x110
[   14.500185] [c03fed18f980] [c0804b58] 
driver_probe_device+0x3d8/0x580
[   14.500236] [c03fed18fa10] [c0804e4c] __driver_attach+0x14c/0x1a0
[   14.500302] [c03fed18fa90] [c080156c] bus_for_each_dev+0x8c/0xf0
[   14.500353] [c03fed18fae0] [c0803fa4] driver_attach+0x34/0x50
[   14.500397] [c03fed18fb00] [c0803688] bus_add_driver+0x298/0x350
[   14.500449] [c03fed18fb90] [c080605c] driver_register+0x9c/0x180
[   14.500500] [c03fed18fc00] [c0807dec] 
__platform_driver_register+0x5c/0x70
[   14.500552] [c03fed18fc20] [c101cee0] 
opal_imc_driver_init+0x2c/0x40
[   14.500603] [c03fed18fc40] [c000d084] do_one_initcall+0x64/0x1d0
[   14.500654] [c03fed18fd00] [c100434c] 
kernel_init_freeable+0x280/0x374
[   14.500705] [c03fed18fdc0] [c000d314] kernel_init+0x24/0x160
[   14.500750] [c03fed18fe30] [c000b4e8] 
ret_from_kernel_thread+0x5c/0x74
[   14.500799] Instruction dump:
[   14.500827] 4082024c 2f890002 419e054c 2e890003 41960094 2e890001 3ba0ffea 
419602d8
[   14.500884] 419e0290 2f890003 419e02a8 e93e0118  2fa3 419e0010 
4827ba41
[   14.500945] ---[ end trace 27b734ad26f1add4 ]---
[   15.908719]
[   16.908869] Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0007
[   16.908869]
[   18.125813] ---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0007]

While registering nest IMC at init, the CPU-hotplug callback
`nest_pmu_cpumask_init()`
makes an OPAL call to stop the engine. If the OPAL call fails,
imc_common_cpuhp_mem_free() is invoked to clean up the memory and CPU-hotplug setup.

But when cleaning up the attribute group, we were dereferencing the attribute
element array without checking whether the backing element is NULL, which
causes the kernel panic.

Factor out the memory freeing part from imc_common_cpuhp_mem_free() to handle
the failing case gracefully.

Signed-off-by: Anju T Sudhakar 
Reported-by: Pridhiviraj Paidipeddi 
---
  arch/powerpc/perf/imc-pmu.c | 23 ---
  1 file changed, 16 insertions(+), 7 deletions(-)

It's the week before rc5, so I'd really like just the absolute minimal
fix. There's sufficient code movement here that I can't even immediately
see where the bug fix is.

mpe,

We have just re-factored the code to handle the memory freeing and fixed 
a leak.

This is a minimal fix, and there is no risk in taking it in.

Reviewed-by: Madhavan Srinivasan 

Maddy



cheers





Re: [PATCH] powerpc/modules: Use WARN_ON() in stub_for_addr()

2017-10-10 Thread Michael Ellerman
Kamalesh Babulal  writes:

> Use WARN_ON(), while running out of stubs in stub_for_addr()
> and abort loading of the module instead of BUG_ON().

Thanks. This looks good in principle. Have you actually tested it to
make sure we do in fact gracefully fail the module load?

cheers


Re: [PATCH] powerpc/perf: Fix IMC initialization crash

2017-10-10 Thread Michael Ellerman
Anju T Sudhakar  writes:

> Call trace observed with latest firmware, and upstream kernel.
>
> [   14.499938] NIP [c00f318c] init_imc_pmu+0x8c/0xcf0
> [   14.499973] LR [c00f33f8] init_imc_pmu+0x2f8/0xcf0
> [   14.57] Call Trace:
> [   14.500027] [c03fed18f710] [c00f33c8] init_imc_pmu+0x2c8/0xcf0 
> (unreliable)
> [   14.500080] [c03fed18f800] [c00b5ec0] 
> opal_imc_counters_probe+0x300/0x400
> [   14.500132] [c03fed18f900] [c0807ef4] 
> platform_drv_probe+0x64/0x110
> [   14.500185] [c03fed18f980] [c0804b58] 
> driver_probe_device+0x3d8/0x580
> [   14.500236] [c03fed18fa10] [c0804e4c] 
> __driver_attach+0x14c/0x1a0
> [   14.500302] [c03fed18fa90] [c080156c] 
> bus_for_each_dev+0x8c/0xf0
> [   14.500353] [c03fed18fae0] [c0803fa4] driver_attach+0x34/0x50
> [   14.500397] [c03fed18fb00] [c0803688] 
> bus_add_driver+0x298/0x350
> [   14.500449] [c03fed18fb90] [c080605c] 
> driver_register+0x9c/0x180
> [   14.500500] [c03fed18fc00] [c0807dec] 
> __platform_driver_register+0x5c/0x70
> [   14.500552] [c03fed18fc20] [c101cee0] 
> opal_imc_driver_init+0x2c/0x40
> [   14.500603] [c03fed18fc40] [c000d084] 
> do_one_initcall+0x64/0x1d0
> [   14.500654] [c03fed18fd00] [c100434c] 
> kernel_init_freeable+0x280/0x374
> [   14.500705] [c03fed18fdc0] [c000d314] kernel_init+0x24/0x160
> [   14.500750] [c03fed18fe30] [c000b4e8] 
> ret_from_kernel_thread+0x5c/0x74
> [   14.500799] Instruction dump:
> [   14.500827] 4082024c 2f890002 419e054c 2e890003 41960094 2e890001 3ba0ffea 
> 419602d8 
> [   14.500884] 419e0290 2f890003 419e02a8 e93e0118  2fa3 
> 419e0010 4827ba41 
> [   14.500945] ---[ end trace 27b734ad26f1add4 ]---
> [   15.908719] 
> [   16.908869] Kernel panic - not syncing: Attempted to kill init! 
> exitcode=0x0007
> [   16.908869] 
> [   18.125813] ---[ end Kernel panic - not syncing: Attempted to kill init! 
> exitcode=0x0007]
>
> While registering nest imc at init, cpu-hotplug callback 
> `nest_pmu_cpumask_init()`
> makes an opal call to stop the engine. And if the OPAL call fails, 
> imc_common_cpuhp_mem_free() is invoked to cleanup memory and cpuhotplug setup.
>
> But when cleaning up the attribute group, we were dereferencing the attribute
> element array without checking whether the backing element is not NULL. This
> causes the kernel panic.
>
> Factor out the memory freeing part from imc_common_cpuhp_mem_free() to handle
> the failing case gracefully.
>
> Signed-off-by: Anju T Sudhakar 
> Reported-by: Pridhiviraj Paidipeddi 
> ---
>  arch/powerpc/perf/imc-pmu.c | 23 ---
>  1 file changed, 16 insertions(+), 7 deletions(-)

It's the week before rc5, so I'd really like just the absolute minimal
fix. There's sufficient code movement here that I can't even immediately
see where the bug fix is.

cheers


Re: [PATCH] selftests/powerpc: fix build error in powerpc ptrace selftests.

2017-10-10 Thread Simon Guo
Hi Michael,
On Tue, Oct 10, 2017 at 09:10:32PM +1100, Michael Ellerman wrote:
> wei.guo.si...@gmail.com writes:
> 
> > From: Simon Guo 
> >
> > GCC 7 treats "r2" in the clobber list as an error, and we get the following
> > build errors for the powerpc ptrace selftests even with the -fno-pic option:
> >   ptrace-tm-vsx.c: In function ‘tm_vsx’:
> >   ptrace-tm-vsx.c:42:2: error: PIC register clobbered by ‘r2’ in ‘asm’
> > asm __volatile__(
> > ^~~
> >   make[1]: *** [ptrace-tm-vsx] Error 1
> >   ptrace-tm-spd-vsx.c: In function ‘tm_spd_vsx’:
> >   ptrace-tm-spd-vsx.c:55:2: error: PIC register clobbered by ‘r2’ in ‘asm’
> > asm __volatile__(
> > ^~~
> >   make[1]: *** [ptrace-tm-spd-vsx] Error 1
> >   ptrace-tm-spr.c: In function ‘tm_spr’:
> >   ptrace-tm-spr.c:46:2: error: PIC register clobbered by ‘r2’ in ‘asm’
> > asm __volatile__(
> > ^~~
> >
> > This patch fixes the build error by removing "r2" from the clobber list.
> 
> But do any of the blocks clobber r2? If so then it should be in the
> clobber list.

I see that none of them clobbers r2, and neither do the assembly
functions those blocks call, like "loadvsx".

The change to tools/testing/selftests/powerpc/ptrace/Makefile can be
ignored, since I noticed the recent commit a3c01050584da3 "selftests/powerpc: 
Force ptrace tests to build -fno-pie". Please let me know if you want
a new v2 with that Makefile change removed.

Thanks,
- Simon


[PATCH v6] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-10-10 Thread Santosh Sivaraj
The current vDSO64 implementation does not support the coarse clocks
(CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE) and falls back to the
system call for them, increasing the response time; a vDSO implementation
reduces the cycle time. Below is a benchmark of the difference in
execution times.

(Non-coarse clocks are also included just for completeness.)

clock-gettime-realtime: syscall: 172 nsec/call
clock-gettime-realtime:libc: 28 nsec/call
clock-gettime-realtime:vdso: 22 nsec/call
clock-gettime-monotonic: syscall: 171 nsec/call
clock-gettime-monotonic:libc: 30 nsec/call
clock-gettime-monotonic:vdso: 25 nsec/call
clock-gettime-realtime-coarse: syscall: 153 nsec/call
clock-gettime-realtime-coarse:libc: 16 nsec/call
clock-gettime-realtime-coarse:vdso: 10 nsec/call
clock-gettime-monotonic-coarse: syscall: 167 nsec/call
clock-gettime-monotonic-coarse:libc: 17 nsec/call
clock-gettime-monotonic-coarse:vdso: 11 nsec/call

CC: Benjamin Herrenschmidt 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/kernel/asm-offsets.c |  2 +
 arch/powerpc/kernel/vdso64/gettimeofday.S | 67 ++-
 2 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 8cfb20e38cfe..b55c68c54dc1 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -396,6 +396,8 @@ int main(void)
/* Other bits used by the vdso */
DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
+   DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
+   DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
 
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 382021324883..b594f5c745fa 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -64,6 +64,12 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
cmpwi   cr0,r3,CLOCK_REALTIME
cmpwi   cr1,r3,CLOCK_MONOTONIC
crorcr0*4+eq,cr0*4+eq,cr1*4+eq
+
+   cmpwi   cr5,r3,CLOCK_REALTIME_COARSE
+   cmpwi   cr6,r3,CLOCK_MONOTONIC_COARSE
+   crorcr5*4+eq,cr5*4+eq,cr6*4+eq
+
+   crorcr0*4+eq,cr0*4+eq,cr5*4+eq
bne cr0,99f
 
mflrr12 /* r12 saves lr */
@@ -72,6 +78,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
ori r7,r7,NSEC_PER_SEC@l
+   beq cr5,70f
 50:bl  V_LOCAL_FUNC(__do_get_tspec)/* get time from tb & kernel */
bne cr1,80f /* if not monotonic, all done */
 
@@ -97,19 +104,57 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
ld  r0,CFG_TB_UPDATE_COUNT(r3)
 cmpld   cr0,r0,r8  /* check if updated */
bne-50b
+   b   78f
 
-   /* Add wall->monotonic offset and check for overflow or underflow.
+   /*
+* For coarse clocks we get data directly from the vdso data page, so
+* we don't need to call __do_get_tspec, but we still need to do the
+* counter trick.
 */
-   add r4,r4,r6
-   add r5,r5,r9
-   cmpdcr0,r5,r7
-   cmpdi   cr1,r5,0
-   blt 1f
-   subfr5,r7,r5
-   addir4,r4,1
-1: bge cr1,80f
-   addir4,r4,-1
-   add r5,r5,r7
+70:ld  r8,CFG_TB_UPDATE_COUNT(r3)
+   andi.   r0,r8,1 /* pending update ? loop */
+   bne-70b
+   xor r0,r8,r8/* create dependency */
+   add r3,r3,r0
+
+   /*
+* CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
+* too
+*/
+   ld  r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
+   ld  r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
+   bne cr6,75f
+
+   /* CLOCK_MONOTONIC_COARSE */
+   lwa r6,WTOM_CLOCK_SEC(r3)
+   lwa r9,WTOM_CLOCK_NSEC(r3)
+
+   /* check if counter has updated */
+   or  r0,r6,r9
+75:or  r0,r4,r5
+   xor r0,r0,r0
+   add r3,r3,r0
+   ld  r0,CFG_TB_UPDATE_COUNT(r3)
+   cmpld   cr0,r0,r8   /* check if updated */
+   bne-70b
+
+   /* Counter has not updated, so continue calculating proper values for
+* sec and nsec if monotonic coarse, or just return with the proper
+* values for realtime.
+*/
+   bne cr6,80f
+
+   /* Add wall->monotonic offset and check for overflow or underflow */
+78:add r4,r4,r6
+   add r5,r5,r9
+   cmpdcr0,r5,r7
+   cmpdi   cr1,r5,0
+   blt 79f
+   subfr5,r7,r5
+   addir4,r4,1
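
The assembly above implements the usual vDSO update-counter (seqcount-style)
read loop. A rough C equivalent of the coarse-clock path, with purely
illustrative field names rather than the real vdso_data layout, is:

#include <time.h>

#define NSEC_PER_SEC 1000000000L

/* Illustrative stand-in for the relevant vdso data page fields */
struct vdso_coarse_data {
	unsigned long tb_update_count;	/* odd while an update is in progress */
	long stamp_xtime_sec;		/* CLOCK_REALTIME_COARSE snapshot */
	long stamp_xtime_nsec;
	long wtom_clock_sec;		/* wall-to-monotonic offset */
	long wtom_clock_nsec;
};

static void coarse_clock_gettime(const struct vdso_coarse_data *vd,
				 int monotonic, struct timespec *ts)
{
	unsigned long seq;
	long sec, nsec;

	do {
		/* wait until no update is pending, then snapshot the counter */
		while ((seq = vd->tb_update_count) & 1)
			;
		sec  = vd->stamp_xtime_sec;
		nsec = vd->stamp_xtime_nsec;
		if (monotonic) {
			/* add wall->monotonic offset, handle nsec overflow/underflow */
			sec  += vd->wtom_clock_sec;
			nsec += vd->wtom_clock_nsec;
			if (nsec >= NSEC_PER_SEC) {
				nsec -= NSEC_PER_SEC;
				sec++;
			} else if (nsec < 0) {
				nsec += NSEC_PER_SEC;
				sec--;
			}
		}
		/* retry if the data page changed under us */
	} while (vd->tb_update_count != seq);

	ts->tv_sec = sec;
	ts->tv_nsec = nsec;
}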

[PATCH] scsi: ibmvscsi: Convert timers to use timer_setup()

2017-10-10 Thread Kees Cook
In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new timer_setup() and from_timer()
to pass the timer pointer explicitly.

Cc: "Martin K. Petersen" 
Cc: Tyrel Datwyler 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "James E.J. Bottomley" 
Cc: linux-s...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Kees Cook 
---
This requires commit 686fef928bba ("timer: Prepare to change timer
callback argument type") in v4.14-rc3, but should be otherwise
stand-alone.
---
 drivers/scsi/ibmvscsi/ibmvfc.c   | 14 ++
 drivers/scsi/ibmvscsi/ibmvscsi.c |  7 +++
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index b491af31a5f8..0d2f7eb3acb6 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -1393,8 +1393,9 @@ static int ibmvfc_map_sg_data(struct scsi_cmnd *scmd,
  *
  * Called when an internally generated command times out
  **/
-static void ibmvfc_timeout(struct ibmvfc_event *evt)
+static void ibmvfc_timeout(struct timer_list *t)
 {
+   struct ibmvfc_event *evt = from_timer(evt, t, timer);
struct ibmvfc_host *vhost = evt->vhost;
dev_err(vhost->dev, "Command timed out (%p). Resetting connection\n", 
evt);
ibmvfc_reset_host(vhost);
@@ -1424,12 +1425,10 @@ static int ibmvfc_send_event(struct ibmvfc_event *evt,
BUG();
 
list_add_tail(&evt->queue, &vhost->sent);
-   init_timer(&evt->timer);
+   timer_setup(&evt->timer, ibmvfc_timeout, 0);
 
if (timeout) {
-   evt->timer.data = (unsigned long) evt;
evt->timer.expires = jiffies + (timeout * HZ);
-   evt->timer.function = (void (*)(unsigned long))ibmvfc_timeout;
add_timer(&evt->timer);
}
 
@@ -3692,8 +3691,9 @@ static void ibmvfc_tgt_adisc_cancel_done(struct 
ibmvfc_event *evt)
  * out, reset the CRQ. When the ADISC comes back as cancelled,
  * log back into the target.
  **/
-static void ibmvfc_adisc_timeout(struct ibmvfc_target *tgt)
+static void ibmvfc_adisc_timeout(struct timer_list *t)
 {
+   struct ibmvfc_target *tgt = from_timer(tgt, t, timer);
struct ibmvfc_host *vhost = tgt->vhost;
struct ibmvfc_event *evt;
struct ibmvfc_tmf *tmf;
@@ -3778,9 +3778,7 @@ static void ibmvfc_tgt_adisc(struct ibmvfc_target *tgt)
if (timer_pending(&tgt->timer))
mod_timer(&tgt->timer, jiffies + (IBMVFC_ADISC_TIMEOUT * HZ));
else {
-   tgt->timer.data = (unsigned long) tgt;
tgt->timer.expires = jiffies + (IBMVFC_ADISC_TIMEOUT * HZ);
-   tgt->timer.function = (void (*)(unsigned 
long))ibmvfc_adisc_timeout;
add_timer(&tgt->timer);
}
 
@@ -3912,7 +3910,7 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, 
u64 scsi_id)
tgt->vhost = vhost;
tgt->need_login = 1;
tgt->cancel_key = vhost->task_set++;
-   init_timer(&tgt->timer);
+   timer_setup(&tgt->timer, ibmvfc_adisc_timeout, 0);
kref_init(&tgt->kref);
ibmvfc_init_tgt(tgt, ibmvfc_tgt_implicit_logout);
spin_lock_irqsave(vhost->host->host_lock, flags);
diff --git a/drivers/scsi/ibmvscsi/ibmvscsi.c b/drivers/scsi/ibmvscsi/ibmvscsi.c
index 7d156b161482..17df76f0be3c 100644
--- a/drivers/scsi/ibmvscsi/ibmvscsi.c
+++ b/drivers/scsi/ibmvscsi/ibmvscsi.c
@@ -837,8 +837,9 @@ static void ibmvscsi_reset_host(struct ibmvscsi_host_data 
*hostdata)
  *
  * Called when an internally generated command times out
 */
-static void ibmvscsi_timeout(struct srp_event_struct *evt_struct)
+static void ibmvscsi_timeout(struct timer_list *t)
 {
+   struct srp_event_struct *evt_struct = from_timer(evt_struct, t, timer);
struct ibmvscsi_host_data *hostdata = evt_struct->hostdata;
 
dev_err(hostdata->dev, "Command timed out (%x). Resetting connection\n",
@@ -927,11 +928,9 @@ static int ibmvscsi_send_srp_event(struct srp_event_struct 
*evt_struct,
 */
list_add_tail(&evt_struct->list, &hostdata->sent);
 
-   init_timer(&evt_struct->timer);
+   timer_setup(&evt_struct->timer, ibmvscsi_timeout, 0);
if (timeout) {
-   evt_struct->timer.data = (unsigned long) evt_struct;
evt_struct->timer.expires = jiffies + (timeout * HZ);
-   evt_struct->timer.function = (void (*)(unsigned 
long))ibmvscsi_timeout;
add_timer(&evt_struct->timer);
}
 
-- 
2.7.4


-- 
Kees Cook
Pixel Security
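
The conversion this patch performs follows one pattern throughout; a minimal,
self-contained sketch of the new-style API (struct and function names invented
for illustration) looks like:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/timer.h>

struct my_event {
	struct timer_list timer;
	int id;
};

static void my_event_timeout(struct timer_list *t)
{
	/* from_timer() is container_of() keyed on the embedded timer field */
	struct my_event *evt = from_timer(evt, t, timer);

	pr_warn("event %d timed out\n", evt->id);
}

static void my_event_arm(struct my_event *evt, unsigned long timeout_jiffies)
{
	/* the callback is bound here; no more .data/.function assignments */
	timer_setup(&evt->timer, my_event_timeout, 0);
	mod_timer(&evt->timer, jiffies + timeout_jiffies);
}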


Re: [PATCH v4 19/20] x86/mm: Add speculative pagefault handling

2017-10-10 Thread Andrew Morton
On Mon,  9 Oct 2017 12:07:51 +0200 Laurent Dufour  
wrote:

> +/*
> + * Advertise that we call the Speculative Page Fault handler.
> + */
> +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
> +#define __HAVE_ARCH_CALL_SPF
> +#endif

Here's where I mess up your life ;)

It would be more idiomatic to define this in arch/XXX/Kconfig:

config SPF
def_bool y if SMP

then use CONFIG_SPF everywhere.
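
A use site would then look roughly like the following (the function name here
is purely illustrative, not the series' actual API):

#include <linux/mm_types.h>

#ifdef CONFIG_SPF
extern int do_speculative_fault(struct mm_struct *mm, unsigned long address,
				unsigned int flags);
#else
static inline int do_speculative_fault(struct mm_struct *mm,
				       unsigned long address,
				       unsigned int flags)
{
	return 0;	/* no speculative handling; take the classic fault path */
}
#endif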

Also, it would be better if CONFIG_SPF were defined at the start of the
patch series rather than the end, so that as the patches add new code,
that code is actually compilable.  For bisection purposes.  I can
understand if this is too much work and effort - we can live with
things the way they are now.

This patchset is a ton of new code in very sensitive areas and seems to
have received little review and test.  I can do a
merge-and-see-what-happens but it would be quite a risk to send all
this upstream based only on my sketchy review and linux-next runtime
testing.  Can we bribe someone?


Re: [PATCH] powerpc/powernv: Add kernel cmdline parameter to disable imc

2017-10-10 Thread Stewart Smith
Michael Ellerman  writes:
> Anju T Sudhakar  writes:
>
>> Add a kernel command line parameter option to disable In-Memory Collection
>> (IMC) counters and add documentation. This helps in debug.
>
> I'd really rather we didn't. Do we *really* need this?
>
> We don't have command line parameters to disable any of the other ~20
> PMUs, why is this one special?

You could also do the same thing by editing the device tree before
booting your kernel; we have the facility to do that in petitboot.

A recent firmware patch, https://patchwork.ozlabs.org/patch/823249/,
fixes the firmware implementation where the counters were already
running before the INIT/START calls, which is likely the cause of the
problems this patch is trying to work around.

I propose we have the firmware do the right thing and nothing special in
the kernel, i.e. not merge this.

-- 
Stewart Smith
OPAL Architect, IBM.



Re: [PATCH 1/3] powerpc/powernv: Avoid the secondary hold spinloop for OPAL boot

2017-10-10 Thread Nicholas Piggin
On Wed, 11 Oct 2017 01:58:28 +1000
Nicholas Piggin  wrote:


> Ahh okay, pseries is using the start-cpu RTAS call to enter at
> generic_secondary_smp_init() as well. So we can take it out for
> pseries as well.

This patch seems to do the trick for pseries guests too:

powerpc/64s: Avoid waiting for secondary hold spinloop if it is not used

OPAL and some RTAS boot does not insert secondaries at 0x60 to wait at
the secondary hold spinloop. Instead they are started later, at
generic_secondary_smp_init(), which is after the secondary hold
spinloop.

Avoid waiting on this spinloop when booting with OPAL firmware, or
when the RTAS boot does not use this loop. This wait always times
out in those cases.

This saves 100ms boot time on bare metal (10s of seconds of real time
when booting on the simulator in SMP), and 100ms on modern pseries
guests.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/head_64.S  | 16 +++-
 arch/powerpc/kernel/setup_64.c | 12 +++-
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index c9e760ec7530..0deef350004f 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -55,12 +55,18 @@
  *
  *  For pSeries or server processors:
  *   1. The MMU is off & open firmware is running in real mode.
- *   2. The kernel is entered at __start
+ *   2. The primary CPU enters at __start.
+ *   3. If the RTAS supports "query-cpu-stopped-state", then secondary
+ *  CPUs will enter as directed by "start-cpu" RTAS call, which is
+ *  generic_secondary_smp_init, with PIR in r3.
+ *   4. Else the secondary CPUs will enter at secondary_hold (0x60) as
+ *  directed by the "start-cpu" RTAS call, with PIR in r3.
  * -or- For OPAL entry:
- *   1. The MMU is off, processor in HV mode, primary CPU enters at 0
- *  with device-tree in gpr3. We also get OPAL base in r8 and
- * entry in r9 for debugging purposes
- *   2. Secondary processors enter at 0x60 with PIR in gpr3
+ *   1. The MMU is off, processor in HV mode.
+ *   2. The primary CPU enters at 0 with device-tree in r3, OPAL base
+ *  in r8, and entry in r9 for debugging purposes.
+ *   3. Secondary CPUs enter as directed by OPAL_START_CPU call, which
+ *  is at generic_secondary_smp_init, with PIR in r3.
  *
  *  For Book3E processors:
  *   1. The MMU is on running in AS0 in a state defined in ePAPR
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 3f2453858f60..afa79e8d56a6 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -363,8 +363,18 @@ void early_setup_secondary(void)
 #if defined(CONFIG_SMP) || defined(CONFIG_KEXEC_CORE)
 static bool use_spinloop(void)
 {
-   if (!IS_ENABLED(CONFIG_PPC_BOOK3E))
+   if (IS_ENABLED(CONFIG_PPC_BOOK3S)) {
+   /*
+* See comments in head_64.S -- not all platforms insert
+* secondaries at __secondary_hold and wait at the spin
+* loop.
+*/
+   if (firmware_has_feature(FW_FEATURE_OPAL))
+   return false;
+   if (rtas_token("query-cpu-stopped-state") != 
RTAS_UNKNOWN_SERVICE)
+   return false;
return true;
+   }
 
/*
 * When book3e boots from kexec, the ePAPR spin table does
-- 
2.13.3



Re: [PATCH v11 7/9] arm64/kasan: add and use kasan_map_populate()

2017-10-10 Thread Pavel Tatashin
Hi Will,

Ok, I will add your patch at the end of my series.

Thank you,
Pavel

>
> I was thinking that you could just add my patch to the end of your series
> and have the whole lot go up like that. If you want to merge it with your
> patch, I'm fine with that too.
>
> Will
>


Re: [PATCH v11 0/9] complete deferred page initialization

2017-10-10 Thread Pavel Tatashin
I wanted to thank you Michal for spending time and doing the in-depth 
reviews of every incremental change. Overall the series is in much 
better shape now because of your feedback.


Pavel

On 10/10/2017 10:15 AM, Michal Hocko wrote:

Btw. thanks for your persistence and willingness to go over all the
suggestions, which might not have been consistent between different
versions. I believe this is a general improvement in the early
initialization code. We do not rely on an implicit zeroing which just
happens to work by chance. The performance improvements are a bonus on
top.

Thanks, good work!



Re: [PATCH v11 7/9] arm64/kasan: add and use kasan_map_populate()

2017-10-10 Thread Will Deacon
Hi Pavel,

On Tue, Oct 10, 2017 at 01:07:35PM -0400, Pavel Tatashin wrote:
> Thank you for doing this work. How would you like to proceed?
> 
> - If you OK for my series to be accepted as-is, so your patch can be
> added later on top, I think, I need an ack from you for kasan changes.
> - Otherwise, I can replace: 4267aaf1d279 arm64/kasan: add and use
> kasan_map_populate() in my series with code from your patch.

I was thinking that you could just add my patch to the end of your series
and have the whole lot go up like that. If you want to merge it with your
patch, I'm fine with that too.

Will


Re: [PATCH v11 7/9] arm64/kasan: add and use kasan_map_populate()

2017-10-10 Thread Pavel Tatashin
Hi Will,

Thank you for doing this work. How would you like to proceed?

- If you are OK with my series being accepted as-is, so your patch can be
added later on top, then I think I need an ack from you for the kasan changes.
- Otherwise, I can replace 4267aaf1d279 ("arm64/kasan: add and use
kasan_map_populate()") in my series with code from your patch.

Thank you,
Pavel


[PATCH] powerpc/perf: Fix IMC initialization crash

2017-10-10 Thread Anju T Sudhakar
Call trace observed with latest firmware, and upstream kernel.

[   14.499938] NIP [c00f318c] init_imc_pmu+0x8c/0xcf0
[   14.499973] LR [c00f33f8] init_imc_pmu+0x2f8/0xcf0
[   14.57] Call Trace:
[   14.500027] [c03fed18f710] [c00f33c8] init_imc_pmu+0x2c8/0xcf0 
(unreliable)
[   14.500080] [c03fed18f800] [c00b5ec0] 
opal_imc_counters_probe+0x300/0x400
[   14.500132] [c03fed18f900] [c0807ef4] 
platform_drv_probe+0x64/0x110
[   14.500185] [c03fed18f980] [c0804b58] 
driver_probe_device+0x3d8/0x580
[   14.500236] [c03fed18fa10] [c0804e4c] __driver_attach+0x14c/0x1a0
[   14.500302] [c03fed18fa90] [c080156c] bus_for_each_dev+0x8c/0xf0
[   14.500353] [c03fed18fae0] [c0803fa4] driver_attach+0x34/0x50
[   14.500397] [c03fed18fb00] [c0803688] bus_add_driver+0x298/0x350
[   14.500449] [c03fed18fb90] [c080605c] driver_register+0x9c/0x180
[   14.500500] [c03fed18fc00] [c0807dec] 
__platform_driver_register+0x5c/0x70
[   14.500552] [c03fed18fc20] [c101cee0] 
opal_imc_driver_init+0x2c/0x40
[   14.500603] [c03fed18fc40] [c000d084] do_one_initcall+0x64/0x1d0
[   14.500654] [c03fed18fd00] [c100434c] 
kernel_init_freeable+0x280/0x374
[   14.500705] [c03fed18fdc0] [c000d314] kernel_init+0x24/0x160
[   14.500750] [c03fed18fe30] [c000b4e8] 
ret_from_kernel_thread+0x5c/0x74
[   14.500799] Instruction dump:
[   14.500827] 4082024c 2f890002 419e054c 2e890003 41960094 2e890001 3ba0ffea 
419602d8 
[   14.500884] 419e0290 2f890003 419e02a8 e93e0118  2fa3 419e0010 
4827ba41 
[   14.500945] ---[ end trace 27b734ad26f1add4 ]---
[   15.908719] 
[   16.908869] Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0007
[   16.908869] 
[   18.125813] ---[ end Kernel panic - not syncing: Attempted to kill init! 
exitcode=0x0007]

While registering nest IMC at init, the CPU-hotplug callback
`nest_pmu_cpumask_init()`
makes an OPAL call to stop the engine. If the OPAL call fails,
imc_common_cpuhp_mem_free() is invoked to clean up the memory and CPU-hotplug setup.

But when cleaning up the attribute group, we were dereferencing the attribute
element array without checking whether the backing element is NULL, which
causes the kernel panic.

Factor out the memory freeing part from imc_common_cpuhp_mem_free() to handle
the failing case gracefully.

Signed-off-by: Anju T Sudhakar 
Reported-by: Pridhiviraj Paidipeddi 
---
 arch/powerpc/perf/imc-pmu.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 9ccac86..213d976 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -224,8 +224,10 @@ static int update_events_in_group(struct device_node 
*node, struct imc_pmu *pmu)
 
/* Allocate memory for attribute group */
attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL);
-   if (!attr_group)
+   if (!attr_group) {
+   kfree(pmu->events);
return -ENOMEM;
+   }
 
/*
 * Allocate memory for attributes.
@@ -1115,6 +1117,15 @@ static void cleanup_all_thread_imc_memory(void)
}
 }
 
+/* Function to free the attr_groups which are dynamically allocated */
+static void imc_common_mem_free(struct imc_pmu *pmu_ptr)
+{
+   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
+   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
+   kfree(pmu_ptr);
+   return;
+}
+
 /*
  * Common function to unregister cpu hotplug callback and
  * free the memory.
@@ -1147,10 +1158,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu 
*pmu_ptr)
cleanup_all_thread_imc_memory();
}
 
-   /* Only free the attr_groups which are dynamically allocated  */
-   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
-   kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
-   kfree(pmu_ptr);
return;
 }
 
@@ -1289,17 +1296,19 @@ int init_imc_pmu(struct device_node *parent, struct 
imc_pmu *pmu_ptr, int pmu_id
 
ret = update_pmu_ops(pmu_ptr);
if (ret)
-   goto err_free;
+   goto err_free_mem;
 
ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1);
if (ret)
-   goto err_free;
+   goto err_free_mem;
 
pr_info("%s performance monitor hardware support registered\n",
pmu_ptr->pmu.name);
 
return 0;
 
+err_free_mem:
+   imc_common_mem_free(pmu_ptr);
 err_free:
imc_common_cpuhp_mem_free(pmu_ptr);
return ret;
-- 
1.8.3.1



[PATCH] powerpc/memkey: feature applies to PPC_BOOK3S_64 archs only

2017-10-10 Thread Ram Pai
Currently the protection key feature is erroneously configured to
be enabled for any flavor of PPC64. This patch fixes it.

Signed-off-by: Ram Pai 
---
 arch/powerpc/Kconfig |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 7dee449..4b2b055 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -868,7 +868,7 @@ config PPC64_MEMORY_PROTECTION_KEYS
prompt "PowerPC Memory Protection Keys"
def_bool y
# Note: only available in 64-bit mode
-   depends on PPC64
+   depends on PPC_BOOK3S_64
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
---help---
-- 
1.7.1



Re: [PATCH] powerpc/memkey: fix compilation error caused by upstream changes

2017-10-10 Thread Ram Pai
On Tue, Oct 10, 2017 at 08:00:27PM +1100, Michael Ellerman wrote:
> Ram Pai  writes:
> 
> > "commit df3735c5 -- x86,mpx: make mpx depend on x86-64 to free up VMA flag"
> > added the same vm highmem flag we had introduced for PKEY4. That broke
> > some of the definitions in pkeys.h
> >
> > This patch fixes the issue.
> >
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/pkeys.h |4 +---
> >  1 files changed, 1 insertions(+), 3 deletions(-)
> 
> Thanks.

Oops, this bug is already fixed in my latest branch; Balbir had
commented about it. The issue is that Kconfig is configured to depend on
PPC64 instead of PPC_BOOK3S_64. Will send a fix that applies to your
tree right away.

RP

> 
> Next problem is pmac32, corenet32/corenet64_smp defconfigs:
> 
>   arch/powerpc/include/asm/pkeys.h:19:23: error: 'VM_HIGH_ARCH_BIT_0' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:20:22: error: 'VM_HIGH_ARCH_0' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:21:22: error: 'VM_HIGH_ARCH_1' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:22:22: error: 'VM_HIGH_ARCH_2' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:23:22: error: 'VM_HIGH_ARCH_3' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:24:22: error: 'VM_HIGH_ARCH_4' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:50:39: error: 'H_PAGE_PKEY_BIT4' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:51:32: error: 'H_PAGE_PKEY_BIT3' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:52:32: error: 'H_PAGE_PKEY_BIT2' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:53:32: error: 'H_PAGE_PKEY_BIT1' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:54:32: error: 'H_PAGE_PKEY_BIT0' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:77:23: error: 'H_PAGE_PKEY_BIT0' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:77:43: error: 'HPTE_R_KEY_BIT0' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:78:16: error: 'H_PAGE_PKEY_BIT1' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:78:36: error: 'HPTE_R_KEY_BIT1' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:79:16: error: 'H_PAGE_PKEY_BIT2' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:79:36: error: 'HPTE_R_KEY_BIT2' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:80:16: error: 'H_PAGE_PKEY_BIT3' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:80:36: error: 'HPTE_R_KEY_BIT3' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:81:16: error: 'H_PAGE_PKEY_BIT4' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:81:36: error: 'HPTE_R_KEY_BIT4' undeclared 
> (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:89:23: error: 'H_PAGE_PKEY_BIT0' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:90:16: error: 'H_PAGE_PKEY_BIT1' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:91:16: error: 'H_PAGE_PKEY_BIT2' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:92:16: error: 'H_PAGE_PKEY_BIT3' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/pkeys.h:93:16: error: 'H_PAGE_PKEY_BIT4' 
> undeclared (first use in this function)
>   arch/powerpc/include/asm/mmu_context.h:199:19: error: redefinition of 
> 'vma_pkey'
>   arch/powerpc/include/asm/mmu_context.h:204:19: error: redefinition of 
> 'pte_to_hpte_pkey_bits'
>   arch/powerpc/include/asm/pkeys.h:102:48: error: 'mm_context_t' has no 
> member named 'pkey_allocation_map'
>   arch/powerpc/include/asm/pkeys.h:224:13: error: 'mm_context_t' has no 
> member named 'execute_only_pkey'
> 
> cheers

-- 
Ram Pai



Re: [PATCH 1/3] powerpc/powernv: Avoid the secondary hold spinloop for OPAL boot

2017-10-10 Thread Nicholas Piggin
On Tue, 10 Oct 2017 21:44:15 +1000
Nicholas Piggin  wrote:

> On Tue, 10 Oct 2017 22:11:46 +1100
> Michael Ellerman  wrote:
> 
> > Nicholas Piggin  writes:
> >   
> > > OPAL boot does not insert secondaries at 0x60 to wait at the secondary
> > > hold spinloop. Instead it keeps them held in firmware until the
> > > opal_start_cpu call is made, which directs them where the caller
> > > specifies. Linux inserts them into generic_secondary_smp_init(), which
> > > is after the secondary hold spinloop (they go on to spin at the per-CPU
> > > paca loops, but that is another step).
> > >
> > > So avoid waiting on this spinloop when booting with OPAL firmware.
> > > It always just times out.
> > >
> > > This saves 100ms boot time on bare metal, and 10s of seconds when
> > > booting the simulator in SMP.
> > 
> > Oh nice, that's real facepalm territory.
> > 
> > It'd be neater if we just inserted them at 0x60, but the sequence is
> > wrong.
> > 
> > Can we fix it just by making spinning_secondaries zero on OPAL?  
> 
> I had a look at that, but generic_secondary_smp_init() still
> decrements it, so it would underflow which I thought was
> uglier.
> 
> I actually have to look a bit further, because KVM guests are
> also having the loop time out too by the looks.

Ahh okay, pseries is using the start-cpu RTAS call to enter at
generic_secondary_smp_init() as well. So we can take it out for
pseries as well.

Thanks,
Nick


Re: [PATCH v11 7/9] arm64/kasan: add and use kasan_map_populate()

2017-10-10 Thread Will Deacon
Hi Pavel,

On Mon, Oct 09, 2017 at 06:19:29PM -0400, Pavel Tatashin wrote:
> During early boot, kasan uses vmemmap_populate() to establish its shadow
> memory. But, that interface is intended for struct pages use.
> 
> Because of the current project, vmemmap won't be zeroed during allocation,
> but kasan expects that memory to be zeroed. We are adding a new
> kasan_map_populate() function to resolve this difference.
> 
> Therefore, we must use a new interface to allocate and map kasan shadow
> memory, that also zeroes memory for us.
> 
> Signed-off-by: Pavel Tatashin 
> ---
>  arch/arm64/mm/kasan_init.c | 72 
> ++
>  1 file changed, 66 insertions(+), 6 deletions(-)

Thanks for doing this, although I still think we can do better and avoid the
additional walking code altogether, as well as removing the dependence on
vmemmap. Rather than keep messing you about here (sorry about that), I've
written an arm64 patch for you to take on top of this series. Please take
a look below.

Cheers,

Will

--->8

From 36c6c7c06273d08348b47c1a182116b0a1df8363 Mon Sep 17 00:00:00 2001
From: Will Deacon 
Date: Tue, 10 Oct 2017 15:49:43 +0100
Subject: [PATCH] arm64: kasan: Avoid using vmemmap_populate to initialise
 shadow

The kasan shadow is currently mapped using vmemmap_populate since that
provides a semi-convenient way to map pages into swapper. However, since
that no longer zeroes the mapped pages, it is not suitable for kasan,
which requires that the shadow is zeroed in order to avoid false
positives.

This patch removes our reliance on vmemmap_populate and reuses the
existing kasan page table code, which is already required for creating
the early shadow.

Signed-off-by: Will Deacon 
---
 arch/arm64/Kconfig |   2 +-
 arch/arm64/mm/kasan_init.c | 176 +++--
 2 files changed, 74 insertions(+), 104 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0df64a6a56d4..888580b9036e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -68,7 +68,7 @@ config ARM64
select HAVE_ARCH_BITREVERSE
select HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
-   select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && 
ARM64_VA_BITS_48)
+   select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index cb4af2951c90..b922826d9908 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -11,6 +11,7 @@
  */
 
 #define pr_fmt(fmt) "kasan: " fmt
+#include 
 #include 
 #include 
 #include 
@@ -28,66 +29,6 @@
 
 static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE);
 
-/* Creates mappings for kasan during early boot. The mapped memory is zeroed */
-static int __meminit kasan_map_populate(unsigned long start, unsigned long end,
-   int node)
-{
-   unsigned long addr, pfn, next;
-   unsigned long long size;
-   pgd_t *pgd;
-   pud_t *pud;
-   pmd_t *pmd;
-   pte_t *pte;
-   int ret;
-
-   ret = vmemmap_populate(start, end, node);
-   /*
-* We might have partially populated memory, so check for no entries,
-* and zero only those that actually exist.
-*/
-   for (addr = start; addr < end; addr = next) {
-   pgd = pgd_offset_k(addr);
-   if (pgd_none(*pgd)) {
-   next = pgd_addr_end(addr, end);
-   continue;
-   }
-
-   pud = pud_offset(pgd, addr);
-   if (pud_none(*pud)) {
-   next = pud_addr_end(addr, end);
-   continue;
-   }
-   if (pud_sect(*pud)) {
-   /* This is PUD size page */
-   next = pud_addr_end(addr, end);
-   size = PUD_SIZE;
-   pfn = pud_pfn(*pud);
-   } else {
-   pmd = pmd_offset(pud, addr);
-   if (pmd_none(*pmd)) {
-   next = pmd_addr_end(addr, end);
-   continue;
-   }
-   if (pmd_sect(*pmd)) {
-   /* This is PMD size page */
-   next = pmd_addr_end(addr, end);
-   size = PMD_SIZE;
-   pfn = pmd_pfn(*pmd);
-   } else {
-   pte = pte_offset_kernel(pmd, addr);
-   next = addr + PAGE_SIZE;
-   if (pte_none(*pte))
-   continue;
-

[PATCH] powerpc/modules: Use WARN_ON() in stub_for_addr()

2017-10-10 Thread Kamalesh Babulal
Use WARN_ON(), while running out of stubs in stub_for_addr()
and abort loading of the module instead of BUG_ON().

Signed-off-by: Kamalesh Babulal 
---
 arch/powerpc/kernel/module_64.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 0b0f896..759104b 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -429,7 +429,8 @@ static unsigned long stub_for_addr(const Elf64_Shdr 
*sechdrs,
/* Find this stub, or if that fails, the next avail. entry */
stubs = (void *)sechdrs[me->arch.stubs_section].sh_addr;
for (i = 0; stub_func_addr(stubs[i].funcdata); i++) {
-   BUG_ON(i >= num_stubs);
+   if (WARN_ON(i >= num_stubs))
+   return 0;
 
if (stub_func_addr(stubs[i].funcdata) == func_addr(addr))
return (unsigned long)&stubs[i];
-- 
2.7.4



Re: [PATCH v11 5/9] mm: zero reserved and unavailable struct pages

2017-10-10 Thread Pavel Tatashin
> Btw. I would add your example from 
> http://lkml.kernel.org/r/bcf24369-ac37-cedd-a264-3396fb5cf...@oracle.com
> to do changelog
>

Will add, thank you for your review.

Pavel


Re: [PATCH v11 0/9] complete deferred page initialization

2017-10-10 Thread Michal Hocko
Btw. thanks for your persistence and willingness to go over all the
suggestions, which might not have been consistent between different
versions. I believe this is a general improvement in the early
initialization code. We do not rely on an implicit zeroing which just
happens to work by chance. The performance improvements are a bonus on
top.

Thanks, good work!
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v11 5/9] mm: zero reserved and unavailable struct pages

2017-10-10 Thread Michal Hocko
On Tue 10-10-17 15:44:41, Michal Hocko wrote:
> On Mon 09-10-17 18:19:27, Pavel Tatashin wrote:
> > Some memory is reserved but unavailable: not present in memblock.memory
> > (because not backed by physical pages), but present in memblock.reserved.
> > Such memory has backing struct pages, but they are not initialized by going
> > through __init_single_page().
> > 
> > In some cases these struct pages are accessed even if they do not contain
> > any data. One example is page_to_pfn() might access page->flags if this is
> > where section information is stored (CONFIG_SPARSEMEM,
> > SECTION_IN_PAGE_FLAGS).
> > 
> > One example of such memory: trim_low_memory_range() unconditionally
> > reserves from pfn 0, but e820__memblock_setup() might provide the exiting
> > memory from pfn 1 (i.e. KVM).

Btw. I would add your example from 
http://lkml.kernel.org/r/bcf24369-ac37-cedd-a264-3396fb5cf...@oracle.com
to the changelog.
 
> > Since, struct pages are zeroed in __init_single_page(), and not during
> > allocation time, we must zero such struct pages explicitly.
> > 
> > The patch involves adding a new memblock iterator:
> > for_each_resv_unavail_range(i, p_start, p_end)
> > 
> > Which iterates through reserved && !memory lists, and we zero struct pages
> > explicitly by calling mm_zero_struct_page().
> > 
> > Signed-off-by: Pavel Tatashin 
> > Reviewed-by: Steven Sistare 
> > Reviewed-by: Daniel Jordan 
> > Reviewed-by: Bob Picco 
> 
> Acked-by: Michal Hocko 
> 
> > ---
> >  include/linux/memblock.h | 16 
> >  include/linux/mm.h   | 15 +++
> >  mm/page_alloc.c  | 38 ++
> >  3 files changed, 69 insertions(+)
> > 
> > diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> > index bae11c7e7bf3..ce8bfa5f3e9b 100644
> > --- a/include/linux/memblock.h
> > +++ b/include/linux/memblock.h
> > @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long 
> > pfn, unsigned long max_pfn);
> > for_each_mem_range_rev(i, , , \
> >nid, flags, p_start, p_end, p_nid)
> >  
> > +/**
> > + * for_each_resv_unavail_range - iterate through reserved and unavailable 
> > memory
> > + * @i: u64 used as loop variable
> > + * @flags: pick from blocks based on memory attributes
> > + * @p_start: ptr to phys_addr_t for start address of the range, can be 
> > %NULL
> > + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
> > + *
> > + * Walks over unavailable but reserved (reserved && !memory) areas of 
> > memblock.
> > + * Available as soon as memblock is initialized.
> > + * Note: because this memory does not belong to any physical node, flags 
> > and
> > + * nid arguments do not make sense and thus not exported as arguments.
> > + */
> > +#define for_each_resv_unavail_range(i, p_start, p_end) 
> > \
> > +   for_each_mem_range(i, , , \
> > +  NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
> > +
> >  static inline void memblock_set_region_flags(struct memblock_region *r,
> >  unsigned long flags)
> >  {
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 065d99deb847..04c8b2e5aff4 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -94,6 +94,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
> >  #define mm_forbids_zeropage(X) (0)
> >  #endif
> >  
> > +/*
> > + * On some architectures it is expensive to call memset() for small sizes.
> > + * Those architectures should provide their own implementation of "struct 
> > page"
> > + * zeroing by defining this macro in .
> > + */
> > +#ifndef mm_zero_struct_page
> > +#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct 
> > page)))
> > +#endif
> > +
> >  /*
> >   * Default maximum number of active map areas, this limits the number of 
> > vmas
> >   * per mm struct. Users can overwrite this number by sysctl but there is a
> > @@ -2001,6 +2010,12 @@ extern int __meminit __early_pfn_to_nid(unsigned 
> > long pfn,
> > struct mminit_pfnnid_cache *state);
> >  #endif
> >  
> > +#ifdef CONFIG_HAVE_MEMBLOCK
> > +void zero_resv_unavail(void);
> > +#else
> > +static inline void zero_resv_unavail(void) {}
> > +#endif
> > +
> >  extern void set_dma_reserve(unsigned long new_dma_reserve);
> >  extern void memmap_init_zone(unsigned long, int, unsigned long,
> > unsigned long, enum memmap_context);
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 20b0bace2235..5f0013bbbe9d 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -6209,6 +6209,42 @@ void __paginginit free_area_init_node(int nid, 
> > unsigned long *zones_size,
> > free_area_init_core(pgdat);
> >  }
> >  
> > +#ifdef CONFIG_HAVE_MEMBLOCK
> > 

Re: [PATCH] powerpc/64s/radix: fix preempt imbalance in TLB flush

2017-10-10 Thread Nicholas Piggin
On Tue, 10 Oct 2017 19:09:54 +0530
"Aneesh Kumar K.V"  wrote:

> 
> 
> On 10/10/2017 04:02 PM, Nicholas Piggin wrote:
> > On Tue, 10 Oct 2017 15:52:02 +0530
> > "Aneesh Kumar K.V"  wrote:
> >
> >> On 10/10/2017 03:46 PM, Nicholas Piggin wrote:
> >>> Signed-off-by: Nicholas Piggin 
> >>> ---
> >>>arch/powerpc/mm/tlb-radix.c | 3 ++-
> >>>1 file changed, 2 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> >>> index b3e849c4886e..de414460287a 100644
> >>> --- a/arch/powerpc/mm/tlb-radix.c
> >>> +++ b/arch/powerpc/mm/tlb-radix.c
> >>> @@ -358,7 +358,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct 
> >>> *mm, unsigned long addr)
> >>>   unsigned long ap = mmu_get_ap(mmu_virtual_psize);
> >>>   unsigned long pid, end;
> >>>
> >>> -
> >>> + preempt_disable();
> >>>   pid = mm ? mm->context.id : 0;
> >>>   if (unlikely(pid == MMU_NO_CONTEXT))
> >>>   goto no_context;
> >>> @@ -366,6 +366,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct 
> >>> *mm, unsigned long addr)
> >>>   /* 4k page size, just blow the world */
> >>>   if (PAGE_SIZE == 0x1000) {
> >>>   radix__flush_all_mm(mm);
> >>> + preempt_enable();
> >>>   return;
> >>>   }
> >>>
> >> Can't we do a preempt_disable before the if (local) check?. That way we
> >> don't need that prempt_enable in that PAGE_SIZE==0x1000 path.We already
> >> do disable/enable correctly in radix__flush_all_mm(mm)
> > Well this is just to fix the imbalance. Nested preempt doesn't matter
> > much, and these are all no-ops for !preempt kernel, unless you turn on
> > debugging.
> 
> But this patch is still doing the mm_is_thread_local() test outside 
> preempt_disable() right?

Yes. As it does in some other place too. It's just a minimal fix for
the imbalance issue as I said, because that's messing up debugging.

> > I already proposed another patch to bring those local tests under
> > preempt disable but no response yet
> >
> > https://patchwork.ozlabs.org/patch/811061/
> >
> 
> That is a much better patch?

I'm planning to repost the series but have been side-tracked testing it
due to hitting bugs (!preempt though, so this has not been
top priority).

Thanks,
Nick


Re: [PATCH v11 5/9] mm: zero reserved and unavailable struct pages

2017-10-10 Thread Michal Hocko
On Mon 09-10-17 18:19:27, Pavel Tatashin wrote:
> Some memory is reserved but unavailable: not present in memblock.memory
> (because not backed by physical pages), but present in memblock.reserved.
> Such memory has backing struct pages, but they are not initialized by going
> through __init_single_page().
> 
> In some cases these struct pages are accessed even if they do not contain
> any data. One example is page_to_pfn() might access page->flags if this is
> where section information is stored (CONFIG_SPARSEMEM,
> SECTION_IN_PAGE_FLAGS).
> 
> One example of such memory: trim_low_memory_range() unconditionally
> reserves from pfn 0, but e820__memblock_setup() might provide the exiting
> memory from pfn 1 (i.e. KVM).
> 
> Since, struct pages are zeroed in __init_single_page(), and not during
> allocation time, we must zero such struct pages explicitly.
> 
> The patch involves adding a new memblock iterator:
>   for_each_resv_unavail_range(i, p_start, p_end)
> 
> Which iterates through reserved && !memory lists, and we zero struct pages
> explicitly by calling mm_zero_struct_page().
> 
> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Steven Sistare 
> Reviewed-by: Daniel Jordan 
> Reviewed-by: Bob Picco 

Acked-by: Michal Hocko 

> ---
>  include/linux/memblock.h | 16 
>  include/linux/mm.h   | 15 +++
>  mm/page_alloc.c  | 38 ++
>  3 files changed, 69 insertions(+)
> 
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index bae11c7e7bf3..ce8bfa5f3e9b 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, 
> unsigned long max_pfn);
>   for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \
>  nid, flags, p_start, p_end, p_nid)
>  
> +/**
> + * for_each_resv_unavail_range - iterate through reserved and unavailable 
> memory
> + * @i: u64 used as loop variable
> + * @flags: pick from blocks based on memory attributes
> + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
> + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
> + *
> + * Walks over unavailable but reserved (reserved && !memory) areas of 
> memblock.
> + * Available as soon as memblock is initialized.
> + * Note: because this memory does not belong to any physical node, flags and
> + * nid arguments do not make sense and thus not exported as arguments.
> + */
> +#define for_each_resv_unavail_range(i, p_start, p_end)   
> \
> + for_each_mem_range(i, &memblock.reserved, &memblock.memory, \
> +NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL)
> +
>  static inline void memblock_set_region_flags(struct memblock_region *r,
>unsigned long flags)
>  {
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 065d99deb847..04c8b2e5aff4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -94,6 +94,15 @@ extern int mmap_rnd_compat_bits __read_mostly;
>  #define mm_forbids_zeropage(X)   (0)
>  #endif
>  
> +/*
> + * On some architectures it is expensive to call memset() for small sizes.
> + * Those architectures should provide their own implementation of "struct 
> page"
> + * zeroing by defining this macro in .
> + */
> +#ifndef mm_zero_struct_page
> +#define mm_zero_struct_page(pp)  ((void)memset((pp), 0, sizeof(struct page)))
> +#endif
> +
>  /*
>   * Default maximum number of active map areas, this limits the number of vmas
>   * per mm struct. Users can overwrite this number by sysctl but there is a
> @@ -2001,6 +2010,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long 
> pfn,
>   struct mminit_pfnnid_cache *state);
>  #endif
>  
> +#ifdef CONFIG_HAVE_MEMBLOCK
> +void zero_resv_unavail(void);
> +#else
> +static inline void zero_resv_unavail(void) {}
> +#endif
> +
>  extern void set_dma_reserve(unsigned long new_dma_reserve);
>  extern void memmap_init_zone(unsigned long, int, unsigned long,
>   unsigned long, enum memmap_context);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 20b0bace2235..5f0013bbbe9d 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -6209,6 +6209,42 @@ void __paginginit free_area_init_node(int nid, 
> unsigned long *zones_size,
>   free_area_init_core(pgdat);
>  }
>  
> +#ifdef CONFIG_HAVE_MEMBLOCK
> +/*
> + * Only struct pages that are backed by physical memory are zeroed and
> + * initialized by going through __init_single_page(). But, there are some
> + * struct pages which are reserved in memblock allocator and their fields
> + * may be accessed (for example page_to_pfn() on some configuration accesses
> + * flags). We must explicitly zero those struct pages.
> + */
> +void 
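The quoted hunk breaks off at this point. A rough sketch of how
zero_resv_unavail() could look, based only on the commit message above; the
page counter and the pr_info() summary are assumptions, not the actual patch:

#ifdef CONFIG_HAVE_MEMBLOCK
/*
 * Sketch only: walk memblock ranges that are reserved but not backed by
 * memory and zero their struct pages, since __init_single_page() never
 * sees them.
 */
void __paginginit zero_resv_unavail(void)
{
	phys_addr_t start, end;
	unsigned long pgcnt = 0;
	u64 i;

	for_each_resv_unavail_range(i, &start, &end) {
		unsigned long pfn;

		for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
			mm_zero_struct_page(pfn_to_page(pfn));
			pgcnt++;
		}
	}

	pr_info("Zeroed struct page in unavailable ranges: %lu pages\n", pgcnt);
}
#endif /* CONFIG_HAVE_MEMBLOCK */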

Re: [PATCH] powerpc/64s/radix: fix preempt imbalance in TLB flush

2017-10-10 Thread Aneesh Kumar K.V



On 10/10/2017 04:02 PM, Nicholas Piggin wrote:

On Tue, 10 Oct 2017 15:52:02 +0530
"Aneesh Kumar K.V"  wrote:


On 10/10/2017 03:46 PM, Nicholas Piggin wrote:

Signed-off-by: Nicholas Piggin 
---
   arch/powerpc/mm/tlb-radix.c | 3 ++-
   1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b3e849c4886e..de414460287a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -358,7 +358,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;

-
+   preempt_disable();
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
@@ -366,6 +366,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
radix__flush_all_mm(mm);
+   preempt_enable();
return;
}
   

Can't we do a preempt_disable() before the if (local) check? That way we
don't need that preempt_enable() in the PAGE_SIZE == 0x1000 path. We already
do disable/enable correctly in radix__flush_all_mm(mm).

Well this is just to fix the imbalance. Nested preempt doesn't matter
much, and these are all no-ops for !preempt kernel, unless you turn on
debugging.


But this patch is still doing the mm_is_thread_local() test outside
preempt_disable(), right?




I already proposed another patch to bring those local tests under
preempt disable but no response yet

https://patchwork.ozlabs.org/patch/811061/



That is a much better patch?

-aneesh



Re: [PATCH v10 05/10] mm: zero reserved and unavailable struct pages

2017-10-10 Thread Michal Hocko
On Fri 06-10-17 11:25:16, Pasha Tatashin wrote:
> Hi Michal,
> 
> > 
> > As I've said in other reply this should go in only if the scenario you
> > describe is real. I am somehow suspicious to be honest. I simply do not
> > see how those weird struct pages would be in a valid pfn range of any
> > zone.
> > 
> 
> There are examples both where unavailable memory is not part of any zone,
> and where it is part of zones.
> 
> I run Linux in kvm with these arguments:
> 
> qemu-system-x86_64
> -enable-kvm
> -cpu kvm64
> -kernel $kernel
> -initrd $initrd
> -m 512
> -smp 2
> -device e1000,netdev=net0
> -netdev user,id=net0
> -boot order=nc
> -no-reboot
> -watchdog i6300esb
> -watchdog-action debug
> -rtc base=localtime
> -serial stdio
> -display none
> -monitor null
> 
> This patch reports that there are 98 unavailable pages.
> 
> They are: pfn 0 and pfns in range [159, 255].
> 
> Note that trim_low_memory_range() reserves only pfns in the range [0, 15];
> it does not reserve the [159, 255] ones.
> 
> e820__memblock_setup() reports to Linux that the following physical ranges
> are available:
> [1 , 158]
> [256, 130783]
> 
> Notice that exactly the unavailable pfns are missing!
> 
> Now, let's check what we have in zone 0: [1, 131039]
> 
> pfn 0 is not part of the zone, but pfns [1, 158] are.
> 
> However, the bigger problem if we do not initialize these struct pages is
> with memory hotplug, because that path operates at 2M boundaries
> (section_nr) and checks whether a 2M range of pages is hot-removable. It
> starts with the first pfn from the zone, rounds it down to a 2M boundary
> (struct pages are allocated at 2M boundaries when the vmemmap is created),
> and checks whether that section is hot-removable. In this case it starts
> with pfn 1 and rounds it down to pfn 0.
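A tiny stand-alone illustration of the rounding described above, assuming 2M
sections of 4K pages as in Pavel's description; the constant and helper names
are made up for the example:

#include <stdio.h>

#define SKETCH_PAGES_PER_SECTION 512UL	/* 2M / 4K */

/* round a pfn down to the first pfn of its section */
static unsigned long section_start_pfn(unsigned long pfn)
{
	return pfn & ~(SKETCH_PAGES_PER_SECTION - 1);
}

int main(void)
{
	/* zone 0 starts at pfn 1, but the hot-remove check lands on pfn 0 */
	printf("section_start_pfn(1) = %lu\n", section_start_pfn(1));
	return 0;
}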

Hmm, this is really interesting! I thought each memblock was guaranteed
to be section-size aligned, but I suspect that was wishful
thinking. Now I see what the problem is.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH] KVM: PPC: Book3S HV: POWER9 more doorbell fixes

2017-10-10 Thread Nicholas Piggin
On Tue, 10 Oct 2017 20:18:28 +1000
Nicholas Piggin  wrote:

> - Add another case where msgsync is required.
> - Required barrier sequence for global doorbells is msgsync ; lwsync
> - POWER9 DD1 has a different barrier sequence that we don't implement,
>   so remove

This last item was not included in the patch; it slipped past.
I think the changelog can just be amended to drop it.

Fixing up DD1 should be a different patch, if we want to bother
with it. I guess we either add the darn sequence for msgsync, or
add support for XIVE IPIs?

Thanks,
Nick


Re: [PATCH 1/3] powerpc/powernv: Avoid the secondary hold spinloop for OPAL boot

2017-10-10 Thread Nicholas Piggin
On Tue, 10 Oct 2017 22:11:46 +1100
Michael Ellerman  wrote:

> Nicholas Piggin  writes:
> 
> > OPAL boot does not insert secondaries at 0x60 to wait at the secondary
> > hold spinloop. Instead it keeps them held in firmware until the
> > opal_start_cpu call is made, which directs them where the caller
> > specifies. Linux inserts them into generic_secondary_smp_init(), which
> > is after the secondary hold spinloop (they go on to spin at the per-CPU
> > paca loops, but that is another step).
> >
> > So avoid waiting on this spinloop when booting with OPAL firmware.
> > It always just times out.
> >
> > This saves 100ms of boot time on bare metal, and tens of seconds when
> > booting the simulator in SMP.
> 
> Oh nice, that's real facepalm territory.
> 
> It'd be neater if we just inserted them at 0x60, but the sequence is
> wrong.
> 
> Can we fix it just by making spinning_secondaries zero on OPAL?

I had a look at that, but generic_secondary_smp_init() still
decrements it, so it would underflow, which I thought was
uglier.

I actually have to look a bit further, because KVM guests also
have the loop time out, by the looks of it.

Thanks,
Nick


Re: [PATCH 1/3] powerpc/powernv: Avoid the secondary hold spinloop for OPAL boot

2017-10-10 Thread Michael Ellerman
Nicholas Piggin  writes:

> OPAL boot does not insert secondaries at 0x60 to wait at the secondary
> hold spinloop. Instead it keeps them held in firmware until the
> opal_start_cpu call is made, which directs them where the caller
> specifies. Linux inserts them into generic_secondary_smp_init(), which
> is after the secondary hold spinloop (they go on to spin at the per-CPU
> paca loops, but that is another step).
>
> So avoid waiting on this spinloop when booting with OPAL firmware.
> It always just times out.
>
> This saves 100ms of boot time on bare metal, and tens of seconds when
> booting the simulator in SMP.

Oh nice, that's real facepalm territory.

It'd be neater if we just inserted them at 0x60, but the sequence is
wrong.

Can we fix it just by making spinning_secondaries zero on OPAL?

cheers


Re: [PATCH v2] powerpc: Default to enabling STRICT_KERNEL_RWX

2017-10-10 Thread Michael Ellerman
Kees Cook  writes:

> When available, CONFIG_KERNEL_RWX should be default-enabled for PPC64.
> On PPC32, there is a performance trade-off.

Thanks for prodding us. But I think we need some more test cycles on
this before we make it the default.

As Balbir said it's currently not compatible with RELOCATABLE, which
means most folks aren't enabling it.

We also don't have good numbers on what the performance impact is on
64-bit. So although it almost certainly should be the default in future,
I'd still like us to have some idea of what it's costing us.

I'll try and get some perf numbers.

cheers


Re: [PATCH] powerpc/64s/radix: fix preempt imbalance in TLB flush

2017-10-10 Thread Nicholas Piggin
On Tue, 10 Oct 2017 15:52:02 +0530
"Aneesh Kumar K.V"  wrote:

> On 10/10/2017 03:46 PM, Nicholas Piggin wrote:
> > Signed-off-by: Nicholas Piggin 
> > ---
> >   arch/powerpc/mm/tlb-radix.c | 3 ++-
> >   1 file changed, 2 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
> > index b3e849c4886e..de414460287a 100644
> > --- a/arch/powerpc/mm/tlb-radix.c
> > +++ b/arch/powerpc/mm/tlb-radix.c
> > @@ -358,7 +358,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct 
> > *mm, unsigned long addr)
> > unsigned long ap = mmu_get_ap(mmu_virtual_psize);
> > unsigned long pid, end;
> > 
> > -
> > +   preempt_disable();
> > pid = mm ? mm->context.id : 0;
> > if (unlikely(pid == MMU_NO_CONTEXT))
> > goto no_context;
> > @@ -366,6 +366,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct 
> > *mm, unsigned long addr)
> > /* 4k page size, just blow the world */
> > if (PAGE_SIZE == 0x1000) {
> > radix__flush_all_mm(mm);
> > +   preempt_enable();
> > return;
> > }
> >   
> 
> Can't we do a preempt_disable() before the if (local) check? That way we
> don't need that preempt_enable() in the PAGE_SIZE == 0x1000 path. We already
> do disable/enable correctly in radix__flush_all_mm(mm).

Well this is just to fix the imbalance. Nested preempt doesn't matter
much, and these are all no-ops for !preempt kernel, unless you turn on
debugging.

I already proposed another patch to bring those local tests under
preempt disable but no response yet

https://patchwork.ozlabs.org/patch/811061/

Thanks,
Nick


Re: [PATCH] powerpc/64s/radix: fix preempt imbalance in TLB flush

2017-10-10 Thread Aneesh Kumar K.V



On 10/10/2017 03:46 PM, Nicholas Piggin wrote:

Signed-off-by: Nicholas Piggin 
---
  arch/powerpc/mm/tlb-radix.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b3e849c4886e..de414460287a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -358,7 +358,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;

-
+   preempt_disable();
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
@@ -366,6 +366,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
radix__flush_all_mm(mm);
+   preempt_enable();
return;
}



Can't we do a preempt_disable() before the if (local) check? That way we
don't need that preempt_enable() in the PAGE_SIZE == 0x1000 path. We already
do disable/enable correctly in radix__flush_all_mm(mm).


-aneesh



[PATCH] KVM: PPC: Book3S HV: POWER9 more doorbell fixes

2017-10-10 Thread Nicholas Piggin
- Add another case where msgsync is required.
- Required barrier sequence for global doorbells is msgsync ; lwsync
- POWER9 DD1 has a different barrier sequence that we don't implement,
  so remove

When msgsnd is used for IPIs to other cores, msgsync must be executed by
the target to order stores performed on the source before its msgsnd
(provided the source executes the appropriate sync).

Fixes: 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores 
on POWER9")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 663a4a861e7f..90c07421eba2 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1300,6 +1300,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
bne 3f
 BEGIN_FTR_SECTION
PPC_MSGSYNC
+   lwsync
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi   r0, 0
@@ -2761,6 +2762,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
PPC_MSGCLR(6)
/* see if it's a host IPI */
li  r3, 1
+BEGIN_FTR_SECTION
+   PPC_MSGSYNC
+   lwsync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi   r0, 0
bnelr
-- 
2.13.3
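The ordering rule described in the changelog (the target must execute msgsync,
followed by a barrier, before consuming data the source published before its
msgsnd) is essentially a release/acquire pairing. A hedged user-space analogy
using C11 atomics rather than the Power barriers, with made-up names:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int payload;
static atomic_int doorbell;

static void *sender(void *arg)
{
	payload = 42;				/* stores before the "msgsnd" */
	atomic_store_explicit(&doorbell, 1, memory_order_release);
	return NULL;
}

static void *receiver(void *arg)
{
	/* acquire plays the role of msgsync ; lwsync on the target side */
	while (!atomic_load_explicit(&doorbell, memory_order_acquire))
		;
	printf("payload = %d\n", payload);	/* guaranteed to see 42 */
	return NULL;
}

int main(void)
{
	pthread_t s, r;

	pthread_create(&r, NULL, receiver, NULL);
	pthread_create(&s, NULL, sender, NULL);
	pthread_join(s, NULL);
	pthread_join(r, NULL);
	return 0;
}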



[PATCH] powerpc/64s/radix: fix preempt imbalance in TLB flush

2017-10-10 Thread Nicholas Piggin
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b3e849c4886e..de414460287a 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -358,7 +358,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;
 
-
+   preempt_disable();
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
@@ -366,6 +366,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
radix__flush_all_mm(mm);
+   preempt_enable();
return;
}
 
-- 
2.13.3



Re: [PATCH] selftests/powerpc: fix build error in powerpc ptrace selftests.

2017-10-10 Thread Michael Ellerman
wei.guo.si...@gmail.com writes:

> From: Simon Guo 
>
> GCC 7 treats "r2" in the clobber list as an error, and we get the following
> build errors for the powerpc ptrace selftests even with the -fno-pic option:
>   ptrace-tm-vsx.c: In function ‘tm_vsx’:
>   ptrace-tm-vsx.c:42:2: error: PIC register clobbered by ‘r2’ in ‘asm’
> asm __volatile__(
> ^~~
>   make[1]: *** [ptrace-tm-vsx] Error 1
>   ptrace-tm-spd-vsx.c: In function ‘tm_spd_vsx’:
>   ptrace-tm-spd-vsx.c:55:2: error: PIC register clobbered by ‘r2’ in ‘asm’
> asm __volatile__(
> ^~~
>   make[1]: *** [ptrace-tm-spd-vsx] Error 1
>   ptrace-tm-spr.c: In function ‘tm_spr’:
>   ptrace-tm-spr.c:46:2: error: PIC register clobbered by ‘r2’ in ‘asm’
> asm __volatile__(
> ^~~
>
> This patch fixes the build error by removing "r2" from the clobber list.

But do any of the blocks clobber r2? If so then it should be in the
clobber list.

cheers


Re: [PATCH] powerpc: Drop lockdep_assert_cpus_held call from arch_update_cpu_topology

2017-10-10 Thread Michael Ellerman
Thomas Gleixner  writes:
...
>
> So no, the lockdep assertion triggers in #1 and #2 because
>
>#1 does definitely not hold it
>
>#2 is indirectly protected, but we have no way to express that to lockdep
>
> So yes, it's safe for both cases to remove that assertion.

Thanks for clarifying that.

> If there are other call sites, then they need to be checked. If not, you're
> good.

I also see a call in partition_sched_domains(). The comment there says
"call with hotplug lock held" and I'm sure all callers do so ...

But seriously I think the patch is good because we know there are at
least two callers who are safe but can't hold the lock, so doing an
assert there is definitely wrong.

So I'll apply the patch with a slightly reworked commit message.

cheers


Re: [PATCH] powerpc/pseries/cpuidle: add polling idle for shared processor guests

2017-10-10 Thread Nicholas Piggin
On Tue, 10 Oct 2017 17:11:09 +1000
Nicholas Piggin  wrote:

> For shared processor guests (e.g., KVM), add an idle polling mode rather
> than immediately returning to the hypervisor when the guest CPU goes
> idle.
> 
> Test setup is a 2-socket POWER9 with 4 guests running, each with vCPUs
> equal to 1/2 of the real CPUs. Each guest was saturated with tbench. Using
> polling idle gives about 1.4x throughput.

Actually it's even more noticeable when the host is not oversubscribed,
by the looks of it: a 2.5x-3x increase in throughput for just a single guest.

Thanks,
Nick


Re: [PATCH v4] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-10-10 Thread Naveen N. Rao
On 2017/10/10 09:03AM, Santosh Sivaraj wrote:
> * Naveen N. Rao  wrote (on 2017-10-09 
> 10:39:18 +):
> 
> > On 2017/10/09 08:09AM, Santosh Sivaraj wrote:

[snip]

> > > + add r3,r3,r0
> > > + ld  r0,CFG_TB_UPDATE_COUNT(r3)
> > > + cmpld   cr0,r0,r8   /* check if updated */
> > > + bne-70b
> > 
> > I also notice that the code for dealing with CLOCK_MONOTONIC is similar 
> > for _COARSE and regular clocks. If possible, we should reuse that as 
> > well.
> >
> In this case we would be adding more checks and branches in order to reuse
> the code. If we want to keep the code common, we will have to do a lot of
> jumping around; the code will contain a bunch of branches, which I feel will
> make the flow hard to understand. (Q: Do a lot of branches have a bad effect
> on branch prediction?)

Right - like we discussed offline, if it hurts readability, that's a 
good enough reason not to do this. We are only talking about a few 
instructions here anyway, so no need to worry too much.

- Naveen



Re: [PATCH v4] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-10-10 Thread Santosh Sivaraj
* Naveen N. Rao  wrote (on 2017-10-09 10:39:18 
+):

> On 2017/10/09 08:09AM, Santosh Sivaraj wrote:
> > The current vDSO64 implementation does not support the coarse clocks
> > (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls back
> > to the system call, increasing the response time; a vDSO implementation
> > reduces the cycle time. Below is a benchmark of the difference in execution
> > times.
> > 
> > (Non-coarse clocks are also included just for completeness.)
> > 
> > clock-gettime-realtime: syscall: 172 nsec/call
> > clock-gettime-realtime:libc: 28 nsec/call
> > clock-gettime-realtime:vdso: 22 nsec/call
> > clock-gettime-monotonic: syscall: 171 nsec/call
> > clock-gettime-monotonic:libc: 30 nsec/call
> > clock-gettime-monotonic:vdso: 25 nsec/call
> > clock-gettime-realtime-coarse: syscall: 153 nsec/call
> > clock-gettime-realtime-coarse:libc: 16 nsec/call
> > clock-gettime-realtime-coarse:vdso: 10 nsec/call
> > clock-gettime-monotonic-coarse: syscall: 167 nsec/call
> > clock-gettime-monotonic-coarse:libc: 17 nsec/call
> > clock-gettime-monotonic-coarse:vdso: 11 nsec/call
> > 
> > CC: Benjamin Herrenschmidt 
> > Signed-off-by: Santosh Sivaraj 
> > ---
> >  arch/powerpc/kernel/asm-offsets.c |  2 +
> >  arch/powerpc/kernel/vdso64/gettimeofday.S | 67 
> > ++-
> >  2 files changed, 58 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index 8cfb20e38cfe..b55c68c54dc1 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -396,6 +396,8 @@ int main(void)
> > /* Other bits used by the vdso */
> > DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
> > DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
> > +   DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
> > +   DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
> > DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
> > DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
> > 
> > diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
> > b/arch/powerpc/kernel/vdso64/gettimeofday.S
> > index 382021324883..729dded195ce 100644
> > --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> > +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> > @@ -64,6 +64,12 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
> > cmpwi   cr0,r3,CLOCK_REALTIME
> > cmpwi   cr1,r3,CLOCK_MONOTONIC
> > crorcr0*4+eq,cr0*4+eq,cr1*4+eq
> > +
> > +   cmpwi   cr5,r3,CLOCK_REALTIME_COARSE
> > +   cmpwi   cr6,r3,CLOCK_MONOTONIC_COARSE
> > +   crorcr5*4+eq,cr5*4+eq,cr6*4+eq
> > +
> > +   crorcr0*4+eq,cr0*4+eq,cr5*4+eq
> > bne cr0,99f
> > 
> > mflrr12 /* r12 saves lr */
> > @@ -72,6 +78,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
> > bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> > lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
> > ori r7,r7,NSEC_PER_SEC@l
> > +   beq cr5,70f
> >  50:bl  V_LOCAL_FUNC(__do_get_tspec)/* get time from tb & 
> > kernel */
> > bne cr1,80f /* if not monotonic, all done */
> > 
> > @@ -97,19 +104,57 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
> > ld  r0,CFG_TB_UPDATE_COUNT(r3)
> >  cmpld   cr0,r0,r8  /* check if updated */
> > bne-50b
> > +   b   78f
> > 
> > -   /* Add wall->monotonic offset and check for overflow or underflow.
> > +   /*
> > +* For coarse clocks we get data directly from the vdso data page, so
> > +* we don't need to call __do_get_tspec, but we still need to do the
> > +* counter trick.
> >  */
> > -   add r4,r4,r6
> > -   add r5,r5,r9
> > -   cmpdcr0,r5,r7
> > -   cmpdi   cr1,r5,0
> > -   blt 1f
> > -   subfr5,r7,r5
> > -   addir4,r4,1
> > -1: bge cr1,80f
> > -   addir4,r4,-1
> > -   add r5,r5,r7
> > +70:ld  r8,CFG_TB_UPDATE_COUNT(r3)
> > +   andi.   r0,r8,1 /* pending update ? loop */
> > +   bne-70b
> > +   xor r0,r8,r8/* create dependency */
> > +   add r3,r3,r0
> > +
> > +   /*
> > +* CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
> > +* too
> > +*/
> > +   ld  r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
> > +   ld  r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
> > +   bne cr6,75f
> > +
> > +   /* CLOCK_MONOTONIC_COARSE */
> > +   lwa r6,WTOM_CLOCK_SEC(r3)
> > +   lwa r9,WTOM_CLOCK_NSEC(r3)
> > +
> > +   /* check if counter has updated */
> > +75:or  r0,r6,r9
> > +   or  r0,r4,r5
> > +   xor r0,r0,r0
> 
> The label '75:' should be on the second instruction since we don't need 
> to worry about r6/r9 for REALTIME_COARSE.
> 
> Also, the above hunk should actually be:
> 
>   or  r0,r6,r9
>   or  r0,r0,r4
>   or  r0,r0,r5
>   
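For reference, the "counter trick" the new comment relies on is the usual
vdso sequence-count read. A small user-space sketch of the same idea; the
structure and field names are illustrative, not the real vdso data page
layout, and the real code also needs the data-dependency/barrier handling
done above in assembly:

#include <stdio.h>

struct coarse_data {
	volatile unsigned long update_count;	/* odd while the kernel updates */
	volatile long stamp_sec;
	volatile long stamp_nsec;
};

static void read_coarse(const struct coarse_data *d, long *sec, long *nsec)
{
	unsigned long cnt;

	do {
		/* wait until no update is pending (count is even) */
		while ((cnt = d->update_count) & 1)
			;
		*sec = d->stamp_sec;
		*nsec = d->stamp_nsec;
		/* retry if the data changed while we were reading it */
	} while (d->update_count != cnt);
}

int main(void)
{
	struct coarse_data d = { .update_count = 2, .stamp_sec = 1, .stamp_nsec = 5 };
	long sec, nsec;

	read_coarse(&d, &sec, &nsec);
	printf("%ld.%09ld\n", sec, nsec);
	return 0;
}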

Re: [PATCH] powerpc/powernv: Add kernel cmdline parameter to disable imc

2017-10-10 Thread Michael Ellerman
Anju T Sudhakar  writes:

> Add a kernel command line parameter option to disable In-Memory Collection
> (IMC) counters and add documentation. This helps with debugging.

I'd really rather we didn't. Do we *really* need this?

We don't have command line parameters to disable any of the other ~20
PMUs, why is this one special?

cheers


Re: [PATCH] powerpc/memkey: fix compilation error caused by upstream changes

2017-10-10 Thread Michael Ellerman
Ram Pai  writes:

> "commit df3735c5 -- x86,mpx: make mpx depend on x86-64 to free up VMA flag"
> added the same vm highmem flag we had introduced for PKEY4. That broke
> some of the definitions in pkeys.h
>
> This patch fixes the issue.
>
> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/include/asm/pkeys.h |4 +---
>  1 files changed, 1 insertions(+), 3 deletions(-)

Thanks.

Next problem is pmac32, corenet32/corenet64_smp defconfigs:

  arch/powerpc/include/asm/pkeys.h:19:23: error: 'VM_HIGH_ARCH_BIT_0' 
undeclared (first use in this function)
  arch/powerpc/include/asm/pkeys.h:20:22: error: 'VM_HIGH_ARCH_0' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:21:22: error: 'VM_HIGH_ARCH_1' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:22:22: error: 'VM_HIGH_ARCH_2' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:23:22: error: 'VM_HIGH_ARCH_3' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:24:22: error: 'VM_HIGH_ARCH_4' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:50:39: error: 'H_PAGE_PKEY_BIT4' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:51:32: error: 'H_PAGE_PKEY_BIT3' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:52:32: error: 'H_PAGE_PKEY_BIT2' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:53:32: error: 'H_PAGE_PKEY_BIT1' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:54:32: error: 'H_PAGE_PKEY_BIT0' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:77:23: error: 'H_PAGE_PKEY_BIT0' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:77:43: error: 'HPTE_R_KEY_BIT0' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:78:16: error: 'H_PAGE_PKEY_BIT1' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:78:36: error: 'HPTE_R_KEY_BIT1' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:79:16: error: 'H_PAGE_PKEY_BIT2' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:79:36: error: 'HPTE_R_KEY_BIT2' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:80:16: error: 'H_PAGE_PKEY_BIT3' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:80:36: error: 'HPTE_R_KEY_BIT3' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:81:16: error: 'H_PAGE_PKEY_BIT4' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:81:36: error: 'HPTE_R_KEY_BIT4' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:89:23: error: 'H_PAGE_PKEY_BIT0' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:90:16: error: 'H_PAGE_PKEY_BIT1' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:91:16: error: 'H_PAGE_PKEY_BIT2' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:92:16: error: 'H_PAGE_PKEY_BIT3' undeclared 
(first use in this function)
  arch/powerpc/include/asm/pkeys.h:93:16: error: 'H_PAGE_PKEY_BIT4' undeclared 
(first use in this function)
  arch/powerpc/include/asm/mmu_context.h:199:19: error: redefinition of 
'vma_pkey'
  arch/powerpc/include/asm/mmu_context.h:204:19: error: redefinition of 
'pte_to_hpte_pkey_bits'
  arch/powerpc/include/asm/pkeys.h:102:48: error: 'mm_context_t' has no member 
named 'pkey_allocation_map'
  arch/powerpc/include/asm/pkeys.h:224:13: error: 'mm_context_t' has no member 
named 'execute_only_pkey'

cheers


Re: [PATCH] cxl: Dump PSL_FIR register on PSL9 error irq

2017-10-10 Thread Frederic Barrat

Hi Vaibhav,

I think we can make it slightly cleaner by registering a different
callback for psl8 and psl9. The callback 'err_irq_dump_registers' is
already in place; it could just point to a different function in
psl8_ops and psl9_ops, along the lines of the sketch below.
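Something like this; a sketch of that suggestion, where the per-PSL function
names are made up and the bodies are lifted from the existing
cxl_native_err_irq_dump_regs() quoted below:

static void cxl_native_err_irq_dump_regs_psl8(struct cxl *adapter)
{
	u64 fir1, fir2;

	fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
	fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
	dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n",
		 fir1, fir2);
}

static void cxl_native_err_irq_dump_regs_psl9(struct cxl *adapter)
{
	u64 fir1 = cxl_p1_read(adapter, CXL_PSL9_FIR1);

	dev_crit(&adapter->dev, "PSL_FIR: 0x%016llx\n", fir1);
}

/* and each service layer ops structure then points at its own variant: */
/*   psl8_ops:  .err_irq_dump_registers = cxl_native_err_irq_dump_regs_psl8, */
/*   psl9_ops:  .err_irq_dump_registers = cxl_native_err_irq_dump_regs_psl9, */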


  Fred


Le 09/10/2017 à 19:58, Vaibhav Jain a écrit :

For PSL9 we currently aren't dumping the PSL FIR register when a
PSL error interrupt is triggered. The contents of this register are useful
when debugging AFU issues.

This patch fixes the issue by updating cxl_native_err_irq_dump_regs()
to dump the PSL_FIR register on a PSL error interrupt, thereby
bringing the behaviour in line with the PSL on POWER8.

Signed-off-by: Vaibhav Jain 
---
  drivers/misc/cxl/native.c | 17 ++---
  drivers/misc/cxl/pci.c|  1 +
  2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 6cd57c756927..fa143bad9b55 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -1267,10 +1267,21 @@ void cxl_native_err_irq_dump_regs(struct cxl *adapter)
  {
u64 fir1, fir2;

-   fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
-   fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+   if (cxl_is_power8()) {
+   fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
+   fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+   dev_crit(&adapter->dev,
+"PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n",
+fir1, fir2);
+
+   } else if (cxl_is_power9()) {
+   fir1 = cxl_p1_read(adapter, CXL_PSL9_FIR1);
+   dev_crit(&adapter->dev, "PSL_FIR: 0x%016llx\n", fir1);
+
+   } else {
+   WARN_ON(1);
+   }

-   dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", 
fir1, fir2);
  }

  static irqreturn_t native_irq_err(int irq, void *data)
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b4ce9ea113a9..d9d6777fa853 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1763,6 +1763,7 @@ static const struct cxl_service_layer_ops psl9_ops = {
.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
.debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
.psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
+   .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
.debugfs_stop_trace = cxl_stop_trace_psl9,
.write_timebase_ctrl = write_timebase_ctrl_psl9,
.timebase_read = timebase_read_psl9,





Re: [PATCH] cxl: Rename register PSL9_FIR2 to PSL9_FIR_MASK

2017-10-10 Thread Frederic Barrat



Le 09/10/2017 à 19:56, Vaibhav Jain a écrit :

PSL9 doesn't have a FIR2 register as was the case with PSL8. However
currently the register definitions in 'cxl.h' have a definition for
PSL9_FIR2 that actually points to PSL9_FIR_MASK register in the P1
area at offset 0x308.

So this patch renames the def PSL9_FIR2 to PSL9_FIR_MASK and updates
the references in the code to point to the new identifier. It also
removes the code to dump contents of FIR2 (FIR_MASK actually) in
cxl_native_irq_dump_regs_psl9().

Fixes: f24be42aab37 ("cxl: Add psl9 specific code")
Reported-by: Frederic Barrat 
Signed-off-by: Vaibhav Jain 
---


(patch applies on 'next')
Thanks for cleaning it up.

Acked-by: Frederic Barrat 




  drivers/misc/cxl/cxl.h | 2 +-
  drivers/misc/cxl/debugfs.c | 3 ++-
  drivers/misc/cxl/native.c  | 4 +---
  3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 0167df81df62..252373c2b861 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -104,7 +104,7 @@ static const cxl_p1_reg_t CXL_XSL9_INV  = {0x0110};
  static const cxl_p1_reg_t CXL_XSL9_DEF  = {0x0140};
  static const cxl_p1_reg_t CXL_XSL9_DSNCTL   = {0x0168};
  static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300};
-static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308};
+static const cxl_p1_reg_t CXL_PSL9_FIR_MASK = {0x0308};
  static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310};
  static const cxl_p1_reg_t CXL_PSL9_DEBUG= {0x0320};
  static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348};
diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
index eae9d749f967..52e3d97db114 100644
--- a/drivers/misc/cxl/debugfs.c
+++ b/drivers/misc/cxl/debugfs.c
@@ -62,7 +62,8 @@ static struct dentry *debugfs_create_io_x64(const char *name, 
umode_t mode,
  void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry 
*dir)
  {
debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, 
CXL_PSL9_FIR1));
-   debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, 
CXL_PSL9_FIR2));
+   debugfs_create_io_x64("fir_mask", 0400, dir,
+ _cxl_p1_addr(adapter, CXL_PSL9_FIR_MASK));
debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, 
CXL_PSL9_FIR_CNTL));
debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, 
_cxl_p1_addr(adapter, CXL_PSL9_TRACECFG));
  }
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 75df74d59527..6cd57c756927 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -1085,13 +1085,11 @@ static int native_get_irq_info(struct cxl_afu *afu, 
struct cxl_irq_info *info)

  void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx)
  {
-   u64 fir1, fir2, serr;
+   u64 fir1, serr;

fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1);
-   fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2);

dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
-   dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
if (ctx->afu->adapter->native->sl_ops->register_serr_irq) {
serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
cxl_afu_decode_psl_serr(ctx->afu, serr);





[PATCH] powerpc/pseries/cpuidle: add polling idle for shared processor guests

2017-10-10 Thread Nicholas Piggin
For shared processor guests (e.g., KVM), add an idle polling mode rather
than immediately returning to the hypervisor when the guest CPU goes
idle.

Test setup is a 2-socket POWER9 with 4 guests running, each with vCPUs
equal to 1/2 of the real CPUs. Each guest was saturated with tbench. Using
polling idle gives about 1.4x throughput.

Kernel compile speed was not changed significantly.

Signed-off-by: Nicholas Piggin 
---
 drivers/cpuidle/cpuidle-pseries.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c 
b/drivers/cpuidle/cpuidle-pseries.c
index e9b3853d93ea..16be7ad30fe1 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -171,11 +171,17 @@ static struct cpuidle_state dedicated_states[] = {
  * States for shared partition case.
  */
 static struct cpuidle_state shared_states[] = {
+   { /* Snooze */
+   .name = "snooze",
+   .desc = "snooze",
+   .exit_latency = 0,
+   .target_residency = 0,
+   .enter = &snooze_loop },
{ /* Shared Cede */
.name = "Shared Cede",
.desc = "Shared Cede",
-   .exit_latency = 0,
-   .target_residency = 0,
+   .exit_latency = 10,
+   .target_residency = 100,
.enter = &shared_cede_loop },
 };
 
-- 
2.13.3



Re: [PATCH v2] powerpc/lib/sstep: Fix count leading zeros instructions

2017-10-10 Thread Naveen N. Rao
On 2017/10/10 06:45AM, Sandipan Das wrote:
> According to the GCC documentation, the behaviour of __builtin_clz()
> and __builtin_clzl() is undefined if the value of the input argument
> is zero. Without handling this special case, these builtins have been
> used for emulating the following instructions:
>   * Count Leading Zeros Word (cntlzw[.])
>   * Count Leading Zeros Doubleword (cntlzd[.])
> 
> This fixes the emulated behaviour of these instructions by adding an
> additional check for this special case.
> 
> Fixes: 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify 
> *regs")
> Signed-off-by: Sandipan Das 

Reviewed-by: Naveen N. Rao 

> ---
> v2: Make zero-checking condition more compact.
> Add details of original commit that is being fixed here.
> ---
>  arch/powerpc/lib/sstep.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index 5118110c3983..8c3955e183d4 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -1699,11 +1699,13 @@ int analyse_instr(struct instruction_op *op, const 
> struct pt_regs *regs,
>   * Logical instructions
>   */
>   case 26:/* cntlzw */
> - op->val = __builtin_clz((unsigned int) regs->gpr[rd]);
> + val = (unsigned int) regs->gpr[rd];
> + op->val = ( val ? __builtin_clz(val) : 32 );
>   goto logical_done;
>  #ifdef __powerpc64__
>   case 58:/* cntlzd */
> - op->val = __builtin_clzl(regs->gpr[rd]);
> + val = regs->gpr[rd];
> + op->val = ( val ? __builtin_clzl(val) : 64 );
>   goto logical_done;
>  #endif
>   case 28:/* and */
> -- 
> 2.13.6
> 



[PATCH v2] powerpc/lib/sstep: Fix count leading zeros instructions

2017-10-10 Thread Sandipan Das
According to the GCC documentation, the behaviour of __builtin_clz()
and __builtin_clzl() is undefined if the value of the input argument
is zero. Without handling this special case, these builtins have been
used for emulating the following instructions:
  * Count Leading Zeros Word (cntlzw[.])
  * Count Leading Zeros Doubleword (cntlzd[.])

This fixes the emulated behaviour of these instructions by adding an
additional check for this special case.

Fixes: 3cdfcbfd32b9d ("powerpc: Change analyse_instr so it doesn't modify 
*regs")
Signed-off-by: Sandipan Das 
---
v2: Make zero-checking condition more compact.
Add details of original commit that is being fixed here.
---
 arch/powerpc/lib/sstep.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5118110c3983..8c3955e183d4 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1699,11 +1699,13 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
  * Logical instructions
  */
case 26:/* cntlzw */
-   op->val = __builtin_clz((unsigned int) regs->gpr[rd]);
+   val = (unsigned int) regs->gpr[rd];
+   op->val = ( val ? __builtin_clz(val) : 32 );
goto logical_done;
 #ifdef __powerpc64__
case 58:/* cntlzd */
-   op->val = __builtin_clzl(regs->gpr[rd]);
+   val = regs->gpr[rd];
+   op->val = ( val ? __builtin_clzl(val) : 64 );
goto logical_done;
 #endif
case 28:/* and */
-- 
2.13.6
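As a stand-alone illustration of the guard (user space, not part of the
patch): __builtin_clz(0) and __builtin_clzl(0) are undefined, while the Power
cntlzw/cntlzd instructions are defined to return the operand width, which is
what the emulation has to preserve. This assumes a 64-bit unsigned long, as
in the ppc64 emulation:

#include <stdio.h>

static unsigned int emulated_cntlzw(unsigned int val)
{
	return val ? __builtin_clz(val) : 32;
}

static unsigned int emulated_cntlzd(unsigned long val)
{
	return val ? __builtin_clzl(val) : 64;
}

int main(void)
{
	/* expected: 32, 31 and 64, 63 */
	printf("cntlzw(0) = %u, cntlzw(1) = %u\n",
	       emulated_cntlzw(0), emulated_cntlzw(1));
	printf("cntlzd(0) = %u, cntlzd(1) = %u\n",
	       emulated_cntlzd(0), emulated_cntlzd(1));
	return 0;
}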