[PATCH 2/2] powerpc/xmon: revisit SPR support

2017-07-20 Thread Balbir Singh
This patch readjusts the SPRs and adds support for IAMR/AMR/
UAMOR/AMOR based on their supported ISA revisions. The
HDEC SPR is now printed with 16 hex digits instead of 8,
so that we can see the expanded values on ISA 300.
There is also support for printing the PIDR/TIDR for
ISA 300 and PSSCR and PTCR in ISA 300 hypervisor mode.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/xmon/xmon.c | 28 +++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 8aedfff..e025a16 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1738,13 +1738,15 @@ static void dump_206_sprs(void)
mfspr(SPRN_SRR0), mfspr(SPRN_SRR1), mfspr(SPRN_DSISR));
printf("dscr   = %.16x  ppr   = %.16x pir= %.8x\n",
mfspr(SPRN_DSCR), mfspr(SPRN_PPR), mfspr(SPRN_PIR));
+   printf("amr= %.16x  uamor = %.16x\n",
+   mfspr(SPRN_AMR), mfspr(SPRN_UAMOR));
 
if (!(mfmsr() & MSR_HV))
return;
 
printf("sdr1   = %.16x  hdar  = %.16x hdsisr = %.8x\n",
mfspr(SPRN_SDR1), mfspr(SPRN_HDAR), mfspr(SPRN_HDSISR));
-   printf("hsrr0  = %.16x hsrr1  = %.16x hdec = %.8x\n",
+   printf("hsrr0  = %.16x hsrr1  = %.16x hdec = %.16x\n",
mfspr(SPRN_HSRR0), mfspr(SPRN_HSRR1), mfspr(SPRN_HDEC));
printf("lpcr   = %.16x  pcr   = %.16x lpidr = %.8x\n",
mfspr(SPRN_LPCR), mfspr(SPRN_PCR), mfspr(SPRN_LPID));
@@ -1788,6 +1790,7 @@ static void dump_207_sprs(void)
mfspr(SPRN_SDAR), mfspr(SPRN_SIER), mfspr(SPRN_PMC6));
printf("ebbhr  = %.16x  ebbrr = %.16x bescr  = %.16x\n",
mfspr(SPRN_EBBHR), mfspr(SPRN_EBBRR), mfspr(SPRN_BESCR));
+   printf("iamr   = %.16x\n", mfspr(SPRN_IAMR));
 
if (!(msr & MSR_HV))
return;
@@ -1796,6 +1799,28 @@ static void dump_207_sprs(void)
mfspr(SPRN_HFSCR), mfspr(SPRN_DHDES), mfspr(SPRN_RPR));
printf("dawr   = %.16x  dawrx = %.16x ciabr  = %.16x\n",
mfspr(SPRN_DAWR), mfspr(SPRN_DAWRX), mfspr(SPRN_CIABR));
+   printf("amor   = %.16x\n", mfspr(SPRN_AMOR));
+#endif
+}
+
+static void dump_300_sprs(void)
+{
+#ifdef CONFIG_PPC64
+   unsigned long msr;
+
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   return;
+
+   printf("pidr   = %.16x  tidr  = %.16x asdr   = %.16x\n",
+   mfspr(SPRN_PID), mfspr(SPRN_TIDR), mfspr(SPRN_ASDR));
+
+   msr = mfmsr();
+
+   if (!(msr & MSR_HV))
+   return;
+
+   printf("ptcr   = %.16x  psscr = %.16x\n",
+   mfspr(SPRN_PTCR), mfspr(SPRN_PSSCR));
 #endif
 }
 
@@ -1852,6 +1877,7 @@ static void super_regs(void)
 
dump_206_sprs();
dump_207_sprs();
+   dump_300_sprs();
 
return;
}
-- 
2.9.4



[PATCH 1/2] powerpc/xmon: support dumping software pagetables

2017-07-20 Thread Balbir Singh
It would be nice to be able to dump page tables in a
particular context.

Example use cases

Dumping PTE contents to see the keys (useful for debugging)

c000ba48c880 c000bab438b0   2677   2675 T  2 protection_keys
0:mon> ds c000ba48c880 0x77f7
translating tsk c000ba48c880, addr 77f7
G: 0xb95b6400   U: 0xb6334000   M: 0xb6543000   PA: 0x012c, PTE: 0xd480012c0504

Dumping vmalloc space

0:mon> ds 0 d000
translating tsk   (null), addr d000
G: 0x3d450400   U: 0xbc184000   M: 0x3d46   PA: 0x7e01, PTE: 0xc0807e01018e

I did not replicate the complex code of dump_pagetable and have no support
for bolted linear mapping; that's why I've called it software pagetable
dumping support. The format of the PTE can be expanded to add more useful
information about the flags in the PTE if required.
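For illustration, one possible expansion of format_pte() along those lines,
decoding a few generic flag bits via the standard pte_*() accessors (a sketch
only, not part of this patch):

/* Hypothetical expanded format_pte(): same PA/PTE output as above,
 * plus a short present/write/dirty/accessed flag summary. */
static void format_pte_verbose(unsigned long pteval)
{
	pte_t pte = __pte(pteval);

	printf("PA: 0x%08lx, PTE: 0x%08lx [%c%c%c%c]\n",
	       pteval & PTE_RPN_MASK, pteval,
	       pte_present(pte) ? 'P' : '-',
	       pte_write(pte)   ? 'W' : '-',
	       pte_dirty(pte)   ? 'D' : '-',
	       pte_young(pte)   ? 'A' : '-');
}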

Signed-off-by: Balbir Singh 
---
 arch/powerpc/xmon/xmon.c | 97 
 1 file changed, 97 insertions(+)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 08e367e..8aedfff 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -126,6 +126,7 @@ static void byterev(unsigned char *, int);
 static void memex(void);
 static int bsesc(void);
 static void dump(void);
+static void show_pte(unsigned long);
 static void prdump(unsigned long, long);
 static int ppc_inst_dump(unsigned long, long, int);
 static void dump_log_buf(void);
@@ -233,6 +234,7 @@ Commands:\n\
 #endif
   "\
   dr   dump stream of raw bytes\n\
+  ds   dump software PTEs\n\
   dt   dump the tracing buffers (uses printk)\n\
 "
 #ifdef CONFIG_PPC_POWERNV
@@ -2528,6 +2530,9 @@ dump(void)
} else if (c == 't') {
ftrace_dump(DUMP_ALL);
tracing_on();
+   } else if (c == 's') {
+   /* dump software pte */
+   show_pte(adrs);
} else if (c == 'r') {
scanhex(&ndump);
if (ndump == 0)
@@ -2860,7 +2865,99 @@ static void show_task(struct task_struct *tsk)
state, task_thread_info(tsk)->cpu,
tsk->comm);
 }
+void format_pte(unsigned long pte)
+{
+   unsigned long pa = pte & PTE_RPN_MASK;
+
+   printf("PA: 0x%08lx, PTE: 0x%08lx\n", pa, pte);
+}
+
+static void show_pte(unsigned long tskv)
+{
+   unsigned long addr = 0;
+   struct task_struct *tsk = NULL;
+   struct mm_struct *mm;
+   pgd_t *pgdp;
+   pud_t *pudp;
+   pmd_t *pmdp;
+   pte_t *ptep;
+
+   tsk = (struct task_struct *)tskv;
+   if (tsk == NULL)
+   mm = &init_mm;
+   else
+   mm = tsk->active_mm;
+
+   if (mm == NULL)
+   mm = &init_mm;
+
+   if (!scanhex(&addr))
+   printf("need address to translate\n");
+
+   if (setjmp(bus_error_jmp) != 0) {
+   catch_memory_errors = 0;
+   printf("*** Error dumping pte for task %p\n", tsk);
+   return;
+   }
+
+   catch_memory_errors = 1;
+   sync();
+
+   if (mm == &init_mm)
+   pgdp = pgd_offset_k(addr);
+   else
+   pgdp = pgd_offset(mm, addr);
+
+   if (pgd_none(*pgdp)) {
+   printf("no linux page table for address\n");
+   return;
+   }
 
+   if (pgd_huge(*pgdp)) {
+   format_pte(pgd_val(*pgdp));
+   return;
+   }
+   printf("G: 0x%8lx\t", pgd_val(*pgdp));
+
+   pudp = pud_offset(pgdp, addr);
+
+   if (pud_none(*pudp)) {
+   printf("No valid PUD\n");
+   return;
+   }
+
+   if (pud_huge(*pudp)) {
+   format_pte(pud_val(*pudp));
+   return;
+   }
+   printf("U: 0x%8lx\t", pud_val(*pudp));
+
+   pmdp = pmd_offset(pudp, addr);
+
+   if (pmd_none(*pmdp)) {
+   printf("No valid PMD\n");
+   return;
+   }
+
+   if (pmd_huge(*pmdp)) {
+   format_pte(pmd_val(*pmdp));
+   return;
+   }
+   printf("M: 0x%8lx\t", pmd_val(*pmdp));
+
+   /* pte_offset_map is the same as pte_offset_kernel */
+   ptep = pte_offset_kernel(pmdp, addr);
+   if (pte_none(*ptep)) {
+   printf("no valid PTE\n");
+   return;
+   }
+
+   format_pte(pte_val(*ptep));
+
+   sync();
+   __delay(200);
+   catch_memory_errors = 0;
+}
 static void show_tasks(void)
 {
unsigned long tskv;
-- 
2.9.4



Re: [PATCH v4] powerpc/mm/radix: Workaround prefetch issue with KVM

2017-07-20 Thread Paul Mackerras
On Thu, Jul 20, 2017 at 05:36:56PM +1000, Benjamin Herrenschmidt wrote:
> There's a somewhat architectural issue with Radix MMU and KVM.
> 
> When coming out of a guest with AIL (ie, MMU enabled), we start
> executing hypervisor code with the PID register still containing
> whatever the guest has been using.
> 
> The problem is that the CPU can (and will) then start prefetching
> or speculatively load from whatever host context has that same
> PID (if any), thus bringing translations for that context into
> the TLB, which Linux doesn't know about.
> 
> This can cause stale translations and subsequent crashes.
> 
> Fixing this in a way that is neither racy nor a huge performance
> impact is difficult. We could just make the host invalidations
> always use broadcast forms but that would hurt single threaded
> programs for example.
> 
> We chose to fix it instead by partitioning the PID space between
> guest and host. This is possible because today Linux only uses 19
> out of the 20 bits of PID space, so existing guests will work
> if we make the host use the top half of the 20-bit space.
> 
> We additionally add a property to indicate to Linux the size of
> the PID register which will be useful if we eventually have
> processors with a larger PID space available.
> 
> There is still an issue with malicious guests purposefully setting
> the PID register to a value in the host range. Hopefully future HW
> can prevent that, but in the meantime, we handle it with a pair of
> kludges:
> 
>  - On the way out of a guest, before we clear the current VCPU
> in the PACA, we check the PID and if it's outside of the permitted
> range we flush the TLB for that PID.
> 
>  - When context switching, if the mm is "new" on that CPU (the
> corresponding bit was set for the first time in the mm cpumask), we
> check if any sibling thread is in KVM (has a non-NULL VCPU pointer
> in the PACA). If that is the case, we also flush the PID for that
> CPU (core).
> 
> This second part is needed to handle the case where a process is
> migrated (or starts a new pthread) on a sibling thread of the CPU
> coming out of KVM, as there's a window where stale translations
> can exist before we detect it and flush them out.
> 
> A future optimization could be added by keeping track of whether
> the PID has ever been used and avoid doing that for completely
> fresh PIDs. We could similarly mark PIDs that have been the subject of
> a global invalidation as "fresh". But for now this will do.
> 
> Signed-off-by: Benjamin Herrenschmidt 

Mostly looks fine.  I have one comment below about the tlbiels we do
when we detect the case that the guest used a PID value that it
shouldn't have:

[snip]
> + /* Illegal PID, flush the TLB for this PID/LPID */
> + isync
> + ld  r6,VCPU_KVM(r9)
> + lwz r0,KVM_TLB_SETS(r6)
> + mtctr   r0
> + li  r7,0x400 /* IS field = 0b01 */
> + ptesync
> + sldi r0,r3,32 /* RS has PID */
> +3:   PPC_TLBIEL(7,0,2,1,1)   /* RIC=2, PRS=1, R=1 */

The architecture says tlbiel in this form invalidates entries whose
lpid matches the value in LPIDR.  At this point in the code, LPIDR
still contains the guest lpid, but we're trying to invalidate host
entries, aren't we?  In general at this point in the code we can't
switch LPIDR (though on P9 we could) since we don't know at this point
that the other (POWER8) threads are out of the guest.

> + addi r7,r7,0x1000
> + bdnz3b
> + ptesync
> +
> +4:   /* Flush the ERAT on radix P9 DD1 guest exit */
>  BEGIN_FTR_SECTION
>   PPC_INVALIDATE_ERAT
>  END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
>  
> +5:
>   /*
>* POWER7/POWER8 guest -> host partition switch code.
>* We don't have to lock against tlbies but we do
> diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
> index abed1fe6992f..f74455dec087 100644
> --- a/arch/powerpc/mm/mmu_context_book3s64.c
> +++ b/arch/powerpc/mm/mmu_context_book3s64.c
> @@ -25,6 +25,10 @@
>  #include 
>  #include 
>  
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +#include <asm/kvm_book3s_asm.h>
> +#endif

What is this include for?  Nothing you add to this file seems to use
anything from kvm_book3s_asm.h.

Paul.



Re: [PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Santosh Sivaraj
* Michael Ellerman  wrote (on 2017-07-20 23:18:26 +1000):

> Santosh Sivaraj  writes:
> 
> > Current vDSO64 implementation does not have support for coarse
> > clocks (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls
> > back to system call. Below is a benchmark of the difference in execution
> > time with and without vDSO support.
> 
> Hi Santosh,
> 
> Great patch! Always good to see asm replaced with C.
> 
> > diff --git a/arch/powerpc/kernel/vdso64/gettime.c b/arch/powerpc/kernel/vdso64/gettime.c
> > new file mode 100644
> > index 000..01f411f
> > --- /dev/null
> > +++ b/arch/powerpc/kernel/vdso64/gettime.c
> > @@ -0,0 +1,162 @@
> ...
> > +static notrace int gettime_syscall_fallback(clockid_t clk_id,
> > +struct timespec *tp)
> > +{
> > +   register clockid_t id asm("r3") = clk_id;
> > +   register struct timespec *t asm("r4") = tp;
> > +   register int nr asm("r0") = __NR_clock_gettime;
> > +   register int ret asm("r3");
> 
> I guess this works. I've always been a bit nervous about register
> variables TBH.
Yes, this works. It falls back to syscall in the case of CLOCK_BOOTTIME.

> 
> > +   asm volatile("sc"
> > +: "=r" (ret)
> > +: "r"(nr), "r"(id), "r"(t)
> > +: "memory");
> 
> Not sure we need the memory clobber?
> 
> It can clobber more registers than that though.
> 
> See: Documentation/powerpc/syscall64-abi.txt
> 
> > diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
> > index 3820213..1258009 100644
> > --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> > +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> > @@ -51,85 +53,21 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
> ...
> > > +   stwu r1,-112(r1)
> > +  .cfi_register lr,r6
> > +   std r6,24(r1)
> > +   bl  V_LOCAL_FUNC(kernel_clock_gettime)
> > crclr   cr0*4+so
> 
> Clearing CR0[SO] says that the syscall always succeeded.
> 
> What happens if you call this with a completely bogus clock id?
>

In case of a bogus clock id, the default case sets 'ret' to -1, which forces it
to fall back to the syscall.
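For reference, the dispatch being discussed has roughly this shape (a sketch:
clock_get_realtime() and gettime_syscall_fallback() are from the patch, while
clock_get_monotonic() is an assumed sibling helper):

static notrace int kernel_clock_gettime(clockid_t clk_id, struct timespec *tp)
{
	struct vdso_data *vdata = __get_datapage();
	int ret;

	switch (clk_id) {
	case CLOCK_REALTIME:
		ret = clock_get_realtime(tp, vdata);
		break;
	case CLOCK_MONOTONIC:
		ret = clock_get_monotonic(tp, vdata);	/* assumed helper */
		break;
	default:
		ret = -1;	/* bogus/unhandled id, e.g. CLOCK_BOOTTIME */
	}

	/* any failure above drops down to the real system call */
	if (ret)
		ret = gettime_syscall_fallback(clk_id, tp);

	return ret;
}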
> I think the solution is probably to do the syscall fallback in asm, and
> everything else in C.

Ok. That's what Sergey also suggested. I will send a v2.

Thanks,
Santosh
> 
> cheers

-- 


Re: [PATCH] powerpc/pseries: energy driver only print message when LPAR guest

2017-07-20 Thread Vaidyanathan Srinivasan
* Nicholas Piggin  [2017-07-21 11:16:44]:

> On Thu, 20 Jul 2017 23:03:21 +1000
> Michael Ellerman  wrote:
> 
> > Nicholas Piggin  writes:
> > 
> > > This driver currently reports the H_BEST_ENERGY is unsupported even
> > > when booting in a non-LPAR environment (e.g., powernv). Prevent it.  
> > 
> > Just delete the printk(). Users don't know what that means, and
> > developers have other better ways to detect that the hcall is missing if
> > anyone cares.
> > 
> > cheers
> 
> powerpc/pseries: energy driver do not print failure message
> 
> This driver currently reports the H_BEST_ENERGY is unsupported (even
> when booting in a non-LPAR environment). This is not something the
> administrator can do much with, and not significant for debugging.
> 
> Remove it.
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Vaidyanathan Srinivasan 


> ---
>  arch/powerpc/platforms/pseries/pseries_energy.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
> index 164a13d3998a..35c891aabef0 100644
> --- a/arch/powerpc/platforms/pseries/pseries_energy.c
> +++ b/arch/powerpc/platforms/pseries/pseries_energy.c
> @@ -229,10 +229,9 @@ static int __init pseries_energy_init(void)
>   int cpu, err;
>   struct device *cpu_dev;
> 
> - if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) {
> - printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
> - return 0;
> - }
> + if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY))
> + return 0; /* H_BEST_ENERGY hcall not supported */
> +

The first patch (!firmware_has_feature(FW_FEATURE_LPAR)) would be
ideal, but we do not have that in the KVM guest case either. Hence I'll take
mpe's suggestion.  Removing the print is fine.

--Vaidy



Re: [PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Anton Blanchard
Hi,

> > +static notrace int gettime_syscall_fallback(clockid_t clk_id,
> > +struct timespec *tp)
> > +{
> > +   register clockid_t id asm("r3") = clk_id;
> > +   register struct timespec *t asm("r4") = tp;
> > +   register int nr asm("r0") = __NR_clock_gettime;
> > +   register int ret asm("r3");  
> 
> I guess this works. I've always been a bit nervous about register
> variables TBH.

I don't think this works with clang unfortunately.

Anton


[PATCH] qe: fix compile issue for arm64

2017-07-20 Thread Zhao Qiang
Signed-off-by: Zhao Qiang 
---
 drivers/soc/fsl/qe/qe.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/soc/fsl/qe/qe.c b/drivers/soc/fsl/qe/qe.c
index 2ef6fc6..d48fa4a 100644
--- a/drivers/soc/fsl/qe/qe.c
+++ b/drivers/soc/fsl/qe/qe.c
@@ -229,7 +229,9 @@ int qe_setbrg(enum qe_clock brg, unsigned int rate, unsigned int multiplier)
/* Errata QE_General4, which affects some MPC832x and MPC836x SOCs, says
   that the BRG divisor must be even if you're not using divide-by-16
   mode. */
+#ifdef CONFIG_PPC
if (pvr_version_is(PVR_VER_836x) || pvr_version_is(PVR_VER_832x))
+#endif
if (!div16 && (divisor & 1) && (divisor > 3))
divisor++;
 
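To make the effect of the #ifdef explicit, the code the compiler sees is
(illustration only):

/* CONFIG_PPC=y: the fixup stays limited to the affected SoCs */
if (pvr_version_is(PVR_VER_836x) || pvr_version_is(PVR_VER_832x))
	if (!div16 && (divisor & 1) && (divisor > 3))
		divisor++;

/* CONFIG_PPC unset (e.g. arm64): the PVR test disappears and the
 * inner if becomes unconditional */
if (!div16 && (divisor & 1) && (divisor > 3))
	divisor++;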
-- 
2.1.0.27.g96db324



Re: KVM guests freeze under upstream kernel

2017-07-20 Thread joserz
On Thu, Jul 20, 2017 at 03:21:59PM +1000, Paul Mackerras wrote:
> On Thu, Jul 20, 2017 at 12:02:23AM -0300, jos...@linux.vnet.ibm.com wrote:
> > On Thu, Jul 20, 2017 at 09:42:50AM +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2017-07-19 at 16:46 -0300, jos...@linux.vnet.ibm.com wrote:
> > > > Hello!
> > > > 
> > > > We're not able to boot any KVM guest using upstream kernel 
> > > > (cb8c65ccff7f77d0285f1b126c72d37b2572c865 - 4.13.0-rc1+).
> > > > After reaching the SLOF initial counting, the guest simply freezes:
> > > 
> > > Can you send us your .config ?
> > 
> > Sure,
> > 
> > Answering Michael as well:
> > 
> > It's a P9 with RHEL kernel 4.11.0-10.el7a.ppc64le installed. The problem
> > was noticed with kernel > 4.13 (I'm currently running 4.13.0-rc1+).
> > 
> > QEMU is https://github.com/dgibson/qemu (ppc-for-2.10) but I gave the
> > default packaged Qemu a try.
> > 
> > For the guest, I tried both a vanilla Ubuntu 17.04 and the host kernel.
But they never had a chance to run since the freezing happened in SLOF.
> > 
> > Note that using the 4.11.0-10.el7a.ppc64le kernel it works fine
(for any of these Qemu/Guest setups). With 4.13.0-rc1 I have it running after
> > reverting that referred commit.
> 
> Is the host kernel running in radix mode?

yes

> 
> Did you check the host kernel logs for any oops messages?

dmesg was clean but after some time waiting (I had forgotten QEMU running in
another terminal) I got the oops below (after rebooting the host I 
couldn't reproduce it again).

Another test that I did was:
Compile with transparent huge pages disabled: KVM works fine
Compile with transparent huge pages enabled: doesn't work
  + disabling it in /sys/kernel/mm/transparent_hugepage: doesn't work

Just out of my own curiosity I made this small change:

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c0737c8..f94a3b6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -80,7 +80,7 @@
 
 #define _PAGE_SOFT_DIRTY   _RPAGE_SW3 /* software: software dirty tracking */
 #define _PAGE_SPECIAL      _RPAGE_SW2 /* software: special page */
-#define _PAGE_DEVMAP       _RPAGE_SW1 /* software: ZONE_DEVICE page */
+#define _PAGE_DEVMAP       _RPAGE_RSV3
 #define __HAVE_ARCH_PTE_DEVMAP

and it works. I chose _RPAGE_RSV3 because it uses the same value that
x86 uses (0x0400UL), but I don't know if it could have any side
effects.


SLOF
**
QEMU Starting
 Build Date = Mar  3 2017 13:29:19
  FW Version = git-66d250ef0fd06bb8
   Press "s" to enter Open Firmware.

   [  105.604333] Unable to handle kernel paging request for data at
   address 0x
   [  105.604448] Faulting instruction address: 0xc0910b28
   [  105.604526] Oops: Kernel access of bad area, sig: 11 [#1]
   [  105.604585] SMP NR_CPUS=2048 
   [  105.604588] NUMA 
   [  105.604633] PowerNV
   [  105.604697] Modules linked in: xt_CHECKSUM ipt_MASQUERADE
   nf_nat_masquerade_ipv4 tun ip6t_rpfilter ipt_REJECT nf_reject_ipv4
   ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat
   ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6
   nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security
   ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4
   nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw
   ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter
   kvm_hv kvm i2c_dev at24 ghash_generic ses enclosure gf128mul
   scsi_transport_sas xts sg ctr ipmi_powernv ipmi_devintf shpchp
   opal_prd vmx_crypto ipmi_msghandler uio_pdrv_genirq uio ofpart
   powernv_flash i2c_opal ibmpowernv mtd nfsd auth_rpcgss nfs_acl lockd
   grace sunrpc ip_tables xfs libcrc32c
   [  105.605561]  sd_mod ast i2c_algo_bit drm_kms_helper syscopyarea
   sysfillrect sysimgblt fb_sys_fops ttm drm i40e i2c_core aacraid ptp
   pps_core dm_mirror dm_region_hash dm_log dm_mod
   [  105.605759] CPU: 0 PID: 6 Comm: kworker/u32:0 Not tainted
   4.13.0-rc1+ #57
   [  105.605836] Workqueue: netns cleanup_net
   [  105.605880] task: c00ff6404200 task.stack: c00ff648c000
   [  105.605947] NIP: c0910b28 LR: c07cd6ec CTR:
   c07cd5d0
   [  105.606026] REGS: c00ff648f7d0 TRAP: 0300   Not tainted
   (4.13.0-rc1+)
   [  105.606090] MSR: 90009033 
   [  105.606111]   CR: 88002048  XER: 2000
   [  105.606203] CFAR: c07cd6e8 DAR:  DSISR:
   4000 SOFTE: 1 
   [  105.606203] GPR00: c07cd6ec c00ff648fa50
   c0f5c600  
   [  105.606203] GPR04: c00ff6404cc0 c00ff6404280
   782ccd5c cc908fe7 
   [  105.606203] GPR08:  c00ff648c000
   8000  
   [  105.606203] GPR12: c07cd5d0 cfb0
   c01050f8 c00ffa150ec0 

Re: [PATCH] powerpc/pseries: energy driver only print message when LPAR guest

2017-07-20 Thread Nicholas Piggin
On Thu, 20 Jul 2017 23:03:21 +1000
Michael Ellerman  wrote:

> Nicholas Piggin  writes:
> 
> > This driver currently reports the H_BEST_ENERGY is unsupported even
> > when booting in a non-LPAR environment (e.g., powernv). Prevent it.  
> 
> Just delete the printk(). Users don't know what that means, and
> developers have other better ways to detect that the hcall is missing if
> anyone cares.
> 
> cheers

powerpc/pseries: energy driver do not print failure message

This driver currently reports the H_BEST_ENERGY is unsupported (even
when booting in a non-LPAR environment). This is not something the
administrator can do much with, and not significant for debugging.

Remove it.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/pseries_energy.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
index 164a13d3998a..35c891aabef0 100644
--- a/arch/powerpc/platforms/pseries/pseries_energy.c
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -229,10 +229,9 @@ static int __init pseries_energy_init(void)
int cpu, err;
struct device *cpu_dev;
 
-   if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) {
-   printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
-   return 0;
-   }
+   if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY))
+   return 0; /* H_BEST_ENERGY hcall not supported */
+
/* Create the sysfs files */
err = device_create_file(cpu_subsys.dev_root,
&attr_cpu_activate_hint_list);
-- 
2.11.0



Re: [PATCH 0/4] Allow non-legacy cards to be vgaarb default

2017-07-20 Thread Daniel Axtens
Hi Ard,

> (+ Laszlo)
>
> On 19 July 2017 at 02:28, Daniel Axtens  wrote:
>> Hi all,
>>
>> Previously I posted a patch that provided a quirk for a hibmc card
>> behind a particular Huawei bridge that allowed it to be marked as the
>> default device in the VGA arbiter.[0] This led to some discussion.[1]
>> It was broadly suggested that a more generic solution would be better,
>> something in the style of powerpc's fixup_vga() quirk.
>>
>> Here is my suggested solution:
>>
>>  - Create a Kconfig option ARCH_WANT_VGA_ARB_FALLBACK
>>
>>  - if an arch selects that option, install PCI_FIXUP_CLASS_ENABLE
>>hook. This hook fires when a card is enabled, which will require
>>that a driver has been bound.
>>
>>  - if there is no default device when the hook fires, and the device
>>can control memory and I/O, mark it as default.
>>
>> The patches are as follows:
>>
>>  (1) powerpc: simplify and fix VGA default device behaviour
>>
>>  This cleans up some quirks in the powerpc implementation of the
>>  vga_fixup. It should make the behaviour match the original
>>  intention.
>>
>>  (2) vgaarb: allow non-legacy cards to be marked as default
>>
>>  Add the Kconfig option, and create the fixup in vgaarb.c gated
>>  behind the option. Nothing happens at this stage because no arch
>>  has selected the option yet.
>>
>>  (3) powerpc: replace vga_fixup() with generic code
>>
>>  Select the option on powerpc and remove the old code. The only
>>  change is that it moves from being a final fixup to an enable
>>  fixup.
>>
>>  (4) arm64: allow non-legacy VGA devices to be default
>>
>>  Select the option on arm64. This solves my problem with the D05,
>>  but may cause other cards to be marked as default on other
>>  boards. This shouldn't cause any real issues but is worth being
>>  aware of.
>>
>
> Hi Daniel,
>
> Given that the whole point of the VGA arbiter is the ability to share
> the legacy mem+io ranges between different cards, why do we care about
> the VGA arbiter in the first place on arm64?
>
> AFAIK, there have been some recent changes in Xorg to address the
> auto-detection problem. I don't remember the exact details, but I have
> added Laszlo, who was involved with this at the time.

I haven't been able to locate those changes - I remember that the call
to pci_device_is_boot_vga() in xf86pciBus.c [0] was critical and that is
still there in the latest git.

Indeed, the reason we care about the vga arbiter at all is because of
that Xorg dependency on the boot VGA card. pci_device_is_boot_vga()
reads a sysfs file, and that sysfs file is populated based on the
vga_default_driver(), so it's very difficult to extricate ourselves from
the vga arbiter and its concept of the default device.

We could make this method an 'either/or' rather than a fallback - so
platforms who didn't care about legacy resources didn't bother with
those tests, but I'm not sure what benefit that would give and I find it
harder to be confident of an absence of unexpected consequences.

Regards,
Daniel

[0] 
https://cgit.freedesktop.org/xorg/xserver/tree/hw/xfree86/common/xf86pciBus.c#n113

>
> -- 
> Ard.


Re: [RFC v6 27/62] powerpc: helper to validate key-access permissions of a pte

2017-07-20 Thread Ram Pai
On Thu, Jul 20, 2017 at 12:12:47PM +0530, Aneesh Kumar K.V wrote:
> Ram Pai  writes:
> 
> > helper function that checks if the read/write/execute is allowed
> > on the pte.
> >
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/book3s/64/pgtable.h |4 +++
> >  arch/powerpc/include/asm/pkeys.h |   12 +
> >  arch/powerpc/mm/pkeys.c  |   33 
> > ++
> >  3 files changed, 49 insertions(+), 0 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > index 30d7f55..0056e58 100644
> > --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> > +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > @@ -472,6 +472,10 @@ static inline void write_uamor(u64 value)
> > mtspr(SPRN_UAMOR, value);
> >  }
> >
> > +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> > +extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute);
> > +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> > +
> >  #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
> >  static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
> >unsigned long addr, pte_t *ptep)
> > diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
> > index bbb5d85..7a9aade 100644
> > --- a/arch/powerpc/include/asm/pkeys.h
> > +++ b/arch/powerpc/include/asm/pkeys.h
> > @@ -53,6 +53,18 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
> > ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL));
> >  }
> >
> > +static inline u16 pte_to_pkey_bits(u64 pteflags)
> > +{
> > +   if (!pkey_inited)
> > +   return 0x0UL;
> 
> Do we really need that above check ? We should always find
> pkey_inited to be set.

Yes, there are cases where pkey_inited is not enabled (sketched below):
a) if the MMU is radix.
b) if the PAGE size is 4k.
c) if the device tree says the feature is not available
d) if the CPU is of an older generation (P6 and older).
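A minimal sketch of that gating (helper and flag names approximate; the
device-tree check from (c) is omitted here):

static int __init pkey_initialize(void)
{
	/* (a) hash MMU only, (b) 64K pages only,
	 * (d) ISA 2.06 (POWER7) and newer */
	pkey_inited = !radix_enabled() &&
		      IS_ENABLED(CONFIG_PPC_64K_PAGES) &&
		      early_cpu_has_feature(CPU_FTR_ARCH_206);
	return 0;
}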

RP



Re: [RFC v6 11/62] powerpc: initial pkey plumbing

2017-07-20 Thread Ram Pai
On Thu, Jul 20, 2017 at 11:34:10AM +0530, Aneesh Kumar K.V wrote:
> Ram Pai  writes:
> 
> > basic setup to initialize the pkey system. Only a 64K kernel in HPT
> > mode enables the pkey system.
> >
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/Kconfig   |   16 ++
> >  arch/powerpc/include/asm/mmu_context.h |5 +++
> >  arch/powerpc/include/asm/pkeys.h   |   51 
> > 
> >  arch/powerpc/kernel/setup_64.c |4 ++
> >  arch/powerpc/mm/Makefile   |1 +
> >  arch/powerpc/mm/hash_utils_64.c|1 +
> >  arch/powerpc/mm/pkeys.c|   18 +++
> >  7 files changed, 96 insertions(+), 0 deletions(-)
> >  create mode 100644 arch/powerpc/include/asm/pkeys.h
> >  create mode 100644 arch/powerpc/mm/pkeys.c
> >
> > diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> > index bf4391d..5c60fd6 100644
> > --- a/arch/powerpc/Kconfig
> > +++ b/arch/powerpc/Kconfig
> > @@ -855,6 +855,22 @@ config SECCOMP
> >
> >   If unsure, say Y. Only embedded should say N here.
> >
> > +config PPC64_MEMORY_PROTECTION_KEYS
> > +   prompt "PowerPC Memory Protection Keys"
> > +   def_bool y
> > +   # Note: only available in 64-bit mode
> > +   depends on PPC64 && PPC_64K_PAGES
> > +   select ARCH_USES_HIGH_VMA_FLAGS
> > +   select ARCH_HAS_PKEYS
> > +   ---help---
> > + Memory Protection Keys provides a mechanism for enforcing
> > + page-based protections, but without requiring modification of the
> > + page tables when an application changes protection domains.
> > +
> > + For details, see Documentation/vm/protection-keys.txt
> > +
> > + If unsure, say y.
> > +
> >  endmenu
> 
> 
> Shouldn't this come as the last patch ? Or can we enable this config by
> this patch ? If so what does it do ? Did you test boot each of these
> patches to make sure we don't break git bisect ?

it partially enables the key subsystem. The code is there, but it does
not do much. Essentially the behavior is the same as without the code.

Yes, it is test booted, but not extensively on all
platforms/configurations.

However this code is blindly enabling the pkey subsystem without referring
to the device tree. I have fixed this patch series to add that additional
patch. In that patch series, I place all the code without
enabling the subsystem.  The last patch actually fires it up, 
depending on availability of the feature as told by the device-tree.
RP



Re: [RFC v6 01/62] powerpc: Free up four 64K PTE bits in 4K backed HPTE pages

2017-07-20 Thread Ram Pai
On Thu, Jul 20, 2017 at 11:21:51AM +0530, Aneesh Kumar K.V wrote:
> 
> .
> 
> > /*
> > @@ -116,8 +104,8 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
> >  * On hash insert failure we use old pte value and we don't
> >  * want slot information there if we have a insert failure.
> >  */
> > -   old_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> > -   new_pte &= ~(H_PAGE_HASHPTE | H_PAGE_F_GIX | H_PAGE_F_SECOND);
> > +   old_pte &= ~(H_PAGE_HASHPTE);
> > +   new_pte &= ~(H_PAGE_HASHPTE);
> > goto htab_insert_hpte;
> > }
> 
> With the current patch order and the above hunk we will break bisect, I guess.
> With the above, when
> we convert a 64k hpte to a 4k hpte, since this is the first patch, we
> should clear that H_PAGE_F_GIX and H_PAGE_F_SECOND. We still use them
> for 64k. I guess you should move this hunk to the second patch.

true. it should move to the next patch. Will fix it.
RP



Re: [PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Segher Boessenkool
On Fri, Jul 21, 2017 at 07:17:39AM +1000, Benjamin Herrenschmidt wrote:
> > Great patch! Always good to see asm replaced with C.
> 
> Yeah well ... when C becomes some kind of weird glorified asm like
> below, I don't see much of a point ;-)

Yeah.

> > > diff --git a/arch/powerpc/kernel/vdso64/gettime.c b/arch/powerpc/kernel/vdso64/gettime.c
> > > new file mode 100644
> > > index 000..01f411f
> > > --- /dev/null
> > > +++ b/arch/powerpc/kernel/vdso64/gettime.c
> > > @@ -0,0 +1,162 @@
> > 
> > ...
> > > +static notrace int gettime_syscall_fallback(clockid_t clk_id,
> > > +  struct timespec *tp)
> > > +{
> > > + register clockid_t id asm("r3") = clk_id;
> > > + register struct timespec *t asm("r4") = tp;
> > > + register int nr asm("r0") = __NR_clock_gettime;
> > > + register int ret asm("r3");
> > 
> > I guess this works. I've always been a bit nervous about register
> > variables TBH.
> 
> Does it really work ? That really makes me nervous too, I wouldn't do
> this without a strong ack from a toolchain person... Segher ?

Local register variables work perfectly well, but only for one thing:
those variables are guaranteed to be in those registers, _as arguments
to an asm_.

> > > + asm volatile("sc"
> > > +  : "=r" (ret)
> > > +  : "r"(nr), "r"(id), "r"(t)
> > > +  : "memory");
> > 
> > Not sure we need the memory clobber?
> > 
> > It can clobber more registers than that though.

It needs the memory clobber (if the system call accesses any of "your"
memory).  You need more register clobbers: also for most CR fields,
CTR, etc.

One trick that often works is doing the system call from within an
assembler function that uses the C ABI, since that has almost the same
calling conventions.

Something as simple as

.globl syscall42
syscall42:
li 0,42
sc
blr

(and yeah handle the CR bit 3 thing somehow)

and declare it as

int syscall42(some_type r3_arg, another_type r4_arg);


Inline asm is good when you want the asm code inlined into the callers,
potentially with arguments optimised etc.  The only overhead making the
syscall a function has is that single blr; the only optimisation you
miss is you could potentially load GPR0 a bit earlier (and you can get
a tiny bit more scheduling flexibility).


Segher


Re: [PATCH] drivers: cpuidle: Disable preemption before get_lppaca function call in pseries_idle_probe function

2017-07-20 Thread Benjamin Herrenschmidt
On Thu, 2017-07-20 at 14:57 -0300, Victor Aoqui wrote:
> When CONFIG_PREEMPT=y, the following warning shows up:
> 
> BUG: using smp_processor_id() in preemptible [] code: swapper/0/1
> caller is pseries_processor_idle_init+0x58/0x21c
> 
> This warning shows up because preemption must be disabled when using
> get_paca(), otherwise the paca_struct it points to may be the wrong one
> just after.
> 
> For this reason, preemption needs to be disabled before
> lppaca_shared_proc(get_lppaca()).

Also check the generated assembly. We had all sorts of interesting
issues where gcc would copy the paca pointer or the lppaca pointer
to a GPR *outside* of the preempt disabled section...

In that specific case it's not a big deal but overall, I am not
comfortable with PREEMPT on powerpc until we do something a bit
more drastic...

I would like to remove all such direct accesses to paca, instead have a
"new" get_paca() written in asm that does the preempt disable then
returns the PACA in a GPR (not directly using r13, hiding it from gcc),
and which is paired with a put_paca().

The few places where we want to directly access r13 should be hand
written in asm too to hide r13 from gcc; for accessing the irq_happened
in the fast path of local_irq_enable/disable/... we should do the same
with lock tokens.
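A rough sketch of what such a pairing could look like (hypothetical API,
not existing kernel code):

static inline struct paca_struct *get_paca_ref(void)
{
	struct paca_struct *paca;

	preempt_disable();
	/* fetch r13 via asm so gcc never caches it across preemption */
	asm volatile("mr %0,13" : "=r" (paca));
	return paca;
}

static inline void put_paca(void)
{
	preempt_enable();
}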

Ben.



Re: [PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Benjamin Herrenschmidt
On Thu, 2017-07-20 at 23:18 +1000, Michael Ellerman wrote:
> Santosh Sivaraj  writes:
> 
> > Current vDSO64 implementation does not have support for coarse
> > clocks (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls
> > back to system call. Below is a benchmark of the difference in execution
> > time with and without vDSO support.
> 
> Hi Santosh,
> 
> Great patch! Always good to see asm replaced with C.

Yeah well ... when C becomes some kind of weird glorified asm like
below, I don't see much of a point ;-)

> > diff --git a/arch/powerpc/kernel/vdso64/gettime.c b/arch/powerpc/kernel/vdso64/gettime.c
> > new file mode 100644
> > index 000..01f411f
> > --- /dev/null
> > +++ b/arch/powerpc/kernel/vdso64/gettime.c
> > @@ -0,0 +1,162 @@
> 
> ...
> > +static notrace int gettime_syscall_fallback(clockid_t clk_id,
> > +struct timespec *tp)
> > +{
> > +   register clockid_t id asm("r3") = clk_id;
> > +   register struct timespec *t asm("r4") = tp;
> > +   register int nr asm("r0") = __NR_clock_gettime;
> > +   register int ret asm("r3");
> 
> I guess this works. I've always been a bit nervous about register
> variables TBH.

Does it really work ? That really makes me nervous too, I wouldn't do
this without a strong ack from a toolchain person... Segher ?

> > +   asm volatile("sc"
> > +: "=r" (ret)
> > +: "r"(nr), "r"(id), "r"(t)
> > +: "memory");
> 
> Not sure we need the memory clobber?
> 
> It can clobber more registers than that though.
> 
> See: Documentation/powerpc/syscall64-abi.txt
> 
> > diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
> > index 3820213..1258009 100644
> > --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> > +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> > @@ -51,85 +53,21 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
> 
> ...
> > +   stwu r1,-112(r1)
> > +  .cfi_register lr,r6
> > +   std r6,24(r1)
> > +   bl  V_LOCAL_FUNC(kernel_clock_gettime)
> > crclr   cr0*4+so
> 
> Clearing CR0[SO] says that the syscall always succeeded.
> 
> What happens if you call this with a completely bogus clock id?
> 
> I think the solution is probably to do the syscall fallback in asm, and
> everything else in C.
> 
> cheers


[PATCH] drivers: cpuidle: Disable preemption before get_lppaca function call in pseries_idle_probe function

2017-07-20 Thread Victor Aoqui
When CONFIG_PREEMPT=y, the following warning shows up:

BUG: using smp_processor_id() in preemptible [] code: swapper/0/1
caller is pseries_processor_idle_init+0x58/0x21c

This warning shows up because preemption must be disabled when using
get_paca(), otherwise the paca_struct it points to may be the wrong one
just after.

For this reason, preemption needs to be disabled before
lppaca_shared_proc(get_lppaca()).

Signed-off-by: Victor Aoqui 
---
 drivers/cpuidle/cpuidle-pseries.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index e9b3853..d6dda8c 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -233,12 +233,16 @@ static int pseries_cpuidle_driver_init(void)
  */
 static int pseries_idle_probe(void)
 {
+   int retval;
 
if (cpuidle_disable != IDLE_NO_OVERRIDE)
return -ENODEV;
 
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
-   if (lppaca_shared_proc(get_lppaca())) {
+   preempt_disable();
+   retval = lppaca_shared_proc(get_lppaca());
+   preempt_enable();
+   if (retval) {
cpuidle_state_table = shared_states;
max_idle_state = ARRAY_SIZE(shared_states);
} else {
-- 
1.8.3.1



[PATCH] powerpc/kernel: Avoid preemption check during iommu_range_alloc

2017-07-20 Thread Victor Aoqui
Replaced the __this_cpu_read() call with raw_cpu_read() in
iommu_range_alloc().
Preemption doesn't need to be disabled since any CPU can safely
use any IOMMU pool.

Signed-off-by: Victor Aoqui 
---
 arch/powerpc/kernel/iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 233ca3f..0e49a45 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -208,7 +208,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
 * We don't need to disable preemption here because any CPU can
 * safely use any IOMMU pool.
 */
-   pool_nr = __this_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
+   pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
 
if (largealloc)
pool = &(tbl->large_pool);
-- 
1.8.3.1



[PATCH v2] powerpc/mm: Implemented default_hugepagesz verification for powerpc

2017-07-20 Thread Victor Aoqui
Implemented default hugepage size verification (default_hugepagesz=)
in order to allow allocation of a defined number of pages (hugepages=)
only for supported hugepage sizes.

Signed-off-by: Victor Aoqui 
---
v2:

- Renamed default_hugepage_setup_sz function to hugetlb_default_size_setup;

- Added powerpc string to error message.

 arch/powerpc/mm/hugetlbpage.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e1bf5ca..3a142fe 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -780,6 +780,21 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
+static int __init hugetlb_default_size_setup(char *str)
+{
+   unsigned long long size;
+
+   size = memparse(str, &str);
+
+   if (add_huge_page_size(size) != 0) {
+   hugetlb_bad_size();
+   pr_err("Invalid powerpc default huge page size 
specified(%llu)\n", size);
+   }
+
+   return 1;
+}
+__setup("default_hugepagesz=", hugetlb_default_size_setup);
+
 struct kmem_cache *hugepte_cache;
 static int __init hugetlbpage_init(void)
 {
-- 
1.8.3.1



Re: [PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Michael Ellerman
Santosh Sivaraj  writes:

> Current vDSO64 implementation does not have support for coarse
> clocks (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls
> back to system call. Below is a benchmark of the difference in execution
> time with and without vDSO support.

Hi Santosh,

Great patch! Always good to see asm replaced with C.

> diff --git a/arch/powerpc/kernel/vdso64/gettime.c b/arch/powerpc/kernel/vdso64/gettime.c
> new file mode 100644
> index 000..01f411f
> --- /dev/null
> +++ b/arch/powerpc/kernel/vdso64/gettime.c
> @@ -0,0 +1,162 @@
...
> +static notrace int gettime_syscall_fallback(clockid_t clk_id,
> +  struct timespec *tp)
> +{
> + register clockid_t id asm("r3") = clk_id;
> + register struct timespec *t asm("r4") = tp;
> + register int nr asm("r0") = __NR_clock_gettime;
> + register int ret asm("r3");

I guess this works. I've always been a bit nervous about register
variables TBH.

> + asm volatile("sc"
> +  : "=r" (ret)
> +  : "r"(nr), "r"(id), "r"(t)
> +  : "memory");

Not sure we need the memory clobber?

It can clobber more registers than that though.

See: Documentation/powerpc/syscall64-abi.txt

> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index 3820213..1258009 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -51,85 +53,21 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
...
> + stwu r1,-112(r1)
> +  .cfi_register lr,r6
> + std r6,24(r1)
> + bl  V_LOCAL_FUNC(kernel_clock_gettime)
>   crclr   cr0*4+so

Clearing CR0[SO] says that the syscall always succeeded.

What happens if you call this with a completely bogus clock id?

I think the solution is probably to do the syscall fallback in asm, and
everything else in C.

cheers


Re: [PATCH] powerpc/pseries: energy driver only print message when LPAR guest

2017-07-20 Thread Michael Ellerman
Nicholas Piggin  writes:

> This driver currently reports the H_BEST_ENERGY is unsupported even
> when booting in a non-LPAR environment (e.g., powernv). Prevent it.

Just delete the printk(). Users don't know what that means, and
developers have other better ways to detect that the hcall is missing if
anyone cares.

cheers


Re: [PATCH] powerpc/asm/cacheflush: Cleanup cacheflush function params

2017-07-20 Thread Michael Ellerman
Geert Uytterhoeven  writes:

> On Thu, Jul 20, 2017 at 1:43 PM, Michael Ellerman  wrote:
>> Matt Brown  writes:
>>> The cacheflush prototypes currently use start and stop values and each
>>> call requires typecasting the address to an unsigned long.
>>> This patch changes the cacheflush prototypes to follow the x86 style of
>>> using a base and size values, with base being a void pointer.
>>>
>>> All callers of the cacheflush functions, including drivers, have been
>>> modified to conform to the new prototypes.
>>>
>>> The 64 bit cacheflush functions which were implemented in assembly code
>>> (flush_dcache_range, flush_inval_dcache_range) have been translated into
>>> C for readability and coherence.
>
>>> --- a/arch/powerpc/include/asm/cacheflush.h
>>> +++ b/arch/powerpc/include/asm/cacheflush.h
>>> @@ -51,13 +51,13 @@ static inline void __flush_dcache_icache_phys(unsigned long physaddr)
>>>   * Write any modified data cache blocks out to memory and invalidate them.
>>>   * Does not invalidate the corresponding instruction cache blocks.
>>>   */
>>> -static inline void flush_dcache_range(unsigned long start, unsigned long stop)
>>> +static inline void flush_dcache_range(void *start, unsigned long size)
>>>  {
>>> - void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
>>> - unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
>>> + void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));
>>
>> unsigned long would be nicer than u32.
>
> Indeed. That would make this work on ppc64, too.
> After which ppc64 has an identical copy (u64 = unsigned long on ppc64) below?

That was Matt's homework to notice that ;)

cheers


Re: [PATCH] powerpc/asm/cacheflush: Cleanup cacheflush function params

2017-07-20 Thread Geert Uytterhoeven
On Thu, Jul 20, 2017 at 1:43 PM, Michael Ellerman  wrote:
> Matt Brown  writes:
>> The cacheflush prototypes currently use start and stop values and each
>> call requires typecasting the address to an unsigned long.
>> This patch changes the cacheflush prototypes to follow the x86 style of
>> using a base and size values, with base being a void pointer.
>>
>> All callers of the cacheflush functions, including drivers, have been
>> modified to conform to the new prototypes.
>>
>> The 64 bit cacheflush functions which were implemented in assembly code
>> (flush_dcache_range, flush_inval_dcache_range) have been translated into
>> C for readability and coherence.

>> --- a/arch/powerpc/include/asm/cacheflush.h
>> +++ b/arch/powerpc/include/asm/cacheflush.h
>> @@ -51,13 +51,13 @@ static inline void __flush_dcache_icache_phys(unsigned long physaddr)
>>   * Write any modified data cache blocks out to memory and invalidate them.
>>   * Does not invalidate the corresponding instruction cache blocks.
>>   */
>> -static inline void flush_dcache_range(unsigned long start, unsigned long stop)
>> +static inline void flush_dcache_range(void *start, unsigned long size)
>>  {
>> - void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
>> - unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
>> + void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));
>
> unsigned long would be nicer than u32.

Indeed. That would make this work on ppc64, too.
After which ppc64 has an identical copy (u64 = unsigned long on ppc64) below?

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [RESEND PATCH] powerpc: defconfig: Cleanup from old Kconfig options

2017-07-20 Thread Michael Ellerman
Krzysztof Kozlowski  writes:

> Remove old, dead Kconfig option USB_LED. It is gone since
> commit a335aaf3125c ("usb: misc: remove outdated USB LED driver").
>
> Signed-off-by: Krzysztof Kozlowski 
> ---
>  arch/powerpc/configs/c2k_defconfig| 1 -
>  arch/powerpc/configs/ppc6xx_defconfig | 1 -
>  2 files changed, 2 deletions(-)

Thanks. I'm working on cleaning up all the dead options in our
defconfigs, so I'll make this part of that series.

cheers


[PATCH] powerpc/pseries: energy driver only print message when LPAR guest

2017-07-20 Thread Nicholas Piggin
This driver currently reports the H_BEST_ENERGY is unsupported even
when booting in a non-LPAR environment (e.g., powernv). Prevent it.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/pseries/pseries_energy.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c
index 164a13d3998a..832b3d7898a3 100644
--- a/arch/powerpc/platforms/pseries/pseries_energy.c
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -229,6 +229,9 @@ static int __init pseries_energy_init(void)
int cpu, err;
struct device *cpu_dev;
 
+   if (!firmware_has_feature(FW_FEATURE_LPAR))
+   return 0;
+
if (!firmware_has_feature(FW_FEATURE_BEST_ENERGY)) {
printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n");
return 0;
-- 
2.11.0



Re: [PATCH] powerpc/asm/cacheflush: Cleanup cacheflush function params

2017-07-20 Thread Michael Ellerman
Hi Matt,

Thanks for tackling this mess.

Matt Brown  writes:
> The cacheflush prototypes currently use start and stop values and each
> call requires typecasting the address to an unsigned long.
> This patch changes the cacheflush prototypes to follow the x86 style of
> using a base and size values, with base being a void pointer.
>
> All callers of the cacheflush functions, including drivers, have been
> modified to conform to the new prototypes.
>
> The 64 bit cacheflush functions which were implemented in assembly code
> (flush_dcache_range, flush_inval_dcache_range) have been translated into
> C for readability and coherence.
>
> Signed-off-by: Matt Brown 
> ---
>  arch/powerpc/include/asm/cacheflush.h| 47 +
>  arch/powerpc/kernel/misc_64.S| 52 
> 
>  arch/powerpc/mm/dma-noncoherent.c| 15 
>  arch/powerpc/platforms/512x/mpc512x_shared.c | 10 +++---
>  arch/powerpc/platforms/85xx/smp.c|  6 ++--
>  arch/powerpc/sysdev/dart_iommu.c |  5 +--
>  drivers/ata/pata_bf54x.c |  3 +-
>  drivers/char/agp/uninorth-agp.c  |  6 ++--
>  drivers/gpu/drm/drm_cache.c  |  3 +-
>  drivers/macintosh/smu.c  | 15 
>  drivers/mmc/host/bfin_sdh.c  |  3 +-
>  drivers/mtd/nand/bf5xx_nand.c|  6 ++--
>  drivers/soc/fsl/qbman/dpaa_sys.h |  2 +-
>  drivers/soc/fsl/qbman/qman_ccsr.c|  3 +-
>  drivers/spi/spi-bfin5xx.c| 10 +++---
>  drivers/tty/serial/mpsc.c| 46 
>  drivers/usb/musb/blackfin.c  |  6 ++--

I think you want to trim that to powerpc-only drivers for now at least.

> diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h
> index 11843e3..b8f04c3 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -51,13 +51,13 @@ static inline void __flush_dcache_icache_phys(unsigned long physaddr)
>   * Write any modified data cache blocks out to memory and invalidate them.
>   * Does not invalidate the corresponding instruction cache blocks.
>   */
> -static inline void flush_dcache_range(unsigned long start, unsigned long stop)
> +static inline void flush_dcache_range(void *start, unsigned long size)
>  {
> - void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
> - unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
> + void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));

unsigned long would be nicer than u32.

And ALIGN_DOWN() should work here I think.

> + unsigned long len = size + (L1_CACHE_BYTES - 1);

And ALIGN?

> @@ -83,22 +83,39 @@ static inline void clean_dcache_range(unsigned long start, unsigned long stop)
>   * to invalidate the cache so the PPC core doesn't get stale data
>   * from the CPM (no cache snooping here :-).
>   */
> -static inline void invalidate_dcache_range(unsigned long start,
> -unsigned long stop)
> +static inline void invalidate_dcache_range(void *start, unsigned long size)
>  {
> - void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
> - unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
> + void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));
> + unsigned long len = size + (L1_CACHE_SHIFT - 1);
>   unsigned long i;
>  
> - for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
> + for (i = 0; i < len >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
>   dcbi(addr);
>   mb();   /* sync */
>  }
>  
>  #endif /* CONFIG_PPC32 */
>  #ifdef CONFIG_PPC64
> -extern void flush_dcache_range(unsigned long start, unsigned long stop);
> -extern void flush_inval_dcache_range(unsigned long start, unsigned long stop);
> +static inline void flush_dcache_range(void *start, unsigned long size)
> +{
> + void *addr = (void *)((u64)start & ~(L1_CACHE_BYTES - 1));
> + unsigned long len = size + (L1_CACHE_BYTES - 1);
> + unsigned long i;
> +
> + for (i = 0; i < len >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
> + dcbf(addr);
> + mb();   /* sync */
> +}

I'd probably prefer a precursor patch to do the asm -> C conversion, but
I guess that's a pain because then you have to implement both the old
and new logic in C.

Also L1_CACHE_SHIFT is not necessarily == DCACHEL1BLOCKSIZE.

Finally it would be good to see what code the compiler generates out of
this, and see how it compares to the asm version. Not because it's
particularly performance critical (hopefully) but just so we know.
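For concreteness, a variant along the lines suggested above (unsigned long
plus ALIGN_DOWN/ALIGN) might look like this sketch, still using
L1_CACHE_BYTES and thus ignoring the block-size point above:

static inline void flush_dcache_range(void *start, unsigned long size)
{
	unsigned long addr = ALIGN_DOWN((unsigned long)start, L1_CACHE_BYTES);
	unsigned long stop = ALIGN((unsigned long)start + size, L1_CACHE_BYTES);

	for (; addr < stop; addr += L1_CACHE_BYTES)
		dcbf((void *)addr);
	mb();	/* sync */
}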

> diff --git a/arch/powerpc/platforms/512x/mpc512x_shared.c b/arch/powerpc/platforms/512x/mpc512x_shared.c
> index 6b4f4cb..0f3a7d9 100644
> --- 

[PATCH] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-07-20 Thread Santosh Sivaraj
Current vDSO64 implementation does not have support for coarse
clocks (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls
back to system call. Below is a benchmark of the difference in execution
time with and without vDSO support.

(Non-coarse clocks are also included just for completeness)

Without vDSO support:

clock-gettime-realtime: syscall: 1547 nsec/call
clock-gettime-realtime:libc: 258 nsec/call
clock-gettime-realtime:vdso: 180 nsec/call

clock-gettime-monotonic: syscall: 1399 nsec/call
clock-gettime-monotonic:libc: 317 nsec/call
clock-gettime-monotonic:vdso: 249 nsec/call

clock-gettime-realtime-coarse: syscall: 1228 nsec/call
clock-gettime-realtime-coarse:libc: 1320 nsec/call
clock-gettime-realtime-coarse:vdso: 1330 nsec/call

clock-gettime-monotonic-coarse: syscall: 1263 nsec/call
clock-gettime-monotonic-coarse:libc: 1368 nsec/call
clock-gettime-monotonic-coarse:vdso: 1258 nsec/call

With vDSO support:
--
clock-gettime-realtime: syscall: 1660 nsec/call
clock-gettime-realtime:libc: 251 nsec/call
clock-gettime-realtime:vdso: 180 nsec/call

clock-gettime-monotonic: syscall: 1514 nsec/call
clock-gettime-monotonic:libc: 309 nsec/call
clock-gettime-monotonic:vdso: 239 nsec/call

clock-gettime-realtime-coarse: syscall: 1228 nsec/call
clock-gettime-realtime-coarse:libc: 172 nsec/call
clock-gettime-realtime-coarse:vdso: 101 nsec/call

clock-gettime-monotonic-coarse: syscall: 1347 nsec/call
clock-gettime-monotonic-coarse:libc: 187 nsec/call
clock-gettime-monotonic-coarse:vdso: 125 nsec/call

Used https://github.com/nlynch-mentor/vdsotest.git for the benchmarks.
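The coarse clocks are where the vDSO gains the most because no timebase
arithmetic is needed at all: the handler only takes a consistent snapshot of
the kernel's last-tick stamp. A minimal sketch using the vdso_data fields
from this patch:

static notrace int clock_get_realtime_coarse(struct timespec *tp,
					     struct vdso_data *vdata)
{
	u32 update_count;

	do {
		/* retry if the kernel updates the stamp meanwhile */
		update_count = vdata->tb_update_count;
		tp->tv_sec  = vdata->stamp_xtime.tv_sec;
		tp->tv_nsec = vdata->stamp_xtime.tv_nsec;
	} while (update_count != vdata->tb_update_count);

	return 0;
}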

CC: Benjamin Herrenschmidt 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/vdso.h   |   1 +
 arch/powerpc/kernel/vdso64/Makefile   |   2 +-
 arch/powerpc/kernel/vdso64/gettime.c  | 162 ++
 arch/powerpc/kernel/vdso64/gettimeofday.S |  82 ++-
 4 files changed, 174 insertions(+), 73 deletions(-)
 create mode 100644 arch/powerpc/kernel/vdso64/gettime.c

diff --git a/arch/powerpc/include/asm/vdso.h b/arch/powerpc/include/asm/vdso.h
index c53f5f6..721e4cf 100644
--- a/arch/powerpc/include/asm/vdso.h
+++ b/arch/powerpc/include/asm/vdso.h
@@ -23,6 +23,7 @@ extern unsigned long vdso32_sigtramp;
 extern unsigned long vdso32_rt_sigtramp;
 
 int vdso_getcpu_init(void);
+struct vdso_data *__get_datapage(void);
 
 #else /* __ASSEMBLY__ */
 
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index 31107bf..8958d87 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -1,6 +1,6 @@
 # List of files in the vdso, has to be asm only for now
 
-obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o
+obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o gettime.o
 
 # Build rules
 
diff --git a/arch/powerpc/kernel/vdso64/gettime.c b/arch/powerpc/kernel/vdso64/gettime.c
new file mode 100644
index 000..01f411f
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/gettime.c
@@ -0,0 +1,162 @@
+/*
+ * Userland implementation of gettimeofday() for 64 bits processes in a
+ * ppc64 kernel for use in the vDSO
+ *
+ * Copyright (C) 2017 Santosh Sivaraj (sant...@fossix.org), IBM.
+ *
+ * Originally implemented in assembly by:
+ *   Benjamin Herrenschmidt (b...@kernel.crashing.org),
+ *   IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static notrace void kernel_get_tspec(struct timespec *tp,
+struct vdso_data *vdata, u32 *wtom_sec,
+u32 *wtom_nsec)
+{
+   u64 tb;
+   u32 update_count;
+
+   do {
+   /* check for update count & load values */
+   update_count = vdata->tb_update_count;
+
+   /* Get TB, offset it and scale result */
+   tb = mulhdu((get_tb() - vdata->tb_orig_stamp) << 12,
+   vdata->tb_to_xs) + vdata->stamp_sec_fraction;
+   tp->tv_sec = vdata->stamp_xtime.tv_sec;
+   if (wtom_sec)
+   *wtom_sec = vdata->wtom_clock_sec;
+   if (wtom_nsec)
+   *wtom_nsec = vdata->wtom_clock_nsec;
+   } while (update_count != vdata->tb_update_count);
+
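+   /* tb now carries whole seconds in its upper 32 bits and a binary
+    * fraction of a second in its lower 32 bits; mulhwu() scales that
+    * fraction to nanoseconds. */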
+   tp->tv_nsec = ((u64)mulhwu(tb, NSEC_PER_SEC) << 32) >> 32;
+   tp->tv_sec += (tb >> 32);
+}
+
+static notrace int clock_get_realtime(struct timespec *tp,
+ struct vdso_data *vdata)
+{
+   

[PATCH v4] powerpc/mm/radix: Workaround prefetch issue with KVM

2017-07-20 Thread Benjamin Herrenschmidt
There's a somewhat architectural issue with Radix MMU and KVM.

When coming out of a guest with AIL (ie, MMU enabled), we start
executing hypervisor code with the PID register still containing
whatever the guest has been using.

The problem is that the CPU can (and will) then start prefetching
or speculatively load from whatever host context has that same
PID (if any), thus bringing translations for that context into
the TLB, which Linux doesn't know about.

This can cause stale translations and subsequent crashes.

Fixing this in a way that is neither racy nor a big performance
hit is difficult. We could just make the host invalidations always
use broadcast forms, but that would hurt single-threaded programs,
for example.

We chose instead to fix it by partitioning the PID space between
guest and host. This is possible because today Linux only uses 19
out of the 20 bits of PID space, so existing guests will work if we
make the host use the top half of the 20-bit space.

We additionally add a property to indicate to Linux the size of
the PID register, which will be useful if we eventually have
processors with a larger PID space available.

There is still an issue with malicious guests purposefully setting
the PID register to a value in the host range. Hopefully future HW
can prevent that, but in the meantime, we handle it with a pair of
kludges:

 - On the way out of a guest, before we clear the current VCPU
in the PACA, we check the PID and if it's outside of the permitted
range we flush the TLB for that PID.

 - When context switching, if the mm is "new" on that CPU (the
corresponding bit was set for the first time in the mm cpumask), we
check if any sibling thread is in KVM (has a non-NULL VCPU pointer
in the PACA). If that is the case, we also flush the PID for that
CPU (core).

This second part is needed to handle the case where a process is
migrated (or starts a new pthread) on a sibling thread of the CPU
coming out of KVM, as there's a window where stale translations
can exist before we detect it and flush them out.

A future optimization could be added by keeping track of whether
the PID has ever been used, and avoiding the flush for completely
fresh PIDs. We could similarly mark PIDs that have been the subject
of a global invalidation as "fresh". But for now this will do.
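
As a rough sketch of that second check (the names below follow the
helpers that mm/tlb-radix.c already provides; treat this as an
illustration of the idea, not the literal patch):

	void radix_kvm_prefetch_workaround(struct mm_struct *mm)
	{
		unsigned long pid = mm->context.id;
		int cpu = smp_processor_id();
		int i;

		if (unlikely(pid == MMU_NO_CONTEXT))
			return;

		/* If any sibling thread has a guest VCPU loaded, the core
		 * may already have prefetched translations for this PID,
		 * so flush the PID once before first use on this CPU. */
		for (i = cpu_first_thread_sibling(cpu);
		     i <= cpu_last_thread_sibling(cpu); i++) {
			if (i != cpu && paca[i].kvm_hstate.kvm_vcpu) {
				_tlbie_pid(pid, RIC_FLUSH_ALL);
				return;
			}
		}
	}

For scale: with mmu_pid_bits = 20 the process table has 2^20 entries
of 16 bytes each (16MB); splitting the space gives 2^19 PIDs to the
guests (the bottom half, which existing 19-bit guests already fit in)
and 2^19 to the host (the top half).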

Signed-off-by: Benjamin Herrenschmidt 
---

v2. Do the check on KVM exit *after* we've restored the host PID

v3. Properly check for radix in the exit assembly and avoid adding
yet another function to tlb-radix.c

v4. Don't add an argument to switch_mmu_context(), instead call a
workaround function directly from switch_mm_irqs_off(), which
keeps the patch a lot smaller.
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 15 +-
 arch/powerpc/include/asm/mmu_context.h   | 18 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  | 49 ++--
 arch/powerpc/mm/mmu_context_book3s64.c   |  9 --
 arch/powerpc/mm/pgtable-radix.c  | 34 +-
 arch/powerpc/mm/tlb-radix.c  | 45 +++--
 6 files changed, 148 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 77529a3e3811..5b4023c616f7 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -59,13 +59,14 @@ extern struct patb_entry *partition_tb;
 #define PRTS_MASK  0x1f/* process table size field */
 #define PRTB_MASK  0x0000UL
 
-/*
- * Limit process table to PAGE_SIZE table. This
- * also limit the max pid we can support.
- * MAX_USER_CONTEXT * 16 bytes of space.
- */
-#define PRTB_SIZE_SHIFT(CONTEXT_BITS + 4)
-#define PRTB_ENTRIES   (1ul << CONTEXT_BITS)
+/* Number of supported PID bits */
+extern unsigned int mmu_pid_bits;
+
+/* Base PID to allocate from */
+extern unsigned int mmu_base_pid;
+
+#define PRTB_SIZE_SHIFT(mmu_pid_bits + 4)
+#define PRTB_ENTRIES   (1ul << mmu_pid_bits)
 
 /*
  * Power9 currently only support 64K partition table size.
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index da7e9432fa8f..0c76675394c5 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -45,7 +45,7 @@ extern void set_context(unsigned long id, pgd_t *pgd);
 
 #ifdef CONFIG_PPC_BOOK3S_64
 extern void radix__switch_mmu_context(struct mm_struct *prev,
-struct mm_struct *next);
+ struct mm_struct *next);
 static inline void switch_mmu_context(struct mm_struct *prev,
  struct mm_struct *next,
  struct task_struct *tsk)
@@ -67,6 +67,12 @@ extern void __destroy_context(unsigned long context_id);
 extern void mmu_context_init(void);
 #endif
 
+#if 

Re: [PATCH v2 2/3] powerpc/powernv: machine check use kernel crash path

2017-07-20 Thread Mahesh Jagannath Salgaonkar
On 07/19/2017 12:29 PM, Nicholas Piggin wrote:
> There are quite a few machine check exceptions that can be caused by
> kernel bugs. To make debugging easier, use the kernel crash path in
> cases of synchronous machine checks that occur in kernel mode, if that
> would not result in the machine going straight to panic or crash dump.
> 
> There is a downside here that die()ing the process in kernel mode can
> still leave the system unstable. panic_on_oops will always force the
> system to fail-stop, so systems where that behaviour is important will
> still do the right thing.
> 
> As a test, when triggering an i-side 0111b error (ifetch from foreign
> address) in kernel mode process context on POWER9, the kernel currently
> dies quickly like this:
> 
> Severe Machine check interrupt [Not recovered]
>   NIP []: 0x
>   Initiator: CPU
>   Error type: Real address [Instruction fetch (foreign)]
> [  127.426651616,0] OPAL: Reboot requested due to Platform error.
> Effective[  127.426693712,3] OPAL: Reboot requested due to Platform 
> error. address: 
> opal: Reboot type 1 not supported
> Kernel panic - not syncing: PowerNV Unrecovered Machine Check
> CPU: 56 PID: 4425 Comm: syscall Tainted: G   M
> 4.12.0-rc1-13857-ga4700a261072-dirty #35
> Call Trace:
> [  128.017988928,4] IPMI: BUG: Dropping ESEL on the floor due to buggy/mising 
> code in OPAL for this BMCRebooting in 10 seconds..
> Trying to free IRQ 496 from IRQ context!
> 
> 
> After this patch, the process is killed and the kernel continues with
> this message, which gives enough information to identify the offending
> branch (i.e., with CFAR):
> 
> Severe Machine check interrupt [Not recovered]
>   NIP []: 0x
>   Initiator: CPU
>   Error type: Real address [Instruction fetch (foreign)]
> Effective address: 
> Oops: Machine check, sig: 7 [#1]
> SMP NR_CPUS=2048
> NUMA
> PowerNV
> Modules linked in: iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 
> iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack 
> nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm_hv 
> kvm iptable_filter binfmt_misc vmx_crypto ip_tables x_tables autofs4 
> crc32c_vpmsum
> CPU: 22 PID: 4436 Comm: syscall Tainted: G   M
> 4.12.0-rc1-13857-ga4700a261072-dirty #36
> task: c0093230 task.stack: c0093238
> NIP:  LR: 217706a4 CTR: 
> REGS: cfc8fd80 TRAP: 0200   Tainted: G   M 
> (4.12.0-rc1-13857-ga4700a261072-dirty)
> MSR: 901c1003 
>   CR: 24000484  XER: 2000
> CFAR: c0004c80 DAR: 21770a90 DSISR: 0a00 SOFTE: 1
> GPR00: 1ebe 7fffce4818b0 21797f00 
> GPR04: 7fff8007ac24 44000484 4000 7fff801405e8
> GPR08: 9280f033 24000484  0030
> GPR12: 90001003 7fff801bc370  
> GPR16:    
> GPR20:    
> GPR24:    
> GPR28: 7fff801b  217707a0 7fffce481918
> NIP [] 0x
> LR [217706a4] 0x217706a4
> Call Trace:
> Instruction dump:
>        
>        
> ---[ end trace 32ae1dabb4f8dae6 ]---
> 
> Signed-off-by: Nicholas Piggin 

Reviewed-by: Mahesh Salgaonkar 

Thanks,
-Mahesh.

> ---
>  arch/powerpc/include/asm/bug.h|  1 +
>  arch/powerpc/include/asm/fadump.h |  2 ++
>  arch/powerpc/kernel/fadump.c  |  9 -
>  arch/powerpc/kernel/traps.c   | 22 ++
>  arch/powerpc/platforms/powernv/opal.c | 32 ++--
>  5 files changed, 59 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
> index 0151af6c2a50..9a918b3ca5ee 100644
> --- a/arch/powerpc/include/asm/bug.h
> +++ b/arch/powerpc/include/asm/bug.h
> @@ -133,6 +133,7 @@ extern int do_page_fault(struct pt_regs *, unsigned long, 
> unsigned long);
>  extern void bad_page_fault(struct pt_regs *, unsigned long, int);
>  extern void _exception(int, struct pt_regs *, int, unsigned long);
>  extern void die(const char *, struct pt_regs *, long);
> +extern bool die_will_crash(void);
> 
>  #endif /* !__ASSEMBLY__ */
> 
> diff --git a/arch/powerpc/include/asm/fadump.h 
> b/arch/powerpc/include/asm/fadump.h
> index ce88bbe1d809..5a23010af600 100644
> --- a/arch/powerpc/include/asm/fadump.h
> +++ b/arch/powerpc/include/asm/fadump.h
> @@ -209,11 +209,13 @@ 

Re: [PATCH] powerpc/asm/cacheflush: Cleanup cacheflush function params

2017-07-20 Thread Geert Uytterhoeven
On Thu, Jul 20, 2017 at 8:28 AM, Matt Brown  wrote:
> The cacheflush prototypes currently use start and stop values and each
> call requires typecasting the address to an unsigned long.
> This patch changes the cacheflush prototypes to follow the x86 style of
> using a base and size values, with base being a void pointer.
>
> All callers of the cacheflush functions, including drivers, have been
> modified to conform to the new prototypes.
>
> The 64 bit cacheflush functions which were implemented in assembly code
> (flush_dcache_range, flush_inval_dcache_range) have been translated into
> C for readability and coherence.
>
> Signed-off-by: Matt Brown 

>  drivers/spi/spi-bfin5xx.c| 10 +++---
>  drivers/usb/musb/blackfin.c  |  6 ++--

These are used on blackfin, so changing them without changing the blackfin
cache ops will break the build.

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [RFC v6 27/62] powerpc: helper to validate key-access permissions of a pte

2017-07-20 Thread Aneesh Kumar K.V
Ram Pai  writes:

> Helper function that checks whether read/write/execute is allowed
> on the pte.
>
> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h |4 +++
>  arch/powerpc/include/asm/pkeys.h |   12 +
>  arch/powerpc/mm/pkeys.c  |   33 
> ++
>  3 files changed, 49 insertions(+), 0 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 30d7f55..0056e58 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -472,6 +472,10 @@ static inline void write_uamor(u64 value)
>   mtspr(SPRN_UAMOR, value);
>  }
>
> +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> +extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute);
> +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> +
>  #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
>  static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
>  unsigned long addr, pte_t *ptep)
> diff --git a/arch/powerpc/include/asm/pkeys.h 
> b/arch/powerpc/include/asm/pkeys.h
> index bbb5d85..7a9aade 100644
> --- a/arch/powerpc/include/asm/pkeys.h
> +++ b/arch/powerpc/include/asm/pkeys.h
> @@ -53,6 +53,18 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
>   ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL));
>  }
>
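> +/* H_PAGE_PKEY_BIT0 is the most significant of the five software key
> + * bits, hence it maps to 0x10; H_PAGE_PKEY_BIT4 is the least
> + * significant and maps to 0x1. */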
> +static inline u16 pte_to_pkey_bits(u64 pteflags)
> +{
> + if (!pkey_inited)
> + return 0x0UL;

Do we really need the above check? We should always find pkey_inited
set.

> +
> + return (((pteflags & H_PAGE_PKEY_BIT0) ? 0x10 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT1) ? 0x8 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT2) ? 0x4 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT3) ? 0x2 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT4) ? 0x1 : 0x0UL));
> +}
> +


-aneesh



[PATCH] powerpc/asm/cacheflush: Cleanup cacheflush function params

2017-07-20 Thread Matt Brown
The cacheflush prototypes currently use start and stop values and each
call requires typecasting the address to an unsigned long.
This patch changes the cacheflush prototypes to follow the x86 style of
using a base and size values, with base being a void pointer.

All callers of the cacheflush functions, including drivers, have been
modified to conform to the new prototypes.

The 64 bit cacheflush functions which were implemented in assembly code
(flush_dcache_range, flush_inval_dcache_range) have been translated into
C for readability and coherence.
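
For example, a call site that previously read (illustrative, not a
specific hunk from this diff):

	flush_dcache_range((unsigned long)addr, (unsigned long)addr + len);

now becomes:

	flush_dcache_range(addr, len);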

Signed-off-by: Matt Brown 
---
 arch/powerpc/include/asm/cacheflush.h| 47 +
 arch/powerpc/kernel/misc_64.S| 52 
 arch/powerpc/mm/dma-noncoherent.c| 15 
 arch/powerpc/platforms/512x/mpc512x_shared.c | 10 +++---
 arch/powerpc/platforms/85xx/smp.c|  6 ++--
 arch/powerpc/sysdev/dart_iommu.c |  5 +--
 drivers/ata/pata_bf54x.c |  3 +-
 drivers/char/agp/uninorth-agp.c  |  6 ++--
 drivers/gpu/drm/drm_cache.c  |  3 +-
 drivers/macintosh/smu.c  | 15 
 drivers/mmc/host/bfin_sdh.c  |  3 +-
 drivers/mtd/nand/bf5xx_nand.c|  6 ++--
 drivers/soc/fsl/qbman/dpaa_sys.h |  2 +-
 drivers/soc/fsl/qbman/qman_ccsr.c|  3 +-
 drivers/spi/spi-bfin5xx.c| 10 +++---
 drivers/tty/serial/mpsc.c| 46 
 drivers/usb/musb/blackfin.c  |  6 ++--
 17 files changed, 86 insertions(+), 152 deletions(-)

diff --git a/arch/powerpc/include/asm/cacheflush.h 
b/arch/powerpc/include/asm/cacheflush.h
index 11843e3..b8f04c3 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -51,13 +51,13 @@ static inline void __flush_dcache_icache_phys(unsigned long 
physaddr)
  * Write any modified data cache blocks out to memory and invalidate them.
  * Does not invalidate the corresponding instruction cache blocks.
  */
-static inline void flush_dcache_range(unsigned long start, unsigned long stop)
+static inline void flush_dcache_range(void *start, unsigned long size)
 {
-   void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
-   unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
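+   /* Round the base down to a cache-line boundary and pad the length
+    * so a final partial cache line is still covered. */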
+   void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));
+   unsigned long len = size + (L1_CACHE_BYTES - 1);
unsigned long i;
 
-   for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
+   for (i = 0; i < len >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
dcbf(addr);
mb();   /* sync */
 }
@@ -67,13 +67,13 @@ static inline void flush_dcache_range(unsigned long start, 
unsigned long stop)
  * Does not invalidate the corresponding cache lines (especially for
  * any corresponding instruction cache).
  */
-static inline void clean_dcache_range(unsigned long start, unsigned long stop)
+static inline void clean_dcache_range(void *start, unsigned long size)
 {
-   void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
-   unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
+   void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));
+   unsigned long len = size + (L1_CACHE_BYTES - 1);
unsigned long i;
 
-   for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
+   for (i = 0; i < len >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
dcbst(addr);
mb();   /* sync */
 }
@@ -83,22 +83,39 @@ static inline void clean_dcache_range(unsigned long start, 
unsigned long stop)
  * to invalidate the cache so the PPC core doesn't get stale data
  * from the CPM (no cache snooping here :-).
  */
-static inline void invalidate_dcache_range(unsigned long start,
-  unsigned long stop)
+static inline void invalidate_dcache_range(void *start, unsigned long size)
 {
-   void *addr = (void *)(start & ~(L1_CACHE_BYTES - 1));
-   unsigned long size = stop - (unsigned long)addr + (L1_CACHE_BYTES - 1);
+   void *addr = (void *)((u32)start & ~(L1_CACHE_BYTES - 1));
+   unsigned long len = size + (L1_CACHE_BYTES - 1);
unsigned long i;
 
-   for (i = 0; i < size >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
+   for (i = 0; i < len >> L1_CACHE_SHIFT; i++, addr += L1_CACHE_BYTES)
dcbi(addr);
mb();   /* sync */
 }
 
 #endif /* CONFIG_PPC32 */
 #ifdef CONFIG_PPC64
-extern void flush_dcache_range(unsigned long start, unsigned long stop);
-extern void flush_inval_dcache_range(unsigned long start, unsigned long stop);
+static inline void flush_dcache_range(void *start, unsigned long size)
+{
+   void *addr = (void *)((u64)start & ~(L1_CACHE_BYTES - 1));
+   unsigned long len = size + 

Re: [RFC v6 26/62] powerpc: Program HPTE key protection bits

2017-07-20 Thread Aneesh Kumar K.V
Ram Pai  writes:

> Map the PTE protection key bits to the HPTE key protection bits
> while creating HPTE entries.
>
Reviewed-by: Aneesh Kumar K.V 

> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h |5 +
>  arch/powerpc/include/asm/pkeys.h  |   12 
>  arch/powerpc/mm/hash_utils_64.c   |4 
>  3 files changed, 21 insertions(+), 0 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
> b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index 6981a52..f7a6ed3 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -90,6 +90,8 @@
>  #define HPTE_R_PP0   ASM_CONST(0x8000)
>  #define HPTE_R_TSASM_CONST(0x4000)
>  #define HPTE_R_KEY_HIASM_CONST(0x3000)
> +#define HPTE_R_KEY_BIT0  ASM_CONST(0x2000)
> +#define HPTE_R_KEY_BIT1  ASM_CONST(0x1000)
>  #define HPTE_R_RPN_SHIFT 12
>  #define HPTE_R_RPN   ASM_CONST(0x0000)
>  #define HPTE_R_RPN_3_0   ASM_CONST(0x01fff000)
> @@ -104,6 +106,9 @@
>  #define HPTE_R_C ASM_CONST(0x0080)
>  #define HPTE_R_R ASM_CONST(0x0100)
>  #define HPTE_R_KEY_LOASM_CONST(0x0e00)
> +#define HPTE_R_KEY_BIT2  ASM_CONST(0x0800)
> +#define HPTE_R_KEY_BIT3  ASM_CONST(0x0400)
> +#define HPTE_R_KEY_BIT4  ASM_CONST(0x0200)
>
>  #define HPTE_V_1TB_SEG   ASM_CONST(0x4000)
>  #define HPTE_V_VRMA_MASK ASM_CONST(0x4001ff00)
> diff --git a/arch/powerpc/include/asm/pkeys.h 
> b/arch/powerpc/include/asm/pkeys.h
> index ad39db0..bbb5d85 100644
> --- a/arch/powerpc/include/asm/pkeys.h
> +++ b/arch/powerpc/include/asm/pkeys.h
> @@ -41,6 +41,18 @@ static inline u64 vmflag_to_page_pkey_bits(u64 vm_flags)
>   ((vm_flags & VM_PKEY_BIT4) ? H_PAGE_PKEY_BIT0 : 0x0UL));
>  }
>
> +static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
> +{
> + if (!pkey_inited)
> + return 0x0UL;
> +
> + return (((pteflags & H_PAGE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) |
> + ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL));
> +}
> +
>  static inline int vma_pkey(struct vm_area_struct *vma)
>  {
>   if (!pkey_inited)
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> index f88423b..1e74529 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -231,6 +231,10 @@ unsigned long htab_convert_pte_flags(unsigned long 
> pteflags)
>*/
>   rflags |= HPTE_R_M;
>
> +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> + rflags |= pte_to_hpte_pkey_bits(pteflags);
> +#endif
> +
>   return rflags;
>  }
>
> -- 
> 1.7.1



[PATCH] powerpc/include/asm: Remove unused 64bit cacheflush function

2017-07-20 Thread Matt Brown
The flush_dcache_phys_range function is no longer used in the kernel.
This patch removes and cleans up the function.

Signed-off-by: Matt Brown 
---
 arch/powerpc/include/asm/cacheflush.h |  1 -
 arch/powerpc/kernel/misc_64.S | 38 ---
 2 files changed, 39 deletions(-)

diff --git a/arch/powerpc/include/asm/cacheflush.h 
b/arch/powerpc/include/asm/cacheflush.h
index b77f036..11843e3 100644
--- a/arch/powerpc/include/asm/cacheflush.h
+++ b/arch/powerpc/include/asm/cacheflush.h
@@ -99,7 +99,6 @@ static inline void invalidate_dcache_range(unsigned long 
start,
 #ifdef CONFIG_PPC64
 extern void flush_dcache_range(unsigned long start, unsigned long stop);
 extern void flush_inval_dcache_range(unsigned long start, unsigned long stop);
-extern void flush_dcache_phys_range(unsigned long start, unsigned long stop);
 #endif
 
 #define copy_to_user_page(vma, page, vaddr, dst, src, len) \
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index c119044..0ed5c55 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -144,44 +144,6 @@ _GLOBAL_TOC(flush_dcache_range)
blr
 EXPORT_SYMBOL(flush_dcache_range)
 
-/*
- * Like above, but works on non-mapped physical addresses.
- * Use only for non-LPAR setups ! It also assumes real mode
- * is cacheable. Used for flushing out the DART before using
- * it as uncacheable memory 
- *
- * flush_dcache_phys_range(unsigned long start, unsigned long stop)
- *
- *flush all bytes from start to stop-1 inclusive
- */
-_GLOBAL(flush_dcache_phys_range)
-   ld  r10,PPC64_CACHES@toc(r2)
-   lwz r7,DCACHEL1BLOCKSIZE(r10)   /* Get dcache block size */
-   addir5,r7,-1
-   andcr6,r3,r5/* round low to line bdy */
-   subfr8,r6,r4/* compute length */
-   add r8,r8,r5/* ensure we get enough */
-   lwz r9,DCACHEL1LOGBLOCKSIZE(r10)/* Get log-2 of dcache block 
size */
-   srw.r8,r8,r9/* compute line count */
-   beqlr   /* nothing to do? */
-   mfmsr   r5  /* Disable MMU Data Relocation */
-   ori r0,r5,MSR_DR
-   xorir0,r0,MSR_DR
-   sync
-   mtmsr   r0
-   sync
-   isync
-   mtctr   r8
-0: dcbst   0,r6
-   add r6,r6,r7
-   bdnz0b
-   sync
-   isync
-   mtmsr   r5  /* Re-enable MMU Data Relocation */
-   sync
-   isync
-   blr
-
 _GLOBAL(flush_inval_dcache_range)
ld  r10,PPC64_CACHES@toc(r2)
lwz r7,DCACHEL1BLOCKSIZE(r10)   /* Get dcache block size */
-- 
2.9.3



Re: [RFC v6 11/62] powerpc: initial pkey plumbing

2017-07-20 Thread Aneesh Kumar K.V
Ram Pai  writes:

> Basic setup to initialize the pkey system. Only a 64K-page kernel in
> HPT mode enables the pkey system.
>
> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/Kconfig   |   16 ++
>  arch/powerpc/include/asm/mmu_context.h |5 +++
>  arch/powerpc/include/asm/pkeys.h   |   51 
> 
>  arch/powerpc/kernel/setup_64.c |4 ++
>  arch/powerpc/mm/Makefile   |1 +
>  arch/powerpc/mm/hash_utils_64.c|1 +
>  arch/powerpc/mm/pkeys.c|   18 +++
>  7 files changed, 96 insertions(+), 0 deletions(-)
>  create mode 100644 arch/powerpc/include/asm/pkeys.h
>  create mode 100644 arch/powerpc/mm/pkeys.c
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index bf4391d..5c60fd6 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -855,6 +855,22 @@ config SECCOMP
>
> If unsure, say Y. Only embedded should say N here.
>
> +config PPC64_MEMORY_PROTECTION_KEYS
> + prompt "PowerPC Memory Protection Keys"
> + def_bool y
> + # Note: only available in 64-bit mode
> + depends on PPC64 && PPC_64K_PAGES
> + select ARCH_USES_HIGH_VMA_FLAGS
> + select ARCH_HAS_PKEYS
> + ---help---
> +   Memory Protection Keys provides a mechanism for enforcing
> +   page-based protections, but without requiring modification of the
> +   page tables when an application changes protection domains.
> +
> +   For details, see Documentation/vm/protection-keys.txt
> +
> +   If unsure, say y.
> +
>  endmenu


Shouldn't this come as the last patch? Or can we enable this config at
this point in the series? If so, what does it do? Did you boot-test each
of these patches to make sure we don't break git bisect?

>
>  config ISA_DMA_API
> diff --git a/arch/powerpc/include/asm/mmu_context.h 
> b/arch/powerpc/include/asm/mmu_context.h
> index da7e943..4b93547 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -181,5 +181,10 @@ static inline bool arch_vma_access_permitted(struct 
> vm_area_struct *vma,
>   /* by default, allow everything */


-aneesh



Re: [RFC v6 07/62] powerpc: use helper functions in __hash_page_huge() for 64K PTE

2017-07-20 Thread Aneesh Kumar K.V
Ram Pai  writes:

> replace redundant code in __hash_page_huge() with helper
> functions pte_get_hash_gslot() and pte_set_hash_slot()
>

Can you fold all the helper function usage into one patch?


> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/mm/hugetlbpage-hash64.c |   24 
>  1 files changed, 4 insertions(+), 20 deletions(-)
>
> diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c 
> b/arch/powerpc/mm/hugetlbpage-hash64.c
> index 6f7aee3..e6dcd50 100644
> --- a/arch/powerpc/mm/hugetlbpage-hash64.c
> +++ b/arch/powerpc/mm/hugetlbpage-hash64.c
> @@ -23,7 +23,6 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> access, unsigned long vsid,
>int ssize, unsigned int shift, unsigned int mmu_psize)
>  {
>   real_pte_t rpte;
> - unsigned long *hidxp;
>   unsigned long vpn;
>   unsigned long old_pte, new_pte;
>   unsigned long rflags, pa, sz;
> @@ -74,16 +73,10 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> access, unsigned long vsid,
>   /* Check if pte already has an hpte (case 2) */
>   if (unlikely(old_pte & H_PAGE_HASHPTE)) {
>   /* There MIGHT be an HPTE for this pte */
> - unsigned long hash, slot, hidx;
> + unsigned long gslot;
>
> - hash = hpt_hash(vpn, shift, ssize);
> - hidx = __rpte_to_hidx(rpte, 0);
> - if (hidx & _PTEIDX_SECONDARY)
> - hash = ~hash;
> - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> - slot += hidx & _PTEIDX_GROUP_IX;
> -
> - if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize,
> + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
> + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
>  mmu_psize, ssize, flags) == -1)
>   old_pte &= ~_PAGE_HPTEFLAGS;
>   }
> @@ -110,16 +103,7 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> access, unsigned long vsid,
>   return -1;
>   }
>
> - /*
> -  * Insert slot number & secondary bit in PTE second half.
> -  */
> - hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> - rpte.hidx &= ~(0xfUL);
> - *hidxp = rpte.hidx  | (slot & 0xfUL);
> - /*
> -  * check __real_pte for details on matching smp_rmb()
> -  */
> - smp_wmb();
> + new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
>   }
>
>   /*
> -- 
> 1.7.1



Re: [RFC v6 06/62] powerpc: use helper functions in __hash_page_64K() for 64K PTE

2017-07-20 Thread Aneesh Kumar K.V
Ram Pai  writes:

> replace redundant code in __hash_page_64K() with helper
> functions pte_get_hash_gslot() and pte_set_hash_slot()
>

Reviewed-by: Aneesh Kumar K.V 


> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/mm/hash64_64k.c |   24 
>  1 files changed, 4 insertions(+), 20 deletions(-)
>
> diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
> index 0012618..645f621 100644
> --- a/arch/powerpc/mm/hash64_64k.c
> +++ b/arch/powerpc/mm/hash64_64k.c
> @@ -244,7 +244,6 @@ int __hash_page_64K(unsigned long ea, unsigned long 
> access,
>   unsigned long flags, int ssize)
>  {
>   real_pte_t rpte;
> - unsigned long *hidxp;
>   unsigned long hpte_group;
>   unsigned long rflags, pa;
>   unsigned long old_pte, new_pte;
> @@ -289,18 +288,12 @@ int __hash_page_64K(unsigned long ea, unsigned long 
> access,
>
>   vpn  = hpt_vpn(ea, vsid, ssize);
>   if (unlikely(old_pte & H_PAGE_HASHPTE)) {
> - unsigned long hash, slot, hidx;
> -
> - hash = hpt_hash(vpn, shift, ssize);
> - hidx = __rpte_to_hidx(rpte, 0);
> - if (hidx & _PTEIDX_SECONDARY)
> - hash = ~hash;
> - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> - slot += hidx & _PTEIDX_GROUP_IX;
> + unsigned long gslot;
>   /*
>* There MIGHT be an HPTE for this pte
>*/
> - if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
> + gslot = pte_get_hash_gslot(vpn, shift, ssize, rpte, 0);
> + if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, MMU_PAGE_64K,
>  MMU_PAGE_64K, ssize,
>  flags) == -1)
>   old_pte &= ~_PAGE_HPTEFLAGS;
> @@ -350,17 +343,8 @@ int __hash_page_64K(unsigned long ea, unsigned long 
> access,
>   return -1;
>   }
>
> - /*
> -  * Insert slot number & secondary bit in PTE second half.
> -  */
> - hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
> - rpte.hidx &= ~(0xfUL);
> - *hidxp = rpte.hidx  | (slot & 0xfUL);
> - /*
> -  * check __real_pte for details on matching smp_rmb()
> -  */
> - smp_wmb();
>   new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
> + new_pte |= pte_set_hash_slot(ptep, rpte, 0, slot);
>   }
>   *ptep = __pte(new_pte & ~H_PAGE_BUSY);
>   return 0;
> -- 
> 1.7.1



Re: [RFC v6 04/62] powerpc: introduce pte_get_hash_gslot() helper

2017-07-20 Thread Aneesh Kumar K.V
Ram Pai  writes:

> Introduce pte_get_hash_gslot(), which returns the slot number of the
> HPTE in the global hash table.
>
> This function will come in handy as we work towards re-arranging the
> PTE bits in the later patches.
>

Reviewed-by: Aneesh Kumar K.V 

> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/include/asm/book3s/64/hash.h |3 +++
>  arch/powerpc/mm/hash_utils_64.c   |   18 ++
>  2 files changed, 21 insertions(+), 0 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
> b/arch/powerpc/include/asm/book3s/64/hash.h
> index d27f885..277158c 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash.h
> @@ -156,6 +156,9 @@ static inline int hash__pte_none(pte_t pte)
>   return (pte_val(pte) & ~H_PTE_NONE_MASK) == 0;
>  }
>
> +unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
> + int ssize, real_pte_t rpte, unsigned int subpg_index);
> +
>  /* This low level function performs the actual PTE insertion
>   * Setting the PTE depends on the MMU type and other factors. It's
>   * an horrible mess that I'm not going to try to clean up now but
> diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
> index 1b494d0..d3604da 100644
> --- a/arch/powerpc/mm/hash_utils_64.c
> +++ b/arch/powerpc/mm/hash_utils_64.c
> @@ -1591,6 +1591,24 @@ static inline void tm_flush_hash_page(int local)
>  }
>  #endif
>
> +/*
> + * return the global hash slot, corresponding to the given
> + * pte, which contains the hpte.
> + */
> +unsigned long pte_get_hash_gslot(unsigned long vpn, unsigned long shift,
> + int ssize, real_pte_t rpte, unsigned int subpg_index)
> +{
> + unsigned long hash, slot, hidx;
> +
> + hash = hpt_hash(vpn, shift, ssize);
> + hidx = __rpte_to_hidx(rpte, subpg_index);
> + if (hidx & _PTEIDX_SECONDARY)
> + hash = ~hash;
> + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> + slot += hidx & _PTEIDX_GROUP_IX;
> + return slot;
> +}
> +
>  /* WARNING: This is called from hash_low_64.S, if you change this prototype,
>   *  do not forget to update the assembly call site !
>   */
> -- 
> 1.7.1