Re: [PATCH v2 04/10] powerpc/smp: Enable small core scheduling sooner

2020-07-21 Thread Gautham R Shenoy
Hello Srikar,

On Tue, Jul 21, 2020 at 05:08:08PM +0530, Srikar Dronamraju wrote:
> Enable small core scheduling as soon as we detect that we are in a
> system that supports thread group. Doing so would avoid a redundant
> check.

The patch looks good to me. However, I think the commit message still
reflects the v1 code, where we were moving the functionality from
smp_cpus_done() to init_big_cores().

In this version we are moving it into a helper function that collates
all the topology fixups.

> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v1 -> v2:
> powerpc/smp: Enable small core scheduling sooner
>   Restored the previous info msg (Jordan)
>   Moved big core topology fixup to fixup_topology (Gautham)
> 
>  arch/powerpc/kernel/smp.c | 17 +++--
>  1 file changed, 11 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 1ce95da00cb6..72f16dc0cb26 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1370,6 +1370,16 @@ int setup_profiling_timer(unsigned int multiplier)
>   return 0;
>  }
> 
> +static void fixup_topology(void)
> +{
> +#ifdef CONFIG_SCHED_SMT
> + if (has_big_cores) {
> + pr_info("Big cores detected but using small core scheduling\n");
> + powerpc_topology[0].mask = smallcore_smt_mask;
> + }
> +#endif
> +}
> +
>  void __init smp_cpus_done(unsigned int max_cpus)
>  {
>   /*
> @@ -1383,12 +1393,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
> 
>   dump_numa_cpu_topology();
> 
> -#ifdef CONFIG_SCHED_SMT
> - if (has_big_cores) {
> - pr_info("Big cores detected but using small core scheduling\n");
> - powerpc_topology[0].mask = smallcore_smt_mask;
> - }
> -#endif
> + fixup_topology();
>   set_sched_topology(powerpc_topology);
>  }
> 
> -- 
> 2.17.1
> 


[PATCH] selftests/powerpc: Add test of memcmp at end of page

2020-07-21 Thread Michael Ellerman
Update our memcmp selftest, to test the case where we're comparing up
to the end of a page and the subsequent page is not mapped. We have to
make sure we don't read off the end of the page and cause a fault.

We had a bug there in the past, fixed in commit
d9470757398a ("powerpc/64: Fix memcmp reading past the end of src/dest").

Signed-off-by: Michael Ellerman 
---
 .../selftests/powerpc/stringloops/memcmp.c| 40 ++-
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/tools/testing/selftests/powerpc/stringloops/memcmp.c 
b/tools/testing/selftests/powerpc/stringloops/memcmp.c
index b1fa7546957f..e4605ca850dc 100644
--- a/tools/testing/selftests/powerpc/stringloops/memcmp.c
+++ b/tools/testing/selftests/powerpc/stringloops/memcmp.c
@@ -2,6 +2,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "utils.h"
 
@@ -13,6 +14,9 @@
 #define LARGE_MAX_OFFSET 32
 #define LARGE_SIZE_START 4096
 
+/* This is big enough to fit LARGE_SIZE and works on 4K & 64K kernels */
+#define MAP_SIZE (64 * 1024)
+
 #define MAX_OFFSET_DIFF_S1_S2 48
 
 int vmx_count;
@@ -68,25 +72,25 @@ static void test_one(char *s1, char *s2, unsigned long 
max_offset,
 
 static int testcase(bool islarge)
 {
-   char *s1;
-   char *s2;
-   unsigned long i;
-
-   unsigned long comp_size = (islarge ? LARGE_SIZE : SIZE);
-   unsigned long alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
-   int iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
-
-   s1 = memalign(128, alloc_size);
-   if (!s1) {
-   perror("memalign");
-   exit(1);
-   }
+   unsigned long i, comp_size, alloc_size;
+   char *p, *s1, *s2;
+   int iterations;
 
-   s2 = memalign(128, alloc_size);
-   if (!s2) {
-   perror("memalign");
-   exit(1);
-   }
+   comp_size = (islarge ? LARGE_SIZE : SIZE);
+   alloc_size = comp_size + MAX_OFFSET_DIFF_S1_S2;
+   iterations = islarge ? LARGE_ITERATIONS : ITERATIONS;
+
+   p = mmap(NULL, 4 * MAP_SIZE, PROT_READ | PROT_WRITE,
+MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+   FAIL_IF(p == MAP_FAILED);
+
+   /* Put s1/s2 at the end of a page */
+   s1 = p + MAP_SIZE - alloc_size;
+   s2 = p + 3 * MAP_SIZE - alloc_size;
+
+   /* And unmap the subsequent page to force a fault if we overread */
+   munmap(p + MAP_SIZE, MAP_SIZE);
+   munmap(p + 3 * MAP_SIZE, MAP_SIZE);
 
srandom(time(0));
 
-- 
2.25.1



Re: [PATCH v2 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-07-21 Thread Gautham R Shenoy
On Tue, Jul 21, 2020 at 05:08:06PM +0530, Srikar Dronamraju wrote:
> A new sched_domain_topology_level was added just for Power9. However the
> same can be achieved by merging powerpc_topology with power9_topology
> and makes the code more simpler especially when adding a new sched
> domain.
> 
> Cc: linuxppc-dev 
> Cc: LKML 
> Cc: Michael Ellerman 
> Cc: Ingo Molnar 
> Cc: Peter Zijlstra 
> Cc: Valentin Schneider 
> Cc: Nick Piggin 
> Cc: Oliver OHalloran 
> Cc: Nathan Lynch 
> Cc: Michael Neuling 
> Cc: Anton Blanchard 
> Cc: Gautham R Shenoy 
> Cc: Vaidyanathan Srinivasan 
> Cc: Jordan Niethe 
> Signed-off-by: Srikar Dronamraju 
> ---
> Changelog v1 -> v2:
> powerpc/smp: Merge Power9 topology with Power topology
>   Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
>   since cpu_smt_mask is only defined under CONFIG_SCHED_SMT
> 
>  arch/powerpc/kernel/smp.c | 33 ++---
>  1 file changed, 10 insertions(+), 23 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
> index 680c0edcc59d..0e0b118d9b6e 100644
> --- a/arch/powerpc/kernel/smp.c
> +++ b/arch/powerpc/kernel/smp.c
> @@ -1315,7 +1315,7 @@ int setup_profiling_timer(unsigned int multiplier)
>  }
> 
>  #ifdef CONFIG_SCHED_SMT
> -/* cpumask of CPUs with asymetric SMT dependancy */
> +/* cpumask of CPUs with asymmetric SMT dependency */
>  static int powerpc_smt_flags(void)
>  {
>   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
> @@ -1328,14 +1328,6 @@ static int powerpc_smt_flags(void)
>  }
>  #endif
> 
> -static struct sched_domain_topology_level powerpc_topology[] = {
> -#ifdef CONFIG_SCHED_SMT
> - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
> -#endif
> - { cpu_cpu_mask, SD_INIT_NAME(DIE) },
> - { NULL, },
> -};
> -
>  /*
>   * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
>   * This topology makes it *much* cheaper to migrate tasks between adjacent 
> cores
> @@ -1353,7 +1345,13 @@ static int powerpc_shared_cache_flags(void)
>   */
>  static const struct cpumask *shared_cache_mask(int cpu)
>  {
> - return cpu_l2_cache_mask(cpu);
> + if (shared_caches)
> + return cpu_l2_cache_mask(cpu);
> +
> + if (has_big_cores)
> + return cpu_smallcore_mask(cpu);
> +
> + return per_cpu(cpu_sibling_map, cpu);
>  }
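
For reference, after this merge the single topology table presumably ends
up looking something like the sketch below (pieced together from the hunks
above, not a quote of the full patch):

  static struct sched_domain_topology_level powerpc_topology[] = {
  #ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
  #endif
	{ shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
  };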


It might be helpful to enumerate the consequences of this change:

With this patch, on POWER7 and POWER8

   SMT and CACHE domains' cpumasks will both be
   per_cpu(cpu_sibling_map, cpu).

   On POWER7 the SMT level flags are
   (SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING).

   On POWER8 the SMT level flags are
   (SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES).

   On both POWER7 and POWER8, the CACHE level flags are only
   SD_SHARE_PKG_RESOURCES.

   Thus, on both POWER7 and POWER8, since the SMT and CACHE cpumasks
   are the same and since CACHE has no additional flags which SMT does
   not, the parent domain CACHE will be degenerated.

   Hence we will have SMT --> DIE --> NUMA, as we did without the
   patch. So the patch introduces no behavioural change; the only
   change is the additional degeneration of the CACHE domain.

On POWER9 : Baremetal.
   SMT level cpumask = per_cpu(cpu_sibling_map, cpu)

   Since the caches are shared for a pair of two cores,
   CACHE level cpumask = cpu_l2_cache_mask(cpu)

   Thus, we will have SMT --> CACHE --> DIE --> NUMA as before.  No
   behavioural change.

On POWER9 : LPAR
   SMT level cpumask = cpu_smallcore_mask(cpu).

   Since the caches are shared,
   CACHE level cpumask = cpu_l2_cache_mask(cpu).

   Thus, we will have SMT --> CACHE --> DIE --> NUMA as before.  Again
   no change in behaviour.

Reviewed-by: Gautham R. Shenoy 

--
Thanks and Regards
gautham.


Re: [PATCH 5/5] powerpc sstep: Add tests for Prefixed Add Immediate

2020-07-21 Thread Michael Ellerman
Jordan Niethe  writes:
> On Mon, May 25, 2020 at 1:00 PM Jordan Niethe  wrote:
>>
>> Use the existing support for testing compute type instructions to test
>> Prefixed Add Immediate (paddi).  The R bit of the paddi instruction
>> controls whether current instruction address is used. Add test cases for
>> when R=1 and for R=0. paddi has a 34 bit immediate field formed by
>> concatenating si0 and si1. Add tests for the extreme values of this
>> field.
>>
>> Skip the paddi tests if ISA v3.1 is unsupported.
>>
>> Some of these test cases were added by Balamuruhan S.
>>
>> Signed-off-by: Jordan Niethe 
>> ---
>>  arch/powerpc/lib/test_emulate_step.c  | 127 ++
>>  .../lib/test_emulate_step_exec_instr.S|   1 +
>>  2 files changed, 128 insertions(+)
...
>> diff --git a/arch/powerpc/lib/test_emulate_step_exec_instr.S 
>> b/arch/powerpc/lib/test_emulate_step_exec_instr.S
>> index 1580f34f4f4f..aef53ee77a43 100644
>> --- a/arch/powerpc/lib/test_emulate_step_exec_instr.S
>> +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S
>> @@ -81,6 +81,7 @@ _GLOBAL(exec_instr)
>>
>> /* Placeholder for the test instruction */
>>  1: nop
>> +   nop
>> patch_site 1b patch__exec_instr
>>
>> /*
>> --
>> 2.17.1
>>
>
> Because of the alignment requirements of prefixed instructions, the
> noops to be patched need to be aligned.
> mpe, want me to send a new version?

No I'll just squash it in.

> --- a/arch/powerpc/lib/test_emulate_step_exec_instr.S
> +++ b/arch/powerpc/lib/test_emulate_step_exec_instr.S
> @@ -80,6 +80,7 @@ _GLOBAL(exec_instr)
> REST_NVGPRS(r31)
>
> /* Placeholder for the test instruction */
> +.align 6

I'll change it to .balign 64.

>  1: nop
> nop
> patch_site 1b patch__exec_instr
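
So the squashed placeholder should presumably end up reading roughly
(modulo whitespace):

	/* Placeholder for the test instruction */
	.balign	64
1:	nop
	nop
	patch_site 1b patch__exec_instr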


cheers


Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Palmer Dabbelt

On Tue, 21 Jul 2020 21:50:42 PDT (-0700), m...@ellerman.id.au wrote:

Benjamin Herrenschmidt  writes:

On Tue, 2020-07-21 at 16:48 -0700, Palmer Dabbelt wrote:

> Why ? Branch distance limits ? You can't use trampolines ?

Nothing fundamental, it's just that we don't have a large code model in the C
compiler.  As a result all the global symbols are resolved as 32-bit
PC-relative accesses.  We could fix this with a fast large code model, but then
the kernel would need to relax global symbol references in modules and we don't
even do that for the simple code models we have now.  FWIW, some of the
proposed large code models are essentially just split-PLT/GOT and therefore
don't require relaxation, but at that point we're essentially PIC until we
have more than 2GiB of kernel text -- and even then, we keep all the
performance issues.


My memory might be out of date but I *think* we do it on powerpc
without going to a large code model, but just having the in-kernel
linker insert trampolines.


We build modules with the large code model, and always have AFAIK:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/Makefile?commit=4fa640dc52302b5e62b01b05c755b055549633ae#n129

  # -mcmodel=medium breaks modules because it uses 32bit offsets from
  # the TOC pointer to create pointers where possible. Pointers into the
  # percpu data area are created by this method.
  #
  # The kernel module loader relocates the percpu data section from the
  # original location (starting with 0xd...) to somewhere in the base
  # kernel percpu data space (starting with 0xc...). We need a full
  # 64bit relocation for this to work, hence -mcmodel=large.
  KBUILD_CFLAGS_MODULE += -mcmodel=large


Well, a fast large code model would solve a lot of problems :).  Unfortunately
we just don't have enough people working on this stuff to do that.  It's a
somewhat tricky thing to do on RISC-V as there aren't any quick sequences for
long addresses, but I don't think we're that much worse off than everyone else.
At some point I had a bunch of designs written up, but they probably went along
with my SiFive computer.  I think we ended up deciding that the best bet would
be to distribute constant tables throughout the text such that they're
accessible via the 32-bit PC-relative loads at any point -- essentially the
multi-GOT stuff that MIPS used for big objects.  Doing that well is a lot of
work and doing it poorly is just as slow as PIC, so we never got around to it.


We also insert trampolines for branches, but IIUC that's a separate
issue.


"PowerPC branch trampolines" points me here
https://sourceware.org/binutils/docs-2.20/ld/PowerPC-ELF32.html .  That sounds
like what we're doing already in the medium code models: we have short and
medium control transfer sequences, linker relaxation optimizes them when
possible.  Since we rely on linker relaxation pretty heavily we just don't
bother with the smaller code model: it'd be a 12-bit address space for data and
a 21-bit address space for text (with 13-bit maximum function size).  Instead
of building out such a small code model we just spent time improving the linker.
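
(For illustration only, not from this thread: under the medium/medany code
model a global symbol is reached with a 32-bit PC-relative auipc/addi pair,
which is where the +/-2GiB reach mentioned above comes from. The symbol
name below is made up:)

1:	auipc	a0, %pcrel_hi(some_global)	# upper 20 bits, PC-relative
	addi	a0, a0, %pcrel_lo(1b)		# remaining 12 bits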


Re: [PATCH 15/15] powerpc/powernv/sriov: Make single PE mode a per-BAR setting

2020-07-21 Thread Oliver O'Halloran
On Wed, Jul 15, 2020 at 6:00 PM Alexey Kardashevskiy  wrote:
>
> >>>*
> >>> -  * Generally, one M64 BAR maps one IOV BAR. To avoid 
> >>> conflict
> >>> -  * with other devices, IOV BAR size is expanded to be
> >>> -  * (total_pe * VF_BAR_size).  When VF_BAR_size is half of 
> >>> M64
> >>> -  * segment size , the expanded size would equal to half of 
> >>> the
> >>> -  * whole M64 space size, which will exhaust the M64 Space 
> >>> and
> >>> -  * limit the system flexibility.  This is a design decision 
> >>> to
> >>> -  * set the boundary to quarter of the M64 segment size.
> >>> +  * The 1/4 limit is arbitrary and can be tweaked.
> >>>*/
> >>> - if (total_vf_bar_sz > gate) {
> >>> - mul = roundup_pow_of_two(total_vfs);
> >>> - dev_info(>dev,
> >>> - "VF BAR Total IOV size %llx > %llx, roundup 
> >>> to %d VFs\n",
> >>> - total_vf_bar_sz, gate, mul);
> >>> - iov->m64_single_mode = true;
> >>> - break;
> >>> - }
> >>> - }
> >>> + if (vf_bar_sz > (phb->ioda.m64_segsize >> 2)) {
> >>> + /*
> >>> +  * On PHB3, the minimum size alignment of M64 BAR in
> >>> +  * single mode is 32MB. If this VF BAR is smaller 
> >>> than
> >>> +  * 32MB, but still too large for a segmented window
> >>> +  * then we can't map it and need to disable SR-IOV 
> >>> for
> >>> +  * this device.
> >>
> >>
> >> Why not use single PE mode for such BAR? Better than nothing.
> >
> > Suppose you could, but I figured VFs were mainly interesting since you
> > could give each VF to a separate guest. If there's multiple VFs under
> > the same single PE BAR then they'd have to be assigned to the same
>
> True. But with one PE per VF we can still have 15 (or 14?) isolated VFs
> which is not hundreds but better than 0.

We can only use single PE BARs if the per-VF size is >= 32MB due to
the alignment requirements on P8. If the per-VF size is smaller then
we're stuck with multiple VFs inside the same BAR which is bad due to
the PAPR requirements mentioned below. Sure we could look at doing
something else, but considering this matches the current behaviour
it's a bit hard to care...

> > guest in order to retain the freeze/unfreeze behaviour that PAPR
> > requires. I guess that's how it used to work, but it seems better just
> > to disable them rather than having VFs which sort of work.
>
> Well, realistically the segment size should be 8MB to make this matter
> (or the whole window 2GB) which does not seem to happen so it does not
> matter.

I'm not sure what you mean.


Re: [RFC PATCH] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register

2020-07-21 Thread Paul Mackerras
On Thu, Jul 16, 2020 at 01:32:13AM -0700, Ram Pai wrote:
> An instruction accessing a mmio address, generates a HDSI fault.  This fault 
> is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, 
> the
> fault is delivered to the ultravisor.
> 
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.
> 
> This problem can be correctly solved with some help from the kernel.
> 
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.

Just a comment on the approach of putting the instruction in SPRG0:
these I/O accessors can be used in interrupt routines, which means
that if these accessors are ever used with interrupts enabled, there
is the possibility of an external interrupt occurring between the
instruction that sets SPRG0 and the load/store instruction that
faults.  If the handler for that interrupt itself does an I/O access,
it will overwrite SPRG0, corrupting the value set by the interrupted
code.

The choices to fix that would seem to be (a) disable interrupts around
all I/O accesses, (b) have the accessor save and restore SPRG0, or (c)
solve the problem another way, such as by doing a H_LOGICAL_CI_LOAD
or H_LOGICAL_CI_STORE hypercall.
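
For illustration, option (a) might look roughly like the sketch below. The
function name (svm_mmio_readl) and the insn_image argument are made up;
only the existing kernel helpers are real:

static u32 svm_mmio_readl(const volatile void __iomem *addr,
			  unsigned long insn_image)
{
	unsigned long flags;
	u32 val;

	local_irq_save(flags);		/* no interrupt can clobber SPRG0 here */
	mtspr(SPRN_SPRG0, insn_image);	/* image of the instruction that will fault */
	val = in_be32(addr);		/* MMIO load; the HDSI is delivered to the UV */
	local_irq_restore(flags);

	return val;
}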

Paul.


Re: [PATCH 05/15] powerpc/powernv/sriov: Move SR-IOV into a seperate file

2020-07-21 Thread Oliver O'Halloran
On Tue, Jul 14, 2020 at 7:16 PM Alexey Kardashevskiy  wrote:
>
> On 10/07/2020 15:23, Oliver O'Halloran wrote:
> > + align = pci_iov_resource_size(pdev, resno);
> > +
> > + /*
> > +  * iov can be null if we have an SR-IOV device with IOV BAR that can't
> > +  * be placed in the m64 space (i.e. The BAR is 32bit or non-prefetch).
> > +  * In that case we don't allow VFs to be enabled so just return the
> > +  * default alignment.
> > +  */
> > + if (!iov)
> > + return align;
>
>
> This is the new chunk. What would happen before? Non-prefetch BAR would
> still go to m64 space?

I don't think there's any real change. Currently if the setup in
pnv_pci_ioda_fixup_iov_resources() fails then pdn->vfs_expanded will
be zero. The !iov check here fills the same role, but it's more
explicit. vfs_expanded has some other behaviour too so we can't get
rid of it entirely (yet).


Re: [PATCHv3 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents

2020-07-21 Thread Michael Ellerman
Pingfan Liu  writes:
> A bug is observed on pseries by taking the following steps on rhel:
^
RHEL

I assume it happens on mainline too?

> -1. drmgr -c mem -r -q 5
> -2. echo c > /proc/sysrq-trigger
>
> And then, the failure looks like:
> kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
> kdump: saving vmcore-dmesg.txt
> kdump: saving vmcore-dmesg.txt complete
> kdump: saving vmcore
>  Checking for memory holes : [  0.0 %] /  
>  Checking for memory holes : [100.0 %] |  
>  Excluding unnecessary pages   : [100.0 %] \  
>  Copying data  : [  0.3 %] -  
> eta: 38s[   44.337636] hash-mmu: mm: Hashing failure ! 
> EA=0x7fffba40 access=0x8004 current=makedumpfile
> [   44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
> psize 2 pte=0xc0005504
> [   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
> access=0x8004 current=makedumpfile
> [   44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
> psize 2 pte=0xc0005504
> [   44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip 
> 7fffbbc4d7fc lr 00011356ca3c code 2
> [   44.338548] Core dump to |/bin/false pipe failed
> /lib/kdump-lib-initramfs.sh: line 98:   469 Bus error   
> $CORE_COLLECTOR /proc/vmcore 
> $_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
> kdump: saving vmcore failed
>
> * Root cause *
>   After analyzing, it turns out that in the current implementation,
> when hot-removing lmb, the KOBJ_REMOVE event ejects before the dt updating as
> the code __remove_memory() comes before drmem_update_dt().
> So in kdump kernel, when read_from_oldmem() resorts to
> pSeries_lpar_hpte_insert() to install hpte, but fails with -2 due to
> non-exist pfn. And finally, low_hash_fault() raise SIGBUS to process, as it
> can be observed "Bus error"
>
> From a viewpoint of listener and publisher, the publisher notifies the
> listener before data is ready.  This introduces a problem where udev
> launches kexec-tools (due to KOBJ_REMOVE) and loads a stale dt before
> updating. And in capture kernel, makedumpfile will access the memory based
> on the stale dt info, and hit a SIGBUS error due to an un-existed lmb.
>
> * Fix *
>   In order to fix this issue, update dt before __remove_memory(), and
> accordingly the same rule in hot-add path.
>
> This will introduce extra dt updating payload for each involved lmb when 
> hotplug.
> But it should be fine since drmem_update_dt() is memory based operation and
> hotplug is not a hot path.
>
> Signed-off-by: Pingfan Liu 
> Cc: Michael Ellerman 
> Cc: Hari Bathini 
> Cc: Nathan Lynch 
> To: linuxppc-dev@lists.ozlabs.org
> Cc: ke...@lists.infradead.org
> ---
> v2 -> v3: rebase onto ppc next-test branch
> ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 1a3ac3b..def8cb3f 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -372,6 +372,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
>   invalidate_lmb_associativity_index(lmb);
>   lmb_clear_nid(lmb);
>   lmb->flags &= ~DRCONF_MEM_ASSIGNED;
> + drmem_update_dt();

No error checking?

>   __remove_memory(nid, base_addr, block_sz);
>  
> @@ -607,6 +608,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
>  
>   lmb_set_nid(lmb);
>   lmb->flags |= DRCONF_MEM_ASSIGNED;
> + drmem_update_dt();

And here ..
>  
>   block_sz = memory_block_size_bytes();
>  
> @@ -625,6 +627,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
>   invalidate_lmb_associativity_index(lmb);
>   lmb_clear_nid(lmb);
>   lmb->flags &= ~DRCONF_MEM_ASSIGNED;
> + drmem_update_dt();


And here ..

>   __remove_memory(nid, base_addr, block_sz);
>   }
> @@ -877,9 +880,6 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
>   break;
>   }
>  
> - if (!rc)
> - rc = drmem_update_dt();
> -
>   unlock_device_hotplug();
>   return rc;

Whereas previously we did check it.
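
Something like the following would keep the error visible at each call
site (a sketch only; the exact handling is up to you):

	rc = drmem_update_dt();
	if (rc) {
		pr_err("%s: failed to update device tree (%d)\n", __func__, rc);
		return rc;
	}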


cheers


Re: [v3 02/15] KVM: PPC: Book3S HV: Cleanup updates for kvm vcpu MMCR

2020-07-21 Thread Paul Mackerras
On Wed, Jul 22, 2020 at 07:39:26AM +0530, Athira Rajeev wrote:
> 
> 
> > On 21-Jul-2020, at 9:24 AM, Paul Mackerras  wrote:
> > 
> > On Fri, Jul 17, 2020 at 10:38:14AM -0400, Athira Rajeev wrote:
> >> Currently `kvm_vcpu_arch` stores all Monitor Mode Control registers
> >> in a flat array in order: mmcr0, mmcr1, mmcra, mmcr2, mmcrs
> >> Split this to give mmcra and mmcrs its own entries in vcpu and
> >> use a flat array for mmcr0 to mmcr2. This patch implements this
> >> cleanup to make code easier to read.
> > 
> > Changing the way KVM stores these values internally is fine, but
> > changing the user ABI is not.  This part:
> > 
> >> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
> >> b/arch/powerpc/include/uapi/asm/kvm.h
> >> index 264e266..e55d847 100644
> >> --- a/arch/powerpc/include/uapi/asm/kvm.h
> >> +++ b/arch/powerpc/include/uapi/asm/kvm.h
> >> @@ -510,8 +510,8 @@ struct kvm_ppc_cpu_char {
> >> 
> >> #define KVM_REG_PPC_MMCR0  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
> >> #define KVM_REG_PPC_MMCR1  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
> >> -#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
> >> -#define KVM_REG_PPC_MMCR2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
> >> +#define KVM_REG_PPC_MMCR2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
> >> +#define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
> > 
> > means that existing userspace programs that used to work would now be
> > broken.  That is not acceptable (breaking the user ABI is only ever
> > acceptable with a very compelling reason).  So NAK to this part of the
> > patch.
> 
> Hi Paul
> 
> Thanks for checking the patch. I understood your point on user ABI breakage 
> that this particular change can cause.
> I will retain original KVM_REG_PPC_MMCRA and KVM_REG_PPC_MMCR2 order in 
> `kvm.h`
> And with that, additionally I will need below change ( on top of current 
> patch ) for my clean up updates for kvm cpu MMCR to work,
> Because now mmcra and mmcrs will have its own entries in vcpu and is not part 
> of the mmcr[] array
> Please suggest if this looks good

Yes, that looks fine.

By the way, is the new MMCRS register here at all related to the MMCRS
that there used to be on POWER8, but which wasn't present (as far as I
know) on POWER9?

Paul.


Re: [PATCH v4 2/3] powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable

2020-07-21 Thread Gautham R Shenoy
On Tue, Jul 21, 2020 at 09:07:07PM +0530, Pratik Rajesh Sampat wrote:
> Replace the variable name from using "pnv_first_spr_loss_level" to
> "deep_spr_loss_state".
> 
> pnv_first_spr_loss_level is supposed to be the earliest state that
> has OPAL_PM_LOSE_FULL_CONTEXT set, in other places the kernel uses the
> "deep" states as terminology. Hence renaming the variable to be coherent
> to its semantics.
> 
> Signed-off-by: Pratik Rajesh Sampat 

Acked-by: Gautham R. Shenoy 

> ---
>  arch/powerpc/platforms/powernv/idle.c | 18 +-
>  1 file changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/idle.c 
> b/arch/powerpc/platforms/powernv/idle.c
> index 642abf0b8329..28462d59a8e1 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -48,7 +48,7 @@ static bool default_stop_found;
>   * First stop state levels when SPR and TB loss can occur.
>   */
>  static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
> -static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
> +static u64 deep_spr_loss_state = MAX_STOP_STATE + 1;
> 
>  /*
>   * psscr value and mask of the deepest stop idle state.
> @@ -657,7 +657,7 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
> */
>   mmcr0   = mfspr(SPRN_MMCR0);
>   }
> - if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
> + if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
>   sprs.lpcr   = mfspr(SPRN_LPCR);
>   sprs.hfscr  = mfspr(SPRN_HFSCR);
>   sprs.fscr   = mfspr(SPRN_FSCR);
> @@ -741,7 +741,7 @@ static unsigned long power9_idle_stop(unsigned long 
> psscr, bool mmu_on)
>* just always test PSSCR for SPR/TB state loss.
>*/
>   pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
> - if (likely(pls < pnv_first_spr_loss_level)) {
> + if (likely(pls < deep_spr_loss_state)) {
>   if (sprs_saved)
>   atomic_stop_thread_idle();
>   goto out;
> @@ -1088,7 +1088,7 @@ static void __init pnv_power9_idle_init(void)
>* the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
>*/
>   pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
> - pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
> + deep_spr_loss_state = MAX_STOP_STATE + 1;
>   for (i = 0; i < nr_pnv_idle_states; i++) {
>   int err;
>   struct pnv_idle_states_t *state = &pnv_idle_states[i];
> @@ -1099,8 +1099,8 @@ static void __init pnv_power9_idle_init(void)
>   pnv_first_tb_loss_level = psscr_rl;
> 
>   if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
> -  (pnv_first_spr_loss_level > psscr_rl))
> - pnv_first_spr_loss_level = psscr_rl;
> +  (deep_spr_loss_state > psscr_rl))
> + deep_spr_loss_state = psscr_rl;
> 
>   /*
>* The idle code does not deal with TB loss occurring
> @@ -,8 +,8 @@ static void __init pnv_power9_idle_init(void)
>* compatibility.
>*/
>   if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
> -  (pnv_first_spr_loss_level > psscr_rl))
> - pnv_first_spr_loss_level = psscr_rl;
> +  (deep_spr_loss_state > psscr_rl))
> + deep_spr_loss_state = psscr_rl;
> 
> 	err = validate_psscr_val_mask(&state->psscr_val,
> 				      &state->psscr_mask,
> @@ -1158,7 +1158,7 @@ static void __init pnv_power9_idle_init(void)
>   }
> 
>   pr_info("cpuidle-powernv: First stop level that may lose SPRs = 
> 0x%llx\n",
> - pnv_first_spr_loss_level);
> + deep_spr_loss_state);
> 
>   pr_info("cpuidle-powernv: First stop level that may lose timebase = 
> 0x%llx\n",
>   pnv_first_tb_loss_level);
> -- 
> 2.25.4
> 


Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Michael Ellerman
Benjamin Herrenschmidt  writes:
> On Tue, 2020-07-21 at 16:48 -0700, Palmer Dabbelt wrote:
>> > Why ? Branch distance limits ? You can't use trampolines ?
>> 
>> Nothing fundamental, it's just that we don't have a large code model in the C
>> compiler.  As a result all the global symbols are resolved as 32-bit
>> PC-relative accesses.  We could fix this with a fast large code model, but 
>> then
>> the kernel would need to relax global symbol references in modules and we 
>> don't
>> even do that for the simple code models we have now.  FWIW, some of the
>> proposed large code models are essentially just split-PLT/GOT and therefore
>> don't require relaxation, but at that point we're essentially PIC until we
>> have more than 2GiB of kernel text -- and even then, we keep all the
>> performance issues.
>
> My memory might be out of date but I *think* we do it on powerpc
> without going to a large code model, but just having the in-kernel
> linker insert trampolines.

We build modules with the large code model, and always have AFAIK:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/Makefile?commit=4fa640dc52302b5e62b01b05c755b055549633ae#n129

  # -mcmodel=medium breaks modules because it uses 32bit offsets from
  # the TOC pointer to create pointers where possible. Pointers into the
  # percpu data area are created by this method.
  #
  # The kernel module loader relocates the percpu data section from the
  # original location (starting with 0xd...) to somewhere in the base
  # kernel percpu data space (starting with 0xc...). We need a full
  # 64bit relocation for this to work, hence -mcmodel=large.
  KBUILD_CFLAGS_MODULE += -mcmodel=large


We also insert trampolines for branches, but IIUC that's a separate
issue.

cheers


Re: [v3 07/15] powerpc/perf: Add power10_feat to dt_cpu_ftrs

2020-07-21 Thread Jordan Niethe
On Sat, Jul 18, 2020 at 1:13 AM Athira Rajeev
 wrote:
>
> From: Madhavan Srinivasan 
>
> Add power10 feature function to dt_cpu_ftrs.c along
> with a power10 specific init() to initialize pmu sprs,
> sets the oprofile_cpu_type and cpu_features. This will
> enable performance monitoring unit(PMU) for Power10
> in CPU features with "performance-monitor-power10".
>
> For PowerISA v3.1, BHRB disable is controlled via Monitor Mode
> Control Register A (MMCRA) bit, namely "BHRB Recording Disable
> (BHRBRD)". This patch initializes MMCRA BHRBRD to disable BHRB
> feature at boot for power10.
>
> Signed-off-by: Madhavan Srinivasan 
> ---
>  arch/powerpc/include/asm/reg.h|  3 +++
>  arch/powerpc/kernel/cpu_setup_power.S |  8 
>  arch/powerpc/kernel/dt_cpu_ftrs.c | 26 ++
>  3 files changed, 37 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index 21a1b2d..900ada1 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -1068,6 +1068,9 @@
>  #define MMCR0_PMC2_LOADMISSTIME0x5
>  #endif
>
> +/* BHRB disable bit for PowerISA v3.10 */
> +#define MMCRA_BHRB_DISABLE 0x0020
Shouldn't this go under SPRN_MMCRA with the other MMCRA_* definitions?
> +
>  /*
>   * SPRG usage:
>   *
> diff --git a/arch/powerpc/kernel/cpu_setup_power.S 
> b/arch/powerpc/kernel/cpu_setup_power.S
> index efdcfa7..b8e0d1e 100644
> --- a/arch/powerpc/kernel/cpu_setup_power.S
> +++ b/arch/powerpc/kernel/cpu_setup_power.S
> @@ -94,6 +94,7 @@ _GLOBAL(__restore_cpu_power8)
>  _GLOBAL(__setup_cpu_power10)
> mflrr11
> bl  __init_FSCR_power10
> +   bl  __init_PMU_ISA31
So we set MMCRA here but then aren't we still going to call __init_PMU
which will overwrite that?
Would this setting MMCRA also need to be handled in __restore_cpu_power10?
> b   1f
>
>  _GLOBAL(__setup_cpu_power9)
> @@ -233,3 +234,10 @@ __init_PMU_ISA207:
> li  r5,0
> mtspr   SPRN_MMCRS,r5
> blr
> +
> +__init_PMU_ISA31:
> +   li  r5,0
> +   mtspr   SPRN_MMCR3,r5
> +   LOAD_REG_IMMEDIATE(r5, MMCRA_BHRB_DISABLE)
> +   mtspr   SPRN_MMCRA,r5
> +   blr
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
> b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index 3a40951..f482286 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -450,6 +450,31 @@ static int __init feat_enable_pmu_power9(struct 
> dt_cpu_feature *f)
> return 1;
>  }
>
> +static void init_pmu_power10(void)
> +{
> +   init_pmu_power9();
> +
> +   mtspr(SPRN_MMCR3, 0);
> +   mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
> +}
> +
> +static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f)
> +{
> +   hfscr_pmu_enable();
> +
> +   init_pmu_power10();
> +   init_pmu_registers = init_pmu_power10;
> +
> +   cur_cpu_spec->cpu_features |= CPU_FTR_MMCRA;
> +   cur_cpu_spec->cpu_user_features |= PPC_FEATURE_PSERIES_PERFMON_COMPAT;
> +
> +   cur_cpu_spec->num_pmcs  = 6;
> +   cur_cpu_spec->pmc_type  = PPC_PMC_IBM;
> +   cur_cpu_spec->oprofile_cpu_type = "ppc64/power10";
> +
> +   return 1;
> +}
> +
>  static int __init feat_enable_tm(struct dt_cpu_feature *f)
>  {
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> @@ -639,6 +664,7 @@ struct dt_cpu_feature_match {
> {"pc-relative-addressing", feat_enable, 0},
> {"machine-check-power9", feat_enable_mce_power9, 0},
> {"performance-monitor-power9", feat_enable_pmu_power9, 0},
> +   {"performance-monitor-power10", feat_enable_pmu_power10, 0},
> {"event-based-branch-v3", feat_enable, 0},
> {"random-number-generator", feat_enable, 0},
> {"system-call-vectored", feat_disable, 0},
> --
> 1.8.3.1
>


Re: [v3 02/15] KVM: PPC: Book3S HV: Cleanup updates for kvm vcpu MMCR

2020-07-21 Thread Michael Ellerman
Paul Mackerras  writes:
> On Fri, Jul 17, 2020 at 10:38:14AM -0400, Athira Rajeev wrote:
>> Currently `kvm_vcpu_arch` stores all Monitor Mode Control registers
>> in a flat array in order: mmcr0, mmcr1, mmcra, mmcr2, mmcrs
>> Split this to give mmcra and mmcrs its own entries in vcpu and
>> use a flat array for mmcr0 to mmcr2. This patch implements this
>> cleanup to make code easier to read.
>
> Changing the way KVM stores these values internally is fine, but
> changing the user ABI is not.  This part:
>
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
>> b/arch/powerpc/include/uapi/asm/kvm.h
>> index 264e266..e55d847 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -510,8 +510,8 @@ struct kvm_ppc_cpu_char {
>>  
>>  #define KVM_REG_PPC_MMCR0   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
>>  #define KVM_REG_PPC_MMCR1   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
>> -#define KVM_REG_PPC_MMCRA   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
>> -#define KVM_REG_PPC_MMCR2   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
>> +#define KVM_REG_PPC_MMCR2   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
>> +#define KVM_REG_PPC_MMCRA   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
>
> means that existing userspace programs that used to work would now be
> broken.  That is not acceptable (breaking the user ABI is only ever
> acceptable with a very compelling reason).  So NAK to this part of the
> patch.

I assume we don't have a KVM unit test that would have caught this?

cheers


Re: [v3 02/15] KVM: PPC: Book3S HV: Cleanup updates for kvm vcpu MMCR

2020-07-21 Thread Michael Ellerman
Athira Rajeev  writes:
>> On 21-Jul-2020, at 9:24 AM, Paul Mackerras  wrote:
>> On Fri, Jul 17, 2020 at 10:38:14AM -0400, Athira Rajeev wrote:
>>> Currently `kvm_vcpu_arch` stores all Monitor Mode Control registers
>>> in a flat array in order: mmcr0, mmcr1, mmcra, mmcr2, mmcrs
>>> Split this to give mmcra and mmcrs its own entries in vcpu and
>>> use a flat array for mmcr0 to mmcr2. This patch implements this
>>> cleanup to make code easier to read.
>> 
>> Changing the way KVM stores these values internally is fine, but
>> changing the user ABI is not.  This part:
>> 
>>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
>>> b/arch/powerpc/include/uapi/asm/kvm.h
>>> index 264e266..e55d847 100644
>>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>>> @@ -510,8 +510,8 @@ struct kvm_ppc_cpu_char {
>>> 
>>> #define KVM_REG_PPC_MMCR0   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
>>> #define KVM_REG_PPC_MMCR1   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
>>> -#define KVM_REG_PPC_MMCRA  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
>>> -#define KVM_REG_PPC_MMCR2  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
>>> +#define KVM_REG_PPC_MMCR2  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
>>> +#define KVM_REG_PPC_MMCRA  (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
>> 
>> means that existing userspace programs that used to work would now be
>> broken.  That is not acceptable (breaking the user ABI is only ever
>> acceptable with a very compelling reason).  So NAK to this part of the
>> patch.
>
> Hi Paul
>
> Thanks for checking the patch. I understood your point on user ABI breakage 
> that this particular change can cause.
> I will retain original KVM_REG_PPC_MMCRA and KVM_REG_PPC_MMCR2 order in 
> `kvm.h`
> And with that, additionally I will need below change ( on top of current 
> patch ) for my clean up updates for kvm cpu MMCR to work,
> Because now mmcra and mmcrs will have its own entries in vcpu and is not part 
> of the mmcr[] array
> Please suggest if this looks good

I did the same patch I think in my testing branch, it's here:

https://github.com/linuxppc/linux/commit/daea78154eff1b7e2f36be05a8f95feb5a588912


Can you please check that matches what you sent.

cheers

> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3f90eee261fc..b10bb404f0d5 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1679,10 +1679,13 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu 
> *vcpu, u64 id,
> case KVM_REG_PPC_UAMOR:
> *val = get_reg_val(id, vcpu->arch.uamor);
> break;
> -   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR2:
> +   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
> i = id - KVM_REG_PPC_MMCR0;
> *val = get_reg_val(id, vcpu->arch.mmcr[i]);
> break;
> +   case KVM_REG_PPC_MMCR2:
> +   *val = get_reg_val(id, vcpu->arch.mmcr[2]);
> +   break;
> case KVM_REG_PPC_MMCRA:
> *val = get_reg_val(id, vcpu->arch.mmcra);
> break;
> @@ -1906,10 +1909,13 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu 
> *vcpu, u64 id,
> case KVM_REG_PPC_UAMOR:
> vcpu->arch.uamor = set_reg_val(id, *val);
> break;
> -   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR2:
> +   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
> i = id - KVM_REG_PPC_MMCR0;
> vcpu->arch.mmcr[i] = set_reg_val(id, *val);
> break;
> +   case KVM_REG_PPC_MMCR2:
> +   vcpu->arch.mmcr[2] = set_reg_val(id, *val);
> +   break;
> case KVM_REG_PPC_MMCRA:
> vcpu->arch.mmcra = set_reg_val(id, *val);
> break;
> —
>
>
>> 
>> Regards,
>> Paul.


[PATCH v2 14/14] powerpc/eeh: Move PE tree setup into the platform

2020-07-21 Thread Oliver O'Halloran
The EEH core has a concept of a "PE tree" to support PowerNV. The PE tree
follows the PCI bus structures because a reset asserted on an upstream
bridge will be propagated to the downstream bridges. On pseries there's a
1-1 correspondence between what the guest sees as a PHB and a PE, so the
"tree" is really just a single node.

Currently the EEH core is responsible for setting up this PE tree, which
it does by traversing the pci_dn tree. The structure of the pci_dn tree
matches the bus tree on PowerNV, which leads to the PE tree being
"correct", but this setup method doesn't make a whole lot of sense and
it's actively confusing for the pseries case, where it doesn't really do
anything.

We want to remove the dependence on pci_dn anyway, so this patch moves
the choice of where to insert a new PE into the platform code rather
than keeping it in the generic EEH code. For PowerNV this simplifies the
tree building logic and removes the use of pci_dn. For pseries we
keep the existing logic. I'm not really convinced it does anything
due to the 1-1 PE-to-PHB correspondence so every device under that
PHB should be in the same PE, but I'd rather not remove it entirely
until we've had a chance to look at it more deeply.
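
(As an aside, a hedged sketch of how a platform now uses the interface;
not a quote of the hunks below. The platform probe path picks the parent
PE and passes it in, with NULL meaning "directly under the PHB":)

	struct eeh_pe *parent;

	/* e.g. on pseries everything can simply hang off the PHB PE */
	parent = eeh_phb_pe_get(hose);
	eeh_pe_tree_insert(edev, parent);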

Signed-off-by: Oliver O'Halloran 
Reviewed-by: Alexey Kardashevskiy 
---
v2: Reworked pseries PE setup slightly. NOT DONE YET. mostly done needs test
---
 arch/powerpc/include/asm/eeh.h   |  2 +-
 arch/powerpc/kernel/eeh_pe.c | 70 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c | 27 +++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 66 +++---
 4 files changed, 102 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index df9462230e75..187c23324d96 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -283,7 +283,7 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
  int pe_no, int config_addr);
-int eeh_pe_tree_insert(struct eeh_dev *edev);
+int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent);
 int eeh_pe_tree_remove(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 898205829a8f..ea2f8b362d18 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -318,53 +318,20 @@ struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
return pe;
 }
 
-/**
- * eeh_pe_get_parent - Retrieve the parent PE
- * @edev: EEH device
- *
- * The whole PEs existing in the system are organized as hierarchy
- * tree. The function is used to retrieve the parent PE according
- * to the parent EEH device.
- */
-static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
-{
-   struct eeh_dev *parent;
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
-   /*
-* It might have the case for the indirect parent
-* EEH device already having associated PE, but
-* the direct parent EEH device doesn't have yet.
-*/
-   if (edev->physfn)
-   pdn = pci_get_pdn(edev->physfn);
-   else
-   pdn = pdn ? pdn->parent : NULL;
-   while (pdn) {
-   /* We're poking out of PCI territory */
-   parent = pdn_to_eeh_dev(pdn);
-   if (!parent)
-   return NULL;
-
-   if (parent->pe)
-   return parent->pe;
-
-   pdn = pdn->parent;
-   }
-
-   return NULL;
-}
-
 /**
  * eeh_pe_tree_insert - Add EEH device to parent PE
  * @edev: EEH device
+ * @new_pe_parent: PE to create additional PEs under
  *
- * Add EEH device to the parent PE. If the parent PE already
- * exists, the PE type will be changed to EEH_PE_BUS. Otherwise,
- * we have to create new PE to hold the EEH device and the new
- * PE will be linked to its parent PE as well.
+ * Add EEH device to the PE in edev->pe_config_addr. If a PE already
+ * exists with that address then @edev is added to that PE. Otherwise
+ * a new PE is created and inserted into the PE tree as a child of
+ * @new_pe_parent.
+ *
+ * If @new_pe_parent is NULL then the new PE will be inserted
+ * directly under the PHB.
  */
-int eeh_pe_tree_insert(struct eeh_dev *edev)
+int eeh_pe_tree_insert(struct eeh_dev *edev, struct eeh_pe *new_pe_parent)
 {
struct pci_controller *hose = edev->controller;
struct eeh_pe *pe, *parent;
@@ -399,7 +366,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
}
 
eeh_edev_dbg(edev,
-"Added to device PE (parent: PE#%x)\n",
+"Added to existing PE (parent: PE#%x)\n",
   

[PATCH v2 13/14] powerpc/eeh: Drop pdn use in eeh_pe_tree_insert()

2020-07-21 Thread Oliver O'Halloran
This is mostly just to make the subsequent diffs less noisy. No functional
changes.

One thing that needs calling out is the removal of the "config_addr"
variable and replacing it with edev->bdfn. The contents of edev->bdfn are
the same, however it's worth pointing out that what RTAS calls a
"config_addr" isn't the same as the bdfn. The config_addr is supposed to
be:  with each field being an 8 bit number. Various parts
of the EEH code use BDFN and "config_addr" as interchangeable quantities
even though they aren't really.
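
(For reference, not part of the patch: the bdfn used here is the usual
(bus << 8) | devfn packing, so the pieces come back out with the standard
helpers:)

	u8 bus  = edev->bdfn >> 8;		/* bus number */
	u8 slot = PCI_SLOT(edev->bdfn);		/* device, bits 7:3 of devfn */
	u8 func = PCI_FUNC(edev->bdfn);		/* function, bits 2:0 of devfn */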

Signed-off-by: Oliver O'Halloran 
---
v2: no changes
---
 arch/powerpc/kernel/eeh_pe.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 97bf09db2ecd..898205829a8f 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -366,9 +366,8 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
  */
 int eeh_pe_tree_insert(struct eeh_dev *edev)
 {
+   struct pci_controller *hose = edev->controller;
struct eeh_pe *pe, *parent;
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-   int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
/* Check if the PE number is valid */
if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
@@ -382,7 +381,7 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
 * PE should be composed of PCI bus and its subordinate
 * components.
 */
-   pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
+   pe = eeh_pe_get(hose, edev->pe_config_addr, edev->bdfn);
if (pe) {
if (pe->type & EEH_PE_INVALID) {
list_add_tail(&edev->entry, &pe->edevs);
@@ -416,15 +415,15 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
 
/* Create a new EEH PE */
if (edev->physfn)
-   pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF);
+   pe = eeh_pe_alloc(hose, EEH_PE_VF);
else
-   pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE);
+   pe = eeh_pe_alloc(hose, EEH_PE_DEVICE);
if (!pe) {
pr_err("%s: out of memory!\n", __func__);
return -ENOMEM;
}
pe->addr= edev->pe_config_addr;
-   pe->config_addr = config_addr;
+   pe->config_addr = edev->bdfn;
 
/*
 * Put the new EEH PE into hierarchy tree. If the parent
@@ -434,10 +433,10 @@ int eeh_pe_tree_insert(struct eeh_dev *edev)
 */
parent = eeh_pe_get_parent(edev);
if (!parent) {
-   parent = eeh_phb_pe_get(pdn->phb);
+   parent = eeh_phb_pe_get(hose);
if (!parent) {
pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
-   __func__, pdn->phb->global_number);
+   __func__, hose->global_number);
edev->pe = NULL;
kfree(pe);
return -EEXIST;
-- 
2.26.2



[PATCH v2 12/14] powerpc/eeh: Rename eeh_{add_to|remove_from}_parent_pe()

2020-07-21 Thread Oliver O'Halloran
The naming of eeh_{add_to|remove_from}_parent_pe() doesn't really reflect
what they actually do. If the PE referred to by edev->pe_config_addr
already exists under that PHB then the edev is added to that PE. However,
if the PE doesn't exist then a new one is created for the edev.

The bulk of the implementation of eeh_add_to_parent_pe() covers that
second case. Similarly, most of eeh_remove_from_parent_pe() is
determining when it's safe to delete a PE.

Signed-off-by: Oliver O'Halloran 
---
v2: no changes
---
 arch/powerpc/include/asm/eeh.h   | 4 ++--
 arch/powerpc/kernel/eeh.c| 4 ++--
 arch/powerpc/kernel/eeh_driver.c | 2 +-
 arch/powerpc/kernel/eeh_pe.c | 8 
 arch/powerpc/kernel/pci_dn.c | 2 +-
 arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 8 
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 0d99aad8d9b7..df9462230e75 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -283,8 +283,8 @@ struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
 struct eeh_pe *eeh_pe_next(struct eeh_pe *pe, struct eeh_pe *root);
 struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
  int pe_no, int config_addr);
-int eeh_add_to_parent_pe(struct eeh_dev *edev);
-int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
+int eeh_pe_tree_insert(struct eeh_dev *edev);
+int eeh_pe_tree_remove(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
 void *eeh_pe_traverse(struct eeh_pe *root,
  eeh_pe_traverse_func fn, void *flag);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index f203ffc5c57d..94682382fc8c 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1107,7 +1107,7 @@ void eeh_probe_device(struct pci_dev *dev)
 * FIXME: HEY MA, LOOK AT ME, NO LOCKING!
 */
if (edev->pdev && edev->pdev != dev) {
-   eeh_rmv_from_parent_pe(edev);
+   eeh_pe_tree_remove(edev);
eeh_addr_cache_rmv_dev(edev->pdev);
eeh_sysfs_remove_device(edev->pdev);
 
@@ -1186,7 +1186,7 @@ void eeh_remove_device(struct pci_dev *dev)
edev->in_error = false;
dev->dev.archdata.edev = NULL;
if (!(edev->pe->state & EEH_PE_KEEP))
-   eeh_rmv_from_parent_pe(edev);
+   eeh_pe_tree_remove(edev);
else
edev->mode |= EEH_DEV_DISCONNECTED;
 }
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b84d3cb2532e..4197e4559f65 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -542,7 +542,7 @@ static void *eeh_pe_detach_dev(struct eeh_pe *pe, void 
*userdata)
continue;
 
edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
-   eeh_rmv_from_parent_pe(edev);
+   eeh_pe_tree_remove(edev);
}
 
return NULL;
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index f20fb0ee6aec..97bf09db2ecd 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -356,7 +356,7 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
 }
 
 /**
- * eeh_add_to_parent_pe - Add EEH device to parent PE
+ * eeh_pe_tree_insert - Add EEH device to parent PE
  * @edev: EEH device
  *
  * Add EEH device to the parent PE. If the parent PE already
@@ -364,7 +364,7 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
  * we have to create new PE to hold the EEH device and the new
  * PE will be linked to its parent PE as well.
  */
-int eeh_add_to_parent_pe(struct eeh_dev *edev)
+int eeh_pe_tree_insert(struct eeh_dev *edev)
 {
struct eeh_pe *pe, *parent;
struct pci_dn *pdn = eeh_dev_to_pdn(edev);
@@ -459,7 +459,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 }
 
 /**
- * eeh_rmv_from_parent_pe - Remove one EEH device from the associated PE
+ * eeh_pe_tree_remove - Remove one EEH device from the associated PE
  * @edev: EEH device
  *
  * The PE hierarchy tree might be changed when doing PCI hotplug.
@@ -467,7 +467,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
  * during EEH recovery. So we have to call the function remove the
  * corresponding PE accordingly if necessary.
  */
-int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
+int eeh_pe_tree_remove(struct eeh_dev *edev)
 {
struct eeh_pe *pe, *parent, *child;
bool keep, recover;
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index bf11ac8427ac..e99b7c547d7e 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -263,7 +263,7 @@ void remove_sriov_vf_pdns(struct pci_dev *pdev)
 * have a configured PE.
   

[PATCH v2 11/14] powerpc/eeh: Remove class code field from edev

2020-07-21 Thread Oliver O'Halloran
The edev->class_code field is never referenced anywhere except for the
platform specific probe functions. The same information is available in
the pci_dev for PowerNV and in the pci_dn on pseries so we can remove
the field.

Signed-off-by: Oliver O'Halloran 
---
v2: no changes
---
 arch/powerpc/include/asm/eeh.h   | 1 -
 arch/powerpc/platforms/powernv/eeh-powernv.c | 5 ++---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d16d5b59dd22..0d99aad8d9b7 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -133,7 +133,6 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 
 struct eeh_dev {
int mode;   /* EEH mode */
-   int class_code; /* Class code of the device */
int bdfn;   /* bdfn of device (for cfg ops) */
struct pci_controller *controller;
int pe_config_addr; /* PE config address*/
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index c9f2f454d053..7cbb03a97a61 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -372,19 +372,18 @@ static struct eeh_dev *pnv_eeh_probe(struct pci_dev *pdev)
}
 
/* Skip for PCI-ISA bridge */
-   if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_ISA)
+   if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
return NULL;
 
eeh_edev_dbg(edev, "Probing device\n");
 
/* Initialize eeh device */
-   edev->class_code = pdn->class_code;
edev->mode  &= 0xFF00;
edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
edev->af_cap   = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
edev->aer_cap  = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
-   if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+   if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
pnv_pci_cfg_read(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index b981332db873..67931fe5f341 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -273,12 +273,11 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
 * correctly reflects that current device is root port
 * or PCIe switch downstream port.
 */
-   edev->class_code = pdn->class_code;
edev->pcix_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
edev->pcie_cap = pseries_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
edev->aer_cap = pseries_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
edev->mode &= 0xFF00;
-   if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
+   if ((pdn->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
edev->mode |= EEH_DEV_BRIDGE;
if (edev->pcie_cap) {
rtas_read_config(pdn, edev->pcie_cap + PCI_EXP_FLAGS,
-- 
2.26.2



[PATCH v2 10/14] powerpc/eeh: Remove spurious use of pci_dn in eeh_dump_dev_log

2020-07-21 Thread Oliver O'Halloran
Retrieve the domain, bus, device, and function numbers from the edev.

Signed-off-by: Oliver O'Halloran 
---
v2: no change
---
 arch/powerpc/kernel/eeh.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 1a12c8bdf61e..f203ffc5c57d 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -167,23 +167,17 @@ void eeh_show_enabled(void)
  */
 static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
u32 cfg;
int cap, i;
int n = 0, l = 0;
char buffer[128];
 
-   if (!pdn) {
-   pr_warn("EEH: Note: No error log for absent device.\n");
-   return 0;
-   }
-
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
-  pdn->phb->global_number, pdn->busno,
-  PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+   edev->pe->phb->global_number, edev->bdfn >> 8,
+   PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
-   pdn->phb->global_number, pdn->busno,
-   PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+   edev->pe->phb->global_number, edev->bdfn >> 8,
+   PCI_SLOT(edev->bdfn), PCI_FUNC(edev->bdfn));
 
eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, &cfg);
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
-- 
2.26.2



[PATCH v2 09/14] powerpc/eeh: Pass eeh_dev to eeh_ops->{read|write}_config()

2020-07-21 Thread Oliver O'Halloran
Mechanical conversion of the eeh_ops interfaces to use eeh_dev to reference
a specific device rather than pci_dn. No functional changes.

Signed-off-by: Oliver O'Halloran 
---
v2: no change
---
 arch/powerpc/include/asm/eeh.h   |  4 +-
 arch/powerpc/kernel/eeh.c| 22 +
 arch/powerpc/kernel/eeh_pe.c | 47 +---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 43 ++
 arch/powerpc/platforms/pseries/eeh_pseries.c | 16 ---
 5 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8045cede0df9..d16d5b59dd22 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -226,8 +226,8 @@ struct eeh_ops {
int (*configure_bridge)(struct eeh_pe *pe);
int (*err_inject)(struct eeh_pe *pe, int type, int func,
  unsigned long addr, unsigned long mask);
-   int (*read_config)(struct pci_dn *pdn, int where, int size, u32 *val);
-   int (*write_config)(struct pci_dn *pdn, int where, int size, u32 val);
+   int (*read_config)(struct eeh_dev *edev, int where, int size, u32 *val);
+   int (*write_config)(struct eeh_dev *edev, int where, int size, u32 val);
int (*next_error)(struct eeh_pe **pe);
int (*restore_config)(struct eeh_dev *edev);
int (*notify_resume)(struct eeh_dev *edev);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 1cef0f4bb2d5..1a12c8bdf61e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -185,21 +185,21 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
 
-   eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, );
+   eeh_ops->read_config(edev, PCI_VENDOR_ID, 4, );
n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg);
pr_warn("EEH: PCI device/vendor: %08x\n", cfg);
 
-   eeh_ops->read_config(pdn, PCI_COMMAND, 4, );
+   eeh_ops->read_config(edev, PCI_COMMAND, 4, );
n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg);
pr_warn("EEH: PCI cmd/status register: %08x\n", cfg);
 
/* Gather bridge-specific registers */
if (edev->mode & EEH_DEV_BRIDGE) {
-   eeh_ops->read_config(pdn, PCI_SEC_STATUS, 2, );
+   eeh_ops->read_config(edev, PCI_SEC_STATUS, 2, );
n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg);
pr_warn("EEH: Bridge secondary status: %04x\n", cfg);
 
-   eeh_ops->read_config(pdn, PCI_BRIDGE_CONTROL, 2, );
+   eeh_ops->read_config(edev, PCI_BRIDGE_CONTROL, 2, );
n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg);
pr_warn("EEH: Bridge control: %04x\n", cfg);
}
@@ -207,11 +207,11 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
/* Dump out the PCI-X command and status regs */
cap = edev->pcix_cap;
if (cap) {
-   eeh_ops->read_config(pdn, cap, 4, );
+   eeh_ops->read_config(edev, cap, 4, );
n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg);
pr_warn("EEH: PCI-X cmd: %08x\n", cfg);
 
-   eeh_ops->read_config(pdn, cap+4, 4, );
+   eeh_ops->read_config(edev, cap+4, 4, );
n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg);
pr_warn("EEH: PCI-X status: %08x\n", cfg);
}
@@ -223,7 +223,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pr_warn("EEH: PCI-E capabilities and status follow:\n");
 
for (i=0; i<=8; i++) {
-   eeh_ops->read_config(pdn, cap+4*i, 4, );
+   eeh_ops->read_config(edev, cap+4*i, 4, );
n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
 
if ((i % 4) == 0) {
@@ -250,7 +250,7 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
pr_warn("EEH: PCI-E AER capability register set follows:\n");
 
for (i=0; i<=13; i++) {
-   eeh_ops->read_config(pdn, cap+4*i, 4, );
+   eeh_ops->read_config(edev, cap+4*i, 4, );
n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg);
 
if ((i % 4) == 0) {
@@ -917,15 +917,13 @@ int eeh_pe_reset_full(struct eeh_pe *pe, bool 
include_passed)
  */
 void eeh_save_bars(struct eeh_dev *edev)
 {
-   struct pci_dn *pdn;
int i;
 
-   pdn = eeh_dev_to_pdn(edev);
-   if (!pdn)
+   if (!edev)
return;
 
for (i = 0; i < 16; i++)
-   eeh_ops->read_config(pdn, i * 4, 4, >config_space[i]);
+   eeh_ops->read_config(edev, i * 4, 4, 

[PATCH v2 08/14] powerpc/eeh: Pass eeh_dev to eeh_ops->resume_notify()

2020-07-21 Thread Oliver O'Halloran
Mechanical conversion of the eeh_ops interfaces to use eeh_dev to reference
a specific device rather than pci_dn. No functional changes.

Signed-off-by: Oliver O'Halloran 
---
v2: no changes
---
 arch/powerpc/include/asm/eeh.h   | 2 +-
 arch/powerpc/kernel/eeh_driver.c | 4 ++--
 arch/powerpc/kernel/eeh_sysfs.c  | 2 +-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 4 +---
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 70a686d731f6..8045cede0df9 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -230,7 +230,7 @@ struct eeh_ops {
int (*write_config)(struct pci_dn *pdn, int where, int size, u32 val);
int (*next_error)(struct eeh_pe **pe);
int (*restore_config)(struct eeh_dev *edev);
-   int (*notify_resume)(struct pci_dn *pdn);
+   int (*notify_resume)(struct eeh_dev *edev);
 };
 
 extern int eeh_subsystem_flags;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b70b9273f45a..b84d3cb2532e 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -425,8 +425,8 @@ static enum pci_ers_result eeh_report_resume(struct eeh_dev 
*edev,
 
pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
 #ifdef CONFIG_PCI_IOV
-   if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev))
-   eeh_ops->notify_resume(eeh_dev_to_pdn(edev));
+   if (eeh_ops->notify_resume)
+   eeh_ops->notify_resume(edev);
 #endif
return PCI_ERS_RESULT_NONE;
 }
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
index 4fb0f1e1017a..429620da73ba 100644
--- a/arch/powerpc/kernel/eeh_sysfs.c
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -99,7 +99,7 @@ static ssize_t eeh_notify_resume_store(struct device *dev,
if (!edev || !edev->pe || !eeh_ops->notify_resume)
return -ENODEV;
 
-   if (eeh_ops->notify_resume(pci_get_pdn(pdev)))
+   if (eeh_ops->notify_resume(edev))
return -EIO;
 
return count;
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 39cdf447dfa6..426227ddee32 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -793,10 +793,8 @@ static int pseries_call_allow_unfreeze(struct eeh_dev 
*edev)
return rc;
 }
 
-static int pseries_notify_resume(struct pci_dn *pdn)
+static int pseries_notify_resume(struct eeh_dev *edev)
 {
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-
if (!edev)
return -EEXIST;
 
-- 
2.26.2



[PATCH v2 07/14] powerpc/eeh: Pass eeh_dev to eeh_ops->restore_config()

2020-07-21 Thread Oliver O'Halloran
Mechanical conversion of the eeh_ops interfaces to use eeh_dev to reference
a specific device rather than pci_dn. No functional changes.

Signed-off-by: Oliver O'Halloran 
---
v2: no changes
---
 arch/powerpc/include/asm/eeh.h   | 2 +-
 arch/powerpc/kernel/eeh.c| 5 ++---
 arch/powerpc/kernel/eeh_pe.c | 6 ++
 arch/powerpc/platforms/powernv/eeh-powernv.c | 6 ++
 4 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index c7d5a234bb51..70a686d731f6 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -229,7 +229,7 @@ struct eeh_ops {
int (*read_config)(struct pci_dn *pdn, int where, int size, u32 *val);
int (*write_config)(struct pci_dn *pdn, int where, int size, u32 val);
int (*next_error)(struct eeh_pe **pe);
-   int (*restore_config)(struct pci_dn *pdn);
+   int (*restore_config)(struct eeh_dev *edev);
int (*notify_resume)(struct pci_dn *pdn);
 };
 
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index a4df6f6de0bd..1cef0f4bb2d5 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -726,7 +726,6 @@ static void eeh_disable_and_save_dev_state(struct eeh_dev 
*edev,
 
 static void eeh_restore_dev_state(struct eeh_dev *edev, void *userdata)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
struct pci_dev *pdev = eeh_dev_to_pci_dev(edev);
struct pci_dev *dev = userdata;
 
@@ -734,8 +733,8 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, 
void *userdata)
return;
 
/* Apply customization from firmware */
-   if (pdn && eeh_ops->restore_config)
-   eeh_ops->restore_config(pdn);
+   if (eeh_ops->restore_config)
+   eeh_ops->restore_config(edev);
 
/* The caller should restore state for the specified device */
if (pdev != dev)
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 177852e39a25..d71493f66917 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -843,16 +843,14 @@ static void eeh_restore_device_bars(struct eeh_dev *edev)
  */
 static void eeh_restore_one_device_bars(struct eeh_dev *edev, void *flag)
 {
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
/* Do special restore for bridges */
if (edev->mode & EEH_DEV_BRIDGE)
eeh_restore_bridge_bars(edev);
else
eeh_restore_device_bars(edev);
 
-   if (eeh_ops->restore_config && pdn)
-   eeh_ops->restore_config(pdn);
+   if (eeh_ops->restore_config)
+   eeh_ops->restore_config(edev);
 }
 
 /**
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 8f3a7611efc1..a41e67f674e6 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1619,12 +1619,10 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
return ret;
 }
 
-static int pnv_eeh_restore_config(struct pci_dn *pdn)
+static int pnv_eeh_restore_config(struct eeh_dev *edev)
 {
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
struct pnv_phb *phb;
s64 ret = 0;
-   int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
if (!edev)
return -EEXIST;
@@ -1638,7 +1636,7 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
 
if (ret) {
pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
-   __func__, config_addr, ret);
+   __func__, edev->bdfn, ret);
return -EIO;
}
 
-- 
2.26.2



[PATCH v2 06/14] powerpc/eeh: Remove VF config space restoration

2020-07-21 Thread Oliver O'Halloran
There are a bunch of strange things about this code. First up is that none of
the fields being written to are functional for a VF. The SR-IOV
specification lists them as "Reserved, but OS should preserve", so writing
new values to them doesn't do anything and is clearly wrong from a
correctness perspective.

However, since VFs are designed to be managed by the OS there is an
argument to be made that we should be saving and restoring some parts of
config space. We already sort of do that by saving the first 64 bytes of
config space in the eeh_dev (see eeh_dev->config_space[]). This is
inadequate since it doesn't even consider saving and restoring the PCI
capability structures. However, this is a problem with EEH in general and
that needs to be fixed for non-VF devices too.

There's no real reason to keep this around, so delete it.

Signed-off-by: Oliver O'Halloran 
Reviewed-by: Alexey Kardashevskiy 
---
v2: no changes
---
 arch/powerpc/include/asm/eeh.h   |  1 -
 arch/powerpc/kernel/eeh.c| 59 
 arch/powerpc/platforms/powernv/eeh-powernv.c | 20 ++-
 arch/powerpc/platforms/pseries/eeh_pseries.c | 26 +
 4 files changed, 7 insertions(+), 99 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index d619012281bd..c7d5a234bb51 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -314,7 +314,6 @@ int eeh_pe_reset(struct eeh_pe *pe, int option, bool 
include_passed);
 int eeh_pe_configure(struct eeh_pe *pe);
 int eeh_pe_inject_err(struct eeh_pe *pe, int type, int func,
  unsigned long addr, unsigned long mask);
-int eeh_restore_vf_config(struct pci_dn *pdn);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 859f76020256..a4df6f6de0bd 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -742,65 +742,6 @@ static void eeh_restore_dev_state(struct eeh_dev *edev, 
void *userdata)
pci_restore_state(pdev);
 }
 
-int eeh_restore_vf_config(struct pci_dn *pdn)
-{
-   struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
-   u32 devctl, cmd, cap2, aer_capctl;
-   int old_mps;
-
-   if (edev->pcie_cap) {
-   /* Restore MPS */
-   old_mps = (ffs(pdn->mps) - 8) << 5;
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-2, );
-   devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
-   devctl |= old_mps;
-   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
- 2, devctl);
-
-   /* Disable Completion Timeout if possible */
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
-4, );
-   if (cap2 & PCI_EXP_DEVCAP2_COMP_TMOUT_DIS) {
-   eeh_ops->read_config(pdn,
-edev->pcie_cap + PCI_EXP_DEVCTL2,
-4, );
-   cap2 |= PCI_EXP_DEVCTL2_COMP_TMOUT_DIS;
-   eeh_ops->write_config(pdn,
- edev->pcie_cap + PCI_EXP_DEVCTL2,
- 4, cap2);
-   }
-   }
-
-   /* Enable SERR and parity checking */
-   eeh_ops->read_config(pdn, PCI_COMMAND, 2, );
-   cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
-   eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
-
-   /* Enable report various errors */
-   if (edev->pcie_cap) {
-   eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
-2, );
-   devctl &= ~PCI_EXP_DEVCTL_CERE;
-   devctl |= (PCI_EXP_DEVCTL_NFERE |
-  PCI_EXP_DEVCTL_FERE |
-  PCI_EXP_DEVCTL_URRE);
-   eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
- 2, devctl);
-   }
-
-   /* Enable ECRC generation and check */
-   if (edev->pcie_cap && edev->aer_cap) {
-   eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
-4, _capctl);
-   aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
-   eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
- 4, aer_capctl);
-   }
-
-   return 0;
-}
-
 /**
  * pcibios_set_pcie_reset_state - Set PCI-E reset state
  * @dev: pci device struct
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index bcd0515d8f79..8f3a7611efc1 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1629,20 +1629,12 @@ static int 

[PATCH v2 05/14] powerpc/eeh: Kill off eeh_ops->get_pe_addr()

2020-07-21 Thread Oliver O'Halloran
This is used in precisely one place, in pseries-specific platform code.
There's no need to have the callback in eeh_ops since the platform
chooses the EEH PE addresses anyway. The PowerNV implementation has always
been a stub too so remove it.

Signed-off-by: Oliver O'Halloran 
Reviewed-by: Alexey Kardashevskiy 
---
v2: Made "buid" in pseries_eeh_get_pe_addr() an unsigned long to match
the pci_controller type.
---
 arch/powerpc/include/asm/eeh.h   |  1 -
 arch/powerpc/platforms/powernv/eeh-powernv.c | 13 
 arch/powerpc/platforms/pseries/eeh_pseries.c | 22 ++--
 3 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 9c5ba535f6a1..d619012281bd 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -220,7 +220,6 @@ struct eeh_ops {
int (*init)(void);
struct eeh_dev *(*probe)(struct pci_dev *pdev);
int (*set_option)(struct eeh_pe *pe, int option);
-   int (*get_pe_addr)(struct eeh_pe *pe);
int (*get_state)(struct eeh_pe *pe, int *delay);
int (*reset)(struct eeh_pe *pe, int option);
int (*get_log)(struct eeh_pe *pe, int severity, char *drv_log, unsigned 
long len);
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 79409e005fcd..bcd0515d8f79 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -535,18 +535,6 @@ static int pnv_eeh_set_option(struct eeh_pe *pe, int 
option)
return 0;
 }
 
-/**
- * pnv_eeh_get_pe_addr - Retrieve PE address
- * @pe: EEH PE
- *
- * Retrieve the PE address according to the given tranditional
- * PCI BDF (Bus/Device/Function) address.
- */
-static int pnv_eeh_get_pe_addr(struct eeh_pe *pe)
-{
-   return pe->addr;
-}
-
 static void pnv_eeh_get_phb_diag(struct eeh_pe *pe)
 {
struct pnv_phb *phb = pe->phb->private_data;
@@ -1670,7 +1658,6 @@ static struct eeh_ops pnv_eeh_ops = {
.init   = pnv_eeh_init,
.probe  = pnv_eeh_probe,
.set_option = pnv_eeh_set_option,
-   .get_pe_addr= pnv_eeh_get_pe_addr,
.get_state  = pnv_eeh_get_state,
.reset  = pnv_eeh_reset,
.get_log= pnv_eeh_get_log,
diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index 18a2522b9b5e..bcc72b9a5309 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+static int pseries_eeh_get_pe_addr(struct pci_dn *pdn);
+
 /* RTAS tokens */
 static int ibm_set_eeh_option;
 static int ibm_set_slot_reset;
@@ -301,7 +303,7 @@ void pseries_eeh_init_edev(struct pci_dn *pdn)
eeh_edev_dbg(edev, "EEH failed to enable on device (code 
%d)\n", ret);
} else {
/* Retrieve PE address */
-   edev->pe_config_addr = eeh_ops->get_pe_addr();
+   edev->pe_config_addr = pseries_eeh_get_pe_addr(pdn);
pe.addr = edev->pe_config_addr;
 
/* Some older systems (Power4) allow the ibm,set-eeh-option
@@ -431,8 +433,10 @@ static int pseries_eeh_set_option(struct eeh_pe *pe, int 
option)
  * It's notable that zero'ed return value means invalid PE config
  * address.
  */
-static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
+static int pseries_eeh_get_pe_addr(struct pci_dn *pdn)
 {
+   int config_addr = rtas_config_addr(pdn->busno, pdn->devfn, 0);
+   unsigned long buid = pdn->phb->buid;
int ret = 0;
int rets[3];
 
@@ -443,18 +447,16 @@ static int pseries_eeh_get_pe_addr(struct eeh_pe *pe)
 * meaningless.
 */
ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-   pe->config_addr, BUID_HI(pe->phb->buid),
-   BUID_LO(pe->phb->buid), 1);
+   config_addr, BUID_HI(buid), BUID_LO(buid), 1);
if (ret || (rets[0] == 0))
return 0;
 
/* Retrieve the associated PE config address */
ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
-   pe->config_addr, BUID_HI(pe->phb->buid),
-   BUID_LO(pe->phb->buid), 0);
+   config_addr, BUID_HI(buid), BUID_LO(buid), 0);
if (ret) {
pr_warn("%s: Failed to get address for PHB#%x-PE#%x\n",
-   __func__, pe->phb->global_number, 
pe->config_addr);
+   __func__, pdn->phb->global_number, config_addr);
return 0;
}
 
@@ -463,11 +465,10 @@ static int 

[PATCH v2 04/14] powerpc/pseries: Stop using pdn->pe_number

2020-07-21 Thread Oliver O'Halloran
The pci_dn->pe_number field is mainly used to track the IODA PE number of a
device on PowerNV. At some point it grew a user in the pseries SR-IOV
support, which muddies the waters a bit, so remove that usage.

Signed-off-by: Oliver O'Halloran 
Reviewed-by: Alexey Kardashevskiy 
---
v2: no change
---
 arch/powerpc/platforms/pseries/eeh_pseries.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/eeh_pseries.c 
b/arch/powerpc/platforms/pseries/eeh_pseries.c
index ace117f99d94..18a2522b9b5e 100644
--- a/arch/powerpc/platforms/pseries/eeh_pseries.c
+++ b/arch/powerpc/platforms/pseries/eeh_pseries.c
@@ -52,8 +52,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
dev_dbg(>dev, "EEH: Setting up device\n");
 #ifdef CONFIG_PCI_IOV
if (pdev->is_virtfn) {
-   struct pci_dn *physfn_pdn;
-
pdn->device_id  =  pdev->device;
pdn->vendor_id  =  pdev->vendor;
pdn->class_code =  pdev->class;
@@ -63,8 +61,6 @@ void pseries_pcibios_bus_add_device(struct pci_dev *pdev)
 * completion from platform.
 */
pdn->last_allow_rc =  0;
-   physfn_pdn  =  pci_get_pdn(pdev->physfn);
-   pdn->pe_number  =  physfn_pdn->pe_num_map[pdn->vf_index];
}
 #endif
pseries_eeh_init_edev(pdn);
@@ -772,8 +768,8 @@ int pseries_send_allow_unfreeze(struct pci_dn *pdn,
 
 static int pseries_call_allow_unfreeze(struct eeh_dev *edev)
 {
+   int cur_vfs = 0, rc = 0, vf_index, bus, devfn, vf_pe_num;
struct pci_dn *pdn, *tmp, *parent, *physfn_pdn;
-   int cur_vfs = 0, rc = 0, vf_index, bus, devfn;
u16 *vf_pe_array;
 
vf_pe_array = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
@@ -806,8 +802,10 @@ static int pseries_call_allow_unfreeze(struct eeh_dev 
*edev)
}
} else {
pdn = pci_get_pdn(edev->pdev);
-   vf_pe_array[0] = cpu_to_be16(pdn->pe_number);
physfn_pdn = pci_get_pdn(edev->physfn);
+
+   vf_pe_num = physfn_pdn->pe_num_map[edev->vf_index];
+   vf_pe_array[0] = cpu_to_be16(vf_pe_num);
rc = pseries_send_allow_unfreeze(physfn_pdn,
 vf_pe_array, 1);
pdn->last_allow_rc = rc;
-- 
2.26.2



[PATCH v2 03/14] powerpc/eeh: Move vf_index out of pci_dn and into eeh_dev

2020-07-21 Thread Oliver O'Halloran
Drivers that do not support the PCI error handling callbacks are handled by
tearing down the device and re-probing it. If the device being removed is
a virtual function then we need to know the VF index so it can be removed
using the pci_iov_{add|remove}_virtfn() API.

Currently this is handled by looking up the pci_dn, and using the vf_index
that was stashed there when the pci_dn for the VF was created in
pcibios_sriov_enable(). We would like to eliminate the use of pci_dn
outside of pseries though so we need to provide the generic EEH code with
some other way to find the vf_index.

The easiest thing to do here is move the vf_index field out of pci_dn and
into eeh_dev.  Currently pci_dn and eeh_dev are allocated and initialized
together so this is a fairly minimal change in preparation for splitting
pci_dn and eeh_dev in the future.

Signed-off-by: Oliver O'Halloran 
Reviewed-by: Alexey Kardashevskiy 
---
v2: Commit message fixup
---
 arch/powerpc/include/asm/eeh.h| 3 +++
 arch/powerpc/include/asm/pci-bridge.h | 1 -
 arch/powerpc/kernel/eeh_driver.c  | 6 ++
 arch/powerpc/kernel/pci_dn.c  | 7 ---
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8537dd334094..9c5ba535f6a1 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -148,7 +148,10 @@ struct eeh_dev {
struct pci_dn *pdn; /* Associated PCI device node   */
struct pci_dev *pdev;   /* Associated PCI device*/
bool in_error;  /* Error flag for edev  */
+
+   /* VF specific properties */
struct pci_dev *physfn; /* Associated SRIOV PF  */
+   int vf_index;   /* Index of this VF */
 };
 
 /* "fmt" must be a simple literal string */
diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index b92e81b256e5..d2a2a14e56f9 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -202,7 +202,6 @@ struct pci_dn {
 #define IODA_INVALID_PE0x
unsigned int pe_number;
 #ifdef CONFIG_PCI_IOV
-   int vf_index;   /* VF index in the PF */
u16 vfs_expanded;   /* number of VFs IOV BAR expanded */
u16 num_vfs;/* number of VFs enabled*/
unsigned int *pe_num_map;   /* PE# for the first VF PE or array */
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 7b048cee767c..b70b9273f45a 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -477,7 +477,7 @@ static void *eeh_add_virt_device(struct eeh_dev *edev)
}
 
 #ifdef CONFIG_PCI_IOV
-   pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index);
+   pci_iov_add_virtfn(edev->physfn, edev->vf_index);
 #endif
return NULL;
 }
@@ -521,9 +521,7 @@ static void eeh_rmv_device(struct eeh_dev *edev, void 
*userdata)
 
if (edev->physfn) {
 #ifdef CONFIG_PCI_IOV
-   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
-
-   pci_iov_remove_virtfn(edev->physfn, pdn->vf_index);
+   pci_iov_remove_virtfn(edev->physfn, edev->vf_index);
edev->pdev = NULL;
 #endif
if (rmv_data)
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index f790a8d06f50..bf11ac8427ac 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -146,7 +146,6 @@ static struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
 
 #ifdef CONFIG_PCI_IOV
 static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent,
-  int vf_index,
   int busno, int devfn)
 {
struct pci_dn *pdn;
@@ -163,7 +162,6 @@ static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn 
*parent,
pdn->parent = parent;
pdn->busno = busno;
pdn->devfn = devfn;
-   pdn->vf_index = vf_index;
pdn->pe_number = IODA_INVALID_PE;
INIT_LIST_HEAD(>child_list);
INIT_LIST_HEAD(>list);
@@ -194,7 +192,7 @@ struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev)
for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
struct eeh_dev *edev __maybe_unused;
 
-   pdn = add_one_sriov_vf_pdn(parent, i,
+   pdn = add_one_sriov_vf_pdn(parent,
   pci_iov_virtfn_bus(pdev, i),
   pci_iov_virtfn_devfn(pdev, i));
if (!pdn) {
@@ -207,7 +205,10 @@ struct pci_dn *add_sriov_vf_pdns(struct pci_dev *pdev)
/* Create the EEH device for the VF */
edev = eeh_dev_init(pdn);
BUG_ON(!edev);
+
+   /* FIXME: these should probably be populated by the EEH probe 

[PATCH v2 02/14] powerpc/eeh: Remove eeh_dev.c

2020-07-21 Thread Oliver O'Halloran
The only thing in this file is eeh_dev_init(), which allocates and
initialises an eeh_dev based on a pci_dn. This is only ever called from
pci_dn.c, so move it there and remove the file.

Signed-off-by: Oliver O'Halloran 
---
v2: no change
---
 arch/powerpc/include/asm/eeh.h |  6 
 arch/powerpc/kernel/Makefile   |  2 +-
 arch/powerpc/kernel/eeh_dev.c  | 54 --
 arch/powerpc/kernel/pci_dn.c   | 20 +
 4 files changed, 21 insertions(+), 61 deletions(-)
 delete mode 100644 arch/powerpc/kernel/eeh_dev.c

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 64487b88c569..8537dd334094 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -293,7 +293,6 @@ void eeh_pe_restore_bars(struct eeh_pe *pe);
 const char *eeh_pe_loc_get(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
-struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
 void eeh_show_enabled(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
@@ -339,11 +338,6 @@ static inline bool eeh_enabled(void)
 
 static inline void eeh_show_enabled(void) { }
 
-static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
-{
-   return NULL;
-}
-
 static inline void eeh_dev_phb_init_dynamic(struct pci_controller *phb) { }
 
 static inline int eeh_check_failure(const volatile void __iomem *token)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 244542ae2a91..c5211bdcf1b6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -71,7 +71,7 @@ obj-$(CONFIG_PPC_RTAS_DAEMON) += rtasd.o
 obj-$(CONFIG_RTAS_FLASH)   += rtas_flash.o
 obj-$(CONFIG_RTAS_PROC)+= rtas-proc.o
 obj-$(CONFIG_PPC_DT_CPU_FTRS)  += dt_cpu_ftrs.o
-obj-$(CONFIG_EEH)  += eeh.o eeh_pe.o eeh_dev.o eeh_cache.o \
+obj-$(CONFIG_EEH)  += eeh.o eeh_pe.o eeh_cache.o \
  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)   += smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump.o
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
deleted file mode 100644
index 8e159a12f10c..
--- a/arch/powerpc/kernel/eeh_dev.c
+++ /dev/null
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * The file intends to implement dynamic creation of EEH device, which will
- * be bound with OF node and PCI device simutaneously. The EEH devices would
- * be foundamental information for EEH core components to work proerly. 
Besides,
- * We have to support multiple situations where dynamic creation of EEH device
- * is required:
- *
- * 1) Before PCI emunation starts, we need create EEH devices according to the
- *PCI sensitive OF nodes.
- * 2) When PCI emunation is done, we need do the binding between PCI device and
- *the associated EEH device.
- * 3) DR (Dynamic Reconfiguration) would create PCI sensitive OF node. EEH 
device
- *will be created while PCI sensitive OF node is detected from DR.
- * 4) PCI hotplug needs redoing the binding between PCI device and EEH device. 
If
- *PHB is newly inserted, we also need create EEH devices accordingly.
- *
- * Copyright Benjamin Herrenschmidt & Gavin Shan, IBM Corporation 2012.
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-/**
- * eeh_dev_init - Create EEH device according to OF node
- * @pdn: PCI device node
- *
- * It will create EEH device according to the given OF node. The function
- * might be called by PCI emunation, DR, PHB hotplug.
- */
-struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
-{
-   struct eeh_dev *edev;
-
-   /* Allocate EEH device */
-   edev = kzalloc(sizeof(*edev), GFP_KERNEL);
-   if (!edev)
-   return NULL;
-
-   /* Associate EEH device with OF node */
-   pdn->edev = edev;
-   edev->pdn = pdn;
-   edev->bdfn = (pdn->busno << 8) | pdn->devfn;
-   edev->controller = pdn->phb;
-
-   return edev;
-}
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 4e654df55969..f790a8d06f50 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -124,6 +124,26 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
return NULL;
 }
 
+#ifdef CONFIG_EEH
+static struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
+{
+   struct eeh_dev *edev;
+
+   /* Allocate EEH device */
+   edev = kzalloc(sizeof(*edev), GFP_KERNEL);
+   if (!edev)
+   return NULL;
+
+   /* Associate EEH device with OF node */
+   pdn->edev = edev;
+   edev->pdn = pdn;
+   edev->bdfn = (pdn->busno << 8) | pdn->devfn;
+   edev->controller = pdn->phb;
+
+   return edev;
+}
+#endif /* CONFIG_EEH */
+
 #ifdef CONFIG_PCI_IOV
 static struct pci_dn *add_one_sriov_vf_pdn(struct pci_dn *parent,

[PATCH v2 01/14] powerpc/eeh: Remove eeh_dev_phb_init_dynamic()

2020-07-21 Thread Oliver O'Halloran
This function is a one line wrapper around eeh_phb_pe_create() and despite
the name it doesn't create any eeh_dev structures. Replace it with direct
calls to eeh_phb_pe_create() since that does what it says on the tin
and removes a layer of indirection.

Signed-off-by: Oliver O'Halloran 
---
v2: Added stub prototype of eeh_phb_pe_create() for the !CONFIG_EEH case.
---
 arch/powerpc/include/asm/eeh.h |  3 ++-
 arch/powerpc/kernel/eeh.c  |  2 +-
 arch/powerpc/kernel/eeh_dev.c  | 13 -
 arch/powerpc/kernel/of_platform.c  |  4 ++--
 arch/powerpc/platforms/pseries/pci_dlpar.c |  2 +-
 5 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 964a54292b36..64487b88c569 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -294,7 +294,6 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe);
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
-void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
 void eeh_show_enabled(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
@@ -370,6 +369,8 @@ void pseries_eeh_init_edev_recursive(struct pci_dn *pdn);
 #else
 static inline void pseries_eeh_add_device_early(struct pci_dn *pdn) { }
 static inline void pseries_eeh_add_device_tree_early(struct pci_dn *pdn) { }
+
+static inline int eeh_phb_pe_create(struct pci_controller *phb) { return 0; }
 #endif
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index d407981dec76..859f76020256 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1096,7 +1096,7 @@ static int eeh_init(void)
 
/* Initialize PHB PEs */
list_for_each_entry_safe(hose, tmp, _list, list_node)
-   eeh_dev_phb_init_dynamic(hose);
+   eeh_phb_pe_create(hose);
 
eeh_addr_cache_init();
 
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index 7370185c7a05..8e159a12f10c 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -52,16 +52,3 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
 
return edev;
 }
-
-/**
- * eeh_dev_phb_init_dynamic - Create EEH devices for devices included in PHB
- * @phb: PHB
- *
- * Scan the PHB OF node and its child association, then create the
- * EEH devices accordingly
- */
-void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
-{
-   /* EEH PE for PHB */
-   eeh_phb_pe_create(phb);
-}
diff --git a/arch/powerpc/kernel/of_platform.c 
b/arch/powerpc/kernel/of_platform.c
index 71a3f97dc988..f89376ff633e 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -62,8 +62,8 @@ static int of_pci_phb_probe(struct platform_device *dev)
/* Init pci_dn data structures */
pci_devs_phb_init_dynamic(phb);
 
-   /* Create EEH PEs for the PHB */
-   eeh_dev_phb_init_dynamic(phb);
+   /* Create EEH PE for the PHB */
+   eeh_phb_pe_create(phb);
 
/* Scan the bus */
pcibios_scan_phb(phb);
diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c 
b/arch/powerpc/platforms/pseries/pci_dlpar.c
index b3a38f5a6b68..f9ae17e8a0f4 100644
--- a/arch/powerpc/platforms/pseries/pci_dlpar.c
+++ b/arch/powerpc/platforms/pseries/pci_dlpar.c
@@ -34,7 +34,7 @@ struct pci_controller *init_phb_dynamic(struct device_node 
*dn)
pci_devs_phb_init_dynamic(phb);
 
/* Create EEH devices for the PHB */
-   eeh_dev_phb_init_dynamic(phb);
+   eeh_phb_pe_create(phb);
 
if (dn->child)
pseries_eeh_init_edev_recursive(PCI_DN(dn));
-- 
2.26.2



Re: [PATCH v4 07/12] ppc64/kexec_file: add support to relocate purgatory

2020-07-21 Thread Michael Ellerman
Hari Bathini  writes:
> Right now purgatory implementation is only minimal. But if purgatory
> code is to be enhanced to copy memory to the backup region and verify
> sha256 digest, relocations may have to be applied to the purgatory.
> So, add support to relocate purgatory in kexec_file_load system call
> by setting up TOC pointer and applying RELA relocations as needed.
>
> Reported-by: kernel test robot 
> [lkp: In v1, 'struct mem_sym' was declared in parameter list]
> Signed-off-by: Hari Bathini 
> ---
>
> * Michael, can you share your opinion on the below:
> - https://lore.kernel.org/patchwork/patch/1272027/
> - My intention in cover note.

It seems like a lot of complexity for little benefit.

AFAICS your final purgatory_64.c is only 36 lines, and all it does is a
single (open coded) memcpy().

It seems like we could write that in not many more lines of assembler
and avoid all this code.
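
For instance, a copy loop along these lines (purely illustrative, assuming
the destination, source and byte count arrive in r3, r4 and r5, and that the
count is a multiple of 8) is only a handful of instructions:

	srdi	r5, r5, 3	/* byte count -> number of doublewords */
	mtctr	r5
1:	ld	r6, 0(r4)	/* copy one doubleword per iteration */
	std	r6, 0(r3)
	addi	r4, r4, 8
	addi	r3, r3, 8
	bdnz	1b		/* decrement CTR, branch while non-zero */
	blr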

What am I missing?

cheers


OF: Can't handle multiple dma-ranges with different offsets

2020-07-21 Thread Chris Packham
Hi,

I've just fired up linux kernel v5.7 on a p2040 based system and I'm 
getting the following new warning

OF: Can't handle multiple dma-ranges with different offsets on 
node(/pcie@ffe202000)
OF: Can't handle multiple dma-ranges with different offsets on 
node(/pcie@ffe202000)

The warning itself was added in commit 9d55bebd9816 ("of/address: 
Support multiple 'dma-ranges' entries") but I gather it's pointing out 
something about the dts. My board's dts is based heavily on p2041rdb.dts 
and the relevant pci2 section is identical (reproduced below for reference).

     pci2: pcie@ffe202000 {
         reg = <0xf 0xfe202000 0 0x1000>;
         ranges = <0x0200 0 0xe000 0xc 0x4000 0 0x2000
               0x0100 0 0x 0xf 0xf802 0 0x0001>;
         pcie@0 {
             ranges = <0x0200 0 0xe000
                   0x0200 0 0xe000
                   0 0x2000

                   0x0100 0 0x
                   0x0100 0 0x
                   0 0x0001>;
         };
     };

I haven't noticed any ill effect (aside from the scary message). I'm not 
sure if there's something missing in the dts or in the code that checks 
the ranges. Any guidance would be appreciated.

Thanks,
Chris


Re: [v3 04/15] powerpc/perf: Add support for ISA3.1 PMU SPRs

2020-07-21 Thread Jordan Niethe
On Sat, Jul 18, 2020 at 1:02 AM Athira Rajeev
 wrote:
>
> From: Madhavan Srinivasan 
>
> PowerISA v3.1 includes new performance monitoring unit(PMU)
> special purpose registers (SPRs). They are
>
> Monitor Mode Control Register 3 (MMCR3)
> Sampled Instruction Event Register 2 (SIER2)
> Sampled Instruction Event Register 3 (SIER3)
>
> MMCR3 is added for further sampling related configuration
> control. SIER2/SIER3 are added to provide additional
> information about the sampled instruction.
>
> Patch adds new PPMU flag called "PPMU_ARCH_310S" to support
> handling of these new SPRs, updates the struct thread_struct
> to include these new SPRs, include MMCR3 in struct mmcr_regs.
> This is needed to support programming of MMCR3 SPR during
> event_[enable/disable]. Patch also adds the sysfs support
> for the MMCR3 SPR along with SPRN_ macros for these new pmu sprs.
>
> Signed-off-by: Madhavan Srinivasan 
> ---
>  arch/powerpc/include/asm/perf_event_server.h |  2 ++
>  arch/powerpc/include/asm/processor.h |  4 
>  arch/powerpc/include/asm/reg.h   |  6 ++
>  arch/powerpc/kernel/sysfs.c  |  8 
>  arch/powerpc/perf/core-book3s.c  | 29 
> 
>  5 files changed, 49 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/perf_event_server.h 
> b/arch/powerpc/include/asm/perf_event_server.h
> index 14b8dc1..832450a 100644
> --- a/arch/powerpc/include/asm/perf_event_server.h
> +++ b/arch/powerpc/include/asm/perf_event_server.h
> @@ -22,6 +22,7 @@ struct mmcr_regs {
> unsigned long mmcr1;
> unsigned long mmcr2;
> unsigned long mmcra;
> +   unsigned long mmcr3;
>  };
>  /*
>   * This struct provides the constants and functions needed to
> @@ -75,6 +76,7 @@ struct power_pmu {
>  #define PPMU_HAS_SIER  0x0040 /* Has SIER */
>  #define PPMU_ARCH_207S 0x0080 /* PMC is architecture v2.07S */
>  #define PPMU_NO_SIAR   0x0100 /* Do not use SIAR */
> +#define PPMU_ARCH_310S 0x0200 /* Has MMCR3, SIER2 and SIER3 */
We elsewhere have CPU_FTR_ARCH_31, so should this be PPMU_ARCH_31S to
be consistent?
>
>  /*
>   * Values for flags to get_alternatives()
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> index 52a6783..a466e94 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -272,6 +272,10 @@ struct thread_struct {
> unsignedmmcr0;
>
> unsignedused_ebb;
> +   unsigned long   mmcr3;
> +   unsigned long   sier2;
> +   unsigned long   sier3;
> +
>  #endif
>  };
>
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index 88e6c78..21a1b2d 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -876,7 +876,9 @@
>  #define   MMCR0_FCHV   0x0001UL /* freeze conditions in hypervisor mode 
> */
>  #define SPRN_MMCR1 798
>  #define SPRN_MMCR2 785
> +#define SPRN_MMCR3 754
>  #define SPRN_UMMCR2769
> +#define SPRN_UMMCR3738
>  #define SPRN_MMCRA 0x312
>  #define   MMCRA_SDSYNC 0x8000UL /* SDAR synced with SIAR */
>  #define   MMCRA_SDAR_DCACHE_MISS 0x4000UL
> @@ -918,6 +920,10 @@
>  #define   SIER_SIHV0x100   /* Sampled MSR_HV */
>  #define   SIER_SIAR_VALID  0x040   /* SIAR contents valid */
>  #define   SIER_SDAR_VALID  0x020   /* SDAR contents valid */
> +#define SPRN_SIER2 752
> +#define SPRN_SIER3 753
> +#define SPRN_USIER2736
> +#define SPRN_USIER3737
>  #define SPRN_SIAR  796
>  #define SPRN_SDAR  797
>  #define SPRN_TACR  888
> diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
> index 571b325..46b4ebc 100644
> --- a/arch/powerpc/kernel/sysfs.c
> +++ b/arch/powerpc/kernel/sysfs.c
> @@ -622,8 +622,10 @@ void ppc_enable_pmcs(void)
>  SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
>
>  SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
> +SYSFS_PMCSETUP(mmcr3, SPRN_MMCR3);
>
>  static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
> +static DEVICE_ATTR(mmcr3, 0600, show_mmcr3, store_mmcr3);
>  #endif /* HAS_PPC_PMC56 */
>
>
> @@ -886,6 +888,9 @@ static int register_cpu_online(unsigned int cpu)
>  #ifdef CONFIG_PMU_SYSFS
> if (cpu_has_feature(CPU_FTR_MMCRA))
> device_create_file(s, _attr_mmcra);
> +
> +   if (cpu_has_feature(CPU_FTR_ARCH_31))
> +   device_create_file(s, _attr_mmcr3);
>  #endif /* CONFIG_PMU_SYSFS */
>
> if (cpu_has_feature(CPU_FTR_PURR)) {
> @@ -980,6 +985,9 @@ static int unregister_cpu_online(unsigned int cpu)
>  #ifdef CONFIG_PMU_SYSFS
> if (cpu_has_feature(CPU_FTR_MMCRA))
> device_remove_file(s, _attr_mmcra);
> +
> +   if (cpu_has_feature(CPU_FTR_ARCH_31))
> +   device_remove_file(s, _attr_mmcr3);
>  #endif /* CONFIG_PMU_SYSFS */
>
> if 

Re: [PATCH 2/2] powerpc/numa: Remove a redundant variable

2020-07-21 Thread Nathan Lynch
Srikar Dronamraju  writes:
> In of_drconf_to_nid_single() default_nid always refers to NUMA_NO_NODE.
> Hence replace it with NUMA_NO_NODE.
>
> No functional changes.
>

...

> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index a2c5fe0d0cad..b066d89c2975 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -432,16 +432,15 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa)
>  static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>  {
>   struct assoc_arrays aa = { .arrays = NULL };
> - int default_nid = NUMA_NO_NODE;
> - int nid = default_nid;
> + int nid = NUMA_NO_NODE;
>   int rc, index;
>  
>   if ((min_common_depth < 0) || !numa_enabled)
> - return default_nid;
> + return NUMA_NO_NODE;
>  
>   rc = of_get_assoc_arrays();
>   if (rc)
> - return default_nid;
> + return NUMA_NO_NODE;
>  
>   if (min_common_depth <= aa.array_sz &&
>   !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < 
> aa.n_arrays) {
> @@ -449,7 +448,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>   nid = of_read_number([index], 1);
>  
>   if (nid == 0x || nid >= num_possible_nodes())
> - nid = default_nid;
> + nid = NUMA_NO_NODE;
>  
>   if (nid > 0) {
>   index = lmb->aa_index * aa.array_sz;

Doesn't seem like an improvement to me, sorry. Admittedly it's a small
point, but a local variable named "default_nid" communicates that it's a
default or fallback value. That information is lost with this change.


Re: [v3 01/15] powerpc/perf: Update cpu_hw_event to use `struct` for storing MMCR registers

2020-07-21 Thread Athira Rajeev


> On 21-Jul-2020, at 9:12 AM, Jordan Niethe  wrote:
> 
> On Sat, Jul 18, 2020 at 12:48 AM Athira Rajeev
> mailto:atraj...@linux.vnet.ibm.com>> wrote:
>> 
>> core-book3s currently uses array to store the MMCR registers as part
>> of per-cpu `cpu_hw_events`. This patch does a clean up to use `struct`
>> to store mmcr regs instead of array. This will make code easier to read
>> and reduces chance of any subtle bug that may come in the future, say
>> when new registers are added. Patch updates all relevant code that was
>> using MMCR array ( cpuhw->mmcr[x]) to use newly introduced `struct`.
>> This includes the PMU driver code for supported platforms (power5
>> to power9) and ISA macros for counter support functions.
>> 
>> Signed-off-by: Athira Rajeev 
>> ---
>> arch/powerpc/include/asm/perf_event_server.h | 10 --
>> arch/powerpc/perf/core-book3s.c  | 53 
>> +---
>> arch/powerpc/perf/isa207-common.c| 20 +--
>> arch/powerpc/perf/isa207-common.h|  4 +--
>> arch/powerpc/perf/mpc7450-pmu.c  | 21 +++
>> arch/powerpc/perf/power5+-pmu.c  | 17 -
>> arch/powerpc/perf/power5-pmu.c   | 17 -
>> arch/powerpc/perf/power6-pmu.c   | 16 -
>> arch/powerpc/perf/power7-pmu.c   | 17 -
>> arch/powerpc/perf/ppc970-pmu.c   | 24 ++---
>> 10 files changed, 105 insertions(+), 94 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/perf_event_server.h 
>> b/arch/powerpc/include/asm/perf_event_server.h
>> index 3e9703f..f9a3668 100644
>> --- a/arch/powerpc/include/asm/perf_event_server.h
>> +++ b/arch/powerpc/include/asm/perf_event_server.h
>> @@ -17,6 +17,12 @@
>> 
>> struct perf_event;
>> 
>> +struct mmcr_regs {
>> +   unsigned long mmcr0;
>> +   unsigned long mmcr1;
>> +   unsigned long mmcr2;
>> +   unsigned long mmcra;
>> +};
>> /*
>>  * This struct provides the constants and functions needed to
>>  * describe the PMU on a particular POWER-family CPU.
>> @@ -28,7 +34,7 @@ struct power_pmu {
>>unsigned long   add_fields;
>>unsigned long   test_adder;
>>int (*compute_mmcr)(u64 events[], int n_ev,
>> -   unsigned int hwc[], unsigned long mmcr[],
>> +   unsigned int hwc[], struct mmcr_regs *mmcr,
>>struct perf_event *pevents[]);
>>int (*get_constraint)(u64 event_id, unsigned long *mskp,
>>unsigned long *valp);
>> @@ -41,7 +47,7 @@ struct power_pmu {
>>unsigned long   group_constraint_val;
>>u64 (*bhrb_filter_map)(u64 branch_sample_type);
>>void(*config_bhrb)(u64 pmu_bhrb_filter);
>> -   void(*disable_pmc)(unsigned int pmc, unsigned long 
>> mmcr[]);
>> +   void(*disable_pmc)(unsigned int pmc, struct mmcr_regs 
>> *mmcr);
>>int (*limited_pmc_event)(u64 event_id);
>>u32 flags;
>>const struct attribute_group**attr_groups;
>> diff --git a/arch/powerpc/perf/core-book3s.c 
>> b/arch/powerpc/perf/core-book3s.c
>> index cd6a742..18b1b6a 100644
>> --- a/arch/powerpc/perf/core-book3s.c
>> +++ b/arch/powerpc/perf/core-book3s.c
>> @@ -37,12 +37,7 @@ struct cpu_hw_events {
>>struct perf_event *event[MAX_HWEVENTS];
>>u64 events[MAX_HWEVENTS];
>>unsigned int flags[MAX_HWEVENTS];
>> -   /*
>> -* The order of the MMCR array is:
>> -*  - 64-bit, MMCR0, MMCR1, MMCRA, MMCR2
>> -*  - 32-bit, MMCR0, MMCR1, MMCR2
>> -*/
>> -   unsigned long mmcr[4];
>> +   struct mmcr_regs mmcr;
>>struct perf_event *limited_counter[MAX_LIMITED_HWCOUNTERS];
>>u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
>>u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES];
>> @@ -121,7 +116,7 @@ static void ebb_event_add(struct perf_event *event) { }
>> static void ebb_switch_out(unsigned long mmcr0) { }
>> static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
>> {
>> -   return cpuhw->mmcr[0];
>> +   return cpuhw->mmcr.mmcr0;
>> }
>> 
>> static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
>> @@ -590,7 +585,7 @@ static void ebb_switch_out(unsigned long mmcr0)
>> 
>> static unsigned long ebb_switch_in(bool ebb, struct cpu_hw_events *cpuhw)
>> {
>> -   unsigned long mmcr0 = cpuhw->mmcr[0];
>> +   unsigned long mmcr0 = cpuhw->mmcr.mmcr0;
>> 
>>if (!ebb)
>>goto out;
>> @@ -624,7 +619,7 @@ static unsigned long ebb_switch_in(bool ebb, struct 
>> cpu_hw_events *cpuhw)
>> * unfreeze counters, it should not set exclude_xxx in its events and
>> * instead manage the MMCR2 entirely by itself.
>> */
>> -   mtspr(SPRN_MMCR2, cpuhw->mmcr[3] | current->thread.mmcr2);
>> +   mtspr(SPRN_MMCR2, 

Re: [v3 02/15] KVM: PPC: Book3S HV: Cleanup updates for kvm vcpu MMCR

2020-07-21 Thread Athira Rajeev


> On 21-Jul-2020, at 9:24 AM, Paul Mackerras  wrote:
> 
> On Fri, Jul 17, 2020 at 10:38:14AM -0400, Athira Rajeev wrote:
>> Currently `kvm_vcpu_arch` stores all Monitor Mode Control registers
>> in a flat array in order: mmcr0, mmcr1, mmcra, mmcr2, mmcrs
>> Split this to give mmcra and mmcrs its own entries in vcpu and
>> use a flat array for mmcr0 to mmcr2. This patch implements this
>> cleanup to make code easier to read.
> 
> Changing the way KVM stores these values internally is fine, but
> changing the user ABI is not.  This part:
> 
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
>> b/arch/powerpc/include/uapi/asm/kvm.h
>> index 264e266..e55d847 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -510,8 +510,8 @@ struct kvm_ppc_cpu_char {
>> 
>> #define KVM_REG_PPC_MMCR0(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10)
>> #define KVM_REG_PPC_MMCR1(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11)
>> -#define KVM_REG_PPC_MMCRA   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
>> -#define KVM_REG_PPC_MMCR2   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
>> +#define KVM_REG_PPC_MMCR2   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12)
>> +#define KVM_REG_PPC_MMCRA   (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x13)
> 
> means that existing userspace programs that used to work would now be
> broken.  That is not acceptable (breaking the user ABI is only ever
> acceptable with a very compelling reason).  So NAK to this part of the
> patch.

Hi Paul

Thanks for checking the patch. I understand your point about the user ABI
breakage that this particular change can cause.
I will retain the original KVM_REG_PPC_MMCRA and KVM_REG_PPC_MMCR2 order in
`kvm.h`. With that, I will additionally need the change below (on top of the
current patch) for my cleanup of the kvm vcpu MMCR code to work, because
mmcra and mmcrs now have their own entries in the vcpu and are no longer part
of the mmcr[] array.
Please suggest if this looks good.

Thanks
Athira 

—
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3f90eee261fc..b10bb404f0d5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1679,10 +1679,13 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
case KVM_REG_PPC_UAMOR:
*val = get_reg_val(id, vcpu->arch.uamor);
break;
-   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR2:
+   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
i = id - KVM_REG_PPC_MMCR0;
*val = get_reg_val(id, vcpu->arch.mmcr[i]);
break;
+   case KVM_REG_PPC_MMCR2:
+   *val = get_reg_val(id, vcpu->arch.mmcr[2]);
+   break;
case KVM_REG_PPC_MMCRA:
*val = get_reg_val(id, vcpu->arch.mmcra);
break;
@@ -1906,10 +1909,13 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
case KVM_REG_PPC_UAMOR:
vcpu->arch.uamor = set_reg_val(id, *val);
break;
-   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR2:
+   case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCR1:
i = id - KVM_REG_PPC_MMCR0;
vcpu->arch.mmcr[i] = set_reg_val(id, *val);
break;
+   case KVM_REG_PPC_MMCR2:
+   vcpu->arch.mmcr[2] = set_reg_val(id, *val);
+   break;
case KVM_REG_PPC_MMCRA:
vcpu->arch.mmcra = set_reg_val(id, *val);
break;
—


> 
> Regards,
> Paul.



Re: [PATCH 1/2] powerpc/numa: Limit possible nodes to within num_possible_nodes

2020-07-21 Thread Nathan Lynch
Srikar Dronamraju  writes:
> MAX_NUMNODES is the theoretical maximum number of nodes that is supported
> by the kernel. Device tree properties expose the number of possible
> nodes on the current platform. The kernel detects this and uses it
> for most of its resource allocations.  If the platform now
> increases the nodes beyond what was already exposed, then it may lead
> to inconsistencies. Hence limit it to the already exposed nodes.
>
> Suggested-by: Nathan Lynch 
> Cc: linuxppc-dev 
> Cc: Michael Ellerman 
> Cc: Nathan Lynch 
> Cc: Tyrel Datwyler 
> Signed-off-by: Srikar Dronamraju 
> ---
>  arch/powerpc/mm/numa.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 8ec7ff05ae47..a2c5fe0d0cad 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -221,7 +221,7 @@ static void initialize_distance_lookup_table(int nid,
>   }
>  }
>  
> -/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
> +/* Returns nid in the range [0..num_possible_nodes()-1], or -1 if no useful 
> numa
>   * info is found.
>   */
>  static int associativity_to_nid(const __be32 *associativity)
> @@ -235,7 +235,7 @@ static int associativity_to_nid(const __be32 
> *associativity)
>   nid = of_read_number([min_common_depth], 1);
>  
>   /* POWER4 LPAR uses 0x as invalid node */
> - if (nid == 0x || nid >= MAX_NUMNODES)
> + if (nid == 0x || nid >= num_possible_nodes())
>   nid = NUMA_NO_NODE;
>  
>   if (nid > 0 &&
> @@ -448,7 +448,7 @@ static int of_drconf_to_nid_single(struct drmem_lmb *lmb)
>   index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
>   nid = of_read_number([index], 1);
>  
> - if (nid == 0x || nid >= MAX_NUMNODES)
> + if (nid == 0x || nid >= num_possible_nodes())
>   nid = default_nid;
>  
>   if (nid > 0) {

Yes, looks fine to me.

Reviewed-by: Nathan Lynch 


[PATCH] powerpc/40x: Fix assembler warning about r0

2020-07-21 Thread Michael Ellerman
The assembler says:
  arch/powerpc/kernel/head_40x.S:623: Warning: invalid register expression

It's objecting to the use of r0 as the RA argument. That's because
when RA = 0 the literal value 0 is used, rather than the content of
r0, making the use of r0 in the source potentially confusing.

Fix it to use a literal 0, the generated code is identical.
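
For example (an illustrative sketch of the convention, not taken from the
kernel sources), in D-form stores the RA field selects the base:

	stw	r5, 0xf0(0)	/* store to absolute address 0xf0: base is the literal 0 */
	stw	r5, 0xf0(r3)	/* store to r3 + 0xf0: base is register r3 */

Writing "0xf0(r0)" in the source assembles to the first form anyway, which is
why the assembler warns about it.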

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/head_40x.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S
index 926bfa73586a..5b282d9965a5 100644
--- a/arch/powerpc/kernel/head_40x.S
+++ b/arch/powerpc/kernel/head_40x.S
@@ -620,7 +620,7 @@ _ENTRY(saved_ksp_limit)
ori r6, r6, swapper_pg_dir@l
lis r5, abatron_pteptrs@h
ori r5, r5, abatron_pteptrs@l
-   stw r5, 0xf0(r0)/* Must match your Abatron config file */
+   stw r5, 0xf0(0) /* Must match your Abatron config file */
tophys(r5,r5)
stw r6, 0(r5)
 
-- 
2.25.1



Re: [RFC PATCH] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register

2020-07-21 Thread Benjamin Herrenschmidt
On Wed, 2020-07-22 at 12:06 +1000, Michael Ellerman wrote:
> Ram Pai  writes:
> > An instruction accessing a mmio address, generates a HDSI fault.  This 
> > fault is
> > appropriately handled by the Hypervisor.  However in the case of secureVMs, 
> > the
> > fault is delivered to the ultravisor.
> > 
> > Unfortunately the Ultravisor has no correct-way to fetch the faulting
> > instruction. The PEF architecture does not allow Ultravisor to enable MMU
> > translation. Walking the two level page table to read the instruction can 
> > race
> > with other vcpus modifying the SVM's process scoped page table.
> 
> You're trying to read the guest's kernel text IIUC, that mapping should
> be stable. Possibly permissions on it could change over time, but the
> virtual -> real mapping should not.

This will of course no longer work if userspace tries to access MMIO
but as long as you are ok limiting MMIO usage to the kernel, that's one
way.

iirc MMIO emulation in KVM on powerpc already has that race because of
HW TLB inval broadcast and it hasn't hurt anybody ... so far.

> > This problem can be correctly solved with some help from the kernel.
> > 
> > Capture the faulting instruction in SPRG0 register, before executing the
> > faulting instruction. This enables the ultravisor to easily procure the
> > faulting instruction and emulate it.
> 
> This is not something I'm going to merge. Sorry.

Another approach is to have the guest explicitly switch to using UV
calls for MMIO.

Cheers,
Ben.




Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Benjamin Herrenschmidt
On Tue, 2020-07-21 at 16:48 -0700, Palmer Dabbelt wrote:
> > Why ? Branch distance limits ? You can't use trampolines ?
> 
> Nothing fundamental, it's just that we don't have a large code model in the C
> compiler.  As a result all the global symbols are resolved as 32-bit
> PC-relative accesses.  We could fix this with a fast large code model, but 
> then
> the kernel would need to relax global symbol references in modules and we 
> don't
> even do that for the simple code models we have now.  FWIW, some of the
> proposed large code models are essentially just split-PLT/GOT and therefore
> don't require relaxation, but at that point we're essentially PIC until we
> have more than 2GiB of kernel text -- and even then, we keep all the
> performance issues.

My memory might be out of date but I *think* we do it on powerpc
without going to a large code model, but just having the in-kernel
linker insert trampolines.

Cheers,
Ben.




Re: [RFC PATCH] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register

2020-07-21 Thread Michael Ellerman
Ram Pai  writes:
> An instruction accessing a mmio address, generates a HDSI fault.  This fault 
> is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, 
> the
> fault is delivered to the ultravisor.
>
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.

You're trying to read the guest's kernel text IIUC, that mapping should
be stable. Possibly permissions on it could change over time, but the
virtual -> real mapping should not.

> This problem can be correctly solved with some help from the kernel.
>
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.

This is not something I'm going to merge. Sorry.

cheers


[PATCHv3 2/2] powerpc/pseries: update device tree before ejecting hotplug uevents

2020-07-21 Thread Pingfan Liu
A bug is observed on pseries by taking the following steps on RHEL:
-1. drmgr -c mem -r -q 5
-2. echo c > /proc/sysrq-trigger

And then, the failure looks like:
kdump: saving to /sysroot//var/crash/127.0.0.1-2020-01-16-02:06:14/
kdump: saving vmcore-dmesg.txt
kdump: saving vmcore-dmesg.txt complete
kdump: saving vmcore
 Checking for memory holes : [  0.0 %] /
   Checking for memory holes : [100.0 %] |  
 Excluding unnecessary pages   : [100.0 %] \
   Copying data  : [  0.3 %] -  
eta: 38s[   44.337636] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
access=0x8004 current=makedumpfile
[   44.337663] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
psize 2 pte=0xc0005504
[   44.337677] hash-mmu: mm: Hashing failure ! EA=0x7fffba40 
access=0x8004 current=makedumpfile
[   44.337692] hash-mmu: trap=0x300 vsid=0x13a109c ssize=1 base psize=2 
psize 2 pte=0xc0005504
[   44.337708] makedumpfile[469]: unhandled signal 7 at 7fffba40 nip 
7fffbbc4d7fc lr 00011356ca3c code 2
[   44.338548] Core dump to |/bin/false pipe failed
/lib/kdump-lib-initramfs.sh: line 98:   469 Bus error   
$CORE_COLLECTOR /proc/vmcore 
$_mp/$KDUMP_PATH/$HOST_IP-$DATEDIR/vmcore-incomplete
kdump: saving vmcore failed

* Root cause *
  After analysis, it turns out that in the current implementation,
when hot-removing an lmb, the KOBJ_REMOVE uevent is emitted before the
device tree is updated, because __remove_memory() runs before
drmem_update_dt(). In the kdump kernel, read_from_oldmem() then resorts to
pSeries_lpar_hpte_insert() to install an hpte, which fails with -2 due to
the non-existent pfn. Finally, low_hash_fault() raises SIGBUS to the
process, which is observed as the "Bus error" above.

From the viewpoint of publisher and listener, the publisher notifies the
listener before the data is ready.  This introduces a problem where udev
launches kexec-tools (due to KOBJ_REMOVE) and loads a stale device tree,
taken before the update. In the capture kernel, makedumpfile then accesses
memory based on the stale device tree info and hits a SIGBUS error on an
lmb that no longer exists.

* Fix *
  In order to fix this issue, update the device tree before
__remove_memory(), and apply the same rule in the hot-add path.

This introduces an extra device tree update for each lmb involved in a
hotplug operation. But that should be fine, since drmem_update_dt() is a
memory-based operation and hotplug is not a hot path.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Hari Bathini 
Cc: Nathan Lynch 
To: linuxppc-dev@lists.ozlabs.org
Cc: ke...@lists.infradead.org
---
v2 -> v3: rebase onto ppc next-test branch
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 1a3ac3b..def8cb3f 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -372,6 +372,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
invalidate_lmb_associativity_index(lmb);
lmb_clear_nid(lmb);
lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+   drmem_update_dt();
 
__remove_memory(nid, base_addr, block_sz);
 
@@ -607,6 +608,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
lmb_set_nid(lmb);
lmb->flags |= DRCONF_MEM_ASSIGNED;
+   drmem_update_dt();
 
block_sz = memory_block_size_bytes();
 
@@ -625,6 +627,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
invalidate_lmb_associativity_index(lmb);
lmb_clear_nid(lmb);
lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+   drmem_update_dt();
 
__remove_memory(nid, base_addr, block_sz);
}
@@ -877,9 +880,6 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
break;
}
 
-   if (!rc)
-   rc = drmem_update_dt();
-
unlock_device_hotplug();
return rc;
 }
-- 
2.7.5



[PATCHv3 1/2] powerpc/pseries: group lmb operation and memblock's

2020-07-21 Thread Pingfan Liu
This patch prepares for the next patch, which swaps the order of the
KOBJ_ uevent and the device tree update.

It has no functional effect; it just groups the lmb operations and the
memblock operations so that the device tree update can be inserted easily,
and so the change is easier to review.

Signed-off-by: Pingfan Liu 
Cc: Michael Ellerman 
Cc: Hari Bathini 
Cc: Nathan Lynch 
To: linuxppc-dev@lists.ozlabs.org
Cc: ke...@lists.infradead.org
---
v2 -> v3: rebase onto ppc next-test branch
---
 arch/powerpc/platforms/pseries/hotplug-memory.c | 26 -
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5d545b7..1a3ac3b 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -355,7 +355,8 @@ static int dlpar_add_lmb(struct drmem_lmb *);
 static int dlpar_remove_lmb(struct drmem_lmb *lmb)
 {
unsigned long block_sz;
-   int rc;
+   phys_addr_t base_addr;
+   int rc, nid;
 
if (!lmb_is_removable(lmb))
return -EINVAL;
@@ -364,17 +365,19 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
if (rc)
return rc;
 
+   base_addr = lmb->base_addr;
+   nid = lmb->nid;
block_sz = pseries_memory_block_size();
 
-   __remove_memory(lmb->nid, lmb->base_addr, block_sz);
-
-   /* Update memory regions for memory remove */
-   memblock_remove(lmb->base_addr, block_sz);
-
invalidate_lmb_associativity_index(lmb);
lmb_clear_nid(lmb);
lmb->flags &= ~DRCONF_MEM_ASSIGNED;
 
+   __remove_memory(nid, base_addr, block_sz);
+
+   /* Update memory regions for memory remove */
+   memblock_remove(base_addr, block_sz);
+
return 0;
 }
 
@@ -603,6 +606,8 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
}
 
lmb_set_nid(lmb);
+   lmb->flags |= DRCONF_MEM_ASSIGNED;
+
block_sz = memory_block_size_bytes();
 
/* Add the memory */
@@ -614,11 +619,14 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
rc = dlpar_online_lmb(lmb);
if (rc) {
-   __remove_memory(lmb->nid, lmb->base_addr, block_sz);
+   int nid = lmb->nid;
+   phys_addr_t base_addr = lmb->base_addr;
+
invalidate_lmb_associativity_index(lmb);
lmb_clear_nid(lmb);
-   } else {
-   lmb->flags |= DRCONF_MEM_ASSIGNED;
+   lmb->flags &= ~DRCONF_MEM_ASSIGNED;
+
+   __remove_memory(nid, base_addr, block_sz);
}
 
return rc;
-- 
2.7.5



Re: [PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-21 Thread Alexey Kardashevskiy



On 22/07/2020 08:13, Leonardo Bras wrote:
> On Tue, 2020-07-21 at 14:59 +1000, Alexey Kardashevskiy wrote:
>>
>> On 16/07/2020 17:16, Leonardo Bras wrote:
>>> Move the part of iommu_table_free() that does struct iommu_table cleaning
>>> into iommu_table_clean, so we can invoke it separately.
>>>
>>> This new function is useful for cleaning struct iommu_table before
>>> initializing it again with a new DMA window, without having it freed and
>>> allocated again.
>>>
>>> Signed-off-by: Leonardo Bras 
>>> ---
>>>  arch/powerpc/kernel/iommu.c | 30 ++
>>>  1 file changed, 18 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index 9704f3f76e63..c3242253a4e7 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct 
>>> iommu_table *tbl, int nid,
>>> return tbl;
>>>  }
>>>  
>>> -static void iommu_table_free(struct kref *kref)
>>> +static void iommu_table_clean(struct iommu_table *tbl)
>>
>> iommu_table_free() + iommu_init_table() + set_iommu_table_base() should
>> work too, why new helper?
> 
> iommu_table_free() also frees the tbl, which would need allocate it
> again (new address) and to fill it up again, unnecessarily. 

It is a new table in fact, everything is new there. You are only saving
kfree+kzalloc which does not seem a huge win.

Also, iommu_table_update() simply assumes 64bit window by passing
res_start=res_end=0 to iommu_init_table() which is not horribly robust
either. Yeah, I know, iommu_init_table() is always called with zeroes in
pseries but this is somewhat ok as those tables are from the device tree
and those windows don't overlap with 32bit MMIO but under KVM they will
(well, if we hack QEMU to advertise a single window).

I suggest removing iommu_pseries_table_update() from 6/7 and do
iommu_table_free() + iommu_init_table() + set_iommu_table_base() with a
WARN_ON(pdev->dev.archdata.dma_offset>=SZ_4G), may be even do this all
in enable_ddw() where we know for sure if it is 1:1 mapping or just a
big window.
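
A very rough sketch of that sequence, assuming it runs at the end of
enable_ddw() once the new 64-bit window exists (the helper name
iommu_pseries_alloc_table() and the error handling here are assumptions
for illustration, not code from this patchset):

	struct iommu_table *tbl = pci->table_group->tables[0];
	struct iommu_table *newtbl;

	/* sanity check suggested above */
	WARN_ON(pdev->dev.archdata.dma_offset >= SZ_4G);

	/* hypothetical pseries allocator for a fresh table */
	newtbl = iommu_pseries_alloc_table(pci->phb->node);
	if (newtbl) {
		/* fill in liobn/offset/size for the new window, then:
		 * res_start = res_end = 0 assumes the window does not
		 * overlap 32-bit MMIO */
		iommu_init_table(newtbl, pci->phb->node, 0, 0);

		/* drop the old table and switch the device over */
		iommu_tce_table_put(tbl);
		pci->table_group->tables[0] = newtbl;
		set_iommu_table_base(&pdev->dev, newtbl);
	}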

Out of curiosity - what page sizes does pHyp advertise in "query"?


> I think it's a better approach to only change what is needed.
> 
>> There is also iommu_table_clear() which does a different thing so you
>> need a better name.
> 
> I agree.
> I had not noticed this other function before sending the patchset. What
> would be a better name though? __iommu_table_free()? 
> 
>> Second, iommu_table_free
>> use and it would be ok as we would only see this when hot-unplugging a
>> PE because we always kept the default window.
>> Btw you must be seeing these warnings now every time you create DDW with
>> these patches as at least the first page is reserved, do not you?
> 
> It does not print a warning.
> I noticed other warnings,

And what are these?

> but not this one from iommu_table_free():
> /* verify that table contains no entries */
> if (!bitmap_empty(tbl->it_map, tbl->it_size))
>   pr_warn("%s: Unexpected TCEs\n", __func__);
> 
> Before that, iommu_table_release_pages(tbl) is supposed to clear the 
> bitmap, so this only tests for a tce that is created in this short period.


iommu_table_release_pages() only clears reserved pages - page 0 (just a
protection against NULL DMA pointers) and 32bit MMIO (these should not
be set for 64bit window). The "%s: Unexpected TCEs\n" is what checks for
actual mapped TCEs.


>> Since we are replacing a table for a device which is still in the
>> system, we should not try messing with its DMA if it already has
>> mappings so the warning should become an error preventing DDW. It is
>> rather hard to trigger in practice but I could hack a driver to ask for
>> 32bit DMA mask first, map few pages and then ask for 64bit DMA mask, it
>> is not illegal, I think. So this needs a new helper - "bool
>> iommu_table_in_use(tbl)" - to use in enable_ddw(). Or I am overthinking
>> this?... Thanks,
> 
> As of today, there seems to be nothing like that happening in the
> driver I am testing. 
> I spoke to Brian King on slack, and he mentioned that at the point DDW
> is created there should be no allocations in place.

Correct, there should not be. But it is also not a huge effort to fall
back if there are.


> But I suppose some driver could try to do this.
> 
> Maybe a better approach would be removing the mapping only if the
> default window is removed (at the end of enable_ddw, as an else to
> resetting the default DMA window), and having a way to add more
> mappings to those pools. But this last part doesn't look so simple, and
> it would be better to understand if it's necessary investing work in
> this.
> 
> What do you think?

Add iommu_table_in_use(tbl) and fail DDW if that says "yes".
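
For reference, a minimal sketch of what such a helper could look like,
assuming the it_map/it_size/it_offset and it_reserved_start/it_reserved_end
fields that iommu_table_reserve_pages()/iommu_table_release_pages() already
use (the degenerate case of an empty reserved range is glossed over here):

	static bool iommu_table_in_use(struct iommu_table *tbl)
	{
		unsigned long start = 0, end;

		/* TCE #0 is often kept reserved to catch NULL DMA pointers */
		if (tbl->it_offset == 0)
			start = 1;

		/* any TCE mapped below the reserved (32-bit MMIO) range? */
		end = tbl->it_reserved_start - tbl->it_offset;
		if (find_next_bit(tbl->it_map, end, start) != end)
			return true;

		/* ... or above it? */
		start = tbl->it_reserved_end - tbl->it_offset;
		end = tbl->it_size;

		return find_next_bit(tbl->it_map, end, start) != end;
	}

enable_ddw() could then bail out early, keeping the default window,
whenever this returns true.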



-- 
Alexey


Re: [PATCH 5/5] powerpc sstep: Add tests for Prefixed Add Immediate

2020-07-21 Thread Jordan Niethe
On Mon, May 25, 2020 at 1:00 PM Jordan Niethe  wrote:
>
> Use the existing support for testing compute type instructions to test
> Prefixed Add Immediate (paddi).  The R bit of the paddi instruction
> controls whether current instruction address is used. Add test cases for
> when R=1 and for R=0. paddi has a 34 bit immediate field formed by
> concatenating si0 and si1. Add tests for the extreme values of this
> field.
>
> Skip the paddi tests if ISA v3.1 is unsupported.
>
> Some of these test cases were added by Balamuruhan S.
>
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/lib/test_emulate_step.c  | 127 ++
>  .../lib/test_emulate_step_exec_instr.S|   1 +
>  2 files changed, 128 insertions(+)
>
> diff --git a/arch/powerpc/lib/test_emulate_step.c 
> b/arch/powerpc/lib/test_emulate_step.c
> index 579b5db80674..33a72b7d2764 100644
> --- a/arch/powerpc/lib/test_emulate_step.c
> +++ b/arch/powerpc/lib/test_emulate_step.c
> @@ -105,6 +105,13 @@
> ___PPC_RA(a) | ___PPC_RB(b))
>  #define TEST_ADDC_DOT(t, a, b) ppc_inst(PPC_INST_ADDC | ___PPC_RT(t) |   
>   \
> ___PPC_RA(a) | ___PPC_RB(b) | 0x1)
> +#define TEST_PADDI(t, a, i, pr)ppc_inst_prefix(PPC_PREFIX_MLS | 
> __PPC_PRFX_R(pr) | \
> +   IMM_H(i),   \
> +   PPC_INST_ADDI | \
> +   ___PPC_RT(t) | ___PPC_RA(a) |   \
> +   IMM_L(i))
> +
> +
>
>  #define MAX_SUBTESTS   16
>
> @@ -699,6 +706,11 @@ struct compute_test {
> } subtests[MAX_SUBTESTS + 1];
>  };
>
> +/* Extreme values for si0||si1 (the MLS:D-form 34 bit immediate field) */
> +#define SI_MIN BIT(33)
> +#define SI_MAX (BIT(33) - 1)
> +#define SI_UMAX (BIT(34) - 1)
> +
>  static struct compute_test compute_tests[] = {
> {
> .mnemonic = "nop",
> @@ -1071,6 +1083,121 @@ static struct compute_test compute_tests[] = {
> }
> }
> }
> +   },
> +   {
> +   .mnemonic = "paddi",
> +   .cpu_feature = CPU_FTR_ARCH_31,
> +   .subtests = {
> +   {
> +   .descr = "RA = LONG_MIN, SI = SI_MIN, R = 0",
> +   .instr = TEST_PADDI(21, 22, SI_MIN, 0),
> +   .regs = {
> +   .gpr[21] = 0,
> +   .gpr[22] = LONG_MIN,
> +   }
> +   },
> +   {
> +   .descr = "RA = LONG_MIN, SI = SI_MAX, R = 0",
> +   .instr = TEST_PADDI(21, 22, SI_MAX, 0),
> +   .regs = {
> +   .gpr[21] = 0,
> +   .gpr[22] = LONG_MIN,
> +   }
> +   },
> +   {
> +   .descr = "RA = LONG_MAX, SI = SI_MAX, R = 0",
> +   .instr = TEST_PADDI(21, 22, SI_MAX, 0),
> +   .regs = {
> +   .gpr[21] = 0,
> +   .gpr[22] = LONG_MAX,
> +   }
> +   },
> +   {
> +   .descr = "RA = ULONG_MAX, SI = SI_UMAX, R = 
> 0",
> +   .instr = TEST_PADDI(21, 22, SI_UMAX, 0),
> +   .regs = {
> +   .gpr[21] = 0,
> +   .gpr[22] = ULONG_MAX,
> +   }
> +   },
> +   {
> +   .descr = "RA = ULONG_MAX, SI = 0x1, R = 0",
> +   .instr = TEST_PADDI(21, 22, 0x1, 0),
> +   .regs = {
> +   .gpr[21] = 0,
> +   .gpr[22] = ULONG_MAX,
> +   }
> +   },
> +   {
> +   .descr = "RA = INT_MIN, SI = SI_MIN, R = 0",
> +   .instr = TEST_PADDI(21, 22, SI_MIN, 0),
> +   .regs = {
> +   .gpr[21] = 0,
> +   .gpr[22] = INT_MIN,
> +   }
> +   },
> +   {
> +   .descr = "RA = INT_MIN, SI = SI_MAX, R = 0",
> +   .instr = TEST_PADDI(21, 22, SI_MAX, 0),
> +   

Re: [PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-21 Thread Brian King
On 7/21/20 5:13 PM, Leonardo Bras wrote:
> On Tue, 2020-07-21 at 14:59 +1000, Alexey Kardashevskiy wrote:
>>
>> On 16/07/2020 17:16, Leonardo Bras wrote:
>>> Move the part of iommu_table_free() that does struct iommu_table cleaning
>>> into iommu_table_clean, so we can invoke it separately.
>>>
>>> This new function is useful for cleaning struct iommu_table before
>>> initializing it again with a new DMA window, without having it freed and
>>> allocated again.
>>>
>>> Signed-off-by: Leonardo Bras 
>>> ---
>>>  arch/powerpc/kernel/iommu.c | 30 ++
>>>  1 file changed, 18 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
>>> index 9704f3f76e63..c3242253a4e7 100644
>>> --- a/arch/powerpc/kernel/iommu.c
>>> +++ b/arch/powerpc/kernel/iommu.c
>>> @@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct 
>>> iommu_table *tbl, int nid,
>>> return tbl;
>>>  }
>>>  
>>> -static void iommu_table_free(struct kref *kref)
>>> +static void iommu_table_clean(struct iommu_table *tbl)
>>
>> iommu_table_free() + iommu_init_table() + set_iommu_table_base() should
>> work too, why new helper?
> 
> iommu_table_free() also frees the tbl, which would need allocate it
> again (new address) and to fill it up again, unnecessarily. 
> I think it's a better approach to only change what is needed.
> 
>> There is also iommu_table_clear() which does a different thing so you
>> need a better name.
> 
> I agree.
> I had not noticed this other function before sending the patchset. What
> would be a better name though? __iommu_table_free()? 
> 
>> Second, iommu_table_free
>> use and it would be ok as we would only see this when hot-unplugging a
>> PE because we always kept the default window.
>> Btw you must be seeing these warnings now every time you create DDW with
>> these patches as at least the first page is reserved, do not you?
> 
> It does not print a warning.
> I noticed other warnings, but not this one from iommu_table_free():
> /* verify that table contains no entries */
> if (!bitmap_empty(tbl->it_map, tbl->it_size))
>   pr_warn("%s: Unexpected TCEs\n", __func__);
> 
> Before that, iommu_table_release_pages(tbl) is supposed to clear the 
> bitmap, so this only tests for a tce that is created in this short period.
> 
>> Since we are replacing a table for a device which is still in the
>> system, we should not try messing with its DMA if it already has
>> mappings so the warning should become an error preventing DDW. It is
>> rather hard to trigger in practice but I could hack a driver to ask for
>> 32bit DMA mask first, map few pages and then ask for 64bit DMA mask, it
>> is not illegal, I think. So this needs a new helper - "bool
>> iommu_table_in_use(tbl)" - to use in enable_ddw(). Or I am overthinking
>> this?... Thanks,
> 
> As of today, there seems to be nothing like that happening in the
> driver I am testing. 
> I spoke to Brian King on slack, and he mentioned that at the point DDW
> is created there should be no allocations in place.

I think there are a couple of scenarios here. One is where there is a DMA
allocation prior to a call to set the DMA mask. The second scenario is if the
driver makes multiple calls to set the DMA mask. I would argue that a properly
written driver should tell the IOMMU subsystem what DMA mask it supports prior
to allocating DMA memory. Documentation/core-api/dma-api-howto.rst should
describe what is legal and what is not.

It might be reasonable to declare that it's not allowed to allocate DMA memory
and then later change the DMA mask, and clearly call this out in the
documentation if it's not already there.
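
As a rough illustration of that ordering (a generic probe-time sketch, not
code from this thread; device specifics and cleanup are omitted):

	#include <linux/dma-mapping.h>
	#include <linux/pci.h>

	static int example_probe(struct pci_dev *pdev,
				 const struct pci_device_id *id)
	{
		dma_addr_t ring_dma;
		void *ring;
		int err;

		err = pci_enable_device(pdev);
		if (err)
			return err;

		/* 1. Declare the DMA mask before any DMA allocation ... */
		err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
		if (err)
			err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
		if (err)
			return err;

		/* 2. ... and only then allocate DMA memory. */
		ring = dma_alloc_coherent(&pdev->dev, 4096, &ring_dma, GFP_KERNEL);
		if (!ring)
			return -ENOMEM;

		/* device setup continues here */
		return 0;
	}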

-Brian


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center



Re: [PATCH v2 1/3] powerpc: inline doorbell sending functions

2020-07-21 Thread Michael Ellerman
Nicholas Piggin  writes:
> These are only called in one place for a given platform, so inline them
> for performance.
>
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/include/asm/dbell.h | 63 ++--
>  arch/powerpc/kernel/dbell.c  | 55 
>  2 files changed, 60 insertions(+), 58 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/dbell.h 
> b/arch/powerpc/include/asm/dbell.h
> index 4ce6808deed3..f19d2282e3f8 100644
> --- a/arch/powerpc/include/asm/dbell.h
> +++ b/arch/powerpc/include/asm/dbell.h
> @@ -13,6 +13,7 @@
>  
>  #include 
>  #include 
> +#include 
>  
>  #define PPC_DBELL_MSG_BRDCAST(0x0400)
>  #define PPC_DBELL_TYPE(x)(((x) & 0xf) << (63-36))


This somehow breaks ppc40x_defconfig and others:

In file included from /home/michael/linux/arch/powerpc/include/asm/kvm_ppc.h:24,
 from /home/michael/linux/arch/powerpc/include/asm/dbell.h:16,
 from /home/michael/linux/arch/powerpc/kernel/asm-offsets.c:38:
/home/michael/linux/arch/powerpc/include/asm/kvm_booke.h: In function 
‘kvmppc_get_fault_dar’:
/home/michael/linux/arch/powerpc/include/asm/kvm_booke.h:94:19: error: ‘struct 
kvm_vcpu_arch’ has no member named ‘fault_dear’
   94 |  return vcpu->arch.fault_dear;
  |   ^
make[2]: *** [/home/michael/linux/scripts/Makefile.build:114: 
arch/powerpc/kernel/asm-offsets.s] Error 1
make[1]: *** [/home/michael/linux/Makefile:1175: prepare0] Error 2
make: *** [Makefile:185: __sub-make] Error 2

cheers


Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Palmer Dabbelt

On Tue, 21 Jul 2020 16:12:58 PDT (-0700), b...@kernel.crashing.org wrote:

On Tue, 2020-07-21 at 12:05 -0700, Palmer Dabbelt wrote:


* We waste vmalloc space on 32-bit systems, where there isn't a lot of it.
* On 64-bit systems the VA space around the kernel is precious because it's the
  only place we can place text (modules, BPF, whatever).


Why ? Branch distance limits ? You can't use trampolines ?


Nothing fundamental, it's just that we don't have a large code model in the C
compiler.  As a result all the global symbols are resolved as 32-bit
PC-relative accesses.  We could fix this with a fast large code model, but then
the kernel would need to relax global symbol references in modules and we don't
even do that for the simple code models we have now.  FWIW, some of the
proposed large code models are essentially just split-PLT/GOT and therefore
don't require relaxation, but at that point we're essentially PIC until we
have more than 2GiB of kernel text -- and even then, we keep all the
performance issues.


 If we start putting
  the kernel in the vmalloc space then we either have to pre-allocate a bunch
  of space around it (essentially making it a fixed mapping anyway) or it
  becomes likely that we won't be able to find space for modules as they're
  loaded into running systems.


I dislike the kernel being in the vmalloc space (see my other email)
but I don't understand the specific issue with modules.


Essentially what's above, the modules smell the same as the rest of the
kernel's code and therefore have a similar set of restrictions.  If we build PIC
modules and have the PLT entries do GOT loads (as do our shared libraries) then
we could break this restriction, but that comes with some performance
implications.  Like I said in the other email, I'm less worried about the
instruction side of things so maybe that's the right way to go.


* Relying on a relocatable kernel for sv48 support introduces a fairly large
  performance hit.


Out of curiosity why would relocatable kernels introduce a significant
hit ? Where about do you see the overhead coming from ?


Our PIC codegen, probably better addressed by my other email and above.




Roughly, my proposal would be to:

* Leave the 32-bit memory map alone.  On 32-bit systems we can load modules
  anywhere and we only have one VA width, so we're not really solving any
  problems with these changes.
* Statically allocate a 2GiB portion of the VA space for all our text, as its own
  region.  We'd link/relocate the kernel here instead of around PAGE_OFFSET,
  which would decouple the kernel from the physical memory layout of the system.
  This would have the side effect of sorting out a bunch of bootloader headaches
  that we currently have.
* Sort out how to maintain a linear map as the canonical hole moves around
  between the VA widths without adding a bunch of overhead to the virt2phys and
  friends.  This is probably going to be the trickiest part, but I think if we
  just change the page table code to essentially lie about VAs when an sv39
  system runs an sv48+sv39 kernel we could make it work -- there'd be some
  logical complexity involved, but it would remain fast.

This doesn't solve the problem of virtually relocatable kernels, but it does
let us decouple that from the sv48 stuff.  It also lets us stop relying on a
fixed physical address the kernel is loaded into, which is another thing I
don't like.

I know this may be a more complicated approach, but there aren't any sv48
systems around right now so I just don't see the rush to support them,
particularly when there's a cost to what already exists (for those who haven't
been watching, so far all the sv48 patch sets have imposed a significant
performance penalty on all systems).


Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Palmer Dabbelt

On Tue, 21 Jul 2020 16:11:02 PDT (-0700), b...@kernel.crashing.org wrote:

On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote:

> > I guess I don't understand why this is necessary at all.
> > Specifically: why
> > can't we just relocate the kernel within the linear map?  That would
> > let the
> > bootloader put the kernel wherever it wants, modulo the physical
> > memory size we
> > support.  We'd need to handle the regions that are coupled to the
> > kernel's
> > execution address, but we could just put them in an explicit memory
> > region
> > which is what we should probably be doing anyway.
>
> Virtual relocation in the linear mapping requires to move the kernel
> physically too. Zong implemented this physical move in its KASLR RFC
> patchset, which is cumbersome since finding an available physical spot
> is harder than just selecting a virtual range in the vmalloc range.
>
> In addition, having the kernel mapping in the linear mapping prevents
> the use of hugepage for the linear mapping resulting in performance loss
> (at least for the GB that encompasses the kernel).
>
> Why do you find this "ugly" ? The vmalloc region is just a bunch of
> available virtual addresses to whatever purpose we want, and as noted by
> Zong, arm64 uses the same scheme.


I don't get it :-)

At least on powerpc we move the kernel in the linear mapping and it
works fine with huge pages, what is your problem there ? You rely on
punching small-page size holes in there ?


That was my original suggestion, and I'm not actually sure it's invalid.  It
would mean that both the kernel's physical and virtual addresses are set by the
bootloader, which may or may not be workable if we want to have an sv48+sv39
kernel.  My initial approach to sv48+sv39 kernels would be to just throw away
the sv39 memory on sv48 kernels, which would preserve the linear map but mean
that there is no single physical address that's accessible for both.  That
would require some coordination between the bootloader and the kernel as to
where it should be loaded, but maybe there's a better way to design the linear
map.  Right now we have a bunch of unwritten rules about where things need to
be loaded, which is a recipe for disaster.

We could copy the kernel around, but I'm not sure I really like that idea.  We
do zero the BSS right now, so it's not like we entirely rely on the bootloader
to set up the kernel image, but with the hart race boot scheme we have right
now we'd at least need to leave a stub sitting around.  Maybe we just throw
away SBI v0.1, though, that's why we called it all legacy in the first place.

My bigger worry is that anything that involves running the kernel at arbitrary
virtual addresses means we need a PIC kernel, which means every global symbol
needs an indirection.  That's probably not so bad for shared libraries, but the
kernel has a lot of global symbols.  PLT references probably aren't so scary,
as we have an incoherent instruction cache so the virtual function predictor
isn't that hard to build, but making all global data accesses GOT-relative
seems like a disaster for performance.  This fixed-VA thing really just exists
so we don't have to be full-on PIC.

In theory I think we could just get away with pretending that medany is PIC,
which I believe works as long as the data and text offset stays constant,
you don't have any symbols between 2GiB and -2GiB (as those may stay fixed,
even in medany), and you deal with GP accordingly (which should work itself out
in the current startup code).  We rely on this for some of the early boot code
(and will soon for kexec), but that's a very controlled code base and we've
already had some issues.  I'd be much more comfortable adding an explicit
semi-PIC code model, as I tend to miss something when doing these sorts of
things and then we could at least add it to the GCC test runs and guarantee it
actually works.  Not really sure I want to deal with that, though.  It would,
however, be the only way to get random virtual addresses during kernel
execution.


At least in the old days, there were a number of assumptions that
the kernel text/data/bss resides in the linear mapping.


Ya, it terrified me as well.  Alex says arm64 puts the kernel in the vmalloc
region, so assuming that's the case it must be possible.  I didn't get that
from reading the arm64 port (I guess it's no secret that pretty much all I do
is copy their code)


If you change that you need to ensure that it's still physically
contiguous and you'll have to tweak __va and __pa, which might induce
extra overhead.


I'm operating under the assumption that we don't want to add an additional load
to virt2phys conversions.  arm64 bends over backwards to avoid the load, and
I'm assuming they have a reason for doing so.  Of course, if we're PIC then
maybe performance just doesn't matter, but I'm not sure I want to just give up.
Distros will probably build the sv48+sv39 kernels as soon as they show up, even
if there's no sv48 

Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Benjamin Herrenschmidt
On Tue, 2020-07-21 at 12:05 -0700, Palmer Dabbelt wrote:
> 
> * We waste vmalloc space on 32-bit systems, where there isn't a lot of it.
> * On 64-bit systems the VA space around the kernel is precious because it's 
> the
>   only place we can place text (modules, BPF, whatever). 

Why ? Branch distance limits ? You can't use trampolines ?

>  If we start putting
>   the kernel in the vmalloc space then we either have to pre-allocate a bunch
>   of space around it (essentially making it a fixed mapping anyway) or it
>   becomes likely that we won't be able to find space for modules as they're
>   loaded into running systems.

I dislike the kernel being in the vmalloc space (see my other email)
but I don't understand the specific issue with modules.

> * Relying on a relocatable kernel for sv48 support introduces a fairly large
>   performance hit.

Out of curiosity why would relocatable kernels introduce a significant
hit ? Where about do you see the overhead coming from ?

> Roughly, my proposal would be to:
> 
> * Leave the 32-bit memory map alone.  On 32-bit systems we can load modules
>   anywhere and we only have one VA width, so we're not really solving any
>   problems with these changes.
> * Statically allocate a 2GiB portion of the VA space for all our text, as its 
> own
>   region.  We'd link/relocate the kernel here instead of around PAGE_OFFSET,
>   which would decouple the kernel from the physical memory layout of the 
> system.
>   This would have the side effect of sorting out a bunch of bootloader 
> headaches
>   that we currently have.
> * Sort out how to maintain a linear map as the canonical hole moves around
>   between the VA widths without adding a bunch of overhead to the virt2phys 
> and
>   friends.  This is probably going to be the trickiest part, but I think if we
>   just change the page table code to essentially lie about VAs when an sv39
>   system runs an sv48+sv39 kernel we could make it work -- there'd be some
>   logical complexity involved, but it would remain fast.
> 
> This doesn't solve the problem of virtually relocatable kernels, but it does
> let us decouple that from the sv48 stuff.  It also lets us stop relying on a
> fixed physical address the kernel is loaded into, which is another thing I
> don't like.
> 
> I know this may be a more complicated approach, but there aren't any sv48
> systems around right now so I just don't see the rush to support them,
> particularly when there's a cost to what already exists (for those who haven't
> been watching, so far all the sv48 patch sets have imposed a significant
> performance penalty on all systems).



Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Benjamin Herrenschmidt
On Tue, 2020-07-21 at 14:36 -0400, Alex Ghiti wrote:
> > > I guess I don't understand why this is necessary at all.  
> > > Specifically: why
> > > can't we just relocate the kernel within the linear map?  That would 
> > > let the
> > > bootloader put the kernel wherever it wants, modulo the physical 
> > > memory size we
> > > support.  We'd need to handle the regions that are coupled to the 
> > > kernel's
> > > execution address, but we could just put them in an explicit memory 
> > > region
> > > which is what we should probably be doing anyway.
> > 
> > Virtual relocation in the linear mapping requires to move the kernel 
> > physically too. Zong implemented this physical move in its KASLR RFC 
> > patchset, which is cumbersome since finding an available physical spot 
> > is harder than just selecting a virtual range in the vmalloc range.
> > 
> > In addition, having the kernel mapping in the linear mapping prevents 
> > the use of hugepage for the linear mapping resulting in performance loss 
> > (at least for the GB that encompasses the kernel).
> > 
> > Why do you find this "ugly" ? The vmalloc region is just a bunch of 
> > available virtual addresses to whatever purpose we want, and as noted by 
> > Zong, arm64 uses the same scheme.

I don't get it :-)

At least on powerpc we move the kernel in the linear mapping and it
works fine with huge pages, what is your problem there ? You rely on
punching small-page size holes in there ?

At least in the old days, there were a number of assumptions that
the kernel text/data/bss resides in the linear mapping.

If you change that you need to ensure that it's still physically
contiguous and you'll have to tweak __va and __pa, which might induce
extra overhead.

Cheers,
Ben.
 



Re: [PATCH v4 5/7] powerpc/iommu: Move iommu_table cleaning routine to iommu_table_clean

2020-07-21 Thread Leonardo Bras
On Tue, 2020-07-21 at 14:59 +1000, Alexey Kardashevskiy wrote:
> 
> On 16/07/2020 17:16, Leonardo Bras wrote:
> > Move the part of iommu_table_free() that does struct iommu_table cleaning
> > into iommu_table_clean, so we can invoke it separately.
> > 
> > This new function is useful for cleaning struct iommu_table before
> > initializing it again with a new DMA window, without having it freed and
> > allocated again.
> > 
> > Signed-off-by: Leonardo Bras 
> > ---
> >  arch/powerpc/kernel/iommu.c | 30 ++
> >  1 file changed, 18 insertions(+), 12 deletions(-)
> > 
> > diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
> > index 9704f3f76e63..c3242253a4e7 100644
> > --- a/arch/powerpc/kernel/iommu.c
> > +++ b/arch/powerpc/kernel/iommu.c
> > @@ -735,21 +735,10 @@ struct iommu_table *iommu_init_table(struct 
> > iommu_table *tbl, int nid,
> > return tbl;
> >  }
> >  
> > -static void iommu_table_free(struct kref *kref)
> > +static void iommu_table_clean(struct iommu_table *tbl)
> 
> iommu_table_free() + iommu_init_table() + set_iommu_table_base() should
> work too, why new helper?

iommu_table_free() also frees the tbl, which would need allocate it
again (new address) and to fill it up again, unnecessarily. 
I think it's a better approach to only change what is needed.

> There is also iommu_table_clear() which does a different thing so you
> need a better name.

I agree.
I had not noticed this other function before sending the patchset. What
would be a better name though? __iommu_table_free()? 

> Second, iommu_table_free
> use and it would be ok as we would only see this when hot-unplugging a
> PE because we always kept the default window.
> Btw you must be seeing these warnings now every time you create DDW with
> these patches as at least the first page is reserved, do not you?

It does not print a warning.
I noticed other warnings, but not this one from iommu_table_free():
/* verify that table contains no entries */
if (!bitmap_empty(tbl->it_map, tbl->it_size))
pr_warn("%s: Unexpected TCEs\n", __func__);

Before that, iommu_table_release_pages(tbl) is supposed to clear the 
bitmap, so this only tests for a tce that is created in this short period.

> Since we are replacing a table for a device which is still in the
> system, we should not try messing with its DMA if it already has
> mappings so the warning should become an error preventing DDW. It is
> rather hard to trigger in practice but I could hack a driver to ask for
> 32bit DMA mask first, map few pages and then ask for 64bit DMA mask, it
> is not illegal, I think. So this needs a new helper - "bool
> iommu_table_in_use(tbl)" - to use in enable_ddw(). Or I am overthinking
> this?... Thanks,

As of today, there seems to be nothing like that happening in the
driver I am testing. 
I spoke to Brian King on slack, and he mentioned that at the point DDW
is created there should be no allocations in place.

But I suppose some driver could try to do this.

Maybe a better approach would be removing the mapping only if the
default window is removed (at the end of enable_ddw, as an else to
resetting the default DMA window), and having a way to add more
mappings to those pools. But this last part doesn't look so simple, and
it would be better to understand if it's necessary investing work in
this.

What do you think?

Best regards,




Re: [PATCH v4 6/7] powerpc/pseries/iommu: Make use of DDW even if it does not map the partition

2020-07-21 Thread Leonardo Bras
On Thu, 2020-07-16 at 04:16 -0300, Leonardo Bras wrote:
> +static void iommu_pseries_table_update(struct pci_dev *dev,
> +  struct device_node *pdn)
> +{
> +   const struct dynamic_dma_window_prop *ddw;
> +   struct pci_dn *pci;
> +   int len;
> +
> +   ddw = of_get_property(pdn, DMA64_PROPNAME, &len);
> +   if (!ddw  || len < sizeof(struct dynamic_dma_window_prop))
> +   return;
> +
> +   iommu_table_update(pci->table_group->tables[0], pci->phb->node,
> +  ddw->liobn, ddw->dma_base, ddw->tce_shift,
> +  ddw->window_shift);
> +}
> +
>  static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
>  {
> struct device_node *pdn, *dn;
> @@ -1382,6 +1403,7 @@ static bool iommu_bypass_supported_pSeriesLP(struct 
> pci_dev *pdev, u64 dma_mask)
> pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn);
> if (pdev->dev.archdata.dma_offset)
> return true;
> +   iommu_pseries_table_update(pdev, pdn);
> }
> 

Noticed a bug in this one: pci is not getting assigned. 
My bad, there must have been a merge error.

Also, I will refactor the function to make use of pdn only, as I can do
pci = PCI_DN(pdn) (I think it's better this way).

Sorry for the buggy patch.

Best regards,



Re: [PATCH v2 2/2] KVM: PPC: Book3S HV: rework secure mem slot dropping

2020-07-21 Thread Ram Pai
On Tue, Jul 21, 2020 at 12:42:02PM +0200, Laurent Dufour wrote:
> When a secure memslot is dropped, all the pages backed in the secure device
> (aka really backed by secure memory by the Ultravisor) should be paged out
> to a normal page. Previously, this was achieved by triggering the page
> fault mechanism which is calling kvmppc_svm_page_out() on each pages.
> 
> This can't work when hot unplugging a memory slot because the memory slot
> is flagged as invalid and gfn_to_pfn() is then not trying to access the
> page, so the page fault mechanism is not triggered.
> 
> Since the final goal is to make a call to kvmppc_svm_page_out() it seems
> simpler to directly calling it instead of triggering such a mechanism. This
^^ call directly instead of triggering..

> way kvmppc_uvmem_drop_pages() can be called even when hot unplugging a
> memslot.
> 
> Since kvmppc_uvmem_drop_pages() is already holding kvm->arch.uvmem_lock,
> the call to __kvmppc_svm_page_out() is made.
> As __kvmppc_svm_page_out needs the vma pointer to migrate the pages, the
> VMA is fetched in a lazy way, to not trigger find_vma() all the time. In
> addition, the mmap_sem is help in read mode during that time, not in write
  ^^ held

> mode since the virual memory layout is not impacted, and
> kvm->arch.uvmem_lock prevents concurrent operation on the secure device.
> 
> Cc: Ram Pai 

Reviewed-by: Ram Pai 

RP


Re: [PATCH v2 1/2] KVM: PPC: Book3S HV: move kvmppc_svm_page_out up

2020-07-21 Thread Ram Pai
On Tue, Jul 21, 2020 at 12:42:01PM +0200, Laurent Dufour wrote:
> kvmppc_svm_page_out() will need to be called by kvmppc_uvmem_drop_pages()
> so move it upper in this file.
> 
> Furthermore it will be interesting to call this function when already
> holding the kvm->arch.uvmem_lock, so prefix the original function with __
> and remove the locking in it, and introduce a wrapper which call that
> function with the lock held.
> 
> There is no functional change.

Reviewed-by: Ram Pai 

> 
> Cc: Ram Pai 
> Cc: Bharata B Rao 
> Cc: Paul Mackerras 
> Signed-off-by: Laurent Dufour 
> ---

RP


[powerpc:next-test] BUILD REGRESSION 08b8bb849948ff5e2305d1115ce8bbdd55364a70

2020-07-21 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
next-test
branch HEAD: 08b8bb849948ff5e2305d1115ce8bbdd55364a70  powerpc 
test_emulate_step: move extern declaration to sstep.h

Error/Warning in current branch:

arch/powerpc/include/asm/ppc-opcode.h:277:22: error: called object is not a 
function or function pointer
arch/powerpc/include/asm/ppc-opcode.h:281:22: error: called object is not a 
function or function pointer
arch/powerpc/lib/test_emulate_step.c:28:4: error: 'PPC_INST_LWZ' undeclared 
(first use in this function); did you mean 'PPC_INST_LSWI'?
arch/powerpc/lib/test_emulate_step.c:29:4: error: 'PPC_INST_LWZ' undeclared 
(first use in this function); did you mean 'PPC_INST_LFD'?

Error/Warning ids grouped by kconfigs:

recent_errors
|-- powerpc-allmodconfig
|   |-- 
arch-powerpc-include-asm-ppc-opcode.h:error:called-object-is-not-a-function-or-function-pointer
|   `-- 
arch-powerpc-lib-test_emulate_step.c:error:PPC_INST_LWZ-undeclared-(first-use-in-this-function)
`-- powerpc-allyesconfig
|-- 
arch-powerpc-include-asm-ppc-opcode.h:error:called-object-is-not-a-function-or-function-pointer
`-- 
arch-powerpc-lib-test_emulate_step.c:error:PPC_INST_LWZ-undeclared-(first-use-in-this-function)

elapsed time: 1784m

configs tested: 97
configs skipped: 2

arm64allyesconfig
arm64   defconfig
arm64allmodconfig
arm64 allnoconfig
arm defconfig
arm  allyesconfig
arm  allmodconfig
arm   allnoconfig
armzeus_defconfig
shecovec24-romimage_defconfig
arm   netwinder_defconfig
mipsnlm_xlr_defconfig
powerpccell_defconfig
s390  debug_defconfig
arm  pxa3xx_defconfig
m68km5407c3_defconfig
sh  sdk7780_defconfig
c6x dsk6455_defconfig
arm   h5000_defconfig
i386 allyesconfig
i386defconfig
i386  debian-10.3
i386  allnoconfig
ia64 allmodconfig
ia64defconfig
ia64  allnoconfig
ia64 allyesconfig
m68k allmodconfig
m68k  allnoconfig
m68k   sun3_defconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
nios2allyesconfig
openriscdefconfig
c6x  allyesconfig
c6x   allnoconfig
openrisc allyesconfig
nds32   defconfig
nds32 allnoconfig
csky allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
h8300allmodconfig
xtensa  defconfig
arc defconfig
arc  allyesconfig
sh   allmodconfig
shallnoconfig
microblazeallnoconfig
mips allyesconfig
mips  allnoconfig
mips allmodconfig
pariscallnoconfig
parisc  defconfig
parisc   allyesconfig
parisc   allmodconfig
powerpc  allyesconfig
powerpc  rhel-kconfig
powerpc  allmodconfig
powerpc   allnoconfig
powerpc defconfig
i386 randconfig-a001-20200719
i386 randconfig-a006-20200719
i386 randconfig-a002-20200719
i386 randconfig-a005-20200719
i386 randconfig-a003-20200719
i386 randconfig-a004-20200719
x86_64   randconfig-a005-20200719
x86_64   randconfig-a002-20200719
x86_64   randconfig-a006-20200719
x86_64   randconfig-a001-20200719
x86_64   randconfig-a003-20200719
x86_64   randconfig-a004-20200719
riscvallyesconfig
riscv allnoconfig
riscv   defconfig

Re: [PATCH v6] ima: move APPRAISE_BOOTPARAM dependency on ARCH_POLICY to runtime

2020-07-21 Thread Bruno Meneguele
On Tue, Jul 21, 2020 at 01:26:16PM -0400, Mimi Zohar wrote:
> On Mon, 2020-07-20 at 12:38 -0300, Bruno Meneguele wrote:
> > On Mon, Jul 20, 2020 at 10:56:55AM -0400, Mimi Zohar wrote:
> > > On Mon, 2020-07-20 at 10:40 -0400, Nayna wrote:
> > > > On 7/13/20 12:48 PM, Bruno Meneguele wrote:
> > > > > The IMA_APPRAISE_BOOTPARAM config allows enabling different 
> > > > > "ima_appraise="
> > > > > modes - log, fix, enforce - at run time, but not when IMA architecture
> > > > > specific policies are enabled.  This prevents properly labeling the
> > > > > filesystem on systems where secure boot is supported, but not enabled 
> > > > > on the
> > > > > platform.  Only when secure boot is actually enabled should these IMA
> > > > > appraise modes be disabled.
> > > > >
> > > > > This patch removes the compile time dependency and makes it a runtime
> > > > > decision, based on the secure boot state of that platform.
> > > > >
> > > > > Test results as follows:
> > > > >
> > > > > -> x86-64 with secure boot enabled
> > > > >
> > > > > [0.015637] Kernel command line: <...> ima_policy=appraise_tcb 
> > > > > ima_appraise=fix
> > > > > [0.015668] ima: Secure boot enabled: ignoring ima_appraise=fix 
> > > > > boot parameter option
> > > > >
> > > 
> > > Is it common to have two colons in the same line?  Is the colon being
> > > used as a delimiter when parsing the kernel logs?  Should the second
> > > colon be replaced with a hyphen?  (No need to repost.  I'll fix it
> > > up.)
> > >  
> > 
> > AFAICS it has been used without any limitations, e.g:
> > 
> > PM: hibernation: Registered nosave memory: [mem 0x-0x0fff]
> > clocksource: hpet: mask: 0x max_cycles: 0x, max_idle_ns: 
> > 133484873504 ns
> > microcode: CPU0: patch_level=0x08701013
> > Lockdown: modprobe: unsigned module loading is restricted; see man 
> > kernel_lockdown.7
> > ...
> > 
> > I'd say we're fine using it.
> 
> Ok.  FYI, it's now in next-integrity.
> 
> Mimi
> 

Thanks Mimi.

-- 
bmeneg 
PGP Key: http://bmeneg.com/pubkey.txt




Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Palmer Dabbelt

On Tue, 21 Jul 2020 11:36:10 PDT (-0700), a...@ghiti.fr wrote:

Let's try to make progress here: I add linux-mm in CC to get feedback on
this patch as it blocks sv48 support too.


Sorry for being slow here.  I haven't replied because I hadn't really fleshed
out the design yet, but just so everyone's on the same page my problems with
this are:

* We waste vmalloc space on 32-bit systems, where there isn't a lot of it.
* On 64-bit systems the VA space around the kernel is precious because it's the
 only place we can place text (modules, BPF, whatever).  If we start putting
 the kernel in the vmalloc space then we either have to pre-allocate a bunch
 of space around it (essentially making it a fixed mapping anyway) or it
 becomes likely that we won't be able to find space for modules as they're
 loaded into running systems.
* Relying on a relocatable kernel for sv48 support introduces a fairly large
 performance hit.

Roughly, my proposal would be to:

* Leave the 32-bit memory map alone.  On 32-bit systems we can load modules
 anywhere and we only have one VA width, so we're not really solving any
 problems with these changes.
* Statically allocate a 2GiB portion of the VA space for all our text, as its own
 region.  We'd link/relocate the kernel here instead of around PAGE_OFFSET,
 which would decouple the kernel from the physical memory layout of the system.
 This would have the side effect of sorting out a bunch of bootloader headaches
 that we currently have.
* Sort out how to maintain a linear map as the canonical hole moves around
 between the VA widths without adding a bunch of overhead to the virt2phys and
 friends.  This is probably going to be the trickiest part, but I think if we
 just change the page table code to essentially lie about VAs when an sv39
 system runs an sv48+sv39 kernel we could make it work -- there'd be some
 logical complexity involved, but it would remain fast.

This doesn't solve the problem of virtually relocatable kernels, but it does
let us decouple that from the sv48 stuff.  It also lets us stop relying on a
fixed physical address the kernel is loaded into, which is another thing I
don't like.

I know this may be a more complicated approach, but there aren't any sv48
systems around right now so I just don't see the rush to support them,
particularly when there's a cost to what already exists (for those who haven't
been watching, so far all the sv48 patch sets have imposed a significant
performance penalty on all systems).



Alex

On 7/9/20 at 7:11 AM, Alex Ghiti wrote:

Hi Palmer,

On 7/9/20 at 1:05 AM, Palmer Dabbelt wrote:

On Sun, 07 Jun 2020 00:59:46 PDT (-0700), a...@ghiti.fr wrote:

This is a preparatory patch for relocatable kernel.

The kernel used to be linked at PAGE_OFFSET address and used to be
loaded
physically at the beginning of the main memory. Therefore, we could use
the linear mapping for the kernel mapping.

But the relocated kernel base address will be different from PAGE_OFFSET
and since in the linear mapping, two different virtual addresses cannot
point to the same physical address, the kernel mapping needs to lie
outside
the linear mapping.


I know it's been a while, but I keep opening this up to review it and
just
can't get over how ugly it is to put the kernel's linear map in the
vmalloc
region.

I guess I don't understand why this is necessary at all.
Specifically: why
can't we just relocate the kernel within the linear map?  That would
let the
bootloader put the kernel wherever it wants, modulo the physical
memory size we
support.  We'd need to handle the regions that are coupled to the
kernel's
execution address, but we could just put them in an explicit memory
region
which is what we should probably be doing anyway.


Virtual relocation in the linear mapping requires to move the kernel
physically too. Zong implemented this physical move in its KASLR RFC
patchset, which is cumbersome since finding an available physical spot
is harder than just selecting a virtual range in the vmalloc range.

In addition, having the kernel mapping in the linear mapping prevents
the use of hugepage for the linear mapping resulting in performance loss
(at least for the GB that encompasses the kernel).

Why do you find this "ugly" ? The vmalloc region is just a bunch of
available virtual addresses to whatever purpose we want, and as noted by
Zong, arm64 uses the same scheme.




In addition, because modules and BPF must be close to the kernel (inside
+-2GB window), the kernel is placed at the end of the vmalloc zone minus
2GB, which leaves room for modules and BPF. The kernel could not be
placed at the beginning of the vmalloc zone since other vmalloc
allocations from the kernel could get all the +-2GB window around the
kernel which would prevent new modules and BPF programs to be loaded.


Well, that's not enough to make sure this doesn't happen -- it's just
enough to
make sure it doesn't happen very quickily.  That's the same boat we're
already
in, though, 

Re: [PATCH v5 1/4] riscv: Move kernel mapping to vmalloc zone

2020-07-21 Thread Alex Ghiti
Let's try to make progress here: I add linux-mm in CC to get feedback on 
this patch as it blocks sv48 support too.


Alex

On 7/9/20 at 7:11 AM, Alex Ghiti wrote:

Hi Palmer,

On 7/9/20 at 1:05 AM, Palmer Dabbelt wrote:

On Sun, 07 Jun 2020 00:59:46 PDT (-0700), a...@ghiti.fr wrote:

This is a preparatory patch for relocatable kernel.

The kernel used to be linked at PAGE_OFFSET address and used to be 
loaded

physically at the beginning of the main memory. Therefore, we could use
the linear mapping for the kernel mapping.

But the relocated kernel base address will be different from PAGE_OFFSET
and since in the linear mapping, two different virtual addresses cannot
point to the same physical address, the kernel mapping needs to lie 
outside

the linear mapping.


I know it's been a while, but I keep opening this up to review it and 
just
can't get over how ugly it is to put the kernel's linear map in the 
vmalloc

region.

I guess I don't understand why this is necessary at all.  
Specifically: why
can't we just relocate the kernel within the linear map?  That would 
let the
bootloader put the kernel wherever it wants, modulo the physical 
memory size we
support.  We'd need to handle the regions that are coupled to the 
kernel's
execution address, but we could just put them in an explicit memory 
region

which is what we should probably be doing anyway.


Virtual relocation in the linear mapping requires to move the kernel 
physically too. Zong implemented this physical move in its KASLR RFC 
patchset, which is cumbersome since finding an available physical spot 
is harder than just selecting a virtual range in the vmalloc range.


In addition, having the kernel mapping in the linear mapping prevents 
the use of hugepage for the linear mapping resulting in performance loss 
(at least for the GB that encompasses the kernel).


Why do you find this "ugly" ? The vmalloc region is just a bunch of 
available virtual addresses to whatever purpose we want, and as noted by 
Zong, arm64 uses the same scheme.





In addition, because modules and BPF must be close to the kernel (inside
+-2GB window), the kernel is placed at the end of the vmalloc zone minus
2GB, which leaves room for modules and BPF. The kernel could not be
placed at the beginning of the vmalloc zone since other vmalloc
allocations from the kernel could get all the +-2GB window around the
kernel which would prevent new modules and BPF programs to be loaded.


Well, that's not enough to make sure this doesn't happen -- it's just 
enough to
make sure it doesn't happen very quickily.  That's the same boat we're 
already

in, though, so it's not like it's worse.


Indeed, that's not worse, I haven't found a way to reserve vmalloc area 
without actually allocating it.





Signed-off-by: Alexandre Ghiti 
Reviewed-by: Zong Li 
---
 arch/riscv/boot/loader.lds.S |  3 +-
 arch/riscv/include/asm/page.h    | 10 +-
 arch/riscv/include/asm/pgtable.h | 38 ++---
 arch/riscv/kernel/head.S |  3 +-
 arch/riscv/kernel/module.c   |  4 +--
 arch/riscv/kernel/vmlinux.lds.S  |  3 +-
 arch/riscv/mm/init.c | 58 +---
 arch/riscv/mm/physaddr.c |  2 +-
 8 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
index 47a5003c2e28..62d94696a19c 100644
--- a/arch/riscv/boot/loader.lds.S
+++ b/arch/riscv/boot/loader.lds.S
@@ -1,13 +1,14 @@
 /* SPDX-License-Identifier: GPL-2.0 */

 #include 
+#include 

 OUTPUT_ARCH(riscv)
 ENTRY(_start)

 SECTIONS
 {
-    . = PAGE_OFFSET;
+    . = KERNEL_LINK_ADDR;

 .payload : {
 *(.payload)
diff --git a/arch/riscv/include/asm/page.h 
b/arch/riscv/include/asm/page.h

index 2d50f76efe48..48bb09b6a9b7 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -90,18 +90,26 @@ typedef struct page *pgtable_t;

 #ifdef CONFIG_MMU
 extern unsigned long va_pa_offset;
+extern unsigned long va_kernel_pa_offset;
 extern unsigned long pfn_base;
 #define ARCH_PFN_OFFSET    (pfn_base)
 #else
 #define va_pa_offset    0
+#define va_kernel_pa_offset    0
 #define ARCH_PFN_OFFSET    (PAGE_OFFSET >> PAGE_SHIFT)
 #endif /* CONFIG_MMU */

 extern unsigned long max_low_pfn;
 extern unsigned long min_low_pfn;
+extern unsigned long kernel_virt_addr;

 #define __pa_to_va_nodebug(x)    ((void *)((unsigned long) (x) + 
va_pa_offset))

-#define __va_to_pa_nodebug(x)    ((unsigned long)(x) - va_pa_offset)
+#define linear_mapping_va_to_pa(x)    ((unsigned long)(x) - 
va_pa_offset)

+#define kernel_mapping_va_to_pa(x)    \
+    ((unsigned long)(x) - va_kernel_pa_offset)
+#define __va_to_pa_nodebug(x)    \
+    (((x) >= PAGE_OFFSET) ?    \
+    linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))

 #ifdef CONFIG_DEBUG_VIRTUAL
 extern phys_addr_t __virt_to_phys(unsigned long x);
diff --git a/arch/riscv/include/asm/pgtable.h 
b/arch/riscv/include/asm/pgtable.h


[powerpc:next] BUILD SUCCESS 9a77c4a0a12597c661be374b8d566516c0341570

2020-07-21 Thread kernel test robot
tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
next
branch HEAD: 9a77c4a0a12597c661be374b8d566516c0341570  powerpc/prom: Enable 
Radix GTSE in cpu pa-features

elapsed time: 1700m

configs tested: 117
configs skipped: 4

The following configs have been built successfully.
More configs may be tested in the coming days.

arm defconfig
arm  allyesconfig
arm  allmodconfig
arm   allnoconfig
arm64allyesconfig
arm64   defconfig
arm64allmodconfig
arm64 allnoconfig
c6x defconfig
c6xevmc6474_defconfig
sh   alldefconfig
mips bigsur_defconfig
arm   sama5_defconfig
arm   omap1_defconfig
mipse55_defconfig
arm shannon_defconfig
sparc64   allnoconfig
arm  footbridge_defconfig
s390 alldefconfig
armzeus_defconfig
shecovec24-romimage_defconfig
arm   netwinder_defconfig
mipsnlm_xlr_defconfig
powerpccell_defconfig
s390  debug_defconfig
arm  pxa3xx_defconfig
m68km5407c3_defconfig
sh  sdk7780_defconfig
c6x dsk6455_defconfig
arm   h5000_defconfig
arm s3c6400_defconfig
mips   rbtx49xx_defconfig
armu300_defconfig
ia64 alldefconfig
i386  allnoconfig
i386 allyesconfig
i386defconfig
i386  debian-10.3
ia64 allmodconfig
ia64defconfig
ia64  allnoconfig
ia64 allyesconfig
m68k allmodconfig
m68k  allnoconfig
m68k   sun3_defconfig
m68kdefconfig
m68k allyesconfig
nios2   defconfig
nios2allyesconfig
openriscdefconfig
c6x  allyesconfig
c6x   allnoconfig
openrisc allyesconfig
nds32   defconfig
nds32 allnoconfig
csky allyesconfig
cskydefconfig
alpha   defconfig
alphaallyesconfig
xtensa   allyesconfig
h8300allyesconfig
h8300allmodconfig
xtensa  defconfig
arc defconfig
arc  allyesconfig
sh   allmodconfig
shallnoconfig
microblazeallnoconfig
mips allyesconfig
mips  allnoconfig
mips allmodconfig
pariscallnoconfig
parisc  defconfig
parisc   allyesconfig
parisc   allmodconfig
powerpc  allyesconfig
powerpc  rhel-kconfig
powerpc  allmodconfig
powerpc   allnoconfig
powerpc defconfig
i386 randconfig-a001-20200719
i386 randconfig-a006-20200719
i386 randconfig-a002-20200719
i386 randconfig-a005-20200719
i386 randconfig-a003-20200719
i386 randconfig-a004-20200719
i386 randconfig-a015-20200719
i386 randconfig-a011-20200719
i386 randconfig-a016-20200719
i386 randconfig-a012-20200719
i386 randconfig-a013-20200719
i386 randconfig-a014-20200719
x86_64   randconfig-a005-20200719
x86_64   randconfig-a002-20200719
x86_64   randconfig-a006-20200719
x86_64   randconfig-a001-20200719
x86_64   randconfig-a003-20200719
x86_64   randconfig-a004-20200719
riscvallyesconfig
riscv allnoconfig
riscv   

Re: [PATCH v6] ima: move APPRAISE_BOOTPARAM dependency on ARCH_POLICY to runtime

2020-07-21 Thread Mimi Zohar
On Mon, 2020-07-20 at 12:38 -0300, Bruno Meneguele wrote:
> On Mon, Jul 20, 2020 at 10:56:55AM -0400, Mimi Zohar wrote:
> > On Mon, 2020-07-20 at 10:40 -0400, Nayna wrote:
> > > On 7/13/20 12:48 PM, Bruno Meneguele wrote:
> > > > The IMA_APPRAISE_BOOTPARAM config allows enabling different 
> > > > "ima_appraise="
> > > > modes - log, fix, enforce - at run time, but not when IMA architecture
> > > > specific policies are enabled.  This prevents properly labeling the
> > > > filesystem on systems where secure boot is supported, but not enabled 
> > > > on the
> > > > platform.  Only when secure boot is actually enabled should these IMA
> > > > appraise modes be disabled.
> > > >
> > > > This patch removes the compile time dependency and makes it a runtime
> > > > decision, based on the secure boot state of that platform.
> > > >
> > > > Test results as follows:
> > > >
> > > > -> x86-64 with secure boot enabled
> > > >
> > > > [0.015637] Kernel command line: <...> ima_policy=appraise_tcb 
> > > > ima_appraise=fix
> > > > [0.015668] ima: Secure boot enabled: ignoring ima_appraise=fix boot 
> > > > parameter option
> > > >
> > 
> > Is it common to have two colons in the same line?  Is the colon being
> > used as a delimiter when parsing the kernel logs?  Should the second
> > colon be replaced with a hyphen?  (No need to repost.  I'll fix it
> > up.)
> >  
> 
> AFAICS it has been used without any limitations, e.g:
> 
> PM: hibernation: Registered nosave memory: [mem 0x-0x0fff]
> clocksource: hpet: mask: 0x max_cycles: 0x, max_idle_ns: 
> 133484873504 ns
> microcode: CPU0: patch_level=0x08701013
> Lockdown: modprobe: unsigned module loading is restricted; see man 
> kernel_lockdown.7
> ...
> 
> I'd say we're fine using it.

Ok.  FYI, it's now in next-integrity.

Mimi


[PATCH v4 2/3] powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable

2020-07-21 Thread Pratik Rajesh Sampat
Rename the variable "pnv_first_spr_loss_level" to
"deep_spr_loss_state".

pnv_first_spr_loss_level is supposed to be the earliest state that
has OPAL_PM_LOSE_FULL_CONTEXT set, and elsewhere the kernel refers to
such states as "deep" states. Rename the variable so it is coherent
with its semantics.

Signed-off-by: Pratik Rajesh Sampat 
---
 arch/powerpc/platforms/powernv/idle.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 642abf0b8329..28462d59a8e1 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -48,7 +48,7 @@ static bool default_stop_found;
  * First stop state levels when SPR and TB loss can occur.
  */
 static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
-static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
+static u64 deep_spr_loss_state = MAX_STOP_STATE + 1;
 
 /*
  * psscr value and mask of the deepest stop idle state.
@@ -657,7 +657,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
  */
mmcr0   = mfspr(SPRN_MMCR0);
}
-   if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
+   if ((psscr & PSSCR_RL_MASK) >= deep_spr_loss_state) {
sprs.lpcr   = mfspr(SPRN_LPCR);
sprs.hfscr  = mfspr(SPRN_HFSCR);
sprs.fscr   = mfspr(SPRN_FSCR);
@@ -741,7 +741,7 @@ static unsigned long power9_idle_stop(unsigned long psscr, 
bool mmu_on)
 * just always test PSSCR for SPR/TB state loss.
 */
pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
-   if (likely(pls < pnv_first_spr_loss_level)) {
+   if (likely(pls < deep_spr_loss_state)) {
if (sprs_saved)
atomic_stop_thread_idle();
goto out;
@@ -1088,7 +1088,7 @@ static void __init pnv_power9_idle_init(void)
 * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
 */
pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
-   pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
+   deep_spr_loss_state = MAX_STOP_STATE + 1;
for (i = 0; i < nr_pnv_idle_states; i++) {
int err;
		struct pnv_idle_states_t *state = &pnv_idle_states[i];
@@ -1099,8 +1099,8 @@ static void __init pnv_power9_idle_init(void)
pnv_first_tb_loss_level = psscr_rl;
 
if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
-(pnv_first_spr_loss_level > psscr_rl))
-   pnv_first_spr_loss_level = psscr_rl;
+(deep_spr_loss_state > psscr_rl))
+   deep_spr_loss_state = psscr_rl;
 
/*
 * The idle code does not deal with TB loss occurring
@@ -,8 +,8 @@ static void __init pnv_power9_idle_init(void)
 * compatibility.
 */
if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
-(pnv_first_spr_loss_level > psscr_rl))
-   pnv_first_spr_loss_level = psscr_rl;
+(deep_spr_loss_state > psscr_rl))
+   deep_spr_loss_state = psscr_rl;
 
		err = validate_psscr_val_mask(&state->psscr_val,
					  &state->psscr_mask,
@@ -1158,7 +1158,7 @@ static void __init pnv_power9_idle_init(void)
}
 
pr_info("cpuidle-powernv: First stop level that may lose SPRs = 
0x%llx\n",
-   pnv_first_spr_loss_level);
+   deep_spr_loss_state);
 
pr_info("cpuidle-powernv: First stop level that may lose timebase = 
0x%llx\n",
pnv_first_tb_loss_level);
-- 
2.25.4



[PATCH v4 3/3] powerpc/powernv/idle: Exclude mfspr on HID1, 4, 5 on P9 and above

2020-07-21 Thread Pratik Rajesh Sampat
From POWER9 onwards, support for the HID1, HID4 and HID5 registers has
been removed. Although mfspr on these registers still worked on POWER9,
the POWER10 simulator does not recognize them. Move the reads of these
registers under the existing check for machines older than POWER9.

Signed-off-by: Pratik Rajesh Sampat 
Reviewed-by: Gautham R. Shenoy 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/platforms/powernv/idle.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 28462d59a8e1..92098d6106be 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -73,9 +73,6 @@ static int pnv_save_sprs_for_deep_states(void)
 */
uint64_t lpcr_val   = mfspr(SPRN_LPCR);
uint64_t hid0_val   = mfspr(SPRN_HID0);
-   uint64_t hid1_val   = mfspr(SPRN_HID1);
-   uint64_t hid4_val   = mfspr(SPRN_HID4);
-   uint64_t hid5_val   = mfspr(SPRN_HID5);
uint64_t hmeer_val  = mfspr(SPRN_HMEER);
uint64_t msr_val = MSR_IDLE;
uint64_t psscr_val = pnv_deepest_stop_psscr_val;
@@ -117,6 +114,9 @@ static int pnv_save_sprs_for_deep_states(void)
 
/* Only p8 needs to set extra HID regiters */
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+   uint64_t hid1_val = mfspr(SPRN_HID1);
+   uint64_t hid4_val = mfspr(SPRN_HID4);
+   uint64_t hid5_val = mfspr(SPRN_HID5);
 
rc = opal_slw_set_reg(pir, SPRN_HID1, hid1_val);
if (rc != 0)
-- 
2.25.4



[PATCH v4 1/3] powerpc/powernv/idle: Replace CPU features check with PVR check

2020-07-21 Thread Pratik Rajesh Sampat
As the idle framework's architecture is incomplete, instead of checking
only for the processor type advertised in the device tree CPU features,
check the Processor Version Register (PVR) so that processor checks can
be made with finer granularity.

Make the PVR check in the outer-level function, and keep the
CPU_FTR_ARCH_300 check further down the hierarchy intact, as it is a
faster check to do because of static branches.

Signed-off-by: Pratik Rajesh Sampat 
---
 arch/powerpc/platforms/powernv/idle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index 2dd467383a88..642abf0b8329 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -1205,7 +1205,7 @@ static void __init pnv_probe_idle_states(void)
return;
}
 
-   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   if (pvr_version_is(PVR_POWER9))
pnv_power9_idle_init();
 
for (i = 0; i < nr_pnv_idle_states; i++)
-- 
2.25.4



[PATCH v4 0/3] powernv/idle: Power9 idle cleanup

2020-07-21 Thread Pratik Rajesh Sampat
v3: https://lkml.org/lkml/2020/7/17/1093
Changelog v3-->v4:
Based on comments from Nicholas Piggin and Gautham Shenoy,
1. Changed the new name for pnv_first_spr_loss_level from
pnv_first_fullstate_loss_level (v3) to deep_spr_loss_state
2. Make the P9 PVR check only on the top level function
pnv_probe_idle_states and let the rest of the checks be DT based because
it is faster to do so

Pratik Rajesh Sampat (3):
  powerpc/powernv/idle: Replace CPU features check with PVR check
  powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable
  powerpc/powernv/idle: Exclude mfspr on HID1,4,5 on P9 and above

 arch/powerpc/platforms/powernv/idle.c | 26 +-
 1 file changed, 13 insertions(+), 13 deletions(-)

-- 
2.25.4



Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-21 Thread Mathieu Desnoyers
- On Jul 21, 2020, at 11:19 AM, Peter Zijlstra pet...@infradead.org wrote:

> On Tue, Jul 21, 2020 at 11:15:13AM -0400, Mathieu Desnoyers wrote:
>> - On Jul 21, 2020, at 11:06 AM, Peter Zijlstra pet...@infradead.org 
>> wrote:
>> 
>> > On Tue, Jul 21, 2020 at 08:04:27PM +1000, Nicholas Piggin wrote:
>> > 
>> >> That being said, the x86 sync core gap that I imagined could be fixed
>> >> by changing to rq->curr == rq->idle test does not actually exist because
>> >> the global membarrier does not have a sync core option. So fixing the
>> >> exit_lazy_tlb points that this series does *should* fix that. So
>> >> PF_KTHREAD may be less problematic than I thought from implementation
>> >> point of view, only semantics.
>> > 
>> > So I've been trying to figure out where that PF_KTHREAD comes from,
>> > commit 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy
>> > load") changed 'p->mm' to '!(p->flags & PF_KTHREAD)'.
>> > 
>> > So the first version:
>> > 
>> >  
>> > https://lkml.kernel.org/r/20190906031300.1647-5-mathieu.desnoy...@efficios.com
>> > 
>> > appears to unconditionally send the IPI and checks p->mm in the IPI
>> > context, but then v2:
>> > 
>> >  
>> > https://lkml.kernel.org/r/20190908134909.12389-1-mathieu.desnoy...@efficios.com
>> > 
>> > has the current code. But I've been unable to find the reason the
>> > 'p->mm' test changed into '!(p->flags & PF_KTHREAD)'.
>> 
>> Looking back at my inbox, it seems like you are the one who proposed to
>> skip all kthreads:
>> 
>> https://lkml.kernel.org/r/20190904124333.gq2...@hirez.programming.kicks-ass.net
> 
> I had a feeling it might've been me ;-) I just couldn't find the email.
> 
>> > The comment doesn't really help either; sure we have the whole lazy mm
>> > thing, but that's ->active_mm, not ->mm.
>> > 
>> > Possibly it is because {,un}use_mm() do not have sufficient barriers to
>> > make the remote p->mm test work? Or were we over-eager with the !p->mm
>> > doesn't imply kthread 'cleanups' at the time?
>> 
>> The nice thing about adding back kthreads to the threads considered for
>> membarrier
>> IPI is that it has no observable effect on the user-space ABI. No 
>> pre-existing
>> kthread
>> rely on this, and we just provide an additional guarantee for future kthread
>> implementations.
>> 
>> > Also, I just realized, I still have a fix for use_mm() now
>> > kthread_use_mm() that seems to have been lost.
>> 
>> I suspect we need to at least document the memory barriers in kthread_use_mm 
>> and
>> kthread_unuse_mm to state that they are required by membarrier if we want to
>> ipi kthreads as well.
> 
> Right, so going by that email you found it was mostly a case of being
> lazy, but yes, if we audit the kthread_{,un}use_mm() barriers and add
> any other bits that might be needed, covering kthreads should be
> possible.
> 
> No objections from me for making it so.

I'm OK on making membarrier cover kthreads using mm as well, provided we
audit kthread_{,un}use_mm() to make sure the proper barriers are in place
after setting task->mm and before clearing it.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-21 Thread Peter Zijlstra
On Tue, Jul 21, 2020 at 11:15:13AM -0400, Mathieu Desnoyers wrote:
> - On Jul 21, 2020, at 11:06 AM, Peter Zijlstra pet...@infradead.org wrote:
> 
> > On Tue, Jul 21, 2020 at 08:04:27PM +1000, Nicholas Piggin wrote:
> > 
> >> That being said, the x86 sync core gap that I imagined could be fixed
> >> by changing to rq->curr == rq->idle test does not actually exist because
> >> the global membarrier does not have a sync core option. So fixing the
> >> exit_lazy_tlb points that this series does *should* fix that. So
> >> PF_KTHREAD may be less problematic than I thought from implementation
> >> point of view, only semantics.
> > 
> > So I've been trying to figure out where that PF_KTHREAD comes from,
> > commit 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy
> > load") changed 'p->mm' to '!(p->flags & PF_KTHREAD)'.
> > 
> > So the first version:
> > 
> >  
> > https://lkml.kernel.org/r/20190906031300.1647-5-mathieu.desnoy...@efficios.com
> > 
> > appears to unconditionally send the IPI and checks p->mm in the IPI
> > context, but then v2:
> > 
> >  
> > https://lkml.kernel.org/r/20190908134909.12389-1-mathieu.desnoy...@efficios.com
> > 
> > has the current code. But I've been unable to find the reason the
> > 'p->mm' test changed into '!(p->flags & PF_KTHREAD)'.
> 
> Looking back at my inbox, it seems like you are the one who proposed to
> skip all kthreads: 
> 
> https://lkml.kernel.org/r/20190904124333.gq2...@hirez.programming.kicks-ass.net

I had a feeling it might've been me ;-) I just couldn't find the email.

> > The comment doesn't really help either; sure we have the whole lazy mm
> > thing, but that's ->active_mm, not ->mm.
> > 
> > Possibly it is because {,un}use_mm() do not have sufficient barriers to
> > make the remote p->mm test work? Or were we over-eager with the !p->mm
> > doesn't imply kthread 'cleanups' at the time?
> 
> The nice thing about adding back kthreads to the threads considered for 
> membarrier
> IPI is that it has no observable effect on the user-space ABI. No 
> pre-existing kthread
> rely on this, and we just provide an additional guarantee for future kthread
> implementations.
> 
> > Also, I just realized, I still have a fix for use_mm() now
> > kthread_use_mm() that seems to have been lost.
> 
> I suspect we need to at least document the memory barriers in kthread_use_mm 
> and
> kthread_unuse_mm to state that they are required by membarrier if we want to
> ipi kthreads as well.

Right, so going by that email you found it was mostly a case of being
lazy, but yes, if we audit the kthread_{,un}use_mm() barriers and add
any other bits that might be needed, covering kthreads should be
possible.

No objections from me for making it so.


Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-21 Thread Mathieu Desnoyers
- On Jul 21, 2020, at 11:06 AM, Peter Zijlstra pet...@infradead.org wrote:

> On Tue, Jul 21, 2020 at 08:04:27PM +1000, Nicholas Piggin wrote:
> 
>> That being said, the x86 sync core gap that I imagined could be fixed
>> by changing to rq->curr == rq->idle test does not actually exist because
>> the global membarrier does not have a sync core option. So fixing the
>> exit_lazy_tlb points that this series does *should* fix that. So
>> PF_KTHREAD may be less problematic than I thought from implementation
>> point of view, only semantics.
> 
> So I've been trying to figure out where that PF_KTHREAD comes from,
> commit 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy
> load") changed 'p->mm' to '!(p->flags & PF_KTHREAD)'.
> 
> So the first version:
> 
>  
> https://lkml.kernel.org/r/20190906031300.1647-5-mathieu.desnoy...@efficios.com
> 
> appears to unconditionally send the IPI and checks p->mm in the IPI
> context, but then v2:
> 
>  
> https://lkml.kernel.org/r/20190908134909.12389-1-mathieu.desnoy...@efficios.com
> 
> has the current code. But I've been unable to find the reason the
> 'p->mm' test changed into '!(p->flags & PF_KTHREAD)'.

Looking back at my inbox, it seems like you are the one who proposed to
skip all kthreads: 

https://lkml.kernel.org/r/20190904124333.gq2...@hirez.programming.kicks-ass.net

> 
> The comment doesn't really help either; sure we have the whole lazy mm
> thing, but that's ->active_mm, not ->mm.
> 
> Possibly it is because {,un}use_mm() do not have sufficient barriers to
> make the remote p->mm test work? Or were we over-eager with the !p->mm
> doesn't imply kthread 'cleanups' at the time?

The nice thing about adding back kthreads to the threads considered for 
membarrier
IPI is that it has no observable effect on the user-space ABI. No pre-existing 
kthread
rely on this, and we just provide an additional guarantee for future kthread
implementations.

> Also, I just realized, I still have a fix for use_mm() now
> kthread_use_mm() that seems to have been lost.

I suspect we need to at least document the memory barriers in kthread_use_mm and
kthread_unuse_mm to state that they are required by membarrier if we want to
ipi kthreads as well.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


[PATCH -next] PCI: rpadlpar: Make some functions static

2020-07-21 Thread Wei Yongjun
The sparse tool reports build warnings as follows:

drivers/pci/hotplug/rpadlpar_core.c:355:5: warning:
 symbol 'dlpar_remove_pci_slot' was not declared. Should it be static?
drivers/pci/hotplug/rpadlpar_core.c:461:12: warning:
 symbol 'rpadlpar_io_init' was not declared. Should it be static?
drivers/pci/hotplug/rpadlpar_core.c:473:6: warning:
 symbol 'rpadlpar_io_exit' was not declared. Should it be static?

Those functions are not used outside of this file, so mark them
static.
Also mark rpadlpar_io_exit() as __exit.

Reported-by: Hulk Robot 
Signed-off-by: Wei Yongjun 
---
 drivers/pci/hotplug/rpadlpar_core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/hotplug/rpadlpar_core.c 
b/drivers/pci/hotplug/rpadlpar_core.c
index c5eb509c72f0..f979b7098acf 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -352,7 +352,7 @@ static int dlpar_remove_vio_slot(char *drc_name, struct 
device_node *dn)
  * -ENODEV Not a valid drc_name
  * -EIOInternal PCI Error
  */
-int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn)
+static int dlpar_remove_pci_slot(char *drc_name, struct device_node *dn)
 {
struct pci_bus *bus;
struct slot *slot;
@@ -458,7 +458,7 @@ static inline int is_dlpar_capable(void)
return (int) (rc != RTAS_UNKNOWN_SERVICE);
 }
 
-int __init rpadlpar_io_init(void)
+static int __init rpadlpar_io_init(void)
 {
 
if (!is_dlpar_capable()) {
@@ -470,7 +470,7 @@ int __init rpadlpar_io_init(void)
return dlpar_sysfs_init();
 }
 
-void rpadlpar_io_exit(void)
+static void __exit rpadlpar_io_exit(void)
 {
dlpar_sysfs_exit();
 }



Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-21 Thread peterz
On Tue, Jul 21, 2020 at 08:04:27PM +1000, Nicholas Piggin wrote:

> That being said, the x86 sync core gap that I imagined could be fixed 
> by changing to rq->curr == rq->idle test does not actually exist because
> the global membarrier does not have a sync core option. So fixing the
> exit_lazy_tlb points that this series does *should* fix that. So
> PF_KTHREAD may be less problematic than I thought from implementation
> point of view, only semantics.

So I've been trying to figure out where that PF_KTHREAD comes from,
commit 227a4aadc75b ("sched/membarrier: Fix p->mm->membarrier_state racy
load") changed 'p->mm' to '!(p->flags & PF_KTHREAD)'.

So the first version:

  https://lkml.kernel.org/r/20190906031300.1647-5-mathieu.desnoy...@efficios.com

appears to unconditionally send the IPI and checks p->mm in the IPI
context, but then v2:

  
https://lkml.kernel.org/r/20190908134909.12389-1-mathieu.desnoy...@efficios.com

has the current code. But I've been unable to find the reason the
'p->mm' test changed into '!(p->flags & PF_KTHREAD)'.

The comment doesn't really help either; sure we have the whole lazy mm
thing, but that's ->active_mm, not ->mm.

Possibly it is because {,un}use_mm() do not have sufficient barriers to
make the remote p->mm test work? Or were we over-eager with the !p->mm
doesn't imply kthread 'cleanups' at the time?

Also, I just realized, I still have a fix for use_mm() now
kthread_use_mm() that seems to have been lost.




Re: [RFC PATCH] powerpc/pseries/svm: capture instruction faulting on MMIO access, in sprg0 register

2020-07-21 Thread Nicholas Piggin
Excerpts from Ram Pai's message of July 16, 2020 6:32 pm:
> An instruction accessing a mmio address, generates a HDSI fault.  This fault 
> is
> appropriately handled by the Hypervisor.  However in the case of secureVMs, 
> the
> fault is delivered to the ultravisor.

Why not a ucall if you're paraultravizing it anyway?

> 
> Unfortunately the Ultravisor has no correct-way to fetch the faulting
> instruction. The PEF architecture does not allow Ultravisor to enable MMU
> translation. Walking the two level page table to read the instruction can race
> with other vcpus modifying the SVM's process scoped page table.
> 
> This problem can be correctly solved with some help from the kernel.
> 
> Capture the faulting instruction in SPRG0 register, before executing the
> faulting instruction. This enables the ultravisor to easily procure the
> faulting instruction and emulate it.
> 
> Signed-off-by: Ram Pai 
> ---
>  arch/powerpc/include/asm/io.h | 85 
> ++-
>  1 file changed, 75 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
> index 635969b..7ef663d 100644
> --- a/arch/powerpc/include/asm/io.h
> +++ b/arch/powerpc/include/asm/io.h
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #define SIO_CONFIG_RA0x398
>  #define SIO_CONFIG_RD0x399
> @@ -105,34 +106,98 @@
>  static inline u##size name(const volatile u##size __iomem *addr) \
>  {\
>   u##size ret;\
> - __asm__ __volatile__("sync;"#insn" %0,%y1;twi 0,%0,0;isync" \
> - : "=r" (ret) : "Z" (*addr) : "memory"); \
> + if (is_secure_guest()) {\
> + __asm__ __volatile__("mfsprg0 %3;"  \
> + "lnia %2;"  \
> + "ld %2,12(%2);" \
> + "mtsprg0 %2;"   \
> + "sync;" \
> + #insn" %0,%y1;" \
> + "twi 0,%0,0;"   \
> + "isync;"\
> + "mtsprg0 %3"\

We prefer to use mtspr in new code, and the nia offset should be 
calculated with a label; I think "(1f - .)(%2)" should work.

SPRG usage is documented in arch/powerpc/include/asm/reg.h if this 
goes past RFC stage. Looks like SPRG0 probably could be used for this.

Thanks,
Nick


Re: [PATCH v3 0/2] Selftest for cpuidle latency measurement

2020-07-21 Thread Daniel Lezcano
On 21/07/2020 14:42, Pratik Rajesh Sampat wrote:
> v2: https://lkml.org/lkml/2020/7/17/369
> Changelog v2-->v3
> Based on comments from Gautham R. Shenoy adding the following in the
> selftest,
> 1. Grepping modules to determine if already loaded
> 2. Wrapper to enable/disable states
> 3. Preventing any operation/test on offlined CPUs 
> ---
> 
> The patch series introduces a mechanism to measure wakeup latency for
> IPI and timer based interrupts
> The motivation behind this series is to find significant deviations
> from the advertised latency and residency values

Why do you want to measure for the timer and the IPI ? Whatever the
source of the wakeup, the exit latency remains the same, no ?

Is all this kernel-ish code really needed ?

What about using a highres periodic timer and making it expire every eg.
50ms x 2400, so it runs for 120 seconds, and measuring the deviation?
Repeat the operation for each idle state.

And in order to make it as accurate as possible, set the program's
affinity to one CPU and isolate that CPU by preventing other processes
from being scheduled on it and by migrating the interrupts to the other CPUs.

That will be all userspace code, no?
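
For what it's worth, a minimal userspace sketch of that approach (the
50ms period, the 2400 iterations and the pinned CPU are illustrative
assumptions; isolating the CPU and migrating IRQs away is left to the
environment) could be:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static int64_t ts_diff_ns(struct timespec a, struct timespec b)
{
	return (a.tv_sec - b.tv_sec) * 1000000000LL + (a.tv_nsec - b.tv_nsec);
}

int main(void)
{
	const long period_ns = 50 * 1000 * 1000;	/* 50 ms */
	struct timespec next, now;
	int64_t worst = 0;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);			/* pin to CPU0 (assumed isolated) */
	sched_setaffinity(0, sizeof(set), &set);

	clock_gettime(CLOCK_MONOTONIC, &next);
	for (int i = 0; i < 2400; i++) {
		next.tv_nsec += period_ns;
		while (next.tv_nsec >= 1000000000L) {
			next.tv_nsec -= 1000000000L;
			next.tv_sec++;
		}
		clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME, &next, NULL);
		clock_gettime(CLOCK_MONOTONIC, &now);
		int64_t late = ts_diff_ns(now, next);	/* includes idle exit latency */
		if (late > worst)
			worst = late;
	}
	printf("worst wakeup deviation: %lld ns\n", (long long)worst);
	return 0;
}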





-- 
 Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog


Re: [PATCH v3 2/3] powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable

2020-07-21 Thread Gautham R Shenoy
Hi,

On Wed, Jul 22, 2020 at 12:37:41AM +1000, Nicholas Piggin wrote:
> Excerpts from Pratik Sampat's message of July 21, 2020 8:29 pm:
> > 
> > 
> > On 20/07/20 5:27 am, Nicholas Piggin wrote:
> >> Excerpts from Pratik Rajesh Sampat's message of July 18, 2020 4:53 am:
> >>> Replace the variable name from using "pnv_first_spr_loss_level" to
> >>> "pnv_first_fullstate_loss_level".
> >>>
> >>> As pnv_first_spr_loss_level is supposed to be the earliest state that
> >>> has OPAL_PM_LOSE_FULL_CONTEXT set, however as shallow states too loose
> >>> SPR values, render an incorrect terminology.
> >> It also doesn't lose "full" state at this loss level though. From the
> >> architecture it could be called "hv state loss level", but in POWER10
> >> even that is not strictly true.
> >>
> > Right. Just discovered that deep stop states won't loose full state
> > P10 onwards.
> > Would it better if we rename it as "pnv_all_spr_loss_state" instead
> > so that it stays generic enough while being semantically coherent?
> 
> It doesn't lose all SPRs. It does physically, but for Linux it appears 
> at least timebase SPRs are retained and that's mostly how it's 
> documented.
> 
> Maybe there's no really good name for it, but we do call it "deep" stop 
> in other places, you could call it deep_spr_loss maybe. I don't mind too 
> much though, whatever Gautham is happy with.

Nick's suggestion is fine by me. We can call it deep_spr_loss_state.

> 
> Thanks,
> Nick

--
Thanks and Regards
gautham.


Re: [PATCH v3 2/3] powerpc/powernv/idle: Rename pnv_first_spr_loss_level variable

2020-07-21 Thread Nicholas Piggin
Excerpts from Pratik Sampat's message of July 21, 2020 8:29 pm:
> 
> 
> On 20/07/20 5:27 am, Nicholas Piggin wrote:
>> Excerpts from Pratik Rajesh Sampat's message of July 18, 2020 4:53 am:
>>> Replace the variable name from using "pnv_first_spr_loss_level" to
>>> "pnv_first_fullstate_loss_level".
>>>
>>> As pnv_first_spr_loss_level is supposed to be the earliest state that
>>> has OPAL_PM_LOSE_FULL_CONTEXT set, however as shallow states too loose
>>> SPR values, render an incorrect terminology.
>> It also doesn't lose "full" state at this loss level though. From the
>> architecture it could be called "hv state loss level", but in POWER10
>> even that is not strictly true.
>>
> Right. Just discovered that deep stop states won't loose full state
> P10 onwards.
> Would it better if we rename it as "pnv_all_spr_loss_state" instead
> so that it stays generic enough while being semantically coherent?

It doesn't lose all SPRs. It does physically, but for Linux it appears 
at least timebase SPRs are retained and that's mostly how it's 
documented.

Maybe there's no really good name for it, but we do call it "deep" stop 
in other places, you could call it deep_spr_loss maybe. I don't mind too 
much though, whatever Gautham is happy with.

Thanks,
Nick


Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

2020-07-21 Thread Waiman Long

On 7/21/20 7:08 AM, Nicholas Piggin wrote:

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index b752d34517b3..26d8766a1106 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock 
*lock)
  
  #else

  extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
  #endif
  
  static __always_inline void queued_spin_lock(struct qspinlock *lock)

  {
-   u32 val = 0;
-
-   if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
+   atomic_t *a = &lock->val;
+   u32 val;
+
+again:
+   asm volatile(
+"1:\t"   PPC_LWARX(%0,0,%1,1) " # queued_spin_lock  
\n"
+   : "=" (val)
+   : "r" (>counter)
+   : "memory");
+
+   if (likely(val == 0)) {
+   asm_volatile_goto(
+   "  stwcx.  %0,0,%1 \n"
+   "  bne-%l[again]   \n"
+   "\t"  PPC_ACQUIRE_BARRIER "  
\n"
+   :
+   : "r"(_Q_LOCKED_VAL), "r" (>counter)
+   : "cr0", "memory"
+   : again );
return;
-
-   queued_spin_lock_slowpath(lock, val);
+   }
+
+   if (likely(val == _Q_LOCKED_VAL)) {
+   asm_volatile_goto(
+   "  stwcx.  %0,0,%1 \n"
+   "  bne-%l[again]   \n"
+   :
+   : "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (>counter)
+   : "cr0", "memory"
+   : again );
+
+   atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
+// clear_pending_set_locked(lock);
+   WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+// lockevent_inc(lock_pending);
+   return;
+   }
+
+   if (val == _Q_PENDING_VAL) {
+   int cnt = _Q_PENDING_LOOPS;
+   val = atomic_cond_read_relaxed(a,
+  (VAL != _Q_PENDING_VAL) || 
!cnt--);
+   if (!(val & ~_Q_LOCKED_MASK))
+   goto again;
+}
+   queued_spin_lock_slowpath_queue(lock);
  }
  #define queued_spin_lock queued_spin_lock
  


I am fine with the arch code overriding some parts of the generic code.



diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..ebcc6f5d99d5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -287,10 +287,14 @@ static __always_inline u32  __pv_wait_head_or_lock(struct 
qspinlock *lock,
  
  #ifdef CONFIG_PARAVIRT_SPINLOCKS

  #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath_queue
native_queued_spin_lock_slowpath_queue
  #endif
  
  #endif /* _GEN_PV_LOCK_SLOWPATH */
  
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock);

+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+
  /**
   * queued_spin_lock_slowpath - acquire the queued spinlock
   * @lock: Pointer to queued spinlock structure
@@ -314,12 +318,6 @@ static __always_inline u32  __pv_wait_head_or_lock(struct 
qspinlock *lock,
   */
  void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
  {
-   struct mcs_spinlock *prev, *next, *node;
-   u32 old, tail;
-   int idx;
-
-   BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
-
if (pv_enabled())
goto pv_queue;
  
@@ -397,6 +395,26 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)

  queue:
lockevent_inc(lock_slowpath);
  pv_queue:
+   __queued_spin_lock_slowpath_queue(lock);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath);
+
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock)
+{
+   lockevent_inc(lock_slowpath);
+   __queued_spin_lock_slowpath_queue(lock);
+}
+EXPORT_SYMBOL(queued_spin_lock_slowpath_queue);
+
+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock)
+{
+   struct mcs_spinlock *prev, *next, *node;
+   u32 old, tail;
+   u32 val;
+   int idx;
+
+   BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
+
	node = this_cpu_ptr(&qnodes[0].mcs);
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);
@@ -559,7 +577,6 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 
val)
 */
__this_cpu_dec(qnodes[0].mcs.count);
  }
-EXPORT_SYMBOL(queued_spin_lock_slowpath);
  
  /*

   * Generate the paravirt code for queued_spin_unlock_slowpath().

I would prefer to extract the pending bit handling code out into a 
separate helper function which can be overridden by the arch code 
instead of breaking the slowpath 
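
Roughly, that kind of split might look like the following sketch (the
helper name and the #define override hook are hypothetical, not
existing kernel API):

/* Generic pending-bit handling, overridable by the architecture. */
#ifndef queued_spin_lock_try_pending
static __always_inline bool
queued_spin_lock_try_pending(struct qspinlock *lock, u32 *val)
{
	/* generic pending-bit trylock/spin would live here */
	return false;	/* fall back to queueing */
}
#endif

void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	if (queued_spin_lock_try_pending(lock, &val))
		return;		/* acquired without queueing */

	__queued_spin_lock_slowpath_queue(lock);
}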

Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-21 Thread Nicholas Piggin
Excerpts from Mathieu Desnoyers's message of July 21, 2020 11:11 pm:
> - On Jul 21, 2020, at 6:04 AM, Nicholas Piggin npig...@gmail.com wrote:
> 
>> Excerpts from Mathieu Desnoyers's message of July 21, 2020 2:46 am:
> [...]
>> 
>> Yeah you're probably right in this case I think. Quite likely most kernel
>> tasks that asynchronously write to user memory would at least have some
>> kind of producer-consumer barriers.
>> 
>> But is that restriction of all async modifications documented and enforced
>> anywhere?
>> 
 How about other memory accesses via kthread_use_mm? Presumably there is
 still ordering requirement there for membarrier,
>>> 
>>> Please provide an example case with memory accesses via kthread_use_mm where
>>> ordering matters to support your concern.
>> 
>> I think the concern Andy raised with io_uring was less a specific
>> problem he saw and more a general concern that we have these memory
>> accesses which are not synchronized with membarrier.
>> 
 so I really think
 it's a fragile interface with no real way for the user to know how
 kernel threads may use its mm for any particular reason, so membarrier
 should synchronize all possible kernel users as well.
>>> 
>>> I strongly doubt so, but perhaps something should be clarified in the
>>> documentation
>>> if you have that feeling.
>> 
>> I'd rather go the other way and say if you have reasoning or numbers for
>> why PF_KTHREAD is an important optimisation above rq->curr == rq->idle
>> then we could think about keeping this subtlety with appropriate
>> documentation added, otherwise we can just kill it and remove all doubt.
>> 
>> That being said, the x86 sync core gap that I imagined could be fixed
>> by changing to rq->curr == rq->idle test does not actually exist because
>> the global membarrier does not have a sync core option. So fixing the
>> exit_lazy_tlb points that this series does *should* fix that. So
>> PF_KTHREAD may be less problematic than I thought from implementation
>> point of view, only semantics.
> 
> Today, the membarrier global expedited command explicitly skips kernel 
> threads,
> but it happens that membarrier private expedited considers those with the
> same mm as target for the IPI.
> 
> So we already implement a semantic which differs between private and global
> expedited membarriers.

Which is not a good thing.

> This can be explained in part by the fact that
> kthread_use_mm was introduced after 4.16, where the most recent membarrier
> commands where introduced. It seems that the effect on membarrier was not
> considered when kthread_use_mm was introduced.

No it was just renamed, it used to be called use_mm and has been in the 
kernel for ~ever.

That you hadn't considered this actually adds weight to my point, which 
is that there is so much subtle behaviour that's easy to miss that we're 
better off with simpler and fewer special cases until it's proven 
they're needed. Not the other way around.

> 
> Looking at membarrier(2) documentation, it states that IPIs are only sent to
> threads belonging to the same process as the calling thread. If my 
> understanding
> of the notion of process is correct, this should rule out sending the IPI to
> kernel threads, given they are not "part" of the same process, only borrowing
> the mm. But I agree that the distinction is moot, and should be clarified.

It does if you read it in a user-hostile legalistic way. The reality is 
userspace shouldn't and can't know about how the kernel might implement 
functionality.

> Without a clear use-case to justify adding a constraint on membarrier, I am
> tempted to simply clarify documentation of current membarrier commands,
> stating clearly that they are not guaranteed to affect kernel threads. Then,
> if we have a compelling use-case to implement a different behavior which 
> covers
> kthreads, this could be added consistently across membarrier commands with a
> flag (or by adding new commands).
> 
> Does this approach make sense ?

The other position is that, without a clear use case for PF_KTHREAD, and 
seeing as async kernel accesses had not been considered before now, we 
limit the optimisation to only skipping the idle thread. I think that 
makes more sense (unless you have a reason for PF_KTHREAD, but it 
doesn't seem like there is much of one).

Thanks,
Nick


Re: [PATCH v4 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR

2020-07-21 Thread Ravi Bangoria




On 7/21/20 7:37 PM, Michael Ellerman wrote:

Ravi Bangoria  writes:

On 7/21/20 4:59 PM, Michael Ellerman wrote:

Ravi Bangoria  writes:

On 7/17/20 11:14 AM, Jordan Niethe wrote:

On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
 wrote:


Add new device-tree feature for 2nd DAWR. If this feature is present,
2nd DAWR is supported, otherwise not.

Signed-off-by: Ravi Bangoria 
---
arch/powerpc/include/asm/cputable.h | 7 +--
arch/powerpc/kernel/dt_cpu_ftrs.c   | 7 +++
2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index e506d429b1af..3445c86e1f6f 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
#define CPU_FTR_P9_TLBIE_ERAT_BUG  LONG_ASM_CONST(0x0001)
#define CPU_FTR_P9_RADIX_PREFETCH_BUG  LONG_ASM_CONST(0x0002)
#define CPU_FTR_ARCH_31
LONG_ASM_CONST(0x0004)
+#define CPU_FTR_DAWR1  LONG_ASM_CONST(0x0008)

#ifndef __ASSEMBLY__

@@ -497,14 +498,16 @@ static inline void cpu_feature_keys_init(void) { }
#define CPU_FTRS_POSSIBLE  \
   (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \
-CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
+CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | 
\
+CPU_FTR_DAWR1)
#else
#define CPU_FTRS_POSSIBLE  \
   (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \
-CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
+CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | 
\
+CPU_FTR_DAWR1)



Instead of putting CPU_FTR_DAWR1 into CPU_FTRS_POSSIBLE should it go
into CPU_FTRS_POWER10?
Then it will be picked up by CPU_FTRS_POSSIBLE.


I remember a discussion about this with Mikey and we decided to do it
this way. Obviously, the purpose is to make CPU_FTR_DAWR1 independent of
CPU_FTRS_POWER10 because DAWR1 is an optional feature in p10. I fear
including CPU_FTR_DAWR1 in CPU_FTRS_POWER10 can make it forcefully enabled
even when device-tree property is not present or pa-feature bit it not set,
because we do:

 {   /* 3.1-compliant processor, i.e. Power10 "architected" mode */
 .pvr_mask   = 0x,
 .pvr_value  = 0x0f06,
 .cpu_name   = "POWER10 (architected)",
 .cpu_features   = CPU_FTRS_POWER10,


The pa-features logic will turn it off if the feature bit is not set.

So you should be able to put it in CPU_FTRS_POWER10.

See for example CPU_FTR_NOEXECUTE.


Ah ok. scan_features() clears the feature if the bit is not set in
pa-features. So it should work fine for powervm. I'll verify the same
thing happens in case of baremetal where we use cpu-features not
pa-features. If it works in baremetal as well, will put it in
CPU_FTRS_POWER10.


When we use DT CPU features we don't use CPU_FTRS_POWER10 at all.

We construct a cpu_spec from scratch with just the base set of features:

static struct cpu_spec __initdata base_cpu_spec = {
.cpu_name   = NULL,
.cpu_features   = CPU_FTRS_DT_CPU_BASE,


And then individual features are enabled via the device tree flags.


Ah good. I was under a wrong impression that we use cpu_specs[] for all
the cases. Thanks mpe for explaining in detail :)

Ravi


Re: [PATCH v4 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR

2020-07-21 Thread Michael Ellerman
Ravi Bangoria  writes:
> On 7/21/20 4:59 PM, Michael Ellerman wrote:
>> Ravi Bangoria  writes:
>>> On 7/17/20 11:14 AM, Jordan Niethe wrote:
 On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
  wrote:
>
> Add new device-tree feature for 2nd DAWR. If this feature is present,
> 2nd DAWR is supported, otherwise not.
>
> Signed-off-by: Ravi Bangoria 
> ---
>arch/powerpc/include/asm/cputable.h | 7 +--
>arch/powerpc/kernel/dt_cpu_ftrs.c   | 7 +++
>2 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/cputable.h 
> b/arch/powerpc/include/asm/cputable.h
> index e506d429b1af..3445c86e1f6f 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
>#define CPU_FTR_P9_TLBIE_ERAT_BUG  
> LONG_ASM_CONST(0x0001)
>#define CPU_FTR_P9_RADIX_PREFETCH_BUG  
> LONG_ASM_CONST(0x0002)
>#define CPU_FTR_ARCH_31
> LONG_ASM_CONST(0x0004)
> +#define CPU_FTR_DAWR1  LONG_ASM_CONST(0x0008)
>
>#ifndef __ASSEMBLY__
>
> @@ -497,14 +498,16 @@ static inline void cpu_feature_keys_init(void) { }
>#define CPU_FTRS_POSSIBLE  \
>   (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
>CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 
> | \
> -CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
> CPU_FTRS_POWER10)
> +CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
> CPU_FTRS_POWER10 | \
> +CPU_FTR_DAWR1)
>#else
>#define CPU_FTRS_POSSIBLE  \
>   (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
>CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
>CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
>CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 
> | \
> -CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
> CPU_FTRS_POWER10)
> +CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
> CPU_FTRS_POWER10 | \
> +CPU_FTR_DAWR1)
>> 
 Instead of putting CPU_FTR_DAWR1 into CPU_FTRS_POSSIBLE should it go
 into CPU_FTRS_POWER10?
 Then it will be picked up by CPU_FTRS_POSSIBLE.
>>>
>>> I remember a discussion about this with Mikey and we decided to do it
>>> this way. Obviously, the purpose is to make CPU_FTR_DAWR1 independent of
>>> CPU_FTRS_POWER10 because DAWR1 is an optional feature in p10. I fear
>>> including CPU_FTR_DAWR1 in CPU_FTRS_POWER10 can make it forcefully enabled
>>> even when device-tree property is not present or pa-feature bit it not set,
>>> because we do:
>>>
>>> {   /* 3.1-compliant processor, i.e. Power10 "architected" mode 
>>> */
>>> .pvr_mask   = 0x,
>>> .pvr_value  = 0x0f06,
>>> .cpu_name   = "POWER10 (architected)",
>>> .cpu_features   = CPU_FTRS_POWER10,
>> 
>> The pa-features logic will turn it off if the feature bit is not set.
>> 
>> So you should be able to put it in CPU_FTRS_POWER10.
>> 
>> See for example CPU_FTR_NOEXECUTE.
>
> Ah ok. scan_features() clears the feature if the bit is not set in
> pa-features. So it should work find for powervm. I'll verify the same
> thing happens in case of baremetal where we use cpu-features not
> pa-features. If it works in baremetal as well, will put it in
> CPU_FTRS_POWER10.

When we use DT CPU features we don't use CPU_FTRS_POWER10 at all.

We construct a cpu_spec from scratch with just the base set of features:

static struct cpu_spec __initdata base_cpu_spec = {
.cpu_name   = NULL,
.cpu_features   = CPU_FTRS_DT_CPU_BASE,


And then individual features are enabled via the device tree flags.
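
For example, the dt_cpu_ftrs side would presumably gain an entry along
these lines (sketch only; the node name and table layout are from
memory of dt_cpu_ftrs.c, not copied from the patch):

static struct dt_cpu_feature_match __initdata dt_cpu_feature_match_table[] = {
	/* ... existing entries ... */
	{"debug-facilities-v31", feat_enable, CPU_FTR_DAWR1},	/* node name assumed */
};

If the firmware doesn't advertise the node, the CPU_FTR bit simply
never gets set on top of CPU_FTRS_DT_CPU_BASE.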

cheers


Re: [PATCH v4 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR

2020-07-21 Thread Ravi Bangoria




On 7/21/20 4:59 PM, Michael Ellerman wrote:

Ravi Bangoria  writes:

On 7/17/20 11:14 AM, Jordan Niethe wrote:

On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
 wrote:


Add new device-tree feature for 2nd DAWR. If this feature is present,
2nd DAWR is supported, otherwise not.

Signed-off-by: Ravi Bangoria 
---
   arch/powerpc/include/asm/cputable.h | 7 +--
   arch/powerpc/kernel/dt_cpu_ftrs.c   | 7 +++
   2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index e506d429b1af..3445c86e1f6f 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
   #define CPU_FTR_P9_TLBIE_ERAT_BUG  LONG_ASM_CONST(0x0001)
   #define CPU_FTR_P9_RADIX_PREFETCH_BUG  LONG_ASM_CONST(0x0002)
   #define CPU_FTR_ARCH_31
LONG_ASM_CONST(0x0004)
+#define CPU_FTR_DAWR1  LONG_ASM_CONST(0x0008)

   #ifndef __ASSEMBLY__

@@ -497,14 +498,16 @@ static inline void cpu_feature_keys_init(void) { }
   #define CPU_FTRS_POSSIBLE  \
  (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
   CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \
-CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
+CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | 
\
+CPU_FTR_DAWR1)
   #else
   #define CPU_FTRS_POSSIBLE  \
  (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
   CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
   CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
   CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \
-CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10)
+CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | CPU_FTRS_POWER10 | 
\
+CPU_FTR_DAWR1)



Instead of putting CPU_FTR_DAWR1 into CPU_FTRS_POSSIBLE should it go
into CPU_FTRS_POWER10?
Then it will be picked up by CPU_FTRS_POSSIBLE.


I remember a discussion about this with Mikey and we decided to do it
this way. Obviously, the purpose is to make CPU_FTR_DAWR1 independent of
CPU_FTRS_POWER10 because DAWR1 is an optional feature in p10. I fear
including CPU_FTR_DAWR1 in CPU_FTRS_POWER10 can make it forcefully enabled
even when device-tree property is not present or pa-feature bit it not set,
because we do:

{   /* 3.1-compliant processor, i.e. Power10 "architected" mode */
.pvr_mask   = 0x,
.pvr_value  = 0x0f06,
.cpu_name   = "POWER10 (architected)",
.cpu_features   = CPU_FTRS_POWER10,


The pa-features logic will turn it off if the feature bit is not set.

So you should be able to put it in CPU_FTRS_POWER10.

See for example CPU_FTR_NOEXECUTE.


Ah ok. scan_features() clears the feature if the bit is not set in
pa-features. So it should work fine for powervm. I'll verify the same
thing happens in case of baremetal where we use cpu-features not
pa-features. If it works in baremetal as well, will put it in
CPU_FTRS_POWER10.

Thanks for the clarification,
Ravi


Re: [PATCH v4 09/10] powerpc/watchpoint: Return available watchpoints dynamically

2020-07-21 Thread Ravi Bangoria




On 7/21/20 5:06 PM, Michael Ellerman wrote:

Ravi Bangoria  writes:

On 7/20/20 9:12 AM, Jordan Niethe wrote:

On Fri, Jul 17, 2020 at 2:11 PM Ravi Bangoria
 wrote:


So far Book3S Powerpc supported only one watchpoint. Power10 is
introducing 2nd DAWR. Enable 2nd DAWR support for Power10.
Availability of 2nd DAWR will depend on CPU_FTR_DAWR1.

Signed-off-by: Ravi Bangoria 
---
   arch/powerpc/include/asm/cputable.h  | 4 +++-
   arch/powerpc/include/asm/hw_breakpoint.h | 5 +++--
   2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 3445c86e1f6f..36a0851a7a9b 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -633,7 +633,9 @@ enum {
* Maximum number of hw breakpoint supported on powerpc. Number of
* breakpoints supported by actual hw might be less than this.
*/
-#define HBP_NUM_MAX1
+#define HBP_NUM_MAX2
+#define HBP_NUM_ONE1
+#define HBP_NUM_TWO2



I wonder if these defines are necessary - has it any advantage over
just using the literal?


No, not really. Initially I had something like:

#define HBP_NUM_MAX2
#define HBP_NUM_P8_P9  1
#define HBP_NUM_P102

But then I thought it's also not right. So I made it _ONE and _TWO.
Now that the function that decides the number of watchpoints dynamically
(nr_wp_slots) is in a different file, I thought keeping it like this
would make it easier to figure out why _MAX is 2.


I don't think it makes anything clearer.

I had to stare at it thinking there was some sort of mapping or
indirection going on, before I realised it's just literally the number
of breakpoints.

So please just do:

static inline int nr_wp_slots(void)
{
return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1;
}

If you think HBP_NUM_MAX needs explanation then do that with a comment,
it can refer to nr_wp_slots() if that's helpful.


Agreed. By adding a comment, we can remove those macros. Will change it.
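
Something like this, perhaps (comment wording is just a sketch):

/*
 * Maximum number of hw breakpoints supported on powerpc.  The number
 * actually available on a given CPU is returned by nr_wp_slots().
 */
#define HBP_NUM_MAX	2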

Thanks,
Ravi


Re: [RFC PATCH 4/7] x86: use exit_lazy_tlb rather than membarrier_mm_sync_core_before_usermode

2020-07-21 Thread Mathieu Desnoyers
- On Jul 21, 2020, at 6:04 AM, Nicholas Piggin npig...@gmail.com wrote:

> Excerpts from Mathieu Desnoyers's message of July 21, 2020 2:46 am:
[...]
> 
> Yeah you're probably right in this case I think. Quite likely most kernel
> tasks that asynchronously write to user memory would at least have some
> kind of producer-consumer barriers.
> 
> But is that restriction of all async modifications documented and enforced
> anywhere?
> 
>>> How about other memory accesses via kthread_use_mm? Presumably there is
>>> still ordering requirement there for membarrier,
>> 
>> Please provide an example case with memory accesses via kthread_use_mm where
>> ordering matters to support your concern.
> 
> I think the concern Andy raised with io_uring was less a specific
> problem he saw and more a general concern that we have these memory
> accesses which are not synchronized with membarrier.
> 
>>> so I really think
>>> it's a fragile interface with no real way for the user to know how
>>> kernel threads may use its mm for any particular reason, so membarrier
>>> should synchronize all possible kernel users as well.
>> 
>> I strongly doubt so, but perhaps something should be clarified in the
>> documentation
>> if you have that feeling.
> 
> I'd rather go the other way and say if you have reasoning or numbers for
> why PF_KTHREAD is an important optimisation above rq->curr == rq->idle
> then we could think about keeping this subtlety with appropriate
> documentation added, otherwise we can just kill it and remove all doubt.
> 
> That being said, the x86 sync core gap that I imagined could be fixed
> by changing to rq->curr == rq->idle test does not actually exist because
> the global membarrier does not have a sync core option. So fixing the
> exit_lazy_tlb points that this series does *should* fix that. So
> PF_KTHREAD may be less problematic than I thought from implementation
> point of view, only semantics.

Today, the membarrier global expedited command explicitly skips kernel threads,
but it happens that membarrier private expedited considers those with the
same mm as target for the IPI.

So we already implement a semantic which differs between private and global
expedited membarriers. This can be explained in part by the fact that
kthread_use_mm was introduced after 4.16, where the most recent membarrier
commands where introduced. It seems that the effect on membarrier was not
considered when kthread_use_mm was introduced.

Looking at membarrier(2) documentation, it states that IPIs are only sent to
threads belonging to the same process as the calling thread. If my understanding
of the notion of process is correct, this should rule out sending the IPI to
kernel threads, given they are not "part" of the same process, only borrowing
the mm. But I agree that the distinction is moot, and should be clarified.

Without a clear use-case to justify adding a constraint on membarrier, I am
tempted to simply clarify documentation of current membarrier commands,
stating clearly that they are not guaranteed to affect kernel threads. Then,
if we have a compelling use-case to implement a different behavior which covers
kthreads, this could be added consistently across membarrier commands with a
flag (or by adding new commands).

Does this approach make sense ?
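
(For reference, a minimal userspace sketch of the private expedited
command under discussion; as documented, it IPIs the other threads that
share the caller's mm, and whether kthreads borrowing that mm are
targeted is exactly the point in question:)

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int membarrier(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* registration is required before the private expedited command */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		perror("register");
	/* acts as a memory barrier on all other threads of this process */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		perror("membarrier");
	return 0;
}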

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


[PATCH v3 0/2] Selftest for cpuidle latency measurement

2020-07-21 Thread Pratik Rajesh Sampat
v2: https://lkml.org/lkml/2020/7/17/369
Changelog v2-->v3
Based on comments from Gautham R. Shenoy adding the following in the
selftest,
1. Grepping modules to determine if already loaded
2. Wrapper to enable/disable states
3. Preventing any operation/test on offlined CPUs 
---

The patch series introduces a mechanism to measure wakeup latency for
IPI and timer based interrupts
The motivation behind this series is to find significant deviations
from the advertised latency and residency values

To achieve this, we introduce a kernel module and expose its control
knobs through the debugfs interface that the selftests can engage with.

The kernel module provides the following interfaces within
/sys/kernel/debug/latency_test/ for,
1. IPI test:
  ipi_cpu_dest   # Destination CPU for the IPI
  ipi_cpu_src# Origin of the IPI
  ipi_latency_ns # Measured latency time in ns
2. Timeout test:
  timeout_cpu_src # CPU on which the timer to be queued
  timeout_expected_ns # Timer duration
  timeout_diff_ns # Difference of actual duration vs expected timer
To include the module, enable the following config option as a module:
kernel hacking -> Cpuidle latency selftests
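
Once the module is loaded, the knobs can also be driven directly; a
minimal C sketch of a single IPI measurement, mirroring what the
selftest script does (destination CPU 4 is an arbitrary pick, and
debugfs is assumed to be mounted at /sys/kernel/debug):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/latency_test/ipi_cpu_dest", "w");
	long latency_ns;

	if (!f)
		return 1;
	/* writing the destination CPU kicks off the IPI measurement */
	fprintf(f, "4\n");
	fclose(f);

	f = fopen("/sys/kernel/debug/latency_test/ipi_latency_ns", "r");
	if (!f)
		return 1;
	fscanf(f, "%ld", &latency_ns);
	fclose(f);

	printf("IPI wakeup latency: %ld ns\n", latency_ns);
	return 0;
}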

The selftest inserts the module, disables all the idle states and
enables them one by one testing the following:
1. Keeping source CPU constant, iterates through all the CPUS measuring
   IPI latency for baseline (CPU is busy with
   "cat /dev/random > /dev/null" workload) and the when the CPU is
   allowed to be at rest
2. Iterating through all the CPUs, sending expected timer durations to
   be equivalent to the residency of the deepest idle state
   enabled and extracting the difference in time between the time of
   wakeup and the expected timer duration

Usage
-
Can be used in conjunction with the rest of the selftests.
Default Output location in: tools/testing/cpuidle/cpuidle.log

To run this test specifically:
$ make -C tools/testing/selftests TARGETS="cpuidle" run_tests

There are a few optional arguments too that the script can take
[-h ]
[-m ]
[-o ]

Sample output snippet
-
--IPI Latency Test---
--Baseline IPI Latency measurement: CPU Busy--
SRC_CPU   DEST_CPU IPI_Latency(ns)
...
08 1996
09 2125
0   10 1264
0   11 1788
0   12 2045
Baseline Average IPI latency(ns): 1843
---Enabling state: 5---
SRC_CPU   DEST_CPU IPI_Latency(ns)
08   621719
09   624752
0   10   622218
0   11   623968
0   12   621303
Expected IPI latency(ns): 10
Observed Average IPI latency(ns): 622792

--Timeout Latency Test--
--Baseline Timeout Latency measurement: CPU Busy--
Wakeup_src Baseline_delay(ns) 
...
82249
92226
10   2211
11   2183
12   2263
Baseline Average timeout diff(ns): 2226
---Enabling state: 5---
8   10749   
9   10911   
10  10912   
11  12100   
12  73276   
Expected timeout(ns): 1200
Observed Average timeout diff(ns): 23589

Pratik Rajesh Sampat (2):
  cpuidle: Trace IPI based and timer based wakeup latency from idle
states
  selftest/cpuidle: Add support for cpuidle latency measurement

 drivers/cpuidle/Makefile   |   1 +
 drivers/cpuidle/test-cpuidle_latency.c | 150 ++
 lib/Kconfig.debug  |  10 +
 tools/testing/selftests/Makefile   |   1 +
 tools/testing/selftests/cpuidle/Makefile   |   6 +
 tools/testing/selftests/cpuidle/cpuidle.sh | 310 +
 tools/testing/selftests/cpuidle/settings   |   1 +
 7 files changed, 479 insertions(+)
 create mode 100644 drivers/cpuidle/test-cpuidle_latency.c
 create mode 100644 tools/testing/selftests/cpuidle/Makefile
 create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
 create mode 100644 tools/testing/selftests/cpuidle/settings

-- 
2.25.4



[PATCH v3 2/2] selftest/cpuidle: Add support for cpuidle latency measurement

2020-07-21 Thread Pratik Rajesh Sampat
This patch adds support to trace IPI-based and timer-based wakeup
latency from idle states.

It latches onto the test-cpuidle_latency kernel module using the debugfs
interface to send IPIs or schedule a timer-based event, which in turn
populates the debugfs with the latency measurements.

Currently, for both the IPI and timer tests, the script first disables
all idle states and then measures latency while incrementally enabling
each state.

Signed-off-by: Pratik Rajesh Sampat 
---
 tools/testing/selftests/Makefile   |   1 +
 tools/testing/selftests/cpuidle/Makefile   |   6 +
 tools/testing/selftests/cpuidle/cpuidle.sh | 310 +
 tools/testing/selftests/cpuidle/settings   |   1 +
 4 files changed, 318 insertions(+)
 create mode 100644 tools/testing/selftests/cpuidle/Makefile
 create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
 create mode 100644 tools/testing/selftests/cpuidle/settings

diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile
index 1195bd85af38..ab6cf51f3518 100644
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -7,6 +7,7 @@ TARGETS += capabilities
 TARGETS += cgroup
 TARGETS += clone3
 TARGETS += cpufreq
+TARGETS += cpuidle
 TARGETS += cpu-hotplug
 TARGETS += drivers/dma-buf
 TARGETS += efivarfs
diff --git a/tools/testing/selftests/cpuidle/Makefile 
b/tools/testing/selftests/cpuidle/Makefile
new file mode 100644
index ..72fd5d2e974d
--- /dev/null
+++ b/tools/testing/selftests/cpuidle/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+all:
+
+TEST_PROGS := cpuidle.sh
+
+include ../lib.mk
diff --git a/tools/testing/selftests/cpuidle/cpuidle.sh 
b/tools/testing/selftests/cpuidle/cpuidle.sh
new file mode 100755
index ..19cc24ccd4af
--- /dev/null
+++ b/tools/testing/selftests/cpuidle/cpuidle.sh
@@ -0,0 +1,310 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+LOG=cpuidle.log
+MODULE=/lib/modules/$(uname -r)/kernel/drivers/cpuidle/test-cpuidle_latency.ko
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+helpme()
+{
+   printf "Usage: $0 [-h] [-todg args]
+   [-h ]
+   [-m ]
+   [-o ]
+   \n"
+   exit 2
+}
+
+parse_arguments()
+{
+   while getopts ht:m:o: arg
+   do
+   case $arg in
+   h) # --help
+   helpme
+   ;;
+   m) # --mod-file
+   MODULE=$OPTARG
+   ;;
+   o) # output log files
+   LOG=$OPTARG
+   ;;
+   \?)
+   helpme
+   ;;
+   esac
+   done
+}
+
+ins_mod()
+{
+   if [ ! -f "$MODULE" ]; then
+   printf "$MODULE module does not exist. Exitting\n"
+   exit $ksft_skip
+   fi
+   # Check if the module is already loaded
+   grep "test_cpuidle_latency" /proc/modules
+   if [ $? == 0 ]; then
+   printf "Module: $MODULE already loaded\n\n"
+   return 0
+   fi
+   printf "Inserting $MODULE module\n\n"
+   insmod $MODULE
+   if [ $? != 0 ]; then
+   printf "Insmod $MODULE failed\n"
+   exit $ksft_skip
+   fi
+}
+
+compute_average()
+{
+   arr=("$@")
+   sum=0
+   size=${#arr[@]}
+   if [ $size == 0 ]; then
+   avg=0
+   return 1
+   fi
+   for i in "${arr[@]}"
+   do
+   sum=$((sum + i))
+   done
+   avg=$((sum/size))
+}
+
+# Disable all stop states
+disable_idle()
+{
+   for ((cpu=0; cpu<NUM_CPUS; cpu++))
+   do
+   for ((state=0; state<NUM_STATES; state++))
+   do
+   echo 1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$state/disable
+   done
+   done
+}
+
+# Perform operation on each CPU for the given state
+# $1 - Operation: enable (0) / disable (1)
+# $2 - State to enable
+op_state()
+{
+   for ((cpu=0; cpu<NUM_CPUS; cpu++))
+   do
+   echo $1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$2/disable
+   done
+}
+
+cpuidle_enable_state()
+{
+   state=$1
+   op_state 0 $state
+}
+
+cpuidle_disable_state()
+{
+   state=$1
+   op_state 1 $state
+}
+
+cpu_is_online()
+{
+   cpu=$1
+   status=$(cat /sys/devices/system/cpu/cpu$cpu/online)
+   echo $status
+}
+
+# Extract latency in microseconds and convert to nanoseconds
+extract_latency()
+{
+   for ((state=0; state /dev/null &
+   task_pid=$!
+   # Wait for the workload to achieve 100% CPU usage
+   sleep 1
+   fi
+   taskset 0x1 echo $dest_cpu > /sys/kernel/debug/latency_test/ipi_cpu_dest
+   ipi_latency=$(cat /sys/kernel/debug/latency_test/ipi_latency_ns)
+   src_cpu=$(cat /sys/kernel/debug/latency_test/ipi_cpu_src)
+   if [ "$1" = "baseline" ]; then
+   kill $task_pid
+   wait $task_pid 2>/dev/null
+   fi
+}
+
+# Incrementally Enable idle states one 

[PATCH v3 1/2] cpuidle: Trace IPI based and timer based wakeup latency from idle states

2020-07-21 Thread Pratik Rajesh Sampat
Fire directed smp_call_function_single IPIs from a specified source
CPU to the specified target CPU to reduce the noise we have to wade
through in the trace log.
The module is based on the idea written by Srivatsa Bhat and maintained
by Vaidyanathan Srinivasan internally.

Queue a HR timer and measure the jitter. Wakeup latency measurement for
idle states uses an hrtimer: echo a value in ns to timer_test_function and
watch the trace. An hrtimer will be queued and when it fires, the expected
wakeup vs actual wakeup delay is computed and printed in ns.

Implemented as a module which utilizes debugfs so that it can be
integrated with selftests.
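
For example, once the module is loaded, the debugfs interface can be
exercised by hand roughly like this (file names taken from the cover
letter and the accompanying selftest; treat them as illustrative):

  # IPI test: measure wakeup latency of CPU 4
  echo 4 > /sys/kernel/debug/latency_test/ipi_cpu_dest
  cat /sys/kernel/debug/latency_test/ipi_latency_ns
  cat /sys/kernel/debug/latency_test/ipi_cpu_src

  # Timer test: queue a 100000ns hrtimer and read the wakeup jitter
  echo 100000 > /sys/kernel/debug/latency_test/timeout_expected_ns
  cat /sys/kernel/debug/latency_test/timeout_diff_ns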

To include the module, enable the following config option (built as a module):
kernel hacking -> Cpuidle latency selftests

[srivatsa.b...@linux.vnet.ibm.com: Initial implementation in
 cpidle/sysfs]

[sva...@linux.vnet.ibm.com: wakeup latency measurements using hrtimer
 and fix some of the time calculation]

[e...@linux.vnet.ibm.com: Fix some whitespace and tab errors and
 increase the resolution of IPI wakeup]

Signed-off-by: Pratik Rajesh Sampat 
Reviewed-by: Gautham R. Shenoy 
---
 drivers/cpuidle/Makefile   |   1 +
 drivers/cpuidle/test-cpuidle_latency.c | 150 +
 lib/Kconfig.debug  |  10 ++
 3 files changed, 161 insertions(+)
 create mode 100644 drivers/cpuidle/test-cpuidle_latency.c

diff --git a/drivers/cpuidle/Makefile b/drivers/cpuidle/Makefile
index f07800cbb43f..2ae05968078c 100644
--- a/drivers/cpuidle/Makefile
+++ b/drivers/cpuidle/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED) += coupled.o
 obj-$(CONFIG_DT_IDLE_STATES) += dt_idle_states.o
 obj-$(CONFIG_ARCH_HAS_CPU_RELAX) += poll_state.o
 obj-$(CONFIG_HALTPOLL_CPUIDLE)   += cpuidle-haltpoll.o
+obj-$(CONFIG_IDLE_LATENCY_SELFTEST)  += test-cpuidle_latency.o
 
 
##
 # ARM SoC drivers
diff --git a/drivers/cpuidle/test-cpuidle_latency.c 
b/drivers/cpuidle/test-cpuidle_latency.c
new file mode 100644
index ..61574665e972
--- /dev/null
+++ b/drivers/cpuidle/test-cpuidle_latency.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Module-based API test facility for cpuidle latency using IPIs and timers
+ */
+
+#include 
+#include 
+#include 
+
+/* IPI based wakeup latencies */
+struct latency {
+   unsigned int src_cpu;
+   unsigned int dest_cpu;
+   ktime_t time_start;
+   ktime_t time_end;
+   u64 latency_ns;
+} ipi_wakeup;
+
+static void measure_latency(void *info)
+{
+   struct latency *v;
+   ktime_t time_diff;
+
+   v = (struct latency *)info;
+   v->time_end = ktime_get();
+   time_diff = ktime_sub(v->time_end, v->time_start);
+   v->latency_ns = ktime_to_ns(time_diff);
+}
+
+void run_smp_call_function_test(unsigned int cpu)
+{
+   ipi_wakeup.src_cpu = smp_processor_id();
+   ipi_wakeup.dest_cpu = cpu;
+   ipi_wakeup.time_start = ktime_get();
+   smp_call_function_single(cpu, measure_latency, &ipi_wakeup, 1);
+}
+
+/* Timer based wakeup latencies */
+struct timer_data {
+   unsigned int src_cpu;
+   u64 timeout;
+   ktime_t time_start;
+   ktime_t time_end;
+   struct hrtimer timer;
+   u64 timeout_diff_ns;
+} timer_wakeup;
+
+static enum hrtimer_restart timer_called(struct hrtimer *hrtimer)
+{
+   struct timer_data *w;
+   ktime_t time_diff;
+
+   w = container_of(hrtimer, struct timer_data, timer);
+   w->time_end = ktime_get();
+
+   time_diff = ktime_sub(w->time_end, w->time_start);
+   time_diff = ktime_sub(time_diff, ns_to_ktime(w->timeout));
+   w->timeout_diff_ns = ktime_to_ns(time_diff);
+   return HRTIMER_NORESTART;
+}
+
+static void run_timer_test(unsigned int ns)
+{
+   hrtimer_init(&timer_wakeup.timer, CLOCK_MONOTONIC,
+HRTIMER_MODE_REL);
+   timer_wakeup.timer.function = timer_called;
+   timer_wakeup.time_start = ktime_get();
+   timer_wakeup.src_cpu = smp_processor_id();
+   timer_wakeup.timeout = ns;
+
+   hrtimer_start(&timer_wakeup.timer, ns_to_ktime(ns),
+ HRTIMER_MODE_REL_PINNED);
+}
+
+static struct dentry *dir;
+
+static int cpu_read_op(void *data, u64 *value)
+{
+   *value = ipi_wakeup.dest_cpu;
+   return 0;
+}
+
+static int cpu_write_op(void *data, u64 value)
+{
+   run_smp_call_function_test(value);
+   return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(ipi_ops, cpu_read_op, cpu_write_op, "%llu\n");
+
+static int timeout_read_op(void *data, u64 *value)
+{
+   *value = timer_wakeup.timeout;
+   return 0;
+}
+
+static int timeout_write_op(void *data, u64 value)
+{
+   run_timer_test(value);
+   return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(timeout_ops, timeout_read_op, timeout_write_op, 
"%llu\n");
+
+static int __init latency_init(void)
+{
+   struct dentry *temp;
+
+   dir = debugfs_create_dir("latency_test", 0);
+  

Re: [PATCH v3 0/4] powerpc/mm/radix: Memory unplug fixes

2020-07-21 Thread Michael Ellerman
Bharata B Rao  writes:
> On Tue, Jul 21, 2020 at 11:45:20AM +1000, Michael Ellerman wrote:
>> Nathan Lynch  writes:
>> > "Aneesh Kumar K.V"  writes:
>> >> This is the next version of the fixes for memory unplug on radix.
>> >> The issues and the fix are described in the actual patches.
>> >
>> > I guess this isn't actually causing problems at runtime right now, but I
>> > notice calls to resize_hpt_for_hotplug() from arch_add_memory() and
>> > arch_remove_memory(), which ought to be mmu-agnostic:
>> >
>> > int __ref arch_add_memory(int nid, u64 start, u64 size,
>> >  struct mhp_params *params)
>> > {
>> >unsigned long start_pfn = start >> PAGE_SHIFT;
>> >unsigned long nr_pages = size >> PAGE_SHIFT;
>> >int rc;
>> >
>> >resize_hpt_for_hotplug(memblock_phys_mem_size());
>> >
>> >start = (unsigned long)__va(start);
>> >rc = create_section_mapping(start, start + size, nid,
>> >params->pgprot);
>> > ...
>> 
>> Hmm well spotted.
>> 
>> That does return early if the ops are not setup:
>> 
>> int resize_hpt_for_hotplug(unsigned long new_mem_size)
>> {
>>  unsigned target_hpt_shift;
>> 
>>  if (!mmu_hash_ops.resize_hpt)
>>  return 0;
>> 
>> 
>> And:
>> 
>> void __init hpte_init_pseries(void)
>> {
>>  ...
>>  if (firmware_has_feature(FW_FEATURE_HPT_RESIZE))
>>  mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt;
>> 
>> And that comes in via ibm,hypertas-functions:
>> 
>>  {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
>> 
>> 
>> But firmware is not necessarily going to add/remove that call based on
>> whether we're using hash/radix.
>
> Correct but hpte_init_pseries() will not be called for radix guests.

Yeah, duh. You'd think the function name would have been a sufficient
clue for me :)

>> So I think a follow-up patch is needed to make this more robust.
>> 
>> Aneesh/Bharata what platform did you test this series on? I'm curious
>> how this didn't break.
>
> I have tested memory hotplug/unplug for radix guest on zz platform and
> sanity-tested this for hash guest on P8.
>
> As noted above, mmu_hash_ops.resize_hpt will not be set for radix
> guest and hence we won't see any breakage.

OK.

That's probably fine as it is then. Or maybe just a comment in
resize_hpt_for_hotplug() pointing out that resize_hpt will be NULL if
we're using radix.
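
Something like this perhaps (sketch only, comment wording is just a
suggestion):

	int resize_hpt_for_hotplug(unsigned long new_mem_size)
	{
		unsigned target_hpt_shift;

		/*
		 * resize_hpt is only populated by hpte_init_pseries(), i.e.
		 * for hash guests with FW_FEATURE_HPT_RESIZE. On radix it
		 * stays NULL and this function is a no-op.
		 */
		if (!mmu_hash_ops.resize_hpt)
			return 0;
	...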

cheers


Re: [PATCH v2 2/2] selftest/cpuidle: Add support for cpuidle latency measurement

2020-07-21 Thread Pratik Sampat

Hi Gautham, Thanks for the review.


On 20/07/20 11:22 am, Gautham R Shenoy wrote:

Hi Pratik,


On Fri, Jul 17, 2020 at 02:48:01PM +0530, Pratik Rajesh Sampat wrote:

This patch adds support to trace IPI based and timer based wakeup
latency from idle states

Latches onto the test-cpuidle_latency kernel module using the debugfs
interface to send IPIs or schedule a timer based event, which in-turn
populates the debugfs with the latency measurements.

Currently for the IPI and timer tests; first disable all idle states
and then test for latency measurements incrementally enabling each state

Signed-off-by: Pratik Rajesh Sampat 

A few comments below.


---
  tools/testing/selftests/Makefile   |   1 +
  tools/testing/selftests/cpuidle/Makefile   |   6 +
  tools/testing/selftests/cpuidle/cpuidle.sh | 257 +
  tools/testing/selftests/cpuidle/settings   |   1 +
  4 files changed, 265 insertions(+)
  create mode 100644 tools/testing/selftests/cpuidle/Makefile
  create mode 100755 tools/testing/selftests/cpuidle/cpuidle.sh
  create mode 100644 tools/testing/selftests/cpuidle/settings


[..skip..]


+
+ins_mod()
+{
+   if [ ! -f "$MODULE" ]; then
+   printf "$MODULE module does not exist. Exitting\n"

If the module has been compiled into the kernel (due to a
localyesconfig, for instance), then it is unlikely that we will find
it in /lib/modules. Perhaps you want to check if the debugfs
directories created by the module exist, and if so, print a message
saying that the modules is already loaded or some such?


That's a good idea. I can grep for this module within /proc/modules
and not insert it if it is already there.


+   exit $ksft_skip
+   fi
+   printf "Inserting $MODULE module\n\n"
+   insmod $MODULE
+   if [ $? != 0 ]; then
+   printf "Insmod $MODULE failed\n"
+   exit $ksft_skip
+   fi
+}
+
+compute_average()
+{
+   arr=("$@")
+   sum=0
+   size=${#arr[@]}
+   for i in "${arr[@]}"
+   do
+   sum=$((sum + i))
+   done
+   avg=$((sum/size))

It would be good to assert that "size" isn't 0 here.


Sure


+}
+
+# Disable all stop states
+disable_idle()
+{
+   for ((cpu=0; cpu 
/sys/devices/system/cpu/cpu$cpu/cpuidle/state$state/disable

So, on offlined CPUs, we won't see
/sys/devices/system/cpu/cpu$cpu/cpuidle/state$state directory. You
should probably perform this operation only on online CPUs.


Right. I should make CPU operations only on online CPUs all over the script
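
Something along these lines, perhaps (rough sketch; NUM_CPUS stands for the
CPU count the script already derives, and cpu_is_online just reads the
sysfs "online" flag):

  cpu_is_online()
  {
          cpu=$1
          # note: cpu0 may not expose an "online" file on some configs
          if [ ! -f "/sys/devices/system/cpu/cpu$cpu/online" ]; then
                  echo 1
                  return
          fi
          cat /sys/devices/system/cpu/cpu$cpu/online
  }

  for ((cpu=0; cpu<NUM_CPUS; cpu++))
  do
          if [ "$(cpu_is_online $cpu)" != "1" ]; then
                  continue
          fi
          echo 1 > /sys/devices/system/cpu/cpu$cpu/cpuidle/state$state/disable
  done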

[..snip..]

Thanks
Pratik



[PATCH v2 03/10] powerpc/smp: Move powerpc_topology above

2020-07-21 Thread Srikar Dronamraju
Just moving the powerpc_topology description above.
This will help in using functions in this file and avoid declarations.

No other functional changes

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 116 +++---
 1 file changed, 58 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 0e0b118d9b6e..1ce95da00cb6 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -818,6 +818,64 @@ static int init_cpu_l1_cache_map(int cpu)
return err;
 }
 
+static bool shared_caches;
+
+#ifdef CONFIG_SCHED_SMT
+/* cpumask of CPUs with asymmetric SMT dependency */
+static int powerpc_smt_flags(void)
+{
+   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
+
+   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+   flags |= SD_ASYM_PACKING;
+   }
+   return flags;
+}
+#endif
+
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
+{
+   return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *smallcore_smt_mask(int cpu)
+{
+   return cpu_smallcore_mask(cpu);
+}
+#endif
+
+static struct sched_domain_topology_level powerpc_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+   { NULL, },
+};
+
 static int init_big_cores(void)
 {
int cpu;
@@ -1249,8 +1307,6 @@ static void add_cpu_to_masks(int cpu)
set_cpus_related(cpu, i, cpu_core_mask);
 }
 
-static bool shared_caches;
-
 /* Activate a secondary processor. */
 void start_secondary(void *unused)
 {
@@ -1314,62 +1370,6 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
-#ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymmetric SMT dependency */
-static int powerpc_smt_flags(void)
-{
-   int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
-
-   if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
-   printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
-   flags |= SD_ASYM_PACKING;
-   }
-   return flags;
-}
-#endif
-
-/*
- * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
- * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
- * since the migrated task remains cache hot. We want to take advantage of this
- * at the scheduler level so an extra topology level is required.
- */
-static int powerpc_shared_cache_flags(void)
-{
-   return SD_SHARE_PKG_RESOURCES;
-}
-
-/*
- * We can't just pass cpu_l2_cache_mask() directly because
- * returns a non-const pointer and the compiler barfs on that.
- */
-static const struct cpumask *shared_cache_mask(int cpu)
-{
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
-}
-
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *smallcore_smt_mask(int cpu)
-{
-   return cpu_smallcore_mask(cpu);
-}
-#endif
-
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
-- 
2.17.1



[PATCH v2 10/10] powerpc/smp: Implement cpu_to_coregroup_id

2020-07-21 Thread Srikar Dronamraju
Lookup the coregroup id from the associativity array.

If unable to detect the coregroup id, fallback on the core id.
This way, we ensure the sched_domain degenerates and an extra sched
domain is not created.

Ideally this function should have been implemented in
arch/powerpc/kernel/smp.c. However, if it's implemented in mm/numa.c, we
don't need to find the primary domain again.

If the device-tree mentions more than one coregroup, then kernel
implements only the last or the smallest coregroup, which currently
corresponds to the penultimate domain in the device-tree.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
powerpc/smp: Implement cpu_to_coregroup_id
Move coregroup_enabled before getting associativity (Gautham)

 arch/powerpc/mm/numa.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index ef8aa580da21..ae57b68beaee 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -1218,6 +1218,26 @@ int find_and_online_cpu_nid(int cpu)
 
 int cpu_to_coregroup_id(int cpu)
 {
+   __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
+   int index;
+
+   if (cpu < 0 || cpu > nr_cpu_ids)
+   return -1;
+
+   if (!coregroup_enabled)
+   goto out;
+
+   if (!firmware_has_feature(FW_FEATURE_VPHN))
+   goto out;
+
+   if (vphn_get_associativity(cpu, associativity))
+   goto out;
+
+   index = of_read_number(associativity, 1);
+   if (index > min_common_depth + 1)
+   return of_read_number(&associativity[index - 1], 1);
+
+out:
return cpu_to_core_id(cpu);
 }
 
-- 
2.17.1



[PATCH v2 09/10] Powerpc/smp: Create coregroup domain

2020-07-21 Thread Srikar Dronamraju
Add percpu coregroup maps and masks to create coregroup domain.
If a coregroup doesn't exist, the coregroup domain will be degenerated
in favour of SMT/CACHE domain.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Powerpc/smp: Create coregroup domain
Moved coregroup topology fixup to fixup_topology (Gautham)

 arch/powerpc/include/asm/topology.h | 10 
 arch/powerpc/kernel/smp.c   | 38 +
 arch/powerpc/mm/numa.c  |  5 
 3 files changed, 53 insertions(+)

diff --git a/arch/powerpc/include/asm/topology.h 
b/arch/powerpc/include/asm/topology.h
index f0b6300e7dd3..6609174918ab 100644
--- a/arch/powerpc/include/asm/topology.h
+++ b/arch/powerpc/include/asm/topology.h
@@ -88,12 +88,22 @@ static inline int cpu_distance(__be32 *cpu1_assoc, __be32 
*cpu2_assoc)
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR)
 extern int find_and_online_cpu_nid(int cpu);
+extern int cpu_to_coregroup_id(int cpu);
 #else
 static inline int find_and_online_cpu_nid(int cpu)
 {
return 0;
 }
 
+static inline int cpu_to_coregroup_id(int cpu)
+{
+#ifdef CONFIG_SMP
+   return cpu_to_core_id(cpu);
+#else
+   return 0;
+#endif
+}
+
 #endif /* CONFIG_NUMA && CONFIG_PPC_SPLPAR */
 
 #include 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 97b762a1944a..a7e1366b7fd3 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -80,6 +80,7 @@ DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_coregroup_map);
 
 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
@@ -91,6 +92,7 @@ enum {
smt_idx,
 #endif
bigcore_idx,
+   mc_idx,
die_idx,
 };
 
@@ -869,6 +871,21 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static struct cpumask *cpu_coregroup_mask(int cpu)
+{
+   return per_cpu(cpu_coregroup_map, cpu);
+}
+
+static bool has_coregroup_support(void)
+{
+   return coregroup_enabled;
+}
+
+static const struct cpumask *cpu_mc_mask(int cpu)
+{
+   return cpu_coregroup_mask(cpu);
+}
+
 static const struct cpumask *cpu_bigcore_mask(int cpu)
 {
return per_cpu(cpu_sibling_map, cpu);
@@ -879,6 +896,7 @@ static struct sched_domain_topology_level 
powerpc_topology[] = {
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
{ cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
+   { cpu_mc_mask, SD_INIT_NAME(MC) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -927,6 +945,10 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
GFP_KERNEL, node);
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, node);
+   if (has_coregroup_support())
+   zalloc_cpumask_var_node(&per_cpu(cpu_coregroup_map, 
cpu),
+   GFP_KERNEL, node);
+
 #ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
@@ -944,6 +966,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
 
+   if (has_coregroup_support())
+   cpumask_set_cpu(boot_cpuid, cpu_coregroup_mask(boot_cpuid));
+
init_big_cores();
if (has_big_cores) {
cpumask_set_cpu(boot_cpuid,
@@ -1235,6 +1260,8 @@ static void remove_cpu_from_masks(int cpu)
set_cpus_unrelated(cpu, i, cpu_sibling_mask);
if (has_big_cores)
set_cpus_unrelated(cpu, i, cpu_smallcore_mask);
+   if (has_coregroup_support())
+   set_cpus_unrelated(cpu, i, cpu_coregroup_mask);
}
 }
 #endif
@@ -1295,6 +1322,14 @@ static void add_cpu_to_masks(int cpu)
add_cpu_to_smallcore_masks(cpu);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
+   if (has_coregroup_support()) {
+   cpumask_set_cpu(cpu, cpu_coregroup_mask(cpu));
+   for_each_cpu(i, cpu_online_mask) {
+   if (cpu_to_coregroup_id(cpu) == cpu_to_coregroup_id(i))
+   set_cpus_related(cpu, i, cpu_coregroup_mask);
+   }
+   }
+
if (pkg_id == -1) {
struct cpumask *(*mask)(int) = cpu_sibling_mask;
 
@@ -1386,6 +1421,9 @@ int setup_profiling_timer(unsigned int 

[PATCH v2 08/10] powerpc/smp: Allocate cpumask only after searching thread group

2020-07-21 Thread Srikar Dronamraju
If allocated earlier and the search fails, then the cpumask needs to be
freed. However, cpu_l1_cache_map can be allocated after we search the
thread group.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 320e36a0ec0b..97b762a1944a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -797,10 +797,6 @@ static int init_cpu_l1_cache_map(int cpu)
if (err)
goto out;
 
-   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
-   GFP_KERNEL,
-   cpu_to_node(cpu));
-
cpu_group_start = get_cpu_thread_group_start(cpu, &tg);
 
if (unlikely(cpu_group_start == -1)) {
@@ -809,6 +805,9 @@ static int init_cpu_l1_cache_map(int cpu)
goto out;
}
 
+   zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu),
+   GFP_KERNEL, cpu_to_node(cpu));
+
for (i = first_thread; i < first_thread + threads_per_core; i++) {
int i_group_start = get_cpu_thread_group_start(i, &tg);
 
-- 
2.17.1



[PATCH v2 07/10] Powerpc/numa: Detect support for coregroup

2020-07-21 Thread Srikar Dronamraju
Add support for grouping cores based on the device-tree classification.
- The last domain in the associativity domains always refers to the
core.
- If primary reference domain happens to be the penultimate domain in
the associativity domains device-tree property, then there are no
coregroups. However if it's not the penultimate domain, then there are
coregroups. There can be more than one coregroup. For now we would be
interested in the last or the smallest coregroups.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
Powerpc/numa: Detect support for coregroup
Explained Coregroup in commit msg (Michael Ellerman)

 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/smp.c  |  1 +
 arch/powerpc/mm/numa.c | 34 +-
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 49a25e2400f2..5bdc17a7049f 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -28,6 +28,7 @@
 extern int boot_cpuid;
 extern int spinning_secondaries;
 extern u32 *cpu_to_phys_id;
+extern bool coregroup_enabled;
 
 extern void cpu_die(void);
 extern int cpu_to_chip_id(int cpu);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 933ebdf97432..320e36a0ec0b 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -74,6 +74,7 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
 
 struct task_struct *secondary_current;
 bool has_big_cores;
+bool coregroup_enabled;
 
 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
 DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map);
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index bc5b2e8112c8..3248160c0327 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -886,7 +886,9 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
 static void __init find_possible_nodes(void)
 {
struct device_node *rtas;
-   u32 numnodes, i;
+   const __be32 *domains;
+   int prop_length, max_nodes;
+   u32 i;
 
if (!numa_enabled)
return;
@@ -895,25 +897,31 @@ static void __init find_possible_nodes(void)
if (!rtas)
return;
 
-   if (of_property_read_u32_index(rtas, 
"ibm,current-associativity-domains",
-   min_common_depth, &numnodes)) {
-   /*
-* ibm,current-associativity-domains is a fairly recent
-* property. If it doesn't exist, then fallback on
-* ibm,max-associativity-domains. Current denotes what the
-* platform can support compared to max which denotes what the
-* Hypervisor can support.
-*/
-   if (of_property_read_u32_index(rtas, 
"ibm,max-associativity-domains",
-   min_common_depth, &numnodes))
+   /*
+* ibm,current-associativity-domains is a fairly recent property. If
+* it doesn't exist, then fallback on ibm,max-associativity-domains.
+* Current denotes what the platform can support compared to max
+* which denotes what the Hypervisor can support.
+*/
+   domains = of_get_property(rtas, "ibm,current-associativity-domains",
+   &prop_length);
+   if (!domains) {
+   domains = of_get_property(rtas, "ibm,max-associativity-domains",
+   &prop_length);
+   if (!domains)
goto out;
}
 
-   for (i = 0; i < numnodes; i++) {
+   max_nodes = of_read_number(&domains[min_common_depth], 1);
+   for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
}
 
+   prop_length /= sizeof(int);
+   if (prop_length > min_common_depth + 2)
+   coregroup_enabled = 1;
+
 out:
of_node_put(rtas);
 }
-- 
2.17.1



[PATCH v2 06/10] powerpc/smp: Generalize 2nd sched domain

2020-07-21 Thread Srikar Dronamraju
Currently "CACHE" domain happens to be the 2nd sched domain as per
powerpc_topology. This domain will collapse if cpumask of l2-cache is
same as SMT domain. However we could generalize this domain such that it
could either be a "CACHE" domain or a "BIGCORE" domain.

While setting up the "CACHE" domain, check if shared_cache is already
set.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
powerpc/smp: Generalize 2nd sched domain
Moved shared_cache topology fixup to fixup_topology (Gautham)

 arch/powerpc/kernel/smp.c | 49 ---
 1 file changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 57468877499a..933ebdf97432 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -85,6 +85,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
 EXPORT_PER_CPU_SYMBOL(cpu_core_map);
 EXPORT_SYMBOL_GPL(has_big_cores);
 
+enum {
+#ifdef CONFIG_SCHED_SMT
+   smt_idx,
+#endif
+   bigcore_idx,
+   die_idx,
+};
+
 #define MAX_THREAD_LIST_SIZE   8
 #define THREAD_GROUP_SHARE_L1   1
 struct thread_groups {
@@ -851,13 +859,7 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   if (shared_caches)
-   return cpu_l2_cache_mask(cpu);
-
-   if (has_big_cores)
-   return cpu_smallcore_mask(cpu);
-
-   return per_cpu(cpu_sibling_map, cpu);
+   return per_cpu(cpu_l2_cache_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -867,11 +869,16 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
+static const struct cpumask *cpu_bigcore_mask(int cpu)
+{
+   return per_cpu(cpu_sibling_map, cpu);
+}
+
 static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
-   { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+   { cpu_bigcore_mask, SD_INIT_NAME(BIGCORE) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
 };
@@ -1313,7 +1320,6 @@ static void add_cpu_to_masks(int cpu)
 void start_secondary(void *unused)
 {
unsigned int cpu = smp_processor_id();
-   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
 
mmgrab(_mm);
current->active_mm = _mm;
@@ -1339,14 +1345,20 @@ void start_secondary(void *unused)
/* Update topology CPU masks */
add_cpu_to_masks(cpu);
 
-   if (has_big_cores)
-   sibling_mask = cpu_smallcore_mask;
/*
 * Check for any shared caches. Note that this must be done on a
 * per-core basis because one core in the pair might be disabled.
 */
-   if (!cpumask_equal(cpu_l2_cache_mask(cpu), sibling_mask(cpu)))
-   shared_caches = true;
+   if (!shared_caches) {
+   struct cpumask *(*sibling_mask)(int) = cpu_sibling_mask;
+   struct cpumask *mask = cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   sibling_mask = cpu_smallcore_mask;
+
+   if (cpumask_weight(mask) > cpumask_weight(sibling_mask(cpu)))
+   shared_caches = true;
+   }
 
set_numa_node(numa_cpu_lookup_table[cpu]);
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
@@ -1374,10 +1386,19 @@ int setup_profiling_timer(unsigned int multiplier)
 
 static void fixup_topology(void)
 {
+   if (shared_caches) {
+   pr_info("Using shared cache scheduler topology\n");
+   powerpc_topology[bigcore_idx].mask = shared_cache_mask;
+#ifdef CONFIG_SCHED_DEBUG
+   powerpc_topology[bigcore_idx].name = "CACHE";
+#endif
+   powerpc_topology[bigcore_idx].sd_flags = 
powerpc_shared_cache_flags;
+   }
+
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
+   powerpc_topology[smt_idx].mask = smallcore_smt_mask;
}
 #endif
 }
-- 
2.17.1



[PATCH v2 05/10] powerpc/smp: Dont assume l2-cache to be superset of sibling

2020-07-21 Thread Srikar Dronamraju
Current code assumes that the cpumask of cpus sharing an l2-cache will
always be a superset of cpu_sibling_mask.

Let's stop making that assumption. cpu_l2_cache_mask is a superset of
cpu_sibling_mask if and only if shared_caches is set.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
powerpc/smp: Dont assume l2-cache to be superset of sibling
Set cpumask after verifying l2-cache. (Gautham)

 arch/powerpc/kernel/smp.c | 28 +++-
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 72f16dc0cb26..57468877499a 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1196,6 +1196,7 @@ static bool update_mask_by_l2(int cpu, struct cpumask 
*(*mask_fn)(int))
if (!l2_cache)
return false;
 
+   cpumask_set_cpu(cpu, mask_fn(cpu));
for_each_cpu(i, cpu_online_mask) {
/*
 * when updating the marks the current CPU has not been marked
@@ -1278,29 +1279,30 @@ static void add_cpu_to_masks(int cpu)
 * add it to it's own thread sibling mask.
 */
cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+   cpumask_set_cpu(cpu, cpu_core_mask(cpu));
 
for (i = first_thread; i < first_thread + threads_per_core; i++)
if (cpu_online(i))
set_cpus_related(i, cpu, cpu_sibling_mask);
 
add_cpu_to_smallcore_masks(cpu);
-   /*
-* Copy the thread sibling mask into the cache sibling mask
-* and mark any CPUs that share an L2 with this CPU.
-*/
-   for_each_cpu(i, cpu_sibling_mask(cpu))
-   set_cpus_related(cpu, i, cpu_l2_cache_mask);
update_mask_by_l2(cpu, cpu_l2_cache_mask);
 
-   /*
-* Copy the cache sibling mask into core sibling mask and mark
-* any CPUs on the same chip as this CPU.
-*/
-   for_each_cpu(i, cpu_l2_cache_mask(cpu))
-   set_cpus_related(cpu, i, cpu_core_mask);
+   if (pkg_id == -1) {
+   struct cpumask *(*mask)(int) = cpu_sibling_mask;
+
+   /*
+* Copy the sibling mask into core sibling mask and
+* mark any CPUs on the same chip as this CPU.
+*/
+   if (shared_caches)
+   mask = cpu_l2_cache_mask;
+
+   for_each_cpu(i, mask(cpu))
+   set_cpus_related(cpu, i, cpu_core_mask);
 
-   if (pkg_id == -1)
return;
+   }
 
for_each_cpu(i, cpu_online_mask)
if (get_physical_package_id(i) == pkg_id)
-- 
2.17.1



[PATCH v2 04/10] powerpc/smp: Enable small core scheduling sooner

2020-07-21 Thread Srikar Dronamraju
Enable small core scheduling as soon as we detect that we are in a
system that supports thread group. Doing so would avoid a redundant
check.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
powerpc/smp: Enable small core scheduling sooner
Restored the previous info msg (Jordan)
Moved big core topology fixup to fixup_topology (Gautham)

 arch/powerpc/kernel/smp.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 1ce95da00cb6..72f16dc0cb26 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1370,6 +1370,16 @@ int setup_profiling_timer(unsigned int multiplier)
return 0;
 }
 
+static void fixup_topology(void)
+{
+#ifdef CONFIG_SCHED_SMT
+   if (has_big_cores) {
+   pr_info("Big cores detected but using small core scheduling\n");
+   powerpc_topology[0].mask = smallcore_smt_mask;
+   }
+#endif
+}
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
@@ -1383,12 +1393,7 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
dump_numa_cpu_topology();
 
-#ifdef CONFIG_SCHED_SMT
-   if (has_big_cores) {
-   pr_info("Big cores detected but using small core scheduling\n");
-   powerpc_topology[0].mask = smallcore_smt_mask;
-   }
-#endif
+   fixup_topology();
set_sched_topology(powerpc_topology);
 }
 
-- 
2.17.1



[PATCH v2 02/10] powerpc/smp: Merge Power9 topology with Power topology

2020-07-21 Thread Srikar Dronamraju
A new sched_domain_topology_level was added just for Power9. However the
same can be achieved by merging powerpc_topology with power9_topology,
which makes the code simpler, especially when adding a new sched
domain.

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Signed-off-by: Srikar Dronamraju 
---
Changelog v1 -> v2:
powerpc/smp: Merge Power9 topology with Power topology
Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
since cpu_smt_mask is only defined under CONFIG_SCHED_SMT

 arch/powerpc/kernel/smp.c | 33 ++---
 1 file changed, 10 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 680c0edcc59d..0e0b118d9b6e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1315,7 +1315,7 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 #ifdef CONFIG_SCHED_SMT
-/* cpumask of CPUs with asymetric SMT dependancy */
+/* cpumask of CPUs with asymmetric SMT dependency */
 static int powerpc_smt_flags(void)
 {
int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
@@ -1328,14 +1328,6 @@ static int powerpc_smt_flags(void)
 }
 #endif
 
-static struct sched_domain_topology_level powerpc_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-   { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
-#endif
-   { cpu_cpu_mask, SD_INIT_NAME(DIE) },
-   { NULL, },
-};
-
 /*
  * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
  * This topology makes it *much* cheaper to migrate tasks between adjacent 
cores
@@ -1353,7 +1345,13 @@ static int powerpc_shared_cache_flags(void)
  */
 static const struct cpumask *shared_cache_mask(int cpu)
 {
-   return cpu_l2_cache_mask(cpu);
+   if (shared_caches)
+   return cpu_l2_cache_mask(cpu);
+
+   if (has_big_cores)
+   return cpu_smallcore_mask(cpu);
+
+   return per_cpu(cpu_sibling_map, cpu);
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -1363,7 +1361,7 @@ static const struct cpumask *smallcore_smt_mask(int cpu)
 }
 #endif
 
-static struct sched_domain_topology_level power9_topology[] = {
+static struct sched_domain_topology_level powerpc_topology[] = {
 #ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
 #endif
@@ -1388,21 +1386,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 #ifdef CONFIG_SCHED_SMT
if (has_big_cores) {
pr_info("Big cores detected but using small core scheduling\n");
-   power9_topology[0].mask = smallcore_smt_mask;
powerpc_topology[0].mask = smallcore_smt_mask;
}
 #endif
-   /*
-* If any CPU detects that it's sharing a cache with another CPU then
-* use the deeper topology that is aware of this sharing.
-*/
-   if (shared_caches) {
-   pr_info("Using shared cache scheduler topology\n");
-   set_sched_topology(power9_topology);
-   } else {
-   pr_info("Using standard scheduler topology\n");
-   set_sched_topology(powerpc_topology);
-   }
+   set_sched_topology(powerpc_topology);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
2.17.1



[PATCH v2 01/10] powerpc/smp: Cache node for reuse

2020-07-21 Thread Srikar Dronamraju
cpu_to_node is an inline function with access to a per_cpu variable.
However, when it is used repeatedly, it may be cleaner to cache its
result in a local variable.

Also fix a build error in some weird configs.
"error: _numa_cpu_lookup_table_ undeclared"

No functional change

Cc: linuxppc-dev 
Cc: LKML 
Cc: Michael Ellerman 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Valentin Schneider 
Cc: Nick Piggin 
Cc: Oliver OHalloran 
Cc: Nathan Lynch 
Cc: Michael Neuling 
Cc: Anton Blanchard 
Cc: Gautham R Shenoy 
Cc: Vaidyanathan Srinivasan 
Cc: Jordan Niethe 
Reviewed-by: Gautham R. Shenoy 
Signed-off-by: Srikar Dronamraju 
---
 arch/powerpc/kernel/smp.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 73199470c265..680c0edcc59d 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -843,7 +843,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 
DBG("smp_prepare_cpus\n");
 
-   /* 
+   /*
 * setup_cpu may need to be called on the boot cpu. We havent
 * spun any cpus up but lets be paranoid.
 */
@@ -854,20 +854,24 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
cpu_callin_map[boot_cpuid] = 1;
 
for_each_possible_cpu(cpu) {
+   int node = cpu_to_node(cpu);
+
zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu),
-   GFP_KERNEL, cpu_to_node(cpu));
+   GFP_KERNEL, node);
zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu),
-   GFP_KERNEL, cpu_to_node(cpu));
+   GFP_KERNEL, node);
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
-   GFP_KERNEL, cpu_to_node(cpu));
+   GFP_KERNEL, node);
+#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * numa_node_id() works after this.
 */
if (cpu_present(cpu)) {
-   set_cpu_numa_node(cpu, numa_cpu_lookup_table[cpu]);
-   set_cpu_numa_mem(cpu,
-   local_memory_node(numa_cpu_lookup_table[cpu]));
+   node = numa_cpu_lookup_table[cpu];
+   set_cpu_numa_node(cpu, node);
+   set_cpu_numa_mem(cpu, local_memory_node(node));
}
+#endif
}
 
/* Init the cpumasks so the boot CPU is related to itself */
-- 
2.17.1



[PATCH v2 00/10] Coregroup support on Powerpc

2020-07-21 Thread Srikar Dronamraju
Changelog v1 -> v2:
v1: 
https://lore.kernel.org/linuxppc-dev/20200714043624.5648-1-sri...@linux.vnet.ibm.com/t/#u

powerpc/smp: Merge Power9 topology with Power topology
Replaced a reference to cpu_smt_mask with per_cpu(cpu_sibling_map, cpu)
since cpu_smt_mask is only defined under CONFIG_SCHED_SMT

powerpc/smp: Enable small core scheduling sooner
Restored the previous info msg (Jordan)
Moved big core topology fixup to fixup_topology (Gautham)

powerpc/smp: Dont assume l2-cache to be superset of sibling
Set cpumask after verifying l2-cache. (Gautham)

powerpc/smp: Generalize 2nd sched domain
Moved shared_cache topology fixup to fixup_topology (Gautham)

Powerpc/numa: Detect support for coregroup
Explained Coregroup in commit msg (Michael Ellerman)

Powerpc/smp: Create coregroup domain
Moved coregroup topology fixup to fixup_topology (Gautham)

powerpc/smp: Implement cpu_to_coregroup_id
Move coregroup_enabled before getting associativity (Gautham)

powerpc/smp: Provide an ability to disable coregroup
Patch dropped (Michael Ellerman)

Cleanup of existing powerpc topologies and add coregroup support on
Powerpc. Coregroup is a group of (subset of) cores of a DIE that share
a resource.

Patch 7 of this patch series: "Powerpc/numa: Detect support for coregroup"
depends on
https://lore.kernel.org/linuxppc-dev/20200707140644.7241-1-sri...@linux.vnet.ibm.com/t/#u
However it should be easy to rebase the patch without the above patch.

This patch series is based on top of current powerpc/next tree + the
above patch.

On Power 8 Systems
--
$ tail /proc/cpuinfo
processor   : 255
cpu : POWER8 (architected), altivec supported
clock   : 3724.00MHz
revision: 2.1 (pvr 004b 0201)

timebase: 51200
platform: pSeries
model   : IBM,8408-E8E
machine : CHRP IBM,8408-E8E
MMU : Hash

Before the patchset
---
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

After the patchset
--
$ cat /proc/sys/kernel/sched_domain/cpu0/domain*/name
SMT
DIE
NUMA
NUMA
$ head /proc/schedstat
version 15
timestamp 4295534931
cpu0 0 0 0 0 0 0 41389823338 17682779896 14117
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain2 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain3 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
cpu1 0 0 0 0 0 0 27087859050 152273672 10396
domain0 ,,,,,,,00ff 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
domain1 ,,,,,,, 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

On Power 9 (with device-tree enablement to show coregroups).
(hunks for mimicing a coregroup was posted at
https://lore.kernel.org/linuxppc-dev/20200714043624.5648-1-sri...@linux.vnet.ibm.com/t/#m2cb09bb11c7a93257d6123d1d27edb8212f8af21)
---
$ tail /proc/cpuinfo
processor   : 127
cpu : POWER9 (architected), altivec supported
clock   : 3000.00MHz
revision: 2.2 (pvr 004e 0202)

timebase: 51200
platform: pSeries
model   : IBM,9008-22L
machine : CHRP IBM,9008-22L
MMU : Hash

Before patchset
--
$ cat 

Re: [PATCH v4 09/10] powerpc/watchpoint: Return available watchpoints dynamically

2020-07-21 Thread Michael Ellerman
Ravi Bangoria  writes:
> On 7/20/20 9:12 AM, Jordan Niethe wrote:
>> On Fri, Jul 17, 2020 at 2:11 PM Ravi Bangoria
>>  wrote:
>>>
>>> So far Book3S Powerpc supported only one watchpoint. Power10 is
>>> introducing 2nd DAWR. Enable 2nd DAWR support for Power10.
>>> Availability of 2nd DAWR will depend on CPU_FTR_DAWR1.
>>>
>>> Signed-off-by: Ravi Bangoria 
>>> ---
>>>   arch/powerpc/include/asm/cputable.h  | 4 +++-
>>>   arch/powerpc/include/asm/hw_breakpoint.h | 5 +++--
>>>   2 files changed, 6 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/cputable.h 
>>> b/arch/powerpc/include/asm/cputable.h
>>> index 3445c86e1f6f..36a0851a7a9b 100644
>>> --- a/arch/powerpc/include/asm/cputable.h
>>> +++ b/arch/powerpc/include/asm/cputable.h
>>> @@ -633,7 +633,9 @@ enum {
>>>* Maximum number of hw breakpoint supported on powerpc. Number of
>>>* breakpoints supported by actual hw might be less than this.
>>>*/
>>> -#define HBP_NUM_MAX    1
>>> +#define HBP_NUM_MAX    2
>>> +#define HBP_NUM_ONE    1
>>> +#define HBP_NUM_TWO    2

>> I wonder if these defines are necessary - has it any advantage over
>> just using the literal?
>
> No, not really. Initially I had something like:
>
> #define HBP_NUM_MAX    2
> #define HBP_NUM_P8_P9  1
> #define HBP_NUM_P10    2
>
> But then I thought it's also not right. So I made it _ONE and _TWO.
> Now the function that decides nr watchpoints dynamically (nr_wp_slots)
> is in a different file, I thought to keep it like this so it would be
> easier to figure out why _MAX is 2.

I don't think it makes anything clearer.

I had to stare at it thinking there was some sort of mapping or
indirection going on, before I realised it's just literally the number
of breakpoints.

So please just do:

static inline int nr_wp_slots(void)
{
   return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1;
}

If you think HBP_NUM_MAX needs explanation then do that with a comment,
it can refer to nr_wp_slots() if that's helpful.
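
e.g. something like (sketch only, wording is just a suggestion):

	/*
	 * HBP_NUM_MAX is the maximum number of HW breakpoint slots any
	 * supported CPU provides (2 with the second DAWR on Power10).
	 * The number actually available at runtime comes from
	 * nr_wp_slots().
	 */
	#define HBP_NUM_MAX	2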

cheers


Re: [PATCH v4 05/10] powerpc/dt_cpu_ftrs: Add feature for 2nd DAWR

2020-07-21 Thread Michael Ellerman
Ravi Bangoria  writes:
> On 7/17/20 11:14 AM, Jordan Niethe wrote:
>> On Fri, Jul 17, 2020 at 2:10 PM Ravi Bangoria
>>  wrote:
>>>
>>> Add new device-tree feature for 2nd DAWR. If this feature is present,
>>> 2nd DAWR is supported, otherwise not.
>>>
>>> Signed-off-by: Ravi Bangoria 
>>> ---
>>>   arch/powerpc/include/asm/cputable.h | 7 +--
>>>   arch/powerpc/kernel/dt_cpu_ftrs.c   | 7 +++
>>>   2 files changed, 12 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/arch/powerpc/include/asm/cputable.h 
>>> b/arch/powerpc/include/asm/cputable.h
>>> index e506d429b1af..3445c86e1f6f 100644
>>> --- a/arch/powerpc/include/asm/cputable.h
>>> +++ b/arch/powerpc/include/asm/cputable.h
>>> @@ -214,6 +214,7 @@ static inline void cpu_feature_keys_init(void) { }
>>>   #define CPU_FTR_P9_TLBIE_ERAT_BUG  LONG_ASM_CONST(0x0001)
>>>   #define CPU_FTR_P9_RADIX_PREFETCH_BUG  LONG_ASM_CONST(0x0002)
>>>   #define CPU_FTR_ARCH_31
>>> LONG_ASM_CONST(0x0004)
>>> +#define CPU_FTR_DAWR1  LONG_ASM_CONST(0x0008)
>>>
>>>   #ifndef __ASSEMBLY__
>>>
>>> @@ -497,14 +498,16 @@ static inline void cpu_feature_keys_init(void) { }
>>>   #define CPU_FTRS_POSSIBLE  \
>>>  (CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | CPU_FTRS_POWER8 | \
>>>   CPU_FTR_ALTIVEC_COMP | CPU_FTR_VSX_COMP | CPU_FTRS_POWER9 | \
>>> -CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
>>> CPU_FTRS_POWER10)
>>> +CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
>>> CPU_FTRS_POWER10 | \
>>> +CPU_FTR_DAWR1)
>>>   #else
>>>   #define CPU_FTRS_POSSIBLE  \
>>>  (CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
>>>   CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
>>>   CPU_FTRS_POWER8 | CPU_FTRS_CELL | CPU_FTRS_PA6T | \
>>>   CPU_FTR_VSX_COMP | CPU_FTR_ALTIVEC_COMP | CPU_FTRS_POWER9 | \
>>> -CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
>>> CPU_FTRS_POWER10)
>>> +CPU_FTRS_POWER9_DD2_1 | CPU_FTRS_POWER9_DD2_2 | 
>>> CPU_FTRS_POWER10 | \
>>> +CPU_FTR_DAWR1)

>> Instead of putting CPU_FTR_DAWR1 into CPU_FTRS_POSSIBLE should it go
>> into CPU_FTRS_POWER10?
>> Then it will be picked up by CPU_FTRS_POSSIBLE.
>
> I remember a discussion about this with Mikey and we decided to do it
> this way. Obviously, the purpose is to make CPU_FTR_DAWR1 independent of
> CPU_FTRS_POWER10 because DAWR1 is an optional feature in p10. I fear
> including CPU_FTR_DAWR1 in CPU_FTRS_POWER10 can make it forcefully enabled
> even when the device-tree property is not present or the pa-feature bit is not set,
> because we do:
>
>{   /* 3.1-compliant processor, i.e. Power10 "architected" mode */
>.pvr_mask   = 0x,
>.pvr_value  = 0x0f06,
>.cpu_name   = "POWER10 (architected)",
>.cpu_features   = CPU_FTRS_POWER10,

The pa-features logic will turn it off if the feature bit is not set.

So you should be able to put it in CPU_FTRS_POWER10.

See for example CPU_FTR_NOEXECUTE.

cheers


Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

2020-07-21 Thread Nicholas Piggin
Excerpts from Peter Zijlstra's message of July 9, 2020 6:31 pm:
> On Wed, Jul 08, 2020 at 07:54:34PM -0400, Waiman Long wrote:
>> On 7/8/20 4:41 AM, Peter Zijlstra wrote:
>> > On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
>> > > Yes, powerpc could certainly get more performance out of the slow
>> > > paths, and then there are a few parameters to tune.
>> > Can you clarify? The slow path is already in use on ARM64 which is weak,
>> > so I doubt there's superfluous serialization present. And Will spent a
>> > fair amount of time on making that thing guarantee forward progress, so
>> > there just isn't too much room to play.
>> > 
>> > > We don't have a good alternate patching for function calls yet, but
>> > > that would be something to do for native vs pv.
>> > Going by your jump_label implementation, support for static_call should
>> > be fairly straight forward too, no?
>> > 
>> >https://lkml.kernel.org/r/20200624153024.794671...@infradead.org
>> > 
>> Speaking of static_call, I am also looking forward to it. Do you have an
>> idea when that will be merged?
> 
> 0day had one crash on the last round, I think Steve send a fix for that
> last night and I'll go look at it.
> 
> That said, the last posting got 0 feedback, so either everybody is
> really happy with it, or not interested. So let us know in the thread,
> with some review feedback.
> 
> Once I get through enough of the inbox to actually find the fix and test
> it, I'll also update the thread, and maybe threaten to merge it if
> everybody stays silent :-)

I'd like to use it in powerpc. We have code now for example that patches 
a branch immediately at the top of memcpy which branches to a different 
version of the function. pv queued spinlock selection obviously, and
there's a bunch of platform ops struct things that get filled in at boot 
time, etc.

So +1 here if you can get them through. I'm not 100% sure we can do
it with existing toolchain and no ugly hacks, but there's no way to
structure things that can get around that AFAIKS. We'd eventually
use it though, I'd say.

Thanks,
Nick


Re: [PATCH v3 0/6] powerpc: queued spinlocks and rwlocks

2020-07-21 Thread Nicholas Piggin
Excerpts from Peter Zijlstra's message of July 8, 2020 6:41 pm:
> On Tue, Jul 07, 2020 at 03:57:06PM +1000, Nicholas Piggin wrote:
>> Yes, powerpc could certainly get more performance out of the slow
>> paths, and then there are a few parameters to tune.
> 

Sorry for the delay, got bogged down and distracted by other things :(

> Can you clarify? The slow path is already in use on ARM64 which is weak,
> so I doubt there's superfluous serialization present. And Will spent a
> fair amount of time on making that thing guarantee forward progress, so
> there just isn't too much room to play.

Sure, the way the pending not-queued slowpath (which I guess is the
medium-path) is implemented is just poorly structured for LL/SC. It
has one more atomic than necessary (queued_fetch_set_pending_acquire),
and a lot of branches in suboptimal order.

Attached patch (completely untested just compiled and looked at asm
so far) is a way we can fix this on powerpc I think. It's actually
very little generic code change which is good, duplicated medium-path
logic unfortunately but that's no worse than something like x86
really.

>> We don't have a good alternate patching for function calls yet, but
>> that would be something to do for native vs pv.
> 
> Going by your jump_label implementation, support for static_call should
> be fairly straight forward too, no?
> 
>   https://lkml.kernel.org/r/20200624153024.794671...@infradead.org

Nice, yeah it should be. I've wanted this for ages!

powerpc is kind of annoying to implement that with limited call range.
Hmm, not sure if we'd need a new linker feature to support it. We'd
provide call site patch space for indirect branches for those out of
range of direct call, so that should work fine. The trick would be 
patching in the TOC lookup for the function... should be doable somehow.

Thanks,
Nick

---

diff --git a/arch/powerpc/include/asm/qspinlock.h 
b/arch/powerpc/include/asm/qspinlock.h
index b752d34517b3..26d8766a1106 100644
--- a/arch/powerpc/include/asm/qspinlock.h
+++ b/arch/powerpc/include/asm/qspinlock.h
@@ -31,16 +31,57 @@ static inline void queued_spin_unlock(struct qspinlock 
*lock)
 
 #else
 extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
 #endif
 
 static __always_inline void queued_spin_lock(struct qspinlock *lock)
 {
-   u32 val = 0;
-
-   if (likely(atomic_try_cmpxchg_lock(&lock->val, &val, _Q_LOCKED_VAL)))
+   atomic_t *a = &lock->val;
+   u32 val;
+
+again:
+   asm volatile(
+"1:\t" PPC_LWARX(%0,0,%1,1) "  # queued_spin_lock  \n"
+   : "=" (val)
+   : "r" (>counter)
+   : "memory");
+
+   if (likely(val == 0)) {
+   asm_volatile_goto(
+   "   stwcx.  %0,0,%1 
\n"
+   "   bne-%l[again]   
\n"
+   "\t"PPC_ACQUIRE_BARRIER "   
\n"
+   :
+   : "r"(_Q_LOCKED_VAL), "r" (>counter)
+   : "cr0", "memory"
+   : again );
return;
-
-   queued_spin_lock_slowpath(lock, val);
+   }
+
+   if (likely(val == _Q_LOCKED_VAL)) {
+   asm_volatile_goto(
+   "   stwcx.  %0,0,%1 
\n"
+   "   bne-%l[again]   
\n"
+   :
+   : "r"(_Q_LOCKED_VAL | _Q_PENDING_VAL), "r" (>counter)
+   : "cr0", "memory"
+   : again );
+
+   atomic_cond_read_acquire(a, !(VAL & _Q_LOCKED_MASK));
+// clear_pending_set_locked(lock);
+   WRITE_ONCE(lock->locked_pending, _Q_LOCKED_VAL);
+// lockevent_inc(lock_pending);
+   return;
+   }
+
+   if (val == _Q_PENDING_VAL) {
+   int cnt = _Q_PENDING_LOOPS;
+   val = atomic_cond_read_relaxed(a,
+  (VAL != _Q_PENDING_VAL) || 
!cnt--);
+   if (!(val & ~_Q_LOCKED_MASK))
+   goto again;
+}
+   queued_spin_lock_slowpath_queue(lock);
 }
 #define queued_spin_lock queued_spin_lock
 
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b9515fcc9b29..ebcc6f5d99d5 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -287,10 +287,14 @@ static __always_inline u32  __pv_wait_head_or_lock(struct 
qspinlock *lock,
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 #define queued_spin_lock_slowpath  native_queued_spin_lock_slowpath
+#define queued_spin_lock_slowpath_queue
native_queued_spin_lock_slowpath_queue
 #endif
 
 #endif /* _GEN_PV_LOCK_SLOWPATH */
 
+void queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+static void __queued_spin_lock_slowpath_queue(struct qspinlock *lock);
+
 /**
  * queued_spin_lock_slowpath 

  1   2   >