Re: [PATCH v5 16/23] PCI: hotplug: movable BARs: Don't reserve IO/mem bus space

2019-09-03 Thread Oliver O'Halloran
On Fri, 2019-08-16 at 19:50 +0300, Sergey Miroshnichenko wrote:
> A hotplugged bridge with many hotplug-capable ports may request
> reserving more IO space than the machine has. This could be overridden
> with the "hpiosize=" kernel argument though.
> 
> But when BARs are movable, there is no need to reserve space anymore:
> new BARs are allocated not from reserved gaps, but via rearranging the
> existing BARs. Requesting a precise amount of space for bridge windows
> increases the chances of adding the new bridge successfully.

It wouldn't hurt to reserve some memory space to prevent unnecessary
BAR shuffling at runtime. If it turns out that we need more space then
we can always fall back to re-assigning the whole tree.

> Signed-off-by: Sergey Miroshnichenko 
> ---
>  drivers/pci/setup-bus.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> index c7b7e30c6284..7d64ec8e7088 100644
> --- a/drivers/pci/setup-bus.c
> +++ b/drivers/pci/setup-bus.c
> @@ -1287,7 +1287,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct 
> list_head *realloc_head)
>  
>   case PCI_HEADER_TYPE_BRIDGE:
>   pci_bridge_check_ranges(bus);
> - if (bus->self->is_hotplug_bridge) {
> + if (bus->self->is_hotplug_bridge && 
> !pci_movable_bars_enabled()) {
>   additional_io_size  = pci_hotplug_io_size;
>   additional_mem_size = pci_hotplug_mem_size;
>   }



Re: [PATCH v2 1/2] powerpc/nvdimm: Use HCALL error as the return value

2019-09-03 Thread Vaibhav Jain
Hi Aneesh,

Thanks for the patch. Minor review comments below:

"Aneesh Kumar K.V"  writes:

> This simplifies the error handling and also enables us to switch to
> H_SCM_QUERY hcall in a later patch on H_OVERLAP error.
>
> We also do some kernel print formatting fixup in this patch.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 26 ++-
>  1 file changed, 11 insertions(+), 15 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index a5ac371a3f06..3bef4d298ac6 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -66,28 +66,22 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
>   } while (rc == H_BUSY);
>  
>   if (rc) {
> - /* H_OVERLAP needs a separate error path */
> - if (rc == H_OVERLAP)
> - return -EBUSY;
> -
>   dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
> - return -ENXIO;
> + return rc;
>   }
>  
>   p->bound_addr = saved;
> -
> - dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
> -
> - return 0;

> + dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index, 
> &p->res);
s/0x%x/%#x/
> + return rc;
rc == 0 always at this point hence 'return 0' should still work.

>  }
>  
> -static int drc_pmem_unbind(struct papr_scm_priv *p)
> +static void drc_pmem_unbind(struct papr_scm_priv *p)
>  {
>   unsigned long ret[PLPAR_HCALL_BUFSIZE];
>   uint64_t token = 0;
>   int64_t rc;
>  
> - dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
> + dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);
>  
>   /* NB: unbind has the same retry requirements as drc_pmem_bind() */
>   do {
> @@ -110,10 +104,10 @@ static int drc_pmem_unbind(struct papr_scm_priv *p)
>   if (rc)
>   dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);
>   else
> - dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
> + dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n",
>   p->drc_index);
>  
> - return rc == H_SUCCESS ? 0 : -ENXIO;
> + return;
I would prefer drc_pmem_unbind() to still return the error from the
HCALL. The caller can decide if it wants to ignore the error or not.
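
A minimal sketch of that suggestion (illustrative only; the HCALL
arguments are taken from the retry loop in the hunk above, and this is
not what was ultimately merged):

static int64_t drc_pmem_unbind(struct papr_scm_priv *p)
{
	unsigned long ret[PLPAR_HCALL_BUFSIZE];
	uint64_t token = 0;
	int64_t rc;

	dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);

	/* NB: unbind has the same retry requirements as drc_pmem_bind() */
	do {
		rc = plpar_hcall(H_SCM_UNBIND_MEM, ret, p->drc_index,
				 p->bound_addr, p->blocks, token);
		token = ret[0];
		cond_resched();
	} while (rc == H_BUSY);

	if (rc)
		dev_err(&p->pdev->dev, "unbind error: %lld\n", rc);

	return rc;	/* H_SUCCESS (0) or the raw HCALL error; caller decides */
}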

>  }
>  
>  static int papr_scm_meta_get(struct papr_scm_priv *p,
> @@ -436,14 +430,16 @@ static int papr_scm_probe(struct platform_device *pdev)
>   rc = drc_pmem_bind(p);
>  
>   /* If phyp says drc memory still bound then force unbound and retry */
> - if (rc == -EBUSY) {
> + if (rc == H_OVERLAP) {
>   dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
>   drc_pmem_unbind(p);
>   rc = drc_pmem_bind(p);
>   }
>  
> - if (rc)
> + if (rc != H_SUCCESS) {
> + rc = -ENXIO;
>   goto err;
> + }
>  
>   /* setup the resource for the newly bound range */
>   p->res.start = p->bound_addr;
> -- 
> 2.21.0
>

-- 
Vaibhav Jain 
Linux Technology Center, IBM India Pvt. Ltd.



Re: [PATCH v5 18/23] powerpc/pci: Handle BAR movement

2019-09-03 Thread Oliver O'Halloran
On Fri, 2019-08-16 at 19:50 +0300, Sergey Miroshnichenko wrote:
> Add pcibios_rescan_prepare()/_done() hooks for the powerpc platform. Now if
> the device's driver supports movable BARs, pcibios_rescan_prepare() will be
> called after the device is stopped, and pcibios_rescan_done() - before it
> resumes. There are no memory requests to this device between the hooks, so
> it is safe to rebuild the EEH address cache during that.
> 
> CC: Oliver O'Halloran 
> Signed-off-by: Sergey Miroshnichenko 
> ---
>  arch/powerpc/kernel/pci-hotplug.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/pci-hotplug.c 
> b/arch/powerpc/kernel/pci-hotplug.c
> index 0b0cf8168b47..18cf13bba228 100644
> --- a/arch/powerpc/kernel/pci-hotplug.c
> +++ b/arch/powerpc/kernel/pci-hotplug.c
> @@ -144,3 +144,13 @@ void pci_hp_add_devices(struct pci_bus *bus)
>   pcibios_finish_adding_to_bus(bus);
>  }
>  EXPORT_SYMBOL_GPL(pci_hp_add_devices);
> +
> +void pcibios_rescan_prepare(struct pci_dev *pdev)
> +{
> + eeh_addr_cache_rmv_dev(pdev);
> +}
> +
> +void pcibios_rescan_done(struct pci_dev *pdev)
> +{
> + eeh_addr_cache_insert_dev(pdev);
> +}

Is this actually sufficent? The PE number for a device is largely
determined by the location of the MMIO BARs. If you move a BAR far
enough the PE number stored in the eeh_pe would need to be updated as
well.



Re: [PATCH] powerpc: Perform a bounds check in arch_add_memory

2019-09-03 Thread Alastair D'Silva
On Mon, 2019-09-02 at 09:28 +0200, David Hildenbrand wrote:
> On 02.09.19 01:54, Alastair D'Silva wrote:
> > On Tue, 2019-08-27 at 09:13 +0200, David Hildenbrand wrote:
> > > On 27.08.19 08:39, Alastair D'Silva wrote:
> > > > On Tue, 2019-08-27 at 08:28 +0200, Michal Hocko wrote:
> > > > > On Tue 27-08-19 15:20:46, Alastair D'Silva wrote:
> > > > > > From: Alastair D'Silva 
> > > > > > 
> > > > > > It is possible for firmware to allocate memory ranges
> > > > > > outside
> > > > > > the range of physical memory that we support
> > > > > > (MAX_PHYSMEM_BITS).
> > > > > 
> > > > > Doesn't that count as a FW bug? Do you have any evidence of
> > > > > that
> > > > > in
> > > > > the
> > > > > field? Just wondering...
> > > > > 
> > > > 
> > > > Not outside our lab, but OpenCAPI attached LPC memory is
> > > > assigned
> > > > addresses based on the slot/NPU it is connected to. These
> > > > addresses
> > > > prior to:
> > > > 4ffe713b7587 ("powerpc/mm: Increase the max addressable memory
> > > > to
> > > > 2PB")
> > > > were inaccessible and resulted in bogus sections - see our
> > > > discussion
> > > > on 'mm: Trigger bug on if a section is not found in
> > > > __section_nr'.
> > > > Doing this check here was your suggestion :)
> > > > 
> > > > It's entirely possible that a similar problem will occur in the
> > > > future,
> > > > and it's cheap to guard against, which is why I've added this.
> > > > 
> > > 
> > > If you keep it here, I guess this should be wrapped by a
> > > WARN_ON_ONCE().
> > > 
> > > If we move it to common code (e.g., __add_pages() or
> > > add_memory()),
> > > then
> > > probably not. I can see that s390x allows to configure
> > > MAX_PHYSMEM_BITS,
> > > so the check could actually make sense.
> > > 
> > 
> > I couldn't see a nice platform independent way to determine the
> > allowable address range, but if there is, then I'll move this to
> > the
> > generic code instead.
> > 
> 
> At least on the !ZONE_DEVICE path we have
> 
> __add_memory() -> register_memory_resource() ...
> 
> return ERR_PTR(-E2BIG);
> 
> 
> I was thinking about something like
> 
> int add_pages()
> {
>   if ((start + size - 1) >> MAX_PHYSMEM_BITS)
>   return -E2BIG;  
> 
>   return arch_add_memory(...)
> }
> 
> And switching users of arch_add_memory() to add_pages(). However, x86
> already has an add_pages() function, so that would need some more
> thought.
> 
> Maybe simply renaming the existing add_pages() to arch_add_pages().
> 
> add_pages(): Create virtual mapping
> __add_pages(): Don't create virtual mapping
> 
> arch_add_memory(): Arch backend for add_pages()
> arch_add_pages(): Arch backend for __add_pages()
> 
> It would be even more consistent if we would have arch_add_pages()
> vs.
> __arch_add_pages().

Looking a bit further, I think a good course of action would be to add
the check to memory_hotplug.c:check_hotplug_memory_range().

This would be the least invasive, and could check both
MAX_POSSIBLE_PHYSMEM_BITS and MAX_PHYSMEM_BITS.
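
A rough sketch of that check (hypothetical, not a posted patch; the
alignment test mirrors the existing check in memory_hotplug.c):

static int check_hotplug_memory_range(u64 start, u64 size)
{
	unsigned long block_sz = memory_block_size_bytes();

	/* Reject ranges the architecture cannot address at all. */
	if ((start + size - 1) >> MAX_PHYSMEM_BITS)
		return -E2BIG;

	/* Memory range must be block size aligned (existing check). */
	if (!size || !IS_ALIGNED(start, block_sz) ||
	    !IS_ALIGNED(size, block_sz)) {
		pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx\n",
		       block_sz, start, size);
		return -EINVAL;
	}

	return 0;
}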

With that in mind, we can drop this patch.

-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



Re: [PATCH 3/3] powerpc/tm: Add tm-poison test

2019-09-03 Thread Gustavo Romero

Hi Michael,

On 09/03/2019 08:46 AM, Michael Ellerman wrote:

Michael Neuling  writes:

From: Gustavo Romero 

Add TM selftest to check if FP or VEC register values from one process
can leak into another process when both run on the same CPU.

This tests for CVE-2019-15030 and CVE-2019-15031.

Signed-off-by: Gustavo Romero 
Signed-off-by: Michael Neuling 
---
  tools/testing/selftests/powerpc/tm/.gitignore |   1 +
  tools/testing/selftests/powerpc/tm/Makefile   |   2 +-
  .../testing/selftests/powerpc/tm/tm-poison.c  | 180 ++
  3 files changed, 182 insertions(+), 1 deletion(-)
  create mode 100644 tools/testing/selftests/powerpc/tm/tm-poison.c


This doesn't build on 64-bit big endian:

tm-poison.c: In function 'tm_poison_test':
tm-poison.c:118:10: error: format '%lx' expects argument of type 'long unsigned 
int', but argument 2 has type 'uint64_t {aka long long unsigned int}' 
[-Werror=format=]
printf("Unknown value %#lx leaked into f31!\n", unknown);
   ^
tm-poison.c:166:10: error: format '%lx' expects argument of type 'long unsigned 
int', but argument 2 has type 'uint64_t {aka long long unsigned int}' 
[-Werror=format=]
printf("Unknown value %#lx leaked into vr31!\n", unknown);
   ^


Sorry. I sent a v2 addressing it. Now I tested the fix against Travis CI.

Thank you.

Cheers,
Gustavo


[PATCH v2 3/3] powerpc/tm: Add tm-poison test

2019-09-03 Thread gromero
From: Gustavo Romero 

Add TM selftest to check if FP or VEC register values from one process
can leak into another process when both run on the same CPU.

Signed-off-by: Gustavo Romero 
Signed-off-by: Michael Neuling 
---
 tools/testing/selftests/powerpc/tm/.gitignore |   1 +
 tools/testing/selftests/powerpc/tm/Makefile   |   2 +-
 .../testing/selftests/powerpc/tm/tm-poison.c  | 179 ++
 3 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/tm-poison.c

diff --git a/tools/testing/selftests/powerpc/tm/.gitignore 
b/tools/testing/selftests/powerpc/tm/.gitignore
index 951fe855f7cd..98f2708d86cc 100644
--- a/tools/testing/selftests/powerpc/tm/.gitignore
+++ b/tools/testing/selftests/powerpc/tm/.gitignore
@@ -17,3 +17,4 @@ tm-vmx-unavail
 tm-unavailable
 tm-trap
 tm-sigreturn
+tm-poison
diff --git a/tools/testing/selftests/powerpc/tm/Makefile 
b/tools/testing/selftests/powerpc/tm/Makefile
index c0734ed0ef56..b15a1a325bd0 100644
--- a/tools/testing/selftests/powerpc/tm/Makefile
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -5,7 +5,7 @@ SIGNAL_CONTEXT_CHK_TESTS := tm-signal-context-chk-gpr 
tm-signal-context-chk-fpu
 TEST_GEN_PROGS := tm-resched-dscr tm-syscall tm-signal-msr-resv 
tm-signal-stack \
tm-vmxcopy tm-fork tm-tar tm-tmspr tm-vmx-unavail tm-unavailable 
tm-trap \
$(SIGNAL_CONTEXT_CHK_TESTS) tm-sigreturn tm-signal-sigreturn-nt \
-   tm-signal-context-force-tm
+   tm-signal-context-force-tm tm-poison
 
 top_srcdir = ../../../../..
 include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/tm/tm-poison.c 
b/tools/testing/selftests/powerpc/tm/tm-poison.c
new file mode 100644
> index 000000000000..977558497c16
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/tm-poison.c
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2019, Gustavo Romero, Michael Neuling, IBM Corp.
+ *
+ * This test will spawn two processes. Both will be attached to the same
+ * CPU (CPU 0). The child will be in a loop writing to FP register f31 and
+ * VMX/VEC/Altivec register vr31 a known value, called poison, calling the
+ * sched_yield syscall afterwards to allow the parent to switch onto the CPU.
+ * Parent will set f31 and vr31 to 1 and in a loop will check if f31 and
+ * vr31 remain 1 as expected until a given timeout (2m). If the issue is
+ * present child's poison will leak into parent's f31 or vr31 registers,
+ * otherwise, poison will never leak into parent's f31 and vr31 registers.
+ */
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "tm.h"
+
+int tm_poison_test(void)
+{
+   int pid;
+   cpu_set_t cpuset;
+   uint64_t poison = 0xdeadbeefc0dec0fe;
+   uint64_t unknown = 0;
+   bool fail_fp = false;
+   bool fail_vr = false;
+
+   SKIP_IF(!have_htm());
+
+   /* Attach both Child and Parent to CPU 0 */
+   CPU_ZERO(&cpuset);
+   CPU_SET(0, &cpuset);
+   sched_setaffinity(0, sizeof(cpuset), &cpuset);
+
+   pid = fork();
+   if (!pid) {
+   /**
+* child
+*/
+   while (1) {
+   sched_yield();
+   asm (
+   "mtvsrd 31, %[poison];" // f31 = poison
+   "mtvsrd 63, %[poison];" // vr31 = poison
+
+   : : [poison] "r" (poison) : );
+   }
+   }
+
+   /**
+* parent
+*/
+   asm (
+   /*
+* Set r3, r4, and f31 to known value 1 before entering
+* in transaction. They won't be written after that.
+*/
+   "   li  3, 0x1  ;"
+   "   li  4, 0x1  ;"
+   "   mtvsrd  31, 4   ;"
+
+   /*
+* The Time Base (TB) is a 64-bit counter register that is
+* independent of the CPU clock and which is incremented
+   * at a frequency of 512000000 Hz, so every 1.953125ns.
+   * So it's necessary 120s/0.000000001953125s = 61440000000
+* increments to get a 2 minutes timeout. Below we set that
+* value in r5 and then use r6 to track initial TB value,
+* updating TB values in r7 at every iteration and comparing it
+   * to r6. When r7 (current) - r6 (initial) > 61440000000 we bail
+* out since for sure we spent already 2 minutes in the loop.
+* SPR 268 is the TB register.
+*/
+   "   lis 5, 14   ;"
+   "   ori 5, 5, 19996 ;"
+   "   sldi5, 5, 16;" // r5 = 6144000
+
+   "   mfspr   6, 268  ;" // r6 (TB initial)
+   "1: mfspr   7, 268  ;" // r7 (TB current)
+   "   

[PATCH v2 2/3] powerpc/tm: Fix restoring FP/VMX facility incorrectly on interrupts

2019-09-03 Thread gromero
From: Gustavo Romero 

When in userspace and MSR FP=0 the hardware FP state is unrelated to
the current process. This is extended for transactions where if tbegin
is run with FP=0, the hardware checkpoint FP state will also be
unrelated to the current process. Due to this, we need to ensure this
hardware checkpoint is updated with the correct state before we enable
FP for this process.

Unfortunately we get this wrong when returning to a process from a
hardware interrupt. A process that starts a transaction with FP=0 can
take an interrupt. When the kernel returns back to that process, we
change to FP=1 but with hardware checkpoint FP state not updated. If
this transaction is then rolled back, the FP registers now contain the
wrong state.

The process looks like this:
   Userspace:  Kernel

   Start userspace
with MSR FP=0 TM=1
  < -
   ...
   tbegin
   bne
   Hardware interrupt
    >


ret_from_except
  restore_math()
/* sees FP=0 */
restore_fp()
  tm_active_with_fp()
/* sees FP=1 (Incorrect) */
  load_fp_state()
FP = 0 -> 1
  < -
   Return to userspace
 with MSR TM=1 FP=1
 with junk in the FP TM checkpoint
   TM rollback
   reads FP junk

When returning from the hardware exception, tm_active_with_fp() is
incorrectly making restore_fp() call load_fp_state() which is setting
FP=1.

The fix is to remove tm_active_with_fp().

tm_active_with_fp() is attempting to handle the case where FP state
has been changed inside a transaction. In this case the checkpointed
and transactional FP state is different and hence we must restore the
FP state (ie. we can't do lazy FP restore inside a transaction that's
used FP). It's safe to remove tm_active_with_fp() as this case is
handled by restore_tm_state(). restore_tm_state() detects if FP has
been used inside a transaction and will set load_fp and call
restore_math() to ensure the FP state (checkpoint and transaction) is
restored.
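
For readers unfamiliar with that path, the relevant part of
restore_tm_state() looks roughly like this (paraphrased from kernels of
that era; not part of this patch):

void restore_tm_state(struct pt_regs *regs)
{
	unsigned long msr_diff;

	clear_thread_flag(TIF_RESTORE_TM);
	if (!MSR_TM_ACTIVE(regs->msr))
		return;

	/* Facilities that were on at checkpoint time but are off now
	 * must be reloaded before returning to the transaction. */
	msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;

	if (msr_diff & MSR_FP)
		current->thread.load_fp = 1;
	if (msr_diff & MSR_VEC)
		current->thread.load_vec = 1;

	restore_math(regs);

	regs->msr |= msr_diff;
}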

This is a data integrity problem for the current process as the FP
registers are corrupted. It's also a security problem as the FP
registers from one process may be leaked to another.

Similarly for VMX.

A simple testcase to replicate this will be posted to
tools/testing/selftests/powerpc/tm/tm-poison.c

This fixes CVE-2019-15031.

Fixes: a7771176b439 ("powerpc: Don't enable FP/Altivec if not checkpointed")
Cc: sta...@vger.kernel.org # 4.15+
Signed-off-by: Gustavo Romero 
Signed-off-by: Michael Neuling 
---
 arch/powerpc/kernel/process.c | 18 ++
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 437b57068cf8..7a84c9f1778e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -101,21 +101,8 @@ static void check_if_tm_restore_required(struct 
task_struct *tsk)
}
 }
 
-static bool tm_active_with_fp(struct task_struct *tsk)
-{
-   return MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
-   (tsk->thread.ckpt_regs.msr & MSR_FP);
-}
-
-static bool tm_active_with_altivec(struct task_struct *tsk)
-{
-   return MSR_TM_ACTIVE(tsk->thread.regs->msr) &&
-   (tsk->thread.ckpt_regs.msr & MSR_VEC);
-}
 #else
 static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
-static inline bool tm_active_with_fp(struct task_struct *tsk) { return false; }
-static inline bool tm_active_with_altivec(struct task_struct *tsk) { return 
false; }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 bool strict_msr_control;
@@ -252,7 +239,7 @@ EXPORT_SYMBOL(enable_kernel_fp);
 
 static int restore_fp(struct task_struct *tsk)
 {
-   if (tsk->thread.load_fp || tm_active_with_fp(tsk)) {
+   if (tsk->thread.load_fp) {
load_fp_state(&current->thread.fp_state);
current->thread.load_fp++;
return 1;
@@ -334,8 +321,7 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 
 static int restore_altivec(struct task_struct *tsk)
 {
-   if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
-   (tsk->thread.load_vec || tm_active_with_altivec(tsk))) {
+   if (cpu_has_feature(CPU_FTR_ALTIVEC) && (tsk->thread.load_vec)) {
load_vr_state(&tsk->thread.vr_state);
tsk->thread.used_vr = 1;
tsk->thread.load_vec++;
-- 
2.17.2



[PATCH v2 1/3] powerpc/tm: Fix FP/VMX unavailable exceptions inside a transaction

2019-09-03 Thread gromero
From: Gustavo Romero 

When we take an FP unavailable exception in a transaction we have to
account for the hardware FP TM checkpointed registers being
incorrect. In this case for this process we know the current and
checkpointed FP registers must be the same (since FP wasn't used
inside the transaction) hence in the thread_struct we copy the current
FP registers to the checkpointed ones.

This copy is done in tm_reclaim_thread(). We use thread->ckpt_regs.msr
to determine if FP was on when in userspace. thread->ckpt_regs.msr
represents the state of the MSR when exiting userspace. This is setup
by check_if_tm_restore_required().

Unfortunately there is an optimisation in giveup_all() which returns
early if tsk->thread.regs->msr (via local variable `usermsr`) has
FP=VEC=VSX=SPE=0. This optimisation means that
check_if_tm_restore_required() is not called and hence
thread->ckpt_regs.msr is not updated and will contain an old value.

This can happen if due to load_fp=255 we start a userspace process
with MSR FP=1 and then we are context switched out. In this case
thread->ckpt_regs.msr will contain FP=1. If that same process is then
context switched in and load_fp overflows, MSR will have FP=0. If that
process now enters a transaction and does an FP instruction, the FP
unavailable will not update thread->ckpt_regs.msr (the bug) and MSR
FP=1 will be retained in thread->ckpt_regs.msr.  tm_reclaim_thread()
will then not perform the required memcpy and the checkpointed FP regs
in the thread struct will contain the wrong values.

The code path for this happening is:

   Userspace:  Kernel
   Start userspace
with MSR FP/VEC/VSX/SPE=0 TM=1
  < -
   ...
   tbegin
   bne
   fp instruction
   FP unavailable
    >
fp_unavailable_tm()
  tm_reclaim_current()
tm_reclaim_thread()
  giveup_all()
return early since FP/VMX/VSX=0
/* ckpt MSR not updated 
(Incorrect) */
  tm_reclaim()
/* thread_struct ckpt FP regs 
contain junk (OK) */
  /* Sees ckpt MSR FP=1 (Incorrect) 
*/
  no memcpy() performed
/* thread_struct ckpt FP regs 
not fixed (Incorrect) */
  tm_recheckpoint()
 /* Put junk in hardware checkpoint 
FP regs */
 
  < -
   Return to userspace
 with MSR TM=1 FP=1
 with junk in the FP TM checkpoint
   TM rollback
   reads FP junk

This is a data integrity problem for the current process as the FP
registers are corrupted. It's also a security problem as the FP
registers from one process may be leaked to another.

This patch moves up check_if_tm_restore_required() in giveup_all() to
ensure thread->ckpt_regs.msr is updated correctly.

A simple testcase to replicate this will be posted to
tools/testing/selftests/powerpc/tm/tm-poison.c

Similarly for VMX.

This fixes CVE-2019-15030.

Fixes: f48e91e87e67 ("powerpc/tm: Fix FP and VMX register corruption")
Cc: sta...@vger.kernel.org # 4.12+
Signed-off-by: Gustavo Romero 
Signed-off-by: Michael Neuling 
---
 arch/powerpc/kernel/process.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8fc4de0d22b4..437b57068cf8 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -497,13 +497,14 @@ void giveup_all(struct task_struct *tsk)
if (!tsk->thread.regs)
return;
 
+   check_if_tm_restore_required(tsk);
+
usermsr = tsk->thread.regs->msr;
 
if ((usermsr & msr_all_available) == 0)
return;
 
msr_check_and_set(msr_all_available);
-   check_if_tm_restore_required(tsk);
 
WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & 
MSR_VEC)));
 
-- 
2.17.2



RE: [PATCH v2 4/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-09-03 Thread Alastair D'Silva
On Tue, 2019-09-03 at 08:51 +0200, Christophe Leroy wrote:


> > > This piece of code looks pretty similar to the one before. Can we
> > > refactor into a small helper ?
> > > 
> > 
> > Not much point, it's removed in a subsequent patch.
> > 
> 
> But you tell me that you leave people the opportunity to not
> apply 
> that subsequent patch, and that's the reason you didn't put that
> patch 
> before this one. In that case adding a helper is worth it.
> 
> Christophe

I factored it out anyway, since it made the code much nicer to read.

-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



RE: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Alastair D'Silva
On Tue, 2019-09-03 at 22:11 +0200, Gabriel Paubert wrote:
> On Tue, Sep 03, 2019 at 01:31:57PM -0500, Segher Boessenkool wrote:
> > On Tue, Sep 03, 2019 at 07:05:19PM +0200, Christophe Leroy wrote:
> > > > On 03/09/2019 at 18:04, Segher Boessenkool wrote:
> > > > (Why are they separate though?  It could just be one loop var).
> > > 
> > > Yes it could just be a single loop var, but in that case it would
> > > have 
> > > to be reset at the start of the second loop, which means we would
> > > have 
> > > to pass 'addr' for resetting the loop anyway,
> > 
> > Right, I noticed that after hitting send, as usual.
> > 
> > > so I opted to do it 
> > outside the inline asm by using two separate loop vars set to
> > > their 
> > > starting value outside the inline asm.
> > 
> > The thing is, the way it is written now, it will get separate
> > registers
> > for each loop (with proper earlyclobbers added).  Not that that
> > really
> > matters of course, it just feels wrong :-)
> 
> After "mtmsr %3", it is always possible to copy %0 to %3 and use it
> as
> an address register for the second loop. One register less to
> allocate
> for the compiler. Constraints of course have to be adjusted.
> 
> 

Given that we're dealing with registers holding data that has been
named outside the assembler, this feels dirty. We'd be using the
register passed in as 'msr' to hold the address instead.

Since we're not short on registers, I don't see this as a good change.

-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



RE: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Alastair D'Silva
On Tue, 2019-09-03 at 11:04 -0500, Segher Boessenkool wrote:
> On Tue, Sep 03, 2019 at 04:28:09PM +0200, Christophe Leroy wrote:
> > On 03/09/2019 at 15:04, Segher Boessenkool wrote:
> > > On Tue, Sep 03, 2019 at 03:23:57PM +1000, Alastair D'Silva wrote:
> > > > +   asm volatile(
> > > > +   "   mtctr %2;"
> > > > +   "   mtmsr %3;"
> > > > +   "   isync;"
> > > > +   "0: dcbst   0, %0;"
> > > > +   "   addi%0, %0, %4;"
> > > > +   "   bdnz0b;"
> > > > +   "   sync;"
> > > > +   "   mtctr %2;"
> > > > +   "1: icbi0, %1;"
> > > > +   "   addi%1, %1, %4;"
> > > > +   "   bdnz1b;"
> > > > +   "   sync;"
> > > > +   "   mtmsr %5;"
> > > > +   "   isync;"
> > > > +   : "+r" (loop1), "+r" (loop2)
> > > > +   : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
> > > > +   : "ctr", "memory");
> > > 
> > > This outputs as one huge assembler statement, all on one
> > > line.  That's
> > > going to be fun to read or debug.
> > 
> > Do you mean \n has to be added after the ; ?
> 
> Something like that.  There is no really satisfying way for doing
> huge
> inline asm, and maybe that is a good thing ;-)
> 
> Often people write \n\t at the end of each line of inline asm.  This
> works
> pretty well (but then there are labels, oh joy).
> 
> > > loop1 and/or loop2 can be assigned the same register as msr0 or
> > > nb.  They
> > > need to be made earlyclobbers.  (msr is fine, all of its reads
> > > are before
> > > any writes to loop1 or loop2; and bytes is fine, it's not a
> > > register).
> > 
> > Can you be more explicit please? Doesn't '+r' mean that they are input
> > and 
> > output at the same time?
> 
> That is what + means, yes -- that this output is an input as
> well.  It is
> the same to write
> 
>   asm("mov %1,%0 ; mov %0,42" : "+r"(x), "=r"(y));
> or to write
>   asm("mov %1,%0 ; mov %0,42" : "=r"(x), "=r"(y) : "0"(x));
> 
> (So not "at the same time" as in "in the same machine instruction",
> but
> more loosely, as in "in the same inline asm statement").
> 
> > "to be made earlyclobbers", what does this means exactly ? How to
> > do that ?
> 
> You write &, like "+&r" in this case.  It means the machine code
> writes
> to this register before it has consumed all asm inputs (remember, GCC
> does not understand (or even parse!) the assembler string).
> 
> So just
> 
>   : "+" (loop1), "+" (loop2)
> 
> will do.  (Why are they separate though?  It could just be one loop
> var).
> 
> 

Thanks, I've updated these.
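
To make the earlyclobber point concrete, a minimal standalone sketch
(made-up operands, not the flush code from this thread):

/*
 * Without "&", the compiler may assign 'out' the same register as
 * 'in', since it assumes all inputs are consumed before any output
 * is written; the first instruction would then clobber 'in'.
 */
static inline unsigned long demo_earlyclobber(unsigned long in)
{
	unsigned long out;

	asm("li %0,42\n\t"
	    "add %0,%0,%1"
	    : "=&r" (out)	/* "&": written before inputs are dead */
	    : "r" (in));
	return out;
}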

-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



RE: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Alastair D'Silva
On Tue, 2019-09-03 at 08:08 +0200, Christophe Leroy wrote:
> 
> On 03/09/2019 at 07:23, Alastair D'Silva wrote:
> > From: Alastair D'Silva 
> > 
> > Similar to commit 22e9c88d486a
> > ("powerpc/64: reuse PPC32 static inline flush_dcache_range()")
> > this patch converts the following ASM symbols to C:
> >  flush_icache_range()
> >  __flush_dcache_icache()
> >  __flush_dcache_icache_phys()
> > 
> > This was done as we discovered a long-standing bug where the length
> > of the
> > range was truncated due to using a 32 bit shift instead of a 64 bit
> > one.
> > 
> > By converting these functions to C, they become easier to maintain.
> > 
> > flush_dcache_icache_phys() retains a critical assembler section as
> > we must
> > ensure there are no memory accesses while the data MMU is disabled
> > (authored by Christophe Leroy). Since this has no external callers,
> > it has
> > also been made static, allowing the compiler to inline it within
> > flush_dcache_icache_page().
> > 
> > Signed-off-by: Alastair D'Silva 
> > Signed-off-by: Christophe Leroy 
> > ---
> >   arch/powerpc/include/asm/cache.h  |  26 ++---
> >   arch/powerpc/include/asm/cacheflush.h |  24 ++--
> >   arch/powerpc/kernel/misc_32.S | 117 
> >   arch/powerpc/kernel/misc_64.S | 102 -
> >   arch/powerpc/mm/mem.c | 152
> > +-
> >   5 files changed, 173 insertions(+), 248 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/cache.h
> > b/arch/powerpc/include/asm/cache.h
> > index f852d5cd746c..91c808c6738b 100644
> > --- a/arch/powerpc/include/asm/cache.h
> > +++ b/arch/powerpc/include/asm/cache.h
> > @@ -98,20 +98,7 @@ static inline u32 l1_icache_bytes(void)
> >   #endif
> >   #endif /* ! __ASSEMBLY__ */
> >   
> > -#if defined(__ASSEMBLY__)
> > -/*
> > - * For a snooping icache, we still need a dummy icbi to purge all
> > the
> > - * prefetched instructions from the ifetch buffers. We also need a
> > sync
> > - * before the icbi to order the the actual stores to memory that
> > might
> > - * have modified instructions with the icbi.
> > - */
> > -#define PURGE_PREFETCHED_INS   \
> > -   sync;   \
> > -   icbi0,r3;   \
> > -   sync;   \
> > -   isync
> > -
> > -#else
> > +#if !defined(__ASSEMBLY__)
> >   #define __read_mostly
> > __attribute__((__section__(".data..read_mostly")))
> >   
> >   #ifdef CONFIG_PPC_BOOK3S_32
> > @@ -145,6 +132,17 @@ static inline void dcbst(void *addr)
> >   {
> > __asm__ __volatile__ ("dcbst %y0" : : "Z"(*(u8 *)addr) :
> > "memory");
> >   }
> > +
> > +static inline void icbi(void *addr)
> > +{
> > +   __asm__ __volatile__ ("icbi 0, %0" : : "r"(addr) : "memory");
> 
> I think "__asm__ __volatile__" is deprecated. Use "asm volatile"
> instead.
> 

Ok.

> > +}
> > +
> > +static inline void iccci(void *addr)
> > +{
> > +   __asm__ __volatile__ ("iccci 0, %0" : : "r"(addr) : "memory");
> > +}
> > +
> 
> Same
> 
> >   #endif /* !__ASSEMBLY__ */
> >   #endif /* __KERNEL__ */
> >   #endif /* _ASM_POWERPC_CACHE_H */
> > diff --git a/arch/powerpc/include/asm/cacheflush.h
> > b/arch/powerpc/include/asm/cacheflush.h
> > index ed57843ef452..4a1c9f0200e1 100644
> > --- a/arch/powerpc/include/asm/cacheflush.h
> > +++ b/arch/powerpc/include/asm/cacheflush.h
> > @@ -42,24 +42,20 @@ extern void flush_dcache_page(struct page
> > *page);
> >   #define flush_dcache_mmap_lock(mapping)   do { } while
> > (0)
> >   #define flush_dcache_mmap_unlock(mapping) do { } while (0)
> >   
> > -extern void flush_icache_range(unsigned long, unsigned long);
> > +void flush_icache_range(unsigned long start, unsigned long stop);
> >   extern void flush_icache_user_range(struct vm_area_struct *vma,
> > struct page *page, unsigned long
> > addr,
> > int len);
> > -extern void __flush_dcache_icache(void *page_va);
> >   extern void flush_dcache_icache_page(struct page *page);
> > -#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> > -extern void __flush_dcache_icache_phys(unsigned long physaddr);
> > -#else
> > -static inline void __flush_dcache_icache_phys(unsigned long
> > physaddr)
> > -{
> > -   BUG();
> > -}
> > -#endif
> > -
> > -/*
> > - * Write any modified data cache blocks out to memory and
> > invalidate them.
> > - * Does not invalidate the corresponding instruction cache blocks.
> > +void __flush_dcache_icache(void *page);
> > +
> > +/**
> > + * flush_dcache_range(): Write any modified data cache blocks out
> > to memory and
> > + * invalidate them. Does not invalidate the corresponding
> > instruction cache
> > + * blocks.
> > + *
> > + * @start: the start address
> > + * @stop: the stop address (exclusive)
> >*/
> >   static inline void flush_dcache_range(unsigned long start,
> > unsigned long stop)
> >   {
> > diff --git a/arch/powerpc/kernel/misc_32.S
> > b/arch/powerpc/kernel/misc_32.S
> > index 

Re: [RFC PATCH 00/11] Secure Virtual Machine Enablement

2019-09-03 Thread Sukadev Bhattiprolu
Thiago Jung Bauermann [bauer...@linux.ibm.com] wrote:
> [ Some people didn't receive all the patches in this series, even though
>   the linuxppc-dev list did, so trying to send again. This is exactly the
>   same series I posted yesterday. Sorry for the clutter. ]
> 
> This series contains preliminary work to enable Secure Virtual Machines
> (SVM) on powerpc. SVMs request to be migrated to secure memory very early in
> the boot process (in prom_init()), so by default all of their memory is
> inaccessible to the hypervisor. There is an ultravisor call that the VM can
> use to request certain pages to be made accessible (aka shared).
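
For context, that sharing request is a single ultracall; a minimal
sketch of the wrapper this series adds, assuming the UV_SHARE_PAGE ABI
from arch/powerpc/include/asm/ultravisor.h in the series:

/* Ask the ultravisor to make 'npages' pages starting at 'pfn'
 * accessible (shared) to the hypervisor again. */
static inline int uv_share_page(u64 pfn, u64 npages)
{
	return ucall_norets(UV_SHARE_PAGE, pfn, npages);
}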

I would like to piggy-back on this series (since it provides the
context) to add another patch we need for SVMs :-) 

Appreciate any comments. 

---

From ed93a0e36ec886483a72fdb8d99380bbe6607f37 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu 
Date: Thu, 16 May 2019 20:57:12 -0500
Subject: [PATCH 1/1] powerpc/pseries/svm: don't access some SPRs

Ultravisor disables some CPU features like EBB and BHRB in the HFSCR
for secure virtual machines (SVMs). If the SVMs attempt to access
related registers, they will get a Program Interrupt.

Use macros/wrappers to skip accessing EBB and BHRB related registers
for secure VMs.

Signed-off-by: Sukadev Bhattiprolu 
---
 arch/powerpc/include/asm/reg.h  | 35 
 arch/powerpc/kernel/process.c   | 12 -
 arch/powerpc/kvm/book3s_hv.c| 24 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 48 -
 arch/powerpc/kvm/book3s_hv_tm_builtin.c |  6 ++---
 arch/powerpc/perf/core-book3s.c |  4 +--
 arch/powerpc/xmon/xmon.c|  2 +-
 7 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index ec3714c..1397cb3 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1376,6 +1376,41 @@ static inline void msr_check_and_clear(unsigned long 
bits)
__msr_check_and_clear(bits);
 }
 
+#ifdef CONFIG_PPC_SVM
+/*
+ * Move from some "restricted" sprs.
+ * Secure VMs should not access some registers as the related features
+ * are disabled in the CPU. If an SVM is attempting to read from the given
+ * SPR, return 0. Otherwise behave like a normal mfspr.
+ */
+#define mfspr_r(rn)\
+({ \
+   unsigned long rval = 0ULL;  \
+   \
+   if (!(mfmsr() & MSR_S)) \
+   asm volatile("mfspr %0," __stringify(rn)\
+   : "=r" (rval)); \
+   rval;   \
+})
+
+/*
+ * Move to some "restricted" sprs.
+ * Secure VMs should not access some registers as the related features
+ * are disabled in the CPU. If an SVM is attempting to write to the given
+ * SPR, ignore the write. Otherwise behave like a normal mtspr.
+ */
+#define mtspr_r(rn, v) \
+({ \
+   if (!(mfmsr() & MSR_S)) \
+   asm volatile("mtspr " __stringify(rn) ",%0" :   \
+: "r" ((unsigned long)(v)) \
+: "memory");   \
+})
+#else
+#define mfspr_r mfspr
+#define mtspr_r mtspr
+#endif
+
 #ifdef __powerpc64__
 #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
 #define mftb() ({unsigned long rval;   \
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8fc4de0..d5e7386 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1072,9 +1072,9 @@ static inline void save_sprs(struct thread_struct *t)
t->dscr = mfspr(SPRN_DSCR);
 
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
-   t->bescr = mfspr(SPRN_BESCR);
-   t->ebbhr = mfspr(SPRN_EBBHR);
-   t->ebbrr = mfspr(SPRN_EBBRR);
+   t->bescr = mfspr_r(SPRN_BESCR);
+   t->ebbhr = mfspr_r(SPRN_EBBHR);
+   t->ebbrr = mfspr_r(SPRN_EBBRR);
 
t->fscr = mfspr(SPRN_FSCR);
 
@@ -,11 +,11 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
 
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
if (old_thread->bescr != new_thread->bescr)
-   mtspr(SPRN_BESCR, new_thread->bescr);
+   mtspr_r(SPRN_BESCR, new_thread->bescr);
if (old_thread->ebbhr != new_thread->ebbhr)
-   mtspr(SPRN_EBBHR, new_thread->ebbhr);
+   mtspr_r(SPRN_EBBHR, new_thread->ebbhr);
 

Re: linux-next: build warnings after merge of the kbuild tree

2019-09-03 Thread Masahiro Yamada
Hi Stephen,

On Wed, Sep 4, 2019 at 9:13 AM Stephen Rothwell  wrote:
>
> Hi all,
>
> After merging the kbuild tree, today's linux-next build (powerpc
> ppc64_defconfig) produced these warnings:
>
>
> Presumably introduced by commit
>
>   1267f9d3047d ("kbuild: add $(BASH) to run scripts with bash-extension")
>
> and presumably arch/powerpc/tools/unrel_branch_check.sh (which has no
> #! line) is a bash script.  Yeah, it uses '((' and '))'.

Thanks for catching this.


Could you fix it up as follows?
I will squash it for tomorrow's linux-next.


--- a/arch/powerpc/Makefile.postlink
+++ b/arch/powerpc/Makefile.postlink
@@ -18,7 +18,7 @@ quiet_cmd_relocs_check = CHKREL  $@
 ifdef CONFIG_PPC_BOOK3S_64
   cmd_relocs_check =   \
$(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh
"$(OBJDUMP)" "$@" ; \
-   $(CONFIG_SHELL)
$(srctree)/arch/powerpc/tools/unrel_branch_check.sh "$(OBJDUMP)" "$@"
+   $(BASH) $(srctree)/arch/powerpc/tools/unrel_branch_check.sh
"$(OBJDUMP)" "$@"
 else
   cmd_relocs_check =   \
$(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh
"$(OBJDUMP)" "$@"





> --
> Cheers,
> Stephen Rothwell



-- 
Best Regards
Masahiro Yamada


Re: [PATCH] powerpc: Avoid clang warnings around setjmp and longjmp

2019-09-03 Thread Nathan Chancellor
On Tue, Sep 03, 2019 at 02:31:28PM -0500, Segher Boessenkool wrote:
> On Mon, Sep 02, 2019 at 10:55:53PM -0700, Nathan Chancellor wrote:
> > On Thu, Aug 29, 2019 at 09:59:48AM +, David Laight wrote:
> > > From: Nathan Chancellor
> > > > Sent: 28 August 2019 19:45
> > > ...
> > > > However, I think that -fno-builtin-* would be appropriate here because
> > > > we are providing our own setjmp implementation, meaning clang should not
> > > > be trying to do anything with the builtin implementation like building a
> > > > declaration for it.
> > > 
> > > Isn't implementing setjmp impossible unless you tell the compiler that
> > > your function is 'setjmp-like'?
> > 
> > No idea, PowerPC is the only architecture that does such a thing.
> 
> Since setjmp can return more than once, yes, exciting things can happen
> if you do not tell the compiler about this.
> 
> 
> Segher
> 

Fair enough so I guess we are back to just outright disabling the
warning.

Cheers,
Nathan


linux-next: build warnings after merge of the kbuild tree

2019-09-03 Thread Stephen Rothwell
Hi all,

After merging the kbuild tree, today's linux-next build (powerpc
ppc64_defconfig) produced these warnings:


Presumably introduced by commit

  1267f9d3047d ("kbuild: add $(BASH) to run scripts with bash-extension")

and presumably arch/powerpc/tools/unrel_branch_check.sh (which has no
#! line) is a bash script.  Yeah, it uses '((' and '))'.

-- 
Cheers,
Stephen Rothwell




Re: [PATCH v7 5/5] kasan debug: track pages allocated for vmalloc shadow

2019-09-03 Thread Daniel Axtens
Andrey Konovalov  writes:

> On Tue, Sep 3, 2019 at 4:56 PM Daniel Axtens  wrote:
>>
>> Provide the current number of vmalloc shadow pages in
>> /sys/kernel/debug/kasan_vmalloc/shadow_pages.
>
> Maybe it makes sense to put this into /sys/kernel/debug/kasan/
> (without _vmalloc) and name e.g. vmalloc_shadow_pages? In case we want
> to expose more generic KASAN debugging info later.

We certainly could. I just wonder if this patch is useful on an ongoing
basis. I wrote it to validate my work on lazy freeing of shadow pages -
which is why I included it - but I'm not sure it has much ongoing value
beyond demonstrating that the freeing code works.

If we think it's worth holding on to this patch, I can certainly adjust
the paths.

Regards,
Daniel

>
>>
>> Signed-off-by: Daniel Axtens 
>>
>> ---
>>
>> Merging this is probably overkill, but I leave it to the discretion
>> of the broader community.
>>
>> On v4 (no dynamic freeing), I saw the following approximate figures
>> on my test VM:
>>
>>  - fresh boot: 720
>>  - after test_vmalloc: ~14000
>>
>> With v5 (lazy dynamic freeing):
>>
>>  - boot: ~490-500
>>  - running modprobe test_vmalloc pushes the figures up to sometimes
>> as high as ~14000, but they drop down to ~560 after the test ends.
>> I'm not sure where the extra sixty pages are from, but running the
>> test repeately doesn't cause the number to keep growing, so I don't
>> think we're leaking.
>>  - with vmap_stack, spawning tasks pushes the figure up to ~4200, then
>> some clearing kicks in and drops it down to previous levels again.
>> ---
>>  mm/kasan/common.c | 26 ++
>>  1 file changed, 26 insertions(+)
>>
>> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
>> index e33cbab83309..e40854512417 100644
>> --- a/mm/kasan/common.c
>> +++ b/mm/kasan/common.c
>> @@ -35,6 +35,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>
>>  #include 
>>
>> @@ -750,6 +751,8 @@ core_initcall(kasan_memhotplug_init);
>>  #endif
>>
>>  #ifdef CONFIG_KASAN_VMALLOC
>> +static u64 vmalloc_shadow_pages;
>> +
>>  static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>>   void *unused)
>>  {
>> @@ -776,6 +779,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, 
>> unsigned long addr,
>> if (likely(pte_none(*ptep))) {
>> set_pte_at(&init_mm, addr, ptep, pte);
>> page = 0;
>> +   vmalloc_shadow_pages++;
>> }
>> spin_unlock(&init_mm.page_table_lock);
>> if (page)
>> @@ -829,6 +833,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, 
>> unsigned long addr,
>> if (likely(!pte_none(*ptep))) {
>> pte_clear(&init_mm, addr, ptep);
>> free_page(page);
>> +   vmalloc_shadow_pages--;
>> }
>> spin_unlock(&init_mm.page_table_lock);
>>
>> @@ -947,4 +952,25 @@ void kasan_release_vmalloc(unsigned long start, 
>> unsigned long end,
>>(unsigned long)shadow_end);
>> }
>>  }
>> +
>> +static __init int kasan_init_vmalloc_debugfs(void)
>> +{
>> +   struct dentry *root, *count;
>> +
>> +   root = debugfs_create_dir("kasan_vmalloc", NULL);
>> +   if (IS_ERR(root)) {
>> +   if (PTR_ERR(root) == -ENODEV)
>> +   return 0;
>> +   return PTR_ERR(root);
>> +   }
>> +
>> +   count = debugfs_create_u64("shadow_pages", 0444, root,
>> +  &vmalloc_shadow_pages);
>> +
>> +   if (IS_ERR(count))
>> +   return PTR_ERR(root);
>> +
>> +   return 0;
>> +}
>> +late_initcall(kasan_init_vmalloc_debugfs);
>>  #endif
>> --
>> 2.20.1
>>
>> --
>> You received this message because you are subscribed to the Google Groups 
>> "kasan-dev" group.
>> To unsubscribe from this group and stop receiving emails from it, send an 
>> email to kasan-dev+unsubscr...@googlegroups.com.
>> To view this discussion on the web visit 
>> https://groups.google.com/d/msgid/kasan-dev/20190903145536.3390-6-dja%40axtens.net.


Re: [PATCH v3 4/4] powerpc: load firmware trusted keys/hashes into kernel keyring

2019-09-03 Thread Mimi Zohar
On Mon, 2019-08-26 at 09:23 -0400, Nayna Jain wrote:
> The keys used to verify the Host OS kernel are managed by firmware as
> secure variables. This patch loads the verification keys into the .platform
> keyring and revocation hashes into .blacklist keyring. This enables
> verification and loading of the kernels signed by the boot time keys which
> are trusted by firmware.
> 
> Signed-off-by: Nayna Jain 

Feel free to add my tag after addressing the formatting issues.

Reviewed-by: Mimi Zohar 

> diff --git a/security/integrity/platform_certs/load_powerpc.c 
> b/security/integrity/platform_certs/load_powerpc.c
> new file mode 100644
> index 000000000000..359d5063d4da
> --- /dev/null
> +++ b/security/integrity/platform_certs/load_powerpc.c
> @@ -0,0 +1,88 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2019 IBM Corporation
> + * Author: Nayna Jain 
> + *
> + *  - loads keys and hashes stored and controlled by the firmware.
> + */
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "keyring_handler.h"
> +
> +/*
> + * Get a certificate list blob from the named secure variable.
> + */
> +static __init void *get_cert_list(u8 *key, unsigned long keylen, uint64_t 
> *size)
> +{
> + int rc;
> + void *db;
> +
> + rc = secvar_ops->get(key, keylen, NULL, size);
> + if (rc) {
> + pr_err("Couldn't get size: %d\n", rc);
> + return NULL;
> + }
> +
> + db = kmalloc(*size, GFP_KERNEL);
> + if (!db)
> + return NULL;
> +
> + rc = secvar_ops->get(key, keylen, db, size);
> + if (rc) {
> + kfree(db);
> + pr_err("Error reading db var: %d\n", rc);
> + return NULL;
> + }
> +
> + return db;
> +}
> +
> +/*
> + * Load the certs contained in the keys databases into the platform trusted
> + * keyring and the blacklisted X.509 cert SHA256 hashes into the blacklist
> + * keyring.
> + */
> +static int __init load_powerpc_certs(void)
> +{
> + void *db = NULL, *dbx = NULL;
> + uint64_t dbsize = 0, dbxsize = 0;
> + int rc = 0;
> +
> + if (!secvar_ops)
> + return -ENODEV;
> +
> + /* Get db, and dbx.  They might not exist, so it isn't
> +  * an error if we can't get them.
> +  */
> + db = get_cert_list("db", 3, &dbsize);
> + if (!db) {
> + pr_err("Couldn't get db list from firmware\n");
> + } else {
> + rc = parse_efi_signature_list("powerpc:db",
> + db, dbsize, get_handler_for_db);
> + if (rc)
> + pr_err("Couldn't parse db signatures: %d\n",
> + rc);

There's no need to split this line.

> + kfree(db);
> + }
> +
> + dbx = get_cert_list("dbx", 3, &dbxsize);
> + if (!dbx) {
> + pr_info("Couldn't get dbx list from firmware\n");
> + } else {
> + rc = parse_efi_signature_list("powerpc:dbx",
> + dbx, dbxsize,
> + get_handler_for_dbx);

Formatting of this line is off.

> + if (rc)
> + pr_err("Couldn't parse dbx signatures: %d\n", rc);
> + kfree(dbx);
> + }
> +
> + return rc;
> +}
> +late_initcall(load_powerpc_certs);



Re: [PATCH v3 3/4] x86/efi: move common keyring handler functions to new file

2019-09-03 Thread Mimi Zohar
(Cc'ing Josh Boyer, David Howells)

On Mon, 2019-09-02 at 21:55 +1000, Michael Ellerman wrote:
> Nayna Jain  writes:
> 
> > The handlers to add the keys to the .platform keyring and blacklisted
> > hashes to the .blacklist keyring is common for both the uefi and powerpc
> > mechanisms of loading the keys/hashes from the firmware.
> >
> > This patch moves the common code from load_uefi.c to keyring_handler.c
> >
> > Signed-off-by: Nayna Jain 

Acked-by: Mimi Zohar 

> > ---
> >  security/integrity/Makefile   |  3 +-
> >  .../platform_certs/keyring_handler.c  | 80 +++
> >  .../platform_certs/keyring_handler.h  | 32 
> >  security/integrity/platform_certs/load_uefi.c | 67 +---
> >  4 files changed, 115 insertions(+), 67 deletions(-)
> >  create mode 100644 security/integrity/platform_certs/keyring_handler.c
> >  create mode 100644 security/integrity/platform_certs/keyring_handler.h
> 
> This has no acks from security folks, though I'm not really clear on who
> maintains those files.

I upstreamed David's, Josh's, and Nayna's patches, so that's probably
me.

> Do I take it because it's mostly just code movement people are OK with
> it going in via the powerpc tree?

Yes, the only reason for splitting load_uefi.c is for powerpc.  These
patches should be upstreamed together.  

Mimi



[PATCH 1/2] powerpc/memcpy: Fix stack corruption for smaller sizes

2019-09-03 Thread Santosh Sivaraj
For sizes less than 128 bytes, the code branches out early without saving
the stack frame, so when the frame is restored later it drops the frame of
the caller.

Tested-by: Aneesh Kumar K.V 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 949976dc115d..cb882d9a6d8a 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -84,7 +84,6 @@ err1; stw r0,0(r3)
 
 3: sub r5,r5,r6
cmpldi  r5,128
-   blt 5f
 
mflrr0
stdur1,-STACKFRAMESIZE(r1)
@@ -99,6 +98,7 @@ err1; stw r0,0(r3)
std r22,STK_REG(R22)(r1)
std r0,STACKFRAMESIZE+16(r1)
 
+   blt 5f
srdir6,r5,7
mtctr   r6
 
-- 
2.21.0



[PATCH 2/2] seltests/powerpc: Add a selftest for memcpy_mcsafe

2019-09-03 Thread Santosh Sivaraj
Add appropriate selftests for memcpy_mcsafe.

Suggested-by: Michael Ellerman 
Signed-off-by: Santosh Sivaraj 
---
 tools/testing/selftests/powerpc/copyloops/.gitignore   | 1 +
 tools/testing/selftests/powerpc/copyloops/Makefile | 7 ++-
 tools/testing/selftests/powerpc/copyloops/asm/export.h | 1 +
 .../testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S | 1 +
 4 files changed, 9 insertions(+), 1 deletion(-)
 create mode 120000 tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S

diff --git a/tools/testing/selftests/powerpc/copyloops/.gitignore 
b/tools/testing/selftests/powerpc/copyloops/.gitignore
index de158104912a..12ef5b031974 100644
--- a/tools/testing/selftests/powerpc/copyloops/.gitignore
+++ b/tools/testing/selftests/powerpc/copyloops/.gitignore
@@ -11,3 +11,4 @@ memcpy_p7_t1
 copyuser_64_exc_t0
 copyuser_64_exc_t1
 copyuser_64_exc_t2
+memcpy_mcsafe_64
diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile 
b/tools/testing/selftests/powerpc/copyloops/Makefile
index 44574f3818b3..0917983a1c78 100644
--- a/tools/testing/selftests/powerpc/copyloops/Makefile
+++ b/tools/testing/selftests/powerpc/copyloops/Makefile
@@ -12,7 +12,7 @@ ASFLAGS = $(CFLAGS) -Wa,-mpower4
 TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \
copyuser_p7_t0 copyuser_p7_t1 \
memcpy_64_t0 memcpy_64_t1 memcpy_64_t2 \
-   memcpy_p7_t0 memcpy_p7_t1 \
+   memcpy_p7_t0 memcpy_p7_t1 memcpy_mcsafe_64 \
copyuser_64_exc_t0 copyuser_64_exc_t1 copyuser_64_exc_t2
 
 EXTRA_SOURCES := validate.c ../harness.c stubs.S
@@ -45,6 +45,11 @@ $(OUTPUT)/memcpy_p7_t%:  memcpy_power7.S $(EXTRA_SOURCES)
-D SELFTEST_CASE=$(subst memcpy_p7_t,,$(notdir $@)) \
-o $@ $^
 
+$(OUTPUT)/memcpy_mcsafe_64: memcpy_mcsafe_64.S $(EXTRA_SOURCES)
+   $(CC) $(CPPFLAGS) $(CFLAGS) \
+   -D COPY_LOOP=test_memcpy_mcsafe \
+   -o $@ $^
+
 $(OUTPUT)/copyuser_64_exc_t%: copyuser_64.S exc_validate.c ../harness.c \
copy_tofrom_user_reference.S stubs.S
$(CC) $(CPPFLAGS) $(CFLAGS) \
diff --git a/tools/testing/selftests/powerpc/copyloops/asm/export.h 
b/tools/testing/selftests/powerpc/copyloops/asm/export.h
index 05c1663c89b0..e6b80d5fbd14 100644
--- a/tools/testing/selftests/powerpc/copyloops/asm/export.h
+++ b/tools/testing/selftests/powerpc/copyloops/asm/export.h
@@ -1,3 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #define EXPORT_SYMBOL(x)
+#define EXPORT_SYMBOL_GPL(x)
 #define EXPORT_SYMBOL_KASAN(x)
diff --git a/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S 
b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S
new file mode 120000
index 000000000000..f0feef3062f6
--- /dev/null
+++ b/tools/testing/selftests/powerpc/copyloops/memcpy_mcsafe_64.S
@@ -0,0 +1 @@
+../../../../../arch/powerpc/lib/memcpy_mcsafe_64.S
\ No newline at end of file
-- 
2.21.0



Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Gabriel Paubert
On Tue, Sep 03, 2019 at 01:31:57PM -0500, Segher Boessenkool wrote:
> On Tue, Sep 03, 2019 at 07:05:19PM +0200, Christophe Leroy wrote:
> > On 03/09/2019 at 18:04, Segher Boessenkool wrote:
> > >(Why are they separate though?  It could just be one loop var).
> > 
> > Yes it could just be a single loop var, but in that case it would have 
> > to be reset at the start of the second loop, which means we would have 
> > to pass 'addr' for resetting the loop anyway,
> 
> Right, I noticed that after hitting send, as usual.
> 
> > so I opted to do it 
> > outside the inline asm by using two separate loop vars set to their 
> > starting value outside the inline asm.
> 
> The thing is, the way it is written now, it will get separate registers
> for each loop (with proper earlyclobbers added).  Not that that really
> matters of course, it just feels wrong :-)

After "mtmsr %3", it is always possible to copy %0 to %3 and use it as
an address register for the second loop. One register less to allocate
for the compiler. Constraints of course have to be adjusted.
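
A sketch of that suggestion against the asm quoted earlier in the
thread (untested; operand numbering and constraints are illustrative):

	asm volatile(
		"	mtctr	%2;"
		"	mtmsr	%1;"
		"	isync;"
		"	mr	%1, %0;"	/* msr consumed; reuse its reg as the icbi pointer */
		"0:	dcbst	0, %0;"
		"	addi	%0, %0, %3;"
		"	bdnz	0b;"
		"	sync;"
		"	mtctr	%2;"
		"1:	icbi	0, %1;"
		"	addi	%1, %1, %3;"
		"	bdnz	1b;"
		"	sync;"
		"	mtmsr	%4;"
		"	isync;"
		: "+&r" (loop1), "+&r" (msr)	/* msr doubles as the second loop var */
		: "r" (nb), "i" (bytes), "r" (msr0)
		: "ctr", "memory");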

Gabriel
> 
> 
> Segher


Re: [PATCH] powerpc: Avoid clang warnings around setjmp and longjmp

2019-09-03 Thread Segher Boessenkool
On Mon, Sep 02, 2019 at 10:55:53PM -0700, Nathan Chancellor wrote:
> On Thu, Aug 29, 2019 at 09:59:48AM +, David Laight wrote:
> > From: Nathan Chancellor
> > > Sent: 28 August 2019 19:45
> > ...
> > > However, I think that -fno-builtin-* would be appropriate here because
> > > we are providing our own setjmp implementation, meaning clang should not
> > > be trying to do anything with the builtin implementation like building a
> > > declaration for it.
> > 
> > Isn't implementing setjmp impossible unless you tell the compiler that
> > > your function is 'setjmp-like'?
> 
> No idea, PowerPC is the only architecture that does such a thing.

Since setjmp can return more than once, yes, exciting things can happen
if you do not tell the compiler about this.


Segher
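
For reference, the usual way to tell GCC or clang about such a function
is the returns_twice attribute (a general illustration, not the
thread's patch):

/*
 * returns_twice makes the compiler assume a second return is
 * possible, so it won't cache values in registers or stack slots
 * that would be stale when longjmp returns here again.
 */
extern long my_setjmp(void *env) __attribute__((returns_twice));
extern void my_longjmp(void *env, long val) __attribute__((noreturn));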


Re: [PATCH v4 1/6] x86, s390: Move ARCH_HAS_MEM_ENCRYPT definition to arch/Kconfig

2019-09-03 Thread Thiago Jung Bauermann


Michael Ellerman  writes:

> On Tue, 2019-08-06 at 04:49:14 UTC, Thiago Jung Bauermann wrote:
>> powerpc is also going to use this feature, so put it in a generic location.
>> 
>> Signed-off-by: Thiago Jung Bauermann 
>> Reviewed-by: Thomas Gleixner 
>> Reviewed-by: Christoph Hellwig 
>
> Series applied to powerpc topic/mem-encrypt, thanks.
>
> https://git.kernel.org/powerpc/c/0c9c1d56397518eb823d458b00b06bcccd956794

Thank you!

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v4 02/16] powerpc/pseries: Introduce option to build secure virtual machines

2019-09-03 Thread Thiago Jung Bauermann


Michael Ellerman  writes:

> On Tue, 2019-08-20 at 02:13:12 UTC, Thiago Jung Bauermann wrote:
>> Introduce CONFIG_PPC_SVM to control support for secure guests and include
>> Ultravisor-related helpers when it is selected
>> 
>> Signed-off-by: Thiago Jung Bauermann 
>
> Patch 2-14 & 16 applied to powerpc next, thanks.
>
> https://git.kernel.org/powerpc/c/136bc0397ae21dbf63ca02e5775ad353a479cd2f

Thank you very much!

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Segher Boessenkool
On Tue, Sep 03, 2019 at 07:05:19PM +0200, Christophe Leroy wrote:
> On 03/09/2019 at 18:04, Segher Boessenkool wrote:
> >(Why are they separate though?  It could just be one loop var).
> 
> Yes it could just be a single loop var, but in that case it would have 
> to be reset at the start of the second loop, which means we would have 
> to pass 'addr' for resetting the loop anyway,

Right, I noticed that after hitting send, as usual.

> so I opted to do it 
> outside the inline asm by using two separate loop vars set to their 
> starting value outside the inline asm.

The thing is, the way it is written now, it will get separate registers
for each loop (with proper earlyclobbers added).  Not that that really
matters of course, it just feels wrong :-)


Segher


[PATCH v2] platform/powernv: Avoid re-registration of imc debugfs directory

2019-09-03 Thread Anju T Sudhakar
The export_imc_mode_and_cmd() function, which creates the debugfs interface
for imc-mode and imc-command, is invoked when each nest pmu unit is
registered.
When the first nest pmu unit is registered, export_imc_mode_and_cmd()
creates the 'imc' directory under `/debug/powerpc/`. In subsequent
invocations debugfs_create_dir() returns, since the directory
already exists.

The recent commit (debugfs: make error message a bit more
verbose) throws a warning if we try to invoke `debugfs_create_dir()`
with an already existing directory name.

Address this warning by searching for an existing 'imc' directory,
and do not invoke debugfs_create_dir(), if the debugfs interface for
imc already exists.

This patch is based on:
https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-August/195898.html

Signed-off-by: Anju T Sudhakar 
Tested-by: Nageswara R Sastry 
---
Changes from v1 -> v2

* Minor changes in the commit message.
---
 arch/powerpc/platforms/powernv/opal-imc.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/opal-imc.c 
b/arch/powerpc/platforms/powernv/opal-imc.c
index e04b20625cb9..fc2f0e60a44d 100644
--- a/arch/powerpc/platforms/powernv/opal-imc.c
+++ b/arch/powerpc/platforms/powernv/opal-imc.c
@@ -55,14 +55,19 @@ static void export_imc_mode_and_cmd(struct device_node 
*node,
static u64 loc, *imc_mode_addr, *imc_cmd_addr;
char mode[16], cmd[16];
u32 cb_offset;
+   struct dentry *dir = NULL;
struct imc_mem_info *ptr = pmu_ptr->mem_info;
 
+
+   /* Return, if 'imc' interface already exists */
+   dir = debugfs_lookup("imc", powerpc_debugfs_root);
+   if (dir) {
+   dput(dir);
+   return;
+   }
imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root);
 
-   /*
-* Return here, either because 'imc' directory already exists,
-* Or failed to create a new one.
-*/
+   /* Return here, if failed to create the directory */
if (!imc_debugfs_parent)
return;
 
-- 
2.20.1



Re: [PATCH v5 06/31] pseries/fadump: define register/un-register callback functions

2019-09-03 Thread Hari Bathini



On 03/09/19 4:40 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
>> Make RTAS calls to register and un-register for FADump. Also, update
>> how fadump_region contents are displayed to provide more information.
> 
> That sounds like two independent changes, so can this be split into two
> patches?

Yeah. On splitting, the below hunk would look a bit different in this patch
and the split patch would change it to how it looks now:

> + seq_printf(m, "DUMP: Src: %#016llx, Dest: %#016llx, ",
> +be64_to_cpu(fdm_ptr->rmr_region.source_address),
> +be64_to_cpu(fdm_ptr->rmr_region.destination_address));
> + seq_printf(m, "Size: %#llx, Dumped: %#llx bytes\n",
> +be64_to_cpu(fdm_ptr->rmr_region.source_len),
> +be64_to_cpu(fdm_ptr->rmr_region.bytes_dumped));


- Hari



Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Christophe Leroy




On 03/09/2019 at 18:04, Segher Boessenkool wrote:

On Tue, Sep 03, 2019 at 04:28:09PM +0200, Christophe Leroy wrote:

On 03/09/2019 at 15:04, Segher Boessenkool wrote:

On Tue, Sep 03, 2019 at 03:23:57PM +1000, Alastair D'Silva wrote:

+   asm volatile(
+   "   mtctr %2;"
+   "   mtmsr %3;"
+   "   isync;"
+   "0: dcbst   0, %0;"
+   "   addi%0, %0, %4;"
+   "   bdnz0b;"
+   "   sync;"
+   "   mtctr %2;"
+   "1: icbi0, %1;"
+   "   addi%1, %1, %4;"
+   "   bdnz1b;"
+   "   sync;"
+   "   mtmsr %5;"
+   "   isync;"
+   : "+r" (loop1), "+r" (loop2)
+   : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
+   : "ctr", "memory");


This outputs as one huge assembler statement, all on one line.  That's
going to be fun to read or debug.


Do you mean \n has to be added after the ; ?


Something like that.  There is no really satisfying way for doing huge
inline asm, and maybe that is a good thing ;-)

Often people write \n\t at the end of each line of inline asm.  This works
pretty well (but then there are labels, oh joy).


loop1 and/or loop2 can be assigned the same register as msr0 or nb.  They
need to be made earlyclobbers.  (msr is fine, all of its reads are before
any writes to loop1 or loop2; and bytes is fine, it's not a register).


Can you explicit please ? Doesn't '+r' means that they are input and
output at the same time ?


That is what + means, yes -- that this output is an input as well.  It is
the same to write

   asm("mov %1,%0 ; mov %0,42" : "+r"(x), "=r"(y));
or to write
   asm("mov %1,%0 ; mov %0,42" : "=r"(x), "=r"(y) : "0"(x));

(So not "at the same time" as in "in the same machine instruction", but
more loosely, as in "in the same inline asm statement").


"to be made earlyclobbers", what does this means exactly ? How to do that ?


You write &, like "+&r" in this case.  It means the machine code writes
to this register before it has consumed all asm inputs (remember, GCC
does not understand (or even parse!) the assembler string).

So just

: "+" (loop1), "+" (loop2)

will do.  (Why are they separate though?  It could just be one loop var).


Yes it could just be a single loop var, but in that case it would have 
to be reset at the start of the second loop, which means we would have 
to pass 'addr' for resetting the loop anyway, so I opted to do it 
outside the inline asm by using two separate loop vars set to their 
starting value outside the inline asm.


Christophe


[PATCH] powerpc/powernv: remove the unused pnv_npu_try_dma_set_bypass function

2019-09-03 Thread Christoph Hellwig
Neither pnv_npu_try_dma_set_bypass nor the pnv_npu_dma_set_32 and
pnv_npu_dma_set_bypass helpers called by it are used anywhere in the
kernel tree, so remove them.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 99 
 1 file changed, 99 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index c16249d251f1..a570a249edc3 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -192,105 +192,6 @@ static long pnv_npu_unset_window(struct iommu_table_group 
*table_group, int num)
return 0;
 }
 
-/*
- * Enables 32 bit DMA on NPU.
- */
-static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
-{
-   struct pci_dev *gpdev;
-   struct pnv_ioda_pe *gpe;
-   int64_t rc;
-
-   /*
-* Find the assoicated PCI devices and get the dma window
-* information from there.
-*/
-   if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
-   return;
-
-   gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-   if (!gpe)
-   return;
-
-   rc = pnv_npu_set_window(&npe->table_group, 0,
-   gpe->table_group.tables[0]);
-
-   /*
-* NVLink devices use the same TCE table configuration as
-* their parent device so drivers shouldn't be doing DMA
-* operations directly on these devices.
-*/
-   set_dma_ops(&npe->pdev->dev, &dma_dummy_ops);
-}
-
-/*
- * Enables bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicitly enabled or
- * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
- * active at the same time.
- */
-static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
-{
-   struct pnv_phb *phb = npe->phb;
-   int64_t rc = 0;
-   phys_addr_t top = memblock_end_of_DRAM();
-
-   if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
-   return -EINVAL;
-
-   rc = pnv_npu_unset_window(&npe->table_group, 0);
-   if (rc != OPAL_SUCCESS)
-   return rc;
-
-   /* Enable the bypass window */
-
-   top = roundup_pow_of_two(top);
-   dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
-   npe->pe_number);
-   rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
-   npe->pe_number, npe->pe_number,
-   0 /* bypass base */, top);
-
-   if (rc == OPAL_SUCCESS)
-   pnv_pci_ioda2_tce_invalidate_entire(phb, false);
-
-   return rc;
-}
-
-void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
-{
-   int i;
-   struct pnv_phb *phb;
-   struct pci_dn *pdn;
-   struct pnv_ioda_pe *npe;
-   struct pci_dev *npdev;
-
-   for (i = 0; ; ++i) {
-   npdev = pnv_pci_get_npu_dev(gpdev, i);
-
-   if (!npdev)
-   break;
-
-   pdn = pci_get_pdn(npdev);
-   if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
-   return;
-
-   phb = pci_bus_to_host(npdev->bus)->private_data;
-
-   /* We only do bypass if it's enabled on the linked device */
-   npe = >ioda.pe_array[pdn->pe_number];
-
-   if (bypass) {
-   dev_info(&npdev->dev,
-   "Using 64-bit DMA iommu bypass\n");
-   pnv_npu_dma_set_bypass(npe);
-   } else {
-   dev_info(>dev, "Using 32-bit DMA via iommu\n");
-   pnv_npu_dma_set_32(npe);
-   }
-   }
-}
-
 #ifdef CONFIG_IOMMU_API
 /* Switch ownership from platform code to external user (e.g. VFIO) */
 static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
-- 
2.20.1



[PATCH AUTOSEL 4.19 149/167] powerpc/mm: Limit rma_size to 1TB when running without HV mode

2019-09-03 Thread Sasha Levin
From: Suraj Jitindar Singh 

[ Upstream commit da0ef93310e67ae6902efded60b6724dab27a5d1 ]

The virtual real mode addressing (VRMA) mechanism is used when a
partition is using HPT (Hash Page Table) translation and performs real
mode accesses (MSR[IR|DR] = 0) in non-hypervisor mode. In this mode
effective address bits 0:23 are treated as zero (i.e. the access is
aliased to 0) and the access is performed using an implicit 1TB SLB
entry.

The size of the RMA (Real Memory Area) is communicated to the guest as
the size of the first memory region in the device tree. And, because of
the mechanism described above, it can be expected to not exceed 1TB. In
the event that the host erroneously represents the RMA as being larger
than 1TB, guest accesses in real mode to memory addresses above 1TB
will be aliased down to below 1TB. This means that a memory access
performed in real mode may differ to one performed in virtual mode for
the same memory address, which would likely have unintended
consequences.
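
For illustration only (not from the patch): with the implicit 1TB segment,
the wrap amounts to masking the effective address modulo 1TB, where
SID_SHIFT_1T == 40:

	/* sketch: how a >1TB real-mode EA is aliased down */
	static inline unsigned long vrma_alias(unsigned long ea)
	{
		return ea & ((1UL << 40) - 1);	/* EA bits 0:23 forced to zero */
	}

so a real-mode access to (1TB + 0x1000) actually touches 0x1000.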

To avoid this outcome have the guest explicitly limit the size of the
RMA to the current maximum, which is 1TB. This means that even if the
first memory block is larger than 1TB, only the first 1TB should be
accessed in real mode.

Fixes: c610d65c0ad0 ("powerpc/pseries: lift RTAS limit for hash")
Cc: sta...@vger.kernel.org # v4.16+
Signed-off-by: Suraj Jitindar Singh 
Tested-by: Satheesh Rajendran 
Reviewed-by: David Gibson 
Signed-off-by: Michael Ellerman 
Link: https://lore.kernel.org/r/20190710052018.14628-1-sjitindarsi...@gmail.com
Signed-off-by: Sasha Levin 
---
 arch/powerpc/mm/hash_utils_64.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index f23a89d8e4ce6..29fd8940867e5 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1859,11 +1859,20 @@ void hash__setup_initial_memory_limit(phys_addr_t 
first_memblock_base,
 *
 * For guests on platforms before POWER9, we clamp the it limit to 1G
 * to avoid some funky things such as RTAS bugs etc...
+*
+* On POWER9 we limit to 1TB in case the host erroneously told us that
+* the RMA was >1TB. Effective address bits 0:23 are treated as zero
+* (meaning the access is aliased to zero i.e. addr = addr % 1TB)
+* for virtual real mode addressing and so it doesn't make sense to
+* have an area larger than 1TB as it can't be addressed.
 */
if (!early_cpu_has_feature(CPU_FTR_HVMODE)) {
ppc64_rma_size = first_memblock_size;
if (!early_cpu_has_feature(CPU_FTR_ARCH_300))
ppc64_rma_size = min_t(u64, ppc64_rma_size, 0x40000000);
+   else
+   ppc64_rma_size = min_t(u64, ppc64_rma_size,
+  1UL << SID_SHIFT_1T);
 
/* Finally limit subsequent allocations */
memblock_set_current_limit(ppc64_rma_size);
-- 
2.20.1



Re: [PATCH v5 11/31] powernv/fadump: add fadump support on powernv

2019-09-03 Thread Hari Bathini



On 03/09/19 4:40 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
>> Add basic callback functions for FADump on PowerNV platform.
> 
> I assume this doesn't actually work yet?
> 
> Does something block it from appearing to work at runtime?

With this patch, "fadump=on" would reserve memory for FADump, as support is
enabled, but registration with f/w is not yet added. So, it would fail to
register...

> 
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index d8dcd88..fc4ecfe 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -566,7 +566,7 @@ config CRASH_DUMP
>>  
>>  config FA_DUMP
>>  bool "Firmware-assisted dump"
>> -depends on PPC64 && PPC_RTAS
>> +depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
>>  select CRASH_CORE
>>  select CRASH_DUMP
>>  help
>> @@ -577,7 +577,8 @@ config FA_DUMP
>>is meant to be a kdump replacement offering robustness and
>>speed not possible without system firmware assistance.
>>  
>> -  If unsure, say "N"
>> +  If unsure, say "y". Only special kernels like petitboot may
>> +  need to say "N" here.
>>  
>>  config IRQ_ALL_CPUS
>>  bool "Distribute interrupts on all CPUs by default"
>> diff --git a/arch/powerpc/kernel/fadump-common.h 
>> b/arch/powerpc/kernel/fadump-common.h
>> index d2c5b16..f6c52d3 100644
>> --- a/arch/powerpc/kernel/fadump-common.h
>> +++ b/arch/powerpc/kernel/fadump-common.h
>> @@ -140,4 +140,13 @@ static inline int rtas_fadump_dt_scan(struct fw_dump 
>> *fadump_config, ulong node)
>>  }
>>  #endif
>>  
>> +#ifdef CONFIG_PPC_POWERNV
>> +extern int opal_fadump_dt_scan(struct fw_dump *fadump_config, ulong node);
>> +#else
>> +static inline int opal_fadump_dt_scan(struct fw_dump *fadump_config, ulong 
>> node)
>> +{
>> +return 1;
>> +}
> 
> Extending the strange flat device tree calling convention to these
> functions is not ideal.
> 
> It would be better I think if they just returned bool true/false for
> "found it" / "not found", and then early_init_dt_scan_fw_dump() can
> convert that into the appropriate return value.
> 
>> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
>> index f7c8073..b8061fb9 100644
>> --- a/arch/powerpc/kernel/fadump.c
>> +++ b/arch/powerpc/kernel/fadump.c
>> @@ -114,6 +114,9 @@ int __init early_init_dt_scan_fw_dump(unsigned long 
>> node, const char *uname,
>>  if (strcmp(uname, "rtas") == 0)
>>  return rtas_fadump_dt_scan(&fw_dump, node);
>>  
>> +if (strcmp(uname, "ibm,opal") == 0)
>> +return opal_fadump_dt_scan(&fw_dump, node);
>> +
> 
> ie this would become:
> 
>   if (strcmp(uname, "ibm,opal") == 0 && opal_fadump_dt_scan(&fw_dump, 
> node))
> return 1;
> 

Yeah. Will update accordingly...

Thanks
Hari



Re: [PATCH v5 10/31] opal: add MPIPL interface definitions

2019-09-03 Thread Hari Bathini



On 03/09/19 4:40 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
>> diff --git a/arch/powerpc/include/asm/opal-api.h 
>> b/arch/powerpc/include/asm/opal-api.h
>> index 383242e..c8a5665 100644
>> --- a/arch/powerpc/include/asm/opal-api.h
>> +++ b/arch/powerpc/include/asm/opal-api.h
>> @@ -980,6 +983,50 @@ struct opal_sg_list {
>>  };
>>  
>>  /*
>> + * Firmware-Assisted Dump (FADump) using MPIPL
>> + */
>> +
>> +/* MPIPL update operations */
>> +enum opal_mpipl_ops {
>> +OPAL_MPIPL_ADD_RANGE= 0,
>> +OPAL_MPIPL_REMOVE_RANGE = 1,
>> +OPAL_MPIPL_REMOVE_ALL   = 2,
>> +OPAL_MPIPL_FREE_PRESERVED_MEMORY= 3,
>> +};
>> +
>> +/*
>> + * Each tag maps to a metadata type. Use these tags to register/query
>> + * corresponding metadata address with/from OPAL.
>> + */
>> +enum opal_mpipl_tags {
>> +OPAL_MPIPL_TAG_CPU  = 0,
>> +OPAL_MPIPL_TAG_OPAL = 1,
>> +OPAL_MPIPL_TAG_KERNEL   = 2,
>> +OPAL_MPIPL_TAG_BOOT_MEM = 3,
>> +};
>> +
>> +/* Preserved memory details */
>> +struct opal_mpipl_region {
>> +__be64  src;
>> +__be64  dest;
>> +__be64  size;
>> +};
>> +
>> +/* FADump structure format version */
>> +#define MPIPL_FADUMP_VERSION0x01
>> +
>> +/* Metadata provided by OPAL. */
>> +struct opal_mpipl_fadump {
>> +u8  version;
>> +u8  reserved[7];
>> +__be32  crashing_pir;
>> +__be32  cpu_data_version;
>> +__be32  cpu_data_size;
>> +__be32  region_cnt;
>> +struct opal_mpipl_regionregion[];
>> +} __attribute__((packed));
>> +
> 
> The above hunk is in the wrong place vs the skiboot header. Please put
> things in exactly the same place in the skiboot and kernel versions of
> the header.
> 
> After your kernel & skiboot patches are applied, the result of:
> 
>  $ git diff ~/src/skiboot/include/opal-api.h 
> arch/powerpc/include/asm/opal-api.h
> 
> Should not include anything MPIPL/fadump related.

Sure.

> 
>> diff --git a/arch/powerpc/include/asm/opal.h 
>> b/arch/powerpc/include/asm/opal.h
>> index 57bd029..878110a 100644
>> --- a/arch/powerpc/include/asm/opal.h
>> +++ b/arch/powerpc/include/asm/opal.h
>> @@ -39,6 +39,12 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, 
>> uint32_t bdfn,
>>  uint64_t PE_handle);
>>  int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
>>  uint64_t rate_phys, uint32_t size);
>> +
>> +int64_t opal_mpipl_update(enum opal_mpipl_ops op, u64 src,
>> +  u64 dest, u64 size);
>> +int64_t opal_mpipl_register_tag(enum opal_mpipl_tags tag, uint64_t addr);
>> +int64_t opal_mpipl_query_tag(enum opal_mpipl_tags tag, uint64_t *addr);
>> +
> 
> Please consistently use kernel types for new prototypes in here.

uint64_t instead of 'enum's?

- Hari



[PATCH AUTOSEL 4.19 137/167] KVM: PPC: Book3S HV: Fix CR0 setting in TM emulation

2019-09-03 Thread Sasha Levin
From: Michael Neuling 

[ Upstream commit 3fefd1cd95df04da67c83c1cb93b663f04b3324f ]

When emulating tsr, treclaim and trechkpt, we incorrectly set CR0. The
code currently sets:
CR0 <- 00 || MSR[TS]
but according to the ISA it should be:
CR0 <-  0 || MSR[TS] || 0

This fixes the bit shift to put the bits in the correct location.

This is a data integrity issue as CR0 is corrupted.
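
As a host-side sketch of the two shift amounts (not kernel code; MSR[TS] =
0b10, i.e. transactional, is assumed; the CR0 nibble is ccr[31:28] =
LT GT EQ SO):

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint32_t ccr = 0;
		uint64_t ts = 2;	/* MSR[TS], already shifted down */

		uint32_t wrong = (ccr & 0x0fffffff) | (uint32_t)(ts << 28);
		uint32_t right = (ccr & 0x0fffffff) | (uint32_t)(ts << 29);

		printf("shift 28: CR0 = %x\n", wrong >> 28);	/* 2: TS lands in EQ:SO */
		printf("shift 29: CR0 = %x\n", right >> 28);	/* 4: 0 || TS || 0 */
		return 0;
	}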

Fixes: 4bb3c7a0208f ("KVM: PPC: Book3S HV: Work around transactional memory 
bugs in POWER9")
Cc: sta...@vger.kernel.org # v4.17+
Tested-by: Suraj Jitindar Singh 
Signed-off-by: Michael Neuling 
Signed-off-by: Michael Ellerman 
Signed-off-by: Sasha Levin 
---
 arch/powerpc/kvm/book3s_hv_tm.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
index 888e2609e3f15..31cd0f327c8a2 100644
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -131,7 +131,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
}
/* Set CR0 to indicate previous transactional state */
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
-   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
/* L=1 => tresume, L=0 => tsuspend */
if (instr & (1 << 21)) {
if (MSR_TM_SUSPENDED(msr))
@@ -175,7 +175,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
/* Set CR0 to indicate previous transactional state */
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
-   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
return RESUME_GUEST;
 
@@ -205,7 +205,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
/* Set CR0 to indicate previous transactional state */
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
-   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+   (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr = msr | MSR_TS_S;
return RESUME_GUEST;
}
-- 
2.20.1



[PATCH AUTOSEL 4.19 136/167] KVM: PPC: Use ccr field in pt_regs struct embedded in vcpu struct

2019-09-03 Thread Sasha Levin
From: Paul Mackerras 

[ Upstream commit fd0944baad806dfb4c777124ec712c55b714ff51 ]

When the 'regs' field was added to struct kvm_vcpu_arch, the code
was changed to use several of the fields inside regs (e.g., gpr, lr,
etc.) but not the ccr field, because the ccr field in struct pt_regs
is 64 bits on 64-bit platforms, but the cr field in kvm_vcpu_arch is
only 32 bits.  This changes the code to use the regs.ccr field
instead of cr, and changes the assembly code on 64-bit platforms to
use 64-bit loads and stores instead of 32-bit ones.

Reviewed-by: David Gibson 
Signed-off-by: Paul Mackerras 
Signed-off-by: Michael Ellerman 
Signed-off-by: Sasha Levin 
---
 arch/powerpc/include/asm/kvm_book3s.h|  4 ++--
 arch/powerpc/include/asm/kvm_book3s_64.h |  4 ++--
 arch/powerpc/include/asm/kvm_booke.h |  4 ++--
 arch/powerpc/include/asm/kvm_host.h  |  2 --
 arch/powerpc/kernel/asm-offsets.c|  4 ++--
 arch/powerpc/kvm/book3s_emulate.c| 12 ++--
 arch/powerpc/kvm/book3s_hv.c |  4 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  4 ++--
 arch/powerpc/kvm/book3s_hv_tm.c  |  6 +++---
 arch/powerpc/kvm/book3s_hv_tm_builtin.c  |  5 +++--
 arch/powerpc/kvm/book3s_pr.c |  4 ++--
 arch/powerpc/kvm/bookehv_interrupts.S|  8 
 arch/powerpc/kvm/emulate_loadstore.c |  1 -
 13 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 83a9aa3cf6891..dd18d8174504f 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -301,12 +301,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, 
int num)
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-   vcpu->arch.cr = val;
+   vcpu->arch.regs.ccr = val;
 }
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-   return vcpu->arch.cr;
+   return vcpu->arch.regs.ccr;
 }
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index dc435a5af7d6c..14fa07c73f44d 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -482,7 +482,7 @@ static inline u64 sanitize_msr(u64 msr)
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.cr  = vcpu->arch.cr_tm;
+   vcpu->arch.regs.ccr  = vcpu->arch.cr_tm;
vcpu->arch.regs.xer = vcpu->arch.xer_tm;
vcpu->arch.regs.link  = vcpu->arch.lr_tm;
vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
@@ -499,7 +499,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu 
*vcpu)
 
 static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.cr_tm  = vcpu->arch.cr;
+   vcpu->arch.cr_tm  = vcpu->arch.regs.ccr;
vcpu->arch.xer_tm = vcpu->arch.regs.xer;
vcpu->arch.lr_tm  = vcpu->arch.regs.link;
vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
diff --git a/arch/powerpc/include/asm/kvm_booke.h 
b/arch/powerpc/include/asm/kvm_booke.h
index d513e3ed1c659..f0cef625f17ce 100644
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, 
int num)
 
 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
 {
-   vcpu->arch.cr = val;
+   vcpu->arch.regs.ccr = val;
 }
 
 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 {
-   return vcpu->arch.cr;
+   return vcpu->arch.regs.ccr;
 }
 
 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 2b6049e839706..2f95e38f05491 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -538,8 +538,6 @@ struct kvm_vcpu_arch {
ulong tar;
 #endif
 
-   u32 cr;
-
 #ifdef CONFIG_PPC_BOOK3S
ulong hflags;
ulong guest_owned_ext;
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 89cf15566c4e8..7c3738d890e8b 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -438,7 +438,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S
OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
 #endif
-   OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+   OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
@@ -695,7 +695,7 @@ int main(void)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
-   OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
+   OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);

Re: [PATCH v5 07/31] powerpc/fadump: release all the memory above boot memory size

2019-09-03 Thread Hari Bathini



On 03/09/19 4:40 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
> 
>> Except for reserve dump area which is permanent reserved, all memory
> permanently
> 
>> above boot memory size is released when the dump is invalidated. Make
>> this a bit more explicit in the code.
> 
> I'm not clear on what you mean by "boot memory size"?

Boot memory size is the amount of memory used to boot the capture kernel.
Basically, the amount of memory required for the kernel to boot successfully
when booted with restricted memory.

- Hari



[PATCH AUTOSEL 4.19 075/167] powerpc/kvm: Save and restore host AMR/IAMR/UAMOR

2019-09-03 Thread Sasha Levin
From: Michael Ellerman 

[ Upstream commit c3c7470c75566a077c8dc71dcf8f1948b8ddfab4 ]

When the hash MMU is active the AMR, IAMR and UAMOR are used for
pkeys. The AMR is directly writable by user space, and the UAMOR masks
those writes, meaning both registers are effectively user register
state. The IAMR is used to create an execute only key.

Also we must maintain the value of at least the AMR when running in
process context, so that any memory accesses done by the kernel on
behalf of the process are correctly controlled by the AMR.

Although we are correctly switching all registers when going into a
guest, on returning to the host we just write 0 into all regs, except
on Power9 where we restore the IAMR correctly.

This could be observed by a user process if it writes the AMR, then
runs a guest and we then return immediately to it without
rescheduling. Because we have written 0 to the AMR that would have the
effect of granting read/write permission to pages that the process was
trying to protect.

In addition, when using the Radix MMU, the AMR can prevent inadvertent
kernel access to userspace data, writing 0 to the AMR disables that
protection.

So save and restore AMR, IAMR and UAMOR.

Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem")
Cc: sta...@vger.kernel.org # v4.16+
Signed-off-by: Russell Currey 
Signed-off-by: Michael Ellerman 
Acked-by: Paul Mackerras 
Signed-off-by: Sasha Levin 
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 26 -
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 1d14046124a01..5902a60f92268 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -56,6 +56,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 #define STACK_SLOT_DAWR    (SFS-56)
 #define STACK_SLOT_DAWRX   (SFS-64)
 #define STACK_SLOT_HFSCR   (SFS-72)
+#define STACK_SLOT_AMR (SFS-80)
+#define STACK_SLOT_UAMOR   (SFS-88)
 
 /*
  * Call kvmppc_hv_entry in real mode.
@@ -760,11 +762,9 @@ BEGIN_FTR_SECTION
mfspr   r5, SPRN_TIDR
mfspr   r6, SPRN_PSSCR
mfspr   r7, SPRN_PID
-   mfspr   r8, SPRN_IAMR
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
std r7, STACK_SLOT_PID(r1)
-   std r8, STACK_SLOT_IAMR(r1)
mfspr   r5, SPRN_HFSCR
std r5, STACK_SLOT_HFSCR(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
@@ -772,11 +772,18 @@ BEGIN_FTR_SECTION
mfspr   r5, SPRN_CIABR
mfspr   r6, SPRN_DAWR
mfspr   r7, SPRN_DAWRX
+   mfspr   r8, SPRN_IAMR
std r5, STACK_SLOT_CIABR(r1)
std r6, STACK_SLOT_DAWR(r1)
std r7, STACK_SLOT_DAWRX(r1)
+   std r8, STACK_SLOT_IAMR(r1)
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
+   mfspr   r5, SPRN_AMR
+   std r5, STACK_SLOT_AMR(r1)
+   mfspr   r6, SPRN_UAMOR
+   std r6, STACK_SLOT_UAMOR(r1)
+
 BEGIN_FTR_SECTION
/* Set partition DABR */
/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
@@ -1713,22 +1720,25 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
mtspr   SPRN_PSPB, r0
mtspr   SPRN_WORT, r0
 BEGIN_FTR_SECTION
-   mtspr   SPRN_IAMR, r0
mtspr   SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li  r0, 1
	sldi	r0, r0, 31
mtspr   SPRN_MMCRS, r0
 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-8:
 
-   /* Save and reset AMR and UAMOR before turning on the MMU */
+   /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */
+   ld  r8, STACK_SLOT_IAMR(r1)
+   mtspr   SPRN_IAMR, r8
+
+8: /* Power7 jumps back in here */
mfspr   r5,SPRN_AMR
mfspr   r6,SPRN_UAMOR
std r5,VCPU_AMR(r9)
std r6,VCPU_UAMOR(r9)
-   li  r6,0
-   mtspr   SPRN_AMR,r6
+   ld  r5,STACK_SLOT_AMR(r1)
+   ld  r6,STACK_SLOT_UAMOR(r1)
+   mtspr   SPRN_AMR, r5
mtspr   SPRN_UAMOR, r6
 
/* Switch DSCR back to host value */
@@ -1897,11 +1907,9 @@ BEGIN_FTR_SECTION
ld  r5, STACK_SLOT_TID(r1)
ld  r6, STACK_SLOT_PSSCR(r1)
ld  r7, STACK_SLOT_PID(r1)
-   ld  r8, STACK_SLOT_IAMR(r1)
mtspr   SPRN_TIDR, r5
mtspr   SPRN_PSSCR, r6
mtspr   SPRN_PID, r7
-   mtspr   SPRN_IAMR, r8
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 #ifdef CONFIG_PPC_RADIX_MMU
-- 
2.20.1



[PATCH AUTOSEL 4.19 043/167] powerpc/pkeys: Fix handling of pkey state across fork()

2019-09-03 Thread Sasha Levin
From: Ram Pai 

[ Upstream commit 2cd4bd192ee94848695c1c052d87913260e10f36 ]

Protection key tracking information is not copied over to the
mm_struct of the child during fork(). This can cause the child to
erroneously allocate keys that were already allocated. Any allocated
execute-only key is lost as well.

Add code, called by dup_mmap(), to copy the pkey state from parent to
child explicitly.

This problem was originally found by Dave Hansen on x86, which turns
out to be a problem on powerpc as well.

Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem")
Cc: sta...@vger.kernel.org # v4.16+
Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
Signed-off-by: Michael Ellerman 
Signed-off-by: Sasha Levin 
---
 arch/powerpc/include/asm/mmu_context.h | 15 +--
 arch/powerpc/mm/pkeys.c| 10 ++
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index b694d6af11508..ae953958c0f33 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -217,12 +217,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
-   struct mm_struct *mm)
-{
-   return 0;
-}
-
 #ifndef CONFIG_PPC_BOOK3S_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
@@ -247,6 +241,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 #ifdef CONFIG_PPC_MEM_KEYS
 bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
   bool execute, bool foreign);
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm);
 #else /* CONFIG_PPC_MEM_KEYS */
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
@@ -259,6 +254,7 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 #define thread_pkey_regs_save(thread)
 #define thread_pkey_regs_restore(new_thread, old_thread)
 #define thread_pkey_regs_init(thread)
+#define arch_dup_pkeys(oldmm, mm)
 
 static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
 {
@@ -267,5 +263,12 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
 
 #endif /* CONFIG_PPC_MEM_KEYS */
 
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+   struct mm_struct *mm)
+{
+   arch_dup_pkeys(oldmm, mm);
+   return 0;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index b271b283c785e..25a8dd9cd71db 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -414,3 +414,13 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, 
bool write,
 
return pkey_access_permitted(vma_pkey(vma), write, execute);
 }
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+   if (static_branch_likely(&pkey_disabled))
+   return;
+
+   /* Duplicate the oldmm pkey state in mm: */
+   mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+   mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}
-- 
2.20.1



[PATCH AUTOSEL 4.19 035/167] KVM: PPC: Book3S HV: Fix race between kvm_unmap_hva_range and MMU mode switch

2019-09-03 Thread Sasha Levin
From: Paul Mackerras 

[ Upstream commit 234ff0b729ad882d20f7996591a964965647addf ]

Testing has revealed an occasional crash which appears to be caused
by a race between kvmppc_switch_mmu_to_hpt and kvm_unmap_hva_range_hv.
The symptom is a NULL pointer dereference in __find_linux_pte() called
from kvm_unmap_radix() with kvm->arch.pgtable == NULL.

Looking at kvmppc_switch_mmu_to_hpt(), it does indeed clear
kvm->arch.pgtable (via kvmppc_free_radix()) before setting
kvm->arch.radix to NULL, and there is nothing to prevent
kvm_unmap_hva_range_hv() or the other MMU callback functions from
being called concurrently with kvmppc_switch_mmu_to_hpt() or
kvmppc_switch_mmu_to_radix().

This patch therefore adds calls to spin_lock/unlock on the kvm->mmu_lock
around the assignments to kvm->arch.radix, and makes sure that the
partition-scoped radix tree or HPT is only freed after changing
kvm->arch.radix.

This also takes the kvm->mmu_lock in kvmppc_rmap_reset() to make sure
that the clearing of each rmap array (one per memslot) doesn't happen
concurrently with use of the array in the kvm_unmap_hva_range_hv()
or the other MMU callbacks.

Fixes: 18c3640cefc7 ("KVM: PPC: Book3S HV: Add infrastructure for running HPT 
guests on radix host")
Cc: sta...@vger.kernel.org # v4.15+
Signed-off-by: Paul Mackerras 
Signed-off-by: Sasha Levin 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c |  3 +++
 arch/powerpc/kvm/book3s_hv.c| 15 +++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 68e14afecac85..a488c105b9234 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -744,12 +744,15 @@ void kvmppc_rmap_reset(struct kvm *kvm)
srcu_idx = srcu_read_lock(>srcu);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
+   /* Mutual exclusion with kvm_unmap_hva_range etc. */
+   spin_lock(&kvm->mmu_lock);
/*
 * This assumes it is acceptable to lose reference and
 * change bits across a reset.
 */
memset(memslot->arch.rmap, 0,
   memslot->npages * sizeof(*memslot->arch.rmap));
+   spin_unlock(&kvm->mmu_lock);
}
	srcu_read_unlock(&kvm->srcu, srcu_idx);
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 083dcedba11ce..9595db30e6b87 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3813,12 +3813,15 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu 
*vcpu)
 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+   kvmppc_rmap_reset(kvm);
+   kvm->arch.process_table = 0;
+   /* Mutual exclusion with kvm_unmap_hva_range etc. */
+   spin_lock(&kvm->mmu_lock);
+   kvm->arch.radix = 0;
+   spin_unlock(&kvm->mmu_lock);
kvmppc_free_radix(kvm);
kvmppc_update_lpcr(kvm, LPCR_VPM1,
   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
-   kvmppc_rmap_reset(kvm);
-   kvm->arch.radix = 0;
-   kvm->arch.process_table = 0;
return 0;
 }
 
@@ -3831,10 +3834,14 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
if (err)
return err;
 
+   kvmppc_rmap_reset(kvm);
+   /* Mutual exclusion with kvm_unmap_hva_range etc. */
+   spin_lock(&kvm->mmu_lock);
+   kvm->arch.radix = 1;
+   spin_unlock(&kvm->mmu_lock);
	kvmppc_free_hpt(&kvm->arch.hpt);
kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
-   kvm->arch.radix = 1;
return 0;
 }
 
-- 
2.20.1



Re: [PATCH v5 05/31] pseries/fadump: introduce callbacks for platform specific operations

2019-09-03 Thread Hari Bathini



On 03/09/19 4:40 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
>> Introduce callback functions for platform specific operations like
>> register, unregister, invalidate & such. Also, define place-holders
>> for the same on pSeries platform.
> 
> We already have an ops structure for machine specific calls, it's
> ppc_md. Is there a good reason why these aren't just in machdep_calls
> under #ifdef CONFIG_FA_DUMP ?

Not really. Will move these callbacks to 'struct machdep_calls'...

- Hari



Re: [PATCH v5 01/31] powerpc/fadump: move internal macros/definitions to a new header

2019-09-03 Thread Hari Bathini



On 03/09/19 4:39 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
>> Though asm/fadump.h is meant to be used by other components dealing
>> with FADump, it also has macros/definitions internal to FADump code.
>> Move them to a new header file used within FADump code. This also
>> makes way for refactoring platform specific FADump code.
>>
>> Signed-off-by: Hari Bathini 
>> ---
>>  arch/powerpc/include/asm/fadump.h   |   71 
>>  arch/powerpc/kernel/fadump-common.h |   89 
>> +++
>>  arch/powerpc/kernel/fadump.c|2 +
> 
> I don't like having a header in kernel that's then used in platform
> code. Because we end up having to do gross things like:
> 
>   arch/powerpc/platforms/powernv/opal-core.c:#include 
> "../../kernel/fadump-common.h"
>   arch/powerpc/platforms/powernv/opal-fadump.c:#include 
> "../../kernel/fadump-common.h"
>   arch/powerpc/platforms/pseries/rtas-fadump.c:#include 
> "../../kernel/fadump-common.h"
> 
> 
> I'd rather you put the internal bits in 
> arch/powerpc/include/asm/fadump-internal.h

True. Will put the internal bits in arch/powerpc/include/asm/fadump-internal.h

- Hari



Re: [PATCH v5 02/31] powerpc/fadump: move internal code to a new file

2019-09-03 Thread Hari Bathini



On 03/09/19 4:39 PM, Michael Ellerman wrote:
> Hari Bathini  writes:
>> Make way for refactoring platform specific FADump code by moving code
>> that could be referenced from multiple places to fadump-common.c file.
>>
>> Signed-off-by: Hari Bathini 
>> ---
>>  arch/powerpc/kernel/Makefile|2 
>>  arch/powerpc/kernel/fadump-common.c |  140 
>> ++
>>  arch/powerpc/kernel/fadump-common.h |8 ++
>>  arch/powerpc/kernel/fadump.c|  146 
>> ++-
>>  4 files changed, 158 insertions(+), 138 deletions(-)
>>  create mode 100644 arch/powerpc/kernel/fadump-common.c
> 
> I don't understand why we need fadump.c and fadump-common.c? They're
> both common/shared across pseries & powernv aren't they?

The convention I tried to follow was to have fadump-common.c shared between
fadump.c and the pseries & powernv code: pseries & powernv code take callback
requests from fadump.c and use fadump-common.c (shared by both platforms),
if necessary, to fulfil those requests...

> By the end of the series we end up with 149 lines in fadump-common.c
> which seems like a waste of time. Just put it all in fadump.c.

Yeah. Probably not worth a new C file. Will just have two separate headers. One 
for
internal code and one for interfacing with other modules...

[...]

>> + * Copyright 2019, IBM Corp.
>> + * Author: Hari Bathini 
> 
> These can just be:
> 
>  * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
>  * Copyright 2019, Hari Bathini, IBM Corporation.
> 

Sure.

>> + */
>> +
>> +#undef DEBUG
> 
> Don't undef DEBUG please.
> 

Sorry! Seeing such a thing in most files, I thought this was the convention.
Will drop this change in all the new files I added.

>> +#define pr_fmt(fmt) "fadump: " fmt
>> +
>> +#include 
>> +#include 
>> +#include 
>> +#include 
>> +
>> +#include "fadump-common.h"
>> +
>> +void *fadump_cpu_notes_buf_alloc(unsigned long size)
>> +{
>> +void *vaddr;
>> +struct page *page;
>> +unsigned long order, count, i;
>> +
>> +order = get_order(size);
>> +vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
>> +if (!vaddr)
>> +return NULL;
>> +
>> +count = 1 << order;
>> +page = virt_to_page(vaddr);
>> +for (i = 0; i < count; i++)
>> +SetPageReserved(page + i);
>> +return vaddr;
>> +}
> 
> I realise you're just moving this code, but why do we need all this hand
> rolled allocation stuff?

Yeah, I think alloc_pages_exact() may be better here. Mahesh, am I missing 
something?

- Hari



Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Segher Boessenkool
On Tue, Sep 03, 2019 at 04:28:09PM +0200, Christophe Leroy wrote:
> On 03/09/2019 at 15:04, Segher Boessenkool wrote:
> >On Tue, Sep 03, 2019 at 03:23:57PM +1000, Alastair D'Silva wrote:
> >>+   asm volatile(
> >>+   "   mtctr %2;"
> >>+   "   mtmsr %3;"
> >>+   "   isync;"
> >>+   "0: dcbst   0, %0;"
> >>+   "   addi%0, %0, %4;"
> >>+   "   bdnz0b;"
> >>+   "   sync;"
> >>+   "   mtctr %2;"
> >>+   "1: icbi0, %1;"
> >>+   "   addi%1, %1, %4;"
> >>+   "   bdnz1b;"
> >>+   "   sync;"
> >>+   "   mtmsr %5;"
> >>+   "   isync;"
> >>+   : "+r" (loop1), "+r" (loop2)
> >>+   : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
> >>+   : "ctr", "memory");
> >
> >This outputs as one huge assembler statement, all on one line.  That's
> >going to be fun to read or debug.
> 
> Do you mean \n has to be added after the ; ?

Something like that.  There is no really satisfying way for doing huge
inline asm, and maybe that is a good thing ;-)

Often people write \n\t at the end of each line of inline asm.  This works
pretty well (but then there are labels, oh joy).
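
For illustration, the first loop of the sequence under discussion written
with that convention (a sketch only; a 128-byte cache line is assumed):

	static inline void dcbst_loop_sketch(unsigned long addr, unsigned long nb)
	{
		asm volatile(
			"mtctr	%1\n"
		"0:	dcbst	0, %0\n\t"
			"addi	%0, %0, 128\n\t"
			"bdnz	0b\n\t"
			"sync"
			: "+&r" (addr)
			: "r" (nb)
			: "ctr", "memory");
	}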

> >loop1 and/or loop2 can be assigned the same register as msr0 or nb.  They
> >need to be made earlyclobbers.  (msr is fine, all of its reads are before
> >any writes to loop1 or loop2; and bytes is fine, it's not a register).
> 
> Can you explicit please ? Doesn't '+r' means that they are input and 
> output at the same time ?

That is what + means, yes -- that this output is an input as well.  It is
the same to write

  asm("mov %1,%0 ; mov %0,42" : "+r"(x), "=r"(y));
or to write
  asm("mov %1,%0 ; mov %0,42" : "=r"(x), "=r"(y) : "0"(x));

(So not "at the same time" as in "in the same machine instruction", but
more loosely, as in "in the same inline asm statement").

> "to be made earlyclobbers", what does this means exactly ? How to do that ?

You write &, like "+&r" in this case.  It means the machine code writes
to this register before it has consumed all asm inputs (remember, GCC
does not understand (or even parse!) the assembler string).

So just

: "+" (loop1), "+" (loop2)

will do.  (Why are they separate though?  It could just be one loop var).
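
For illustration, a hypothetical helper where the earlyclobber is required:
the asm writes %0 before its last read of %2, so without the "&" GCC could
give "out" the same register as "b" and the first write would corrupt it:

	static inline unsigned long add_sketch(unsigned long a, unsigned long b)
	{
		unsigned long out;

		asm("mr	%0, %1\n\t"		/* writes the output ... */
		    "add	%0, %0, %2"	/* ... before the last read of b */
		    : "=&r" (out)
		    : "r" (a), "r" (b));
		return out;
	}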


Segher


Re: [PATCH v7 5/5] kasan debug: track pages allocated for vmalloc shadow

2019-09-03 Thread Andrey Konovalov
On Tue, Sep 3, 2019 at 4:56 PM Daniel Axtens  wrote:
>
> Provide the current number of vmalloc shadow pages in
> /sys/kernel/debug/kasan_vmalloc/shadow_pages.

Maybe it makes sense to put this into /sys/kernel/debug/kasan/
(without _vmalloc) and name e.g. vmalloc_shadow_pages? In case we want
to expose more generic KASAN debugging info later.

>
> Signed-off-by: Daniel Axtens 
>
> ---
>
> Merging this is probably overkill, but I leave it to the discretion
> of the broader community.
>
> On v4 (no dynamic freeing), I saw the following approximate figures
> on my test VM:
>
>  - fresh boot: 720
>  - after test_vmalloc: ~14000
>
> With v5 (lazy dynamic freeing):
>
>  - boot: ~490-500
>  - running modprobe test_vmalloc pushes the figures up to sometimes
> as high as ~14000, but they drop down to ~560 after the test ends.
> I'm not sure where the extra sixty pages are from, but running the
> test repeately doesn't cause the number to keep growing, so I don't
> think we're leaking.
>  - with vmap_stack, spawning tasks pushes the figure up to ~4200, then
> some clearing kicks in and drops it down to previous levels again.
> ---
>  mm/kasan/common.c | 26 ++
>  1 file changed, 26 insertions(+)
>
> diff --git a/mm/kasan/common.c b/mm/kasan/common.c
> index e33cbab83309..e40854512417 100644
> --- a/mm/kasan/common.c
> +++ b/mm/kasan/common.c
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/debugfs.h>
>
>  #include 
>
> @@ -750,6 +751,8 @@ core_initcall(kasan_memhotplug_init);
>  #endif
>
>  #ifdef CONFIG_KASAN_VMALLOC
> +static u64 vmalloc_shadow_pages;
> +
>  static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
>   void *unused)
>  {
> @@ -776,6 +779,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, 
> unsigned long addr,
> if (likely(pte_none(*ptep))) {
> set_pte_at(&init_mm, addr, ptep, pte);
> page = 0;
> +   vmalloc_shadow_pages++;
> }
> spin_unlock(&init_mm.page_table_lock);
> if (page)
> @@ -829,6 +833,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, 
> unsigned long addr,
> if (likely(!pte_none(*ptep))) {
> pte_clear(&init_mm, addr, ptep);
> free_page(page);
> +   vmalloc_shadow_pages--;
> }
> spin_unlock(&init_mm.page_table_lock);
>
> @@ -947,4 +952,25 @@ void kasan_release_vmalloc(unsigned long start, unsigned 
> long end,
>(unsigned long)shadow_end);
> }
>  }
> +
> +static __init int kasan_init_vmalloc_debugfs(void)
> +{
> +   struct dentry *root, *count;
> +
> +   root = debugfs_create_dir("kasan_vmalloc", NULL);
> +   if (IS_ERR(root)) {
> +   if (PTR_ERR(root) == -ENODEV)
> +   return 0;
> +   return PTR_ERR(root);
> +   }
> +
> +   count = debugfs_create_u64("shadow_pages", 0444, root,
> +  &vmalloc_shadow_pages);
> +
> +   if (IS_ERR(count))
> +   return PTR_ERR(root);
> +
> +   return 0;
> +}
> +late_initcall(kasan_init_vmalloc_debugfs);
>  #endif
> --
> 2.20.1
>
> --
> You received this message because you are subscribed to the Google Groups 
> "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to kasan-dev+unsubscr...@googlegroups.com.
> To view this discussion on the web visit 
> https://groups.google.com/d/msgid/kasan-dev/20190903145536.3390-6-dja%40axtens.net.


[PATCH v7 5/5] kasan debug: track pages allocated for vmalloc shadow

2019-09-03 Thread Daniel Axtens
Provide the current number of vmalloc shadow pages in
/sys/kernel/debug/kasan_vmalloc/shadow_pages.

Signed-off-by: Daniel Axtens 

---

Merging this is probably overkill, but I leave it to the discretion
of the broader community.

On v4 (no dynamic freeing), I saw the following approximate figures
on my test VM:

 - fresh boot: 720
 - after test_vmalloc: ~14000

With v5 (lazy dynamic freeing):

 - boot: ~490-500
 - running modprobe test_vmalloc pushes the figures up to sometimes
as high as ~14000, but they drop down to ~560 after the test ends.
I'm not sure where the extra sixty pages are from, but running the
test repeately doesn't cause the number to keep growing, so I don't
think we're leaking.
 - with vmap_stack, spawning tasks pushes the figure up to ~4200, then
some clearing kicks in and drops it down to previous levels again.
---
 mm/kasan/common.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index e33cbab83309..e40854512417 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include <linux/debugfs.h>
 
 #include 
 
@@ -750,6 +751,8 @@ core_initcall(kasan_memhotplug_init);
 #endif
 
 #ifdef CONFIG_KASAN_VMALLOC
+static u64 vmalloc_shadow_pages;
+
 static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  void *unused)
 {
@@ -776,6 +779,7 @@ static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned 
long addr,
if (likely(pte_none(*ptep))) {
	set_pte_at(&init_mm, addr, ptep, pte);
page = 0;
+   vmalloc_shadow_pages++;
}
	spin_unlock(&init_mm.page_table_lock);
if (page)
@@ -829,6 +833,7 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, 
unsigned long addr,
if (likely(!pte_none(*ptep))) {
	pte_clear(&init_mm, addr, ptep);
free_page(page);
+   vmalloc_shadow_pages--;
}
	spin_unlock(&init_mm.page_table_lock);
 
@@ -947,4 +952,25 @@ void kasan_release_vmalloc(unsigned long start, unsigned 
long end,
   (unsigned long)shadow_end);
}
 }
+
+static __init int kasan_init_vmalloc_debugfs(void)
+{
+   struct dentry *root, *count;
+
+   root = debugfs_create_dir("kasan_vmalloc", NULL);
+   if (IS_ERR(root)) {
+   if (PTR_ERR(root) == -ENODEV)
+   return 0;
+   return PTR_ERR(root);
+   }
+
+   count = debugfs_create_u64("shadow_pages", 0444, root,
+  &vmalloc_shadow_pages);
+
+   if (IS_ERR(count))
+   return PTR_ERR(root);
+
+   return 0;
+}
+late_initcall(kasan_init_vmalloc_debugfs);
 #endif
-- 
2.20.1



[PATCH v7 4/5] x86/kasan: support KASAN_VMALLOC

2019-09-03 Thread Daniel Axtens
In the case where KASAN directly allocates memory to back vmalloc
space, don't map the early shadow page over it.

We prepopulate pgds/p4ds for the range that would otherwise be empty.
This is required to get it synced to hardware on boot, allowing the
lower levels of the page tables to be filled dynamically.

Acked-by: Dmitry Vyukov 
Signed-off-by: Daniel Axtens 

---
v5: fix some checkpatch CHECK warnings. There are some that remain
around lines ending with '(': I have not changed these because
it's consistent with the rest of the file and it's not easy to
see how to fix it without creating an overlong line or lots of
temporary variables.

v2: move from faulting in shadow pgds to prepopulating
---
 arch/x86/Kconfig|  1 +
 arch/x86/mm/kasan_init_64.c | 60 +
 2 files changed, 61 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2502f7f60c9c..300b4766ccfa 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -134,6 +134,7 @@ config X86
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_JUMP_LABEL_RELATIVE
select HAVE_ARCH_KASAN  if X86_64
+   select HAVE_ARCH_KASAN_VMALLOC  if X86_64
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS  if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if MMU && COMPAT
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 296da58f3013..8f00f462709e 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -245,6 +245,51 @@ static void __init kasan_map_early_shadow(pgd_t *pgd)
} while (pgd++, addr = next, addr != end);
 }
 
+static void __init kasan_shallow_populate_p4ds(pgd_t *pgd,
+  unsigned long addr,
+  unsigned long end,
+  int nid)
+{
+   p4d_t *p4d;
+   unsigned long next;
+   void *p;
+
+   p4d = p4d_offset(pgd, addr);
+   do {
+   next = p4d_addr_end(addr, end);
+
+   if (p4d_none(*p4d)) {
+   p = early_alloc(PAGE_SIZE, nid, true);
+   p4d_populate(&init_mm, p4d, p);
+   }
+   } while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_shallow_populate_pgds(void *start, void *end)
+{
+   unsigned long addr, next;
+   pgd_t *pgd;
+   void *p;
+   int nid = early_pfn_to_nid((unsigned long)start);
+
+   addr = (unsigned long)start;
+   pgd = pgd_offset_k(addr);
+   do {
+   next = pgd_addr_end(addr, (unsigned long)end);
+
+   if (pgd_none(*pgd)) {
+   p = early_alloc(PAGE_SIZE, nid, true);
+   pgd_populate(&init_mm, pgd, p);
+   }
+
+   /*
+* we need to populate p4ds to be synced when running in
+* four level mode - see sync_global_pgds_l4()
+*/
+   kasan_shallow_populate_p4ds(pgd, addr, next, nid);
+   } while (pgd++, addr = next, addr != (unsigned long)end);
+}
+
 #ifdef CONFIG_KASAN_INLINE
 static int kasan_die_handler(struct notifier_block *self,
 unsigned long val,
@@ -352,9 +397,24 @@ void __init kasan_init(void)
shadow_cpu_entry_end = (void *)round_up(
(unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
 
+   /*
+* If we're in full vmalloc mode, don't back vmalloc space with early
+* shadow pages. Instead, prepopulate pgds/p4ds so they are synced to
+* the global table and we can populate the lower levels on demand.
+*/
+#ifdef CONFIG_KASAN_VMALLOC
+   kasan_shallow_populate_pgds(
+   kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+   kasan_mem_to_shadow((void *)VMALLOC_END));
+
+   kasan_populate_early_shadow(
+   kasan_mem_to_shadow((void *)VMALLOC_END + 1),
+   shadow_cpu_entry_begin);
+#else
kasan_populate_early_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
shadow_cpu_entry_begin);
+#endif
 
kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
  (unsigned long)shadow_cpu_entry_end, 0);
-- 
2.20.1



[PATCH v7 3/5] fork: support VMAP_STACK with KASAN_VMALLOC

2019-09-03 Thread Daniel Axtens
Supporting VMAP_STACK with KASAN_VMALLOC is straightforward:

 - clear the shadow region of vmapped stacks when swapping them in
 - tweak Kconfig to allow VMAP_STACK to be turned on with KASAN

Reviewed-by: Dmitry Vyukov 
Signed-off-by: Daniel Axtens 
---
 arch/Kconfig  | 9 +
 kernel/fork.c | 4 
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 6728c5fa057e..e15f1486682a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -843,16 +843,17 @@ config HAVE_ARCH_VMAP_STACK
 config VMAP_STACK
default y
bool "Use a virtually-mapped stack"
-   depends on HAVE_ARCH_VMAP_STACK && !KASAN
+   depends on HAVE_ARCH_VMAP_STACK
+   depends on !KASAN || KASAN_VMALLOC
---help---
  Enable this if you want the use virtually-mapped kernel stacks
  with guard pages.  This causes kernel stack overflows to be
  caught immediately rather than causing difficult-to-diagnose
  corruption.
 
- This is presently incompatible with KASAN because KASAN expects
- the stack to map directly to the KASAN shadow map using a formula
- that is incorrect if the stack is in vmalloc space.
+ To use this with KASAN, the architecture must support backing
+ virtual mappings with real shadow memory, and KASAN_VMALLOC must
+ be enabled.
 
 config ARCH_OPTIONAL_KERNEL_RWX
def_bool n
diff --git a/kernel/fork.c b/kernel/fork.c
index f601168f6b21..52279fd5e72d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -94,6 +94,7 @@
 #include 
 #include 
 #include 
+#include <linux/kasan.h>
 
 #include 
 #include 
@@ -229,6 +230,9 @@ static unsigned long *alloc_thread_stack_node(struct 
task_struct *tsk, int node)
if (!s)
continue;
 
+   /* Clear the KASAN shadow of the stack. */
+   kasan_unpoison_shadow(s->addr, THREAD_SIZE);
+
/* Clear stale pointers from reused stack. */
memset(s->addr, 0, THREAD_SIZE);
 
-- 
2.20.1



[PATCH v7 2/5] kasan: add test for vmalloc

2019-09-03 Thread Daniel Axtens
Test kasan vmalloc support by adding a new test to the module.

Signed-off-by: Daniel Axtens 

--

v5: split out per Christophe Leroy
---
 lib/test_kasan.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/lib/test_kasan.c b/lib/test_kasan.c
index 49cc4d570a40..328d33beae36 100644
--- a/lib/test_kasan.c
+++ b/lib/test_kasan.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <linux/vmalloc.h>
 
 #include 
 
@@ -748,6 +749,30 @@ static noinline void __init kmalloc_double_kzfree(void)
kzfree(ptr);
 }
 
+#ifdef CONFIG_KASAN_VMALLOC
+static noinline void __init vmalloc_oob(void)
+{
+   void *area;
+
+   pr_info("vmalloc out-of-bounds\n");
+
+   /*
+* We have to be careful not to hit the guard page.
+* The MMU will catch that and crash us.
+*/
+   area = vmalloc(3000);
+   if (!area) {
+   pr_err("Allocation failed\n");
+   return;
+   }
+
+   ((volatile char *)area)[3100];
+   vfree(area);
+}
+#else
+static void __init vmalloc_oob(void) {}
+#endif
+
 static int __init kmalloc_tests_init(void)
 {
/*
@@ -793,6 +818,7 @@ static int __init kmalloc_tests_init(void)
kasan_strings();
kasan_bitops();
kmalloc_double_kzfree();
+   vmalloc_oob();
 
kasan_restore_multi_shot(multishot);
 
-- 
2.20.1



[PATCH v7 1/5] kasan: support backing vmalloc space with real shadow memory

2019-09-03 Thread Daniel Axtens
Hook into vmalloc and vmap, and dynamically allocate real shadow
memory to back the mappings.

Most mappings in vmalloc space are small, requiring less than a full
page of shadow space. Allocating a full shadow page per mapping would
therefore be wasteful. Furthermore, to ensure that different mappings
use different shadow pages, mappings would have to be aligned to
KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE.

Instead, share backing space across multiple mappings. Allocate a
backing page when a mapping in vmalloc space uses a particular page of
the shadow region. This page can be shared by other vmalloc mappings
later on.

We hook in to the vmap infrastructure to lazily clean up unused shadow
memory.

To avoid the difficulties around swapping mappings around, this code
expects that the part of the shadow region that covers the vmalloc
space will not be covered by the early shadow page, but will be left
unmapped. This will require changes in arch-specific code.

This allows KASAN with VMAP_STACK, and may be helpful for architectures
that do not have a separate module space (e.g. powerpc64, which I am
currently working on). It also allows relaxing the module alignment
back to PAGE_SIZE.
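
In rough outline (condensed for illustration; the actual patch differs in
details such as the allocation helper), the populate side walks the shadow
range with an apply_to_page_range() callback and plugs in zeroed pages
wherever no shadow PTE exists yet:

	static int populate_shadow_pte(pte_t *ptep, unsigned long addr,
				       void *unused)
	{
		struct page *page;
		pte_t pte;

		if (likely(!pte_none(*ptep)))
			return 0;		/* already backed; share it */

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return -ENOMEM;

		pte = pfn_pte(page_to_pfn(page), PAGE_KERNEL);

		spin_lock(&init_mm.page_table_lock);
		if (likely(pte_none(*ptep)))
			set_pte_at(&init_mm, addr, ptep, pte);
		else
			__free_page(page);	/* lost a populate race */
		spin_unlock(&init_mm.page_table_lock);

		return 0;
	}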

Link: https://bugzilla.kernel.org/show_bug.cgi?id=202009
Acked-by: Vasily Gorbik 
Signed-off-by: Daniel Axtens 
[Mark: rework shadow allocation]
Signed-off-by: Mark Rutland 

--

v2: let kasan_unpoison_shadow deal with ranges that do not use a
full shadow byte.

v3: relax module alignment
rename to kasan_populate_vmalloc which is a much better name
deal with concurrency correctly

v4: Mark's rework
Poison pages on vfree
Handle allocation failures

v5: Per Christophe Leroy, split out test and dynamically free pages.

v6: Guard freeing page properly. Drop WARN_ON_ONCE(pte_none(*ptep)),
 on reflection it's unnecessary debugging cruft with too high a
 false positive rate.

v7: tlb flush, thanks Mark.
explain more clearly how freeing works and is concurrency-safe.
---
 Documentation/dev-tools/kasan.rst |  63 +
 include/linux/kasan.h |  31 +
 include/linux/moduleloader.h  |   2 +-
 include/linux/vmalloc.h   |  12 ++
 lib/Kconfig.kasan |  16 +++
 mm/kasan/common.c | 204 ++
 mm/kasan/generic_report.c |   3 +
 mm/kasan/kasan.h  |   1 +
 mm/vmalloc.c  |  45 ++-
 9 files changed, 375 insertions(+), 2 deletions(-)

diff --git a/Documentation/dev-tools/kasan.rst 
b/Documentation/dev-tools/kasan.rst
index b72d07d70239..bdb92c3de7a5 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -215,3 +215,66 @@ brk handler is used to print bug reports.
 A potential expansion of this mode is a hardware tag-based mode, which would
 use hardware memory tagging support instead of compiler instrumentation and
 manual shadow memory manipulation.
+
+What memory accesses are sanitised by KASAN?
+--------------------------------------------
+
+The kernel maps memory in a number of different parts of the address
+space. This poses something of a problem for KASAN, which requires
+that all addresses accessed by instrumented code have a valid shadow
+region.
+
+The range of kernel virtual addresses is large: there is not enough
+real memory to support a real shadow region for every address that
+could be accessed by the kernel.
+
+By default
+~~~~~~~~~~
+
+By default, architectures only map real memory over the shadow region
+for the linear mapping (and potentially other small areas). For all
+other areas - such as vmalloc and vmemmap space - a single read-only
+page is mapped over the shadow area. This read-only shadow page
+declares all memory accesses as permitted.
+
+This presents a problem for modules: they do not live in the linear
+mapping, but in a dedicated module space. By hooking in to the module
+allocator, KASAN can temporarily map real shadow memory to cover
+them. This allows detection of invalid accesses to module globals, for
+example.
+
+This also creates an incompatibility with ``VMAP_STACK``: if the stack
+lives in vmalloc space, it will be shadowed by the read-only page, and
+the kernel will fault when trying to set up the shadow data for stack
+variables.
+
+CONFIG_KASAN_VMALLOC
+~~~~~~~~~~~~~~~~~~~~
+
+With ``CONFIG_KASAN_VMALLOC``, KASAN can cover vmalloc space at the
+cost of greater memory usage. Currently this is only supported on x86.
+
+This works by hooking into vmalloc and vmap, and dynamically
+allocating real shadow memory to back the mappings.
+
+Most mappings in vmalloc space are small, requiring less than a full
+page of shadow space. Allocating a full shadow page per mapping would
+therefore be wasteful. Furthermore, to ensure that different mappings
+use different shadow pages, mappings would have to be aligned to
+``KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE``.
+
+Instead, we share backing space across multiple mappings. We allocate
+a backing page when a mapping in vmalloc space uses a particular page
+of the shadow region. This page can be shared by other vmalloc
+mappings later on.

[PATCH v7 0/5] kasan: support backing vmalloc space with real shadow memory

2019-09-03 Thread Daniel Axtens
Currently, vmalloc space is backed by the early shadow page. This
means that kasan is incompatible with VMAP_STACK.

This series provides a mechanism to back vmalloc space with real,
dynamically allocated memory. I have only wired up x86, because that's
the only currently supported arch I can work with easily, but it's
very easy to wire up other architectures, and it appears that there is
some work-in-progress code to do this on arm64 and s390.

This has been discussed before in the context of VMAP_STACK:
 - https://bugzilla.kernel.org/show_bug.cgi?id=202009
 - https://lkml.org/lkml/2018/7/22/198
 - https://lkml.org/lkml/2019/7/19/822

In terms of implementation details:

Most mappings in vmalloc space are small, requiring less than a full
page of shadow space. Allocating a full shadow page per mapping would
therefore be wasteful. Furthermore, to ensure that different mappings
use different shadow pages, mappings would have to be aligned to
KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE.

Instead, share backing space across multiple mappings. Allocate a
backing page when a mapping in vmalloc space uses a particular page of
the shadow region. This page can be shared by other vmalloc mappings
later on.

We hook in to the vmap infrastructure to lazily clean up unused shadow
memory.


v1: https://lore.kernel.org/linux-mm/20190725055503.19507-1-...@axtens.net/
v2: https://lore.kernel.org/linux-mm/20190729142108.23343-1-...@axtens.net/
 Address review comments:
 - Patch 1: use kasan_unpoison_shadow's built-in handling of
ranges that do not align to a full shadow byte
 - Patch 3: prepopulate pgds rather than faulting things in
v3: https://lore.kernel.org/linux-mm/20190731071550.31814-1-...@axtens.net/
 Address comments from Mark Rutland:
 - kasan_populate_vmalloc is a better name
 - handle concurrency correctly
 - various nits and cleanups
 - relax module alignment in KASAN_VMALLOC case
v4: https://lore.kernel.org/linux-mm/20190815001636.12235-1-...@axtens.net/
 Changes to patch 1 only:
 - Integrate Mark's rework, thanks Mark!
 - handle the case where kasan_populate_shadow might fail
 - poison shadow on free, allowing the alloc path to just
 unpoison memory that it uses
v5: https://lore.kernel.org/linux-mm/20190830003821.10737-1-...@axtens.net/
 Address comments from Christophe Leroy:
 - Fix some issues with my descriptions in commit messages and docs
 - Dynamically free unused shadow pages by hooking into the vmap book-keeping
 - Split out the test into a separate patch
 - Optional patch to track the number of pages allocated
 - minor checkpatch cleanups
v6: https://lore.kernel.org/linux-mm/20190902112028.23773-1-...@axtens.net/
 Properly guard freeing pages in patch 1, drop debugging code.
v7: Add a TLB flush on freeing, thanks Mark Rutland.
Explain more clearly how I think freeing is concurrency-safe.

Daniel Axtens (5):
  kasan: support backing vmalloc space with real shadow memory
  kasan: add test for vmalloc
  fork: support VMAP_STACK with KASAN_VMALLOC
  x86/kasan: support KASAN_VMALLOC
  kasan debug: track pages allocated for vmalloc shadow

 Documentation/dev-tools/kasan.rst |  63 
 arch/Kconfig  |   9 +-
 arch/x86/Kconfig  |   1 +
 arch/x86/mm/kasan_init_64.c   |  60 
 include/linux/kasan.h |  31 
 include/linux/moduleloader.h  |   2 +-
 include/linux/vmalloc.h   |  12 ++
 kernel/fork.c |   4 +
 lib/Kconfig.kasan |  16 +++
 lib/test_kasan.c  |  26 
 mm/kasan/common.c | 230 ++
 mm/kasan/generic_report.c |   3 +
 mm/kasan/kasan.h  |   1 +
 mm/vmalloc.c  |  45 +-
 14 files changed, 497 insertions(+), 6 deletions(-)

-- 
2.20.1



Re: [PATCH v2 2/9] x86: numa: check the node id consistently for x86

2019-09-03 Thread Peter Zijlstra
On Tue, Sep 03, 2019 at 12:15:24PM +, Salil Mehta wrote:
> > From: Linuxarm [mailto:linuxarm-boun...@huawei.com] On Behalf Of Peter 
> > Zijlstra
> > On Tue, Sep 03, 2019 at 02:19:04PM +0800, Yunsheng Lin wrote:

> > > Is it possible that the node id set by device_add() becomes invalid
> > > if the node is offlined, then dev_to_node() may return an invalid
> > > node id.
> > 
> > In that case I would expect the device to go away too. Once the memory
> > controller goes away, the PCI bus connected to it cannot continue to
> > function.
> 
> I am not sure if this is *exactly* true on our system as NUMA nodes are
> part of the SoCs and devices could still be used even if all the memory
> and CPUs part of the node are turned off. Although, it is highly unlikely
> anybody would do that (maybe could be debated for the Power Management case?)

Cute; anyway, we never change nr_node_ids (after boot), so once a node
is deemed valid it always is.

The worst that can happen in the above case, is that cpumask_of_node()
returns an empty mask, which, if all CPUs (of said node) are offline, is
an accurate representation.
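
So a caller that cares can always guard against it, something like
(sketch):

	/* Fall back to all online CPUs if the device's node has none left. */
	const struct cpumask *mask = cpumask_of_node(dev_to_node(dev));

	if (cpumask_empty(mask))
		mask = cpu_online_mask;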



Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Christophe Leroy




Le 03/09/2019 à 15:04, Segher Boessenkool a écrit :

Hi!

On Tue, Sep 03, 2019 at 03:23:57PM +1000, Alastair D'Silva wrote:

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c



+#if !defined(CONFIG_PPC_8xx) & !defined(CONFIG_PPC64)


Please write that as &&?  That is more usual, and thus, easier to read.


+static void flush_dcache_icache_phys(unsigned long physaddr)



+   asm volatile(
+   "   mtctr %2;"
+   "   mtmsr %3;"
+   "   isync;"
+   "0: dcbst   0, %0;"
+   "   addi%0, %0, %4;"
+   "   bdnz0b;"
+   "   sync;"
+   "   mtctr %2;"
+   "1: icbi0, %1;"
+   "   addi%1, %1, %4;"
+   "   bdnz1b;"
+   "   sync;"
+   "   mtmsr %5;"
+   "   isync;"
+   : "+r" (loop1), "+r" (loop2)
+   : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
+   : "ctr", "memory");


This outputs as one huge assembler statement, all on one line.  That's
going to be fun to read or debug.


Do you mean \n has to be added after the ; ?



loop1 and/or loop2 can be assigned the same register as msr0 or nb.  They
need to be made earlyclobbers.  (msr is fine, all of its reads are before
any writes to loop1 or loop2; and bytes is fine, it's not a register).


Can you be more explicit please? Doesn't '+r' mean that they are input and
output at the same time?


"to be made earlyclobbers", what does this means exactly ? How to do that ?

Christophe


Re: [PATCH v6 3/3] soc: fsl: add RCPM driver

2019-09-03 Thread Pavel Machek
Hi!

> > +   /* Begin with first registered wakeup source */
> > +   ws = wakeup_source_get_start();
> 
> Since I have made some changes in this version, could you please take a look at
> this.
> If it's OK with you, I would re-add 'Acked-by: Pavel Machek  '

I'm sorry, I'm a bit busy at the moment and this is not really my
area.

Best regards,
Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH 06/14] powerpc/eeh: Remove stale CAPI comment

2019-09-03 Thread Andrew Donnellan

On 3/9/19 8:15 pm, Oliver O'Halloran wrote:

Support for switching CAPI cards into and out of CAPI mode was removed a
while ago. Drop the comment since it's no longer relevant.

Cc: Andrew Donnellan 
Signed-off-by: Oliver O'Halloran 


Oliver looked... unimpressed with the hackiness of the design of our 
mode-switching as he yelled at me to come explain this comment.


Acked-by: Andrew Donnellan 


---
  arch/powerpc/platforms/powernv/eeh-powernv.c | 7 ---
  1 file changed, 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index e7b867912f24..94e26d56ecd2 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1125,13 +1125,6 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
  
-	/*

-* If dealing with the root bus (or the bus underneath the
-* root port), we reset the bus underneath the root port.
-*
-* The cxl driver depends on this behaviour for bi-modal card
-* switching.
-*/
if (pci_is_root_bus(bus) ||
pci_is_root_bus(bus->parent))
return pnv_eeh_root_reset(hose, option);



--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited



Re: [PATCH] powerpc/powernv: fix a W=1 compilation warning

2019-09-03 Thread Qian Cai
On Tue, 2019-09-03 at 15:30 +0200, Christoph Hellwig wrote:
> On Tue, Sep 03, 2019 at 09:29:14AM -0400, Qian Cai wrote:
> > I saw Christoph start to remove npu-dma.c code [1]
> > 
> > [1] https://lore.kernel.org/linuxppc-dev/20190625145239.2759-4-...@lst.de/
> > 
> > Should pnv_npu_dma_set_32() be removed too?
> > 
> > It was only called by pnv_npu_try_dma_set_bypass() but the latter is not used
> > anywhere in the kernel tree. If that is the case, I don't need to bother
> > fixing
> > the warning here.
> 
> Yes, pnv_npu_try_dma_set_bypass and pnv_npu_dma_set_32 should go away
> as well as they are unused.  Do you want to send a patch or should I
> prepare one?

It would be nice if you could prepare one since it probably could be part of
your previous attempt.


Re: [PATCH] powerpc/powernv: fix a W=1 compilation warning

2019-09-03 Thread Christoph Hellwig
On Tue, Sep 03, 2019 at 09:29:14AM -0400, Qian Cai wrote:
> I saw Christoph start to remove npu-dma.c code [1]
> 
> [1] https://lore.kernel.org/linuxppc-dev/20190625145239.2759-4-...@lst.de/
> 
> Should pnv_npu_dma_set_32() be removed too?
> 
> It was only called by pnv_npu_try_dma_set_bypass() but the latter is not used
> anywhere in the kernel tree. If that is the case, I don't need to bother fixing
> the warning here.

Yes, pnv_npu_try_dma_set_bypass and pnv_npu_dma_set_32 should go away
as well as they are unused.  Do you want to send a patch or should I
prepare one?


Re: [PATCH] powerpc/powernv: fix a W=1 compilation warning

2019-09-03 Thread Qian Cai
I saw Christoph start to remove npu-dma.c code [1]

[1] https://lore.kernel.org/linuxppc-dev/20190625145239.2759-4-...@lst.de/

Should pnv_npu_dma_set_32() be removed too?

It was only called by pnv_npu_try_dma_set_bypass() but the latter is not used
anywhere in the kernel tree. If that is the case, I don't need to bother fixing
the warning here.

On Wed, 2019-05-22 at 12:09 -0400, Qian Cai wrote:
> The commit b575c731fe58 ("powerpc/powernv/npu: Add set/unset window
> helpers") called pnv_npu_set_window() in a void function
> pnv_npu_dma_set_32(), but the return code from pnv_npu_set_window() has
> no use there as all the error logging happen in pnv_npu_set_window(),
> so just remove the unused variable to avoid a compilation warning,
> 
> arch/powerpc/platforms/powernv/npu-dma.c: In function
> 'pnv_npu_dma_set_32':
> arch/powerpc/platforms/powernv/npu-dma.c:198:10: warning: variable ‘rc’
> set but not used [-Wunused-but-set-variable]
> 
> Signed-off-by: Qian Cai 
> ---
>  arch/powerpc/platforms/powernv/npu-dma.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/npu-dma.c
> b/arch/powerpc/platforms/powernv/npu-dma.c
> index 495550432f3d..035208ed591f 100644
> --- a/arch/powerpc/platforms/powernv/npu-dma.c
> +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> @@ -195,7 +195,6 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
>  {
>   struct pci_dev *gpdev;
>   struct pnv_ioda_pe *gpe;
> - int64_t rc;
>  
>   /*
>    * Find the assoicated PCI devices and get the dma window
> @@ -208,8 +207,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
>   if (!gpe)
>   return;
>  
> - rc = pnv_npu_set_window(&npe->table_group, 0,
> - gpe->table_group.tables[0]);
> + pnv_npu_set_window(&npe->table_group, 0,
> +    gpe->table_group.tables[0]);
>  
>   /*
>    * NVLink devices use the same TCE table configuration as


Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Segher Boessenkool
Hi!

On Tue, Sep 03, 2019 at 03:23:57PM +1000, Alastair D'Silva wrote:
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c

> +#if !defined(CONFIG_PPC_8xx) & !defined(CONFIG_PPC64)

Please write that as &&?  That is more usual, and thus, easier to read.

> +static void flush_dcache_icache_phys(unsigned long physaddr)

> + asm volatile(
> + "   mtctr %2;"
> + "   mtmsr %3;"
> + "   isync;"
> + "0: dcbst   0, %0;"
> + "   addi%0, %0, %4;"
> + "   bdnz0b;"
> + "   sync;"
> + "   mtctr %2;"
> + "1: icbi0, %1;"
> + "   addi%1, %1, %4;"
> + "   bdnz1b;"
> + "   sync;"
> + "   mtmsr %5;"
> + "   isync;"
> + : "+r" (loop1), "+r" (loop2)
> + : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
> + : "ctr", "memory");

This outputs as one huge assembler statement, all on one line.  That's
going to be fun to read or debug.

loop1 and/or loop2 can be assigned the same register as msr0 or nb.  They
need to be made earlyclobbers.  (msr is fine, all of its reads are before
any writes to loop1 or loop2; and bytes is fine, it's not a register).
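
Something like this should address both points (untested sketch; the
"\n"s give each instruction its own line in the generated assembly, and
the "&"s mark loop1/loop2 earlyclobber so they cannot share a register
with nb or msr0):

	asm volatile(
		"	mtctr	%2\n"
		"	mtmsr	%3\n"
		"	isync\n"
		"0:	dcbst	0, %0\n"
		"	addi	%0, %0, %4\n"
		"	bdnz	0b\n"
		"	sync\n"
		"	mtctr	%2\n"
		"1:	icbi	0, %1\n"
		"	addi	%1, %1, %4\n"
		"	bdnz	1b\n"
		"	sync\n"
		"	mtmsr	%5\n"
		"	isync\n"
		: "+&r" (loop1), "+&r" (loop2)
		: "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
		: "ctr", "memory");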


Segher


[RFC PATCH 2/3] powerpc/mm/book3s64/radix: Use freed_tables instead of need_flush_all

2019-09-03 Thread Aneesh Kumar K.V
With commit: 22a61c3c4f13 ("asm-generic/tlb: Track freeing of page-table
directories in struct mmu_gather") we now track whether we freed page
table in mmu_gather. Use that to decide whether to flush Page Walk Cache.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgalloc.h  | 15 ---
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 15 ---
 arch/powerpc/mm/book3s64/radix_tlb.c  | 11 +++
 3 files changed, 3 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h 
b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index d5a44912902f..f6968c811026 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -122,11 +122,6 @@ static inline void pud_populate(struct mm_struct *mm, 
pud_t *pud, pmd_t *pmd)
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
  unsigned long address)
 {
-   /*
-* By now all the pud entries should be none entries. So go
-* ahead and flush the page walk cache
-*/
-   flush_tlb_pgtable(tlb, address);
pgtable_free_tlb(tlb, pud, PUD_INDEX);
 }
 
@@ -143,11 +138,6 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t 
*pmd)
 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
  unsigned long address)
 {
-   /*
-* By now all the pud entries should be none entries. So go
-* ahead and flush the page walk cache
-*/
-   flush_tlb_pgtable(tlb, address);
return pgtable_free_tlb(tlb, pmd, PMD_INDEX);
 }
 
@@ -166,11 +156,6 @@ static inline void pmd_populate(struct mm_struct *mm, 
pmd_t *pmd,
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
  unsigned long address)
 {
-   /*
-* By now all the pud entries should be none entries. So go
-* ahead and flush the page walk cache
-*/
-   flush_tlb_pgtable(tlb, address);
pgtable_free_tlb(tlb, table, PTE_INDEX);
 }
 
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index ebf572ea621e..250399fa2459 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -147,19 +147,4 @@ static inline void flush_tlb_fix_spurious_fault(struct 
vm_area_struct *vma,
flush_tlb_page(vma, address);
 }
 
-/*
- * flush the page walk cache for the address
- */
-static inline void flush_tlb_pgtable(struct mmu_gather *tlb, unsigned long 
address)
-{
-   /*
-* Flush the page table walk cache on freeing a page table. We already
-* have marked the upper/higher level page table entry none by now.
-* So it is safe to flush PWC here.
-*/
-   if (!radix_enabled())
-   return;
-
-   radix__flush_tlb_pwc(tlb, address);
-}
 #endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_H */
diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 059fef601eb9..dc395152e973 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -616,18 +616,13 @@ static void __flush_all_mm(struct mm_struct *mm, bool 
fullmm)
}
preempt_enable();
 }
+
 void radix__flush_all_mm(struct mm_struct *mm)
 {
__flush_all_mm(mm, false);
 }
 EXPORT_SYMBOL(radix__flush_all_mm);
 
-void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
-{
-   tlb->need_flush_all = 1;
-}
-EXPORT_SYMBOL(radix__flush_tlb_pwc);
-
 void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
 int psize)
 {
@@ -866,12 +861,12 @@ void radix__tlb_flush(struct mmu_gather *tlb)
if (tlb->fullmm) {
__flush_all_mm(mm, true);
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
-   if (!tlb->need_flush_all)
+   if (!tlb->freed_tables)
radix__flush_tlb_mm(mm);
else
radix__flush_all_mm(mm);
} else {
-   if (!tlb->need_flush_all)
+   if (!tlb->freed_tables)
radix__flush_tlb_range_psize(mm, start, end, psize);
else
radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
-- 
2.21.0



[RFC PATCH 3/3] powerpc/mm/book3s64/radix: Flush the full mm even when need_flush_all is set

2019-09-03 Thread Aneesh Kumar K.V
With the previous patch, we should now not be using need_flush_all for powerpc.
But to be safe, make sure we force a full PID tlbie flush with RIC=2 if we ever
find need_flush_all set. Also don't reset it after a mmu gather flush.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 3 +--
 include/asm-generic/tlb.h| 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index dc395152e973..cfa708c99313 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -858,7 +858,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 * that flushes the process table entry cache upon process teardown.
 * See the comment for radix in arch_exit_mmap().
 */
-   if (tlb->fullmm) {
+   if (tlb->fullmm || tlb->need_flush_all) {
__flush_all_mm(mm, true);
} else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
if (!tlb->freed_tables)
@@ -871,7 +871,6 @@ void radix__tlb_flush(struct mmu_gather *tlb)
else
radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
}
-   tlb->need_flush_all = 0;
 }
 
 static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct 
*mm,
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 04c0644006fd..e64991142a8b 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -428,7 +428,7 @@ static inline void tlb_change_page_size(struct mmu_gather 
*tlb,
 {
 #ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
if (tlb->page_size && tlb->page_size != page_size) {
-   if (!tlb->fullmm)
+   if (!tlb->fullmm && !tlb->need_flush_all)
tlb_flush_mmu(tlb);
}
 
-- 
2.21.0



[RFC PATCH 1/3] mm/powerpc/book3s64/radix: Remove unused code.

2019-09-03 Thread Aneesh Kumar K.V
mm_tlb_flush_nested change was added in the mmu gather tlb flush to handle
the case of parallel pte invalidate happening with mmap_sem held in read
mode. This fix was done by commit: 02390f66bd23 ("powerpc/64s/radix: Fix
MADV_[FREE|DONTNEED] TLB flush miss problem with THP") and the problem is
explained in detail in commit: 99baac21e458 ("mm: fix MADV_[FREE|DONTNEED] TLB
flush miss problem")

This was later updated by commit: 7a30df49f63a ("mm: mmu_gather: remove
__tlb_reset_range() for force flush") to do a full mm flush rather than
a range flush. By commit: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem
in munmap") we are also now allowing a page table free in mmap_sem read mode
which means we should do a PWC flush too. Our current full mm flush imply
a PWC flush.

With all the above change the mm_tlb_flush_nested(mm) branch in radix__tlb_flush
will never be taken because for the nested case we would have taken the
if (tlb->fullmm) branch. This patch removes the unused code. Also, remove the
gflush change in __radix__flush_tlb_range that was added to handle the range tlb
flush code. We only check for THP there because hugetlb is flushed via a
different code path where page size is explicitly specified.

This is a partial revert of commit: 02390f66bd23 ("powerpc/64s/radix: Fix
MADV_[FREE|DONTNEED] TLB flush miss problem with THP")

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 62 +++-
 1 file changed, 6 insertions(+), 56 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 71f7fede2fa4..059fef601eb9 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -692,8 +692,7 @@ static unsigned long tlb_single_page_flush_ceiling 
__read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
 static inline void __radix__flush_tlb_range(struct mm_struct *mm,
-   unsigned long start, unsigned long end,
-   bool flush_all_sizes)
+   unsigned long start, unsigned long 
end)
 
 {
unsigned long pid;
@@ -735,26 +734,16 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
_tlbie_pid(pid, RIC_FLUSH_TLB);
}
} else {
-   bool hflush = flush_all_sizes;
-   bool gflush = flush_all_sizes;
+   bool hflush = false;
unsigned long hstart, hend;
-   unsigned long gstart, gend;
 
-   if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
-   hflush = true;
-
-   if (hflush) {
+   if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
hstart = (start + PMD_SIZE - 1) & PMD_MASK;
hend = end & PMD_MASK;
if (hstart == hend)
hflush = false;
-   }
-
-   if (gflush) {
-   gstart = (start + PUD_SIZE - 1) & PUD_MASK;
-   gend = end & PUD_MASK;
-   if (gstart == gend)
-   gflush = false;
+   else
+   hflush = true;
}
 
asm volatile("ptesync": : :"memory");
@@ -763,18 +752,12 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm,
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
PMD_SIZE, MMU_PAGE_2M);
-   if (gflush)
-   __tlbiel_va_range(gstart, gend, pid,
-   PUD_SIZE, MMU_PAGE_1G);
asm volatile("ptesync": : :"memory");
} else {
__tlbie_va_range(start, end, pid, page_size, 
mmu_virtual_psize);
if (hflush)
__tlbie_va_range(hstart, hend, pid,
PMD_SIZE, MMU_PAGE_2M);
-   if (gflush)
-   __tlbie_va_range(gstart, gend, pid,
-   PUD_SIZE, MMU_PAGE_1G);
fixup_tlbie();
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
@@ -791,7 +774,7 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
return radix__flush_hugetlb_tlb_range(vma, start, end);
 #endif
 
-   __radix__flush_tlb_range(vma->vm_mm, start, end, false);
+   __radix__flush_tlb_range(vma->vm_mm, start, end);
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
@@ -882,39 +865,6 @@ void radix__tlb_flush(struct mmu_gather *tlb)

Re: [mainline][BUG][PPC][btrfs][bisected 00801a] kernel BUG at fs/btrfs/locking.c:71!

2019-09-03 Thread David Sterba
On Tue, Sep 03, 2019 at 02:25:07PM +0530, Abdul Haleem wrote:
> Greeting's
> 
> Mainline kernel panics with LTP/fs_fill-dir tests for btrfs file system on my 
> P9 box running mainline kernel 5.3.0-rc5
> 
> BUG_ON was first introduced by below commit

Well, technically the BUG_ON was there already; the only change is the
handling of the updates of the value.

> commit 00801ae4bb2be5f5af46502ef239ac5f4b536094
> Author: David Sterba 
> Date:   Thu May 2 16:53:47 2019 +0200
> 
> btrfs: switch extent_buffer write_locks from atomic to int
> 
> The write_locks is either 0 or 1 and always updated under the lock,
> so we don't need the atomic_t semantics.

Assuming the code was correct before the patch, if this got broken one
of the above does not hold anymore:

* 0/1 updates -- this can be verified in code that all the state
  transitions are valid, ie. initial 0, locked update to 1, locked
  update 1->0

* atomic_t -> int behaves differently and the changes of the value get
  mixed up, eg. on the instruction level where the intel architecture does
  'inc' while p9 does I-don't-know-what, a RMW update?

But even with a RMW, this should not matter due to
write_lock/write_unlock around all the updates.
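
If someone wants to verify the first point at runtime, the assert
helpers are easy to instrument (debugging sketch only, based on the
invariant above):

	static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
	{
		WARN_ON(eb->write_locks != 0);	/* only valid transition is 0 -> 1 */
		eb->write_locks++;
	}

	static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
	{
		WARN_ON(eb->write_locks != 1);	/* only valid transition is 1 -> 0 */
		eb->write_locks--;
	}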


[PATCH v2 2/2] powerpc/nvdimm: use H_SCM_QUERY hcall on H_OVERLAP error

2019-09-03 Thread Aneesh Kumar K.V
Right now we force an unbind of SCM memory at drc index on H_OVERLAP error.
This really slows down operations like kexec where we get the H_OVERLAP
error because we don't go through a full hypervisor re-init.

H_OVERLAP error for a H_SCM_BIND_MEM hcall indicates that SCM memory at
drc index is already bound. Since we don't specify a logical memory
address for bind hcall, we can use the H_SCM_QUERY hcall to query
the already bound logical address.

Boot time difference with and without patch is:

[5.583617] IOMMU table initialized, virtual merging enabled
[5.603041] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Retrying 
bind after unbinding
[  301.514221] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Retrying 
bind after unbinding
[  340.057238] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 
failures), 275 descs

after fix

[5.101572] IOMMU table initialized, virtual merging enabled
[5.116984] papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Querying 
SCM details
[5.117223] papr_scm ibm,persistent-memory:ibm,pmemory@44108001: Querying 
SCM details
[5.120530] hv-24x7: read 1530 catalog entries, created 537 event attrs (0 
failures), 275 descs

Signed-off-by: Aneesh Kumar K.V 
---
Changes from V1:
* Use the first block and last block to query the logical bind memory
* If we fail to query, unbind and retry the bind.


 arch/powerpc/platforms/pseries/papr_scm.c | 48 +++
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index 3bef4d298ac6..61883291defc 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -65,10 +65,8 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
cond_resched();
} while (rc == H_BUSY);
 
-   if (rc) {
-   dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
+   if (rc)
return rc;
-   }
 
p->bound_addr = saved;
dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index,
&p->res);
@@ -110,6 +108,42 @@ static void drc_pmem_unbind(struct papr_scm_priv *p)
return;
 }
 
+static int drc_pmem_query_n_bind(struct papr_scm_priv *p)
+{
+   unsigned long start_addr;
+   unsigned long end_addr;
+   unsigned long ret[PLPAR_HCALL_BUFSIZE];
+   int64_t rc;
+
+
+   rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+p->drc_index, 0);
+   if (rc)
+   goto err_out;
+   start_addr = ret[0];
+
+   /* Make sure the full region is bound. */
+   rc = plpar_hcall(H_SCM_QUERY_BLOCK_MEM_BINDING, ret,
+p->drc_index, p->blocks - 1);
+   if (rc)
+   goto err_out;
+   end_addr = ret[0];
+
+   if ((end_addr - start_addr) != ((p->blocks - 1) * p->block_size))
+   goto err_out;
+
+   p->bound_addr = start_addr;
+   dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index,
+   &p->res);
+   return rc;
+
+err_out:
+   dev_info(&p->pdev->dev,
+"Failed to query, trying an unbind followed by bind");
+   drc_pmem_unbind(p);
+   return drc_pmem_bind(p);
+}
+
+
 static int papr_scm_meta_get(struct papr_scm_priv *p,
 struct nd_cmd_get_config_data_hdr *hdr)
 {
@@ -430,13 +464,11 @@ static int papr_scm_probe(struct platform_device *pdev)
rc = drc_pmem_bind(p);
 
/* If phyp says drc memory still bound then force unbound and retry */
-   if (rc == H_OVERLAP) {
-   dev_warn(>dev, "Retrying bind after unbinding\n");
-   drc_pmem_unbind(p);
-   rc = drc_pmem_bind(p);
-   }
+   if (rc == H_OVERLAP)
+   rc = drc_pmem_query_n_bind(p);
 
if (rc != H_SUCCESS) {
+   dev_err(&p->pdev->dev, "bind err: %d\n", rc);
rc = -ENXIO;
goto err;
}
-- 
2.21.0



[PATCH v2 1/2] powerpc/nvdimm: Use HCALL error as the return value

2019-09-03 Thread Aneesh Kumar K.V
This simplifies the error handling and also enables us to switch to
H_SCM_QUERY hcall in a later patch on H_OVERLAP error.

We also do some kernel print formatting fixup in this patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 26 ++-
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index a5ac371a3f06..3bef4d298ac6 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -66,28 +66,22 @@ static int drc_pmem_bind(struct papr_scm_priv *p)
} while (rc == H_BUSY);
 
if (rc) {
-   /* H_OVERLAP needs a separate error path */
-   if (rc == H_OVERLAP)
-   return -EBUSY;
-
dev_err(&p->pdev->dev, "bind err: %lld\n", rc);
-   return -ENXIO;
+   return rc;
}
 
p->bound_addr = saved;
-
-   dev_dbg(&p->pdev->dev, "bound drc %x to %pR\n", p->drc_index, &p->res);
-
-   return 0;
+   dev_dbg(&p->pdev->dev, "bound drc 0x%x to %pR\n", p->drc_index,
&p->res);
+   return rc;
 }
 
-static int drc_pmem_unbind(struct papr_scm_priv *p)
+static void drc_pmem_unbind(struct papr_scm_priv *p)
 {
unsigned long ret[PLPAR_HCALL_BUFSIZE];
uint64_t token = 0;
int64_t rc;
 
-   dev_dbg(&p->pdev->dev, "unbind drc %x\n", p->drc_index);
+   dev_dbg(&p->pdev->dev, "unbind drc 0x%x\n", p->drc_index);
 
/* NB: unbind has the same retry requirements as drc_pmem_bind() */
do {
@@ -110,10 +104,10 @@ static int drc_pmem_unbind(struct papr_scm_priv *p)
if (rc)
dev_err(>pdev->dev, "unbind error: %lld\n", rc);
else
-   dev_dbg(&p->pdev->dev, "unbind drc %x complete\n",
+   dev_dbg(&p->pdev->dev, "unbind drc 0x%x complete\n",
p->drc_index);
 
-   return rc == H_SUCCESS ? 0 : -ENXIO;
+   return;
 }
 
 static int papr_scm_meta_get(struct papr_scm_priv *p,
@@ -436,14 +430,16 @@ static int papr_scm_probe(struct platform_device *pdev)
rc = drc_pmem_bind(p);
 
/* If phyp says drc memory still bound then force unbound and retry */
-   if (rc == -EBUSY) {
+   if (rc == H_OVERLAP) {
dev_warn(&pdev->dev, "Retrying bind after unbinding\n");
drc_pmem_unbind(p);
rc = drc_pmem_bind(p);
}
 
-   if (rc)
+   if (rc != H_SUCCESS) {
+   rc = -ENXIO;
goto err;
+   }
 
/* setup the resource for the newly bound range */
p->res.start = p->bound_addr;
-- 
2.21.0



RE: [PATCH v2 2/9] x86: numa: check the node id consistently for x86

2019-09-03 Thread Salil Mehta
> From: Linuxarm [mailto:linuxarm-boun...@huawei.com] On Behalf Of Peter 
> Zijlstra
> Sent: Tuesday, September 3, 2019 8:11 AM
> 
> On Tue, Sep 03, 2019 at 02:19:04PM +0800, Yunsheng Lin wrote:
> > On 2019/9/2 20:56, Peter Zijlstra wrote:
> > > On Mon, Sep 02, 2019 at 08:25:24PM +0800, Yunsheng Lin wrote:
> > >> On 2019/9/2 15:25, Peter Zijlstra wrote:
> > >>> On Mon, Sep 02, 2019 at 01:46:51PM +0800, Yunsheng Lin wrote:
> >  On 2019/9/1 0:12, Peter Zijlstra wrote:
> > >>>
> > > 1) because even it is not set, the device really does belong to a 
> > > node.
> > > It is impossible a device will have magic uniform access to memory 
> > > when
> > > CPUs cannot.
> > 
> >  So it means dev_to_node() will return either NUMA_NO_NODE or a
> >  valid node id?
> > >>>
> > >>> NUMA_NO_NODE := -1, which is not a valid node number. It is also,
> > >>> like I said, not a valid device location on a NUMA system.
> > >>>
> > >>> Just because ACPI/BIOS is shit, doesn't mean the device doesn't have a
> > >>> node association. It just means we don't know and might have to guess.
> > >>
> > >> How do we guess the device's location when ACPI/BIOS does not set it?
> > >
> > > See device_add(), it looks to the device's parent and on NO_NODE, puts
> > > it there.
> > >
> > > Lacking any hints, just stick it to node0 and print a FW_BUG or
> > > something.
> > >
> > >> It seems dev_to_node() does not do anything about that and leave the
> > >> job to the caller or whatever function that get called with its return
> > >> value, such as cpumask_of_node().
> > >
> > > Well, dev_to_node() doesn't do anything; nor should it. It are the
> > > callers of set_dev_node() that should be taking care.
> > >
> > > Also note how device_add() sets the device node to the parent device's
> > > node on NUMA_NO_NODE. Arguably we should change it to complain when it
> > > finds NUMA_NO_NODE and !parent.
> >
> > Is it possible that the node id set by device_add() becomes invalid
> > if the node is offlined, then dev_to_node() may return an invalid
> > node id.
> 
> In that case I would expect the device to go away too. Once the memory
> controller goes away, the PCI bus connected to it cannot continue to
> function.

I am not sure if this is *exactly* true on our system as NUMA nodes are
part of the SoCs and devices could still be used even if all the memory
and CPUs part of the node are turned off. Although, it is highly unlikely
anybody would do that (maybe could be debated for the Power Management case?)

Best Regards
Salil



Re: [PATCH 3/3] powerpc/tm: Add tm-poison test

2019-09-03 Thread Michael Ellerman
Michael Neuling  writes:
> From: Gustavo Romero 
>
> Add TM selftest to check if FP or VEC register values from one process
> can leak into another process when both run on the same CPU.
>
> This tests for CVE-2019-15030 and CVE-2019-15031.
>
> Signed-off-by: Gustavo Romero 
> Signed-off-by: Michael Neuling 
> ---
>  tools/testing/selftests/powerpc/tm/.gitignore |   1 +
>  tools/testing/selftests/powerpc/tm/Makefile   |   2 +-
>  .../testing/selftests/powerpc/tm/tm-poison.c  | 180 ++
>  3 files changed, 182 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/powerpc/tm/tm-poison.c

This doesn't build on 64-bit big endian:

tm-poison.c: In function 'tm_poison_test':
tm-poison.c:118:10: error: format '%lx' expects argument of type 'long unsigned 
int', but argument 2 has type 'uint64_t {aka long long unsigned int}' 
[-Werror=format=]
   printf("Unknown value %#lx leaked into f31!\n", unknown);
  ^
tm-poison.c:166:10: error: format '%lx' expects argument of type 'long unsigned 
int', but argument 2 has type 'uint64_t {aka long long unsigned int}' 
[-Werror=format=]
   printf("Unknown value %#lx leaked into vr31!\n", unknown);
  ^
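
The portable fix is a cast or the inttypes.h width macro, eg. (sketch):

	#include <inttypes.h>

	printf("Unknown value %#" PRIx64 " leaked into f31!\n", unknown);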

cheers


Re: [PATCH v2 3/6] powerpc: Convert flush_icache_range & friends to C

2019-09-03 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 03/09/2019 à 07:23, Alastair D'Silva a écrit :
>> From: Alastair D'Silva 
>> 
>> Similar to commit 22e9c88d486a
>> ("powerpc/64: reuse PPC32 static inline flush_dcache_range()")
>> this patch converts the following ASM symbols to C:
>>  flush_icache_range()
>>  __flush_dcache_icache()
>>  __flush_dcache_icache_phys()
>> 
>> This was done as we discovered a long-standing bug where the length of the
>> range was truncated due to using a 32 bit shift instead of a 64 bit one.
>> 
>> By converting these functions to C, it becomes easier to maintain.
>> 
>> flush_dcache_icache_phys() retains a critical assembler section as we must
>> ensure there are no memory accesses while the data MMU is disabled
>> (authored by Christophe Leroy). Since this has no external callers, it has
>> also been made static, allowing the compiler to inline it within
>> flush_dcache_icache_page().
>> 
>> Signed-off-by: Alastair D'Silva 
>> Signed-off-by: Christophe Leroy 
>> ---
>>   arch/powerpc/include/asm/cache.h  |  26 ++---
>>   arch/powerpc/include/asm/cacheflush.h |  24 ++--
>>   arch/powerpc/kernel/misc_32.S | 117 
>>   arch/powerpc/kernel/misc_64.S | 102 -
>>   arch/powerpc/mm/mem.c | 152 +-
>>   5 files changed, 173 insertions(+), 248 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/cache.h 
>> b/arch/powerpc/include/asm/cache.h
>> index f852d5cd746c..91c808c6738b 100644
>> --- a/arch/powerpc/include/asm/cache.h
>> +++ b/arch/powerpc/include/asm/cache.h
>> @@ -98,20 +98,7 @@ static inline u32 l1_icache_bytes(void)
>>   #endif
>>   #endif /* ! __ASSEMBLY__ */
>>   
>> -#if defined(__ASSEMBLY__)
>> -/*
>> - * For a snooping icache, we still need a dummy icbi to purge all the
>> - * prefetched instructions from the ifetch buffers. We also need a sync
>> - * before the icbi to order the the actual stores to memory that might
>> - * have modified instructions with the icbi.
>> - */
>> -#define PURGE_PREFETCHED_INS\
>> -sync;   \
>> -icbi0,r3;   \
>> -sync;   \
>> -isync
>> -
>> -#else
>> +#if !defined(__ASSEMBLY__)
>>   #define __read_mostly __attribute__((__section__(".data..read_mostly")))
>>   
>>   #ifdef CONFIG_PPC_BOOK3S_32
>> @@ -145,6 +132,17 @@ static inline void dcbst(void *addr)
>>   {
>>  __asm__ __volatile__ ("dcbst %y0" : : "Z"(*(u8 *)addr) : "memory");
>>   }
>> +
>> +static inline void icbi(void *addr)
>> +{
>> +__asm__ __volatile__ ("icbi 0, %0" : : "r"(addr) : "memory");
>
> I think "__asm__ __volatile__" is deprecated. Use "asm volatile" instead.

Yes please.

>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index 9191a66b3bc5..cd540123874d 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -321,6 +321,105 @@ void free_initmem(void)
>>  free_initmem_default(POISON_FREE_INITMEM);
>>   }
>>   
>> +/*
>> + * Warning: This macro will perform an early return if the CPU has
>> + * a coherent icache. The intent is is call this early in function,
>> + * and handle the non-coherent icache variant afterwards.
>> + *
>> + * For a snooping icache, we still need a dummy icbi to purge all the
>> + * prefetched instructions from the ifetch buffers. We also need a sync
>> + * before the icbi to order the the actual stores to memory that might
>> + * have modified instructions with the icbi.
>> + */
>> +#define flush_coherent_icache_or_return(addr) { \
>> +if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { \
>> +mb(); /* sync */\
>> +icbi(addr); \
>> +mb(); /* sync */\
>> +isync();\
>> +return; \
>> +}   \
>> +}
>
> I hate this kind of awful macro which kills code readability.

Yes I agree.

> Please to something like
>
> static bool flush_coherent_icache_or_return(unsigned long addr)
> {
>   if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
>   return false;
>
>   mb(); /* sync */
>   icbi(addr);
>   mb(); /* sync */
>   isync();
>   return true;
> }
>
> then callers will do:
>
>   if (flush_coherent_icache_or_return(addr))
>   return;

I don't think it needs the "_or_return" in the name.

eg, it can just be:

if (flush_coherent_icache(addr))
return;


Which reads fine I think, ie. flush the coherent icache, and if that
succeeds return, else continue.
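
ie. something like (untested):

	static bool flush_coherent_icache(unsigned long addr)
	{
		if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
			return false;

		mb();	/* sync */
		icbi((void *)addr);	/* icbi() takes a void * in this series */
		mb();	/* sync */
		isync();
		return true;
	}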

cheers


Re: [PATCH 1/1] mm/pgtable/debug: Add test validating architecture page table helpers

2019-09-03 Thread kbuild test robot
Hi Anshuman,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.3-rc7 next-20190902]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Anshuman-Khandual/mm-debug-Add-tests-for-architecture-exported-page-table-helpers/20190903-162959
config: m68k-allmodconfig (attached as .config)
compiler: m68k-linux-gcc (GCC) 7.4.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.4.0 make.cross ARCH=m68k 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot 

All error/warnings (new ones prefixed by >>):

   In file included from arch/m68k/include/asm/bug.h:32:0,
from include/linux/bug.h:5,
from include/linux/thread_info.h:12,
from include/asm-generic/preempt.h:5,
from ./arch/m68k/include/generated/asm/preempt.h:1,
from include/linux/preempt.h:78,
from arch/m68k/include/asm/irqflags.h:6,
from include/linux/irqflags.h:16,
from arch/m68k/include/asm/atomic.h:6,
from include/linux/atomic.h:7,
from include/linux/mm_types_task.h:13,
from include/linux/mm_types.h:5,
from include/linux/hugetlb.h:5,
from mm/arch_pgtable_test.c:14:
   mm/arch_pgtable_test.c: In function 'pmd_clear_tests':
>> arch/m68k/include/asm/page.h:31:22: error: lvalue required as unary '&' 
>> operand
#define pmd_val(x) ((&x)->pmd[0])
 ^
   include/asm-generic/bug.h:124:25: note: in definition of macro 'WARN_ON'
 int __ret_warn_on = !!(condition);\
^
>> arch/m68k/include/asm/motorola_pgtable.h:138:26: note: in expansion of macro 
>> 'pmd_val'
#define pmd_none(pmd)  (!pmd_val(pmd))
 ^~~
>> mm/arch_pgtable_test.c:233:11: note: in expansion of macro 'pmd_none'
 WARN_ON(!pmd_none(READ_ONCE(*pmdp)));
  ^~~~
   mm/arch_pgtable_test.c: In function 'pmd_populate_tests':
>> arch/m68k/include/asm/page.h:31:22: error: lvalue required as unary '&' 
>> operand
#define pmd_val(x) ((&x)->pmd[0])
 ^
   include/asm-generic/bug.h:124:25: note: in definition of macro 'WARN_ON'
 int __ret_warn_on = !!(condition);\
^
   arch/m68k/include/asm/motorola_pgtable.h:139:25: note: in expansion of macro 
'pmd_val'
#define pmd_bad(pmd)  ((pmd_val(pmd) & _DESCTYPE_MASK) != _PAGE_TABLE)
^~~
>> mm/arch_pgtable_test.c:245:10: note: in expansion of macro 'pmd_bad'
 WARN_ON(pmd_bad(READ_ONCE(*pmdp)));
 ^~~

vim +/pmd_none +233 mm/arch_pgtable_test.c

   228  
   229  static void pmd_clear_tests(pmd_t *pmdp)
   230  {
   231  memset(pmdp, RANDOM_NZVALUE, sizeof(pmd_t));
   232  pmd_clear(pmdp);
 > 233  WARN_ON(!pmd_none(READ_ONCE(*pmdp)));
   234  }
   235  
   236  static void pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
   237 pgtable_t pgtable)
   238  {
   239  /*
   240   * This entry points to next level page table page.
   241   * Hence this must not qualify as pmd_bad().
   242   */
   243  pmd_clear(pmdp);
   244  pmd_populate(mm, pmdp, pgtable);
 > 245  WARN_ON(pmd_bad(READ_ONCE(*pmdp)));
   246  }
   247  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH v5 11/31] powernv/fadump: add fadump support on powernv

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:
> Add basic callback functions for FADump on PowerNV platform.

I assume this doesn't actually work yet?

Does something block it from appearing to work at runtime?

> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index d8dcd88..fc4ecfe 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -566,7 +566,7 @@ config CRASH_DUMP
>  
>  config FA_DUMP
>   bool "Firmware-assisted dump"
> - depends on PPC64 && PPC_RTAS
> + depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
>   select CRASH_CORE
>   select CRASH_DUMP
>   help
> @@ -577,7 +577,8 @@ config FA_DUMP
> is meant to be a kdump replacement offering robustness and
> speed not possible without system firmware assistance.
>  
> -   If unsure, say "N"
> +   If unsure, say "y". Only special kernels like petitboot may
> +   need to say "N" here.
>  
>  config IRQ_ALL_CPUS
>   bool "Distribute interrupts on all CPUs by default"
> diff --git a/arch/powerpc/kernel/fadump-common.h 
> b/arch/powerpc/kernel/fadump-common.h
> index d2c5b16..f6c52d3 100644
> --- a/arch/powerpc/kernel/fadump-common.h
> +++ b/arch/powerpc/kernel/fadump-common.h
> @@ -140,4 +140,13 @@ static inline int rtas_fadump_dt_scan(struct fw_dump 
> *fadump_config, ulong node)
>  }
>  #endif
>  
> +#ifdef CONFIG_PPC_POWERNV
> +extern int opal_fadump_dt_scan(struct fw_dump *fadump_config, ulong node);
> +#else
> +static inline int opal_fadump_dt_scan(struct fw_dump *fadump_config, ulong 
> node)
> +{
> + return 1;
> +}

Extending the strange flat device tree calling convention to these
functions is not ideal.

It would be better I think if they just returned bool true/false for
"found it" / "not found", and then early_init_dt_scan_fw_dump() can
convert that into the appropriate return value.

> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index f7c8073..b8061fb9 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -114,6 +114,9 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, 
> const char *uname,
>   if (strcmp(uname, "rtas") == 0)
>   return rtas_fadump_dt_scan(&fw_dump, node);
>  
> + if (strcmp(uname, "ibm,opal") == 0)
> + return opal_fadump_dt_scan(&fw_dump, node);
> +

ie this would become:

	if (strcmp(uname, "ibm,opal") == 0 && opal_fadump_dt_scan(&fw_dump,
node))
return 1;

>   return 0;
>  }
>  
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index da2e99e..43a6e1c 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -6,6 +6,7 @@ obj-y += opal-msglog.o opal-hmi.o 
> opal-power.o opal-irqchip.o
>  obj-y+= opal-kmsg.o opal-powercap.o opal-psr.o 
> opal-sensor-groups.o
>  
>  obj-$(CONFIG_SMP)+= smp.o subcore.o subcore-asm.o
> +obj-$(CONFIG_FA_DUMP)+= opal-fadump.o
>  obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
>  obj-$(CONFIG_CXL_BASE)   += pci-cxl.o
>  obj-$(CONFIG_EEH)+= eeh-powernv.o
> diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c 
> b/arch/powerpc/platforms/powernv/opal-fadump.c
> new file mode 100644
> index 000..e330877
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-fadump.c
> @@ -0,0 +1,97 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Firmware-Assisted Dump support on POWER platform (OPAL).
> + *
> + * Copyright 2019, IBM Corp.
> + * Author: Hari Bathini 
> + */
> +
> +#undef DEBUG

No undef again please.

> +#define pr_fmt(fmt) "opal fadump: " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +#include "../../kernel/fadump-common.h"
> +
> +static ulong opal_fadump_init_mem_struct(struct fw_dump *fadump_conf)
> +{
> + return fadump_conf->reserve_dump_area_start;
> +}
> +
> +static int opal_fadump_register(struct fw_dump *fadump_conf)
> +{
> + return -EIO;
> +}
> +
> +static int opal_fadump_unregister(struct fw_dump *fadump_conf)
> +{
> + return -EIO;
> +}
> +
> +static int opal_fadump_invalidate(struct fw_dump *fadump_conf)
> +{
> + return -EIO;
> +}
> +
> +static int __init opal_fadump_process(struct fw_dump *fadump_conf)
> +{
> + return -EINVAL;
> +}
> +
> +static void opal_fadump_region_show(struct fw_dump *fadump_conf,
> + struct seq_file *m)
> +{
> +}
> +
> +static void opal_fadump_trigger(struct fadump_crash_info_header *fdh,
> + const char *msg)
> +{
> + int rc;
> +
> + rc = opal_cec_reboot2(OPAL_REBOOT_MPIPL, msg);
> + if (rc == OPAL_UNSUPPORTED) {
> + pr_emerg("Reboot type %d not supported.\n",
> +  OPAL_REBOOT_MPIPL);
> + } else if (rc == OPAL_HARDWARE)
> + pr_emerg("No backend support for MPIPL!\n");
> +}
> +
> +static struct fadump_ops 

Re: [PATCH v5 10/31] opal: add MPIPL interface definitions

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:
> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index 383242e..c8a5665 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -980,6 +983,50 @@ struct opal_sg_list {
>  };
>  
>  /*
> + * Firmware-Assisted Dump (FADump) using MPIPL
> + */
> +
> +/* MPIPL update operations */
> +enum opal_mpipl_ops {
> + OPAL_MPIPL_ADD_RANGE= 0,
> + OPAL_MPIPL_REMOVE_RANGE = 1,
> + OPAL_MPIPL_REMOVE_ALL   = 2,
> + OPAL_MPIPL_FREE_PRESERVED_MEMORY= 3,
> +};
> +
> +/*
> + * Each tag maps to a metadata type. Use these tags to register/query
> + * corresponding metadata address with/from OPAL.
> + */
> +enum opal_mpipl_tags {
> + OPAL_MPIPL_TAG_CPU  = 0,
> + OPAL_MPIPL_TAG_OPAL = 1,
> + OPAL_MPIPL_TAG_KERNEL   = 2,
> + OPAL_MPIPL_TAG_BOOT_MEM = 3,
> +};
> +
> +/* Preserved memory details */
> +struct opal_mpipl_region {
> + __be64  src;
> + __be64  dest;
> + __be64  size;
> +};
> +
> +/* FADump structure format version */
> +#define MPIPL_FADUMP_VERSION 0x01
> +
> +/* Metadata provided by OPAL. */
> +struct opal_mpipl_fadump {
> + u8  version;
> + u8  reserved[7];
> + __be32  crashing_pir;
> + __be32  cpu_data_version;
> + __be32  cpu_data_size;
> + __be32  region_cnt;
> + struct opal_mpipl_regionregion[];
> +} __attribute__((packed));
> +

The above hunk is in the wrong place vs the skiboot header. Please put
things in exactly the same place in the skiboot and kernel versions of
the header.

After your kernel & skiboot patches are applied, the result of:

 $ git diff ~/src/skiboot/include/opal-api.h arch/powerpc/include/asm/opal-api.h

Should not include anything MPIPL/fadump related.


> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 57bd029..878110a 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -39,6 +39,12 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t 
> bdfn,
>   uint64_t PE_handle);
>  int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
>   uint64_t rate_phys, uint32_t size);
> +
> +int64_t opal_mpipl_update(enum opal_mpipl_ops op, u64 src,
> +   u64 dest, u64 size);
> +int64_t opal_mpipl_register_tag(enum opal_mpipl_tags tag, uint64_t addr);
> +int64_t opal_mpipl_query_tag(enum opal_mpipl_tags tag, uint64_t *addr);
> +

Please consistently use kernel types for new prototypes in here.
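
ie. something like (sketch):

	s64 opal_mpipl_update(enum opal_mpipl_ops op, u64 src, u64 dest, u64 size);
	s64 opal_mpipl_register_tag(enum opal_mpipl_tags tag, u64 addr);
	s64 opal_mpipl_query_tag(enum opal_mpipl_tags tag, u64 *addr);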

cheers


Re: [PATCH v5 07/31] powerpc/fadump: release all the memory above boot memory size

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:

> Except for reserve dump area which is permanent reserved, all memory
permanently

> above boot memory size is released when the dump is invalidated. Make
> this a bit more explicit in the code.

I'm not clear on what you mean by "boot memory size"?

cheers


Re: [PATCH v5 06/31] pseries/fadump: define register/un-register callback functions

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:
> Make RTAS calls to register and un-register for FADump. Also, update
> how fadump_region contents are displayed to provide more information.

That sounds like two independent changes, so can this be split into two
patches?

cheers


Re: [PATCH v5 05/31] pseries/fadump: introduce callbacks for platform specific operations

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:
> Introduce callback functions for platform specific operations like
> register, unregister, invalidate & such. Also, define place-holders
> for the same on pSeries platform.

We already have an ops structure for machine specific calls, it's
ppc_md. Is there a good reason why these aren't just in machdep_calls
under #ifdef CONFIG_FA_DUMP ?

cheers


Re: [PATCH v5 01/31] powerpc/fadump: move internal macros/definitions to a new header

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:
> Though asm/fadump.h is meant to be used by other components dealing
> with FADump, it also has macros/definitions internal to FADump code.
> Move them to a new header file used within FADump code. This also
> makes way for refactoring platform specific FADump code.
>
> Signed-off-by: Hari Bathini 
> ---
>  arch/powerpc/include/asm/fadump.h   |   71 
>  arch/powerpc/kernel/fadump-common.h |   89 
> +++
>  arch/powerpc/kernel/fadump.c|2 +

I don't like having a header in kernel that's then used in platform
code. Because we end up having to do gross things like:

  arch/powerpc/platforms/powernv/opal-core.c:#include 
"../../kernel/fadump-common.h"
  arch/powerpc/platforms/powernv/opal-fadump.c:#include 
"../../kernel/fadump-common.h"
  arch/powerpc/platforms/pseries/rtas-fadump.c:#include 
"../../kernel/fadump-common.h"


I'd rather you put the internal bits in 
arch/powerpc/include/asm/fadump-internal.h

cheers


Re: [PATCH v5 02/31] powerpc/fadump: move internal code to a new file

2019-09-03 Thread Michael Ellerman
Hari Bathini  writes:
> Make way for refactoring platform specific FADump code by moving code
> that could be referenced from multiple places to fadump-common.c file.
>
> Signed-off-by: Hari Bathini 
> ---
>  arch/powerpc/kernel/Makefile|2 
>  arch/powerpc/kernel/fadump-common.c |  140 ++
>  arch/powerpc/kernel/fadump-common.h |8 ++
>  arch/powerpc/kernel/fadump.c|  146 
> ++-
>  4 files changed, 158 insertions(+), 138 deletions(-)
>  create mode 100644 arch/powerpc/kernel/fadump-common.c

I don't understand why we need fadump.c and fadump-common.c? They're
both common/shared across pseries & powernv aren't they?

By the end of the series we end up with 149 lines in fadump-common.c
which seems like a waste of time. Just put it all in fadump.c.

> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 56dfa7a..439d548 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -78,7 +78,7 @@ obj-$(CONFIG_EEH)  += eeh.o eeh_pe.o eeh_dev.o 
> eeh_cache.o \
> eeh_driver.o eeh_event.o eeh_sysfs.o
>  obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o
>  obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
> -obj-$(CONFIG_FA_DUMP)+= fadump.o
> +obj-$(CONFIG_FA_DUMP)+= fadump.o fadump-common.o
>  ifdef CONFIG_PPC32
>  obj-$(CONFIG_E500)   += idle_e500.o
>  endif
> diff --git a/arch/powerpc/kernel/fadump-common.c 
> b/arch/powerpc/kernel/fadump-common.c
> new file mode 100644
> index 000..7f39e4f
> --- /dev/null
> +++ b/arch/powerpc/kernel/fadump-common.c
> @@ -0,0 +1,140 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Firmware-Assisted Dump internal code.
> + *
> + * Copyright 2011, IBM Corporation
> + * Author: Mahesh Salgaonkar 

Can we not put emails in C files anymore please, they just bitrot, just
the names are fine.

> + * Copyright 2019, IBM Corp.
> + * Author: Hari Bathini 

These can just be:

 * Copyright 2011, Mahesh Salgaonkar, IBM Corporation.
 * Copyright 2019, Hari Bathini, IBM Corporation.

> + */
> +
> +#undef DEBUG

Don't undef DEBUG please.

> +#define pr_fmt(fmt) "fadump: " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "fadump-common.h"
> +
> +void *fadump_cpu_notes_buf_alloc(unsigned long size)
> +{
> + void *vaddr;
> + struct page *page;
> + unsigned long order, count, i;
> +
> + order = get_order(size);
> + vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
> + if (!vaddr)
> + return NULL;
> +
> + count = 1 << order;
> + page = virt_to_page(vaddr);
> + for (i = 0; i < count; i++)
> + SetPageReserved(page + i);
> + return vaddr;
> +}

I realise you're just moving this code, but why do we need all this hand
rolled allocation stuff?
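
If nothing actually relies on PG_reserved being set on these pages,
alloc_pages_exact() would do the whole job (sketch, assuming the
reserved marking really is unneeded):

	/* Zeroed, page-aligned allocation without the manual page loop. */
	vaddr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
	if (!vaddr)
		return NULL;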

cheers


Re: [mainline][BUG][PPC][btrfs][bisected 00801a] kernel BUG at fs/btrfs/locking.c:71!

2019-09-03 Thread Nikolay Borisov



On 3.09.19 г. 11:55 ч., Abdul Haleem wrote:
> Greeting's
> 
> Mainline kernel panics with LTP/fs_fill-dir tests for btrfs file system on my 
> P9 box running mainline kernel 5.3.0-rc5
> 
> BUG_ON was first introduced by below commit
> 
> commit 00801ae4bb2be5f5af46502ef239ac5f4b536094
> Author: David Sterba 
> Date:   Thu May 2 16:53:47 2019 +0200
> 
> btrfs: switch extent_buffer write_locks from atomic to int
> 
> The write_locks is either 0 or 1 and always updated under the lock,
> so we don't need the atomic_t semantics.
> 
> Reviewed-by: Nikolay Borisov 
> Signed-off-by: David Sterba 
> 
> diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
> index 2706676279..98fccce420 100644
> --- a/fs/btrfs/locking.c
> +++ b/fs/btrfs/locking.c
> @@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct
> extent_buffer *eb)
>  
>  static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
>  {
> -   atomic_inc(&eb->write_locks);
> +   eb->write_locks++;
>  }
>  
>  static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
>  {
> -   atomic_dec(&eb->write_locks);
> +   eb->write_locks--;
>  }
>  
>  void btrfs_assert_tree_locked(struct extent_buffer *eb)
>  {
> -   BUG_ON(!atomic_read(&eb->write_locks));
> +   BUG_ON(!eb->write_locks);
>  }
>  
> 
> tests logs:
> avocado-misc-tests/io/disk/ltp_fs.py:LtpFs.test_fs_run;fs_fill-dir-ext3-61cd: 
>  [ 3376.022096] EXT4-fs (nvme0n1): mounting ext3 file system using the ext4 
> subsystem
> EXT4-fs (nvme0n1): mounted filesystem with ordered data mode. Opts: (null)
> EXT4-fs (loop1): mounting ext2 file system using the ext4 subsystem
> EXT4-fs (loop1): mounted filesystem without journal. Opts: (null)
> EXT4-fs (loop1): mounting ext3 file system using the ext4 subsystem
> EXT4-fs (loop1): mounted filesystem with ordered data mode. Opts: (null)
> EXT4-fs (loop1): mounted filesystem with ordered data mode. Opts: (null)
> XFS (loop1): Mounting V5 Filesystem
> XFS (loop1): Ending clean mount
> XFS (loop1): Unmounting Filesystem
> BTRFS: device fsid 7c08f81b-6642-4a06-9182-2884e80d56ee devid 1 transid 5 
> /dev/loop1
> BTRFS info (device loop1): disk space caching is enabled
> BTRFS info (device loop1): has skinny extents
> BTRFS info (device loop1): enabling ssd optimizations
> BTRFS info (device loop1): creating UUID tree
> [ cut here ]
> kernel BUG at fs/btrfs/locking.c:71!
> Oops: Exception in kernel mode, sig: 5 [#1]
> LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in: fuse(E) vfat(E) fat(E) btrfs(E) xor(E)
> zstd_decompress(E) zstd_compress(E) raid6_pq(E) xfs(E) raid0(E)
> linear(E) dm_round_robin(E) dm_queue_length(E) dm_service_time(E)
> dm_multipath(E) loop(E) rpadlpar_io(E) rpaphp(E) lpfc(E) bnx2x(E)
> xt_CHECKSUM(E) xt_MASQUERADE(E) tun(E) bridge(E) stp(E) llc(E) kvm_pr(E)
> kvm(E) tcp_diag(E) udp_diag(E) inet_diag(E) unix_diag(E)
> af_packet_diag(E) netlink_diag(E) ip6t_rpfilter(E) ipt_REJECT(E)
> nf_reject_ipv4(E) ip6t_REJECT(E) nf_reject_ipv6(E) xt_conntrack(E)
> ip_set(E) nfnetlink(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E)
> ip6table_mangle(E) ip6table_security(E) ip6table_raw(E) iptable_nat(E)
> nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E)
> iptable_mangle(E) iptable_security(E) iptable_raw(E) ebtable_filter(E)
> ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) sunrpc(E)
> raid10(E) xts(E) pseries_rng(E) vmx_crypto(E) sg(E) uio_pdrv_genirq(E)
> uio(E) binfmt_misc(E) sch_fq_codel(E) ip_tables(E)
>  ext4(E) mbcache(E) jbd2(E) sr_mod(E) cdrom(E) sd_mod(E) ibmvscsi(E)
> scsi_transport_srp(E) ibmveth(E) nvmet_fc(E) nvmet(E) nvme_fc(E)
> nvme_fabrics(E) scsi_transport_fc(E) mdio(E) libcrc32c(E) ptp(E)
> pps_core(E) nvme(E) nvme_core(E) dm_mirror(E) dm_region_hash(E)
> dm_log(E) dm_mod(E) [last unloaded: lpfc]
> CPU: 14 PID: 1803 Comm: kworker/u32:8 Tainted: GE 
> 5.3.0-rc5-autotest-autotest #1
> Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
> NIP:  c0080164dd70 LR: c0080164df00 CTR: c0a817a0
> REGS: c260b5d0 TRAP: 0700   Tainted: GE  
> (5.3.0-rc5-autotest-autotest)
> MSR:  800102029033   CR: 22444082  XER: 
> 
> CFAR: c0080164defc IRQMASK: 0
> GPR00: c008015c55f4 c260b860 c00801703b00 c00267a29af0
> GPR04:  0001  
> GPR08:  0001  0004
> GPR12: 4000 c0001ec58e00  
> GPR16: 0001 0004 0001 0001
> GPR20:  0001 3e0f83e1 c0025a7cbef0
> GPR24: c260ba26 4000 c14a26e8 0003
> GPR28: 0004 c0025f2010a0 c00267a29af0 
> NIP 

[PATCH 14/14] selftests/powerpc: Add basic EEH selftest

2019-09-03 Thread Oliver O'Halloran
Use the new eeh_dev_check and eeh_dev_break interfaces to test EEH
recovery.  Historically this has been done manually using platform specific
EEH error injection facilities (e.g. via RTAS). However, documentation on
how to use these facilities is haphazard at best and non-existent at worst
so it's hard to develop a cross-platform test.

The new debugfs interfaces allow the kernel to handle the platform specific
details so we can write a more generic set of tests. This patch adds the
most basic of recovery tests where:

a) Errors are injected and recovered from sequentially,
b) Errors are not injected into PCI-PCI bridges, such as PCIe switches.
c) Errors are only injected into device function zero.
d) No errors are injected into Virtual Functions.

a), b) and c) are largely due to limitations of Linux's EEH support.  EEH
recovery is serialised in the EEH recovery thread which forces a).
Similarly, multi-function PCI devices are almost always grouped into the
same PE so injecting an error on one function exercises the same code
paths. c) is because we currently more or less ignore PCI bridges during
recovery and assume that the recovered topology will be the same as the
original.

d) is due to the limits of the eeh_dev_break interface. With the current
implementation we can't inject an error into a specific VF without
potentially causing additional errors on other VFs. Due to the serialised
recovery process we might end up timing out waiting for another function to
recover before the function of interest is recovered. The platform specific
error injection facilities are finer-grained and allow this capability, but
doing that requires working out how to use those facilities first.

Basically, it's better than nothing and it's a base to build on.

Signed-off-by: Oliver O'Halloran 
---
 tools/testing/selftests/powerpc/Makefile  |  1 +
 tools/testing/selftests/powerpc/eeh/Makefile  |  9 ++
 .../selftests/powerpc/eeh/eeh-basic.sh| 82 +++
 .../selftests/powerpc/eeh/eeh-functions.sh| 76 +
 4 files changed, 168 insertions(+)
 create mode 100644 tools/testing/selftests/powerpc/eeh/Makefile
 create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-basic.sh
 create mode 100755 tools/testing/selftests/powerpc/eeh/eeh-functions.sh

diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index b3ad909aefbc..644770c3b754 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -26,6 +26,7 @@ SUB_DIRS = alignment  \
   switch_endian\
   syscalls \
   tm   \
+  eeh  \
   vphn \
   math \
   ptrace   \
diff --git a/tools/testing/selftests/powerpc/eeh/Makefile 
b/tools/testing/selftests/powerpc/eeh/Makefile
new file mode 100644
index ..b397babd569b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+noarg:
+   $(MAKE) -C ../
+
+TEST_PROGS := eeh-basic.sh
+TEST_FILES := eeh-functions.sh
+
+top_srcdir = ../../../../..
+include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh 
b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
new file mode 100755
index ..f988d2f42e8f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0-only
+
+. ./eeh-functions.sh
+
+if ! eeh_supported ; then
+   echo "EEH not supported on this system, skipping"
+   exit 0;
+fi
+
+if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] && \
+   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
+   echo "debugfs EEH testing files are missing. Is debugfs mounted?"
+   exit 1;
+fi
+
+pre_lspci=`mktemp`
+lspci > $pre_lspci
+
+# Bump the max freeze count to something absurd so we don't
+# trip over it while breaking things.
+echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
+
+# record the devices that we break in here. Assuming everything
+# goes to plan we should get them back once the recovery process
+# is finished.
+devices=""
+
+# Build up a list of candidate devices.
+for dev in `ls -1 /sys/bus/pci/devices/ | grep '\.0$'` ; do
+   # skip bridges since we can't recover them (yet...)
+   if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
+   echo "$dev, Skipped: bridge"
+   continue;
+   fi
+
+   # Skip VFs for now since we don't have a reliable way
+   # to break them.
+   if [ -e "/sys/bus/pci/devices/$dev/physfn" ] ; then
+   echo "$dev, Skipped: virtfn"
+   continue;
+   fi
+
+   # Don't inject errors into an already-frozen PE. This happens with
+   # PEs that contain multiple PCI devices (e.g. multi-function cards)
+   # and injecting new errors during the recovery process will 

[PATCH 13/14] powerpc/eeh: Add a eeh_dev_break debugfs interface

2019-09-03 Thread Oliver O'Halloran
Add an interface to debugfs for generating an EEH event on a given device.
This works by disabling memory accesses to and from the device by setting
the PCI_COMMAND register (or the VF Memory Space Enable on the parent PF).

This is a somewhat portable alternative to using the platform specific
error injection mechanisms since those tend to be either hard to use, or
straight up broken. For pseries the interfaces also require the use of
/dev/mem which is probably going to go away in a post-LOCKDOWN world
(and it's a horrific hack to begin with) so moving to a kernel-provided
interface makes sense and provides a sane, cross-platform interface for
userspace so we can write more generic testing scripts.
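As an illustration, kicking off a break from userspace could be as simple
as the below sketch (the 0000:01:00.0 address is hypothetical, and the
file is assumed to take the same <domain>:<bus>:<dev>.<fn> string as the
eeh_dev_check interface added earlier in this series):

#include <stdio.h>

int main(void)
{
	/* write the address of the device we want to break */
	FILE *f = fopen("/sys/kernel/debug/powerpc/eeh_dev_break", "w");

	if (!f)
		return 1;
	fprintf(f, "0000:01:00.0");	/* hypothetical device address */
	return fclose(f) ? 1 : 0;
}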

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh.c | 139 +-
 1 file changed, 138 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index ace1c5a6b8ed..a55d2f01da1d 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1894,7 +1894,8 @@ static ssize_t eeh_dev_check_write(struct file *filp,
char buf[20];
int ret;
 
-   ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+   memset(buf, 0, sizeof(buf));
+   ret = simple_write_to_buffer(buf, sizeof(buf)-1, ppos, user_buf, count);
if (!ret)
return -EFAULT;
 
@@ -1931,6 +1932,139 @@ static const struct file_operations eeh_dev_check_fops 
= {
.read   = eeh_debugfs_dev_usage,
 };
 
+static int eeh_debugfs_break_device(struct pci_dev *pdev)
+{
+   struct resource *bar = NULL;
+   void __iomem *mapped;
+   u16 old, bit;
+   int i, pos;
+
+   /* Do we have an MMIO BAR to disable? */
+   for (i = 0; i <= PCI_STD_RESOURCE_END; i++) {
+   struct resource *r = &pdev->resource[i];
+
+   if (!r->flags || !r->start)
+   continue;
+   if (r->flags & IORESOURCE_IO)
+   continue;
+   if (r->flags & IORESOURCE_UNSET)
+   continue;
+
+   bar = r;
+   break;
+   }
+
+   if (!bar) {
+   pci_err(pdev, "Unable to find Memory BAR to cause EEH with\n");
+   return -ENXIO;
+   }
+
+   pci_err(pdev, "Going to break: %pR\n", bar);
+
+   if (pdev->is_virtfn) {
+#ifndef CONFIG_IOV
+   return -ENXIO;
+#else
+   /*
+* VFs don't have a per-function COMMAND register, so the best
+* we can do is clear the Memory Space Enable bit in the PF's
+* SRIOV control reg.
+*
+* Unfortunately, this requires that we have a PF (i.e doesn't
+* work for a passed-through VF) and it has the potential side
+* effect of also causing an EEH on every other VF under the
+* PF. Oh well.
+*/
+   pdev = pdev->physfn;
+   if (!pdev)
+   return -ENXIO; /* passed through VFs have no PF */
+
+   pos  = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
+   pos += PCI_SRIOV_CTRL;
+   bit  = PCI_SRIOV_CTRL_MSE;
+#endif /* !CONFIG_IOV */
+   } else {
+   bit = PCI_COMMAND_MEMORY;
+   pos = PCI_COMMAND;
+   }
+
+   /*
+* Process here is:
+*
+* 1. Disable Memory space.
+*
+* 2. Perform an MMIO to the device. This should result in an error
+*(CA / UR) being raised by the device, which results in an EEH
+*PE freeze. Using the in_8() accessor skips the EEH detection hook
+*so the EEH detection machinery won't be triggered here. This is
+*to match the usual behaviour of EEH, where the HW will
+*asynchronously freeze a PE and it's up to
+*the kernel to notice and deal with it.
+*
+* 3. Turn Memory space back on. This is more important for VFs
+*since recovery will probably fail if we don't. For normal
+*devices the COMMAND register is reset as part of
+*re-initialising the device.
+*
+* Breaking stuff is the point so who cares if it's racy ;)
+*/
+   pci_read_config_word(pdev, pos, &old);
+
+   mapped = ioremap(bar->start, PAGE_SIZE);
+   if (!mapped) {
+   pci_err(pdev, "Unable to map MMIO BAR %pR\n", bar);
+   return -ENXIO;
+   }
+
+   pci_write_config_word(pdev, pos, old & ~bit);
+   in_8(mapped);
+   pci_write_config_word(pdev, pos, old);
+
+   iounmap(mapped);
+
+   return 0;
+}
+
+static ssize_t eeh_dev_break_write(struct file *filp,
+   const char __user *user_buf,
+   size_t count, loff_t *ppos)
+{
+   uint32_t domain, bus, dev, fn;
+   struct 

[PATCH 12/14] powerpc/eeh: Add debugfs interface to run an EEH check

2019-09-03 Thread Oliver O'Halloran
Detecting a frozen EEH PE usually occurs when an MMIO load returns an all-0xFF
response. When performing EEH testing using the EEH error injection feature
available on some platforms there is no simple way to kick-off the kernel's
recovery process since any accesses from userspace (usually /dev/mem) will
bypass the MMIO helpers in the kernel which check if a 0xFF response is due
to an EEH freeze or not.

If a device contains a 0xFF byte in its config space it's possible to
trigger the recovery process via a config space read from userspace, but this
is not a reliable method. If a driver is bound to the device and in use it
will frequently trigger the MMIO check, but this is also inconsistent.

To solve these problems this patch adds a debugfs file called
"eeh_dev_check" which accepts a <domain>:<bus>:<dev>.<fn> string and runs
eeh_dev_check_failure() on it. This is the same check that's done when the
kernel gets a 0xFF result from a config or MMIO read, with the added
benefit that it can be reliably triggered from userspace.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh.c | 61 +++
 1 file changed, 61 insertions(+)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 46d17817b438..ace1c5a6b8ed 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1873,6 +1873,64 @@ static const struct file_operations 
eeh_force_recover_fops = {
.llseek = no_llseek,
.write  = eeh_force_recover_write,
 };
+
+static ssize_t eeh_debugfs_dev_usage(struct file *filp,
+   char __user *user_buf,
+   size_t count, loff_t *ppos)
+{
+   static const char usage[] = "input format: <domain>:<bus>:<dev>.<fn>\n";
+
+   return simple_read_from_buffer(user_buf, count, ppos,
+  usage, sizeof(usage) - 1);
+}
+
+static ssize_t eeh_dev_check_write(struct file *filp,
+   const char __user *user_buf,
+   size_t count, loff_t *ppos)
+{
+   uint32_t domain, bus, dev, fn;
+   struct pci_dev *pdev;
+   struct eeh_dev *edev;
+   char buf[20];
+   int ret;
+
+   ret = simple_write_to_buffer(buf, sizeof(buf), ppos, user_buf, count);
+   if (!ret)
+   return -EFAULT;
+
+   ret = sscanf(buf, "%x:%x:%x.%x", &domain, &bus, &dev, &fn);
+   if (ret != 4) {
+   pr_err("%s: expected 4 args, got %d\n", __func__, ret);
+   return -EINVAL;
+   }
+
+   pdev = pci_get_domain_bus_and_slot(domain, bus, (dev << 3) | fn);
+   if (!pdev)
+   return -ENODEV;
+
+   edev = pci_dev_to_eeh_dev(pdev);
+   if (!edev) {
+   pci_err(pdev, "No eeh_dev for this device!\n");
+   pci_dev_put(pdev);
+   return -ENODEV;
+   }
+
+   ret = eeh_dev_check_failure(edev);
+   pci_info(pdev, "eeh_dev_check_failure(%04x:%02x:%02x.%01x) = %d\n",
+   domain, bus, dev, fn, ret);
+
+   pci_dev_put(pdev);
+
+   return count;
+}
+
+static const struct file_operations eeh_dev_check_fops = {
+   .open   = simple_open,
+   .llseek = no_llseek,
+   .write  = eeh_dev_check_write,
+   .read   = eeh_debugfs_dev_usage,
+};
+
 #endif
 
 static int __init eeh_init_proc(void)
@@ -1888,6 +1946,9 @@ static int __init eeh_init_proc(void)
debugfs_create_bool("eeh_disable_recovery", 0600,
powerpc_debugfs_root,
_debugfs_no_recover);
+   debugfs_create_file_unsafe("eeh_dev_check", 0600,
+   powerpc_debugfs_root, NULL,
+   _dev_check_fops);
debugfs_create_file_unsafe("eeh_force_recover", 0600,
powerpc_debugfs_root, NULL,
_force_recover_fops);
-- 
2.21.0



[PATCH 11/14] powerpc/eeh: Set attention indicator while recovering

2019-09-03 Thread Oliver O'Halloran
I am the RAS team. Hear me roar.

Roar.

On a more serious note, being able to locate failed devices can be helpful.
Set the attention indicator if the slot supports it once we've determined
the device is present and only clear it if the device is fully recovered.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh_driver.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 0d34cc12c529..80bd157fcb45 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -803,6 +803,10 @@ static bool eeh_slot_presence_check(struct pci_dev *pdev)
if (!ops || !ops->get_adapter_status)
return true;
 
+   /* set the attention indicator while we've got the slot ops */
+   if (ops->set_attention_status)
+   ops->set_attention_status(slot->hotplug, 1);
+
rc = ops->get_adapter_status(slot->hotplug, &state);
if (rc)
return true;
@@ -810,6 +814,28 @@ static bool eeh_slot_presence_check(struct pci_dev *pdev)
return !!state;
 }
 
+static void eeh_clear_slot_attention(struct pci_dev *pdev)
+{
+   const struct hotplug_slot_ops *ops;
+   struct pci_slot *slot;
+
+   if (!pdev)
+   return;
+
+   if (pdev->error_state == pci_channel_io_perm_failure)
+   return;
+
+   slot = pdev->slot;
+   if (!slot || !slot->hotplug)
+   return;
+
+   ops = slot->hotplug->ops;
+   if (!ops || !ops->set_attention_status)
+   return;
+
+   ops->set_attention_status(slot->hotplug, 0);
+}
+
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
  * @pe: EEH PE - which should not be used after we return, as it may
@@ -1098,6 +1124,12 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
 * we don't want to modify the PE tree structure so we do it here.
 */
eeh_pe_cleanup(pe);
+
+   /* clear the slot attention LED for all recovered devices */
+   eeh_for_each_pe(pe, tmp_pe)
+   eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+   eeh_clear_slot_attention(edev->pdev);
+
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
-- 
2.21.0



[PATCH 10/14] pci-hotplug/pnv_php: Add attention indicator support

2019-09-03 Thread Oliver O'Halloran
pnv_php is generally used with PCIe bridges which provide a native
interface for setting the attention and power indicator LEDs. Wire up
those interfaces even if firmware does not have support for them (yet...)

Signed-off-by: Oliver O'Halloran 
---
 drivers/pci/hotplug/pnv_php.c | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 6fdf8b74cb0a..d7b2b47bc33e 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -419,9 +419,21 @@ static int pnv_php_get_attention_state(struct hotplug_slot 
*slot, u8 *state)
 static int pnv_php_set_attention_state(struct hotplug_slot *slot, u8 state)
 {
struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
+   struct pci_dev *bridge = php_slot->pdev;
+   u16 new, mask;
 
-   /* FIXME: Make it real once firmware supports it */
php_slot->attention_state = state;
+   if (!bridge)
+   return 0;
+
+   mask = PCI_EXP_SLTCTL_AIC;
+
+   if (state)
+   new = PCI_EXP_SLTCTL_ATTN_IND_ON;
+   else
+   new = PCI_EXP_SLTCTL_ATTN_IND_OFF;
+
+   pcie_capability_clear_and_set_word(bridge, PCI_EXP_SLTCTL, mask, new);
 
return 0;
 }
-- 
2.21.0



[PATCH 09/14] pci-hotplug/pnv_php: Add support for IODA3 Power9 PHBs

2019-09-03 Thread Oliver O'Halloran
Currently we check that an IODA2 compatible PHB is upstream of this slot.
This is mainly to avoid pnv_php creating slots for the various "virtual
PHBs" that we create for NVLink. There's no real need for this restriction
so allow it on IODA3.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/pci.c | 3 ++-
 drivers/pci/hotplug/pnv_php.c| 6 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c 
b/arch/powerpc/platforms/powernv/pci.c
index 6104418c9ad5..2825d004dece 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -54,7 +54,8 @@ int pnv_pci_get_slot_id(struct device_node *np, uint64_t *id)
break;
}
 
-   if (!of_device_is_compatible(parent, "ibm,ioda2-phb")) {
+   if (!of_device_is_compatible(parent, "ibm,ioda2-phb") &&
+   !of_device_is_compatible(parent, "ibm,ioda3-phb")) {
of_node_put(parent);
continue;
}
diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index b0e243dabf77..6fdf8b74cb0a 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -994,6 +994,9 @@ static int __init pnv_php_init(void)
for_each_compatible_node(dn, NULL, "ibm,ioda2-phb")
pnv_php_register(dn);
 
+   for_each_compatible_node(dn, NULL, "ibm,ioda3-phb")
+   pnv_php_register(dn);
+
return 0;
 }
 
@@ -1003,6 +1006,9 @@ static void __exit pnv_php_exit(void)
 
for_each_compatible_node(dn, NULL, "ibm,ioda2-phb")
pnv_php_unregister(dn);
+
+   for_each_compatible_node(dn, NULL, "ibm,ioda3-phb")
+   pnv_php_unregister(dn);
 }
 
 module_init(pnv_php_init);
-- 
2.21.0



[PATCH 08/14] pci-hotplug/pnv_php: Add a reset_slot() callback

2019-09-03 Thread Oliver O'Halloran
When performing EEH recovery of devices in a hotplug slot we need to use
the slot driver's ->reset_slot() callback to prevent spurious hotplug
events due to spurious DLActive and PresDet change interrupts. Add a
reset_slot() callback to pnv_php so we can handle recovery of devices
in pnv_php managed slots.

Signed-off-by: Oliver O'Halloran 
---
 drivers/pci/hotplug/pnv_php.c | 39 +++
 1 file changed, 39 insertions(+)

diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index 6758fd7c382e..b0e243dabf77 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -511,6 +511,37 @@ static int pnv_php_enable(struct pnv_php_slot *php_slot, 
bool rescan)
return 0;
 }
 
+static int pnv_php_reset_slot(struct hotplug_slot *slot, int probe)
+{
+   struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
+   struct pci_dev *bridge = php_slot->pdev;
+   uint16_t sts;
+
+   /*
+* The CAPI folks want pnv_php to drive OpenCAPI slots
+* which don't have a bridge. Only claim to support
+* reset_slot() if we have a bridge device (for now...)
+*/
+   if (probe)
+   return !bridge;
+
+   /* mask our interrupt while resetting the bridge */
+   if (php_slot->irq > 0)
+   disable_irq(php_slot->irq);
+
+   pci_bridge_secondary_bus_reset(bridge);
+
+   /* clear any state changes that happened due to the reset */
+   pcie_capability_read_word(php_slot->pdev, PCI_EXP_SLTSTA, &sts);
+   sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
+   pcie_capability_write_word(php_slot->pdev, PCI_EXP_SLTSTA, sts);
+
+   if (php_slot->irq > 0)
+   enable_irq(php_slot->irq);
+
+   return 0;
+}
+
 static int pnv_php_enable_slot(struct hotplug_slot *slot)
 {
struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
@@ -548,6 +579,7 @@ static const struct hotplug_slot_ops php_slot_ops = {
.set_attention_status   = pnv_php_set_attention_state,
.enable_slot= pnv_php_enable_slot,
.disable_slot   = pnv_php_disable_slot,
+   .reset_slot = pnv_php_reset_slot,
 };
 
 static void pnv_php_release(struct pnv_php_slot *php_slot)
@@ -721,6 +753,12 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data)
pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &sts);
sts &= (PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_DLLSC);
pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, sts);
+
+   pci_dbg(pdev, "PCI slot [%s]: HP int! DLAct: %d, PresDet: %d\n",
+   php_slot->name,
+   !!(sts & PCI_EXP_SLTSTA_DLLSC),
+   !!(sts & PCI_EXP_SLTSTA_PDC));
+
if (sts & PCI_EXP_SLTSTA_DLLSC) {
pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, &lsts);
added = !!(lsts & PCI_EXP_LNKSTA_DLLLA);
@@ -735,6 +773,7 @@ static irqreturn_t pnv_php_interrupt(int irq, void *data)
 
added = !!(presence == OPAL_PCI_SLOT_PRESENT);
} else {
+   pci_dbg(pdev, "PCI slot [%s]: Spurious IRQ?\n", php_slot->name);
return IRQ_NONE;
}
 
-- 
2.21.0



[PATCH 07/14] powernv/eeh: Use generic code to handle hot resets

2019-09-03 Thread Oliver O'Halloran
When we reset PCI devices managed by a hotplug driver the reset may
generate spurious hotplug events that cause the PCI device we're resetting
to be torn down accidentally. This is a problem for EEH (when the driver is
EEH aware) since we want to leave the OS PCI device state intact so that
the device can be re-set without losing any resources (network, disks,
etc) provided by the driver.

Generic PCI code provides the pci_bus_error_reset() function to handle
resetting a PCI Device (or bus) by using the reset method provided by the
hotplug slot driver. We can use this function if the EEH core has
requested a hot reset (common case) without tripping over the hotplug
driver.

Signed-off-by: Oliver O'Halloran 
---
I know that include is a bit gross, but:

a) We're already doing it in pci-ioda.c, and in pseries/pci.
b) pci_bus_error_reset() isn't really a function that
   should be provided to non-PCI core code.
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 38 ++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 94e26d56ecd2..6bc24a47e9ef 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -34,6 +34,7 @@
 
 #include "powernv.h"
 #include "pci.h"
+#include "../../../../drivers/pci/pci.h"
 
 static int eeh_event_irq = -EINVAL;
 
@@ -849,7 +850,7 @@ static int __pnv_eeh_bridge_reset(struct pci_dev *dev, int 
option)
int aer = edev ? edev->aer_cap : 0;
u32 ctrl;
 
-   pr_debug("%s: Reset PCI bus %04x:%02x with option %d\n",
+   pr_debug("%s: Secondary Reset PCI bus %04x:%02x with option %d\n",
 __func__, pci_domain_nr(dev->bus),
 dev->bus->number, option);
 
@@ -907,6 +908,10 @@ static int pnv_eeh_bridge_reset(struct pci_dev *pdev, int 
option)
if (!dn || !of_get_property(dn, "ibm,reset-by-firmware", NULL))
return __pnv_eeh_bridge_reset(pdev, option);
 
+   pr_debug("%s: FW reset PCI bus %04x:%02x with option %d\n",
+__func__, pci_domain_nr(pdev->bus),
+pdev->bus->number, option);
+
switch (option) {
case EEH_RESET_FUNDAMENTAL:
scope = OPAL_RESET_PCI_FUNDAMENTAL;
@@ -1125,10 +1130,37 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
 
-   if (pci_is_root_bus(bus) ||
-   pci_is_root_bus(bus->parent))
+   if (pci_is_root_bus(bus))
return pnv_eeh_root_reset(hose, option);
 
+   /*
+* For hot resets try use the generic PCI error recovery reset
+* functions. These correctly handle the case where the secondary
+* bus is behind a hotplug slot and it will use the slot provided
+* reset methods to prevent spurious hotplug events during the reset.
+*
+* Fundamental resets need to be handled internally to EEH since the
+* PCI core doesn't really have a concept of a fundamental reset,
+* mainly because there's no standard way to generate one. Only a
+* few devices require an FRESET so it should be fine.
+*/
+   if (option != EEH_RESET_FUNDAMENTAL) {
+   /*
+* NB: Skiboot and pnv_eeh_bridge_reset() also no-op the
+* de-assert step. It's like the OPAL reset API was
+* poorly designed or something...
+*/
+   if (option == EEH_RESET_DEACTIVATE)
+   return 0;
+
+   rc = pci_bus_error_reset(bus->self);
+   if (!rc)
+   return 0;
+   }
+
+   /* otherwise, use the generic bridge reset. this might call into FW */
+   if (pci_is_root_bus(bus->parent))
+   return pnv_eeh_root_reset(hose, option);
return pnv_eeh_bridge_reset(bus->self, option);
 }
 
-- 
2.21.0



[PATCH 06/14] powerpc/eeh: Remove stale CAPI comment

2019-09-03 Thread Oliver O'Halloran
Support for switching CAPI cards into and out of CAPI mode was removed a
while ago. Drop the comment since it's no longer relevant.

Cc: Andrew Donnellan 
Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/platforms/powernv/eeh-powernv.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index e7b867912f24..94e26d56ecd2 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1125,13 +1125,6 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
 
-   /*
-* If dealing with the root bus (or the bus underneath the
-* root port), we reset the bus underneath the root port.
-*
-* The cxl driver depends on this behaviour for bi-modal card
-* switching.
-*/
if (pci_is_root_bus(bus) ||
pci_is_root_bus(bus->parent))
return pnv_eeh_root_reset(hose, option);
-- 
2.21.0



[PATCH 05/14] powerpc/eeh: Defer printing stack trace

2019-09-03 Thread Oliver O'Halloran
Currently we print a stack trace in the event handler to help with
debugging EEH issues. In the case of surprise hot-unplug this is unneeded,
so we want to prevent printing the stack trace unless we know it's due to
an actual device error. To accomplish this, we can save a stack trace at
the point of detection and only print it once the EEH recovery handler has
determined the freeze was due to an actual error.

Since the whole point of this is to prevent spurious EEH output we also
move a few prints out of the detection thread, or mark them as pr_debug
so anyone interested can get output from eeh_dev_check_failure()
if they want.
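The capture side (the eeh_event.c hunk is cut off in this posting) would
presumably sit in the event submission path and look something like this,
assuming the generic stack_trace_save() helper:

	/*
	 * Save the current stack so the recovery thread can print it
	 * later, once we know the freeze wasn't just a hot-remove.
	 */
	pe->trace_entries = stack_trace_save(pe->stack_trace,
					     ARRAY_SIZE(pe->stack_trace), 0);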

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/eeh.h   | 11 +
 arch/powerpc/kernel/eeh.c| 15 -
 arch/powerpc/kernel/eeh_driver.c | 38 +++-
 arch/powerpc/kernel/eeh_event.c  | 26 ++
 4 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index c13119a5e69b..9d0e1694a94d 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -88,6 +88,17 @@ struct eeh_pe {
struct list_head child_list;/* List of PEs below this PE*/
struct list_head child; /* Memb. child_list/eeh_phb_pe  */
struct list_head edevs; /* List of eeh_dev in this PE   */
+
+   /*
+* Saved stack trace. When we find a PE freeze in eeh_dev_check_failure
+* the stack trace is saved here so we can print it in the recovery
+* thread if it turns out to be due to a real problem rather than
+* a hot-remove.
+*
+* A max of 64 entries might be overkill, but it also might not be.
+*/
+   unsigned long stack_trace[64];
+   int trace_entries;
 };
 
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c468e79d13c..46d17817b438 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -420,11 +420,9 @@ static int eeh_phb_check_failure(struct eeh_pe *pe)
eeh_pe_mark_isolated(phb_pe);
eeh_serialize_unlock(flags);
 
-   pr_err("EEH: PHB#%x failure detected, location: %s\n",
+   pr_debug("EEH: PHB#%x failure detected, location: %s\n",
phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
-   dump_stack();
eeh_send_failure_event(phb_pe);
-
return 1;
 out:
eeh_serialize_unlock(flags);
@@ -451,7 +449,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
unsigned long flags;
struct device_node *dn;
struct pci_dev *dev;
-   struct eeh_pe *pe, *parent_pe, *phb_pe;
+   struct eeh_pe *pe, *parent_pe;
int rc = 0;
const char *location = NULL;
 
@@ -581,13 +579,8 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
 * a stack trace will help the device-driver authors figure
 * out what happened.  So print that out.
 */
-   phb_pe = eeh_phb_pe_get(pe->phb);
-   pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
-  pe->phb->global_number, pe->addr);
-   pr_err("EEH: PE location: %s, PHB location: %s\n",
-  eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
-   dump_stack();
-
+   pr_debug("EEH: %s: Frozen PHB#%x-PE#%x detected\n",
+   __func__, pe->phb->global_number, pe->addr);
eeh_send_failure_event(pe);
 
return 1;
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 52ce7584af43..0d34cc12c529 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -863,8 +863,44 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
if (eeh_slot_presence_check(edev->pdev))
devices++;
 
-   if (!devices)
+   if (!devices) {
+   pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
+   pe->phb->global_number, pe->addr);
goto out; /* nothing to recover */
+   }
+
+   /* Log the event */
+   if (pe->type & EEH_PE_PHB) {
+   pr_err("EEH: PHB#%x failure detected, location: %s\n",
+   pe->phb->global_number, eeh_pe_loc_get(pe));
+   } else {
+   struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
+
+   pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+  pe->phb->global_number, pe->addr);
+   pr_err("EEH: PE location: %s, PHB location: %s\n",
+  eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
+   }
+
+   /*
+* Print the saved stack trace now that we've verified there's
+* something to recover.
+*/
+   if (pe->trace_entries) {
+   void **ptrs = (void **) pe->stack_trace;
+   int i;
+
+   pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
+ 

[PATCH 04/14] powerpc/eeh: Check slot presence state in eeh_handle_normal_event()

2019-09-03 Thread Oliver O'Halloran
When a device is surprise removed while undergoing IO we will probably
get an EEH PE freeze due to MMIO timeouts and other errors. When a freeze
is detected we send a recovery event to the EEH worker thread which will
notify drivers, and perform recovery as needed.

In the event of a hot-remove we don't want recovery to occur since there
isn't a device to recover. The recovery process is fairly long due to
the number of wait states (required by PCIe) which causes problems when
devices are removed and replaced (e.g. hot swapping of U.2 NVMe drives).

To determine if we need to skip the recovery process we can use the
get_adapter_state() operation of the hotplug_slot to determine if the
slot contains a device or not, and if the slot is empty we can skip
recovery entirely.

One thing to note is that the slot being EEH frozen does not prevent the
hotplug driver from working. We don't have the EEH recovery thread
remove any of the devices since it's assumed that the hotplug driver
will handle tearing down the slot state.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh_driver.c | 60 
 1 file changed, 60 insertions(+)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 18a69fac4d80..52ce7584af43 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -769,6 +770,46 @@ static void eeh_pe_cleanup(struct eeh_pe *pe)
}
 }
 
+/**
+ * eeh_slot_presence_check - Check if a device is still present in a slot
+ * @pdev: pci_dev to check
+ *
+ * This function may return a false positive if we can't determine the slot's
+ * presence state. This might happen for PCIe slots if the PE containing
+ * the upstream bridge is also frozen, or the bridge is part of the same PE
+ * as the device.
+ *
+ * This shouldn't happen often, but you might see it if you hotplug a PCIe
+ * switch.
+ */
+static bool eeh_slot_presence_check(struct pci_dev *pdev)
+{
+   const struct hotplug_slot_ops *ops;
+   struct pci_slot *slot;
+   u8 state;
+   int rc;
+
+   if (!pdev)
+   return false;
+
+   if (pdev->error_state == pci_channel_io_perm_failure)
+   return false;
+
+   slot = pdev->slot;
+   if (!slot || !slot->hotplug)
+   return true;
+
+   ops = slot->hotplug->ops;
+   if (!ops || !ops->get_adapter_status)
+   return true;
+
+   rc = ops->get_adapter_status(slot->hotplug, &state);
+   if (rc)
+   return true;
+
+   return !!state;
+}
+
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
  * @pe: EEH PE - which should not be used after we return, as it may
@@ -799,6 +840,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
enum pci_ers_result result = PCI_ERS_RESULT_NONE;
struct eeh_rmv_data rmv_data =
{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
+   int devices = 0;
 
bus = eeh_pe_bus_get(pe);
if (!bus) {
@@ -807,6 +849,23 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
return;
}
 
+   /*
+* When devices are hot-removed we might get an EEH due to
+* a driver attempting to touch the MMIO space of a removed
+* device. In this case we don't have a device to recover
+* so suppress the event if we can't find any present devices.
+*
+* The hotplug driver should take care of tearing down the
+* device itself.
+*/
+   eeh_for_each_pe(pe, tmp_pe)
+   eeh_pe_for_each_dev(tmp_pe, edev, tmp)
+   if (eeh_slot_presence_check(edev->pdev))
+   devices++;
+
+   if (!devices)
+   goto out; /* nothing to recover */
+
eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
if (pe->freeze_count > eeh_max_freezes) {
@@ -997,6 +1056,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
}
}
 
+out:
/*
* Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
 * we don't want to modify the PE tree structure so we do it here.
-- 
2.21.0



[PATCH 03/14] powerpc/eeh: Make permanently failed devices non-actionable

2019-09-03 Thread Oliver O'Halloran
If a device is torn down by a hotplug slot driver it's marked as removed
and marked as permanently failed. There's no point in trying to recover a
permanently failed device, so it should be considered un-actionable.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/kernel/eeh_driver.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 75266156943f..18a69fac4d80 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -96,8 +96,16 @@ static bool eeh_dev_removed(struct eeh_dev *edev)
 
 static bool eeh_edev_actionable(struct eeh_dev *edev)
 {
-   return (edev->pdev && !eeh_dev_removed(edev) &&
-   !eeh_pe_passed(edev->pe));
+   if (!edev->pdev)
+   return false;
+   if (edev->pdev->error_state == pci_channel_io_perm_failure)
+   return false;
+   if (eeh_dev_removed(edev))
+   return false;
+   if (eeh_pe_passed(edev->pe))
+   return false;
+
+   return true;
 }
 
 /**
-- 
2.21.0



[PATCH 02/14] powerpc/eeh: Fix race when freeing PDNs

2019-09-03 Thread Oliver O'Halloran
When hot-adding devices we rely on the hotplug driver to create pci_dn's
for the devices under the hotplug slot. Conversely, when hot-removing, the
driver will remove the pci_dn's that it created. This is a problem because
the pci_dev is still live until its refcount drops to zero. This can
happen if the driver is slow to tear down its internal state. Ideally, the
driver would not attempt to perform any config accesses to the device once
it's been marked as removed, but sometimes it happens. As a result, we
might attempt to access the pci_dn for a device that has been torn down,
and the kernel may crash.

To fix this, don't free the pci_dn unless the corresponding pci_dev has
been released.  If the pci_dev is still live, then we mark the pci_dn with
a flag that indicates the pci_dev's release function should free it.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/pci-bridge.h |  1 +
 arch/powerpc/kernel/pci-hotplug.c |  7 +++
 arch/powerpc/kernel/pci_dn.c  | 21 +++--
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 54a9de01c2a3..69f4cb3b7c56 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -183,6 +183,7 @@ struct iommu_table;
 struct pci_dn {
int flags;
 #define PCI_DN_FLAG_IOV_VF 0x01
+#define PCI_DN_FLAG_DEAD   0x02/* Device has been hot-removed */
 
int busno;  /* pci bus number */
int devfn;  /* pci device and function number */
diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index 0b0cf8168b47..fc62c4bc47b1 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -55,11 +55,18 @@ EXPORT_SYMBOL_GPL(pci_find_bus_by_node);
 void pcibios_release_device(struct pci_dev *dev)
 {
struct pci_controller *phb = pci_bus_to_host(dev->bus);
+   struct pci_dn *pdn = pci_get_pdn(dev);
 
eeh_remove_device(dev);
 
if (phb->controller_ops.release_device)
phb->controller_ops.release_device(dev);
+
+   /* free()ing the pci_dn has been deferred to us, do it now */
+   if (pdn && (pdn->flags & PCI_DN_FLAG_DEAD)) {
+   pci_dbg(dev, "freeing dead pdn\n");
+   kfree(pdn);
+   }
 }
 
 /**
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 5614ca0940ca..4e654df55969 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -320,6 +320,7 @@ void pci_remove_device_node_info(struct device_node *dn)
 {
struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
struct device_node *parent;
+   struct pci_dev *pdev;
 #ifdef CONFIG_EEH
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 
@@ -333,12 +334,28 @@ void pci_remove_device_node_info(struct device_node *dn)
WARN_ON(!list_empty(&pdn->child_list));
list_del(&pdn->list);
 
+   /* Drop the parent pci_dn's ref to our backing dt node */
parent = of_get_parent(dn);
if (parent)
of_node_put(parent);
 
-   dn->data = NULL;
-   kfree(pdn);
+   /*
+* At this point we *might* still have a pci_dev that was
+* instantiated from this pci_dn. So defer free()ing it until
+* the pci_dev's release function is called.
+*/
+   pdev = pci_get_domain_bus_and_slot(pdn->phb->global_number,
+   pdn->busno, pdn->devfn);
+   if (pdev) {
+   /* NB: pdev has a ref to dn */
+   pci_dbg(pdev, "marked pdn (from %pOF) as dead\n", dn);
+   pdn->flags |= PCI_DN_FLAG_DEAD;
+   } else {
+   dn->data = NULL;
+   kfree(pdn);
+   }
+
+   pci_dev_put(pdev);
 }
 EXPORT_SYMBOL_GPL(pci_remove_device_node_info);
 
-- 
2.21.0



[PATCH 01/14] powerpc/eeh: Clean up EEH PEs after recovery finishes

2019-09-03 Thread Oliver O'Halloran
When the last device in an eeh_pe is removed the eeh_pe structure itself
(and any empty parents) are freed since they are no longer needed. This
results in a crash when a hotplug driver is involved since the following
may occur:

1. Device is surprise removed.
2. Driver performs an MMIO, which fails and queues an eeh_event.
3. Hotplug driver receives a hotplug interrupt and removes any
   pci_devs that were under the slot.
4. pci_dev is torn down and the eeh_pe is freed.
5. The EEH event handler thread processes the eeh_event and crashes
   since the eeh_pe pointer in the eeh_event structure is no
   longer valid.

Crashing is generally considered poor form. Instead of doing that use
the fact PEs are marked as EEH_PE_INVALID to keep them around until the
end of the recovery cycle, at which point we can safely prune any empty
PEs.

Signed-off-by: Oliver O'Halloran 
---
Sam Bobroff is working on implementing proper refcounting for EEH PEs,
but that's not going to be ready for a while and this is less broken
than what we have now.
---
 arch/powerpc/kernel/eeh_driver.c | 36 ++--
 arch/powerpc/kernel/eeh_event.c  |  8 +++
 arch/powerpc/kernel/eeh_pe.c | 23 +++-
 3 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index a31cd32c4ce9..75266156943f 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -734,6 +734,33 @@ static int eeh_reset_device(struct eeh_pe *pe, struct 
pci_bus *bus,
  */
 #define MAX_WAIT_FOR_RECOVERY 300
 
+
+/* Walks the PE tree after processing an event to remove any stale PEs.
+ *
+ * NB: This needs to be recursive to ensure the leaf PEs get removed
+ * before their parents do. Although it is possible to do this
+ * iteratively, we don't since recursion is easier to read and we need
+ * to guarantee the leaf nodes will be handled first.
+ */
+static void eeh_pe_cleanup(struct eeh_pe *pe)
+{
+   struct eeh_pe *child_pe, *tmp;
+
+   list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
+   eeh_pe_cleanup(child_pe);
+
+   if (pe->state & EEH_PE_KEEP)
+   return;
+
+   if (!(pe->state & EEH_PE_INVALID))
+   return;
+
+   if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
+   list_del(&pe->child);
+   kfree(pe);
+   }
+}
+
 /**
  * eeh_handle_normal_event - Handle EEH events on a specific PE
  * @pe: EEH PE - which should not be used after we return, as it may
@@ -772,8 +799,6 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
return;
}
 
-   eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
-
eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
if (pe->freeze_count > eeh_max_freezes) {
@@ -963,6 +988,12 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
return;
}
}
+
+   /*
+* Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
+* we don't want to modify the PE tree structure so we do it here.
+*/
+   eeh_pe_cleanup(pe);
eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
 }
 
@@ -1035,6 +1066,7 @@ void eeh_handle_special_event(void)
 */
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
+   eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
eeh_handle_normal_event(pe);
} else {
pci_lock_rescan_remove();
diff --git a/arch/powerpc/kernel/eeh_event.c b/arch/powerpc/kernel/eeh_event.c
index 64cfbe41174b..e36653e5f76b 100644
--- a/arch/powerpc/kernel/eeh_event.c
+++ b/arch/powerpc/kernel/eeh_event.c
@@ -121,6 +121,14 @@ int __eeh_send_failure_event(struct eeh_pe *pe)
}
event->pe = pe;
 
+   /*
+* Mark the PE as recovering before inserting it in the queue.
+* This prevents the PE from being free()ed by a hotplug driver
+* while the PE is sitting in the event queue.
+*/
+   if (pe)
+   eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
+
/* We may or may not be called in an interrupt context */
spin_lock_irqsave(&eeh_eventlist_lock, flags);
list_add(&event->list, &eeh_eventlist);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 1a6254bcf056..177852e39a25 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -470,6 +470,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 {
struct eeh_pe *pe, *parent, *child;
+   bool keep, recover;
int cnt;
 
pe = eeh_dev_to_pe(edev);
@@ -490,10 +491,21 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
 */
while (1) {
parent = pe->parent;
+
+   /* PHB PEs should never be removed */
if 

EEH + hotplug fixes

2019-09-03 Thread Oliver O'Halloran
Fixes various random crashes and other bad behaviour when hot
pluggable slots are used with EEH, namely:

1) Random crashes due to eeh_pe and pci_dn lifecycle mis-management
2) Hotplug slots tearing down devices you are trying to recover due
   to the reset that occurs while recovering a PE / bus.
3) Hot-remove causing spurious EEH events.

And some others. This series also enables pnv_php on Power9
since various people were carrying around hacks to make it
work and with the above fixes it seems to be fairly stable
now.

The series also adds the beginnings of a platform-independent test
infrastructure for EEH and a selftest script that exercises the
basic recovery path.

Oliver




[mainline][BUG][PPC][btrfs][bisected 00801a] kernel BUG at fs/btrfs/locking.c:71!

2019-09-03 Thread Abdul Haleem
Greeting's

Mainline kernel panics with LTP/fs_fill-dir tests for btrfs file system on my 
P9 box running mainline kernel 5.3.0-rc5

BUG_ON was first introduced by below commit

commit 00801ae4bb2be5f5af46502ef239ac5f4b536094
Author: David Sterba 
Date:   Thu May 2 16:53:47 2019 +0200

btrfs: switch extent_buffer write_locks from atomic to int

The write_locks is either 0 or 1 and always updated under the lock,
so we don't need the atomic_t semantics.

Reviewed-by: Nikolay Borisov 
Signed-off-by: David Sterba 

diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 2706676279..98fccce420 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -58,17 +58,17 @@ static void btrfs_assert_tree_read_locked(struct
extent_buffer *eb)
 
 static void btrfs_assert_tree_write_locks_get(struct extent_buffer *eb)
 {
-   atomic_inc(&eb->write_locks);
+   eb->write_locks++;
 }
 
 static void btrfs_assert_tree_write_locks_put(struct extent_buffer *eb)
 {
-   atomic_dec(&eb->write_locks);
+   eb->write_locks--;
 }
 
 void btrfs_assert_tree_locked(struct extent_buffer *eb)
 {
-   BUG_ON(!atomic_read(&eb->write_locks));
+   BUG_ON(!eb->write_locks);
 }
 

tests logs:
avocado-misc-tests/io/disk/ltp_fs.py:LtpFs.test_fs_run;fs_fill-dir-ext3-61cd:  
[ 3376.022096] EXT4-fs (nvme0n1): mounting ext3 file system using the ext4 
subsystem
EXT4-fs (nvme0n1): mounted filesystem with ordered data mode. Opts: (null)
EXT4-fs (loop1): mounting ext2 file system using the ext4 subsystem
EXT4-fs (loop1): mounted filesystem without journal. Opts: (null)
EXT4-fs (loop1): mounting ext3 file system using the ext4 subsystem
EXT4-fs (loop1): mounted filesystem with ordered data mode. Opts: (null)
EXT4-fs (loop1): mounted filesystem with ordered data mode. Opts: (null)
XFS (loop1): Mounting V5 Filesystem
XFS (loop1): Ending clean mount
XFS (loop1): Unmounting Filesystem
BTRFS: device fsid 7c08f81b-6642-4a06-9182-2884e80d56ee devid 1 transid 5 
/dev/loop1
BTRFS info (device loop1): disk space caching is enabled
BTRFS info (device loop1): has skinny extents
BTRFS info (device loop1): enabling ssd optimizations
BTRFS info (device loop1): creating UUID tree
[ cut here ]
kernel BUG at fs/btrfs/locking.c:71!
Oops: Exception in kernel mode, sig: 5 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in: fuse(E) vfat(E) fat(E) btrfs(E) xor(E)
zstd_decompress(E) zstd_compress(E) raid6_pq(E) xfs(E) raid0(E)
linear(E) dm_round_robin(E) dm_queue_length(E) dm_service_time(E)
dm_multipath(E) loop(E) rpadlpar_io(E) rpaphp(E) lpfc(E) bnx2x(E)
xt_CHECKSUM(E) xt_MASQUERADE(E) tun(E) bridge(E) stp(E) llc(E) kvm_pr(E)
kvm(E) tcp_diag(E) udp_diag(E) inet_diag(E) unix_diag(E)
af_packet_diag(E) netlink_diag(E) ip6t_rpfilter(E) ipt_REJECT(E)
nf_reject_ipv4(E) ip6t_REJECT(E) nf_reject_ipv6(E) xt_conntrack(E)
ip_set(E) nfnetlink(E) ebtable_nat(E) ebtable_broute(E) ip6table_nat(E)
ip6table_mangle(E) ip6table_security(E) ip6table_raw(E) iptable_nat(E)
nf_nat(E) nf_conntrack(E) nf_defrag_ipv6(E) nf_defrag_ipv4(E)
iptable_mangle(E) iptable_security(E) iptable_raw(E) ebtable_filter(E)
ebtables(E) ip6table_filter(E) ip6_tables(E) iptable_filter(E) sunrpc(E)
raid10(E) xts(E) pseries_rng(E) vmx_crypto(E) sg(E) uio_pdrv_genirq(E)
uio(E) binfmt_misc(E) sch_fq_codel(E) ip_tables(E)
 ext4(E) mbcache(E) jbd2(E) sr_mod(E) cdrom(E) sd_mod(E) ibmvscsi(E)
scsi_transport_srp(E) ibmveth(E) nvmet_fc(E) nvmet(E) nvme_fc(E)
nvme_fabrics(E) scsi_transport_fc(E) mdio(E) libcrc32c(E) ptp(E)
pps_core(E) nvme(E) nvme_core(E) dm_mirror(E) dm_region_hash(E)
dm_log(E) dm_mod(E) [last unloaded: lpfc]
CPU: 14 PID: 1803 Comm: kworker/u32:8 Tainted: GE 
5.3.0-rc5-autotest-autotest #1
Workqueue: btrfs-endio-write btrfs_endio_write_helper [btrfs]
NIP:  c0080164dd70 LR: c0080164df00 CTR: c0a817a0
REGS: c260b5d0 TRAP: 0700   Tainted: GE  
(5.3.0-rc5-autotest-autotest)
MSR:  800102029033   CR: 22444082  XER: 

CFAR: c0080164defc IRQMASK: 0
GPR00: c008015c55f4 c260b860 c00801703b00 c00267a29af0
GPR04:  0001  
GPR08:  0001  0004
GPR12: 4000 c0001ec58e00  
GPR16: 0001 0004 0001 0001
GPR20:  0001 3e0f83e1 c0025a7cbef0
GPR24: c260ba26 4000 c14a26e8 0003
GPR28: 0004 c0025f2010a0 c00267a29af0 
NIP [c0080164dd70] btrfs_assert_tree_locked+0x10/0x20 [btrfs]
LR [c0080164df00] btrfs_set_lock_blocking_write+0x60/0x100 [btrfs]
Call Trace:
[c260b860] [c260b8e0] 0xc260b8e0 (unreliable)
[c260b890] [c008015c55f4] 

Re: [PATCH v2 2/9] x86: numa: check the node id consistently for x86

2019-09-03 Thread Yunsheng Lin
On 2019/9/3 15:11, Peter Zijlstra wrote:
> On Tue, Sep 03, 2019 at 02:19:04PM +0800, Yunsheng Lin wrote:
>> On 2019/9/2 20:56, Peter Zijlstra wrote:
>>> On Mon, Sep 02, 2019 at 08:25:24PM +0800, Yunsheng Lin wrote:
 On 2019/9/2 15:25, Peter Zijlstra wrote:
> On Mon, Sep 02, 2019 at 01:46:51PM +0800, Yunsheng Lin wrote:
>> On 2019/9/1 0:12, Peter Zijlstra wrote:
>
>>> 1) because even if it is not set, the device really does belong to a node.
>>> It is impossible that a device will have magic uniform access to memory when
>>> CPUs cannot.
>>
>> So it means dev_to_node() will return either NUMA_NO_NODE or a
>> valid node id?
>
> NUMA_NO_NODE := -1, which is not a valid node number. It is also, like I
> said, not a valid device location on a NUMA system.
>
> Just because ACPI/BIOS is shit, doesn't mean the device doesn't have a
> node association. It just means we don't know and might have to guess.

 How do we guess the device's location when ACPI/BIOS does not set it?
>>>
>>> See device_add(), it looks to the device's parent and on NO_NODE, puts
>>> it there.
>>>
>>> Lacking any hints, just stick it to node0 and print a FW_BUG or
>>> something.
>>>
 It seems dev_to_node() does not do anything about that and leave the
 job to the caller or whatever function that get called with its return
 value, such as cpumask_of_node().
>>>
>>> Well, dev_to_node() doesn't do anything; nor should it. It are the
>>> callers of set_dev_node() that should be taking care.
>>>
>>> Also note how device_add() sets the device node to the parent device's
>>> node on NUMA_NO_NODE. Arguably we should change it to complain when it
>>> finds NUMA_NO_NODE and !parent.
>>
>> Is it possible that the node id set by device_add() becomes invalid
>> if the node is offlined, so that dev_to_node() may return an invalid
>> node id?
> 
> In that case I would expect the device to go away too. Once the memory
> controller goes away, the PCI bus connected to it cannot continue to
> function.

Ok. To summarize the discussion, in order for me to understand it
correctly:

1) Make sure device_add() sets the default node0 for a device if
   ACPI/BIOS does not set the node id and it has no parent device.

2) Use '(unsigned)node_id >= nr_node_ids' to fix the
   CONFIG_DEBUG_PER_CPU_MAPS version of cpumask_of_node() for x86
   and arm64; x86 already has a fix from you, and a patch for arm64
   is also needed.

3) Maybe fix other sign bugs in node id checking throughout the
   kernel using '(unsigned)node_id >= nr_node_ids'.

Please check whether I understand it correctly or missed something.
Maybe I can begin by sending a patch for item one to see if everyone
is ok with the idea?
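For item 2), the shape of the check I have in mind is something like the
below (a sketch modeled on the x86 CONFIG_DEBUG_PER_CPU_MAPS variant of
cpumask_of_node(); the arm64 version would mirror it):

const struct cpumask *cpumask_of_node(int node)
{
	/*
	 * A negative node id wraps to a huge unsigned value, so a
	 * single unsigned compare catches both underflow and overflow.
	 */
	if ((unsigned int)node >= nr_node_ids) {
		pr_warn("cpumask_of_node(%d): node >= nr_node_ids(%u)\n",
			node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	return node_to_cpumask_map[node];
}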


> 
>> From the comment in select_fallback_rq(), it seems that a node can
>> be offlined, not sure if node offline process has taken cared of that?
>>
>>  /*
>>  * If the node that the CPU is on has been offlined, cpu_to_node()
>>  * will return -1. There is no CPU on the node, and we should
>>  * select the CPU on the other node.
>>  */
> 
> Ugh, so I disagree with that notion. cpu_to_node() mapping should be
> fixed, you simply cannot change it after boot, too much stuff relies on
> it.
> 
> Setting cpu_to_node to -1 on node offline is just wrong. But alas, it
> seems this is already so.




[PATCH 1/1] mm/pgtable/debug: Add test validating architecture page table helpers

2019-09-03 Thread Anshuman Khandual
This adds a test module which will validate architecture page table helpers
and accessors regarding compliance with generic MM semantics expectations.
This will help various architectures in validating changes to the existing
page table helpers or addition of new ones.

The test page table and the memory pages backing its entries at various
levels are all allocated from system memory with the required alignments.
If memory pages with the required size and alignment cannot be allocated,
then all the depending individual tests are skipped.
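A minimal sketch of that skip behaviour, assuming a hypothetical per-level
test helper and the pud_aligned/pmd_aligned flags declared in the patch
below:

static void pud_basic_tests(struct page *page, pgprot_t prot)
{
	/* No PUD-sized, suitably aligned allocation: skip this level */
	if (!pud_aligned)
		return;

	/* ... PUD-level transformation checks would go here ... */
}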

Cc: Andrew Morton 
Cc: Vlastimil Babka 
Cc: Greg Kroah-Hartman 
Cc: Thomas Gleixner 
Cc: Mike Rapoport 
Cc: Jason Gunthorpe 
Cc: Dan Williams 
Cc: Peter Zijlstra 
Cc: Michal Hocko 
Cc: Mark Rutland 
Cc: Mark Brown 
Cc: Steven Price 
Cc: Ard Biesheuvel 
Cc: Masahiro Yamada 
Cc: Kees Cook 
Cc: Tetsuo Handa 
Cc: Matthew Wilcox 
Cc: Sri Krishna chowdary 
Cc: Dave Hansen 
Cc: Russell King - ARM Linux 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Cc: "David S. Miller" 
Cc: Vineet Gupta 
Cc: James Hogan 
Cc: Paul Burton 
Cc: Ralf Baechle 
Cc: linux-snps-...@lists.infradead.org
Cc: linux-m...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-i...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org
Cc: linux...@vger.kernel.org
Cc: sparcli...@vger.kernel.org
Cc: x...@kernel.org
Cc: linux-ker...@vger.kernel.org

Suggested-by: Catalin Marinas 
Signed-off-by: Anshuman Khandual 
---
 mm/Kconfig.debug   |  14 ++
 mm/Makefile|   1 +
 mm/arch_pgtable_test.c | 425 +
 3 files changed, 440 insertions(+)
 create mode 100644 mm/arch_pgtable_test.c

diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 327b3ebf23bf..ce9c397f7b07 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -117,3 +117,17 @@ config DEBUG_RODATA_TEST
 depends on STRICT_KERNEL_RWX
 ---help---
   This option enables a testcase for the setting rodata read-only.
+
+config DEBUG_ARCH_PGTABLE_TEST
+   bool "Test arch page table helpers for semantics compliance"
+   depends on MMU
+   depends on DEBUG_KERNEL
+   help
+ This option provides a kernel module which can be used to test
+ architecture page table helper functions on various platforms,
+ verifying that they comply with expected generic MM semantics.
+ This will help architecture code in making sure that any changes
+ or new additions to these helpers still conform to the expected
+ generic MM semantics.
+
+ If unsure, say N.
diff --git a/mm/Makefile b/mm/Makefile
index d996846697ef..bb572c5aa8c5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_ARCH_PGTABLE_TEST) += arch_pgtable_test.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/arch_pgtable_test.c b/mm/arch_pgtable_test.c
new file mode 100644
index ..f15be8a73723
--- /dev/null
+++ b/mm/arch_pgtable_test.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This kernel module validates architecture page table helpers &
+ * accessors and helps in verifying their continued compliance with
+ * generic MM semantics.
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ *
+ * Author: Anshuman Khandual 
+ */
+#define pr_fmt(fmt) "arch_pgtable_test: %s " fmt, __func__
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Basic operations
+ *
+ * mkold(entry)= An old and not a young entry
+ * mkyoung(entry)  = A young and not an old entry
+ * mkdirty(entry)  = A dirty and not a clean entry
+ * mkclean(entry)  = A clean and not a dirty entry
+ * mkwrite(entry)  = A write and not a write protected entry
+ * wrprotect(entry)= A write protected and not a write entry
+ * pxx_bad(entry)  = A mapped and non-table entry
+ * pxx_same(entry1, entry2)= Both entries hold the exact same value
+ */
+#define VADDR_TEST (PGDIR_SIZE + PUD_SIZE + PMD_SIZE + PAGE_SIZE)
+#define VMA_TEST_FLAGS (VM_READ|VM_WRITE|VM_EXEC)
+#define RANDOM_NZVALUE (0xbe)
+
+static bool pud_aligned;
+static bool pmd_aligned;
+
+extern struct mm_struct *mm_alloc(void);
+
+static void pte_basic_tests(struct page *page, pgprot_t prot)
+{
+   pte_t pte = mk_pte(page, prot);
+
+   WARN_ON(!pte_same(pte, pte));
+   WARN_ON(!pte_young(pte_mkyoung(pte)));
+   WARN_ON(!pte_dirty(pte_mkdirty(pte)));
+   WARN_ON(!pte_write(pte_mkwrite(pte)));
+   

[PATCH 0/1] mm/debug: Add tests for architecture exported page table helpers

2019-09-03 Thread Anshuman Khandual
This series adds a validation test for architecture-exported page table
helpers. The patch in the series adds basic transformation tests at various
levels of the page table.

This test was originally suggested by Catalin during the earlier arm64 THP
migration RFC discussion. Going forward it can include more specific tests
for various generic MM functions like THP and HugeTLB, as well as
platform-specific tests.

https://lore.kernel.org/linux-mm/20190628102003.ga56...@arrakis.emea.arm.com/

Questions:

Should alloc_gigantic_page() be made available as an interface for general
use in the kernel? The test module here uses a very similar implementation to
HugeTLB's to allocate a PUD-aligned memory block. The same applies to
mm_alloc(), which needs to be exported through a header.

Matthew Wilcox had expressed concerns regarding memory allocation for mapped
page table entries at the various levels. He also suggested using synthetic
pfns, which can be derived from the virtual address of a known kernel text
symbol like kernel_init(). But as discussed previously, allocated memory
still seems to outweigh synthetic pfns. This proposal goes with allocated
memory, but if there is broader agreement on synthetic pfns, I will be happy
to rework the test.
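
For reference, a minimal sketch of the synthetic-pfn alternative (not
what this series does; _stext from <asm/sections.h> stands in for the
kernel_init() suggestion, any resident kernel text symbol would do):

/* Derive a pfn from kernel text instead of allocating backing memory. */
static unsigned long synthetic_pfn(void)
{
	return __pa_symbol(_stext) >> PAGE_SHIFT;
}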

Testing:

Build and boot tested on arm64 and x86 platforms. While arm64 passes all
of these tests, the following errors were reported on x86:

1. WARN_ON(pud_bad(pud)) in pud_populate_tests()
2. WARN_ON(p4d_bad(p4d)) in p4d_populate_tests()

I would really appreciate it if folks could help validate this test on other
platforms and report back any problems. Suggestions, comments and inputs
welcome. Thank you.

Changes in V3:

- Added fallback mechanism for PMD aligned memory allocation failure

Changes in V2:

https://lore.kernel.org/linux-mm/1565335998-22553-1-git-send-email-anshuman.khand...@arm.com/T/#u

- Moved the test module and its config from lib/ to mm/
- Renamed config TEST_ARCH_PGTABLE as DEBUG_ARCH_PGTABLE_TEST
- Renamed file from test_arch_pgtable.c to arch_pgtable_test.c
- Added relevant MODULE_DESCRIPTION() and MODULE_AUTHOR() details
- Dropped loadable module config option
- Basic tests now use memory blocks with required size and alignment
- PUD aligned memory block gets allocated with alloc_contig_range()
- If PUD aligned memory could not be allocated, it falls back on a PMD aligned
  memory block from the page allocator and the pud_* tests are skipped (see
  the sketch after this list)
- Clear and populate tests now operate on real in-memory page table entries
- Dummy mm_struct gets allocated with mm_alloc()
- Dummy page table entries get allocated with [pud|pmd|pte]_alloc_[map]()
- Simplified [p4d|pgd]_basic_tests(), now has random values in the entries
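
As a rough illustration of that fallback (alloc_pud_block() is a
hypothetical stand-in for the alloc_contig_range()-based path borrowed
from HugeTLB's alloc_gigantic_page()):

/*
 * A PUD-order block exceeds MAX_ORDER, so the buddy allocator cannot
 * provide it directly. On failure, fall back to a PMD-order block,
 * which the buddy allocator naturally aligns, and skip the pud_* tests.
 */
static struct page *alloc_test_block(bool *pud_aligned)
{
	struct page *page = alloc_pud_block();	/* hypothetical */

	*pud_aligned = page != NULL;
	if (!page)
		page = alloc_pages(GFP_KERNEL | __GFP_ZERO,
				   PMD_SHIFT - PAGE_SHIFT);
	return page;
}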

RFC V1:

https://lore.kernel.org/linux-mm/1564037723-26676-1-git-send-email-anshuman.khand...@arm.com/

Cc: Andrew Morton 
Cc: Vlastimil Babka 
Cc: Greg Kroah-Hartman 
Cc: Thomas Gleixner 
Cc: Mike Rapoport 
Cc: Jason Gunthorpe 
Cc: Dan Williams 
Cc: Peter Zijlstra 
Cc: Michal Hocko 
Cc: Mark Rutland 
Cc: Mark Brown 
Cc: Steven Price 
Cc: Ard Biesheuvel 
Cc: Masahiro Yamada 
Cc: Kees Cook 
Cc: Tetsuo Handa 
Cc: Matthew Wilcox 
Cc: Sri Krishna chowdary 
Cc: Dave Hansen 
Cc: Russell King - ARM Linux 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Cc: "David S. Miller" 
Cc: Vineet Gupta 
Cc: James Hogan 
Cc: Paul Burton 
Cc: Ralf Baechle 
Cc: linux-snps-...@lists.infradead.org
Cc: linux-m...@vger.kernel.org
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-i...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-s...@vger.kernel.org
Cc: linux...@vger.kernel.org
Cc: sparcli...@vger.kernel.org
Cc: x...@kernel.org
Cc: linux-ker...@vger.kernel.org

Anshuman Khandual (1):
  mm/pgtable/debug: Add test validating architecture page table helpers

 mm/Kconfig.debug   |  14 ++
 mm/Makefile|   1 +
 mm/arch_pgtable_test.c | 425 +
 3 files changed, 440 insertions(+)
 create mode 100644 mm/arch_pgtable_test.c

-- 
2.20.1



[PATCH] x86/mm: Fix cpumask_of_node() error condition

2019-09-03 Thread Peter Zijlstra
On Mon, Sep 02, 2019 at 08:17:31PM +0200, Ingo Molnar wrote:

> Nitpicking: please also fix the kernel message to say ">=".

Full patch below.

---
Subject: x86/mm: Fix cpumask_of_node() error condition

When CONFIG_DEBUG_PER_CPU_MAPS is set, we validate that the @node argument
of cpumask_of_node() is a valid node_id. However, the check forgets about
negative numbers. Fix this by explicitly casting to unsigned:

  (unsigned)node >= nr_node_ids

verifies: 0 <= node < nr_node_ids

Also amend the error message to match the condition.

Acked-by: Ingo Molnar 
Signed-off-by: Peter Zijlstra (Intel) 
---
 arch/x86/mm/numa.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index e6dad600614c..4123100e0eaf 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -861,9 +861,9 @@ void numa_remove_cpu(int cpu)
  */
 const struct cpumask *cpumask_of_node(int node)
 {
-   if (node >= nr_node_ids) {
+   if ((unsigned)node >= nr_node_ids) {
printk(KERN_WARNING
-   "cpumask_of_node(%d): node > nr_node_ids(%u)\n",
+   "cpumask_of_node(%d): (unsigned)node >= 
nr_node_ids(%u)\n",
node, nr_node_ids);
dump_stack();
return cpu_none_mask;


Re: [PATCH v2 4/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-09-03 Thread Christophe Leroy




On 03/09/2019 08:25, Alastair D'Silva wrote:

On Tue, 2019-09-03 at 08:19 +0200, Christophe Leroy wrote:


On 03/09/2019 07:23, Alastair D'Silva wrote:

From: Alastair D'Silva 

When presented with large amounts of memory being hotplugged
(in my test case, ~890GB), the call to flush_dcache_range takes
a while (~50 seconds), triggering RCU stalls.

This patch breaks up the call into 1GB chunks, calling
cond_resched() in between to allow the scheduler to run.

Signed-off-by: Alastair D'Silva 
---
   arch/powerpc/mm/mem.c | 18 --
   1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index cd540123874d..854aaea2c6ae 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -104,11 +104,14 @@ int __weak remove_section_mapping(unsigned
long start, unsigned long end)
return -ENODEV;
   }
   
+#define FLUSH_CHUNK_SIZE SZ_1G


Maybe the name is a bit long for a local define. See if we could
reduce
code line splits below by shortening this name.


+
   int __ref arch_add_memory(int nid, u64 start, u64 size,
struct mhp_restrictions *restrictions)
   {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+   u64 i;
int rc;
   
   	resize_hpt_for_hotplug(memblock_phys_mem_size());

@@ -120,7 +123,12 @@ int __ref arch_add_memory(int nid, u64 start,
u64 size,
start, start + size, rc);
return -EFAULT;
}
-   flush_dcache_range(start, start + size);
+
+   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
+   flush_dcache_range(start + i,
+  min(start + size, start + i +
FLUSH_CHUNK_SIZE));


My eyes don't like it.

What about
for (; i < size; i += FLUSH_CHUNK_SIZE) {
int len = min(size - i, FLUSH_CHUNK_SIZE);

flush_dcache_range(start + i, start + i + len);
cond_resched();
}

or

end = start + size;
for (; start < end; start += FLUSH_CHUNK_SIZE, size -=
FLUSH_CHUNK_SIZE) {
int len = min(size, FLUSH_CHUNK_SIZE);

flush_dcache_range(start, start + len);
cond_resched();
}


+   cond_resched();
+   }
   
   	return __add_pages(nid, start_pfn, nr_pages, restrictions);

   }
@@ -131,13 +139,19 @@ void __ref arch_remove_memory(int nid, u64
start, u64 size,
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
struct page *page = pfn_to_page(start_pfn) +
vmem_altmap_offset(altmap);
+   u64 i;
int ret;
   
   	__remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
   
   	/* Remove htab bolted mappings for this section of memory */

start = (unsigned long)__va(start);
-   flush_dcache_range(start, start + size);
+   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
+   flush_dcache_range(start + i,
+  min(start + size, start + i +
FLUSH_CHUNK_SIZE));
+   cond_resched();
+   }
+


This piece of code looks pretty similar to the one before. Can we
refactor into a small helper ?



Not much point, it's removed in a subsequent patch.



But you tell me that you are leaving people the option of not applying
that subsequent patch, and that's the reason you didn't put that patch
before this one. In that case, adding a helper is worth it.


Christophe
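
For what it's worth, a possible shape for such a helper (the name and
the chunk parameter are illustrative, not from the posted series):

static void flush_dcache_range_chunked(unsigned long start, unsigned long size,
				       unsigned long chunk)
{
	unsigned long i;

	for (i = 0; i < size; i += chunk) {
		/* Flush at most one chunk, then give the scheduler a chance. */
		flush_dcache_range(start + i, start + i + min(chunk, size - i));
		cond_resched();
	}
}

Both call sites would then collapse to a single
flush_dcache_range_chunked(start, size, FLUSH_CHUNK_SIZE) call.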


Re: [PATCH v2 2/9] x86: numa: check the node id consistently for x86

2019-09-03 Thread Peter Zijlstra
On Tue, Sep 03, 2019 at 02:19:04PM +0800, Yunsheng Lin wrote:
> On 2019/9/2 20:56, Peter Zijlstra wrote:
> > On Mon, Sep 02, 2019 at 08:25:24PM +0800, Yunsheng Lin wrote:
> >> On 2019/9/2 15:25, Peter Zijlstra wrote:
> >>> On Mon, Sep 02, 2019 at 01:46:51PM +0800, Yunsheng Lin wrote:
>  On 2019/9/1 0:12, Peter Zijlstra wrote:
> >>>
> > 1) because even if it is not set, the device really does belong to a node.
> > It is impossible for a device to have magic uniform access to memory when
> > CPUs cannot.
> 
>  So it means dev_to_node() will return either NUMA_NO_NODE or a
>  valid node id?
> >>>
> >>> NUMA_NO_NODE := -1, which is not a valid node number. It is also, like I
> >>> said, not a valid device location on a NUMA system.
> >>>
> >>> Just because ACPI/BIOS is shit, doesn't mean the device doesn't have a
> >>> node association. It just means we don't know and might have to guess.
> >>
> >> How do we guess the device's location when ACPI/BIOS does not set it?
> > 
> > See device_add(), it looks to the device's parent and on NO_NODE, puts
> > it there.
> > 
> > Lacking any hints, just stick it to node0 and print a FW_BUG or
> > something.
> > 
> >> It seems dev_to_node() does not do anything about that and leaves the
> >> job to the caller or whatever function gets called with its return
> >> value, such as cpumask_of_node().
> > 
> > Well, dev_to_node() doesn't do anything; nor should it. It is the
> > callers of set_dev_node() that should be taking care.
> > 
> > Also note how device_add() sets the device node to the parent device's
> > node on NUMA_NO_NODE. Arguably we should change it to complain when it
> > finds NUMA_NO_NODE and !parent.
> 
> Is it possible that the node id set by device_add() becomes invalid
> if the node is offlined, and that dev_to_node() may then return an
> invalid node id?

In that case I would expect the device to go away too. Once the memory
controller goes away, the PCI bus connected to it cannot continue to
function.

> From the comment in select_fallback_rq(), it seems that a node can
> be offlined; not sure if the node offline process has taken care of that?
> 
>   /*
>  * If the node that the CPU is on has been offlined, cpu_to_node()
>  * will return -1. There is no CPU on the node, and we should
>  * select the CPU on the other node.
>  */

Ugh, so I disagree with that notion. cpu_to_node() mapping should be
fixed, you simply cannot change it after boot, too much stuff relies on
it.

Setting cpu_to_node to -1 on node offline is just wrong. But alas, it
seems this is already so.

> With the above assumption that a device is always on a valid node,
> can the node id returned from dev_to_node() be safely passed to
> cpumask_of_node() without any checking?
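
As a hedged sketch of a defensive call site combining both points above
(illustrative only; using cpu_online_mask as the NUMA_NO_NODE fallback
is just one possible policy):

static const struct cpumask *dev_cpumask(struct device *dev)
{
	int node = dev_to_node(dev);

	/* No affinity information: any online CPU will do. */
	if (node == NUMA_NO_NODE)
		return cpu_online_mask;

	return cpumask_of_node(node);
}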




Re: [PATCH v2 6/6] powerpc: Don't flush caches when adding memory

2019-09-03 Thread Alastair D'Silva
On Tue, 2019-09-03 at 08:23 +0200, Christophe Leroy wrote:
> 
> On 03/09/2019 07:24, Alastair D'Silva wrote:
> > From: Alastair D'Silva 
> > 
> > This operation takes a significant amount of time when hotplugging
> > large amounts of memory (~50 seconds with 890GB of persistent
> > memory).
> > 
> > This was originally in commit fb5924fddf9e
> > ("powerpc/mm: Flush cache on memory hot(un)plug") to support
> > memtrace,
> > but the flush on add is not needed as it is flushed on remove.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/mm/mem.c | 7 ---
> >   1 file changed, 7 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> > index 854aaea2c6ae..2a14b5b93e19 100644
> > --- a/arch/powerpc/mm/mem.c
> > +++ b/arch/powerpc/mm/mem.c
> > @@ -111,7 +111,6 @@ int __ref arch_add_memory(int nid, u64 start,
> > u64 size,
> >   {
> > unsigned long start_pfn = start >> PAGE_SHIFT;
> > unsigned long nr_pages = size >> PAGE_SHIFT;
> > -   u64 i;
> > int rc;
> >   
> > resize_hpt_for_hotplug(memblock_phys_mem_size());
> > @@ -124,12 +123,6 @@ int __ref arch_add_memory(int nid, u64 start,
> > u64 size,
> > return -EFAULT;
> > }
> >   
> > -   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> > -   flush_dcache_range(start + i,
> > -  min(start + size, start + i +
> > FLUSH_CHUNK_SIZE));
> > -   cond_resched();
> > -   }
> > -
> 
> So you are removing the code you added in patch 4. Why not move this
> one 
> before patch 4 ?
> 

I put them in this order so that if someone did want the flushes in
arch_add_memory, they could drop the later patch, but not trigger RCU
stalls.

-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



Re: [PATCH v2 4/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-09-03 Thread Alastair D'Silva
On Tue, 2019-09-03 at 08:19 +0200, Christophe Leroy wrote:
> 
> On 03/09/2019 07:23, Alastair D'Silva wrote:
> > From: Alastair D'Silva 
> > 
> > When presented with large amounts of memory being hotplugged
> > (in my test case, ~890GB), the call to flush_dcache_range takes
> > a while (~50 seconds), triggering RCU stalls.
> > 
> > This patch breaks up the call into 1GB chunks, calling
> > cond_resched() in between to allow the scheduler to run.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >   arch/powerpc/mm/mem.c | 18 --
> >   1 file changed, 16 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> > index cd540123874d..854aaea2c6ae 100644
> > --- a/arch/powerpc/mm/mem.c
> > +++ b/arch/powerpc/mm/mem.c
> > @@ -104,11 +104,14 @@ int __weak remove_section_mapping(unsigned
> > long start, unsigned long end)
> > return -ENODEV;
> >   }
> >   
> > +#define FLUSH_CHUNK_SIZE SZ_1G
> 
> Maybe the name is a bit long for a local define. See if we could
> reduce 
> code line splits below by shortening this name.
> 
> > +
> >   int __ref arch_add_memory(int nid, u64 start, u64 size,
> > struct mhp_restrictions *restrictions)
> >   {
> > unsigned long start_pfn = start >> PAGE_SHIFT;
> > unsigned long nr_pages = size >> PAGE_SHIFT;
> > +   u64 i;
> > int rc;
> >   
> > resize_hpt_for_hotplug(memblock_phys_mem_size());
> > @@ -120,7 +123,12 @@ int __ref arch_add_memory(int nid, u64 start,
> > u64 size,
> > start, start + size, rc);
> > return -EFAULT;
> > }
> > -   flush_dcache_range(start, start + size);
> > +
> > +   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> > +   flush_dcache_range(start + i,
> > +  min(start + size, start + i +
> > FLUSH_CHUNK_SIZE));
> 
> My eyes don't like it.
> 
> What about
>   for (; i < size; i += FLUSH_CHUNK_SIZE) {
>   int len = min(size - i, FLUSH_CHUNK_SIZE);
> 
>   flush_dcache_range(start + i, start + i + len);
>   cond_resched();
>   }
> 
> or
> 
>   end = start + size;
>   for (; start < end; start += FLUSH_CHUNK_SIZE, size -=
> FLUSH_CHUNK_SIZE) {
>   int len = min(size, FLUSH_CHUNK_SIZE);
> 
>   flush_dcache_range(start, start + len);
>   cond_resched();
>   }
> 
> > +   cond_resched();
> > +   }
> >   
> > return __add_pages(nid, start_pfn, nr_pages, restrictions);
> >   }
> > @@ -131,13 +139,19 @@ void __ref arch_remove_memory(int nid, u64
> > start, u64 size,
> > unsigned long start_pfn = start >> PAGE_SHIFT;
> > unsigned long nr_pages = size >> PAGE_SHIFT;
> > struct page *page = pfn_to_page(start_pfn) +
> > vmem_altmap_offset(altmap);
> > +   u64 i;
> > int ret;
> >   
> > __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
> >   
> > /* Remove htab bolted mappings for this section of memory */
> > start = (unsigned long)__va(start);
> > -   flush_dcache_range(start, start + size);
> > +   for (i = 0; i < size; i += FLUSH_CHUNK_SIZE) {
> > +   flush_dcache_range(start + i,
> > +  min(start + size, start + i +
> > FLUSH_CHUNK_SIZE));
> > +   cond_resched();
> > +   }
> > +
> 
> This piece of code looks pretty similar to the one before. Can we 
> refactor into a small helper ?
> 

Not much point, it's removed in a subsequent patch.

-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819


