[PATCH v2 0/6] Memory corruption may occur due to incorrect TLB flush

2020-03-03 Thread Santosh Sivaraj
The TLB flush optimisation (a46cc7a90f: powerpc/mm/radix: Improve TLB/PWC
flushes) may result in random memory corruption. Any concurrent page-table walk
could end up with a Use-after-Free. Even on UP this might give issues, since
mmu_gather is preemptible these days. An interrupt or preempted task accessing
user pages might stumble into the free page if the hardware caches page
directories.

The series is a backport of the fix sent by Peter [1].

The first three patches are dependencies for the last patch (avoid potential
double flush). If the performance impact due to double flush is considered
trivial then the first three patches and last patch may be dropped.

This is only for v4.19 stable.

[1] https://patchwork.kernel.org/cover/11284843/

Changelog:
* Send the patches with the correct format (commit sha1 upstream) for stable

--
Aneesh Kumar K.V (1):
  powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case

Peter Zijlstra (4):
  asm-generic/tlb: Track freeing of page-table directories in struct
mmu_gather
  asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE
  mm/mmu_gather: invalidate TLB correctly on batch allocation failure
and flush
  asm-generic/tlb: avoid potential double flush

Will Deacon (1):
  asm-generic/tlb: Track which levels of the page tables have been
cleared

 arch/Kconfig |   3 -
 arch/powerpc/Kconfig |   2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h |   8 --
 arch/powerpc/include/asm/book3s/64/pgalloc.h |   2 -
 arch/powerpc/include/asm/tlb.h   |  11 ++
 arch/powerpc/mm/pgtable-book3s64.c   |   7 --
 arch/sparc/include/asm/tlb_64.h  |   9 ++
 arch/x86/Kconfig |   1 -
 include/asm-generic/tlb.h| 103 ---
 mm/memory.c  |  20 ++--
 10 files changed, 122 insertions(+), 44 deletions(-)

-- 
2.24.1



[PATCH v2 1/6] asm-generic/tlb: Track freeing of page-table directories in struct mmu_gather

2020-03-03 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 22a61c3c4f1379ef8b0ce0d5cb78baf3178950e2 upstream

Some architectures require different TLB invalidation instructions
depending on whether it is only the last-level of page table being
changed, or whether there are also changes to the intermediate
(directory) entries higher up the tree.

Add a new bit to the flags bitfield in struct mmu_gather so that the
architecture code can operate accordingly if it's the intermediate
levels being invalidated.
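
As a rough sketch (not part of this patch; flush_user_range() and
flush_walk_cache() are made-up stand-ins for the per-arch primitives), an
architecture's tlb_flush() can then key off the new bit:

static inline void tlb_flush(struct mmu_gather *tlb)
{
	/* Drop the TLB entries covering the unmapped range. */
	flush_user_range(tlb->mm, tlb->start, tlb->end);

	/* Pay for a page-walk-cache flush only when directories were freed. */
	if (tlb->freed_tables)
		flush_walk_cache(tlb->mm);
}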

Signed-off-by: Peter Zijlstra 
Signed-off-by: Will Deacon 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for tlbflush backports]
---
 include/asm-generic/tlb.h | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b3353e21f3b3..97306b32d8d2 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -97,12 +97,22 @@ struct mmu_gather {
 #endif
unsigned long   start;
unsigned long   end;
-   /* we are in the middle of an operation to clear
-* a full mm and can make some optimizations */
-   unsigned intfullmm : 1,
-   /* we have performed an operation which
-* requires a complete flush of the tlb */
-   need_flush_all : 1;
+   /*
+* we are in the middle of an operation to clear
+* a full mm and can make some optimizations
+*/
+   unsigned intfullmm : 1;
+
+   /*
+* we have performed an operation which
+* requires a complete flush of the tlb
+*/
+   unsigned intneed_flush_all : 1;
+
+   /*
+* we have removed page directories
+*/
+   unsigned intfreed_tables : 1;
 
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
@@ -137,6 +147,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->start = TASK_SIZE;
tlb->end = 0;
}
+   tlb->freed_tables = 0;
 }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -278,6 +289,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define pte_free_tlb(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pte_free_tlb(tlb, ptep, address); \
} while (0)
 #endif
@@ -285,7 +297,8 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pmd_free_tlb(tlb, pmdp, address); \
} while (0)
 #endif
@@ -295,6 +308,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define pud_free_tlb(tlb, pudp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pud_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
@@ -304,7 +318,8 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
-- 
2.24.1



[PATCH v2 2/6] asm-generic/tlb: Track which levels of the page tables have been cleared

2020-03-03 Thread Santosh Sivaraj
From: Will Deacon 

commit a6d60245d6d9b1caf66b0d94419988c4836980af upstream

It is common for architectures with hugepage support to require only a
single TLB invalidation operation per hugepage during unmap(), rather than
iterating through the mapping at a PAGE_SIZE increment. Currently,
however, the level in the page table where the unmap() operation occurs
is not stored in the mmu_gather structure, therefore forcing
architectures to issue additional TLB invalidation operations or to give
up and over-invalidate by e.g. invalidating the entire TLB.

Ideally, we could add an interval rbtree to the mmu_gather structure,
which would allow us to associate the correct mapping granule with the
various sub-mappings within the range being invalidated. However, this
is costly in terms of book-keeping and memory management, so instead we
approximate by keeping track of the page table levels that are cleared
and provide a means to query the smallest granule required for invalidation.
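
For example (a sketch only; flush_one_entry() is a hypothetical per-arch
primitive), a range flush can stride by the smallest cleared granule
instead of always stepping PAGE_SIZE:

static inline void tlb_flush(struct mmu_gather *tlb)
{
	unsigned long stride = tlb_get_unmap_size(tlb);
	unsigned long addr;

	/* One invalidate per hugepage instead of one per base page. */
	for (addr = tlb->start; addr < tlb->end; addr += stride)
		flush_one_entry(tlb->mm, addr);
}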

Signed-off-by: Will Deacon 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for upcoming tlbflush backports]
---
 include/asm-generic/tlb.h | 58 +--
 mm/memory.c   |  4 ++-
 2 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 97306b32d8d2..f2b9dc9cbaf8 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -114,6 +114,14 @@ struct mmu_gather {
 */
unsigned intfreed_tables : 1;
 
+   /*
+* at which levels have we cleared entries?
+*/
+   unsigned intcleared_ptes : 1;
+   unsigned intcleared_pmds : 1;
+   unsigned intcleared_puds : 1;
+   unsigned intcleared_p4ds : 1;
+
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
struct page *__pages[MMU_GATHER_BUNDLE];
@@ -148,6 +156,10 @@ static inline void __tlb_reset_range(struct mmu_gather 
*tlb)
tlb->end = 0;
}
tlb->freed_tables = 0;
+   tlb->cleared_ptes = 0;
+   tlb->cleared_pmds = 0;
+   tlb->cleared_puds = 0;
+   tlb->cleared_p4ds = 0;
 }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -197,6 +209,25 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 }
 #endif
 
+static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
+{
+   if (tlb->cleared_ptes)
+   return PAGE_SHIFT;
+   if (tlb->cleared_pmds)
+   return PMD_SHIFT;
+   if (tlb->cleared_puds)
+   return PUD_SHIFT;
+   if (tlb->cleared_p4ds)
+   return P4D_SHIFT;
+
+   return PAGE_SHIFT;
+}
+
+static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
+{
+   return 1UL << tlb_get_unmap_shift(tlb);
+}
+
 /*
  * In the case of tlb vma handling, we can optimise these away in the
  * case where we're doing a full MM flush.  When we're doing a munmap,
@@ -230,13 +261,19 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 #define tlb_remove_tlb_entry(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->cleared_ptes = 1;  \
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
-#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)\
-   do { \
-   __tlb_adjust_range(tlb, address, huge_page_size(h)); \
-   __tlb_remove_tlb_entry(tlb, ptep, address);  \
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)   \
+   do {\
+   unsigned long _sz = huge_page_size(h);  \
+   __tlb_adjust_range(tlb, address, _sz);  \
+   if (_sz == PMD_SIZE)\
+   tlb->cleared_pmds = 1;  \
+   else if (_sz == PUD_SIZE)   \
+   tlb->cleared_puds = 1;  \
+   __tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
 /**
@@ -250,6 +287,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)   \
do {\
__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);   \
+   tlb->cleared_pmds = 1;  \
__tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
} while (0)

[PATCH v2 3/6] asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE

2020-03-03 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 96bc9567cbe112e9320250f01b9c060c882e8619 upstream.

Make issuing a TLB invalidate for page-table pages the normal case.

The reason is twofold:

 - too many invalidates is safer than too few,
 - most architectures use the linux page-tables natively
   and would thus require this.

Make it an opt-out, instead of an opt-in.

No change in behavior intended.

Signed-off-by: Peter Zijlstra (Intel) 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for upcoming tlbflush backports]
---
 arch/Kconfig | 2 +-
 arch/powerpc/Kconfig | 1 +
 arch/sparc/Kconfig   | 1 +
 arch/x86/Kconfig | 1 -
 mm/memory.c  | 2 +-
 5 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a336548487e6..061a12b8140e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -363,7 +363,7 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_INVALIDATE
+config HAVE_RCU_TABLE_NO_INVALIDATE
bool
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f475dc5829b..e09cfb109b8c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -216,6 +216,7 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e6f2a38d2e61..d90d632868aa 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -64,6 +64,7 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
+   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index af35f5caadbe..181d0d522977 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -181,7 +181,6 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE  if PARAVIRT
-   select HAVE_RCU_TABLE_INVALIDATEif HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && 
(UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_STACKPROTECTOR  if CC_HAS_SANE_STACKPROTECTOR
diff --git a/mm/memory.c b/mm/memory.c
index 1832c5ed6ac0..ba5689610c04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -327,7 +327,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct 
page *page, int page_
  */
 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 {
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
/*
 * Invalidate page-table caches used by hardware walkers. Then we still
 * need to RCU-sched wait while freeing the pages because software
-- 
2.24.1



[PATCH v2 4/6] powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case

2020-03-03 Thread Santosh Sivaraj
From: "Aneesh Kumar K.V" 

commit 12e4d53f3f04e81f9e83d6fc10edc7314ab9f6b9 upstream.

Patch series "Fixup page directory freeing", v4.

This is a repost of the patch series from Peter, with the arch-specific
changes except ppc64 dropped.  The ppc64 changes are added here because we
are redoing the patch series on top of the ppc64 changes.  This makes it
easy to backport these changes.  Only the first 2 patches need to be
backported to stable.

The thing is, on anything SMP, freeing page directories should observe the
exact same order as normal page freeing:

 1) unhook page/directory
 2) TLB invalidate
 3) free page/directory
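
In rough pseudo-C, the safe sequence is (a sketch, not the exact kernel
code):

	pud_clear(pudp);                /* 1) unhook the pmd table      */
	flush_tlb_mm(mm);               /* 2) TLBI, incl. walk caches   */
	free_page((unsigned long)pmdp); /* 3) only now is the page free */

Swapping 2) and 3) opens the window in which a concurrent walker, or a
hardware page-walk cache, still dereferences the just-freed page.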

Without this, any concurrent page-table walk could end up with a
Use-after-Free.  This is esp.  trivial for anything that has software
page-table walkers (HAVE_FAST_GUP / software TLB fill) or the hardware
caches partial page-walks (ie.  caches page directories).

Even on UP this might give issues since mmu_gather is preemptible these
days.  An interrupt or preempted task accessing user pages might stumble
into the free page if the hardware caches page directories.

This patch series fixes ppc64 and adds generic MMU_GATHER changes to
support the conversion of other architectures.  I haven't added patches
w.r.t. other architectures because they are yet to be acked.

This patch (of 9):

A followup patch is going to make sure we correctly invalidate page walk
cache before we free page table pages.  In order to keep things simple
enable RCU_TABLE_FREE even for !SMP so that we don't have to fix up the
!SMP case differently in the followup patch.

The !SMP case is right now broken for radix translation w.r.t. page walk
cache flush.  We can get interrupted in between page table free and
that would imply we have page walk cache entries pointing to tables
which got freed already.  Michael said "both our platforms that run on
Power9 force SMP on in Kconfig, so the !SMP case is unlikely to be a
problem for anyone in practice, unless they've hacked their kernel to
build it !SMP."

Link: 
http://lkml.kernel.org/r/20200116064531.483522-2-aneesh.ku...@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported for 4.19 stable]
---
 arch/powerpc/Kconfig | 2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 8 
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 --
 arch/powerpc/mm/pgtable-book3s64.c   | 7 ---
 4 files changed, 1 insertion(+), 18 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e09cfb109b8c..1a00ce4b0040 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -215,7 +215,7 @@ config PPC
select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && 
HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
-   select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_FREE
select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 82e44b1a00ae..79ba3fbb512e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -110,7 +110,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
 #define check_pgt_cache()  do { } while (0)
 #define get_hugepd_cache_index(x)  (x)
 
-#ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
@@ -127,13 +126,6 @@ static inline void __tlb_remove_table(void *_table)
 
pgtable_free(table, shift);
 }
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-   void *table, int shift)
-{
-   pgtable_free(table, shift);
-}
-#endif
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
  unsigned long address)
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h 
b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index f9019b579903..1013c0214213 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -47,9 +47,7 @@ extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned 
long);
 extern void pte_fragment_free(unsigned long *, int);
 extern void pmd_fragment_free(unsigned long *);
 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
-#ifdef CONFIG_SMP
 extern void __tlb_remove_table(void *_table);
-#endif
 
 static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
 {
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 297db665d953..5b4e9fd8990c 100644
-

[PATCH v2 5/6] mm/mmu_gather: invalidate TLB correctly on batch allocation failure and flush

2020-03-03 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 0ed1325967ab5f7a4549a2641c6ebe115f76e228 upstream.

Architectures for which we have hardware walkers of the Linux page table
should flush the TLB on mmu gather batch allocation failures and batch
flush.  Some architectures like POWER support multiple translation modes
(hash and radix), and in the case of POWER only the radix translation mode
needs the above TLBI.  This is because for hash translation mode the kernel
wants to avoid this extra flush, since there are no hardware walkers of the
Linux page table.  With radix translation, the hardware also walks the
Linux page table, and with that, the kernel needs to make sure to TLB
invalidate the page walk cache before page table pages are freed.

More details in commit d86564a2f085 ("mm/tlb, x86/mm: Support invalidating
TLB caches for RCU_TABLE_FREE")

The changes to sparc are to make sure we keep the old behavior since we
are now removing HAVE_RCU_TABLE_NO_INVALIDATE.  The default value for
tlb_needs_table_invalidate is to always force an invalidate and sparc can
avoid the table invalidate.  Hence we define tlb_needs_table_invalidate to
false for the sparc architecture.
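
The generic consumer then becomes, paraphrasing the mm/memory.c hunk of
this patch:

static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
	if (tlb_needs_table_invalidate()) {
		/*
		 * Invalidate page-table caches used by hardware walkers;
		 * the RCU-sched wait for software walkers still happens
		 * when the pages are actually freed.
		 */
		tlb_flush_mmu_tlbonly(tlb);
	}
}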

Link: 
http://lkml.kernel.org/r/20200116064531.483522-3-aneesh.ku...@linux.ibm.com
Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
Signed-off-by: Peter Zijlstra (Intel) 
Cc:   # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported to 4.19 stable]
---
 arch/Kconfig|  3 ---
 arch/powerpc/Kconfig|  1 -
 arch/powerpc/include/asm/tlb.h  | 11 +++
 arch/sparc/Kconfig  |  1 -
 arch/sparc/include/asm/tlb_64.h |  9 +
 include/asm-generic/tlb.h   | 15 +++
 mm/memory.c | 16 
 7 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 061a12b8140e..3abbdb0cea44 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -363,9 +363,6 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_NO_INVALIDATE
-   bool
-
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1a00ce4b0040..e5bc0cfea2b1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -216,7 +216,6 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index f0e571b2dc7c..63418275f402 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -30,6 +30,17 @@
 #define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
 
 extern void tlb_flush(struct mmu_gather *tlb);
+/*
+ * book3s:
+ * Hash does not use the linux page-tables, so we can avoid
+ * the TLB invalidate for page-table freeing, Radix otoh does use the
+ * page-tables and needs the TLBI.
+ *
+ * nohash:
+ * We still do TLB invalidate in the __pte_free_tlb routine before we
+ * add the page table pages to mmu gather table batch.
+ */
+#define tlb_needs_table_invalidate()   radix_enabled()
 
 /* Get the generic bits... */
 #include 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index d90d632868aa..e6f2a38d2e61 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -64,7 +64,6 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index a2f3fa61ee36..8cb8f3833239 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -28,6 +28,15 @@ void flush_tlb_pending(void);
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #define tlb_flush(tlb) flush_tlb_pending()
 
+/*
+ * SPARC64's hardware TLB fill does not use the Linux page-tables
+ * and therefore we don't need a TLBI when freeing page-table pages.
+ */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define tlb_needs_table_invalidate()   (false)
+#endif
+
 #include 
 
 #endif /* _SPARC64_TLB_H */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f2b9dc9cbaf8..19934cdd143e 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -61,8 +61,23 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
+/*
+ * This allows an architecture that does not use the linux page-tables for
+ * hardware to skip the TLBI when freeing page tables.
+ */
+#ifndef tlb_needs_table_invalidate
+#define tlb_needs_table_invalidate() (true)
+#endif

[PATCH v2 6/6] asm-generic/tlb: avoid potential double flush

2020-03-03 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 0758cd8304942292e95a0f750c374533db378b32 upstream.

Aneesh reported that:

tlb_flush_mmu()
  tlb_flush_mmu_tlbonly()
tlb_flush() <-- #1
  tlb_flush_mmu_free()
tlb_table_flush()
  tlb_table_invalidate()
tlb_flush_mmu_tlbonly()
  tlb_flush()   <-- #2

does two TLBIs when tlb->fullmm, because __tlb_reset_range() will not
clear tlb->end in that case.

Observe that any caller to __tlb_adjust_range() also sets at least one of
the tlb->freed_tables || tlb->cleared_p* bits, and those are
unconditionally cleared by __tlb_reset_range().

Change the condition for actually issuing TLBI to having one of those bits
set, as opposed to having tlb->end != 0.
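
With the change, the fullmm case collapses to a single TLBI:

tlb_flush_mmu_tlbonly()     <-- #1: a cleared_*/freed_tables bit is set,
                                tlb_flush() runs, __tlb_reset_range()
                                clears the bits
...
tlb_table_invalidate()
  tlb_flush_mmu_tlbonly()   <-- #2: all bits are now 0, returns early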

Link: 
http://lkml.kernel.org/r/20200116064531.483522-4-aneesh.ku...@linux.ibm.com
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Aneesh Kumar K.V 
Reported-by: "Aneesh Kumar K.V" 
Cc:   # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported to 4.19 stable]
---
 include/asm-generic/tlb.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 19934cdd143e..427a70c56ddd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -179,7 +179,12 @@ static inline void __tlb_reset_range(struct mmu_gather 
*tlb)
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
-   if (!tlb->end)
+   /*
+* Anything calling __tlb_adjust_range() also sets at least one of
+* these bits.
+*/
+   if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
+ tlb->cleared_puds || tlb->cleared_p4ds))
return;
 
tlb_flush(tlb);
-- 
2.24.1



[PATCH v3 0/6] Memory corruption may occur due to incorrect TLB flush

2020-03-12 Thread Santosh Sivaraj
The TLB flush optimisation (a46cc7a90f: powerpc/mm/radix: Improve TLB/PWC
flushes) may result in random memory corruption. Any concurrent page-table walk
could end up with a Use-after-Free. Even on UP this might give issues, since
mmu_gather is preemptible these days. An interrupt or preempted task accessing
user pages might stumble into the free page if the hardware caches page
directories.

The series is a backport of the fix sent by Peter [1].

The first three patches are dependencies for the last patch (avoid potential
double flush). If the performance impact due to double flush is considered
trivial then the first three patches and last patch may be dropped.

This is only for v4.19 stable.

[1] https://patchwork.kernel.org/cover/11284843/

--
Changelog:
v2: Send the patches with the correct format (commit sha1 upstream) for stable
v3: Fix compilation issue on ppc40x_defconfig and ppc44x_defconfig

--
Aneesh Kumar K.V (1):
  powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case

Peter Zijlstra (4):
  asm-generic/tlb: Track freeing of page-table directories in struct
mmu_gather
  asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE
  mm/mmu_gather: invalidate TLB correctly on batch allocation failure
and flush
  asm-generic/tlb: avoid potential double flush

Will Deacon (1):
  asm-generic/tlb: Track which levels of the page tables have been
cleared

 arch/Kconfig |   3 -
 arch/powerpc/Kconfig |   2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h |   8 --
 arch/powerpc/include/asm/book3s/64/pgalloc.h |   2 -
 arch/powerpc/include/asm/nohash/32/pgalloc.h |   8 --
 arch/powerpc/include/asm/nohash/64/pgalloc.h |   9 +-
 arch/powerpc/include/asm/tlb.h   |  11 ++
 arch/powerpc/mm/pgtable-book3s64.c   |   7 --
 arch/sparc/include/asm/tlb_64.h  |   9 ++
 arch/x86/Kconfig |   1 -
 include/asm-generic/tlb.h| 103 ---
 mm/memory.c  |  20 ++--
 12 files changed, 123 insertions(+), 60 deletions(-)

-- 
2.24.1



[PATCH v3 1/6] asm-generic/tlb: Track freeing of page-table directories in struct mmu_gather

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 22a61c3c4f1379ef8b0ce0d5cb78baf3178950e2 upstream

Some architectures require different TLB invalidation instructions
depending on whether it is only the last-level of page table being
changed, or whether there are also changes to the intermediate
(directory) entries higher up the tree.

Add a new bit to the flags bitfield in struct mmu_gather so that the
architecture code can operate accordingly if it's the intermediate
levels being invalidated.

Signed-off-by: Peter Zijlstra 
Signed-off-by: Will Deacon 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for tlbflush backports]
---
 include/asm-generic/tlb.h | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b3353e21f3b3..97306b32d8d2 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -97,12 +97,22 @@ struct mmu_gather {
 #endif
unsigned long   start;
unsigned long   end;
-   /* we are in the middle of an operation to clear
-* a full mm and can make some optimizations */
-   unsigned intfullmm : 1,
-   /* we have performed an operation which
-* requires a complete flush of the tlb */
-   need_flush_all : 1;
+   /*
+* we are in the middle of an operation to clear
+* a full mm and can make some optimizations
+*/
+   unsigned intfullmm : 1;
+
+   /*
+* we have performed an operation which
+* requires a complete flush of the tlb
+*/
+   unsigned intneed_flush_all : 1;
+
+   /*
+* we have removed page directories
+*/
+   unsigned intfreed_tables : 1;
 
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
@@ -137,6 +147,7 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb)
tlb->start = TASK_SIZE;
tlb->end = 0;
}
+   tlb->freed_tables = 0;
 }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -278,6 +289,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define pte_free_tlb(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pte_free_tlb(tlb, ptep, address); \
} while (0)
 #endif
@@ -285,7 +297,8 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pmd_free_tlb(tlb, pmdp, address); \
} while (0)
 #endif
@@ -295,6 +308,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define pud_free_tlb(tlb, pudp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__pud_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
@@ -304,7 +318,8 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->freed_tables = 1;  \
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
-- 
2.24.1



[PATCH v3 2/6] asm-generic/tlb: Track which levels of the page tables have been cleared

2020-03-12 Thread Santosh Sivaraj
From: Will Deacon 

commit a6d60245d6d9b1caf66b0d94419988c4836980af upstream

It is common for architectures with hugepage support to require only a
single TLB invalidation operation per hugepage during unmap(), rather than
iterating through the mapping at a PAGE_SIZE increment. Currently,
however, the level in the page table where the unmap() operation occurs
is not stored in the mmu_gather structure, therefore forcing
architectures to issue additional TLB invalidation operations or to give
up and over-invalidate by e.g. invalidating the entire TLB.

Ideally, we could add an interval rbtree to the mmu_gather structure,
which would allow us to associate the correct mapping granule with the
various sub-mappings within the range being invalidated. However, this
is costly in terms of book-keeping and memory management, so instead we
approximate by keeping track of the page table levels that are cleared
and provide a means to query the smallest granule required for invalidation.
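
Concretely, after unmapping a range of 2M hugepages only cleared_pmds ends
up set, so (values assume a typical 4K base-page configuration):

	tlb_get_unmap_shift(tlb);	/* == PMD_SHIFT, i.e. 21 */
	tlb_get_unmap_size(tlb);	/* == PMD_SIZE,  i.e. 2M */

and the architecture can issue one invalidation per 2M step.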

Signed-off-by: Will Deacon 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for upcoming tlbflush backports]
---
 include/asm-generic/tlb.h | 58 +--
 mm/memory.c   |  4 ++-
 2 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 97306b32d8d2..f2b9dc9cbaf8 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -114,6 +114,14 @@ struct mmu_gather {
 */
unsigned intfreed_tables : 1;
 
+   /*
+* at which levels have we cleared entries?
+*/
+   unsigned intcleared_ptes : 1;
+   unsigned intcleared_pmds : 1;
+   unsigned intcleared_puds : 1;
+   unsigned intcleared_p4ds : 1;
+
struct mmu_gather_batch *active;
struct mmu_gather_batch local;
struct page *__pages[MMU_GATHER_BUNDLE];
@@ -148,6 +156,10 @@ static inline void __tlb_reset_range(struct mmu_gather 
*tlb)
tlb->end = 0;
}
tlb->freed_tables = 0;
+   tlb->cleared_ptes = 0;
+   tlb->cleared_pmds = 0;
+   tlb->cleared_puds = 0;
+   tlb->cleared_p4ds = 0;
 }
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
@@ -197,6 +209,25 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 }
 #endif
 
+static inline unsigned long tlb_get_unmap_shift(struct mmu_gather *tlb)
+{
+   if (tlb->cleared_ptes)
+   return PAGE_SHIFT;
+   if (tlb->cleared_pmds)
+   return PMD_SHIFT;
+   if (tlb->cleared_puds)
+   return PUD_SHIFT;
+   if (tlb->cleared_p4ds)
+   return P4D_SHIFT;
+
+   return PAGE_SHIFT;
+}
+
+static inline unsigned long tlb_get_unmap_size(struct mmu_gather *tlb)
+{
+   return 1UL << tlb_get_unmap_shift(tlb);
+}
+
 /*
  * In the case of tlb vma handling, we can optimise these away in the
  * case where we're doing a full MM flush.  When we're doing a munmap,
@@ -230,13 +261,19 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 #define tlb_remove_tlb_entry(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   tlb->cleared_ptes = 1;  \
__tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
-#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)\
-   do { \
-   __tlb_adjust_range(tlb, address, huge_page_size(h)); \
-   __tlb_remove_tlb_entry(tlb, ptep, address);  \
+#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)   \
+   do {\
+   unsigned long _sz = huge_page_size(h);  \
+   __tlb_adjust_range(tlb, address, _sz);  \
+   if (_sz == PMD_SIZE)\
+   tlb->cleared_pmds = 1;  \
+   else if (_sz == PUD_SIZE)   \
+   tlb->cleared_puds = 1;  \
+   __tlb_remove_tlb_entry(tlb, ptep, address); \
} while (0)
 
 /**
@@ -250,6 +287,7 @@ static inline void tlb_remove_check_page_size_change(struct 
mmu_gather *tlb,
 #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)   \
do {\
__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);   \
+   tlb->cleared_pmds = 1;  \
__tlb_remove_pmd_tlb_entry(tlb, pmdp, address); \
} while (0)

[PATCH v3 3/6] asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 96bc9567cbe112e9320250f01b9c060c882e8619 upstream.

Make issuing a TLB invalidate for page-table pages the normal case.

The reason is twofold:

 - too many invalidates is safer than too few,
 - most architectures use the linux page-tables natively
   and would thus require this.

Make it an opt-out, instead of an opt-in.

No change in behavior intended.

Signed-off-by: Peter Zijlstra (Intel) 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: prerequisite for upcoming tlbflush backports]
---
 arch/Kconfig | 2 +-
 arch/powerpc/Kconfig | 1 +
 arch/sparc/Kconfig   | 1 +
 arch/x86/Kconfig | 1 -
 mm/memory.c  | 2 +-
 5 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index a336548487e6..061a12b8140e 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -363,7 +363,7 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_INVALIDATE
+config HAVE_RCU_TABLE_NO_INVALIDATE
bool
 
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f475dc5829b..e09cfb109b8c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -216,6 +216,7 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index e6f2a38d2e61..d90d632868aa 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -64,6 +64,7 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
+   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index af35f5caadbe..181d0d522977 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -181,7 +181,6 @@ config X86
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE  if PARAVIRT
-   select HAVE_RCU_TABLE_INVALIDATEif HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && 
(UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_STACKPROTECTOR  if CC_HAS_SANE_STACKPROTECTOR
diff --git a/mm/memory.c b/mm/memory.c
index 1832c5ed6ac0..ba5689610c04 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -327,7 +327,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct 
page *page, int page_
  */
 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 {
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
+#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
/*
 * Invalidate page-table caches used by hardware walkers. Then we still
 * need to RCU-sched wait while freeing the pages because software
-- 
2.24.1



[PATCH v3 4/6] powerpc/mmu_gather: enable RCU_TABLE_FREE even for !SMP case

2020-03-12 Thread Santosh Sivaraj
From: "Aneesh Kumar K.V" 

commit 12e4d53f3f04e81f9e83d6fc10edc7314ab9f6b9 upstream.

Patch series "Fixup page directory freeing", v4.

This is a repost of the patch series from Peter, with the arch-specific
changes except ppc64 dropped.  The ppc64 changes are added here because we
are redoing the patch series on top of the ppc64 changes.  This makes it
easy to backport these changes.  Only the first 2 patches need to be
backported to stable.

The thing is, on anything SMP, freeing page directories should observe the
exact same order as normal page freeing:

 1) unhook page/directory
 2) TLB invalidate
 3) free page/directory

Without this, any concurrent page-table walk could end up with a
Use-after-Free.  This is esp.  trivial for anything that has software
page-table walkers (HAVE_FAST_GUP / software TLB fill) or the hardware
caches partial page-walks (ie.  caches page directories).

Even on UP this might give issues since mmu_gather is preemptible these
days.  An interrupt or preempted task accessing user pages might stumble
into the free page if the hardware caches page directories.
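
Schematically, the UP window looks like:

	task A: pud_clear(pudp);
	task A: free_page(pmd_page);	/* no TLBI, no grace period on !SMP */
		<interrupt or preemption>
	task B: touches user memory; the hardware page-walk cache still
		points at the freed pmd page -> use-after-free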

This patch series fixes ppc64 and adds generic MMU_GATHER changes to
support the conversion of other architectures.  I haven't added patches
w.r.t. other architectures because they are yet to be acked.

This patch (of 9):

A followup patch is going to make sure we correctly invalidate page walk
cache before we free page table pages.  In order to keep things simple
enable RCU_TABLE_FREE even for !SMP so that we don't have to fix up the
!SMP case differently in the followup patch.

The !SMP case is right now broken for radix translation w.r.t. page walk
cache flush.  We can get interrupted in between page table free and
that would imply we have page walk cache entries pointing to tables
which got freed already.  Michael said "both our platforms that run on
Power9 force SMP on in Kconfig, so the !SMP case is unlikely to be a
problem for anyone in practice, unless they've hacked their kernel to
build it !SMP."

Link: 
http://lkml.kernel.org/r/20200116064531.483522-2-aneesh.ku...@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V 
Cc:  # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported for 4.19 stable]
---
 arch/powerpc/Kconfig | 2 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 8 
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 --
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 8 
 arch/powerpc/include/asm/nohash/64/pgalloc.h | 9 +
 arch/powerpc/mm/pgtable-book3s64.c   | 7 ---
 6 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index e09cfb109b8c..1a00ce4b0040 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -215,7 +215,7 @@ config PPC
select HAVE_HARDLOCKUP_DETECTOR_PERFif PERF_EVENTS && 
HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
-   select HAVE_RCU_TABLE_FREE  if SMP
+   select HAVE_RCU_TABLE_FREE
select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 82e44b1a00ae..79ba3fbb512e 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -110,7 +110,6 @@ static inline void pgtable_free(void *table, unsigned 
index_size)
 #define check_pgt_cache()  do { } while (0)
 #define get_hugepd_cache_index(x)  (x)
 
-#ifdef CONFIG_SMP
 static inline void pgtable_free_tlb(struct mmu_gather *tlb,
void *table, int shift)
 {
@@ -127,13 +126,6 @@ static inline void __tlb_remove_table(void *_table)
 
pgtable_free(table, shift);
 }
-#else
-static inline void pgtable_free_tlb(struct mmu_gather *tlb,
-   void *table, int shift)
-{
-   pgtable_free(table, shift);
-}
-#endif
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
  unsigned long address)
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h 
b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index f9019b579903..1013c0214213 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -47,9 +47,7 @@ extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned 
long);
 extern void pte_fragment_free(unsigned long *, int);
 extern void pmd_fragment_free(unsigned long *);
 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
-#ifdef CONFIG_SMP
 extern void __tlb_remove_table(void *_table);
-#endif
 
 static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
 {
diff --git a

[PATCH v3 5/6] mm/mmu_gather: invalidate TLB correctly on batch allocation failure and flush

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 0ed1325967ab5f7a4549a2641c6ebe115f76e228 upstream.

Architectures for which we have hardware walkers of the Linux page table
should flush the TLB on mmu gather batch allocation failures and batch
flush.  Some architectures like POWER support multiple translation modes
(hash and radix), and in the case of POWER only the radix translation mode
needs the above TLBI.  This is because for hash translation mode the kernel
wants to avoid this extra flush, since there are no hardware walkers of the
Linux page table.  With radix translation, the hardware also walks the
Linux page table, and with that, the kernel needs to make sure to TLB
invalidate the page walk cache before page table pages are freed.

More details in commit d86564a2f085 ("mm/tlb, x86/mm: Support invalidating
TLB caches for RCU_TABLE_FREE")

The changes to sparc are to make sure we keep the old behavior since we
are now removing HAVE_RCU_TABLE_NO_INVALIDATE.  The default value for
tlb_needs_table_invalidate is to always force an invalidate and sparc can
avoid the table invalidate.  Hence we define tlb_needs_table_invalidate to
false for the sparc architecture.
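
For context, the batch-allocation-failure path this series fixes looks
roughly like the following (paraphrased from mm/memory.c after this
patch; details elided):

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			/* No batch page: invalidate now, free synchronously. */
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}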

Link: 
http://lkml.kernel.org/r/20200116064531.483522-3-aneesh.ku...@linux.ibm.com
Fixes: a46cc7a90fd8 ("powerpc/mm/radix: Improve TLB/PWC flushes")
Signed-off-by: Peter Zijlstra (Intel) 
Cc:   # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported to 4.19 stable]
---
 arch/Kconfig|  3 ---
 arch/powerpc/Kconfig|  1 -
 arch/powerpc/include/asm/tlb.h  | 11 +++
 arch/sparc/Kconfig  |  1 -
 arch/sparc/include/asm/tlb_64.h |  9 +
 include/asm-generic/tlb.h   | 15 +++
 mm/memory.c | 16 
 7 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 061a12b8140e..3abbdb0cea44 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -363,9 +363,6 @@ config HAVE_ARCH_JUMP_LABEL
 config HAVE_RCU_TABLE_FREE
bool
 
-config HAVE_RCU_TABLE_NO_INVALIDATE
-   bool
-
 config ARCH_HAVE_NMI_SAFE_CMPXCHG
bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 1a00ce4b0040..e5bc0cfea2b1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -216,7 +216,6 @@ config PPC
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
select HAVE_RCU_TABLE_FREE
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if PPC64 && CPU_LITTLE_ENDIAN
select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index f0e571b2dc7c..63418275f402 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -30,6 +30,17 @@
 #define tlb_remove_check_page_size_change tlb_remove_check_page_size_change
 
 extern void tlb_flush(struct mmu_gather *tlb);
+/*
+ * book3s:
+ * Hash does not use the linux page-tables, so we can avoid
+ * the TLB invalidate for page-table freeing, Radix otoh does use the
+ * page-tables and needs the TLBI.
+ *
+ * nohash:
+ * We still do TLB invalidate in the __pte_free_tlb routine before we
+ * add the page table pages to mmu gather table batch.
+ */
+#define tlb_needs_table_invalidate()   radix_enabled()
 
 /* Get the generic bits... */
 #include 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index d90d632868aa..e6f2a38d2e61 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -64,7 +64,6 @@ config SPARC64
select HAVE_KRETPROBES
select HAVE_KPROBES
select HAVE_RCU_TABLE_FREE if SMP
-   select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h
index a2f3fa61ee36..8cb8f3833239 100644
--- a/arch/sparc/include/asm/tlb_64.h
+++ b/arch/sparc/include/asm/tlb_64.h
@@ -28,6 +28,15 @@ void flush_tlb_pending(void);
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #define tlb_flush(tlb) flush_tlb_pending()
 
+/*
+ * SPARC64's hardware TLB fill does not use the Linux page-tables
+ * and therefore we don't need a TLBI when freeing page-table pages.
+ */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define tlb_needs_table_invalidate()   (false)
+#endif
+
 #include 
 
 #endif /* _SPARC64_TLB_H */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f2b9dc9cbaf8..19934cdd143e 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -61,8 +61,23 @@ struct mmu_table_batch {
 extern void tlb_table_flush(struct mmu_gather *tlb);
 extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 
+/*
+ * This allows an architecture that does not use the linux page-tables for
+ * hardware to skip the TLBI when freeing page tables.
+ */
+#ifndef tlb_needs_table_invalidate
+#define tlb_needs_table_invalidate() (true)
+#endif

[PATCH v3 6/6] asm-generic/tlb: avoid potential double flush

2020-03-12 Thread Santosh Sivaraj
From: Peter Zijlstra 

commit 0758cd8304942292e95a0f750c374533db378b32 upstream.

Aneesh reported that:

tlb_flush_mmu()
  tlb_flush_mmu_tlbonly()
tlb_flush() <-- #1
  tlb_flush_mmu_free()
tlb_table_flush()
  tlb_table_invalidate()
tlb_flush_mmu_tlbonly()
  tlb_flush()   <-- #2

does two TLBIs when tlb->fullmm, because __tlb_reset_range() will not
clear tlb->end in that case.

Observe that any caller to __tlb_adjust_range() also sets at least one of
the tlb->freed_tables || tlb->cleared_p* bits, and those are
unconditionally cleared by __tlb_reset_range().

Change the condition for actually issuing TLBI to having one of those bits
set, as opposed to having tlb->end != 0.
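
For reference, after this series __tlb_reset_range() behaves roughly as
follows, which is why tlb->end stays non-zero for fullmm while the bits
are always cleared:

static inline void __tlb_reset_range(struct mmu_gather *tlb)
{
	if (tlb->fullmm) {
		tlb->start = tlb->end = ~0;	/* end stays non-zero */
	} else {
		tlb->start = TASK_SIZE;
		tlb->end = 0;
	}
	tlb->freed_tables = 0;
	tlb->cleared_ptes = 0;
	tlb->cleared_pmds = 0;
	tlb->cleared_puds = 0;
	tlb->cleared_p4ds = 0;
}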

Link: 
http://lkml.kernel.org/r/20200116064531.483522-4-aneesh.ku...@linux.ibm.com
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Aneesh Kumar K.V 
Reported-by: "Aneesh Kumar K.V" 
Cc:   # 4.19
Signed-off-by: Santosh Sivaraj 
[santosh: backported to 4.19 stable]
---
 include/asm-generic/tlb.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 19934cdd143e..427a70c56ddd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -179,7 +179,12 @@ static inline void __tlb_reset_range(struct mmu_gather 
*tlb)
 
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
-   if (!tlb->end)
+   /*
+* Anything calling __tlb_adjust_range() also sets at least one of
+* these bits.
+*/
+   if (!(tlb->freed_tables || tlb->cleared_ptes || tlb->cleared_pmds ||
+ tlb->cleared_puds || tlb->cleared_p4ds))
return;
 
tlb_flush(tlb);
-- 
2.24.1



[PATCH] papr/scm: Add bad memory ranges to nvdimm bad ranges

2020-04-01 Thread Santosh Sivaraj
Subscribe to the MCE notification and add the physical address which
generated a memory error to the nvdimm bad range.

Signed-off-by: Santosh Sivaraj 
---

This patch depends on "powerpc/mce: Add MCE notification chain" [1].

Unlike the previous series[2], the patch adds badblock registration only for
the pseries scm driver. Handling badblocks for baremetal (powernv) PMEM will
be done later, and if possible the badblock handling will be made common code.

[1] 
https://lore.kernel.org/linuxppc-dev/20200330071219.12284-1-ganes...@linux.ibm.com/
[2] 
https://lore.kernel.org/linuxppc-dev/20190820023030.18232-1-sant...@fossix.org/

arch/powerpc/platforms/pseries/papr_scm.c | 96 ++-
 1 file changed, 95 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index 0b4467e378e5..5012cbf4606e 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -39,8 +41,12 @@ struct papr_scm_priv {
struct resource res;
struct nd_region *region;
struct nd_interleave_set nd_set;
+   struct list_head region_list;
 };
 
+LIST_HEAD(papr_nd_regions);
+DEFINE_MUTEX(papr_ndr_lock);
+
 static int drc_pmem_bind(struct papr_scm_priv *p)
 {
unsigned long ret[PLPAR_HCALL_BUFSIZE];
@@ -372,6 +378,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
dev_info(dev, "Region registered with target node %d and online 
node %d",
 target_nid, online_nid);
 
+   mutex_lock(&papr_ndr_lock);
+   list_add_tail(&p->region_list, &papr_nd_regions);
+   mutex_unlock(&papr_ndr_lock);
+
return 0;
 
 err:   nvdimm_bus_unregister(p->bus);
@@ -379,6 +389,68 @@ err:   nvdimm_bus_unregister(p->bus);
return -ENXIO;
 }
 
+void papr_scm_add_badblock(struct nd_region *region, struct nvdimm_bus *bus,
+  u64 phys_addr)
+{
+   u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
+
+   if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) {
+   pr_err("Bad block registration for 0x%llx failed\n", phys_addr);
+   return;
+   }
+
+   pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n",
+aligned_addr, aligned_addr + L1_CACHE_BYTES);
+
+   nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON);
+}
+
+static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
+void *data)
+{
+   struct machine_check_event *evt = data;
+   struct papr_scm_priv *p;
+   u64 phys_addr;
+   bool found = false;
+
+   if (evt->error_type != MCE_ERROR_TYPE_UE)
+   return NOTIFY_DONE;
+
+   if (list_empty(&papr_nd_regions))
+   return NOTIFY_DONE;
+
+   phys_addr = evt->u.ue_error.physical_address +
+   (evt->u.ue_error.effective_address & ~PAGE_MASK);
+
+   if (!evt->u.ue_error.physical_address_provided ||
+   !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
+   return NOTIFY_DONE;
+
+   /* mce notifier is called from a process context, so mutex is safe */
+   mutex_lock(&papr_ndr_lock);
+   list_for_each_entry(p, &papr_nd_regions, region_list) {
+   struct resource res = p->res;
+
+   if (phys_addr >= res.start && phys_addr <= res.end) {
+   found = true;
+   break;
+   }
+   }
+
+   mutex_unlock(&papr_ndr_lock);
+
+   if (!found)
+   return NOTIFY_DONE;
+
+   papr_scm_add_badblock(p->region, p->bus, phys_addr);
+
+   return NOTIFY_OK;
+}
+
+static struct notifier_block mce_ue_nb = {
+   .notifier_call = handle_mce_ue
+};
+
 static int papr_scm_probe(struct platform_device *pdev)
 {
struct device_node *dn = pdev->dev.of_node;
@@ -476,6 +548,10 @@ static int papr_scm_remove(struct platform_device *pdev)
 {
struct papr_scm_priv *p = platform_get_drvdata(pdev);
 
+   mutex_lock(&papr_ndr_lock);
+   list_del(&(p->region_list));
+   mutex_unlock(&papr_ndr_lock);
+
nvdimm_bus_unregister(p->bus);
drc_pmem_unbind(p);
kfree(p->bus_desc.provider_name);
@@ -498,7 +574,25 @@ static struct platform_driver papr_scm_driver = {
},
 };
 
-module_platform_driver(papr_scm_driver);
+static int __init papr_scm_init(void)
+{
+   int ret;
+
+   ret = platform_driver_register(&papr_scm_driver);
+   if (!ret)
+   mce_register_notifier(&mce_ue_nb);
+
+   return ret;
+}
+module_init(papr_scm_init);
+
+static void __exit papr_scm_exit(void)
+{
+   mce_unregister_notifier(&mce_ue_nb);
+   platform_driver_unregister(&papr_scm_driver);
+}
+module_exit(papr_scm_exit);

Re: [PATCH] papr/scm: Add bad memory ranges to nvdimm bad ranges

2020-04-09 Thread Santosh Sivaraj
On Wed, Apr 1, 2020 at 1:18 PM Santosh Sivaraj  wrote:

> Subscribe to the MCE notification and add the physical address which
> generated a memory error to nvdimm bad range.
>
> Signed-off-by: Santosh Sivaraj 
> ---
>

Any comments on this?

Thanks,
Santosh


> This patch depends on "powerpc/mce: Add MCE notification chain" [1].
>
> Unlike the previous series[2], the patch adds badblock registration only
> for
> pseries scm driver. Handling badblocks for baremetal (powernv) PMEM will
> be done
> later and if possible get the badblock handling as a common code.
>
> [1]
> https://lore.kernel.org/linuxppc-dev/20200330071219.12284-1-ganes...@linux.ibm.com/
> [2]
> https://lore.kernel.org/linuxppc-dev/20190820023030.18232-1-sant...@fossix.org/
>
> arch/powerpc/platforms/pseries/papr_scm.c | 96 ++-
>  1 file changed, 95 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 0b4467e378e5..5012cbf4606e 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -12,6 +12,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>
>  #include 
>
> @@ -39,8 +41,12 @@ struct papr_scm_priv {
> struct resource res;
> struct nd_region *region;
> struct nd_interleave_set nd_set;
> +   struct list_head region_list;
>  };
>
> +LIST_HEAD(papr_nd_regions);
> +DEFINE_MUTEX(papr_ndr_lock);
> +
>  static int drc_pmem_bind(struct papr_scm_priv *p)
>  {
> unsigned long ret[PLPAR_HCALL_BUFSIZE];
> @@ -372,6 +378,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv
> *p)
> dev_info(dev, "Region registered with target node %d and
> online node %d",
>  target_nid, online_nid);
>
> +   mutex_lock(&papr_ndr_lock);
> +   list_add_tail(&p->region_list, &papr_nd_regions);
> +   mutex_unlock(&papr_ndr_lock);
> +
> return 0;
>
>  err:   nvdimm_bus_unregister(p->bus);
> @@ -379,6 +389,68 @@ err:   nvdimm_bus_unregister(p->bus);
> return -ENXIO;
>  }
>
> +void papr_scm_add_badblock(struct nd_region *region, struct nvdimm_bus
> *bus,
> +  u64 phys_addr)
> +{
> +   u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
> +
> +   if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) {
> +   pr_err("Bad block registration for 0x%llx failed\n",
> phys_addr);
> +   return;
> +   }
> +
> +   pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n",
> +aligned_addr, aligned_addr + L1_CACHE_BYTES);
> +
> +   nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON);
> +}
> +
> +static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
> +void *data)
> +{
> +   struct machine_check_event *evt = data;
> +   struct papr_scm_priv *p;
> +   u64 phys_addr;
> +   bool found = false;
> +
> +   if (evt->error_type != MCE_ERROR_TYPE_UE)
> +   return NOTIFY_DONE;
> +
> +   if (list_empty(&papr_nd_regions))
> +   return NOTIFY_DONE;
> +
> +   phys_addr = evt->u.ue_error.physical_address +
> +   (evt->u.ue_error.effective_address & ~PAGE_MASK);
> +
> +   if (!evt->u.ue_error.physical_address_provided ||
> +   !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
> +   return NOTIFY_DONE;
> +
> +   /* mce notifier is called from a process context, so mutex is safe
> */
> +   mutex_lock(&papr_ndr_lock);
> +   list_for_each_entry(p, &papr_nd_regions, region_list) {
> +   struct resource res = p->res;
> +
> +   if (phys_addr >= res.start && phys_addr <= res.end) {
> +   found = true;
> +   break;
> +   }
> +   }
> +
> +   mutex_unlock(&papr_ndr_lock);
> +
> +   if (!found)
> +   return NOTIFY_DONE;
> +
> +   papr_scm_add_badblock(p->region, p->bus, phys_addr);
> +
> +   return NOTIFY_OK;
> +}
> +
> +static struct notifier_block mce_ue_nb = {
> +   .notifier_call = handle_mce_ue
> +};
> +
>  static int papr_scm_probe(struct platform_device *pdev)
>  {
> struct device_node *dn = pdev->dev.of_node;
> @@ -476,6 +548,10 @@ static int papr_scm_remove(struct platform_device
> *pdev)
>  {
> 

Re: [PATCH] papr/scm: Add bad memory ranges to nvdimm bad ranges

2020-04-13 Thread Santosh Sivaraj
kbuild test robot  writes:

> Hi Santosh,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on powerpc/next]
> [also build test ERROR on v5.7-rc1 next-20200412]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see
> https://stackoverflow.com/a/37406982]

This patch depends on "powerpc/mce: Add MCE notification chain" [1].

[1]: 
https://lore.kernel.org/linuxppc-dev/20200330071219.12284-1-ganes...@linux.ibm.com/

Thanks,
Santosh

>
> url:
> https://github.com/0day-ci/linux/commits/Santosh-Sivaraj/papr-scm-Add-bad-memory-ranges-to-nvdimm-bad-ranges/20200401-171233
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
> config: powerpc-allyesconfig (attached as .config)
> compiler: powerpc64-linux-gcc (GCC) 9.3.0
> reproduce:
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> GCC_VERSION=9.3.0 make.cross ARCH=powerpc 
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kbuild test robot 
>
> All errors (new ones prefixed by >>):
>
>arch/powerpc/platforms/pseries/papr_scm.c: In function 'papr_scm_init':
>>> arch/powerpc/platforms/pseries/papr_scm.c:584:3: error: implicit 
>>> declaration of function 'mce_register_notifier'; did you mean 
>>> 'bus_register_notifier'? [-Werror=implicit-function-declaration]
>  584 |   mce_register_notifier(&mce_ue_nb);
>  |   ^
>  |   bus_register_notifier
>arch/powerpc/platforms/pseries/papr_scm.c: In function 'papr_scm_exit':
>>> arch/powerpc/platforms/pseries/papr_scm.c:592:2: error: implicit 
>>> declaration of function 'mce_unregister_notifier'; did you mean 
>>> 'bus_unregister_notifier'? [-Werror=implicit-function-declaration]
>  592 |  mce_unregister_notifier(&mce_ue_nb);
>  |  ^~~
>  |  bus_unregister_notifier
>cc1: some warnings being treated as errors
>
> vim +584 arch/powerpc/platforms/pseries/papr_scm.c
>
>577
>578static int __init papr_scm_init(void)
>579{
>580int ret;
>581
>582ret = platform_driver_register(&papr_scm_driver);
>583if (!ret)
>  > 584mce_register_notifier(&mce_ue_nb);
>585
>586return ret;
>587}
>588module_init(papr_scm_init);
>589
>590static void __exit papr_scm_exit(void)
>591{
>  > 592mce_unregister_notifier(&mce_ue_nb);
>593platform_driver_unregister(&papr_scm_driver);
>594}
>595module_exit(papr_scm_exit);
>596
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH] papr/scm: Add bad memory ranges to nvdimm bad ranges

2020-04-13 Thread Santosh Sivaraj
Mahesh J Salgaonkar  writes:

> On 2020-04-01 13:17:41 Wed, Santosh Sivaraj wrote:
>> Subscribe to the MCE notification and add the physical address which
>> generated a memory error to nvdimm bad range.
>> 
>> Signed-off-by: Santosh Sivaraj 
>> ---
>> 
>> This patch depends on "powerpc/mce: Add MCE notification chain" [1].
>> 
>> Unlike the previous series[2], the patch adds badblock registration only for
>> pseries scm driver. Handling badblocks for baremetal (powernv) PMEM will be 
>> done
>> later and if possible get the badblock handling as a common code.
>> 
>> [1] 
>> https://lore.kernel.org/linuxppc-dev/20200330071219.12284-1-ganes...@linux.ibm.com/
>> [2] 
>> https://lore.kernel.org/linuxppc-dev/20190820023030.18232-1-sant...@fossix.org/
>> 
>> arch/powerpc/platforms/pseries/papr_scm.c | 96 ++-
>>  1 file changed, 95 insertions(+), 1 deletion(-)
>> 
>> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
>> b/arch/powerpc/platforms/pseries/papr_scm.c
>> index 0b4467e378e5..5012cbf4606e 100644
>> --- a/arch/powerpc/platforms/pseries/papr_scm.c
>> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> [...]
>> +static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
>> + void *data)
>> +{
>> +struct machine_check_event *evt = data;
>> +struct papr_scm_priv *p;
>> +u64 phys_addr;
>> +bool found = false;
>> +
>> +if (evt->error_type != MCE_ERROR_TYPE_UE)
>> +return NOTIFY_DONE;
>> +
>> +if (list_empty(&papr_nd_regions))
>> +return NOTIFY_DONE;
>
> Do you really need this check ?

Quite harmless I guess; at least it saves a branch and a mutex_lock()/mutex_unlock() when the list is empty.

>
>> +
>> +phys_addr = evt->u.ue_error.physical_address +
>> +(evt->u.ue_error.effective_address & ~PAGE_MASK);
>> +
>> +if (!evt->u.ue_error.physical_address_provided ||
>> +!is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
>> +return NOTIFY_DONE;
>> +
>> +/* mce notifier is called from a process context, so mutex is safe */
>> +mutex_lock(&papr_ndr_lock);
>> +list_for_each_entry(p, &papr_nd_regions, region_list) {
>> +struct resource res = p->res;
>> +
>> +if (phys_addr >= res.start && phys_addr <= res.end) {
>> +found = true;
>> +break;
>> +}
>> +}
>> +
>> +mutex_unlock(&papr_ndr_lock);
>> +
>> +if (!found)
>> +return NOTIFY_DONE;
>> +
>> +papr_scm_add_badblock(p->region, p->bus, phys_addr);
>> +
>> +return NOTIFY_OK;
>> +}
>> +
>> +static struct notifier_block mce_ue_nb = {
>> +.notifier_call = handle_mce_ue
>> +};
>> +
> [...]
>> -module_platform_driver(papr_scm_driver);
>> +static int __init papr_scm_init(void)
>> +{
>> +int ret;
>> +
>> +ret = platform_driver_register(&papr_scm_driver);
>> +if (!ret)
>> +mce_register_notifier(&mce_ue_nb);
>> +
>> +return ret;
>> +}
>> +module_init(papr_scm_init);
>> +
>> +static void __exit papr_scm_exit(void)
>> +{
>> +mce_unregister_notifier(&mce_ue_nb);
>> +platform_driver_unregister(&papr_scm_driver);
>> +}
>> +module_exit(papr_scm_exit);
>
> Rest Looks good to me.
>
> Reviewed-by: Mahesh Salgaonkar 

Thanks for the review.

Santosh
>
> Thanks,
> -Mahesh.
>
>> +
>>  MODULE_DEVICE_TABLE(of, papr_scm_match);
>>  MODULE_LICENSE("GPL");
>>  MODULE_AUTHOR("IBM Corporation");
>> -- 
>> 2.25.1
>> 
>
> -- 
> Mahesh J Salgaonkar


[PATCH v2] papr/scm: Add bad memory ranges to nvdimm bad ranges

2020-04-16 Thread Santosh Sivaraj
Subscribe to the MCE notification chain and add the physical address which
generated a memory error to the nvdimm bad range.

Reviewed-by: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 98 ++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index f35592423380..e23fd1399d5b 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -12,6 +12,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 
@@ -39,8 +41,12 @@ struct papr_scm_priv {
struct resource res;
struct nd_region *region;
struct nd_interleave_set nd_set;
+   struct list_head region_list;
 };
 
+LIST_HEAD(papr_nd_regions);
+DEFINE_MUTEX(papr_ndr_lock);
+
 static int drc_pmem_bind(struct papr_scm_priv *p)
 {
unsigned long ret[PLPAR_HCALL_BUFSIZE];
@@ -356,6 +362,10 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
dev_info(dev, "Region registered with target node %d and online 
node %d",
 target_nid, online_nid);
 
+   mutex_lock(&papr_ndr_lock);
+   list_add_tail(&p->region_list, &papr_nd_regions);
+   mutex_unlock(&papr_ndr_lock);
+
return 0;
 
 err:   nvdimm_bus_unregister(p->bus);
@@ -363,6 +373,70 @@ err:   nvdimm_bus_unregister(p->bus);
return -ENXIO;
 }
 
+void papr_scm_add_badblock(struct nd_region *region, struct nvdimm_bus *bus,
+  u64 phys_addr)
+{
+   u64 aligned_addr = ALIGN_DOWN(phys_addr, L1_CACHE_BYTES);
+
+   if (nvdimm_bus_add_badrange(bus, aligned_addr, L1_CACHE_BYTES)) {
+   pr_err("Bad block registration for 0x%llx failed\n", phys_addr);
+   return;
+   }
+
+   pr_debug("Add memory range (0x%llx - 0x%llx) as bad range\n",
+aligned_addr, aligned_addr + L1_CACHE_BYTES);
+
+   nvdimm_region_notify(region, NVDIMM_REVALIDATE_POISON);
+}
+
+static int handle_mce_ue(struct notifier_block *nb, unsigned long val,
+void *data)
+{
+   struct machine_check_event *evt = data;
+   struct papr_scm_priv *p;
+   u64 phys_addr;
+   bool found = false;
+
+   if (evt->error_type != MCE_ERROR_TYPE_UE)
+   return NOTIFY_DONE;
+
+   if (list_empty(&papr_nd_regions))
+   return NOTIFY_DONE;
+
+   /*
+* The physical address obtained here is PAGE_SIZE aligned, so get the
+* exact address from the effective address
+*/
+   phys_addr = evt->u.ue_error.physical_address +
+   (evt->u.ue_error.effective_address & ~PAGE_MASK);
+
+   if (!evt->u.ue_error.physical_address_provided ||
+   !is_zone_device_page(pfn_to_page(phys_addr >> PAGE_SHIFT)))
+   return NOTIFY_DONE;
+
+   /* mce notifier is called from a process context, so mutex is safe */
+   mutex_lock(&papr_ndr_lock);
+   list_for_each_entry(p, &papr_nd_regions, region_list) {
+   struct resource res = p->res;
+
+   if (phys_addr >= res.start && phys_addr <= res.end) {
+   found = true;
+   break;
+   }
+   }
+
+   if (found)
+   papr_scm_add_badblock(p->region, p->bus, phys_addr);
+
+   mutex_unlock(&papr_ndr_lock);
+
+   return found ? NOTIFY_OK : NOTIFY_DONE;
+}
+
+static struct notifier_block mce_ue_nb = {
+   .notifier_call = handle_mce_ue
+};
+
 static int papr_scm_probe(struct platform_device *pdev)
 {
struct device_node *dn = pdev->dev.of_node;
@@ -460,6 +534,10 @@ static int papr_scm_remove(struct platform_device *pdev)
 {
struct papr_scm_priv *p = platform_get_drvdata(pdev);
 
+   mutex_lock(&papr_ndr_lock);
+   list_del(&(p->region_list));
+   mutex_unlock(&papr_ndr_lock);
+
nvdimm_bus_unregister(p->bus);
drc_pmem_unbind(p);
kfree(p->bus_desc.provider_name);
@@ -482,7 +560,25 @@ static struct platform_driver papr_scm_driver = {
},
 };
 
-module_platform_driver(papr_scm_driver);
+static int __init papr_scm_init(void)
+{
+   int ret;
+
+   ret = platform_driver_register(&papr_scm_driver);
+   if (!ret)
+   mce_register_notifier(&mce_ue_nb);
+
+   return ret;
+}
+module_init(papr_scm_init);
+
+static void __exit papr_scm_exit(void)
+{
+   mce_unregister_notifier(&mce_ue_nb);
+   platform_driver_unregister(&papr_scm_driver);
+}
+module_exit(papr_scm_exit);
+
 MODULE_DEVICE_TABLE(of, papr_scm_match);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("IBM Corporation");
-- 
2.25.2



[PATCH 01/13] powerpc/mce: Make machine_check_ue_event() static

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[PATCH 00/13] powerpc: implement machine check safe memcpy

2019-06-20 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated, we end up in a panic. In the case of an fsdax read, this should
only result in -EIO. Avoid the panic by implementing memcpy_mcsafe(),
which turns the machine check into an error return.
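
For illustration, a hedged sketch of how a pmem-style read path could use
memcpy_mcsafe(); the helper name and error policy here are hypothetical,
not the actual drivers/nvdimm code:

```c
#include <linux/errno.h>
#include <linux/string.h>	/* memcpy_mcsafe(), added later in this series */

/* Hypothetical read helper: surface a UE as -EIO instead of panicking. */
static int pmem_read_sketch(void *dst, const void *pmem_addr, size_t len)
{
	if (memcpy_mcsafe(dst, pmem_addr, len))
		return -EIO;	/* caller sees a read error; machine keeps running */

	return 0;
}
```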

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581 #50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M   
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper program, or other error.
```

The MCE is injected on a pmem address using mambo.
---

Balbir Singh (2):
  powerpc/mce: Bug fixes for MCE handling in kernel space
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (9):
  powerpc/mce: Make machine_check_ue_event() static
  powerpc/mce: Add MCE notification chain
  powerpc/mce: Move machine_check_ue_event() call
  powerpc/mce: Allow notifier callback to handle MCE
  powerpc/mce: Do not process notifier-handled UE events
  powerpc/mce: Add fixup address to UE events
  powerpc/mce: Handle memcpy_mcsafe()
  powerpc/mce: Enable MCE notifiers in external modules
  powerpc/64s: Save r13 in machine_check_common_early

Santosh Sivaraj (2):
  powerpc/memcpy_mcsafe: return remaining bytes
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/asm-prototypes.h |   1 +
 arch/powerpc/include/asm/mce.h|  13 +-
 arch/powerpc/include/asm/string.h |   2 +
 arch/powerpc/include/asm/uaccess.h|  12 ++
 arch/powerpc/kernel/exceptions-64s.S  |  14 ++
 arch/powerpc/kernel/mce.c | 102 +-
 arch/powerpc/kernel/mce_power.c   |  26 ++-
 arch/powerpc/lib/Makefile |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S   | 226 ++
 arch/powerpc/platforms/pseries/ras.c  |   6 +-
 11 files changed, 386 insertions(+), 19 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[PATCH 02/13] powerpc/mce: Bug fixes for MCE handling in kernel space

2019-06-20 Thread Santosh Sivaraj
From: Balbir Singh 

The code currently assumes PAGE_SHIFT as the shift value of
the pfn. This works correctly (mostly) for user space pages,
but the correct thing to do is:

1. Extract the shift value returned via the pte-walk APIs.
2. Use that shift value to access the instruction address.

Note that the final physical address still uses PAGE_SHIFT for
computation. handle_ierror() is not modified; handle_derror()
is modified just for extracting the correct instruction
address.

This is largely because __find_linux_pte() returns pfns
shifted by pdshift. The code is now more generic and can
handle the returned shift values.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
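
To make the arithmetic concrete, a minimal sketch of the address
reconstruction this patch performs (an illustrative helper, not code
from the diff):

```c
/*
 * Rebuild a real address from a pfn whose shift was reported by the
 * pte-walk API; shift may exceed PAGE_SHIFT for huge mappings.
 */
static unsigned long shifted_real_addr(unsigned long pfn, unsigned int shift,
				       unsigned long ea)
{
	/* page base plus the offset of 'ea' within the (huge) page */
	return (pfn << shift) + (ea & ((1UL << shift) - 1));
}
```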
---
 arch/powerpc/include/asm/mce.h   |  3 ++-
 arch/powerpc/kernel/mce_power.c  | 26 --
 arch/powerpc/platforms/pseries/ras.c |  6 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..94888a7025b3 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,8 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e39536aad30d..04666c0b40a8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -23,7 +23,8 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift)
 {
pte_t *ptep;
unsigned long flags;
@@ -36,13 +37,15 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+   ptep = find_current_mm_pte(mm->pgd, addr, NULL, shift);
else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = find_init_mm_pte(addr, shift);
local_irq_restore(flags);
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+   if (!*shift)
+   *shift = PAGE_SHIFT;
+   return (pte_val(*ptep) & PTE_RPN_MASK) >> *shift;
 }
 
 /* flush SLBs and reload */
@@ -358,15 +361,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
+   unsigned int shift;
 
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip, &shift);
if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = (pfn << shift) + (regs->nip & ((1 << shift) - 1));
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
+   pfn = addr_to_pfn(regs, op.ea, &shift);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = (pfn << shift);
return 0;
}
/*
@@ -442,12 +446,14 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
+   unsigned int shift;
 
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip,
+ &shift);
if (pfn != ULONG_MAX) {
*phys_addr =
-   (pfn << PAGE_SHIFT);
+   (pfn << shift);
}
}
}
diff --git a/a

[PATCH 03/13] powerpc/mce: Add MCE notification chain

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
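
The changelog is terse, so here is a hedged sketch of a consumer of the
new chain (callback and policy are hypothetical; the papr_scm patch
elsewhere in this thread is a real user):

```c
#include <linux/init.h>
#include <linux/notifier.h>
#include <asm/mce.h>

static int my_mce_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	struct machine_check_event *evt = data;

	if (evt->error_type != MCE_ERROR_TYPE_UE)
		return NOTIFY_DONE;	/* not ours; keep walking the chain */

	/* inspect evt->u.ue_error here ... */
	return NOTIFY_OK;
}

static struct notifier_block my_mce_nb = {
	.notifier_call = my_mce_cb,
};

static int __init my_mce_init(void)
{
	return mce_register_notifier(&my_mce_nb);
}
arch_initcall(my_mce_init);
```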
---
 arch/powerpc/include/asm/asm-prototypes.h |  1 +
 arch/powerpc/include/asm/mce.h|  4 
 arch/powerpc/kernel/exceptions-64s.S  |  4 
 arch/powerpc/kernel/mce.c | 22 ++
 4 files changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index ec1c97a8e8cb..f66f26ef3ce0 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,6 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
+void machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 94888a7025b3..948bef579086 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -214,4 +214,8 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr,
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+int mce_register_notifier(struct notifier_block *nb);
+int mce_unregister_notifier(struct notifier_block *nb);
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 6b86055e5251..2e56014fca21 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -457,6 +457,10 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
+
+   addir3,r1,STACK_FRAME_OVERHEAD
+   bl  machine_check_notify
+
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
b   4f
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..24d350a934e4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -42,6 +42,18 @@ static struct irq_work mce_event_process_work = {
 
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
+
+int mce_register_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_register(&mce_notifier_list, nb);
+}
+
+int mce_unregister_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
+}
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
@@ -635,3 +647,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
 
return 1;
 }
+
+void machine_check_notify(struct pt_regs *regs)
+{
+   struct machine_check_event evt;
+
+   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
+   return;
+
+   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+}
-- 
2.20.1



[PATCH 04/13] powerpc/mce: Move machine_check_ue_event() call

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Move the call site of machine_check_ue_event() slightly later in the MCE
codepath. No functional change intended--this is prep for a later patch
to conditionally skip the call.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 24d350a934e4..0ab171b41ede 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -156,7 +156,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
}
}
return;
@@ -656,4 +655,8 @@ void machine_check_notify(struct pt_regs *regs)
return;
 
blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   evt.u.ue_error.physical_address_provided)
+   machine_check_ue_event(&evt);
 }
-- 
2.20.1



[PATCH 05/13] powerpc/mce: Allow notifier callback to handle MCE

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

If a notifier returns NOTIFY_STOP, consider the MCE handled, just as we
do when machine_check_early() returns 1.

Signed-off-by: Reza Arbab 
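
A hedged illustration of the new contract (the recovery check is a
hypothetical placeholder, not a real API):

```c
static int my_mce_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	struct machine_check_event *evt = data;

	if (my_driver_can_recover(evt))	/* hypothetical policy check */
		return NOTIFY_STOP;	/* MCE is now considered handled */

	return NOTIFY_DONE;		/* fall through to default handling */
}
```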
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/kernel/exceptions-64s.S  |  3 +++
 arch/powerpc/kernel/mce.c | 28 ---
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index f66f26ef3ce0..49ee8f08de2a 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,7 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
-void machine_check_notify(struct pt_regs *regs);
+long machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 2e56014fca21..c83e38a403fd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -460,6 +460,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
+   ld  r11,RESULT(r1)
+   or  r3,r3,r11
+   std r3,RESULT(r1)
 
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0ab171b41ede..912efe58e0b1 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -647,16 +647,28 @@ long hmi_exception_realmode(struct pt_regs *regs)
return 1;
 }
 
-void machine_check_notify(struct pt_regs *regs)
+long machine_check_notify(struct pt_regs *regs)
 {
-   struct machine_check_event evt;
+   int index = __this_cpu_read(mce_nest_count) - 1;
+   struct machine_check_event *evt;
+   int rc;
 
-   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
-   return;
+   if (index < 0 || index >= MAX_MC_EVT)
+   return 0;
+
+   evt = this_cpu_ptr(&mce_event[index]);
 
-   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+   rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
+   if (rc & NOTIFY_STOP_MASK) {
+   evt->disposition = MCE_DISPOSITION_RECOVERED;
+   regs->msr |= MSR_RI;
 
-   if (evt.error_type == MCE_ERROR_TYPE_UE &&
-   evt.u.ue_error.physical_address_provided)
-   machine_check_ue_event(&evt);
+   return 1;
+   }
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE &&
+   evt->u.ue_error.physical_address_provided)
+   machine_check_ue_event(evt);
+
+   return 0;
 }
-- 
2.20.1



[PATCH 06/13] powerpc/mce: Do not process notifier-handled UE events

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/mce.h | 3 ++-
 arch/powerpc/kernel/mce.c  | 9 +
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 948bef579086..240dd1fdfe35 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  process_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 912efe58e0b1..2616f1f71734 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -157,6 +157,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
}
+
+   mce->u.ue_error.process_event = true;
}
return;
 }
@@ -241,6 +243,10 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   !evt.u.ue_error.process_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
@@ -660,6 +666,9 @@ long machine_check_notify(struct pt_regs *regs)
 
rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
if (rc & NOTIFY_STOP_MASK) {
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   evt->u.ue_error.process_event = false;
+
evt->disposition = MCE_DISPOSITION_RECOVERED;
regs->msr |= MSR_RI;
 
-- 
2.20.1



[PATCH 07/13] powerpc/mce: Add fixup address to UE events

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

If the instruction causing a UE has an exception table entry with a fixup
address, save it in the machine_check_event struct.

If a machine check notifier callback returns NOTIFY_STOP to indicate it
has handled the error, set nip to continue execution from the fixup
address.

Signed-off-by: Reza Arbab 
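
Conceptually, the recovery path added here behaves like the following
sketch, condensed from the two hunks below; 'handled' stands for a
NOTIFY_STOP return from a callback, and the locals are illustrative:

```c
const struct exception_table_entry *entry;

/* at save time: remember the fixup for the faulting instruction */
entry = search_exception_tables(regs->nip);
if (entry) {
	mce->u.ue_error.fixup_address_provided = true;
	mce->u.ue_error.fixup_address = extable_fixup(entry);
}

/* at notify time: if a callback handled it, resume at the fixup */
if (handled && mce->u.ue_error.fixup_address_provided)
	regs->nip = mce->u.ue_error.fixup_address;
```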
---
 arch/powerpc/include/asm/mce.h |  5 +++--
 arch/powerpc/kernel/mce.c  | 16 +++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 240dd1fdfe35..9d9661747adf 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,11 +122,12 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
+   u8  fixup_address_provided;
u8  process_event;
-   u8  reserved_1[4];
+   u8  reserved_1[3];
u64 effective_address;
u64 physical_address;
-   u8  reserved_2[8];
+   u64 fixup_address;
} ue_error;
 
struct {
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 2616f1f71734..8afda1ab7358 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -15,10 +15,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
@@ -151,6 +153,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.link_error.effective_address_provided = true;
mce->u.link_error.effective_address = addr;
} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+   const struct exception_table_entry *entry;
+
mce->u.ue_error.effective_address_provided = true;
mce->u.ue_error.effective_address = addr;
if (phys_addr != ULONG_MAX) {
@@ -158,6 +162,12 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address = phys_addr;
}
 
+   entry = search_exception_tables(regs->nip);
+   if (entry) {
+   mce->u.ue_error.fixup_address_provided = true;
+   mce->u.ue_error.fixup_address = extable_fixup(entry);
+   }
+
mce->u.ue_error.process_event = true;
}
return;
@@ -666,8 +676,12 @@ long machine_check_notify(struct pt_regs *regs)
 
rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
if (rc & NOTIFY_STOP_MASK) {
-   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   if (evt->error_type == MCE_ERROR_TYPE_UE) {
+   if (evt->u.ue_error.fixup_address_provided)
+   regs->nip = evt->u.ue_error.fixup_address;
+
evt->u.ue_error.process_event = false;
+   }
 
evt->disposition = MCE_DISPOSITION_RECOVERED;
regs->msr |= MSR_RI;
-- 
2.20.1



[PATCH 08/13] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-06-20 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem layer uses memcpy_mcsafe() to convert a machine check
exception encountered during the memcpy into an error return
value.

This patch borrows from the copyuser_power7 logic and does not
add the VMX optimizations, largely to keep the patch simple.
If needed, those optimizations can be folded in.

Signed-off-by: Balbir Singh 
Acked-by: Nicholas Piggin 
[ar...@linux.ibm.com: Added symbol export]
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 215 
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..50f865db0338
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   li  r3,-EFAULT
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r7,16(r4)
+err2;  ld  r8,24(r4)
+err2;  ld  r9,32(r4)
+err2;  ld  r10,40(r4)
+err2;  ld  r11,48(r4)
+err2;  ld  r12,56(r4)
+err2;  ld  r14,64(r4)
+err2;  ld  r15,72(r4)
+err2;  ld  r16,80(r4)
+err2;  ld  r17,88(r4)
+err2;  ld  r18,96(r4)
+err2;  ld  r19,104(r4)
+err2;  ld  r20,112(r4)
+err2;  ld  r21,120(r4)
+   addir4,r4,128
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r7,16(r3)
+err2;  std r8,24(r3)
+err2;  std r9,32(r3)
+err2;  std r10,40(r3)
+err2;  std r11,48(r3)
+err2;  std r12,56(r3)
+err2;  std r14,64(r3)
+err2;  std r15,72(r3)
+err2;  std r16,80(r3)
+err2;  std r17,88(r3)

[PATCH 09/13] powerpc/mce: Handle memcpy_mcsafe()

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Add an mce notifier intended to service memcpy_mcsafe().

The notifier uses this heuristic: if a UE occurs while accessing device
memory and the faulting instruction has a fixup entry, the callback
returns NOTIFY_STOP.

This causes the notification mechanism to consider the MCE handled and
continue execution at the fixup address, which returns -EFAULT from the
memcpy_mcsafe() call.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 8afda1ab7358..9cb5a731377b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -56,6 +56,40 @@ int mce_unregister_notifier(struct notifier_block *nb)
return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
 }
 
+static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
+  void *data)
+{
+   struct machine_check_event *evt = data;
+   unsigned long pfn;
+   struct page *page;
+
+   if (evt->error_type != MCE_ERROR_TYPE_UE ||
+   !evt->u.ue_error.physical_address_provided)
+   return NOTIFY_DONE;
+
+   pfn = evt->u.ue_error.physical_address >> PAGE_SHIFT;
+   page = pfn_to_page(pfn);
+   if (!page)
+   return NOTIFY_DONE;
+
+   /* HMM and PMEM */
+   if (is_zone_device_page(page) && evt->u.ue_error.fixup_address_provided)
+   return NOTIFY_STOP;
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block memcpy_mcsafe_nb = {
+   .notifier_call = check_memcpy_mcsafe
+};
+
+static int __init mce_mcsafe_register(void)
+{
+   mce_register_notifier(&memcpy_mcsafe_nb);
+   return 0;
+}
+arch_initcall(mce_mcsafe_register);
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
-- 
2.20.1



[PATCH 10/13] powerpc/mce: Enable MCE notifiers in external modules

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/exceptions-64s.S | 6 ++
 arch/powerpc/kernel/mce.c| 2 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index c83e38a403fd..311f1392a2ec 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -458,6 +458,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
 
+   /* Notifiers may be in a module, so enable virtual addressing. */
+   mfmsr   r11
+   ori r11,r11,MSR_IR
+   ori r11,r11,MSR_DR
+   mtmsr   r11
+
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
ld  r11,RESULT(r1)
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 9cb5a731377b..413f7866a9c4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -50,11 +50,13 @@ int mce_register_notifier(struct notifier_block *nb)
 {
return blocking_notifier_chain_register(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_register_notifier);
 
 int mce_unregister_notifier(struct notifier_block *nb)
 {
return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
 
 static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
   void *data)
-- 
2.20.1



[PATCH 11/13] powerpc/64s: Save r13 in machine_check_common_early

2019-06-20 Thread Santosh Sivaraj
From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

To fix, ensure that r13 is properly restored after an MCE.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 311f1392a2ec..932d8d05892c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -265,6 +265,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_common_early)
+   SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
/*
 * Register contents:
-- 
2.20.1



[PATCH 12/13] powerpc/memcpy_mcsafe: return remaining bytes

2019-06-20 Thread Santosh Sivaraj
memcpy_mcsafe() currently returns -EFAULT on a machine check exception;
change it to return the number of remaining bytes that need to be copied,
so that a machine check safe copy_to_user() can maintain the same behavior
as copy_to_user().

Signed-off-by: Santosh Sivaraj 
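
A sketch of the caller-visible semantics after this change (the helper
name is hypothetical):

```c
/*
 * With the new convention memcpy_mcsafe() returns the number of
 * bytes NOT copied (0 on full success), mirroring copy_to_user().
 */
static size_t bytes_copied(void *dst, const void *src, size_t len)
{
	size_t left = memcpy_mcsafe(dst, src, len);

	return len - left;	/* bytes that made it before any UE */
}
```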
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 129 +++-
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 50f865db0338..566c664aa640 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -30,11 +30,12 @@
ld  r14,STK_REG(R14)(r1)
addir1,r1,STACKFRAMESIZE
 .Ldo_err1:
-   li  r3,-EFAULT
+   mr  r3,r7
blr
 
 
 _GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
cmpldi  r5,16
blt .Lshort_copy
 
@@ -49,18 +50,21 @@ err1;   lbz r0,0(r4)
addir4,r4,1
 err1;  stb r0,0(r3)
addir3,r3,1
+   subir7,r7,1
 
 1: bf  cr7*4+2,2f
 err1;  lhz r0,0(r4)
addir4,r4,2
 err1;  sth r0,0(r3)
addir3,r3,2
+   subir7,r7,2
 
 2: bf  cr7*4+1,3f
 err1;  lwz r0,0(r4)
addir4,r4,4
 err1;  stw r0,0(r3)
addir3,r3,4
+   subir7,r7,4
 
 3: sub r5,r5,r6
cmpldi  r5,128
@@ -87,43 +91,69 @@ err1;   stw r0,0(r3)
 4:
 err2;  ld  r0,0(r4)
 err2;  ld  r6,8(r4)
-err2;  ld  r7,16(r4)
-err2;  ld  r8,24(r4)
-err2;  ld  r9,32(r4)
-err2;  ld  r10,40(r4)
-err2;  ld  r11,48(r4)
-err2;  ld  r12,56(r4)
-err2;  ld  r14,64(r4)
-err2;  ld  r15,72(r4)
-err2;  ld  r16,80(r4)
-err2;  ld  r17,88(r4)
-err2;  ld  r18,96(r4)
-err2;  ld  r19,104(r4)
-err2;  ld  r20,112(r4)
-err2;  ld  r21,120(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld  r17,80(r4)
+err2;  ld  r18,88(r4)
+err2;  ld  r19,96(r4)
+err2;  ld  r20,104(r4)
+err2;  ld  r21,112(r4)
+err2;  ld  r22,120(r4)
addir4,r4,128
 err2;  std r0,0(r3)
 err2;  std r6,8(r3)
-err2;  std r7,16(r3)
-err2;  std r8,24(r3)
-err2;  std r9,32(r3)
-err2;  std r10,40(r3)
-err2;  std r11,48(r3)
-err2;  std r12,56(r3)
-err2;  std r14,64(r3)
-err2;  std r15,72(r3)
-err2;  std r16,80(r3)
-err2;  std r17,88(r3)
-err2;  std r18,96(r3)
-err2;  std r19,104(r3)
-err2;  std r20,112(r3)
-err2;  std r21,120(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+err2;  std r15,64(r3)
+err2;  std r16,72(r3)
+err2;  std r17,80(r3)
+err2;  std r18,88(r3)
+err2;  std r19,96(r3)
+err2;  std r20,104(r3)
+err2;  std r21,112(r3)
+err2;  std r22,120(r3)
addir3,r3,128
+   subir7,r7,128
bdnz4b
 
clrldi  r5,r5,(64-7)
 
-   ld  r14,STK_REG(R14)(r1)
+   /* Up to 127B to go */
+5: srdir6,r5,4
+   mtocrf  0x01,r6
+
+6: bf  cr7*4+1,7f
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+   addir4,r4,64
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+   addir3,r3,64
+   subir7,r7,64
+
+7: ld  r14,STK_REG(R14)(r1)
ld  r15,STK_REG(R15)(r1)
ld  r16,STK_REG(R16)(r1)
ld  r17,STK_REG(R17)(r1)
@@ -134,42 +164,19 @@ err2; std r21,120(r3)
ld  r22,STK_REG(R22)(r1)
addir1,r1,STACKFRAMESIZE
 
-   /* Up to 127B to go */
-5: srdir6,r5,4
-   mtocrf  0x01,r6
-
-6: bf  cr7*4+1,7f
-err1;  ld  r0,0(r4)
-err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
-err1;  ld  r9,32(r4)
-err1;  ld  r10,40(r4)
-err1;  ld  r11,48(r4)
-err1;  ld  r12,56(r4)
-   addir4,r4,64
-err1;  std r0,0(r3)
-err1;  std r6,8(r3)
-err1;  std r7,16(r3)
-err1;  std r8,24(r3)
-err1;  std r9,32(r3)
-err1;  std r10,40(r3)
-err1;  std r11,48(r3)
-err1;  std r12,56(r3)
-   addir3,r3,64
-
/* Up to 63B to go */
-7: bf  cr7*4+2,8f
+   bf  cr7*4+2,8f
 err1;  ld  r0,0(r4)
 err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
+err1;  ld  r8,16(r4)
+err1;  ld  r9,24(r4)
addir4,r4,32
 err1;  std r0,0(r3)
 err1;  std r6,8(r3)
-err1;  std r7,16(r3

[PATCH 13/13] powerpc: add machine check safe copy_to_user

2019-06-20 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
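
For context, a hypothetical caller showing the copy_to_user()-style
contract (not from this series):

```c
static ssize_t read_into_user(char __user *ubuf, const void *kbuf, size_t len)
{
	unsigned long left = copy_to_user_mcsafe(ubuf, kbuf, len);

	if (left == len)
		return -EFAULT;		/* nothing was copied */

	return len - left;		/* short read if a UE interrupted us */
}
```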
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 12 
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..f8fcaab4c5bc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,18 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe(to, from, n);
+   prevent_write_to_user(to, n);
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[v2 01/12] powerpc/mce: Make machine_check_ue_event() static

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[v2 05/12] powerpc/mce: Allow notifier callback to handle MCE

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

If a notifier returns NOTIFY_STOP, consider the MCE handled, just as we
do when machine_check_early() returns 1.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/asm-prototypes.h |  2 +-
 arch/powerpc/include/asm/mce.h|  3 +-
 arch/powerpc/kernel/exceptions-64s.S  |  3 ++
 arch/powerpc/kernel/mce.c | 37 ++-
 4 files changed, 35 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index f66f26ef3ce0..49ee8f08de2a 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,7 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
-void machine_check_notify(struct pt_regs *regs);
+long machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 948bef579086..240dd1fdfe35 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  process_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 2e56014fca21..c83e38a403fd 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -460,6 +460,9 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
+   ld  r11,RESULT(r1)
+   or  r3,r3,r11
+   std r3,RESULT(r1)
 
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0ab171b41ede..4a37928ab30e 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -157,6 +157,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
}
+
+   mce->u.ue_error.process_event = true;
}
return;
 }
@@ -241,6 +243,10 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   !evt.u.ue_error.process_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
@@ -647,16 +653,31 @@ long hmi_exception_realmode(struct pt_regs *regs)
return 1;
 }
 
-void machine_check_notify(struct pt_regs *regs)
+long machine_check_notify(struct pt_regs *regs)
 {
-   struct machine_check_event evt;
+   int index = __this_cpu_read(mce_nest_count) - 1;
+   struct machine_check_event *evt;
+   int rc;
 
-   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
-   return;
+   if (index < 0 || index >= MAX_MC_EVT)
+   return 0;
 
-   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+   evt = this_cpu_ptr(&mce_event[index]);
 
-   if (evt.error_type == MCE_ERROR_TYPE_UE &&
-   evt.u.ue_error.physical_address_provided)
-   machine_check_ue_event(&evt);
+   rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
+   if (rc & NOTIFY_STOP_MASK) {
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   evt->u.ue_error.process_event = false;
+
+   if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI))
+   evt->disposition = MCE_DISPOSITION_RECOVERED;
+
+   return 1;
+   }
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE &&
+   evt->u.ue_error.physical_address_provided)
+   machine_check_ue_event(evt);
+
+   return 0;
 }
-- 
2.20.1



[v2 06/12] powerpc/mce: Add fixup address to UE events

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

If the instruction causing a UE has an exception table entry with a fixup
address, save it in the machine_check_event struct.

If a machine check notifier callback returns NOTIFY_STOP to indicate it
has handled the error, set nip to continue execution from the fixup
address.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/mce.h |  5 +++--
 arch/powerpc/kernel/mce.c  | 16 +++-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 240dd1fdfe35..9d9661747adf 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,11 +122,12 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
+   u8  fixup_address_provided;
u8  process_event;
-   u8  reserved_1[4];
+   u8  reserved_1[3];
u64 effective_address;
u64 physical_address;
-   u8  reserved_2[8];
+   u64 fixup_address;
} ue_error;
 
struct {
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 4a37928ab30e..0233c0ee45ab 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -15,10 +15,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_PER_CPU(int, mce_nest_count);
 static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT], mce_event);
@@ -151,6 +153,8 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.link_error.effective_address_provided = true;
mce->u.link_error.effective_address = addr;
} else if (mce->error_type == MCE_ERROR_TYPE_UE) {
+   const struct exception_table_entry *entry;
+
mce->u.ue_error.effective_address_provided = true;
mce->u.ue_error.effective_address = addr;
if (phys_addr != ULONG_MAX) {
@@ -158,6 +162,12 @@ void save_mce_event(struct pt_regs *regs, long handled,
mce->u.ue_error.physical_address = phys_addr;
}
 
+   entry = search_exception_tables(regs->nip);
+   if (entry) {
+   mce->u.ue_error.fixup_address_provided = true;
+   mce->u.ue_error.fixup_address = extable_fixup(entry);
+   }
+
mce->u.ue_error.process_event = true;
}
return;
@@ -666,8 +676,12 @@ long machine_check_notify(struct pt_regs *regs)
 
rc = blocking_notifier_call_chain(&mce_notifier_list, 0, evt);
if (rc & NOTIFY_STOP_MASK) {
-   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   if (evt->error_type == MCE_ERROR_TYPE_UE) {
+   if (evt->u.ue_error.fixup_address_provided)
+   regs->nip = evt->u.ue_error.fixup_address;
+
evt->u.ue_error.process_event = false;
+   }
 
if ((rc & NOTIFY_STOP_MASK) && (regs->msr & MSR_RI))
evt->disposition = MCE_DISPOSITION_RECOVERED;
-- 
2.20.1



[v2 07/12] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-07-01 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem layer uses memcpy_mcsafe() to convert a machine check
exception encountered during the memcpy into an error return
value.

This patch borrows from the copyuser_power7 logic and does not
add the VMX optimizations, largely to keep the patch simple.
If needed, those optimizations can be folded in.

Signed-off-by: Balbir Singh 
Acked-by: Nicholas Piggin 
[ar...@linux.ibm.com: Added symbol export]
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 215 
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..50f865db0338
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   li  r3,-EFAULT
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r7,16(r4)
+err2;  ld  r8,24(r4)
+err2;  ld  r9,32(r4)
+err2;  ld  r10,40(r4)
+err2;  ld  r11,48(r4)
+err2;  ld  r12,56(r4)
+err2;  ld  r14,64(r4)
+err2;  ld  r15,72(r4)
+err2;  ld  r16,80(r4)
+err2;  ld  r17,88(r4)
+err2;  ld  r18,96(r4)
+err2;  ld  r19,104(r4)
+err2;  ld  r20,112(r4)
+err2;  ld  r21,120(r4)
+   addir4,r4,128
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r7,16(r3)
+err2;  std r8,24(r3)
+err2;  std r9,32(r3)
+err2;  std r10,40(r3)
+err2;  std r11,48(r3)
+err2;  std r12,56(r3)
+err2;  std r14,64(r3)
+err2;  std r15,72(r3)
+err2;  std r16,80(r3)
+err2;  std r17,88(r3)

[v2 08/12] powerpc/mce: Handle memcpy_mcsafe()

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Add an mce notifier intended to service memcpy_mcsafe().

The notifier uses this heuristic: if a UE occurs when accessing device
memory and the faulting instruction has a fixup entry, the callback
will return NOTIFY_STOP.

This causes the notification mechanism to consider the MCE handled and
continue execution at the fixup address, which returns -EFAULT from the
memcpy_mcsafe() call.
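
As a rough sketch, a consumer of this notifier chain could look like the
following (the callback below is an assumed example, not code from this
series):

```c
/* Minimal sketch: claim fixable UEs, mirroring the heuristic above. */
static int my_ue_cb(struct notifier_block *nb, unsigned long val, void *data)
{
	struct machine_check_event *evt = data;

	if (evt->error_type == MCE_ERROR_TYPE_UE &&
	    evt->u.ue_error.fixup_address_provided)
		return NOTIFY_STOP;	/* handled; resume at the fixup */
	return NOTIFY_DONE;
}

static struct notifier_block my_ue_nb = { .notifier_call = my_ue_cb };
/* registered at init time via mce_register_notifier(&my_ue_nb); */
```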

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0233c0ee45ab..a8348a9bea5b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -56,6 +56,40 @@ int mce_unregister_notifier(struct notifier_block *nb)
return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
 }
 
+static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
+  void *data)
+{
+   struct machine_check_event *evt = data;
+   unsigned long pfn;
+   struct page *page;
+
+   if (evt->error_type != MCE_ERROR_TYPE_UE ||
+   !evt->u.ue_error.physical_address_provided)
+   return NOTIFY_DONE;
+
+   pfn = evt->u.ue_error.physical_address >> PAGE_SHIFT;
+   page = pfn_to_page(pfn);
+   if (!page)
+   return NOTIFY_DONE;
+
+   /* HMM and PMEM */
+   if (is_zone_device_page(page) && evt->u.ue_error.fixup_address_provided)
+   return NOTIFY_STOP;
+
+   return NOTIFY_DONE;
+}
+
+static struct notifier_block memcpy_mcsafe_nb = {
+   .notifier_call = check_memcpy_mcsafe
+};
+
+static int __init mce_mcsafe_register(void)
+{
+   mce_register_notifier(&memcpy_mcsafe_nb);
+   return 0;
+}
+arch_initcall(mce_mcsafe_register);
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
-- 
2.20.1



[v2 09/12] powerpc/mce: Enable MCE notifiers in external modules

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/exceptions-64s.S | 6 ++
 arch/powerpc/kernel/mce.c| 2 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index c83e38a403fd..311f1392a2ec 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -458,6 +458,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
 
+   /* Notifiers may be in a module, so enable virtual addressing. */
+   mfmsr   r11
+   ori r11,r11,MSR_IR
+   ori r11,r11,MSR_DR
+   mtmsr   r11
+
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_notify
ld  r11,RESULT(r1)
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a8348a9bea5b..9e4d497837d8 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -50,11 +50,13 @@ int mce_register_notifier(struct notifier_block *nb)
 {
return blocking_notifier_chain_register(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_register_notifier);
 
 int mce_unregister_notifier(struct notifier_block *nb)
 {
return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
 }
+EXPORT_SYMBOL_GPL(mce_unregister_notifier);
 
 static int check_memcpy_mcsafe(struct notifier_block *nb, unsigned long val,
   void *data)
-- 
2.20.1



[v2 10/12] powerpc/memcpy_mcsafe: return remaining bytes

2019-07-01 Thread Santosh Sivaraj
memcpy_mcsafe currently returns -EFAULT on a machine check exception; change
it to return the number of remaining bytes that need to be copied, so that
machine check safe copy_to_user can maintain the same behavior as copy_to_user.
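
To illustrate the new contract (a sketch; variable names are assumed, and
the semantics deliberately mirror copy_to_user()):

```c
size_t rem, copied;

rem = memcpy_mcsafe(dst, src, len);	/* 0 on success */
copied = len - rem;			/* bytes landed before the UE */
```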

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 129 +++-
 1 file changed, 70 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 50f865db0338..566c664aa640 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -30,11 +30,12 @@
ld  r14,STK_REG(R14)(r1)
addir1,r1,STACKFRAMESIZE
 .Ldo_err1:
-   li  r3,-EFAULT
+   mr  r3,r7
blr
 
 
 _GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
cmpldi  r5,16
blt .Lshort_copy
 
@@ -49,18 +50,21 @@ err1;   lbz r0,0(r4)
addir4,r4,1
 err1;  stb r0,0(r3)
addir3,r3,1
+   subir7,r7,1
 
 1: bf  cr7*4+2,2f
 err1;  lhz r0,0(r4)
addir4,r4,2
 err1;  sth r0,0(r3)
addir3,r3,2
+   subir7,r7,2
 
 2: bf  cr7*4+1,3f
 err1;  lwz r0,0(r4)
addir4,r4,4
 err1;  stw r0,0(r3)
addir3,r3,4
+   subir7,r7,4
 
 3: sub r5,r5,r6
cmpldi  r5,128
@@ -87,43 +91,69 @@ err1;   stw r0,0(r3)
 4:
 err2;  ld  r0,0(r4)
 err2;  ld  r6,8(r4)
-err2;  ld  r7,16(r4)
-err2;  ld  r8,24(r4)
-err2;  ld  r9,32(r4)
-err2;  ld  r10,40(r4)
-err2;  ld  r11,48(r4)
-err2;  ld  r12,56(r4)
-err2;  ld  r14,64(r4)
-err2;  ld  r15,72(r4)
-err2;  ld  r16,80(r4)
-err2;  ld  r17,88(r4)
-err2;  ld  r18,96(r4)
-err2;  ld  r19,104(r4)
-err2;  ld  r20,112(r4)
-err2;  ld  r21,120(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld  r17,80(r4)
+err2;  ld  r18,88(r4)
+err2;  ld  r19,96(r4)
+err2;  ld  r20,104(r4)
+err2;  ld  r21,112(r4)
+err2;  ld  r22,120(r4)
addir4,r4,128
 err2;  std r0,0(r3)
 err2;  std r6,8(r3)
-err2;  std r7,16(r3)
-err2;  std r8,24(r3)
-err2;  std r9,32(r3)
-err2;  std r10,40(r3)
-err2;  std r11,48(r3)
-err2;  std r12,56(r3)
-err2;  std r14,64(r3)
-err2;  std r15,72(r3)
-err2;  std r16,80(r3)
-err2;  std r17,88(r3)
-err2;  std r18,96(r3)
-err2;  std r19,104(r3)
-err2;  std r20,112(r3)
-err2;  std r21,120(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+err2;  std r15,64(r3)
+err2;  std r16,72(r3)
+err2;  std r17,80(r3)
+err2;  std r18,88(r3)
+err2;  std r19,96(r3)
+err2;  std r20,104(r3)
+err2;  std r21,112(r3)
+err2;  std r22,120(r3)
addir3,r3,128
+   subir7,r7,128
bdnz4b
 
clrldi  r5,r5,(64-7)
 
-   ld  r14,STK_REG(R14)(r1)
+   /* Up to 127B to go */
+5: srdir6,r5,4
+   mtocrf  0x01,r6
+
+6: bf  cr7*4+1,7f
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+   addir4,r4,64
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+   addir3,r3,64
+   subir7,r7,64
+
+7: ld  r14,STK_REG(R14)(r1)
ld  r15,STK_REG(R15)(r1)
ld  r16,STK_REG(R16)(r1)
ld  r17,STK_REG(R17)(r1)
@@ -134,42 +164,19 @@ err2; std r21,120(r3)
ld  r22,STK_REG(R22)(r1)
addir1,r1,STACKFRAMESIZE
 
-   /* Up to 127B to go */
-5: srdir6,r5,4
-   mtocrf  0x01,r6
-
-6: bf  cr7*4+1,7f
-err1;  ld  r0,0(r4)
-err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
-err1;  ld  r9,32(r4)
-err1;  ld  r10,40(r4)
-err1;  ld  r11,48(r4)
-err1;  ld  r12,56(r4)
-   addir4,r4,64
-err1;  std r0,0(r3)
-err1;  std r6,8(r3)
-err1;  std r7,16(r3)
-err1;  std r8,24(r3)
-err1;  std r9,32(r3)
-err1;  std r10,40(r3)
-err1;  std r11,48(r3)
-err1;  std r12,56(r3)
-   addir3,r3,64
-
/* Up to 63B to go */
-7: bf  cr7*4+2,8f
+   bf  cr7*4+2,8f
 err1;  ld  r0,0(r4)
 err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
+err1;  ld  r8,16(r4)
+err1;  ld  r9,24(r4)
addir4,r4,32
 err1;  std r0,0(r3)
 err1;  std r6,8(r3)
-err1;  std r7,16(r3

[v2 11/12] powerpc: add machine check safe copy_to_user

2019-07-01 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().
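
For context, the generic code is expected to consume this helper roughly
as below (a sketch modeled on lib/iov_iter.c's copyout_mcsafe(); the
exact details here are assumptions, not part of this patch):

```c
static int copyout_mcsafe(void __user *to, const void *from, size_t n)
{
	if (access_ok(to, n)) {
		kasan_check_read(from, n);
		n = copy_to_user_mcsafe(to, from, n);
	}
	return n;	/* bytes not copied */
}
```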

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 12 
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..f8fcaab4c5bc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,18 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe(to, from, n);
+   prevent_write_to_user(to, n);
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[v2 12/12] powerpc/64s: Save r13 in machine_check_common_early

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

To fix, ensure that r13 is properly restored after an MCE.

This commit is needed for testing this series; the underlying issue is
possibly a simulator bug.
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 311f1392a2ec..932d8d05892c 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -265,6 +265,7 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
 EXC_REAL_END(machine_check, 0x200, 0x100)
 EXC_VIRT_NONE(0x4200, 0x100)
 TRAMP_REAL_BEGIN(machine_check_common_early)
+   SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
/*
 * Register contents:
-- 
2.20.1



[v2 00/12] powerpc: implement machine check safe memcpy

2019-07-01 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of fsdax read, this should
only result in a -EIO. Avoid MCE by implementing memcpy_mcsafe.
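
For reference, the pmem read path consumes the helper roughly like this
(a simplified sketch based on drivers/nvdimm/pmem.c; not part of this
series):

```c
static blk_status_t read_pmem(struct page *page, unsigned int off,
			      void *pmem_addr, unsigned int len)
{
	int rem;

	rem = memcpy_mcsafe(page_address(page) + off, pmem_addr, len);
	if (rem)
		return BLK_STS_IOERR;	/* the fsdax read then sees -EIO */
	return BLK_STS_OK;
}
```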

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581 #50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M   
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper program, or other error.
```

MCE is injected on a pmem address using mambo. The last patch, which restores
r13, is only for testing on mambo, where r13 is not restored upon hitting
vector 200.

The memcpy code can be optimised by adding VMX optimizations, and GAS macros
can be used to enable code reusability; I will send those as another series.

--
Balbir Singh (2):
  powerpc/mce: Bug fixes for MCE handling in kernel space
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (8):
  powerpc/mce: Make machine_check_ue_event() static
  powerpc/mce: Add MCE notification chain
  powerpc/mce: Move machine_check_ue_event() call
  powerpc/mce: Allow notifier callback to handle MCE
  powerpc/mce: Add fixup address to UE events
  powerpc/mce: Handle memcpy_mcsafe()
  powerpc/mce: Enable MCE notifiers in external modules
  powerpc/64s: Save r13 in machine_check_common_early

Santosh Sivaraj (2):
  powerpc/memcpy_mcsafe: return remaining bytes
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig  |   1 +
 arch/powerpc/include/asm/asm-prototypes.h |   1 +
 arch/powerpc/include/asm/mce.h|  13 +-
 arch/powerpc/include/asm/string.h |   2 +
 arch/powerpc/include/asm/uaccess.h|  12 ++
 arch/powerpc/kernel/exceptions-64s.S  |  14 ++
 arch/powerpc/kernel/mce.c | 102 +-
 arch/powerpc/kernel/mce_power.c   |  26 ++-
 arch/powerpc/lib/Makefile |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S   | 226 ++
 arch/powerpc/platforms/pseries/ras.c  |   6 +-
 11 files changed, 386 insertions(+), 19 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[v2 02/12] powerpc/mce: Bug fixes for MCE handling in kernel space

2019-07-01 Thread Santosh Sivaraj
From: Balbir Singh 

The code currently assumes PAGE_SHIFT as the shift value of
the pfn; this works correctly (mostly) for user space pages,
but the correct thing to do is:

1. Extract the shift value returned via the pte-walk APIs.
2. Use the shift value to access the instruction address.

Note, the final physical address still uses PAGE_SHIFT for
computation. handle_ierror() is not modified, and handle_derror()
is modified just for extracting the correct instruction
address.

This is largely because __find_linux_pte() returns pfns
shifted by pdshift. The code is now more generic and can
handle the shift values returned.
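
A worked example of why the shift matters (the hugepage size and values
below are assumed purely for illustration):

```c
/* A 16MB hugepage mapping makes __find_linux_pte() report shift = 24. */
unsigned int shift = 24;
unsigned long instr_addr = (pfn << shift) +
			   (regs->nip & ((1UL << shift) - 1));
/*
 * Shifting by PAGE_SHIFT instead (16 with 64K pages) would compute
 * the wrong real address for the faulting instruction.
 */
```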

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
---
 arch/powerpc/include/asm/mce.h   |  3 ++-
 arch/powerpc/kernel/mce_power.c  | 26 --
 arch/powerpc/platforms/pseries/ras.c |  6 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..94888a7025b3 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,8 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e39536aad30d..04666c0b40a8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -23,7 +23,8 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift)
 {
pte_t *ptep;
unsigned long flags;
@@ -36,13 +37,15 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+   ptep = find_current_mm_pte(mm->pgd, addr, NULL, shift);
else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = find_init_mm_pte(addr, shift);
local_irq_restore(flags);
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+   if (!*shift)
+   *shift = PAGE_SHIFT;
+   return (pte_val(*ptep) & PTE_RPN_MASK) >> *shift;
 }
 
 /* flush SLBs and reload */
@@ -358,15 +361,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
+   unsigned int shift;
 
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip, &shift);
if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = (pfn << shift) + (regs->nip & ((1 << shift) - 1));
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
+   pfn = addr_to_pfn(regs, op.ea, &shift);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = (pfn << shift);
return 0;
}
/*
@@ -442,12 +446,14 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
+   unsigned int shift;
 
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip,
+ &shift);
if (pfn != ULONG_MAX) {
*phys_addr =
-   (pfn << PAGE_SHIFT);
+   (pfn << shift);
}
}
}
diff --git a/a

[v2 03/12] powerpc/mce: Add MCE notification chain

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Signed-off-by: Reza Arbab 
---
 arch/powerpc/include/asm/asm-prototypes.h |  1 +
 arch/powerpc/include/asm/mce.h|  4 
 arch/powerpc/kernel/exceptions-64s.S  |  4 
 arch/powerpc/kernel/mce.c | 22 ++
 4 files changed, 31 insertions(+)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index ec1c97a8e8cb..f66f26ef3ce0 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -72,6 +72,7 @@ void machine_check_exception(struct pt_regs *regs);
 void emulation_assist_interrupt(struct pt_regs *regs);
 long do_slb_fault(struct pt_regs *regs, unsigned long ea);
 void do_bad_slb_fault(struct pt_regs *regs, unsigned long ea, long err);
+void machine_check_notify(struct pt_regs *regs);
 
 /* signals, syscalls and interrupts */
 long sys_swapcontext(struct ucontext __user *old_ctx,
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 94888a7025b3..948bef579086 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -214,4 +214,8 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr,
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
+
+int mce_register_notifier(struct notifier_block *nb);
+int mce_unregister_notifier(struct notifier_block *nb);
+
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 6b86055e5251..2e56014fca21 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -457,6 +457,10 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_early
std r3,RESULT(r1)   /* Save result */
+
+   addir3,r1,STACK_FRAME_OVERHEAD
+   bl  machine_check_notify
+
ld  r12,_MSR(r1)
 BEGIN_FTR_SECTION
b   4f
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..24d350a934e4 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -42,6 +42,18 @@ static struct irq_work mce_event_process_work = {
 
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
+static BLOCKING_NOTIFIER_HEAD(mce_notifier_list);
+
+int mce_register_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_register(&mce_notifier_list, nb);
+}
+
+int mce_unregister_notifier(struct notifier_block *nb)
+{
+   return blocking_notifier_chain_unregister(&mce_notifier_list, nb);
+}
+
 static void mce_set_error_info(struct machine_check_event *mce,
   struct mce_error_info *mce_err)
 {
@@ -635,3 +647,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
 
return 1;
 }
+
+void machine_check_notify(struct pt_regs *regs)
+{
+   struct machine_check_event evt;
+
+   if (!get_mce_event(&evt, MCE_EVENT_DONTRELEASE))
+   return;
+
+   blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+}
-- 
2.20.1



[v2 04/12] powerpc/mce: Move machine_check_ue_event() call

2019-07-01 Thread Santosh Sivaraj
From: Reza Arbab 

Move the call site of machine_check_ue_event() slightly later in the MCE
codepath. No functional change intended--this is prep for a later patch
to conditionally skip the call.

Signed-off-by: Reza Arbab 
---
 arch/powerpc/kernel/mce.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 24d350a934e4..0ab171b41ede 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -156,7 +156,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
}
}
return;
@@ -656,4 +655,8 @@ void machine_check_notify(struct pt_regs *regs)
return;
 
blocking_notifier_call_chain(&mce_notifier_list, 0, &evt);
+
+   if (evt.error_type == MCE_ERROR_TYPE_UE &&
+   evt.u.ue_error.physical_address_provided)
+   machine_check_ue_event(&evt);
 }
-- 
2.20.1



[v3 1/7] powerpc/mce: Make machine_check_ue_event() static

2019-07-05 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[v3 0/7] powerpc: implement machine check safe memcpy

2019-07-05 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of fsdax read, this should
only result in a -EIO. Avoid MCE by implementing memcpy_mcsafe.

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581 #50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M   
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper program, or other error.
```

MCE is injected on a pmem address using mambo. The last patch which adds a nop
is only for testing on mambo, where r13 is not restored upon hitting vector 200.

The memcpy code can be optimised by adding VMX optimizations, and GAS macros
can be used to enable code reusability; I will send those as another series.

---
Change-log:

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch

---
Balbir Singh (2):
  powerpc/mce: Bug fixes for MCE handling in kernel space
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (2):
  powerpc/mce: Make machine_check_ue_event() static
  powerpc/64s: save r13 in MCE handler (simulator workaround)

Santosh Sivaraj (3):
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc/memcpy_mcsafe: return remaining bytes
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   7 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  12 ++
 arch/powerpc/kernel/exceptions-64s.S |   1 +
 arch/powerpc/kernel/mce.c|  11 +-
 arch/powerpc/kernel/mce_power.c  |  41 +++--
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 239 +++
 arch/powerpc/platforms/pseries/ras.c |   6 +-
 10 files changed, 302 insertions(+), 20 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[v3 2/7] powerpc/mce: Bug fixes for MCE handling in kernel space

2019-07-05 Thread Santosh Sivaraj
From: Balbir Singh 

The code currently assumes PAGE_SHIFT as the shift value of
the pfn; this works correctly (mostly) for user space pages,
but the correct thing to do is:

1. Extract the shift value returned via the pte-walk APIs.
2. Use the shift value to access the instruction address.

Note, the final physical address still uses PAGE_SHIFT for
computation. handle_ierror() is not modified, and handle_derror()
is modified just for extracting the correct instruction
address.

This is largely because __find_linux_pte() returns pfns
shifted by pdshift. The code is now more generic and can
handle the shift values returned.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h   |  3 ++-
 arch/powerpc/kernel/mce_power.c  | 26 --
 arch/powerpc/platforms/pseries/ras.c |  6 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..94888a7025b3 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,8 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e39536aad30d..04666c0b40a8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -23,7 +23,8 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift)
 {
pte_t *ptep;
unsigned long flags;
@@ -36,13 +37,15 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+   ptep = find_current_mm_pte(mm->pgd, addr, NULL, shift);
else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = find_init_mm_pte(addr, shift);
local_irq_restore(flags);
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+   if (!*shift)
+   *shift = PAGE_SHIFT;
+   return (pte_val(*ptep) & PTE_RPN_MASK) >> *shift;
 }
 
 /* flush SLBs and reload */
@@ -358,15 +361,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
+   unsigned int shift;
 
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip, &shift);
if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = (pfn << shift) + (regs->nip & ((1 << shift) - 1));
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
+   pfn = addr_to_pfn(regs, op.ea, &shift);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = (pfn << shift);
return 0;
}
/*
@@ -442,12 +446,14 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
+   unsigned int shift;
 
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip,
+ &shift);
if (pfn != ULONG_MAX) {
*phys_addr =
- 

[v3 3/7] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-07-05 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem
layer to convert a machine check exception encountered
during the memcpy into an error return value.

This patch borrows heavily from the copyuser_power7
logic and omits the VMX optimizations to keep the
patch simple. If needed, those optimizations can be
folded in.

Signed-off-by: Balbir Singh 
Acked-by: Nicholas Piggin 
[ar...@linux.ibm.com: Added symbol export]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 215 
 3 files changed, 218 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..50f865db0338
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   li  r3,-EFAULT
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r7,16(r4)
+err2;  ld  r8,24(r4)
+err2;  ld  r9,32(r4)
+err2;  ld  r10,40(r4)
+err2;  ld  r11,48(r4)
+err2;  ld  r12,56(r4)
+err2;  ld  r14,64(r4)
+err2;  ld  r15,72(r4)
+err2;  ld  r16,80(r4)
+err2;  ld  r17,88(r4)
+err2;  ld  r18,96(r4)
+err2;  ld  r19,104(r4)
+err2;  ld  r20,112(r4)
+err2;  ld  r21,120(r4)
+   addir4,r4,128
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r7,16(r3)
+err2;  std r8,24(r3)
+err2;  std r9,32(r3)
+err2;  std r10,40(r3)
+err2;  std r11,48(r3)
+err2;  std r12,56(r3)
+err2;  std r14,64(r3)
+err2;  std r15,72(r3)
+err2;  std

[v3 4/7] powerpc/mce: Handle UE event for memcpy_mcsafe

2019-07-05 Thread Santosh Sivaraj
If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Stop processing the event
further and do not print it.

Based-on-patch-by: Reza Arbab 
Cc: Reza Arbab 
Cc: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c   |  7 ++-
 arch/powerpc/kernel/mce_power.c | 15 +--
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 94888a7025b3..f74257eb013b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  ignore_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
enum MCE_Initiator  initiator:8;
enum MCE_ErrorClass error_class:8;
boolsync_error;
+   boolignore_event;
 };
 
 #define MAX_MC_EVT 100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..94f2bb307537 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,9 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
+   if (!mce->u.ue_error.ignore_event)
+   machine_check_ue_event(mce);
}
}
return;
@@ -230,6 +232,9 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   if (evt.error_type == MCE_ERROR_TYPE_UE && evt.u.ue_error.ignore_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 04666c0b40a8..db4aa98a23ac 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Convert an address related to an mm to a PFN. NOTE: we are in real
@@ -565,9 +567,18 @@ static int mce_handle_derror(struct pt_regs *regs,
return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+   struct mce_error_info *mce_err)
 {
long handled = 0;
+   const struct exception_table_entry *entry;
+
+   entry = search_exception_tables(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   return 1;
+   }
 
/*
 * On specific SCOM read via MMIO we may get a machine check
@@ -600,7 +611,7 @@ static long mce_handle_error(struct pt_regs *regs,
&phys_addr);
 
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
-   handled = mce_handle_ue_error(regs);
+   handled = mce_handle_ue_error(regs, &mce_err);
 
save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
-- 
2.20.1



[v3 5/7] powerpc/memcpy_mcsafe: return remaining bytes

2019-07-05 Thread Santosh Sivaraj
memcpy_mcsafe currently returns -EFAULT on a machine check exception; change
it to return the number of remaining bytes that need to be copied, so that
machine check safe copy_to_user can maintain the same behavior as copy_to_user.

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/lib/memcpy_mcsafe_64.S | 142 
 1 file changed, 83 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
index 50f865db0338..4d8a3d315992 100644
--- a/arch/powerpc/lib/memcpy_mcsafe_64.S
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -30,11 +30,25 @@
ld  r14,STK_REG(R14)(r1)
addir1,r1,STACKFRAMESIZE
 .Ldo_err1:
-   li  r3,-EFAULT
+   /* Do a byte by byte copy to get the exact remaining size */
+   mtctr   r7
+100:   EX_TABLE(100b, .Ldone)
+46:
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   bdnz46b
+   li  r3,0
+   blr
+
+.Ldone:
+   mfctr   r3
blr
 
 
 _GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
cmpldi  r5,16
blt .Lshort_copy
 
@@ -49,18 +63,21 @@ err1;   lbz r0,0(r4)
addir4,r4,1
 err1;  stb r0,0(r3)
addir3,r3,1
+   subir7,r7,1
 
 1: bf  cr7*4+2,2f
 err1;  lhz r0,0(r4)
addir4,r4,2
 err1;  sth r0,0(r3)
addir3,r3,2
+   subir7,r7,2
 
 2: bf  cr7*4+1,3f
 err1;  lwz r0,0(r4)
addir4,r4,4
 err1;  stw r0,0(r3)
addir3,r3,4
+   subir7,r7,4
 
 3: sub r5,r5,r6
cmpldi  r5,128
@@ -87,43 +104,69 @@ err1;  stw r0,0(r3)
 4:
 err2;  ld  r0,0(r4)
 err2;  ld  r6,8(r4)
-err2;  ld  r7,16(r4)
-err2;  ld  r8,24(r4)
-err2;  ld  r9,32(r4)
-err2;  ld  r10,40(r4)
-err2;  ld  r11,48(r4)
-err2;  ld  r12,56(r4)
-err2;  ld  r14,64(r4)
-err2;  ld  r15,72(r4)
-err2;  ld  r16,80(r4)
-err2;  ld  r17,88(r4)
-err2;  ld  r18,96(r4)
-err2;  ld  r19,104(r4)
-err2;  ld  r20,112(r4)
-err2;  ld  r21,120(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld  r17,80(r4)
+err2;  ld  r18,88(r4)
+err2;  ld  r19,96(r4)
+err2;  ld  r20,104(r4)
+err2;  ld  r21,112(r4)
+err2;  ld  r22,120(r4)
addir4,r4,128
 err2;  std r0,0(r3)
 err2;  std r6,8(r3)
-err2;  std r7,16(r3)
-err2;  std r8,24(r3)
-err2;  std r9,32(r3)
-err2;  std r10,40(r3)
-err2;  std r11,48(r3)
-err2;  std r12,56(r3)
-err2;  std r14,64(r3)
-err2;  std r15,72(r3)
-err2;  std r16,80(r3)
-err2;  std r17,88(r3)
-err2;  std r18,96(r3)
-err2;  std r19,104(r3)
-err2;  std r20,112(r3)
-err2;  std r21,120(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+err2;  std r15,64(r3)
+err2;  std r16,72(r3)
+err2;  std r17,80(r3)
+err2;  std r18,88(r3)
+err2;  std r19,96(r3)
+err2;  std r20,104(r3)
+err2;  std r21,112(r3)
+err2;  std r22,120(r3)
addir3,r3,128
+   subir7,r7,128
bdnz4b
 
clrldi  r5,r5,(64-7)
 
-   ld  r14,STK_REG(R14)(r1)
+   /* Up to 127B to go */
+5: srdir6,r5,4
+   mtocrf  0x01,r6
+
+6: bf  cr7*4+1,7f
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+   addir4,r4,64
+err2;  std r0,0(r3)
+err2;  std r6,8(r3)
+err2;  std r8,16(r3)
+err2;  std r9,24(r3)
+err2;  std r10,32(r3)
+err2;  std r11,40(r3)
+err2;  std r12,48(r3)
+err2;  std r14,56(r3)
+   addir3,r3,64
+   subir7,r7,64
+
+7: ld  r14,STK_REG(R14)(r1)
ld  r15,STK_REG(R15)(r1)
ld  r16,STK_REG(R16)(r1)
ld  r17,STK_REG(R17)(r1)
@@ -134,42 +177,19 @@ err2; std r21,120(r3)
ld  r22,STK_REG(R22)(r1)
addir1,r1,STACKFRAMESIZE
 
-   /* Up to 127B to go */
-5: srdir6,r5,4
-   mtocrf  0x01,r6
-
-6: bf  cr7*4+1,7f
-err1;  ld  r0,0(r4)
-err1;  ld  r6,8(r4)
-err1;  ld  r7,16(r4)
-err1;  ld  r8,24(r4)
-err1;  ld  r9,32(r4)
-err1;  ld  r10,40(r4)
-err1;  ld  r11,48(r4)
-err1;  ld  r12,56(r4)
-   addir4,r4,64
-err1;  std r0,0(r3)
-err1;  std r6,8(r3)
-err1;  std r7,16(r3)
-err1;  std r8,24(r3)
-err1;  std r9,32(r3)
-err1;  std r10,40(r3)
-err1;  std r11,48(r3)
-err1;  std r12,56(r3)
-   addir3,r3,64
-
/* Up to 63B to go */
-7: bf  cr7*4+2,8f

[v3 6/7] powerpc: add machine check safe copy_to_user

2019-07-05 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 12 
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..f8fcaab4c5bc 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,18 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe(to, from, n);
+   prevent_write_to_user(to, n);
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[v3 7/7] powerpc/64s: save r13 in MCE handler (simulator workaround)

2019-07-05 Thread Santosh Sivaraj
From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

After debugging, we see that the first instruction at vector 0x200 is skipped
by the simulator, so r13 is not saved. Adding a nop at 0x200 fixes the
issue.

(This commit is needed for testing this series. This should not be taken
into the tree)
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 73ba246ca11d..8e43abb2a744 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -255,6 +255,7 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 * some code path might still want to branch into the original
 * vector
 */
+   nop
SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_0(PACA_EXMC)
 BEGIN_FTR_SECTION
-- 
2.20.1



[v4 0/6] powerpc: implement machine check safe memcpy

2019-07-06 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of fsdax read, this should
only result in a -EIO. Avoid MCE by implementing memcpy_mcsafe.

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581 #50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M   
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper program, or other error.
```

MCE is injected on a pmem address using mambo. The last patch which adds a nop
is only for testing on mambo, where r13 is not restored upon hitting vector 200.

The memcpy code can be optimised by adding VMX optimizations, and GAS macros
can be used to enable code reusability; I will send those as another series.

---
Change-log:

v4:
* Squash the "return remaining bytes" patch into the memcpy_mcsafe
  implementation patch [christophe]
* Check access_ok() in copy_to_user_mcsafe() [christophe]

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch

---
Balbir Singh (2):
  powerpc/mce: Bug fixes for MCE handling in kernel space
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (2):
  powerpc/mce: Make machine_check_ue_event() static
  powerpc/64s: save r13 in MCE handler (simulator workaround)

Santosh Sivaraj (2):
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   7 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  14 ++
 arch/powerpc/kernel/exceptions-64s.S |   1 +
 arch/powerpc/kernel/mce.c|  11 +-
 arch/powerpc/kernel/mce_power.c  |  41 +++--
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 239 +++
 arch/powerpc/platforms/pseries/ras.c |   6 +-
 10 files changed, 304 insertions(+), 20 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[v4 1/6] powerpc/mce: Make machine_check_ue_event() static

2019-07-06 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[v4 2/6] powerpc/mce: Bug fixes for MCE handling in kernel space

2019-07-06 Thread Santosh Sivaraj
From: Balbir Singh 

The code currently assumes PAGE_SHIFT as the shift value of
the pfn; this works correctly (mostly) for user space pages,
but the correct thing to do is:

1. Extract the shift value returned via the pte-walk APIs.
2. Use the shift value to access the instruction address.

Note, the final physical address still uses PAGE_SHIFT for
computation. handle_ierror() is not modified, and handle_derror()
is modified just for extracting the correct instruction
address.

This is largely because __find_linux_pte() returns pfns
shifted by pdshift. The code is now more generic and can
handle the shift values returned.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h   |  3 ++-
 arch/powerpc/kernel/mce_power.c  | 26 --
 arch/powerpc/platforms/pseries/ras.c |  6 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..94888a7025b3 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,8 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e39536aad30d..04666c0b40a8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -23,7 +23,8 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift)
 {
pte_t *ptep;
unsigned long flags;
@@ -36,13 +37,15 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+   ptep = find_current_mm_pte(mm->pgd, addr, NULL, shift);
else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = find_init_mm_pte(addr, shift);
local_irq_restore(flags);
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+   if (!*shift)
+   *shift = PAGE_SHIFT;
+   return (pte_val(*ptep) & PTE_RPN_MASK) >> *shift;
 }
 
 /* flush SLBs and reload */
@@ -358,15 +361,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
+   unsigned int shift;
 
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip, &shift);
if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = (pfn << shift) + (regs->nip & ((1 << shift) - 1));
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
+   pfn = addr_to_pfn(regs, op.ea, &shift);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = (pfn << shift);
return 0;
}
/*
@@ -442,12 +446,14 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
+   unsigned int shift;
 
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip,
+ &shift);
if (pfn != ULONG_MAX) {
*phys_addr =
- 

[v4 3/6] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-07-06 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem layer so as to
convert machine check exceptions into a return value on failure in case
a machine check exception is encountered during the memcpy. The return
value is the number of bytes remaining to be copied.

This patch largely borrows from the copyuser_power7 logic and does not add
the VMX optimizations, mainly to keep the patch simple. If needed, those
optimizations can be folded in.
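
As a hedged usage sketch of the calling convention (the -EIO mapping is
the caller's choice, not part of this patch):

```
int rem;

/* memcpy_mcsafe() returns 0 on success, or the number of bytes left
 * uncopied when a machine check is taken mid-copy.
 */
rem = memcpy_mcsafe(dst, src, len);
if (rem)
        return -EIO;    /* e.g. how an fsdax read would surface the UE */
```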

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Added symbol export]
[santosh: return remaining bytes instead of -EFAULT]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 239 
 3 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..4d8a3d315992
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   /* Do a byte by byte copy to get the exact remaining size */
+   mtctr   r7
+100:   EX_TABLE(100b, .Ldone)
+46:
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   bdnz46b
+   li  r3,0
+   blr
+
+.Ldone:
+   mfctr   r3
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   subir7,r7,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+   subir7,r7,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+   subir7,r7,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld

[v4 4/6] powerpc/mce: Handle UE event for memcpy_mcsafe

2019-07-06 Thread Santosh Sivaraj
If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Do not process or print the
event any further.
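
For context, a simplified sketch of the fixup mechanism relied on here
(the actual handler change is in the diff below):

```
/* An EX_TABLE(insn, fixup) entry maps a faulting instruction to a
 * recovery label; on a UE inside such a region, redirect nip there.
 */
const struct exception_table_entry *entry;

entry = search_exception_tables(regs->nip);
if (entry)
        regs->nip = extable_fixup(entry);   /* resume at the fixup */
```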

Based-on-patch-by: Reza Arbab 
Cc: Reza Arbab 
Cc: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c   |  7 ++-
 arch/powerpc/kernel/mce_power.c | 15 +--
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 94888a7025b3..f74257eb013b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  ignore_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
enum MCE_Initiator  initiator:8;
enum MCE_ErrorClass error_class:8;
boolsync_error;
+   boolignore_event;
 };
 
 #define MAX_MC_EVT 100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..94f2bb307537 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,9 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
+   if (!mce->u.ue_error.ignore_event)
+   machine_check_ue_event(mce);
}
}
return;
@@ -230,6 +232,9 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   if (evt.error_type == MCE_ERROR_TYPE_UE && evt.u.ue_error.ignore_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 04666c0b40a8..db4aa98a23ac 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Convert an address related to an mm to a PFN. NOTE: we are in real
@@ -565,9 +567,18 @@ static int mce_handle_derror(struct pt_regs *regs,
return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+   struct mce_error_info *mce_err)
 {
long handled = 0;
+   const struct exception_table_entry *entry;
+
+   entry = search_exception_tables(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   return 1;
+   }
 
/*
 * On specific SCOM read via MMIO we may get a machine check
@@ -600,7 +611,7 @@ static long mce_handle_error(struct pt_regs *regs,
&phys_addr);
 
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
-   handled = mce_handle_ue_error(regs);
+   handled = mce_handle_ue_error(regs, &mce_err);
 
save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
-- 
2.20.1



[v4 5/6] powerpc: add machine check safe copy_to_user

2019-07-06 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().
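
A hypothetical caller, only to show the contract (not from this patch):

```
unsigned long left;

left = copy_to_user_mcsafe(ubuf, kbuf, len);    /* bytes NOT copied */
if (left)
        return -EFAULT; /* or -EIO, per the caller's error convention */
```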

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..8899864a5552 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,20 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   if (access_ok(to, n)) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe((void *)to, from, n);
+   prevent_write_to_user(to, n);
+   }
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[v4 6/6] powerpc/64s: save r13 in MCE handler (simulator workaround)

2019-07-06 Thread Santosh Sivaraj
From: Reza Arbab 

Testing my memcpy_mcsafe() work in progress with an injected UE, I get
an error like this immediately after the function returns:

BUG: Unable to handle kernel data access at 0x7fff84dec8f8
Faulting instruction address: 0xc008009c00b0
Oops: Kernel access of bad area, sig: 11 [#1]
LE PAGE_SIZE=64K MMU=Radix MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
Modules linked in: mce(O+) vmx_crypto crc32c_vpmsum
CPU: 0 PID: 1375 Comm: modprobe Tainted: G   O  5.1.0-rc6 #267
NIP:  c008009c00b0 LR: c008009c00a8 CTR: c0095f90
REGS: c000ee197790 TRAP: 0300   Tainted: G   O   (5.1.0-rc6)
MSR:  9280b033   CR: 88002826  XER: 
0004
CFAR: c0095f8c DAR: 7fff84dec8f8 DSISR: 4000 IRQMASK: 0
GPR00: 6c6c6568 c000ee197a20 c008009c8400 fff2
GPR04: c008009c02e0 0006  c3c834c8
GPR08: 0080 776a6681b7fb5100  c008009c01c8
GPR12: c0095f90 7fff84debc00 4d071440 
GPR16: 00010601 c008009e c0c98dd8 c0c98d98
GPR20: c3bba970 c008009c04d0 c008009c0618 c01e5820
GPR24:  0100 0001 c3bba958
GPR28: c008009c02e8 c008009c0318 c008009c02e0 
NIP [c008009c00b0] cause_ue+0xa8/0xe8 [mce]
LR [c008009c00a8] cause_ue+0xa0/0xe8 [mce]

After debugging we see that the first instruction at vector 0x200 is skipped by
the simulator, so r13 is never saved. Adding a nop at 0x200 fixes the
issue.

(This commit is needed for testing this series. This should not be taken
into the tree)
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 73ba246ca11d..8e43abb2a744 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -255,6 +255,7 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
 * some code path might still want to branch into the original
 * vector
 */
+   nop
SET_SCRATCH0(r13)   /* save r13 */
EXCEPTION_PROLOG_0(PACA_EXMC)
 BEGIN_FTR_SECTION
-- 
2.20.1



[v5 0/6] powerpc: implement machine check safe memcpy

2019-07-09 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of fsdax read, this should
only result in a -EIO. Avoid the panic by implementing memcpy_mcsafe.
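
The shape of the read path this enables, sketched after the existing x86
memcpy_mcsafe user in drivers/nvdimm/pmem.c (names assumed, not code from
this series):

```
static blk_status_t read_pmem(void *dst, void *pmem_addr, size_t len)
{
        /* A UE during the copy now returns nonzero instead of panicking */
        if (memcpy_mcsafe(dst, pmem_addr, len))
                return BLK_STS_IOERR;   /* reaches userspace as -EIO */
        return BLK_STS_OK;
}
```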

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581 #50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M   
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper program, or other error.
```

MCE is injected on a pmem address using mambo. The last patch which adds a nop
is only for testing on mambo, where r13 is not restored upon hitting vector 200.

The memcpy code can be optimized by adding VMX optimizations, and GAS macros can
be used to enable code reusability, which I will send as another series.

---
Change-log:

v5:
* Don't use search_exception_tables since it searches for module exception 
tables
  also [Nicholas]
* Fix commit message for patch 2 [Nicholas]

v4:
* Squash return remaining bytes patch into the memcpy_mcsafe implementation patch 
[christophe]
* access_ok() should be checked in copy_to_user_mcsafe() [christophe]

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch

---
Balbir Singh (2):
  powerpc/mce: Fix MCE handling for huge pages
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (1):
  powerpc/mce: Make machine_check_ue_event() static

Santosh Sivaraj (3):
  extable: Add function to search only kernel exception table
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   7 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  14 ++
 arch/powerpc/kernel/mce.c|  16 +-
 arch/powerpc/kernel/mce_power.c  |  41 +++--
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 239 +++
 arch/powerpc/platforms/pseries/ras.c |   6 +-
 include/linux/extable.h  |   2 +
 kernel/extable.c |  16 +-
 11 files changed, 323 insertions(+), 23 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[v5 1/6] powerpc/mce: Make machine_check_ue_event() static

2019-07-09 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[v5 2/6] powerpc/mce: Fix MCE handling for huge pages

2019-07-09 Thread Santosh Sivaraj
From: Balbir Singh 

The current code would fail on huge page addresses, since the shift
would be incorrect. Use the correct page shift value returned by
__find_linux_pte() to get the correct pfn. The code is more generic
and can handle both regular and compound pages.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h   |  3 ++-
 arch/powerpc/kernel/mce_power.c  | 26 --
 arch/powerpc/platforms/pseries/ras.c |  6 --
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..94888a7025b3 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,8 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index e39536aad30d..04666c0b40a8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -23,7 +23,8 @@
  * Convert an address related to an mm to a PFN. NOTE: we are in real
  * mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr,
+ unsigned int *shift)
 {
pte_t *ptep;
unsigned long flags;
@@ -36,13 +37,15 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
 
local_irq_save(flags);
if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
+   ptep = find_current_mm_pte(mm->pgd, addr, NULL, shift);
else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = find_init_mm_pte(addr, shift);
local_irq_restore(flags);
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+   if (!*shift)
+   *shift = PAGE_SHIFT;
+   return (pte_val(*ptep) & PTE_RPN_MASK) >> *shift;
 }
 
 /* flush SLBs and reload */
@@ -358,15 +361,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
unsigned long pfn, instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
+   unsigned int shift;
 
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip, &shift);
if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = (pfn << shift) + (regs->nip & ((1 << shift) - 1));
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
+   pfn = addr_to_pfn(regs, op.ea, &shift);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = (pfn << shift);
return 0;
}
/*
@@ -442,12 +446,14 @@ static int mce_handle_ierror(struct pt_regs *regs,
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
unsigned long pfn;
+   unsigned int shift;
 
if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
+   pfn = addr_to_pfn(regs, regs->nip,
+ &shift);
if (pfn != ULONG_MAX) {
*phys_addr =
-   (pfn << PAGE_SHIFT);
+   (pfn << shift);
}
}
}
diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index f16fdd0f71f7..5e43283d3300 100644
--- a/arch/powerpc/platforms/pser

[v5 3/6] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-07-09 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem layer so as to
convert machine check exceptions into a return value on failure in case
a machine check exception is encountered during the memcpy. The return
value is the number of bytes remaining to be copied.

This patch largely borrows from the copyuser_power7 logic and does not add
the VMX optimizations, mainly to keep the patch simple. If needed, those
optimizations can be folded in.

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Added symbol export]
[santosh: return remaining bytes instead of -EFAULT]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 239 
 3 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index c55f9c27bf79..529d6536eb4a 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..4d8a3d315992
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   /* Do a byte by byte copy to get the exact remaining size */
+   mtctr   r7
+100:   EX_TABLE(100b, .Ldone)
+46:
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   bdnz46b
+   li  r3,0
+   blr
+
+.Ldone:
+   mfctr   r3
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   subir7,r7,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+   subir7,r7,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+   subir7,r7,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld

[v5 4/6] extable: Add function to search only kernel exception table

2019-07-09 Thread Santosh Sivaraj
In real mode, search_exception_tables() cannot be called because
it also searches the module exception tables if the entry is not found
in the kernel exception table.
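
A simplified sketch of the real-mode-safe lookup added here (module
exception tables live in vmalloc space, which is inaccessible in real
mode):

```
const struct exception_table_entry *e;

/* Kernel table only -- safe to call from the real-mode MCE handler */
e = search_kernel_exception_table(regs->nip);
if (e)
        regs->nip = extable_fixup(e);
```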

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Nicholas Piggin 
Signed-off-by: Santosh Sivaraj 
---
 include/linux/extable.h |  2 ++
 kernel/extable.c| 16 +---
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..0c2819ba67f0 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct
+exception_table_entry *search_kernel_exception_table(unsigned long addr);
 
 #ifdef CONFIG_MODULES
 /* For extable.c to search modules' exception tables. */
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..6d544cb79fff 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,23 @@ void __init sort_main_extable(void)
}
 }
 
-/* Given an address, look for it in the exception tables. */
+/* For the given address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+   return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
+/*
+ * Given an address, look for it in the kernel and the module exception
+ * tables.
+ */
 const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
const struct exception_table_entry *e;
 
-   e = search_extable(__start___ex_table,
-  __stop___ex_table - __start___ex_table, addr);
+   e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
-- 
2.20.1



[v5 6/6] powerpc: add machine check safe copy_to_user

2019-07-09 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8c1c636308c8..a173b392c272 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -134,6 +134,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 76f34346b642..8899864a5552 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -386,6 +386,20 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   if (access_ok(to, n)) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe((void *)to, from, n);
+   prevent_write_to_user(to, n);
+   }
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[v5 5/6] powerpc/mce: Handle UE event for memcpy_mcsafe

2019-07-09 Thread Santosh Sivaraj
If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Do not process or print the
event any further.

Based-on-patch-by: Reza Arbab 
Cc: Reza Arbab 
Cc: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c   | 12 +++-
 arch/powerpc/kernel/mce_power.c | 15 +--
 3 files changed, 27 insertions(+), 4 deletions(-)

Nick, I didn't add has_fixup_handler to the mce_event structure; if we did, we
would have to access the mce_event from the ue_handler code as well. Mahesh
did not want mce_event to be accessed outside of save_mce_event, get_mce_event
and remove_mce_event; that is why I added ignore_event to mce_err instead.

I have added the comment you mentioned in your reply.

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 94888a7025b3..f74257eb013b 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  ignore_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
enum MCE_Initiator  initiator:8;
enum MCE_ErrorClass error_class:8;
boolsync_error;
+   boolignore_event;
 };
 
 #define MAX_MC_EVT 100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..092e6bbc603f 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,9 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
+   if (!mce->u.ue_error.ignore_event)
+   machine_check_ue_event(mce);
}
}
return;
@@ -230,6 +232,14 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   /*
+* Don't report this machine check because the caller has asked us
+* to ignore the event; it has a fixup handler which will do the
+* appropriate error handling and reporting.
+*/
+   if (evt.error_type == MCE_ERROR_TYPE_UE && evt.u.ue_error.ignore_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 04666c0b40a8..582a22b1acfb 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Convert an address related to an mm to a PFN. NOTE: we are in real
@@ -565,9 +567,18 @@ static int mce_handle_derror(struct pt_regs *regs,
return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+   struct mce_error_info *mce_err)
 {
long handled = 0;
+   const struct exception_table_entry *entry;
+
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   return 1;
+   }
 
/*
 * On specific SCOM read via MMIO we may get a machine check
@@ -600,7 +611,7 @@ static long mce_handle_error(struct pt_regs *regs,
&phys_addr);
 
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
-   handled = mce_handle_ue_error(regs);
+   handled = mce_handle_ue_error(regs, &mce_err);
 
save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
-- 
2.20.1



[v6 0/6] powerpc: implement machine check safe memcpy

2019-07-28 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of fsdax read, this should
only result in a -EIO. Avoid the panic by implementing memcpy_mcsafe.

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581
#50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M  
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper
program, or other error.
```

MCE is injected on a pmem address using mambo. The last patch which adds a nop
is only for testing on mambo, where r13 is not restored upon hitting vector 200.

The memcpy code can be optimized by adding VMX optimizations, and GAS macros can
be used to enable code reusability, which I will send as another series.

---
Change-log:
v6:
* Don't return pfn; all callers expect a physical address anyway [nick]
* Patch re-ordering: move exception table patch before memcpy_mcsafe patch 
[nick]
* Reword commit log for search_exception_tables patch [nick]

v5:
* Don't use search_exception_tables since it searches for module exception 
tables
  also [Nicholas]
* Fix commit message for patch 2 [Nicholas]

v4:
* Squash return remaining bytes patch into the memcpy_mcsafe implementation patch 
[christophe]
* access_ok() should be checked in copy_to_user_mcsafe() [christophe]

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch
--

Balbir Singh (2):
  powerpc/mce: Fix MCE handling for huge pages
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (1):
  powerpc/mce: Make machine_check_ue_event() static

Santosh Sivaraj (3):
  extable: Add function to search only kernel exception table
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   6 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  14 ++
 arch/powerpc/kernel/mce.c|  16 +-
 arch/powerpc/kernel/mce_power.c  |  65 +---
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 239 +++
 arch/powerpc/platforms/pseries/ras.c |   9 +-
 include/linux/extable.h  |   2 +
 kernel/extable.c |  11 +-
 11 files changed, 326 insertions(+), 41 deletions(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

-- 
2.20.1



[v6 1/6] powerpc/mce: Make machine_check_ue_event() static

2019-07-28 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..e78c4f18ea0a 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -203,7 +203,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[v6 2/6] powerpc/mce: Fix MCE handling for huge pages

2019-07-28 Thread Santosh Sivaraj
From: Balbir Singh 

The current code would fail on huge page addresses, since the shift would be
incorrect. Use the correct page shift value returned by __find_linux_pte() to
get the correct physical address. The code is more generic and can handle both
regular and compound pages.
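
A worked example of the masking done below, assuming a 16M huge page
(shift = 24) on a 64K-page kernel (PAGE_SHIFT = 16); the numbers are
illustrative only:

```
unsigned long rpnmask = (1UL << shift) - PAGE_SIZE;     /* 0xff */

/* OR addr's intra-hugepage page bits into the pte, so pte_pfn() yields
 * the exact 64K frame containing addr rather than the hugepage base.
 */
pte = __pte(pte_val(pte) | (addr & rpnmask));
phys_addr = pte_pfn(pte) << PAGE_SHIFT;
```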

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
[santosh: return physical address instead of pfn; change commit log]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h   |  2 +-
 arch/powerpc/kernel/mce_power.c  | 50 ++--
 arch/powerpc/platforms/pseries/ras.c |  9 ++---
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..f3a6036b6bc0 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,7 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..bed38a8e2e50 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -20,13 +20,14 @@
 #include 
 
 /*
- * Convert an address related to an mm to a PFN. NOTE: we are in real
- * mode, we could potentially race with page table updates.
+ * Convert an address related to an mm to a physical address.
+ * NOTE: we are in real mode, we could potentially race with page table 
updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr)
 {
-   pte_t *ptep;
-   unsigned long flags;
+   pte_t *ptep, pte;
+   unsigned int shift;
+   unsigned long flags, phys_addr;
struct mm_struct *mm;
 
if (user_mode(regs))
@@ -35,14 +36,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
mm = &init_mm;
 
local_irq_save(flags);
-   if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
-   else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
local_irq_restore(flags);
+
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+
+   pte = *ptep;
+   if (shift > PAGE_SHIFT) {
+   unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+
+   pte = __pte(pte_val(pte) | (addr & rpnmask));
+   }
+   phys_addr = pte_pfn(pte) << PAGE_SHIFT;
+
+   return phys_addr;
 }
 
 /* flush SLBs and reload */
@@ -354,18 +362,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
 * faults
 */
int instr;
-   unsigned long pfn, instr_addr;
+   unsigned long instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
 
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = addr_to_phys(regs, regs->nip) + (regs->nip & ~PAGE_MASK);
+   if (instr_addr != ULONG_MAX) {
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = addr_to_phys(regs, op.ea);
return 0;
}
/*
@@ -440,15 +446,9 @@ static int mce_handle_ierror(struct pt_regs *regs,
*addr = regs->nip;
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
-   unsigned long pfn;
-
-   if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   *phys_addr =
-   (pfn << PAGE_SHIFT);
-   }
-   }
+  

[v6 3/6] extable: Add function to search only kernel exception table

2019-07-28 Thread Santosh Sivaraj
In certain architecture-specific operating modes (e.g., the powerpc machine
check handler, which is unable to access vmalloc memory),
search_exception_tables() cannot be called because it also searches the module
exception tables if the entry is not found in the kernel exception table.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Nicholas Piggin 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 include/linux/extable.h |  2 ++
 kernel/extable.c| 11 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..81ecfaa83ad3 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct exception_table_entry *
+search_kernel_exception_table(unsigned long addr);
 
 #ifdef CONFIG_MODULES
 /* For extable.c to search modules' exception tables. */
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
}
 }
 
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+   return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
 /* Given an address, look for it in the exception tables. */
 const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
const struct exception_table_entry *e;
 
-   e = search_extable(__start___ex_table,
-  __stop___ex_table - __start___ex_table, addr);
+   e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
-- 
2.20.1



[v6 4/6] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-07-28 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem layer so as to
convert machine check exceptions into a return value on failure in case
a machine check exception is encountered during the memcpy. The return
value is the number of bytes remaining to be copied.

This patch largely borrows from the copyuser_power7 logic and does not add
the VMX optimizations, mainly to keep the patch simple. If needed, those
optimizations can be folded in.

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Added symbol export]
[santosh: return remaining bytes instead of -EFAULT]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 239 
 3 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index eebc782d89a5..fa6b1b657b43 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..4d8a3d315992
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   /* Do a byte by byte copy to get the exact remaining size */
+   mtctr   r7
+100:   EX_TABLE(100b, .Ldone)
+46:
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   bdnz46b
+   li  r3,0
+   blr
+
+.Ldone:
+   mfctr   r3
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   subir7,r7,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+   subir7,r7,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+   subir7,r7,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld

[v6 5/6] powerpc/mce: Handle UE event for memcpy_mcsafe

2019-07-28 Thread Santosh Sivaraj
If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Do not process or print the
event any further.

Based-on-patch-by: Reza Arbab 
Cc: Reza Arbab 
Cc: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c   | 12 +++-
 arch/powerpc/kernel/mce_power.c | 15 +--
 3 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index f3a6036b6bc0..e1931c8c2743 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  ignore_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
enum MCE_Initiator  initiator:8;
enum MCE_ErrorClass error_class:8;
boolsync_error;
+   boolignore_event;
 };
 
 #define MAX_MC_EVT 100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index e78c4f18ea0a..2df132fe3354 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,9 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
+   if (!mce->u.ue_error.ignore_event)
+   machine_check_ue_event(mce);
}
}
return;
@@ -230,6 +232,14 @@ void machine_check_queue_event(void)
if (!get_mce_event(&evt, MCE_EVENT_RELEASE))
return;
 
+   /*
+* Don't report this machine check because the caller has asked us to
+* ignore the event; it has a fixup handler which will do the
+* appropriate error handling and reporting.
+*/
+   if (evt.error_type == MCE_ERROR_TYPE_UE && evt.u.ue_error.ignore_event)
+   return;
+
index = __this_cpu_inc_return(mce_queue_count) - 1;
/* If queue is full, just return for now. */
if (index >= MAX_MC_EVT) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index bed38a8e2e50..36ca45bbb273 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Convert an address related to an mm to a physical address.
@@ -558,9 +560,18 @@ static int mce_handle_derror(struct pt_regs *regs,
return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+   struct mce_error_info *mce_err)
 {
long handled = 0;
+   const struct exception_table_entry *entry;
+
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   return 1;
+   }
 
/*
 * On specific SCOM read via MMIO we may get a machine check
@@ -593,7 +604,7 @@ static long mce_handle_error(struct pt_regs *regs,
&phys_addr);
 
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
-   handled = mce_handle_ue_error(regs);
+   handled = mce_handle_ue_error(regs, &mce_err);
 
save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
-- 
2.20.1



[v6 6/6] powerpc: add machine check safe copy_to_user

2019-07-28 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f516796dd819..18d831f52fa7 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,6 +135,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAS_ZONE_DEVICE if PPC_BOOK3S_64
select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 8b03eb44e876..15002b51ff18 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   if (access_ok(to, n)) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe((void *)to, from, n);
+   prevent_write_to_user(to, n);
+   }
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[PATCH] powerpc/kvm: Fall through switch case explicitly

2019-07-28 Thread Santosh Sivaraj
The implicit fallthrough warning was enabled globally, which broke
the build. Make the fallthrough explicit with a `fall through` comment.
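
The pattern the warning flags, in a tiny standalone example (GCC's
-Wimplicit-fallthrough accepts a `fall through` comment as annotation):

```
switch (x) {
case 2:
        y = 1;
        /* fall through */      /* silences -Wimplicit-fallthrough */
case 3:
        y |= 2;
        break;
}
```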

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/kvm/book3s_32_mmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 653936177857..18f244aad7aa 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -239,6 +239,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu 
*vcpu, gva_t eaddr,
case 2:
case 6:
pte->may_write = true;
+   /* fall through */
case 3:
case 5:
case 7:
-- 
2.20.1



[PATCH] powerpc/mce: Schedule work from irq_work

2019-08-01 Thread Santosh Sivaraj
schedule_work() cannot be called from MCE exception context, as an MCE
can be taken even in interrupt-disabled context.
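
The pattern used, in a minimal sketch (function and work-item names here
are illustrative, not the ones in the patch):

```
static void mce_ue_work_fn(struct work_struct *work);  /* heavy lifting */
static DECLARE_WORK(mce_ue_work, mce_ue_work_fn);

/* irq_work callbacks run in hard-irq context, where queueing onto a
 * workqueue is legal; the NMI-like MCE handler itself only does
 * irq_work_queue(&mce_irq_work).
 */
static void mce_irq_work_cb(struct irq_work *work)
{
        schedule_work(&mce_ue_work);
}

static struct irq_work mce_irq_work = { .func = mce_irq_work_cb };
```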

Fixes: 733e4a4c ("powerpc/mce: hookup memory_failure for UE errors")
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/kernel/mce.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..0ab6fa7c 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
}
}
return;
@@ -275,8 +274,7 @@ static void machine_process_ue_event(struct work_struct 
*work)
}
 }
 /*
- * process pending MCE event from the mce event queue. This function will be
- * called during syscall exit.
+ * process pending MCE event from the mce event queue.
  */
 static void machine_check_process_queued_event(struct irq_work *work)
 {
@@ -292,6 +290,10 @@ static void machine_check_process_queued_event(struct 
irq_work *work)
while (__this_cpu_read(mce_queue_count) > 0) {
index = __this_cpu_read(mce_queue_count) - 1;
evt = this_cpu_ptr(&mce_event_queue[index]);
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   machine_check_ue_event(evt);
+
machine_check_print_event_info(evt, false, false);
__this_cpu_dec(mce_queue_count);
}
-- 
2.20.1



[PATCH v7 0/7] powerpc: implement machine check safe memcpy

2019-08-04 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of an fsdax read, this should
only result in -EIO. Avoid the panic by implementing memcpy_mcsafe.
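
In outline, the read path can then fail gracefully (a simplified sketch
modeled on the pmem driver's read path; names are illustrative):

```
/* Bytes left uncopied after an MCE become -EIO instead of an
 * unrecoverable machine check panic. */
static int read_from_pmem(void *dst, const void *pmem_addr, size_t len)
{
	int rem = memcpy_mcsafe(dst, pmem_addr, len);

	return rem ? -EIO : 0;
}
```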

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581
#50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M  
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper
program, or other error.
```

An MCE is injected on a pmem address using mambo. The last patch, which
adds a nop, is only for testing on mambo, where r13 is not restored upon
hitting vector 200.

The memcpy code can be optimised by adding VMX optimizations, and GAS
macros can be used to enable code reusability; I will send that as
another series.

---
Change-log:
v7:
* Move schedule_work to be called from irq_work.

v6:
* Don't return pfn, all callers are expecting a physical address anyway [nick]
* Patch re-ordering: move exception table patch before memcpy_mcsafe patch 
[nick]
* Reword commit log for search_exception_tables patch [nick]

v5:
* Don't use search_exception_tables since it searches for module exception 
tables
  also [Nicholas]
* Fix commit message for patch 2 [Nicholas]

v4:
* Squash the return-remaining-bytes patch into the memcpy_mcsafe
  implementation patch [christophe]
* Access ok should be checked for copy_to_user_mcsafe() [christophe]

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch
--
Balbir Singh (2):
  powerpc/mce: Fix MCE handling for huge pages
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (1):
  powerpc/mce: Make machine_check_ue_event() static

Santosh Sivaraj (4):
  powerpc/mce: Schedule work from irq_work
  extable: Add function to search only kernel exception table
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   6 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  14 ++
 arch/powerpc/kernel/mce.c|  23 ++-
 arch/powerpc/kernel/mce_power.c  |  65 +---
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 239 +++
 arch/powerpc/platforms/pseries/ras.c |   9 +-
 include/linux/extable.h  |   2 +
 kernel/extable.c |  11 +-
 11 files changed, 330 insertions(+), 

[PATCH v7 1/7] powerpc/mce: Schedule work from irq_work

2019-08-04 Thread Santosh Sivaraj
schedule_work() cannot be called from MCE exception context, because an
MCE can occur even in interrupt-disabled context.

Fixes: 733e4a4c ("powerpc/mce: hookup memory_failure for UE errors")
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/kernel/mce.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..0ab6fa7c 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
}
}
return;
@@ -275,8 +274,7 @@ static void machine_process_ue_event(struct work_struct 
*work)
}
 }
 /*
- * process pending MCE event from the mce event queue. This function will be
- * called during syscall exit.
+ * process pending MCE event from the mce event queue.
  */
 static void machine_check_process_queued_event(struct irq_work *work)
 {
@@ -292,6 +290,10 @@ static void machine_check_process_queued_event(struct 
irq_work *work)
while (__this_cpu_read(mce_queue_count) > 0) {
index = __this_cpu_read(mce_queue_count) - 1;
evt = this_cpu_ptr(&mce_event_queue[index]);
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   machine_check_ue_event(evt);
+
machine_check_print_event_info(evt, false, false);
__this_cpu_dec(mce_queue_count);
}
-- 
2.20.1



[PATCH v7 2/7] powerpc/mce: Make machine_check_ue_event() static

2019-08-04 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0ab6fa7c..8c0b471658a7 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -202,7 +202,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[PATCH v7 3/7] powerpc/mce: Fix MCE handling for huge pages

2019-08-04 Thread Santosh Sivaraj
From: Balbir Singh 

The current code would fail on huge page addresses, since the shift would be
incorrect. Use the correct page shift value returned by __find_linux_pte() to
get the correct physical address. The code is more generic and can handle both
regular and compound pages.
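
The conversion the patch performs is, in outline (an equivalent-intent
sketch, not the exact diff below):

```
/* For a mapping of size (1UL << shift), pte_pfn() gives the base PFN
 * of the large page; the address bits between PAGE_SHIFT and shift
 * select the 4K page inside it.  Callers add the sub-page offset
 * (addr & ~PAGE_MASK) themselves. */
static unsigned long large_page_phys(pte_t pte, unsigned long addr,
				     unsigned int shift)
{
	unsigned long rpnmask = (1UL << shift) - PAGE_SIZE;

	return (pte_pfn(pte) << PAGE_SHIFT) | (addr & rpnmask);
}
```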

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
[santosh: return physical address instead of pfn; change commit log]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h   |  2 +-
 arch/powerpc/kernel/mce_power.c  | 50 ++--
 arch/powerpc/platforms/pseries/ras.c |  9 ++---
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..f3a6036b6bc0 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,7 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..bed38a8e2e50 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -20,13 +20,14 @@
 #include 
 
 /*
- * Convert an address related to an mm to a PFN. NOTE: we are in real
- * mode, we could potentially race with page table updates.
+ * Convert an address related to an mm to a physical address.
+ * NOTE: we are in real mode, we could potentially race with page table 
updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr)
 {
-   pte_t *ptep;
-   unsigned long flags;
+   pte_t *ptep, pte;
+   unsigned int shift;
+   unsigned long flags, phys_addr;
struct mm_struct *mm;
 
if (user_mode(regs))
@@ -35,14 +36,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
mm = &init_mm;
 
local_irq_save(flags);
-   if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
-   else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
local_irq_restore(flags);
+
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+
+   pte = *ptep;
+   if (shift > PAGE_SHIFT) {
+   unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+
+   pte = __pte(pte_val(pte) | (addr & rpnmask));
+   }
+   phys_addr = pte_pfn(pte) << PAGE_SHIFT;
+
+   return phys_addr;
 }
 
 /* flush SLBs and reload */
@@ -354,18 +362,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
 * faults
 */
int instr;
-   unsigned long pfn, instr_addr;
+   unsigned long instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
 
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = addr_to_phys(regs, regs->nip) + (regs->nip & ~PAGE_MASK);
+   if (instr_addr != ULONG_MAX) {
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = addr_to_phys(regs, op.ea);
return 0;
}
/*
@@ -440,15 +446,9 @@ static int mce_handle_ierror(struct pt_regs *regs,
*addr = regs->nip;
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
-   unsigned long pfn;
-
-   if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   *phys_addr =
-   (pfn << PAGE_SHIFT);
-   }
-   }
+  

[PATCH v7 4/7] extable: Add function to search only kernel exception table

2019-08-04 Thread Santosh Sivaraj
In certain architecture-specific operating modes (e.g., the powerpc
machine check handler, which is unable to access vmalloc memory),
search_exception_tables() cannot be called, because it also searches the
module exception tables if an entry is not found in the kernel exception
table.
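
A sketch of the intended call site (the helper is hypothetical; the
actual powerpc user appears later in this series):

```
/* From a real-mode handler, only the built-in kernel table is safe
 * to search, since module tables may live in vmalloc space. */
static bool fixup_from_kernel_extable(struct pt_regs *regs)
{
	const struct exception_table_entry *entry;

	entry = search_kernel_exception_table(regs->nip);
	if (!entry)
		return false;

	regs->nip = extable_fixup(entry);	/* resume at the fixup */
	return true;
}
```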

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Nicholas Piggin 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 include/linux/extable.h |  2 ++
 kernel/extable.c| 11 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..81ecfaa83ad3 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct exception_table_entry *
+search_kernel_exception_table(unsigned long addr);
 
 #ifdef CONFIG_MODULES
 /* For extable.c to search modules' exception tables. */
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
}
 }
 
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+   return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
 /* Given an address, look for it in the exception tables. */
 const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
const struct exception_table_entry *e;
 
-   e = search_extable(__start___ex_table,
-  __stop___ex_table - __start___ex_table, addr);
+   e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
-- 
2.20.1



[PATCH v7 5/7] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-08-04 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem layer so that a
machine check exception encountered during the memcpy is converted into
a return value on failure. The return value is the number of bytes
remaining to be copied.

This patch largely borrows from the copyuser_power7 logic and does not add
the VMX optimizations, largely to keep the patch simple. If needed those
optimizations can be folded in.
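
The caller-visible contract, as a sketch (the helper name is
hypothetical):

```
/* memcpy_mcsafe() returns 0 when all 'sz' bytes were copied, else the
 * number of bytes left uncopied when a machine check hit mid-copy. */
static size_t mcsafe_bytes_copied(void *dst, const void *src, size_t sz)
{
	int rem = memcpy_mcsafe(dst, src, sz);

	return sz - rem;
}
```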

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Added symbol export]
[santosh: return remaining bytes instead of -EFAULT]
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 239 
 3 files changed, 242 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index eebc782d89a5..fa6b1b657b43 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..4d8a3d315992
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,239 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   /* Do a byte by byte copy to get the exact remaining size */
+   mtctr   r7
+100:   EX_TABLE(100b, .Ldone)
+46:
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   bdnz46b
+   li  r3,0
+   blr
+
+.Ldone:
+   mfctr   r3
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   subir7,r7,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+   subir7,r7,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+   subir7,r7,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72(r4)
+err2;  ld

[PATCH v7 6/7] powerpc/mce: Handle UE event for memcpy_mcsafe

2019-08-04 Thread Santosh Sivaraj
If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Stop processing the event
further and do not print it.
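
For reference, a conceptual sketch of how a fixup entry resolves to a
new nip (the real, arch-specific definitions live in the powerpc
headers; this is shown only to illustrate extable_fixup()):

```
/* Relative exception table: each entry stores offsets from its own
 * location, so the table needs no relocation. */
struct exception_table_entry {
	int insn;	/* offset to the faulting instruction */
	int fixup;	/* offset to the recovery code */
};

static inline unsigned long
extable_fixup(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}
```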

Based-on-patch-by: Reza Arbab 
Cc: Reza Arbab 
Cc: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c   | 13 +++--
 arch/powerpc/kernel/mce_power.c | 15 +--
 3 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index f3a6036b6bc0..e1931c8c2743 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  ignore_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
enum MCE_Initiator  initiator:8;
enum MCE_ErrorClass error_class:8;
boolsync_error;
+   boolignore_event;
 };
 
 #define MAX_MC_EVT 100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 8c0b471658a7..7a9a0bf6d614 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,6 +144,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
}
}
return;
@@ -256,8 +257,13 @@ static void machine_process_ue_event(struct work_struct 
*work)
/*
 * This should probably queued elsewhere, but
 * oh! well
+*
+* Don't report this machine check because the caller has
+* asked us to ignore the event; it has a fixup handler which
+* will do the appropriate error handling and reporting.
 */
-   if (evt->error_type == MCE_ERROR_TYPE_UE) {
+   if (evt->error_type == MCE_ERROR_TYPE_UE &&
+   !evt->u.ue_error.ignore_event) {
if (evt->u.ue_error.physical_address_provided) {
unsigned long pfn;
 
@@ -291,8 +297,11 @@ static void machine_check_process_queued_event(struct 
irq_work *work)
index = __this_cpu_read(mce_queue_count) - 1;
evt = this_cpu_ptr(&mce_event_queue[index]);
 
-   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   if (evt->error_type == MCE_ERROR_TYPE_UE) {
machine_check_ue_event(evt);
+   if (evt->u.ue_error.ignore_event)
+   return;
+   }
 
machine_check_print_event_info(evt, false, false);
__this_cpu_dec(mce_queue_count);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index bed38a8e2e50..36ca45bbb273 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Convert an address related to an mm to a physical address.
@@ -558,9 +560,18 @@ static int mce_handle_derror(struct pt_regs *regs,
return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+   struct mce_error_info *mce_err)
 {
long handled = 0;
+   const struct exception_table_entry *entry;
+
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip = extable_fixup(entry);
+   return 1;
+   }
 
/*
 * On specific SCOM read via MMIO we may get a machine check
@@ -593,7 +604,7 @@ static long mce_handle_error(struct pt_regs *regs,
&phys_addr);
 
if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
-   handled = mce_handle_ue_error(regs);
+   handled = mce_handle_ue_error(regs, &mce_err);
 
save_mce_event(regs, handled, &mce_err, regs->nip, addr, phys_addr);
 
-- 
2.20.1



[PATCH v7 7/7] powerpc: add machine check safe copy_to_user

2019-08-04 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 77f6ebf97113..4316e36095a2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 8b03eb44e876..15002b51ff18 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   if (access_ok(to, n)) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe((void *)to, from, n);
+   prevent_write_to_user(to, n);
+   }
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



[PATCH v8 0/7] powerpc: implement machine check safe memcpy

2019-08-07 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of an fsdax read, this should
only result in -EIO. Avoid the panic by implementing memcpy_mcsafe.

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581
#50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M  
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper
program, or other error.
```

An MCE is injected on a pmem address using mambo. The last patch, which
adds a nop, is only for testing on mambo, where r13 is not restored upon
hitting vector 200.

The memcpy code can be optimised by adding VMX optimizations, and GAS
macros can be used to enable code reusability; I will send that as
another series.

---
Change-log:
v8:
* While ignoring UE events, continue is now used instead of return, so
  the remaining queued events are still processed.
* Checkpatch fixups for commit log

v7:
* Move schedule_work to be called from irq_work.

v6:
* Don't return pfn, all callers are expecting a physical address anyway [nick]
* Patch re-ordering: move exception table patch before memcpy_mcsafe patch 
[nick]
* Reword commit log for search_exception_tables patch [nick]

v5:
* Don't use search_exception_tables since it searches for module exception 
tables
  also [Nicholas]
* Fix commit message for patch 2 [Nicholas]

v4:
* Squash the return-remaining-bytes patch into the memcpy_mcsafe
  implementation patch [christophe]
* Access ok should be checked for copy_to_user_mcsafe() [christophe]

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch
--
Balbir Singh (2):
  powerpc/mce: Fix MCE handling for huge pages
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (1):
  powerpc/mce: Make machine_check_ue_event() static

Santosh Sivaraj (4):
  powerpc/mce: Schedule work from irq_work
  extable: Add function to search only kernel exception table
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   6 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  14 ++
 arch/powerpc/kernel/mce.c|  26 ++-
 arch/powerpc/kernel/mce_power.c  |  65 ---
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 242 +++
 arch/powerpc/platforms/pseries/ras.c |   9 +-
 include/linux/ext

[PATCH v8 1/7] powerpc/mce: Schedule work from irq_work

2019-08-07 Thread Santosh Sivaraj
schedule_work() cannot be called from MCE exception context, because an
MCE can occur even in interrupt-disabled context.

Fixes: 733e4a4c ("powerpc/mce: hookup memory_failure for UE errors")
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/kernel/mce.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..0ab6fa7c 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -144,7 +144,6 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
-   machine_check_ue_event(mce);
}
}
return;
@@ -275,8 +274,7 @@ static void machine_process_ue_event(struct work_struct 
*work)
}
 }
 /*
- * process pending MCE event from the mce event queue. This function will be
- * called during syscall exit.
+ * process pending MCE event from the mce event queue.
  */
 static void machine_check_process_queued_event(struct irq_work *work)
 {
@@ -292,6 +290,10 @@ static void machine_check_process_queued_event(struct 
irq_work *work)
while (__this_cpu_read(mce_queue_count) > 0) {
index = __this_cpu_read(mce_queue_count) - 1;
evt = this_cpu_ptr(&mce_event_queue[index]);
+
+   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   machine_check_ue_event(evt);
+
machine_check_print_event_info(evt, false, false);
__this_cpu_dec(mce_queue_count);
}
-- 
2.20.1



[PATCH v8 2/7] powerpc/mce: Make machine_check_ue_event() static

2019-08-07 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 0ab6fa7c..8c0b471658a7 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -202,7 +202,7 @@ void release_mce_event(void)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.20.1



[PATCH v8 3/7] powerpc/mce: Fix MCE handling for huge pages

2019-08-07 Thread Santosh Sivaraj
From: Balbir Singh 

The current code would fail on huge page addresses, since the shift would
be incorrect. Use the correct page shift value returned by
__find_linux_pte() to get the correct physical address. The code is more
generic and can handle both regular and compound pages.

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")
Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
Co-developed-by: Santosh Sivaraj 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h   |  2 +-
 arch/powerpc/kernel/mce_power.c  | 50 ++--
 arch/powerpc/platforms/pseries/ras.c |  9 ++---
 3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..f3a6036b6bc0 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,7 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..bed38a8e2e50 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -20,13 +20,14 @@
 #include 
 
 /*
- * Convert an address related to an mm to a PFN. NOTE: we are in real
- * mode, we could potentially race with page table updates.
+ * Convert an address related to an mm to a physical address.
+ * NOTE: we are in real mode, we could potentially race with page table 
updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr)
 {
-   pte_t *ptep;
-   unsigned long flags;
+   pte_t *ptep, pte;
+   unsigned int shift;
+   unsigned long flags, phys_addr;
struct mm_struct *mm;
 
if (user_mode(regs))
@@ -35,14 +36,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
mm = &init_mm;
 
local_irq_save(flags);
-   if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
-   else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
local_irq_restore(flags);
+
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+
+   pte = *ptep;
+   if (shift > PAGE_SHIFT) {
+   unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+
+   pte = __pte(pte_val(pte) | (addr & rpnmask));
+   }
+   phys_addr = pte_pfn(pte) << PAGE_SHIFT;
+
+   return phys_addr;
 }
 
 /* flush SLBs and reload */
@@ -354,18 +362,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
*regs, uint64_t *addr,
 * faults
 */
int instr;
-   unsigned long pfn, instr_addr;
+   unsigned long instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
 
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = addr_to_phys(regs, regs->nip) + (regs->nip & ~PAGE_MASK);
+   if (instr_addr != ULONG_MAX) {
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = addr_to_phys(regs, op.ea);
return 0;
}
/*
@@ -440,15 +446,9 @@ static int mce_handle_ierror(struct pt_regs *regs,
*addr = regs->nip;
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
-   unsigned long pfn;
-
-   if (get_paca()->in_mce < MAX_MCE_DEPTH) {
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   *phys_addr =
-   (pfn << PAGE_SHIFT);
-   }
-   }
+  

[PATCH v8 4/7] extable: Add function to search only kernel exception table

2019-08-07 Thread Santosh Sivaraj
In certain architecture-specific operating modes (e.g., the powerpc
machine check handler, which is unable to access vmalloc memory),
search_exception_tables() cannot be called, because it also searches the
module exception tables if an entry is not found in the kernel exception
table.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Nicholas Piggin 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 include/linux/extable.h |  2 ++
 kernel/extable.c| 11 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..81ecfaa83ad3 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct exception_table_entry *
+search_kernel_exception_table(unsigned long addr);
 
 #ifdef CONFIG_MODULES
 /* For extable.c to search modules' exception tables. */
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
}
 }
 
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+   return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
 /* Given an address, look for it in the exception tables. */
 const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
const struct exception_table_entry *e;
 
-   e = search_extable(__start___ex_table,
-  __stop___ex_table - __start___ex_table, addr);
+   e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
-- 
2.20.1



[PATCH v8 5/7] powerpc/memcpy: Add memcpy_mcsafe for pmem

2019-08-07 Thread Santosh Sivaraj
From: Balbir Singh 

The pmem infrastructure uses memcpy_mcsafe in the pmem layer so that a
machine check exception encountered during the memcpy is converted into
a return value on failure. The return value is the number of bytes
remaining to be copied.

This patch largely borrows from the copyuser_power7 logic and does not add
the VMX optimizations, largely to keep the patch simple. If needed those
optimizations can be folded in.

Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Added symbol export]
Co-developed-by: Santosh Sivaraj 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/string.h   |   2 +
 arch/powerpc/lib/Makefile   |   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S | 242 
 3 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/lib/memcpy_mcsafe_64.S

diff --git a/arch/powerpc/include/asm/string.h 
b/arch/powerpc/include/asm/string.h
index 9bf6dffb4090..b72692702f35 100644
--- a/arch/powerpc/include/asm/string.h
+++ b/arch/powerpc/include/asm/string.h
@@ -53,7 +53,9 @@ void *__memmove(void *to, const void *from, __kernel_size_t 
n);
 #ifndef CONFIG_KASAN
 #define __HAVE_ARCH_MEMSET32
 #define __HAVE_ARCH_MEMSET64
+#define __HAVE_ARCH_MEMCPY_MCSAFE
 
+extern int memcpy_mcsafe(void *dst, const void *src, __kernel_size_t sz);
 extern void *__memset16(uint16_t *, uint16_t v, __kernel_size_t);
 extern void *__memset32(uint32_t *, uint32_t v, __kernel_size_t);
 extern void *__memset64(uint64_t *, uint64_t v, __kernel_size_t);
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index eebc782d89a5..fa6b1b657b43 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_PPC_BOOK3S_64) += copyuser_power7.o 
copypage_power7.o \
   memcpy_power7.o
 
 obj64-y+= copypage_64.o copyuser_64.o mem_64.o hweight_64.o \
-  memcpy_64.o pmem.o
+  memcpy_64.o pmem.o memcpy_mcsafe_64.o
 
 obj64-$(CONFIG_SMP)+= locks.o
 obj64-$(CONFIG_ALTIVEC)+= vmx-helper.o
diff --git a/arch/powerpc/lib/memcpy_mcsafe_64.S 
b/arch/powerpc/lib/memcpy_mcsafe_64.S
new file mode 100644
index ..949976dc115d
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_mcsafe_64.S
@@ -0,0 +1,242 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) IBM Corporation, 2011
+ * Derived from copyuser_power7.s by Anton Blanchard 
+ * Author - Balbir Singh 
+ */
+#include 
+#include 
+#include 
+
+   .macro err1
+100:
+   EX_TABLE(100b,.Ldo_err1)
+   .endm
+
+   .macro err2
+200:
+   EX_TABLE(200b,.Ldo_err2)
+   .endm
+
+   .macro err3
+300:   EX_TABLE(300b,.Ldone)
+   .endm
+
+.Ldo_err2:
+   ld  r22,STK_REG(R22)(r1)
+   ld  r21,STK_REG(R21)(r1)
+   ld  r20,STK_REG(R20)(r1)
+   ld  r19,STK_REG(R19)(r1)
+   ld  r18,STK_REG(R18)(r1)
+   ld  r17,STK_REG(R17)(r1)
+   ld  r16,STK_REG(R16)(r1)
+   ld  r15,STK_REG(R15)(r1)
+   ld  r14,STK_REG(R14)(r1)
+   addir1,r1,STACKFRAMESIZE
+.Ldo_err1:
+   /* Do a byte by byte copy to get the exact remaining size */
+   mtctr   r7
+46:
+err3;  lbz r0,0(r4)
+   addir4,r4,1
+err3;  stb r0,0(r3)
+   addir3,r3,1
+   bdnz46b
+   li  r3,0
+   blr
+
+.Ldone:
+   mfctr   r3
+   blr
+
+
+_GLOBAL(memcpy_mcsafe)
+   mr  r7,r5
+   cmpldi  r5,16
+   blt .Lshort_copy
+
+.Lcopy:
+   /* Get the source 8B aligned */
+   neg r6,r4
+   mtocrf  0x01,r6
+   clrldi  r6,r6,(64-3)
+
+   bf  cr7*4+3,1f
+err1;  lbz r0,0(r4)
+   addir4,r4,1
+err1;  stb r0,0(r3)
+   addir3,r3,1
+   subir7,r7,1
+
+1: bf  cr7*4+2,2f
+err1;  lhz r0,0(r4)
+   addir4,r4,2
+err1;  sth r0,0(r3)
+   addir3,r3,2
+   subir7,r7,2
+
+2: bf  cr7*4+1,3f
+err1;  lwz r0,0(r4)
+   addir4,r4,4
+err1;  stw r0,0(r3)
+   addir3,r3,4
+   subir7,r7,4
+
+3: sub r5,r5,r6
+   cmpldi  r5,128
+   blt 5f
+
+   mflrr0
+   stdur1,-STACKFRAMESIZE(r1)
+   std r14,STK_REG(R14)(r1)
+   std r15,STK_REG(R15)(r1)
+   std r16,STK_REG(R16)(r1)
+   std r17,STK_REG(R17)(r1)
+   std r18,STK_REG(R18)(r1)
+   std r19,STK_REG(R19)(r1)
+   std r20,STK_REG(R20)(r1)
+   std r21,STK_REG(R21)(r1)
+   std r22,STK_REG(R22)(r1)
+   std r0,STACKFRAMESIZE+16(r1)
+
+   srdir6,r5,7
+   mtctr   r6
+
+   /* Now do cacheline (128B) sized loads and stores. */
+   .align  5
+4:
+err2;  ld  r0,0(r4)
+err2;  ld  r6,8(r4)
+err2;  ld  r8,16(r4)
+err2;  ld  r9,24(r4)
+err2;  ld  r10,32(r4)
+err2;  ld  r11,40(r4)
+err2;  ld  r12,48(r4)
+err2;  ld  r14,56(r4)
+err2;  ld  r15,64(r4)
+err2;  ld  r16,72

[PATCH v8 6/7] powerpc/mce: Handle UE event for memcpy_mcsafe

2019-08-07 Thread Santosh Sivaraj
If we take a UE on one of the instructions with a fixup entry, set nip
to continue execution at the fixup entry. Stop processing the event
further and do not print it.

Co-developed-by: Reza Arbab 
Signed-off-by: Reza Arbab 
Cc: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/include/asm/mce.h  |  4 +++-
 arch/powerpc/kernel/mce.c   | 18 --
 arch/powerpc/kernel/mce_power.c | 15 +--
 3 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index f3a6036b6bc0..e1931c8c2743 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -122,7 +122,8 @@ struct machine_check_event {
enum MCE_UeErrorType ue_error_type:8;
u8  effective_address_provided;
u8  physical_address_provided;
-   u8  reserved_1[5];
+   u8  ignore_event;
+   u8  reserved_1[4];
u64 effective_address;
u64 physical_address;
u8  reserved_2[8];
@@ -193,6 +194,7 @@ struct mce_error_info {
enum MCE_Initiator  initiator:8;
enum MCE_ErrorClass error_class:8;
boolsync_error;
+   boolignore_event;
 };
 
 #define MAX_MC_EVT 100
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 8c0b471658a7..a80f5d6ef166 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,7 +33,6 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
-static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -144,6 +143,7 @@ void save_mce_event(struct pt_regs *regs, long handled,
if (phys_addr != ULONG_MAX) {
mce->u.ue_error.physical_address_provided = true;
mce->u.ue_error.physical_address = phys_addr;
+   mce->u.ue_error.ignore_event = mce_err->ignore_event;
}
}
return;
@@ -256,8 +256,17 @@ static void machine_process_ue_event(struct work_struct 
*work)
/*
 * This should probably queued elsewhere, but
 * oh! well
+*
+* Don't report this machine check because the caller has
+* asked us to ignore the event; it has a fixup handler which
+* will do the appropriate error handling and reporting.
 */
if (evt->error_type == MCE_ERROR_TYPE_UE) {
+   if (evt->u.ue_error.ignore_event) {
+   __this_cpu_dec(mce_ue_count);
+   continue;
+   }
+
if (evt->u.ue_error.physical_address_provided) {
unsigned long pfn;
 
@@ -291,8 +300,13 @@ static void machine_check_process_queued_event(struct 
irq_work *work)
index = __this_cpu_read(mce_queue_count) - 1;
evt = this_cpu_ptr(&mce_event_queue[index]);
 
-   if (evt->error_type == MCE_ERROR_TYPE_UE)
+   if (evt->error_type == MCE_ERROR_TYPE_UE) {
machine_check_ue_event(evt);
+   if (evt->u.ue_error.ignore_event) {
+   __this_cpu_dec(mce_queue_count);
+   continue;
+   }
+   }
 
machine_check_print_event_info(evt, false, false);
__this_cpu_dec(mce_queue_count);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index bed38a8e2e50..36ca45bbb273 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Convert an address related to an mm to a physical address.
@@ -558,9 +560,18 @@ static int mce_handle_derror(struct pt_regs *regs,
return 0;
 }
 
-static long mce_handle_ue_error(struct pt_regs *regs)
+static long mce_handle_ue_error(struct pt_regs *regs,
+   struct mce_error_info *mce_err)
 {
long handled = 0;
+   const struct exception_table_entry *entry;
+
+   entry = search_kernel_exception_table(regs->nip);
+   if (entry) {
+   mce_err->ignore_event = true;
+   regs->nip 

[PATCH v8 7/7] powerpc: add machine check safe copy_to_user

2019-08-07 Thread Santosh Sivaraj
Use the memcpy_mcsafe() implementation to define copy_to_user_mcsafe().

Signed-off-by: Santosh Sivaraj 
---
 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/uaccess.h | 14 ++
 2 files changed, 15 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 77f6ebf97113..4316e36095a2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!RELOCATABLE && !HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
+   select ARCH_HAS_UACCESS_MCSAFE  if PPC64
select ARCH_HAS_UBSAN_SANITIZE_ALL
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_KEEP_MEMBLOCK
diff --git a/arch/powerpc/include/asm/uaccess.h 
b/arch/powerpc/include/asm/uaccess.h
index 8b03eb44e876..15002b51ff18 100644
--- a/arch/powerpc/include/asm/uaccess.h
+++ b/arch/powerpc/include/asm/uaccess.h
@@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void __user 
*to,
return ret;
 }
 
+static __always_inline unsigned long __must_check
+copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
+{
+   if (likely(check_copy_size(from, n, true))) {
+   if (access_ok(to, n)) {
+   allow_write_to_user(to, n);
+   n = memcpy_mcsafe((void *)to, from, n);
+   prevent_write_to_user(to, n);
+   }
+   }
+
+   return n;
+}
+
 extern unsigned long __clear_user(void __user *addr, unsigned long size);
 
 static inline unsigned long clear_user(void __user *addr, unsigned long size)
-- 
2.20.1



Re: [PATCH v8 3/7] powerpc/mce: Fix MCE handling for huge pages

2019-08-10 Thread Santosh Sivaraj
Mahesh Jagannath Salgaonkar  writes:

> On 8/7/19 8:26 PM, Santosh Sivaraj wrote:
>> From: Balbir Singh 
>> 
>> The current code would fail on huge pages addresses, since the shift would
>> be incorrect. Use the correct page shift value returned by
>> __find_linux_pte() to get the correct physical address. The code is more
>> generic and can handle both regular and compound pages.
>> 
>> Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")
>> Signed-off-by: Balbir Singh 
>> [ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
>> Signed-off-by: Reza Arbab 
>> Co-developed-by: Santosh Sivaraj 
>> Signed-off-by: Santosh Sivaraj 
>> ---
>>  arch/powerpc/include/asm/mce.h   |  2 +-
>>  arch/powerpc/kernel/mce_power.c  | 50 ++--
>>  arch/powerpc/platforms/pseries/ras.c |  9 ++---
>>  3 files changed, 29 insertions(+), 32 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
>> index a4c6a74ad2fb..f3a6036b6bc0 100644
>> --- a/arch/powerpc/include/asm/mce.h
>> +++ b/arch/powerpc/include/asm/mce.h
>> @@ -209,7 +209,7 @@ extern void release_mce_event(void);
>>  extern void machine_check_queue_event(void);
>>  extern void machine_check_print_event_info(struct machine_check_event *evt,
>> bool user_mode, bool in_guest);
>> -unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
>> +unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr);
>>  #ifdef CONFIG_PPC_BOOK3S_64
>>  void flush_and_reload_slb(void);
>>  #endif /* CONFIG_PPC_BOOK3S_64 */
>> diff --git a/arch/powerpc/kernel/mce_power.c 
>> b/arch/powerpc/kernel/mce_power.c
>> index a814d2dfb5b0..bed38a8e2e50 100644
>> --- a/arch/powerpc/kernel/mce_power.c
>> +++ b/arch/powerpc/kernel/mce_power.c
>> @@ -20,13 +20,14 @@
>>  #include 
>>  
>>  /*
>> - * Convert an address related to an mm to a PFN. NOTE: we are in real
>> - * mode, we could potentially race with page table updates.
>> + * Convert an address related to an mm to a physical address.
>> + * NOTE: we are in real mode, we could potentially race with page table 
>> updates.
>>   */
>> -unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
>> +unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr)
>>  {
>> -pte_t *ptep;
>> -unsigned long flags;
>> +pte_t *ptep, pte;
>> +unsigned int shift;
>> +unsigned long flags, phys_addr;
>>  struct mm_struct *mm;
>>  
>>  if (user_mode(regs))
>> @@ -35,14 +36,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
>> long addr)
>>  mm = &init_mm;
>>  
>>  local_irq_save(flags);
>> -if (mm == current->mm)
>> -ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
>> -else
>> -ptep = find_init_mm_pte(addr, NULL);
>> +ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
>>  local_irq_restore(flags);
>> +
>>  if (!ptep || pte_special(*ptep))
>>  return ULONG_MAX;
>> -return pte_pfn(*ptep);
>> +
>> +pte = *ptep;
>> +if (shift > PAGE_SHIFT) {
>> +unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
>> +
>> +pte = __pte(pte_val(pte) | (addr & rpnmask));
>> +}
>> +phys_addr = pte_pfn(pte) << PAGE_SHIFT;
>> +
>> +return phys_addr;
>>  }
>>  
>>  /* flush SLBs and reload */
>> @@ -354,18 +362,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs 
>> *regs, uint64_t *addr,
>
> Now that we have addr_to_phys() can we change this function name as well
> to mce_find_instr_ea_and_phys() ?

Makes sense, will avoid confusion.

>
> Tested-by: Mahesh Salgaonkar 
>
> This should go to the stable tree. Can you move this patch to 2nd position?

Thanks for testing. Sure. Will reorder and mark stable as well.

Thanks,
Santosh
>
> Thanks,
> -Mahesh.
>


Re: [PATCH v8 7/7] powerpc: add machine check safe copy_to_user

2019-08-10 Thread Santosh Sivaraj
Michael Ellerman  writes:

> Santosh Sivaraj  writes:
>> Use  memcpy_mcsafe() implementation to define copy_to_user_mcsafe()
>>
>> Signed-off-by: Santosh Sivaraj 
>> ---
>>  arch/powerpc/Kconfig   |  1 +
>>  arch/powerpc/include/asm/uaccess.h | 14 ++
>>  2 files changed, 15 insertions(+)
>>
>> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
>> index 77f6ebf97113..4316e36095a2 100644
>> --- a/arch/powerpc/Kconfig
>> +++ b/arch/powerpc/Kconfig
>> @@ -137,6 +137,7 @@ config PPC
>>  select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
>> !RELOCATABLE && !HIBERNATION)
>>  select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
>>  select ARCH_HAS_UACCESS_FLUSHCACHE  if PPC64
>> +select ARCH_HAS_UACCESS_MCSAFE  if PPC64
>>  select ARCH_HAS_UBSAN_SANITIZE_ALL
>>  select ARCH_HAVE_NMI_SAFE_CMPXCHG
>>  select ARCH_KEEP_MEMBLOCK
>> diff --git a/arch/powerpc/include/asm/uaccess.h 
>> b/arch/powerpc/include/asm/uaccess.h
>> index 8b03eb44e876..15002b51ff18 100644
>> --- a/arch/powerpc/include/asm/uaccess.h
>> +++ b/arch/powerpc/include/asm/uaccess.h
>> @@ -387,6 +387,20 @@ static inline unsigned long raw_copy_to_user(void 
>> __user *to,
>>  return ret;
>>  }
>>  
>> +static __always_inline unsigned long __must_check
>> +copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
>> +{
>> +if (likely(check_copy_size(from, n, true))) {
>> +if (access_ok(to, n)) {
>> +allow_write_to_user(to, n);
>> +n = memcpy_mcsafe((void *)to, from, n);
>> +prevent_write_to_user(to, n);
>> +}
>> +}
>> +
>> +return n;
>> +}
>
> This looks OK to me.
>
> It would be nice though if copy_to_user_mcsafe() followed the pattern of
> the other copy_to_user() etc. routines where the arch code is only
> responsible for the actual arch details, and all the checks are done in
> the generic code. That would be a good cleanup to do after this has gone
> in, as the 2nd implementation of the API.

Sure, will do that.

>
> cheers
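
The suggested split would look roughly like this (a hypothetical
sketch, not code from this series; raw_copy_to_user_mcsafe() is an
assumed arch hook):

```
/* Generic wrapper does the checking; the architecture supplies only
 * the raw machine-check-safe copy. */
static __always_inline unsigned long __must_check
copy_to_user_mcsafe(void __user *to, const void *from, unsigned long n)
{
	if (likely(check_copy_size(from, n, true)) && access_ok(to, n))
		n = raw_copy_to_user_mcsafe(to, from, n);

	return n;
}
```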



[PATCH v9 0/7] powerpc: implement machine check safe memcpy

2019-08-12 Thread Santosh Sivaraj
During a memcpy from a pmem device, if a machine check exception is
generated we end up in a panic. In case of an fsdax read, this should
only result in -EIO. Avoid the panic by implementing memcpy_mcsafe.

Before this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[ 7621.714094] Disabling lock debugging due to kernel taint
[ 7621.714099] MCE: CPU0: machine check (Severe) Host UE Load/Store [Not 
recovered]
[ 7621.714104] MCE: CPU0: NIP: [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714107] MCE: CPU0: Hardware error
[ 7621.714112] opal: Hardware platform error: Unrecoverable Machine Check 
exception
[ 7621.714118] CPU: 0 PID: 1368 Comm: mount Tainted: G   M  
5.2.0-rc5-00239-g241e39004581
#50
[ 7621.714123] NIP:  c0088978 LR: c08e16f8 CTR: 01de
[ 7621.714129] REGS: c000fffbfd70 TRAP: 0200   Tainted: G   M  
(5.2.0-rc5-00239-g241e39004581)
[ 7621.714131] MSR:  92209033   CR: 
24428840  XER: 0004
[ 7621.714160] CFAR: c00889a8 DAR: deadbeefdeadbeef DSISR: 8000 
IRQMASK: 0
[ 7621.714171] GPR00: 0e00 c000f0b8b1e0 c12cf100 
c000ed8e1100 
[ 7621.714186] GPR04: c2001100 0001 0200 
03fff1272000 
[ 7621.714201] GPR08: 8000 0010 0020 
0030 
[ 7621.714216] GPR12: 0040 7fffb8c6d390 0050 
0060 
[ 7621.714232] GPR16: 0070  0001 
c000f0b8b960 
[ 7621.714247] GPR20: 0001 c000f0b8b940 0001 
0001 
[ 7621.714262] GPR24: c1382560 c00c003b6380 c00c003b6380 
0001 
[ 7621.714277] GPR28:  0001 c200 
0001 
[ 7621.714294] NIP [c0088978] memcpy_power7+0x418/0x7e0
[ 7621.714298] LR [c08e16f8] pmem_do_bvec+0xf8/0x430
...  ...
```

After this patch series:

```
bash-4.4# mount -o dax /dev/pmem0 /mnt/pmem/
[25302.883978] Buffer I/O error on dev pmem0, logical block 0, async page read
[25303.020816] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.021236] EXT4-fs (pmem0): Can't read superblock on 2nd try
[25303.152515] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25303.284031] EXT4-fs (pmem0): DAX enabled. Warning: EXPERIMENTAL, use at your 
own risk
[25304.084100] UDF-fs: bad mount option "dax" or missing value
mount: /mnt/pmem: wrong fs type, bad option, bad superblock on /dev/pmem0, 
missing codepage or helper
program, or other error.
```

An MCE is injected on a pmem address using mambo. The last patch, which
adds a nop, is only for testing on mambo, where r13 is not restored upon
hitting vector 200.

The memcpy code can be optimised by adding VMX optimizations, and GAS
macros can be used to enable code reusability; I will send that as
another series.

---
Change-log:
v9:
* Add a new IRQ work for UE events [mahesh]
* Reorder patches, and copy stable

v8:
* While ignoring UE events, continue is now used instead of return, so
  the remaining queued events are still processed.
* Checkpatch fixups for commit log

v7:
* Move schedule_work to be called from irq_work.

v6:
* Don't return pfn, all callers are expecting a physical address anyway [nick]
* Patch re-ordering: move exception table patch before memcpy_mcsafe patch 
[nick]
* Reword commit log for search_exception_tables patch [nick]

v5:
* Don't use search_exception_tables since it searches for module exception 
tables
  also [Nicholas]
* Fix commit message for patch 2 [Nicholas]

v4:
* Squash the 'return remaining bytes' patch into the memcpy_mcsafe
  implementation patch [christophe]
* access_ok() should be checked in copy_to_user_mcsafe() [christophe]

v3:
* Drop patch which enables DR/IR for external modules
* Drop notifier call chain, we don't want to do that in real mode
* Return remaining bytes from memcpy_mcsafe correctly
* We no longer restore r13 for simulator tests, rather use a nop at 
  vector 0x200 [workaround for simulator; not to be merged]

v2:
* Don't set RI bit explicitly [mahesh]
* Re-ordered series to get r13 workaround as the last patch
--
Balbir Singh (2):
  powerpc/mce: Fix MCE handling for huge pages
  powerpc/memcpy: Add memcpy_mcsafe for pmem

Reza Arbab (1):
  powerpc/mce: Make machine_check_ue_event() static

Santosh Sivaraj (4):
  powerpc/mce: Schedule work from irq_work
  extable: Add function to search only kernel exception table
  powerpc/mce: Handle UE event for memcpy_mcsafe
  powerpc: add machine check safe copy_to_user

 arch/powerpc/Kconfig |   1 +
 arch/powerpc/include/asm/mce.h   |   6 +-
 arch/powerpc/include/asm/string.h|   2 +
 arch/powerpc/include/asm/uaccess.h   |  14 ++
 arch/powerpc/kernel/mce.c|  31 +++-
 arch/powerpc/kernel/mce_power.c  |  70 
 arch/powerpc/lib/Makefile|   2 +-
 arch/powerpc/lib/memcpy_mcsafe_64.S  | 242 

[PATCH v9 1/7] powerpc/mce: Schedule work from irq_work

2019-08-12 Thread Santosh Sivaraj
schedule_work() cannot be called from MCE exception context, since an MCE
can interrupt even interrupt-disabled code paths.
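
For context, this is the standard two-stage deferral pattern: irq_work_queue()
is safe to call from MCE/NMI-like context, and the irq_work callback runs as a
normal interrupt, where schedule_work() is legal. A rough standalone sketch
(names are made up; the real hunks are in the diff below):

```
#include <linux/irq_work.h>
#include <linux/workqueue.h>

static void ue_work_fn(struct work_struct *work)
{
	/* process queued UE events in process context */
}
static DECLARE_WORK(ue_work, ue_work_fn);

static void ue_irq_work_fn(struct irq_work *work)
{
	schedule_work(&ue_work);	/* legal here, not in MCE context */
}
static struct irq_work ue_irq_work = { .func = ue_irq_work_fn };

static void my_mce_handler(void)
{
	/* ...record the event... */
	irq_work_queue(&ue_irq_work);	/* the only deferral MCE may do */
}
```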

Fixes: 733e4a4c ("powerpc/mce: hookup memory_failure for UE errors")
Suggested-by: Mahesh Salgaonkar 
Signed-off-by: Santosh Sivaraj 
Cc: sta...@vger.kernel.org # v4.15+
---
 arch/powerpc/kernel/mce.c | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index b18df633eae9..cff31d4a501f 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -33,6 +33,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
mce_ue_event_queue);
 
 static void machine_check_process_queued_event(struct irq_work *work);
+static void machine_check_ue_irq_work(struct irq_work *work);
 void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
@@ -40,6 +41,10 @@ static struct irq_work mce_event_process_work = {
 .func = machine_check_process_queued_event,
 };
 
+static struct irq_work mce_ue_event_irq_work = {
+   .func = machine_check_ue_irq_work,
+};
+
 DECLARE_WORK(mce_ue_event_work, machine_process_ue_event);
 
 static void mce_set_error_info(struct machine_check_event *mce,
@@ -199,6 +204,10 @@ void release_mce_event(void)
get_mce_event(NULL, true);
 }
 
+static void machine_check_ue_irq_work(struct irq_work *work)
+{
+   schedule_work(&mce_ue_event_work);
+}
 
 /*
  * Queue up the MCE event which then can be handled later.
@@ -216,7 +225,7 @@ void machine_check_ue_event(struct machine_check_event *evt)
memcpy(this_cpu_ptr(&mce_ue_event_queue[index]), evt, sizeof(*evt));
 
/* Queue work to process this event later. */
-   schedule_work(&mce_ue_event_work);
+   irq_work_queue(&mce_ue_event_irq_work);
 }
 
 /*
-- 
2.21.0



[PATCH v9 2/7] powerpc/mce: Fix MCE handling for huge pages

2019-08-12 Thread Santosh Sivaraj
From: Balbir Singh 

The current code would fail on huge page addresses, since the shift would
be incorrect. Use the correct page shift value returned by
__find_linux_pte() to get the correct physical address. The code is more
generic and can handle both regular and compound pages.
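
The address math in the new hunk folds the page offset within the huge page
back into the frame bits. A small standalone worked example (userspace sketch
with made-up values, assuming 64K base pages and a 16M huge page, i.e.
shift = 24 from __find_linux_pte()):

```
#include <stdio.h>

#define PAGE_SHIFT 16UL			/* 64K base pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long addr  = 0x1234000UL;	/* address inside a 16M huge page */
	unsigned long shift = 24;		/* returned by __find_linux_pte() */
	unsigned long pfn   = 0x1000000UL >> PAGE_SHIFT; /* huge page frame */
	unsigned long rpnmask, page_phys;

	/* bits between PAGE_SHIFT and shift select the 64K frame inside
	 * the 16M page; fold them into the pfn bits */
	rpnmask   = (1UL << shift) - PAGE_SIZE;	/* 0xff0000 here */
	page_phys = (pfn << PAGE_SHIFT) | (addr & rpnmask);

	/* callers add the sub-page offset, as the hunks below do */
	printf("page_phys = %#lx, full = %#lx\n",
	       page_phys, page_phys + (addr & (PAGE_SIZE - 1)));
	return 0;	/* prints page_phys = 0x1230000, full = 0x1234000 */
}
```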

Fixes: ba41e1e1ccb9 ("powerpc/mce: Hookup derror (load/store) UE errors")
Signed-off-by: Balbir Singh 
[ar...@linux.ibm.com: Fixup pseries_do_memory_failure()]
Signed-off-by: Reza Arbab 
Co-developed-by: Santosh Sivaraj 
Signed-off-by: Santosh Sivaraj 
Tested-by: Mahesh Salgaonkar 
Cc: sta...@vger.kernel.org # v4.15+
---
 arch/powerpc/include/asm/mce.h   |  2 +-
 arch/powerpc/kernel/mce_power.c  | 55 ++--
 arch/powerpc/platforms/pseries/ras.c |  9 ++---
 3 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index a4c6a74ad2fb..f3a6036b6bc0 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -209,7 +209,7 @@ extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct machine_check_event *evt,
   bool user_mode, bool in_guest);
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr);
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr);
 #ifdef CONFIG_PPC_BOOK3S_64
 void flush_and_reload_slb(void);
 #endif /* CONFIG_PPC_BOOK3S_64 */
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..e74816f045f8 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -20,13 +20,14 @@
 #include 
 
 /*
- * Convert an address related to an mm to a PFN. NOTE: we are in real
- * mode, we could potentially race with page table updates.
+ * Convert an address related to an mm to a physical address.
+ * NOTE: we are in real mode, we could potentially race with page table updates.
  */
-unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
+unsigned long addr_to_phys(struct pt_regs *regs, unsigned long addr)
 {
-   pte_t *ptep;
-   unsigned long flags;
+   pte_t *ptep, pte;
+   unsigned int shift;
+   unsigned long flags, phys_addr;
struct mm_struct *mm;
 
if (user_mode(regs))
@@ -35,14 +36,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr)
mm = &init_mm;
 
local_irq_save(flags);
-   if (mm == current->mm)
-   ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
-   else
-   ptep = find_init_mm_pte(addr, NULL);
+   ptep = __find_linux_pte(mm->pgd, addr, NULL, &shift);
local_irq_restore(flags);
+
if (!ptep || pte_special(*ptep))
return ULONG_MAX;
-   return pte_pfn(*ptep);
+
+   pte = *ptep;
+   if (shift > PAGE_SHIFT) {
+   unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
+
+   pte = __pte(pte_val(pte) | (addr & rpnmask));
+   }
+   phys_addr = pte_pfn(pte) << PAGE_SHIFT;
+
+   return phys_addr;
 }
 
 /* flush SLBs and reload */
@@ -344,7 +352,7 @@ static const struct mce_derror_table mce_p9_derror_table[] = {
   MCE_INITIATOR_CPU,   MCE_SEV_SEVERE, true },
 { 0, false, 0, 0, 0, 0, 0 } };
 
-static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
+static int mce_find_instr_ea_and_phys(struct pt_regs *regs, uint64_t *addr,
uint64_t *phys_addr)
 {
/*
@@ -354,18 +362,16 @@ static int mce_find_instr_ea_and_pfn(struct pt_regs *regs, uint64_t *addr,
 * faults
 */
int instr;
-   unsigned long pfn, instr_addr;
+   unsigned long instr_addr;
struct instruction_op op;
struct pt_regs tmp = *regs;
 
-   pfn = addr_to_pfn(regs, regs->nip);
-   if (pfn != ULONG_MAX) {
-   instr_addr = (pfn << PAGE_SHIFT) + (regs->nip & ~PAGE_MASK);
+   instr_addr = addr_to_phys(regs, regs->nip) + (regs->nip & ~PAGE_MASK);
+   if (instr_addr != ULONG_MAX) {
instr = *(unsigned int *)(instr_addr);
if (!analyse_instr(&op, &tmp, instr)) {
-   pfn = addr_to_pfn(regs, op.ea);
*addr = op.ea;
-   *phys_addr = (pfn << PAGE_SHIFT);
+   *phys_addr = addr_to_phys(regs, op.ea);
return 0;
}
/*
@@ -440,15 +446,9 @@ static int mce_handle_ierror(struct pt_regs *regs,
*addr = regs->nip;
if (mce_err->sync_error &&
table[i].error_type == MCE_ERROR_TYPE_UE) {
-   unsigned long pfn;
-
-   if (

[PATCH v9 3/7] powerpc/mce: Make machine_check_ue_event() static

2019-08-12 Thread Santosh Sivaraj
From: Reza Arbab 

The function doesn't get used outside this file, so make it static.

Signed-off-by: Reza Arbab 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 arch/powerpc/kernel/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index cff31d4a501f..a3b122a685a5 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -34,7 +34,7 @@ static DEFINE_PER_CPU(struct machine_check_event[MAX_MC_EVT],
 
 static void machine_check_process_queued_event(struct irq_work *work);
 static void machine_check_ue_irq_work(struct irq_work *work);
-void machine_check_ue_event(struct machine_check_event *evt);
+static void machine_check_ue_event(struct machine_check_event *evt);
 static void machine_process_ue_event(struct work_struct *work);
 
 static struct irq_work mce_event_process_work = {
@@ -212,7 +212,7 @@ static void machine_check_ue_irq_work(struct irq_work *work)
 /*
  * Queue up the MCE event which then can be handled later.
  */
-void machine_check_ue_event(struct machine_check_event *evt)
+static void machine_check_ue_event(struct machine_check_event *evt)
 {
int index;
 
-- 
2.21.0



[PATCH v9 4/7] extable: Add function to search only kernel exception table

2019-08-12 Thread Santosh Sivaraj
In certain architecture-specific operating modes (e.g., the powerpc
machine check handler, which cannot access vmalloc memory),
search_exception_tables() cannot be called, because it falls back to
searching the module exception tables when an entry is not found in the
kernel exception table. Add a function that searches only the kernel
exception table.
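
A minimal sketch of how a real-mode handler might use the new helper;
mce_try_fixup() is a made-up name (not the code added later in this series),
and it assumes the arch provides extable_fixup():

```
#include <linux/extable.h>
#include <asm/extable.h>	/* extable_fixup() */
#include <asm/ptrace.h>

/* Hypothetical: redirect a faulting kernel access to its fixup. */
static bool mce_try_fixup(struct pt_regs *regs)
{
	const struct exception_table_entry *entry;

	/* safe in real mode: never touches module (vmalloc'd) tables */
	entry = search_kernel_exception_table(regs->nip);
	if (!entry)
		return false;

	regs->nip = extable_fixup(entry);
	return true;
}
```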

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Nicholas Piggin 
Signed-off-by: Santosh Sivaraj 
Reviewed-by: Nicholas Piggin 
---
 include/linux/extable.h |  2 ++
 kernel/extable.c| 11 +--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/extable.h b/include/linux/extable.h
index 41c5b3a25f67..81ecfaa83ad3 100644
--- a/include/linux/extable.h
+++ b/include/linux/extable.h
@@ -19,6 +19,8 @@ void trim_init_extable(struct module *m);
 
 /* Given an address, look for it in the exception tables */
 const struct exception_table_entry *search_exception_tables(unsigned long add);
+const struct exception_table_entry *
+search_kernel_exception_table(unsigned long addr);
 
 #ifdef CONFIG_MODULES
 /* For extable.c to search modules' exception tables. */
diff --git a/kernel/extable.c b/kernel/extable.c
index e23cce6e6092..f6c9406eec7d 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -40,13 +40,20 @@ void __init sort_main_extable(void)
}
 }
 
+/* Given an address, look for it in the kernel exception table */
+const
+struct exception_table_entry *search_kernel_exception_table(unsigned long addr)
+{
+   return search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
+}
+
 /* Given an address, look for it in the exception tables. */
 const struct exception_table_entry *search_exception_tables(unsigned long addr)
 {
const struct exception_table_entry *e;
 
-   e = search_extable(__start___ex_table,
-  __stop___ex_table - __start___ex_table, addr);
+   e = search_kernel_exception_table(addr);
if (!e)
e = search_module_extables(addr);
return e;
-- 
2.21.0


