[RFC PATCH 5/7] powerpc/64s/radix: Improve TLB flushing for page table freeing
Unmaps that free page tables always flush the entire PID, which is sub-optimal. Provide TLB range flushing with an additional PWC flush that can be use for va range invalidations with PWC flush. Time to munmap N pages of memory including last level page table teardown (after mmap, touch), local invalidate: N 1 2 4 8 16 32 64 vanilla 3.2us 3.3us 3.4us 3.6us 4.1us 5.2us 7.2us patched 1.4us 1.5us 1.7us 1.9us 2.6us 3.7us 6.2us Global invalidate: N 1 2 4 8 16 32 64 vanilla 2.2us 2.3us 2.4us 2.6us 3.2us 4.1us 6.2us patched 2.1us 2.5us 3.4us 5.2us 8.7us 15.7us 6.2us Local invalidates get much better across the board. Global ones have the same issue where multiple tlbies for va flush do get slower than the single tlbie to invalidate the PID. None of this test captures the TLB benefits of avoiding killing everything. Global gets worse, but it is brought in to line with global invalidate for munmap()s that do not free page tables. Signed-off-by: Nicholas Piggin --- arch/powerpc/mm/tlb-radix.c | 90 ++--- 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 4c6b9046e925..49cc581a31cd 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -39,6 +39,20 @@ static inline void __tlbiel_pid(unsigned long pid, int set, trace_tlbie(0, 1, rb, rs, ric, prs, r); } +static inline void __tlbie_pid(unsigned long pid, unsigned long ric) +{ + unsigned long rb,rs,prs,r; + + rb = PPC_BIT(53); /* IS = 1 */ + rs = pid << PPC_BITLSHIFT(31); + prs = 1; /* process scoped */ + r = 1; /* raidx format */ + + asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) +: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + trace_tlbie(0, 0, rb, rs, ric, prs, r); +} + /* * We use 128 set in radix mode and 256 set in hpt mode. 
*/ @@ -70,18 +84,9 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) static inline void _tlbie_pid(unsigned long pid, unsigned long ric) { - unsigned long rb,rs,prs,r; - - rb = PPC_BIT(53); /* IS = 1 */ - rs = pid << PPC_BITLSHIFT(31); - prs = 1; /* process scoped */ - r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) -: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); + __tlbie_pid(pid, ric); asm volatile("eieio; tlbsync; ptesync": : :"memory"); - trace_tlbie(0, 0, rb, rs, ric, prs, r); } static inline void __tlbiel_va(unsigned long va, unsigned long pid, @@ -123,9 +128,11 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, static inline void _tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, - unsigned long psize) + unsigned long psize, bool also_pwc) { asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); __tlbiel_va_range(start, end, pid, page_size, psize); asm volatile("ptesync": : :"memory"); } @@ -169,9 +176,11 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, static inline void _tlbie_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, - unsigned long psize) + unsigned long psize, bool also_pwc) { asm volatile("ptesync": : :"memory"); + if (also_pwc) + __tlbie_pid(pid, RIC_FLUSH_PWC); __tlbie_va_range(start, end, pid, page_size, psize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -411,13 +420,15 @@ static int radix_get_mmu_psize(int page_size) return psize; } +static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, + unsigned long end, int psize); + void radix__tlb_flush(struct mmu_gather *tlb) { int psize = 0; struct mm_struct *mm = tlb->mm; int page_size = tlb->page_size; - psize = radix_get_mmu_psize(page_size); /* * if page size is not something we 
understand, do a full mm flush * @@ -425,17 +436,28 @@ void radix__tlb_flush(struct mmu_gather *tlb) * that flushes the process table entry cache upon process teardown. * See the comment for radix in arch_exit_mmap(). */ - if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) - radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else
[RFC PATCH 4/7] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush
The single page flush ceiling is the cut-off point at which we switch from invalidating individual pages, to invalidating the entire process address space in response to a range flush. Introduce a local variant of this heuristic because local and global tlbie have significantly different properties: - Local tlbiel requires 128 instructions to invalidate a PID, global tlbie only 1 instruction. - Global tlbie instructions are expensive broadcast operations. The local ceiling has been made much higher, 2x the number of instructions required to invalidate the entire PID (i.e., 256 pages). Time to mprotect N pages of memory (after mmap, touch), local invalidate: N 32 34 64 128 256 512 vanilla 7.4us 9.0us 14.6us 26.4us 50.2us 98.3us patched 7.4us 7.8us 13.8us 26.4us 51.9us 98.3us The behaviour of both is identical at N=32 and N=512. Between there, the vanilla kernel does a PID invalidate and the patched kernel does a va range invalidate. At N=128, these require the same number of tlbiel instructions, so the patched version can be sen to be cheaper when < 128, and more expensive when > 128. However this does not well capture the cost of invalidated TLB. The additional cost at 256 pages does not seem prohibitive. It may be the case that increasing the limit further would continue to be beneficial to avoid invalidating all of the process's TLB entries. Signed-off-by: Nicholas Piggin --- arch/powerpc/mm/tlb-radix.c | 23 +++ 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 8cd933560c28..4c6b9046e925 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -325,6 +325,7 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * individual page flushes to full-pid flushes. 
*/ static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -347,8 +348,15 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, return; preempt_disable(); - local = mm_is_thread_local(mm); - full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + if (mm_is_thread_local(mm)) { + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } else { + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } if (full) { if (local) @@ -440,8 +448,15 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, return; preempt_disable(); - local = mm_is_thread_local(mm); - full = (end == TLB_FLUSH_ALL || nr_pages > tlb_single_page_flush_ceiling); + if (mm_is_thread_local(mm)) { + local = true; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_local_single_page_flush_ceiling); + } else { + local = false; + full = (end == TLB_FLUSH_ALL || + nr_pages > tlb_single_page_flush_ceiling); + } if (full) { if (local) -- 2.15.0.rc2
[RFC PATCH 3/7] powerpc/64s/radix: Optimize flush_tlb_range
Currently for radix, flush_tlb_range flushes the entire PID, because the Linux mm code does not tell us about page size here for THP vs regular pages. This is quite sub-optimal for small mremap / mprotect / change_protection. So implement va range flushes with two flush passes, one for each page size (regular and THP). The second flush has an order of magnitude fewer tlbie instructions than the first, so it is a relatively small additional cost. There is still room for improvement here with some changes to generic APIs, particularly if there are mostly THP pages to be invalidated, the small page flushes could be reduced. Time to mprotect 1 page of memory (after mmap, touch): vanilla 2.9us 1.8us patched 1.2us 1.6us Time to mprotect 30 pages of memory (after mmap, touch): vanilla 8.2us 7.2us patched 6.9us 17.9us Time to mprotect 34 pages of memory (after mmap, touch): vanilla 9.1us 8.0us patched 9.0us 8.0us 34 pages is the point at which the invalidation switches from va to entire PID, which tlbie can do in a single instruction. This is why in the case of 30 pages, the new code runs slower for this test. This is a deliberate tradeoff already present in the unmap and THP promotion code, the idea is that the benefit from avoiding flushing the entire TLB for this PID on all threads in the system outweighs the extra cost of the va range flushes. 
Signed-off-by: Nicholas Piggin --- arch/powerpc/mm/tlb-radix.c | 139 1 file changed, 102 insertions(+), 37 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index ff056016cad4..8cd933560c28 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -100,6 +100,17 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, trace_tlbie(0, 1, rb, rs, ric, prs, r); } +static inline void __tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); +} + static inline void _tlbiel_va(unsigned long va, unsigned long pid, unsigned long psize, unsigned long ric) { @@ -114,12 +125,8 @@ static inline void _tlbiel_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long page_size, unsigned long psize) { - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - asm volatile("ptesync": : :"memory"); - for (addr = start; addr < end; addr += page_size) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbiel_va_range(start, end, pid, page_size, psize); asm volatile("ptesync": : :"memory"); } @@ -139,6 +146,17 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void __tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); +} + static inline void _tlbie_va(unsigned long va, unsigned long pid, unsigned long psize, unsigned long ric) { @@ -153,12 +171,8 @@ static inline void _tlbie_va_range(unsigned long start, unsigned long end, unsigned long pid, unsigned long 
page_size, unsigned long psize) { - unsigned long addr; - unsigned long ap = mmu_get_ap(psize); - asm volatile("ptesync": : :"memory"); - for (addr = start; addr < end; addr += page_size) - __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbie_va_range(start, end, pid, page_size, psize); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } @@ -299,17 +313,78 @@ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(radix__flush_tlb_kernel_range); +#define TLB_FLUSH_ALL -1UL + /* - * Currently, for range flushing, we just do a full mm flush. Because - * we use this in code path where we don' track the page size. + * Number of pages above which we invalidate the entire PID rather than + * flush individual pages, for local and global flushes respectively. + * + * tlbie goes out to the interconnect and individual ops are more costly. + * It also does not iterate over sets like the local tlbiel variant when + * invalidating a full PID, so it has a far lower threshold to change from + * individual page flushes to full-pid fl
[RFC PATCH 2/7] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
Move the barriers and range iteration down into the _tlbie* level, which improves readability. Signed-off-by: Nicholas Piggin --- arch/powerpc/mm/tlb-radix.c | 70 ++--- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index eeb658faa6c1..ff056016cad4 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) } static inline void __tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) + unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned long pid, } static inline void _tlbiel_va(unsigned long va, unsigned long pid, - unsigned long ap, unsigned long ric) + unsigned long psize, unsigned long ric) { + unsigned long ap = mmu_get_ap(psize); + asm volatile("ptesync": : :"memory"); __tlbiel_va(va, pid, ap, ric); asm volatile("ptesync": : :"memory"); } +static inline void _tlbiel_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + for (addr = start; addr < end; addr += page_size) + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + asm volatile("ptesync": : :"memory"); +} + static inline void __tlbie_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { @@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned long pid, } static inline void _tlbie_va(unsigned long va, unsigned long pid, -unsigned long ap, unsigned long ric) + unsigned long psize, unsigned long ric) { + unsigned long ap = mmu_get_ap(psize); + asm volatile("ptesync": : :"memory"); __tlbie_va(va, pid, ap, ric); asm volatile("eieio; tlbsync; ptesync": : :"memory"); } +static inline void 
_tlbie_va_range(unsigned long start, unsigned long end, + unsigned long pid, unsigned long page_size, + unsigned long psize) +{ + unsigned long addr; + unsigned long ap = mmu_get_ap(psize); + + asm volatile("ptesync": : :"memory"); + for (addr = start; addr < end; addr += page_size) + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} /* * Base TLB flushing operations: @@ -174,12 +203,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmadd int psize) { unsigned long pid; - unsigned long ap = mmu_get_ap(psize); preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -239,16 +267,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) { unsigned long pid; - unsigned long ap = mmu_get_ap(psize); pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) return; preempt_disable(); if (!mm_is_thread_local(mm)) - _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB); else - _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB); + _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB); preempt_enable(); } @@ -335,9 +362,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize) { unsigned long pid; - unsigned long addr; bool local; - unsigned long ap = mmu_get_ap(psize); unsigned long page_size = 1UL << mmu_psize_defs[psize].shift; pid = mm->context.id; @@ -354,18 +379,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, _tlbie_pid(pid, RIC_FLUSH_TLB); } else { - asm volatile("ptesync": : :"memory"); - for (addr = start; addr < end; addr += page_size) { - - if (local) - __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); - else - __tlbie_va(addr, pid
[RFC PATCH 1/7] powerpc/64s/radix: optimize TLB range flush barriers
Short range flushes issue a sequences of tlbie(l) instructions for individual effective addresses. These do not all require individual barrier sequences, only one covering all tlbie(l) instructions. Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync") made a similar optimization for tlbiel for PID flushing. For tlbie, the ISA says: The tlbsync instruction provides an ordering function for the effects of all tlbie instructions executed by the thread executing the tlbsync instruction, with respect to the memory barrier created by a subsequent ptesync instruction executed by the same thread. Time to munmap 30 pages of memory (after mmap, touch): local global vanilla 10.9us 22.3us patched 3.4us 14.4us Signed-off-by: Nicholas Piggin --- arch/powerpc/mm/tlb-radix.c | 41 - 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 0c8464653aa3..eeb658faa6c1 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric) trace_tlbie(0, 0, rb, rs, ric, prs, r); } -static inline void _tlbiel_va(unsigned long va, unsigned long pid, +static inline void __tlbiel_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("ptesync": : :"memory"); trace_tlbie(0, 1, rb, rs, ric, prs, r); } -static inline void _tlbie_va(unsigned long va, unsigned long pid, +static inline void _tlbiel_va(unsigned long va, unsigned long pid, + unsigned long ap, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + __tlbiel_va(va, pid, ap, ric); + asm volatile("ptesync": : 
:"memory"); +} + +static inline void __tlbie_va(unsigned long va, unsigned long pid, unsigned long ap, unsigned long ric) { unsigned long rb,rs,prs,r; @@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("eieio; tlbsync; ptesync": : :"memory"); trace_tlbie(0, 0, rb, rs, ric, prs, r); } +static inline void _tlbie_va(unsigned long va, unsigned long pid, +unsigned long ap, unsigned long ric) +{ + asm volatile("ptesync": : :"memory"); + __tlbie_va(va, pid, ap, ric); + asm volatile("eieio; tlbsync; ptesync": : :"memory"); +} + + /* * Base TLB flushing operations: * @@ -339,14 +352,20 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, _tlbiel_pid(pid, RIC_FLUSH_TLB); else _tlbie_pid(pid, RIC_FLUSH_TLB); + } else { + asm volatile("ptesync": : :"memory"); for (addr = start; addr < end; addr += page_size) { if (local) - _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); else - _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); } + if (local) + asm volatile("ptesync": : :"memory"); + else + asm volatile("eieio; tlbsync; ptesync": : :"memory"); } preempt_enable(); } @@ -377,6 +396,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) _tlbie_pid(pid, RIC_FLUSH_PWC); /* Then iterate the pages */ + asm volatile("ptesync": : :"memory"); end = addr + HPAGE_PMD_SIZE; for (; addr < end; addr += PAGE_SIZE) { if (local) @@ -384,7 +404,10 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) else _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); } - + if (local) + asm volatile("ptesync": : :"memory"); + else + asm volatile("eieio; tlbsync; ptesync": : :"memory"); preempt_enable(); } #endif /* 
CONFIG_TRANSPARENT_HUGEPAGE */ -- 2.15.0
[RFC PATCH 0/7] powerpc/64s/radix TLB flush performance improvements
Here's a random mix of performance improvements for radix TLB flushing code. The main aims are to reduce the amount of translation that gets invalidated, and to reduce global flushes where we can do local. To that end, a parallel kernel compile benchmark using powerpc:tlbie tracepoint shows a reduction in tlbie instructions from about 290,000 to 80,000, and a reduction in tlbiel instructions from 49,500,000 to 15,000,000. Looks great, but unfortunately does not translate to a statistically significant performance improvement! The needle on TLB misses does not move much, I suspect because a lot of the flushing is done at startup and shutdown, and because a significant cost of TLB flushing itself is in the barriers. I have some microbenchmarks in the individual patches, and should start looking around for some more interesting workloads. I think most of this series is pretty obviously the right thing to do though. This goes on top of the 3 radix TLB fixes I sent out earlier. Thanks, Nick Nicholas Piggin (7): powerpc/64s/radix: optimize TLB range flush barriers powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions powerpc/64s/radix: Optimize flush_tlb_range powerpc/64s/radix: Introduce local single page ceiling for TLB range flush powerpc/64s/radix: Improve TLB flushing for page table freeing powerpc/64s/radix: reset mm_cpumask for single thread process when possible powerpc/64s/radix: Only flush local TLB for spurious fault flushes .../powerpc/include/asm/book3s/64/tlbflush-radix.h | 5 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 11 + arch/powerpc/include/asm/mmu_context.h | 19 ++ arch/powerpc/mm/pgtable-book3s64.c | 5 +- arch/powerpc/mm/pgtable.c | 2 +- arch/powerpc/mm/tlb-radix.c| 363 - 6 files changed, 325 insertions(+), 80 deletions(-) -- 2.15.0.rc2
[PATCH v2 2/4] powerpc: Force reload for recheckpoint during tm {fp, vec, vsx} unavailable exception
Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. tm_reclaim() has optimisations to not always save the FP/Altivec registers to the checkpointed save area. This was originally done because the caller might have information that the checkpointed registers aren't valid due to lazy save and restore. We've also been a little vague as to how tm_reclaim() leaves the FP/Altivec state since it doesn't necessarily always save it to the thread struct. This has lead to an (incorrect) assumption that it leaves the checkpointed state on the CPU. tm_recheckpoint() has similar optimisations in reverse. It may not always reload the checkpointed FP/Altivec registers from the thread struct before the trecheckpoint. It is therefore quite unclear where it expects to get the state from. This didn't help with the assumption made about tm_reclaim(). This patch is a minimal fix for ease of backporting. A more correct fix which removes the msr parameter to tm_reclaim() and tm_recheckpoint() altogether has been upstreamed to apply on top of this patch. Fixes: dc3106690b20 ("powerpc: tm: Always use fp_state and vr_state to store live registers") Signed-off-by: Cyril Bur --- V2: Add this patch for ease of backporting the same fix as the next patch. 
arch/powerpc/kernel/process.c | 4 ++-- arch/powerpc/kernel/traps.c | 22 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ebb5b58a4138..cfa75e99dcfb 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -866,6 +866,8 @@ static void tm_reclaim_thread(struct thread_struct *thr, if (!MSR_TM_SUSPENDED(mfmsr())) return; + giveup_all(container_of(thr, struct task_struct, thread)); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -885,8 +887,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - giveup_all(container_of(thr, struct task_struct, thread)); - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index ef6a45969812..a7d42c89a257 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1471,6 +1471,12 @@ void facility_unavailable_exception(struct pt_regs *regs) void fp_unavailable_tm(struct pt_regs *regs) { + /* +* Save the MSR now because tm_reclaim_current() is likely to +* change it +*/ + unsigned long orig_msr = regs->msr; + /* Note: This does not handle any kind of FP laziness. */ TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n", @@ -1495,10 +1501,10 @@ void fp_unavailable_tm(struct pt_regs *regs) * If VMX is in use, the VRs now hold checkpointed values, * so we don't want to load the VRs from the thread_struct. 
*/ - tm_recheckpoint(¤t->thread, MSR_FP); + tm_recheckpoint(¤t->thread, orig_msr | MSR_FP); /* If VMX is in use, get the transactional values back */ - if (regs->msr & MSR_VEC) { + if (orig_msr & MSR_VEC) { msr_check_and_set(MSR_VEC); load_vr_state(¤t->thread.vr_state); /* At this point all the VSX state is loaded, so enable it */ @@ -1508,6 +1514,12 @@ void fp_unavailable_tm(struct pt_regs *regs) void altivec_unavailable_tm(struct pt_regs *regs) { + /* +* Save the MSR now because tm_reclaim_current() is likely to +* change it +*/ + unsigned long orig_msr = regs->msr; + /* See the comments in fp_unavailable_tm(). This function operates * the same way. */ @@ -1517,10 +1529,10 @@ void altivec_unavailable_tm(struct pt_regs *regs) regs->nip, regs->msr); tm_reclaim_current(TM_CAUSE_FAC_UNAV); current->thread.load_vec = 1; - tm_recheckpoint(¤t->thread, MSR_VEC); + tm_recheckpoint(¤t->thread, orig_msr | MSR_VEC); current->thread.used_vr = 1; - if (regs->msr & MSR_FP) { + if (orig_msr & MSR_FP) { msr_check_and_set(MSR_FP); load_fp_state(¤t->thread.fp_state); regs->msr |= MSR_VSX; @@ -1559,7 +1571,7 @@ void vsx_unavailable_tm(
[PATCH v2 4/4] powerpc: Remove facility loadups on transactional {fp, vec, vsx} unavailable
After handling a transactional FP, Altivec or VSX unavailable exception. The return to userspace code will detect that the TIF_RESTORE_TM bit is set and call restore_tm_state(). restore_tm_state() will call restore_math() to ensure that the correct facilities are loaded. This means that all the loadup code in {fp,altivec,vsx}_unavailable_tm() is doing pointless work and can simply be removed. Signed-off-by: Cyril Bur --- V2: Obvious cleanup which should have been in v1 arch/powerpc/kernel/traps.c | 30 -- 1 file changed, 30 deletions(-) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 4a7bc64352fd..3181e85ef17c 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1471,12 +1471,6 @@ void facility_unavailable_exception(struct pt_regs *regs) void fp_unavailable_tm(struct pt_regs *regs) { - /* -* Save the MSR now because tm_reclaim_current() is likely to -* change it -*/ - unsigned long orig_msr = regs->msr; - /* Note: This does not handle any kind of FP laziness. */ TM_DEBUG("FP Unavailable trap whilst transactional at 0x%lx, MSR=%lx\n", @@ -1502,24 +1496,10 @@ void fp_unavailable_tm(struct pt_regs *regs) * so we don't want to load the VRs from the thread_struct. */ tm_recheckpoint(¤t->thread); - - /* If VMX is in use, get the transactional values back */ - if (orig_msr & MSR_VEC) { - msr_check_and_set(MSR_VEC); - load_vr_state(¤t->thread.vr_state); - /* At this point all the VSX state is loaded, so enable it */ - regs->msr |= MSR_VSX; - } } void altivec_unavailable_tm(struct pt_regs *regs) { - /* -* Save the MSR now because tm_reclaim_current() is likely to -* change it -*/ - unsigned long orig_msr = regs->msr; - /* See the comments in fp_unavailable_tm(). This function operates * the same way. 
*/ @@ -1531,12 +1511,6 @@ void altivec_unavailable_tm(struct pt_regs *regs) current->thread.load_vec = 1; tm_recheckpoint(¤t->thread); current->thread.used_vr = 1; - - if (orig_msr & MSR_FP) { - msr_check_and_set(MSR_FP); - load_fp_state(¤t->thread.fp_state); - regs->msr |= MSR_VSX; - } } void vsx_unavailable_tm(struct pt_regs *regs) @@ -1561,10 +1535,6 @@ void vsx_unavailable_tm(struct pt_regs *regs) current->thread.load_fp = 1; tm_recheckpoint(¤t->thread); - - msr_check_and_set(MSR_FP | MSR_VEC); - load_fp_state(¤t->thread.fp_state); - load_vr_state(¤t->thread.vr_state); } #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ -- 2.14.3
[PATCH v2 3/4] powerpc: Always save/restore checkpointed regs during treclaim/trecheckpoint
Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. tm_reclaim() has optimisations to not always save the FP/Altivec registers to the checkpointed save area. This was originally done because the caller might have information that the checkpointed registers aren't valid due to lazy save and restore. We've also been a little vague as to how tm_reclaim() leaves the FP/Altivec state since it doesn't necessarily always save it to the thread struct. This has led to an (incorrect) assumption that it leaves the checkpointed state on the CPU. tm_recheckpoint() has similar optimisations in reverse. It may not always reload the checkpointed FP/Altivec registers from the thread struct before the trecheckpoint. It is therefore quite unclear where it expects to get the state from. This didn't help with the assumption made about tm_reclaim(). These optimisations sit in what is by definition a slow path. If a process has to go through a reclaim/recheckpoint then its transaction will be doomed on returning to userspace. This means that the process will be unable to complete its transaction and be forced to its failure handler. This is already an out of line case for userspace. Furthermore, the cost of copying 64 times 128 bits from registers isn't very long[0] (at all) on modern processors. As such it appears these optimisations have only served to increase code complexity and are unlikely to have had a measurable performance impact. 
Our transactional memory handling has been riddled with bugs. A cause of this has been difficulty in following the code flow, code complexity has not been our friend here. It makes sense to remove these optimisations in favour of a (hopefully) more stable implementation. This patch does mean that some times the assembly will needlessly save 'junk' registers which will subsequently get overwritten with the correct value by the C code which calls the assembly function. This small inefficiency is far outweighed by the reduction in complexity for general TM code, context switching paths, and transactional facility unavailable exception handler. 0: I tried to measure it once for other work and found that it was hiding in the noise of everything else I was working with. I find it exceedingly likely this will be the case here. Signed-off-by: Cyril Bur --- V2: Unchanged arch/powerpc/include/asm/tm.h | 5 ++-- arch/powerpc/kernel/process.c | 22 ++- arch/powerpc/kernel/signal_32.c | 2 +- arch/powerpc/kernel/signal_64.c | 2 +- arch/powerpc/kernel/tm.S| 59 - arch/powerpc/kernel/traps.c | 26 +- 6 files changed, 35 insertions(+), 81 deletions(-) diff --git a/arch/powerpc/include/asm/tm.h b/arch/powerpc/include/asm/tm.h index 82e06ca3a49b..33d965911bec 100644 --- a/arch/powerpc/include/asm/tm.h +++ b/arch/powerpc/include/asm/tm.h @@ -11,10 +11,9 @@ extern void tm_enable(void); extern void tm_reclaim(struct thread_struct *thread, - unsigned long orig_msr, uint8_t cause); + uint8_t cause); extern void tm_reclaim_current(uint8_t cause); -extern void tm_recheckpoint(struct thread_struct *thread, - unsigned long orig_msr); +extern void tm_recheckpoint(struct thread_struct *thread); extern void tm_abort(uint8_t cause); extern void tm_save_sprs(struct thread_struct *thread); extern void tm_restore_sprs(struct thread_struct *thread); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index cfa75e99dcfb..4b322ede6420 100644 --- a/arch/powerpc/kernel/process.c +++ 
b/arch/powerpc/kernel/process.c @@ -868,6 +868,8 @@ static void tm_reclaim_thread(struct thread_struct *thr, giveup_all(container_of(thr, struct task_struct, thread)); + tm_reclaim(thr, cause); + /* * If we are in a transaction and FP is off then we can't have * used FP inside that transaction. Hence the checkpointed @@ -886,8 +888,6 @@ static void tm_reclaim_thread(struct thread_struct *thr, if ((thr->ckpt_regs.msr & MSR_VEC) == 0) memcpy(&thr->ckvr_state, &thr->vr_state, sizeof(struct thread_vr_state)); - - tm_reclaim(thr, thr->ckpt_regs.msr, cause); } void tm_reclaim_current(uint8_t cause) @@ -936,11 +936,9 @@ static inline void tm_reclaim_task(struct task_struct *tsk) tm_save_sprs(thr); } -extern void __tm_recheckpoint(struct thread_struct *thread, -
[PATCH v2 1/4] powerpc: Don't enable FP/Altivec if not checkpointed
Lazy save and restore of FP/Altivec means that a userspace process can be sent to userspace with FP or Altivec disabled and loaded only as required (by way of an FP/Altivec unavailable exception). Transactional Memory complicates this situation as a transaction could be started without FP/Altivec being loaded up. This causes the hardware to checkpoint incorrect registers. Handling FP/Altivec unavailable exceptions while a thread is transactional requires a reclaim and recheckpoint to ensure the CPU has correct state for both sets of registers. Lazy save and restore of FP/Altivec cannot be done if a process is transactional. If a facility was enabled it must remain enabled whenever a thread is transactional. Commit dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware transactional memory in use") ensures that the facilities are always enabled if a thread is transactional. A bug in the introduced code may cause it to inadvertently enable a facility that was (and should remain) disabled. The problem with this extraneous enablement is that the registers for the erroneously enabled facility have not been correctly recheckpointed - the recheckpointing code assumed the facility would remain disabled. Further compounding the issue, the transactional {fp,altivec,vsx} unavailable code has been incorrectly using the MSR to enable facilities. The presence of the {FP,VEC,VSX} bit in the regs->msr simply means if the registers are live on the CPU, not if the kernel should load them before returning to userspace. This has worked due to the bug mentioned above. This causes transactional threads which return to their failure handler to observe incorrect checkpointed registers. Perhaps an example will help illustrate the problem: A userspace process is running and uses both FP and Altivec registers. This process then continues to run for some time without touching either sets of registers. The kernel subsequently disables the facilities as part of lazy save and restore. 
The userspace process then performs a tbegin and the CPU checkpoints 'junk' FP and Altivec registers. The process then performs a floating point instruction triggering a fp unavailable exception in the kernel. The kernel then loads the FP registers - and only the FP registers. Since the thread is transactional it must perform a reclaim and recheckpoint to ensure both the checkpointed registers and the transactional registers are correct. It then (correctly) enables MSR[FP] for the process. Later (on exception exit) the kernel also (inadvertently) enables MSR[VEC]. The process is then returned to userspace. Since the act of loading the FP registers doomed the transaction we know the CPU will fail the transaction, restore its checkpointed registers, and return the process to its failure handler. The problem is that we're now running with Altivec enabled and the 'junk' checkpointed registers are restored. The kernel had only recheckpointed FP. This patch solves this by only activating FP/Altivec if userspace was using them when it entered the kernel and not simply if the process is transactional. 
Fixes: dc16b553c949 ("powerpc: Always restore FPU/VEC/VSX if hardware transactional memory in use") Signed-off-by: Cyril Bur --- V2: Rather than incorrectly using the MSR to enable {FP,VEC,VSX} use the load_fp and load_vec booleans to help restore_math() make the correct decision arch/powerpc/kernel/process.c | 17 +++-- arch/powerpc/kernel/traps.c | 8 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a0c74bbf3454..ebb5b58a4138 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -230,9 +230,15 @@ void enable_kernel_fp(void) } EXPORT_SYMBOL(enable_kernel_fp); +static bool tm_active_with_fp(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_FP); +} + static int restore_fp(struct task_struct *tsk) { - if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { + if (tsk->thread.load_fp || tm_active_with_fp(tsk)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; return 1; @@ -311,10 +317,17 @@ void flush_altivec_to_thread(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(flush_altivec_to_thread); +static bool tm_active_with_altivec(struct task_struct *tsk) +{ + return msr_tm_active(tsk->thread.regs->msr) && + (tsk->thread.ckpt_regs.msr & MSR_VEC); +} + + static int restore_altivec(struct task_struct *tsk) { if (cpu_has_feature(CPU_FTR_ALTIVEC) && - (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) { + (tsk->thread.load_vec || tm_active_with_altivec(tsk))) { load_vr_state(&tsk->thread.vr_state); tsk->thread.used_vr = 1; tsk->thread.load
[PATCH kernel] powerpc/powernv/ioda: Relax max DMA window size check
DMA windows can only have a size of power of two on IODA2 hardware and using memory_hotplug_max() to determine the upper limit won't work correctly if it returns a non-power-of-two value. This relaxes the check by rounding up the value returned by memory_hotplug_max(). It is expected to impact DPDK on machines with non-power-of-two RAM size, mostly. KVM guests are less likely to be affected as usually guests get less than half of the host's RAM. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 269f119e4b3c..4c62162da181 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2769,7 +2769,8 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; - if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size)) + if ((window_size > roundup_pow_of_two(memory_hotplug_max())) || + !is_power_of_2(window_size)) return -EINVAL; /* Adjust direct table size from window_size and levels */ -- 2.11.0
RE: [PATCH v10 0/4] this patchset is to remove PPCisms for QEIC
Hi all, Could anybody review this patchset and take action on them? Thank you! Best Regards Qiang Zhao > > -Original Message- > > From: Zhao Qiang [mailto:qiang.z...@nxp.com] > > Sent: Monday, August 07, 2017 11:07 AM > > To: t...@linutronix.de > > Cc: o...@buserror.net; Xiaobo Xie ; linux- > > ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; Qiang Zhao > > > > Subject: [PATCH v10 0/4] this patchset is to remove PPCisms for QEIC > > > > QEIC is supported more than just powerpc boards, so remove PPCisms. > > > > changelog: > > Changes for v8: > > - use IRQCHIP_DECLARE() instead of subsys_initcall in qeic driver > > - remove include/soc/fsl/qe/qe_ic.h > > Changes for v9: > > - rebase > > - fix the compile issue when apply the second patch, in fact, there > > was no compile issue > > when apply all the patches of this patchset > > Changes for v10: > > - simplify codes, remove duplicated codes > > > > Zhao Qiang (4): > > irqchip/qeic: move qeic driver from drivers/soc/fsl/qe > > Changes for v2: > > - modify the subject and commit msg > > Changes for v3: > > - merge .h file to .c, rename it with irq-qeic.c > > Changes for v4: > > - modify comments > > Changes for v5: > > - disable rename detection > > Changes for v6: > > - rebase > > Changes for v7: > > - na > > > > irqchip/qeic: merge qeic init code from platforms to a common function > > Changes for v2: > > - modify subject and commit msg > > - add check for qeic by type > > Changes for v3: > > - na > > Changes for v4: > > - na > > Changes for v5: > > - na > > Changes for v6: > > - rebase > > Changes for v7: > > - na > > Changes for v8: > > - use IRQCHIP_DECLARE() instead of subsys_initcall > > > > irqchip/qeic: merge qeic_of_init into qe_ic_init > > Changes for v2: > > - modify subject and commit msg > > - return 0 and add put node when return in qe_ic_init > > Changes for v3: > > - na > > Changes for v4: > > - na > > Changes for v5: > > - na > > Changes for v6: > > - rebase > > Changes for v7: > > - na > > > > 
irqchip/qeic: remove PPCisms for QEIC > > Changes for v6: > > - new added > > Changes for v7: > > - fix warning > > Changes for v8: > > - remove include/soc/fsl/qe/qe_ic.h > > > > Zhao Qiang (4): > > irqchip/qeic: move qeic driver from drivers/soc/fsl/qe > > irqchip/qeic: merge qeic init code from platforms to a common function > > irqchip/qeic: merge qeic_of_init into qe_ic_init > > irqchip/qeic: remove PPCisms for QEIC > > > > MAINTAINERS| 6 + > > arch/powerpc/platforms/83xx/km83xx.c | 1 - > > arch/powerpc/platforms/83xx/misc.c | 16 - > > arch/powerpc/platforms/83xx/mpc832x_mds.c | 1 - > > arch/powerpc/platforms/83xx/mpc832x_rdb.c | 1 - > > arch/powerpc/platforms/83xx/mpc836x_mds.c | 1 - > > arch/powerpc/platforms/83xx/mpc836x_rdk.c | 1 - > > arch/powerpc/platforms/85xx/corenet_generic.c | 10 - > > arch/powerpc/platforms/85xx/mpc85xx_mds.c | 15 - > > arch/powerpc/platforms/85xx/mpc85xx_rdb.c | 17 - > > arch/powerpc/platforms/85xx/twr_p102x.c| 15 - > > drivers/irqchip/Makefile | 1 + > > drivers/{soc/fsl/qe/qe_ic.c => irqchip/irq-qeic.c} | 358 > > - > > drivers/soc/fsl/qe/Makefile| 2 +- > > drivers/soc/fsl/qe/qe_ic.h | 103 -- > > include/soc/fsl/qe/qe_ic.h | 139 > > 16 files changed, 218 insertions(+), 469 deletions(-) rename > > drivers/{soc/fsl/qe/qe_ic.c => irqchip/irq-qeic.c} (58%) delete mode > > 100644 drivers/soc/fsl/qe/qe_ic.h delete mode 100644 > > include/soc/fsl/qe/qe_ic.h > > > > -- > > 2.1.0.27.g96db324
Re: [rfc] powerpc/npu: Cleanup MMIO ATSD flushing
On Tue, Oct 31, 2017 at 3:03 AM, Aneesh Kumar K.V wrote: > > > On 10/30/2017 06:08 PM, Balbir Singh wrote: >> >> + >> +static void pnv_npu2_invalidate_helper(struct npu_context *npu_context, >> + struct mm_struct *mm, unsigned long start, >> + unsigned long end, bool flush) >> +{ >> + unsigned long address; >> + bool is_thp; >> + unsigned int hshift, shift; >> + >> + address = start; >> + do { >> + local_irq_disable(); >> + find_linux_pte(mm->pgd, address, &is_thp, &hshift); >> + if (!is_thp) >> + shift = PAGE_SHIFT; >> + else >> + shift = hshift; > > > Is that correct? if is_thp is 0 can we derive shift from hshift? IIUC we set > hshift only > if it is a hugepage. OK.. I assumed that it'll be set. So I've got to check for if (is_thp) shift = mmu_psize_defs[MMU_PAGE_2M].shift else if (hshift) shift = hshift else shift = PAGE_SHIFT > > >> + mmio_invalidate(npu_context, address > 0, address, flush, >> + shift); >> + local_irq_enable(); >> + address += (1ull << shift); >> + } while (address < end); >> } >> Thanks for the review! Balbir Singh.
Re: [PATCH v4 00/10] Allow opal-async waiters to get interrupted
On Mon, 2017-10-30 at 10:15 +0100, Boris Brezillon wrote: > On Tue, 10 Oct 2017 14:32:52 +1100 > Cyril Bur wrote: > > > V4: Rework and rethink. > > > > To recap: > > Userspace MTD read()s/write()s and erases to powernv_flash become > > calls into the OPAL firmware which subsequently handles flash access. > > Because the read()s, write()s or erases can be large (bounded of > > course my the size of flash) OPAL may take some time to service the > > request, this causes the powernv_flash driver to sit in a wait_event() > > for potentially minutes. This causes two problems, firstly, tools > > appear to hang for the entire time as they cannot be interrupted by > > signals and secondly, this can trigger hung task warnings. The correct > > solution is to use wait_event_interruptible() which my rework (as part > > of this series) of the opal-async infrastructure provides. > > > > The final patch in this series achieves this. It should eliminate both > > hung tasks and threads locking up. > > > > Included in this series are other simpler fixes for powernv_flash: > > > > Don't always return EIO on error. OPAL does mutual exclusion on the > > flash and also knows when the service processor takes control of the > > flash, in both of these cases it will return OPAL_BUSY, translating > > this to EIO is misleading to userspace. > > > > Handle receiving OPAL_SUCCESS when it expects OPAL_ASYNC_COMPLETION > > and don't treat it as an error. Unfortunately there are too many drivers > > out there with the incorrect behaviour so this means OPAL can never > > return anything but OPAL_ASYNC_COMPLETION, this shouldn't prevent the > > code from being correct. > > > > Don't return ERESTARTSYS if token acquisition is interrupted as > > powernv_flash can't be sure it hasn't already performed some work, let > > userspace deal with the problem. > > > > Change the incorrect use of BUG_ON() to WARN_ON() in powernv_flash. 
> > > > Not for powernv_flash, a fix from Stewart Smith which fits into this > > series as it relies on my improvements to the opal-async > > infrastructure. > > > > V3: export opal_error_code() so that powernv_flash can be built=m > > > > Hello, > > > > Version one of this series ignored that OPAL may continue to use > > buffers passed to it after Linux kfree()s the buffer. This version > > addresses this, not in a particularly nice way - future work could > > make this better. This version also includes a few cleanups and fixups > > to powernv_flash driver one along the course of this work that I > > thought I would just send. > > > > The problem we're trying to solve here is that currently all users of > > the opal-async calls must use wait_event(), this may be undesirable > > when there is a userspace process behind the request for the opal > > call, if OPAL takes too long to complete the call then hung task > > warnings will appear. > > > > In order to solve the problem callers should use > > wait_event_interruptible(), due to the interruptible nature of this > > call the opal-async infrastructure needs to track extra state > > associated with each async token, this is prepared for in patch 6/10. > > > > While I was working on the opal-async infrastructure improvements > > Stewart fixed another problem and he relies on the corrected behaviour > > of opal-async so I've sent it here. > > > > Hello MTD folk, traditionally Michael Ellerman takes powernv_flash > > driver patches through the powerpc tree, as always your feedback is > > very welcome. > > Just gave my acks on patches 1 to 4 and patch 10 (with minor comments > on patch 3 and 10). Feel free to take the patches directly through the > powerpc tree. > Hi Boris, thanks very much for the acks. 
All good points - I'll fix that up in a v2 Thanks again, Cyril > > > > Thanks, > > > > Cyril > > > > Cyril Bur (9): > > mtd: powernv_flash: Use WARN_ON_ONCE() rather than BUG_ON() > > mtd: powernv_flash: Don't treat OPAL_SUCCESS as an error > > mtd: powernv_flash: Remove pointless goto in driver init > > mtd: powernv_flash: Don't return -ERESTARTSYS on interrupted token > > acquisition > > powerpc/opal: Make __opal_async_{get,release}_token() static > > powerpc/opal: Rework the opal-async interface > > powerpc/opal: Add opal_async_wait_response_interruptible() to > > opal-async > > powerpc/powernv: Add OPAL_BUSY to opal_error_code() > > mtd: powernv_flash: Use opal_async_wait_response_interruptible() > > > > Stewart Smith (1): > > powernv/opal-sensor: remove not needed lock > > > > arch/powerpc/include/asm/opal.h | 4 +- > > arch/powerpc/platforms/powernv/opal-async.c | 183 > > +++ > > arch/powerpc/platforms/powernv/opal-sensor.c | 17 +-- > > arch/powerpc/platforms/powernv/opal.c| 2 + > > drivers/mtd/devices/powernv_flash.c | 83 +++- > > 5 files changed, 194 insertions(+), 95 deletions(-) > > > >
Re: [rfc] powerpc/npu: Cleanup MMIO ATSD flushing
On 10/30/2017 06:08 PM, Balbir Singh wrote: + +static void pnv_npu2_invalidate_helper(struct npu_context *npu_context, + struct mm_struct *mm, unsigned long start, + unsigned long end, bool flush) +{ + unsigned long address; + bool is_thp; + unsigned int hshift, shift; + + address = start; + do { + local_irq_disable(); + find_linux_pte(mm->pgd, address, &is_thp, &hshift); + if (!is_thp) + shift = PAGE_SHIFT; + else + shift = hshift; Is that correct? if is_thp is 0 can we derive shift from hshift? IIUC we set hshift only if it is a hugepage. + mmio_invalidate(npu_context, address > 0, address, flush, + shift); + local_irq_enable(); + address += (1ull << shift); + } while (address < end); } -aneesh
[PATCH 2/2] powerpc/kprobes: Dereference function pointers only if the address does not belong to kernel text
This makes the changes introduced in commit 83e840c770f2c5 ("powerpc64/elfv1: Only dereference function descriptor for non-text symbols") to be specific to the kprobe subsystem. We previously changed ppc_function_entry() to always check the provided address to confirm if it needed to be dereferenced. This is actually only an issue for kprobe blacklisted asm labels (through use of _ASM_NOKPROBE_SYMBOL) and can cause other issues with ftrace. Also, the additional checks are not really necessary for our other uses. As such, move this check to the kprobes subsystem. Signed-off-by: Naveen N. Rao --- arch/powerpc/kernel/kprobes.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index bbb4795660f8..ca5d5a081e75 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -598,7 +598,12 @@ NOKPROBE_SYMBOL(kprobe_fault_handler); unsigned long arch_deref_entry_point(void *entry) { - return ppc_global_function_entry(entry); +#ifdef PPC64_ELF_ABI_v1 + if (!kernel_text_address((unsigned long)entry)) + return ppc_global_function_entry(entry); + else +#endif + return (unsigned long)entry; } NOKPROBE_SYMBOL(arch_deref_entry_point); -- 2.14.2
[PATCH 1/2] Revert "powerpc64/elfv1: Only dereference function descriptor for non-text symbols"
This reverts commit 83e840c770f2c5 ("powerpc64/elfv1: Only dereference function descriptor for non-text symbols"). Chandan reported that on newer kernels, trying to enable function_graph tracer on ppc64 (BE) locks up the system with the following trace: Unable to handle kernel paging request for data at address 0x60002fa30010 Faulting instruction address: 0xc01f1300 Thread overran stack, or stack corrupted Oops: Kernel access of bad area, sig: 11 [#1] BE SMP NR_CPUS=2048 DEBUG_PAGEALLOC NUMA pSeries Modules linked in: CPU: 1 PID: 6586 Comm: bash Not tainted 4.14.0-rc3-00162-g6e51f1f-dirty #20 task: c00625c07200 task.stack: c00625c07310 NIP: c01f1300 LR: c0121cac CTR: c0061af8 REGS: c00625c088c0 TRAP: 0380 Not tainted (4.14.0-rc3-00162-g6e51f1f-dirty) MSR: 80001032 CR: 28002848 XER: CFAR: c01f1320 SOFTE: 0 GPR00: c0121cac c00625c08b40 c1439700 c125c5a8 GPR04: c13a0040 c135cbf0 GPR08: e92d0250812a1a30 e92d025081291a30 60002fa3 c00625c08c50 GPR12: 28002842 cfd40580 00010bacae64 GPR16: 00010bac96c8 GPR20: 00010bac0734 000a GPR24: c00636cd6610 c00113258b80 GPR28: c0061b10 c0061960 c125c5d8 c13a0040 NIP [c01f1300] .__is_insn_slot_addr+0x30/0x90 LR [c0121cac] .kernel_text_address+0x18c/0x1c0 Call Trace: [c00625c08b40] [c01bd040] .is_module_text_address+0x20/0x40 (unreliable) [c00625c08bc0] [c0121cac] .kernel_text_address+0x18c/0x1c0 [c00625c08c50] [c0061960] .prepare_ftrace_return+0x50/0x130 [c00625c08cf0] [c0061b10] .ftrace_graph_caller+0x14/0x34 [c00625c08d60] [c0121b40] .kernel_text_address+0x20/0x1c0 [c00625c08df0] [c0061960] .prepare_ftrace_return+0x50/0x130 ... 
[c00625c0ab30] [c0061960] .prepare_ftrace_return+0x50/0x130 [c00625c0abd0] [c0061b10] .ftrace_graph_caller+0x14/0x34 [c00625c0ac40] [c0121b40] .kernel_text_address+0x20/0x1c0 [c00625c0acd0] [c0061960] .prepare_ftrace_return+0x50/0x130 [c00625c0ad70] [c0061b10] .ftrace_graph_caller+0x14/0x34 [c00625c0ade0] [c0121b40] .kernel_text_address+0x20/0x1c0 Instruction dump: 7c0802a6 fbc1fff0 fbe1fff8 f8010010 f821ff81 7c7e1b78 7c9f2378 6000 6000 e95e0031 7faaf040 419e0028 7fa9f840 3d090001 7ebf4040 ---[ end trace 0870d7d56d703ff4 ]--- This is because ftrace is using ppc_function_entry() for obtaining the address of return_to_handler() in prepare_ftrace_return(). The call to kernel_text_address() itself gets traced and we end up in a recursive loop. Reported-by: Chandan Rajendra Signed-off-by: Naveen N. Rao --- arch/powerpc/include/asm/code-patching.h | 10 +- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 5482928eea1b..abef812de7f8 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -83,16 +83,8 @@ static inline unsigned long ppc_function_entry(void *func) * On PPC64 ABIv1 the function pointer actually points to the * function's descriptor. The first entry in the descriptor is the * address of the function text. -* -* However, we may also receive pointer to an assembly symbol. To -* detect that, we first check if the function pointer we receive -* already points to kernel/module text and we only dereference it -* if it doesn't. */ - if (kernel_text_address((unsigned long)func)) - return (unsigned long)func; - else - return ((func_descr_t *)func)->entry; + return ((func_descr_t *)func)->entry; #else return (unsigned long)func; #endif -- 2.14.2
[PATCH 0/2] Fix function_graph tracer for ppc64 BE
Chandan reported that trying to enable function_graph tracer on ppc64 BE now locks up the system. This is due to prepare_ftrace_return() using ppc_function_entry() for resolving return_to_handler(), which in turn invokes kernel_text_address(), which also gets traced resulting in a loop. We added a check for kernel_text_address() in ppc_function_entry() to guard all users in case we were called with a function, rather than a function descriptor. In hindsight, I feel that this is inefficient since we usually only pass function descriptors to ppc_function_entry() (and ppc_global_function_entry()). So, I am proposing that we revert the previous patch and instead implement the necessary checks in the kprobes subsystem. The other way to fix this is to simply guard the call to kernel_text_address() within [un]pause_graph_tracing(), if you think it's useful to have the check in ppc_function_entry() for all users. - Naveen Naveen N. Rao (2): Revert "powerpc64/elfv1: Only dereference function descriptor for non-text symbols" powerpc/kprobes: Dereference function pointers only if the address does not belong to kernel text arch/powerpc/include/asm/code-patching.h | 10 +- arch/powerpc/kernel/kprobes.c| 7 ++- 2 files changed, 7 insertions(+), 10 deletions(-) -- 2.14.2
Re: [PATCH 00/16] Remove hash page table slot tracking from linux PTE
"Aneesh Kumar K.V" writes: > "Aneesh Kumar K.V" writes: > > >> I looked at the perf data and with the test, we are doing larger number >> of hash faults and then around 10k flush_hash_range. Can the small >> improvement in number be due to the fact that we are not storing slot >> number when doing an insert now?. Also in the flush path we are now not >> using real_pte_t. >> > > With THP disabled I am finding below. > > Without patch > > 35.62% a.out[kernel.vmlinux][k] clear_user_page > 8.54% a.out[kernel.vmlinux][k] __lock_acquire > 3.86% a.out[kernel.vmlinux][k] native_flush_hash_range > 3.38% a.out[kernel.vmlinux][k] save_context_stack > 2.98% a.outa.out [.] main > 2.59% a.out[kernel.vmlinux][k] lock_acquire > 2.29% a.out[kernel.vmlinux][k] mark_lock > 2.23% a.out[kernel.vmlinux][k] native_hpte_insert > 1.87% a.out[kernel.vmlinux][k] get_mem_cgroup_from_mm > 1.71% a.out[kernel.vmlinux][k] > rcu_lockdep_current_cpu_online > 1.68% a.out[kernel.vmlinux][k] lock_release > 1.47% a.out[kernel.vmlinux][k] __handle_mm_fault > 1.41% a.out[kernel.vmlinux][k] validate_sp > > > With patch > 35.40% a.out[kernel.vmlinux][k] clear_user_page > 8.82% a.out[kernel.vmlinux][k] __lock_acquire > 3.66% a.outa.out [.] main > 3.49% a.out[kernel.vmlinux][k] save_context_stack > 2.77% a.out[kernel.vmlinux][k] lock_acquire > 2.45% a.out[kernel.vmlinux][k] mark_lock > 1.80% a.out[kernel.vmlinux][k] get_mem_cgroup_from_mm > 1.80% a.out[kernel.vmlinux][k] native_hpte_insert > 1.79% a.out[kernel.vmlinux][k] > rcu_lockdep_current_cpu_online > 1.78% a.out[kernel.vmlinux][k] lock_release > 1.73% a.out[kernel.vmlinux][k] native_flush_hash_range > 1.53% a.out[kernel.vmlinux][k] __handle_mm_fault > > That is we are now spending less time in native_flush_hash_range. 
> > -aneesh One possible explanation is, with slot tracking we do slot += hidx & _PTEIDX_GROUP_IX; hptep = htab_address + slot; want_v = hpte_encode_avpn(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); if (cpu_has_feature(CPU_FTR_ARCH_300)) hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); and without slot tracking we do for (i = 0; i < HPTES_PER_GROUP; i++, hptep++) { /* check locklessly first */ hpte_v = be64_to_cpu(hptep->v); if (cpu_has_feature(CPU_FTR_ARCH_300)) hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) continue; native_lock_hpte(hptep); That is without the patch series, we take the hpte lock always even if the hpte didn't match. Hence in perf annotate we find the lock to be highly contended without patch series. I will change that to compare pte without taking lock and see if that has any impact. -aneesh
Re: [PATCH 00/16] Remove hash page table slot tracking from linux PTE
"Aneesh Kumar K.V" writes: > I looked at the perf data and with the test, we are doing larger number > of hash faults and then around 10k flush_hash_range. Can the small > improvement in number be due to the fact that we are not storing slot > number when doing an insert now?. Also in the flush path we are now not > using real_pte_t. > With THP disabled I am finding below. Without patch 35.62% a.out[kernel.vmlinux][k] clear_user_page 8.54% a.out[kernel.vmlinux][k] __lock_acquire 3.86% a.out[kernel.vmlinux][k] native_flush_hash_range 3.38% a.out[kernel.vmlinux][k] save_context_stack 2.98% a.outa.out [.] main 2.59% a.out[kernel.vmlinux][k] lock_acquire 2.29% a.out[kernel.vmlinux][k] mark_lock 2.23% a.out[kernel.vmlinux][k] native_hpte_insert 1.87% a.out[kernel.vmlinux][k] get_mem_cgroup_from_mm 1.71% a.out[kernel.vmlinux][k] rcu_lockdep_current_cpu_online 1.68% a.out[kernel.vmlinux][k] lock_release 1.47% a.out[kernel.vmlinux][k] __handle_mm_fault 1.41% a.out[kernel.vmlinux][k] validate_sp With patch 35.40% a.out[kernel.vmlinux][k] clear_user_page 8.82% a.out[kernel.vmlinux][k] __lock_acquire 3.66% a.outa.out [.] main 3.49% a.out[kernel.vmlinux][k] save_context_stack 2.77% a.out[kernel.vmlinux][k] lock_acquire 2.45% a.out[kernel.vmlinux][k] mark_lock 1.80% a.out[kernel.vmlinux][k] get_mem_cgroup_from_mm 1.80% a.out[kernel.vmlinux][k] native_hpte_insert 1.79% a.out[kernel.vmlinux][k] rcu_lockdep_current_cpu_online 1.78% a.out[kernel.vmlinux][k] lock_release 1.73% a.out[kernel.vmlinux][k] native_flush_hash_range 1.53% a.out[kernel.vmlinux][k] __handle_mm_fault That is we are now spending less time in native_flush_hash_range. -aneesh
linux-next: manual merge of the powerpc tree with Linus' tree
Hi all, Today's linux-next merge of the powerpc tree got a conflict in: arch/powerpc/kvm/powerpc.c between commit: ac64115a66c1 ("KVM: PPC: Fix oops when checking KVM_CAP_PPC_HTM") from Linus' tree and commit: 2a3d6553cbd7 ("KVM: PPC: Tie KVM_CAP_PPC_HTM to the user-visible TM feature") from the powerpc tree. I fixed it up (see below) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. diff --cc arch/powerpc/kvm/powerpc.c index ee279c7f4802,a3746b98ec11.. --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@@ -644,7 -644,8 +644,8 @@@ int kvm_vm_ioctl_check_extension(struc break; #endif case KVM_CAP_PPC_HTM: - r = cpu_has_feature(CPU_FTR_TM_COMP) && hv_enabled; - r = is_kvmppc_hv_enabled(kvm) && ++ r = hv_enabled && + (cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP); break; default: r = 0; signature.asc Description: PGP signature
[rfc] powerpc/npu: Cleanup MMIO ATSD flushing
While reviewing the code I found that the flush assumes all pages are of mmu_linear_psize, which is not correct. The patch uses find_linux_pte to find the right page size and uses that for launching the ATSD invalidation. A new helper is added to abstract the invalidation from the various notifiers. The patch also cleans up a bit by removing AP size from PID flushes. Signed-off-by: Balbir Singh --- arch/powerpc/platforms/powernv/npu-dma.c | 55 ++-- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 2cb6cbea4b3b..0d58f127b32d 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "powernv.h" #include "pci.h" @@ -459,9 +460,6 @@ static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) /* PRS set to process-scoped */ launch |= PPC_BIT(13); - /* AP */ - launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); - /* PID */ launch |= pid << PPC_BITLSHIFT(38); @@ -473,7 +471,8 @@ static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) } static int mmio_invalidate_va(struct npu *npu, unsigned long va, - unsigned long pid, bool flush) + unsigned long pid, bool flush, + unsigned int shift) { unsigned long launch; @@ -484,9 +483,8 @@ static int mmio_invalidate_va(struct npu *npu, unsigned long va, launch |= PPC_BIT(13); /* AP */ - launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + launch |= (u64) mmu_get_ap(shift) << PPC_BITLSHIFT(17); - /* PID */ launch |= pid << PPC_BITLSHIFT(38); /* No flush */ @@ -503,7 +501,8 @@ struct mmio_atsd_reg { }; static void mmio_invalidate_wait( - struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush) + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush, + unsigned int shift) { struct npu *npu; int i, reg; @@ -536,7 +535,8 @@ static void 
mmio_invalidate_wait( * the value of va. */ static void mmio_invalidate(struct npu_context *npu_context, int va, - unsigned long address, bool flush) + unsigned long address, bool flush, + unsigned int shift) { int i, j; struct npu *npu; @@ -569,7 +569,7 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, if (va) mmio_atsd_reg[i].reg = mmio_invalidate_va(npu, address, pid, - flush); + flush, shift); else mmio_atsd_reg[i].reg = mmio_invalidate_pid(npu, pid, flush); @@ -582,10 +582,33 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, } } - mmio_invalidate_wait(mmio_atsd_reg, flush); + mmio_invalidate_wait(mmio_atsd_reg, flush, shift); if (flush) /* Wait for the flush to complete */ - mmio_invalidate_wait(mmio_atsd_reg, false); + mmio_invalidate_wait(mmio_atsd_reg, false, shift); +} + +static void pnv_npu2_invalidate_helper(struct npu_context *npu_context, + struct mm_struct *mm, unsigned long start, + unsigned long end, bool flush) +{ + unsigned long address; + bool is_thp; + unsigned int hshift, shift; + + address = start; + do { + local_irq_disable(); + find_linux_pte(mm->pgd, address, &is_thp, &hshift); + if (!is_thp) + shift = PAGE_SHIFT; + else + shift = hshift; + mmio_invalidate(npu_context, address > 0, address, flush, + shift); + local_irq_enable(); + address += (1ull << shift); + } while (address < end); } static void pnv_npu2_mn_release(struct mmu_notifier *mn, @@ -601,7 +624,7 @@ static void pnv_npu2_mn_release(struct mmu_notifier *mn, * There should be no more translation requests for this PID, but we * need to ensure any entries for it are removed from the TLB. */ - mmio_invalidate(npu_context, 0, 0, true); + pnv_npu2_invalidate_helper(npu_context, mm, 0, PAGE_SIZE, true); } static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, @@ -611,7 +634,7 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, { struct npu_context *npu_context = mn_to_npu_context(mn); - m
Re: [PATCH] powerpc/kernel/sysfs: Export ldbar spr to sysfs
On Friday 27 October 2017 03:04 PM, Anju T Sudhakar wrote: Add ldbar spr to sysfs. The spr will hold thread level In-Memory Collection (IMC) counter configuration data. Signed-off-by: Anju T Sudhakar Acked-by: Madhavan Srinivasan --- arch/powerpc/kernel/sysfs.c | 8 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 4437c70c7c2b..8efcaece4796 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -466,6 +466,7 @@ static ssize_t __used \ #ifdef HAS_PPC_PMC_CLASSIC SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0); SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1); +SYSFS_PMCSETUP(ldbar, SPRN_LDBAR); My bad. Missed to mention this. For ldbar spr, use SYSFS_SPRSETUP macro instead. Maddy SYSFS_PMCSETUP(pmc1, SPRN_PMC1); SYSFS_PMCSETUP(pmc2, SPRN_PMC2); SYSFS_PMCSETUP(pmc3, SPRN_PMC3); @@ -492,6 +493,7 @@ SYSFS_SPRSETUP(pir, SPRN_PIR); Lets be conservative and default to pseries. */ static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra); +static DEVICE_ATTR(ldbar, 0600, show_ldbar, store_ldbar); static DEVICE_ATTR(spurr, 0400, show_spurr, NULL); static DEVICE_ATTR(purr, 0400, show_purr, store_purr); static DEVICE_ATTR(pir, 0400, show_pir, NULL); @@ -757,6 +759,9 @@ static int register_cpu_online(unsigned int cpu) device_create_file(s, &pmc_attrs[i]); #ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_ARCH_300)) + device_create_file(s, &dev_attr_ldbar); + if (cpu_has_feature(CPU_FTR_MMCRA)) device_create_file(s, &dev_attr_mmcra); @@ -842,6 +847,9 @@ static int unregister_cpu_online(unsigned int cpu) device_remove_file(s, &pmc_attrs[i]); #ifdef CONFIG_PPC64 + if (cpu_has_feature(CPU_FTR_ARCH_300)) + device_remove_file(s, &dev_attr_ldbar); + if (cpu_has_feature(CPU_FTR_MMCRA)) device_remove_file(s, &dev_attr_mmcra);
Re: [PATCH 00/16] Remove hash page table slot tracking from linux PTE
Paul Mackerras writes: > On Fri, Oct 27, 2017 at 10:57:13AM +0530, Aneesh Kumar K.V wrote: >> >> >> On 10/27/2017 10:04 AM, Paul Mackerras wrote: >> >How do we interpret these numbers? Are they times, or speed? Is >> >larger better or worse? >> >> Sorry for not including the details. They are time in seconds. Test case is >> a modified mmap_bench included in powerpc/selftest. >> >> > >> >Can you give us the mean and standard deviation for each set of 5 >> >please? >> > >> >> powernv without patch >> median= 51.432255 >> stdev = 0.370835 >> >> with patch >> median = 50.739922 >> stdev = 0.06419662 >> >> pseries without patch >> median = 116.617884 >> stdev = 3.04531023 >> >> with patch no hcall >> median = 119.42494 >> stdev = 0.85874552 >> >> with patch and hcall >> median = 117.735808 >> stdev = 2.7624151 > > So on powernv, the patch set *improves* performance by about 1.3% > (almost 2 standard deviations). Do we know why that is? > I looked at the perf data and with the test, we are doing a larger number of hash faults and then around 10k flush_hash_range. Can the small improvement in number be due to the fact that we are not storing the slot number when doing an insert now? Also in the flush path we are now not using real_pte_t. aneesh
[PATCH v5] powerpc/xmon: Support dumping software pagetables
It would be nice to be able to dump page tables in a particular context. eg: dumping vmalloc space: 0:mon> dv 0xd00037f0 pgd @ 0xc17c pgdp @ 0xc17c00d8 = 0xf10b1000 pudp @ 0xc000f10b13f8 = 0xf10d pmdp @ 0xc000f10d1ff8 = 0xf1102000 ptep @ 0xc000f1102780 = 0xc000f1ba018e Maps physical address = 0xf1ba Flags = Accessed Dirty Read Write This patch does not replicate the complex code of dump_pagetable and has no support for bolted linear mapping; that's why I've called it dump virtual page table support. The format of the PTE can be expanded even further to add more useful information about the flags in the PTE if required. Signed-off-by: Balbir Singh [mpe: Bike shed the output format, show the pgdir] Signed-off-by: Michael Ellerman --- Changelog v5 - Remove dependence on CONFIG_HUGETLB_PAGE and make the changes PPC_BOOK3S_64 dependent. arch/powerpc/xmon/xmon.c | 122 +++ 1 file changed, 122 insertions(+) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 83e7faf5b3d3..9d9899c85d7c 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,9 @@ static void byterev(unsigned char *, int); static void memex(void); static int bsesc(void); static void dump(void); +#ifdef CONFIG_PPC_BOOK3S_64 +static void show_pte(unsigned long); +#endif static void prdump(unsigned long, long); static int ppc_inst_dump(unsigned long, long, int); static void dump_log_buf(void); @@ -234,6 +238,7 @@ Commands:\n\ #endif "\ dr dump stream of raw bytes\n\ + dv dump virtual address translation \n\ dt dump the tracing buffers (uses printk)\n\ dtc dump the tracing buffers for current CPU (uses printk)\n\ " @@ -2613,6 +2618,11 @@ dump(void) dump_log_buf(); } else if (c == 'o') { dump_opal_msglog(); + } else if (c == 'v') { +#ifdef CONFIG_PPC_BOOK3S_64 + /* dump virtual to physical translation */ + show_pte(adrs); +#endif } else if (c == 'r') { scanhex(&ndump); if (ndump == 0) @@ -2946,6
+2956,118 @@ static void show_task(struct task_struct *tsk) tsk->comm); } +#ifdef CONFIG_PPC_BOOK3S_64 +void format_pte(void *ptep, unsigned long pte) +{ + printf("ptep @ 0x%016lx = 0x%016lx\n", (unsigned long)ptep, pte); + printf("Maps physical address = 0x%016lx\n", pte & PTE_RPN_MASK); + + printf("Flags = %s%s%s%s%s\n", + (pte & _PAGE_ACCESSED) ? "Accessed " : "", + (pte & _PAGE_DIRTY)? "Dirty " : "", + (pte & _PAGE_READ) ? "Read " : "", + (pte & _PAGE_WRITE)? "Write " : "", + (pte & _PAGE_EXEC) ? "Exec " : ""); +} + +static void show_pte(unsigned long addr) +{ + unsigned long tskv = 0; + struct task_struct *tsk = NULL; + struct mm_struct *mm; + pgd_t *pgdp, *pgdir; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + if (!scanhex(&tskv)) + mm = &init_mm; + else + tsk = (struct task_struct *)tskv; + + if (tsk == NULL) + mm = &init_mm; + else + mm = tsk->active_mm; + + if (setjmp(bus_error_jmp) != 0) { + catch_memory_errors = 0; + printf("*** Error dumping pte for task %p\n", tsk); + return; + } + + catch_memory_errors = 1; + sync(); + + if (mm == &init_mm) { + pgdp = pgd_offset_k(addr); + pgdir = pgd_offset_k(0); + } else { + pgdp = pgd_offset(mm, addr); + pgdir = pgd_offset(mm, 0); + } + + if (pgd_none(*pgdp)) { + printf("no linux page table for address\n"); + return; + } + + printf("pgd @ 0x%016lx\n", pgdir); + + if (pgd_huge(*pgdp)) { + format_pte(pgdp, pgd_val(*pgdp)); + return; + } + printf("pgdp @ 0x%016lx = 0x%016lx\n", pgdp, pgd_val(*pgdp)); + + pudp = pud_offset(pgdp, addr); + + if (pud_none(*pudp)) { + printf("No valid PUD\n"); + return; + } + + if (pud_huge(*pudp)) { + format_pte(pudp, pud_val(*pudp)); + return; + } + + printf("pudp @ 0x%016lx = 0x%016lx\n", pudp, pud_val(*pudp)); + + pmdp = pmd_offset(pudp, addr); + + if (pmd_none(*pmdp)) { + printf("No valid PMD\n"); + return; + } + + if (pmd_huge(*pmdp)) { + format_pte(pmdp, pmd_val(*pmdp)); + return; + } + printf("pmdp @ 0x%016lx = 0x%016lx\n", pmdp, pmd_val(*pmdp)); + + ptep = pte_offset_map(pmdp, 
addr); +
Re: [PATCH] powerpc/kernel/sysfs: Export ldbar spr to sysfs
Hi Anju, Thank you for the patch! Yet something to improve: [auto build test ERROR on powerpc/next] [also build test ERROR on v4.14-rc7 next-20171018] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Anju-T-Sudhakar/powerpc-kernel-sysfs-Export-ldbar-spr-to-sysfs/20171030-155220 base: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next config: powerpc-storcenter_defconfig (attached as .config) compiler: powerpc-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # save the attached .config to linux build tree make.cross ARCH=powerpc All error/warnings (new ones prefixed by >>): >> arch/powerpc/kernel/sysfs.c:419:16: error: 'show_ldbar' defined but not used >> [-Werror=unused-function] static ssize_t show_##NAME(struct device *dev, \ ^ >> arch/powerpc/kernel/sysfs.c:443:2: note: in expansion of macro >> '__SYSFS_SPRSETUP_SHOW_STORE' __SYSFS_SPRSETUP_SHOW_STORE(NAME) ^~~ >> arch/powerpc/kernel/sysfs.c:469:1: note: in expansion of macro >> 'SYSFS_PMCSETUP' SYSFS_PMCSETUP(ldbar, SPRN_LDBAR); ^~ cc1: all warnings being treated as errors vim +/show_ldbar +419 arch/powerpc/kernel/sysfs.c 39a360ef arch/powerpc/kernel/sysfs.c Sam bobroff2014-05-21 417 39a360ef arch/powerpc/kernel/sysfs.c Sam bobroff2014-05-21 418 #define __SYSFS_SPRSETUP_SHOW_STORE(NAME) \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 @419 static ssize_t show_##NAME(struct device *dev, \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 420 struct device_attribute *attr, \ 4a0b2b4d arch/powerpc/kernel/sysfs.c Andi Kleen 2008-07-01 421 char *buf) \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 422 { \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 423 struct cpu *cpu = container_of(dev, struct cpu, dev); \ 9a371934 
arch/powerpc/kernel/sysfs.c Rusty Russell 2009-03-11 424 unsigned long val; \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 425 smp_call_function_single(cpu->dev.id, read_##NAME, &val, 1);\ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 426 return sprintf(buf, "%lx\n", val); \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 427 } \ 3ff6eecc arch/powerpc/kernel/sysfs.c Adrian Bunk2008-01-24 428 static ssize_t __used \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 429 store_##NAME(struct device *dev, struct device_attribute *attr, \ 4a0b2b4d arch/powerpc/kernel/sysfs.c Andi Kleen 2008-07-01 430 const char *buf, size_t count) \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 431 { \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 432 struct cpu *cpu = container_of(dev, struct cpu, dev); \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 433 unsigned long val; \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 434 int ret = sscanf(buf, "%lx", &val); \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 435 if (ret != 1) \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 436 return -EINVAL; \ 8a25a2fd arch/powerpc/kernel/sysfs.c Kay Sievers2011-12-21 437 smp_call_function_single(cpu->dev.id, write_##NAME, &val, 1); \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 438 return count; \ ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 439 } ^1da177e arch/ppc64/kernel/sysfs.c Linus Torvalds 2005-04-16 440 fd7e4296 arch/powerpc/kernel/sysfs.c Madhavan Srinivasan2013-10-03 441 #define SYSFS_PMCSETUP(NAME, ADDRESS) \ 39a360ef arch/powerpc/kernel/sysfs.c Sam bobroff2014-05-21 442 __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ppc_enable_pmcs()) \ 39a360ef arch/powerpc/kernel/sysfs.c Sam bobroff2014-05-21 @443 __SYSFS_SPRSETUP_SHOW_STORE(NAME) fd7e4296 arch/powerpc/kernel/sysfs.c Madhavan Srinivasan2013-10-03 444 #define 
SYSFS_SPRSETUP(NAME, ADDRESS) \ 39a360ef arch/powerpc/kernel/sysfs.c Sam bobroff2014-05-21 445 __SYSFS_SPRSETUP_READ_WRITE(NAME, ADDRESS, ) \ 39a360ef arch/powerpc/kernel/sysfs.c Sam bobroff2014-05-21 446 __SYSFS_SPRSETUP_SHOW_STORE(NAME) 39a360ef arch/po
Re: [PATCH v4 00/10] Allow opal-async waiters to get interrupted
On Tue, 10 Oct 2017 14:32:52 +1100 Cyril Bur wrote: > V4: Rework and rethink. > > To recap: > Userspace MTD read()s/write()s and erases to powernv_flash become > calls into the OPAL firmware which subsequently handles flash access. > Because the read()s, write()s or erases can be large (bounded of > course my the size of flash) OPAL may take some time to service the > request, this causes the powernv_flash driver to sit in a wait_event() > for potentially minutes. This causes two problems, firstly, tools > appear to hang for the entire time as they cannot be interrupted by > signals and secondly, this can trigger hung task warnings. The correct > solution is to use wait_event_interruptible() which my rework (as part > of this series) of the opal-async infrastructure provides. > > The final patch in this series achieves this. It should eliminate both > hung tasks and threads locking up. > > Included in this series are other simpler fixes for powernv_flash: > > Don't always return EIO on error. OPAL does mutual exclusion on the > flash and also knows when the service processor takes control of the > flash, in both of these cases it will return OPAL_BUSY, translating > this to EIO is misleading to userspace. > > Handle receiving OPAL_SUCCESS when it expects OPAL_ASYNC_COMPLETION > and don't treat it as an error. Unfortunately there are too many drivers > out there with the incorrect behaviour so this means OPAL can never > return anything but OPAL_ASYNC_COMPLETION, this shouldn't prevent the > code from being correct. > > Don't return ERESTARTSYS if token acquisition is interrupted as > powernv_flash can't be sure it hasn't already performed some work, let > userspace deal with the problem. > > Change the incorrect use of BUG_ON() to WARN_ON() in powernv_flash. > > Not for powernv_flash, a fix from Stewart Smith which fits into this > series as it relies on my improvements to the opal-async > infrastructure. 
> > V3: export opal_error_code() so that powernv_flash can be built=m > > Hello, > > Version one of this series ignored that OPAL may continue to use > buffers passed to it after Linux kfree()s the buffer. This version > addresses this, not in a particularly nice way - future work could > make this better. This version also includes a few cleanups and fixups > to powernv_flash driver one along the course of this work that I > thought I would just send. > > The problem we're trying to solve here is that currently all users of > the opal-async calls must use wait_event(), this may be undesirable > when there is a userspace process behind the request for the opal > call, if OPAL takes too long to complete the call then hung task > warnings will appear. > > In order to solve the problem callers should use > wait_event_interruptible(), due to the interruptible nature of this > call the opal-async infrastructure needs to track extra state > associated with each async token, this is prepared for in patch 6/10. > > While I was working on the opal-async infrastructure improvements > Stewart fixed another problem and he relies on the corrected behaviour > of opal-async so I've sent it here. > > Hello MTD folk, traditionally Michael Ellerman takes powernv_flash > driver patches through the powerpc tree, as always your feedback is > very welcome. Just gave my acks on patches 1 to 4 and patch 10 (with minor comments on patch 3 and 10). Feel free to take the patches directly through the powerpc tree. 
> > Thanks, > > Cyril > > Cyril Bur (9): > mtd: powernv_flash: Use WARN_ON_ONCE() rather than BUG_ON() > mtd: powernv_flash: Don't treat OPAL_SUCCESS as an error > mtd: powernv_flash: Remove pointless goto in driver init > mtd: powernv_flash: Don't return -ERESTARTSYS on interrupted token > acquisition > powerpc/opal: Make __opal_async_{get,release}_token() static > powerpc/opal: Rework the opal-async interface > powerpc/opal: Add opal_async_wait_response_interruptible() to > opal-async > powerpc/powernv: Add OPAL_BUSY to opal_error_code() > mtd: powernv_flash: Use opal_async_wait_response_interruptible() > > Stewart Smith (1): > powernv/opal-sensor: remove not needed lock > > arch/powerpc/include/asm/opal.h | 4 +- > arch/powerpc/platforms/powernv/opal-async.c | 183 > +++ > arch/powerpc/platforms/powernv/opal-sensor.c | 17 +-- > arch/powerpc/platforms/powernv/opal.c| 2 + > drivers/mtd/devices/powernv_flash.c | 83 +++- > 5 files changed, 194 insertions(+), 95 deletions(-) >
Re: [PATCH v4 10/10] mtd: powernv_flash: Use opal_async_wait_response_interruptible()
On Tue, 10 Oct 2017 14:33:02 +1100 Cyril Bur wrote: > The OPAL calls performed in this driver shouldn't be using > opal_async_wait_response() as this performs a wait_event() which, on > long running OPAL calls could result in hung task warnings. wait_event() > prevents timely signal delivery which is also undesirable. > > This patch also attempts to quieten down the use of dev_err() when > errors haven't actually occurred and also to return better information up > the stack rather than always -EIO. > > Signed-off-by: Cyril Bur Have some minor comments (see below). Once addressed you can add Acked-by: Boris Brezillon > --- > drivers/mtd/devices/powernv_flash.c | 57 > +++-- > 1 file changed, 35 insertions(+), 22 deletions(-) > > diff --git a/drivers/mtd/devices/powernv_flash.c > b/drivers/mtd/devices/powernv_flash.c > index 3343d4f5c4f3..42383dbca5a6 100644 > --- a/drivers/mtd/devices/powernv_flash.c > +++ b/drivers/mtd/devices/powernv_flash.c > @@ -1,7 +1,7 @@ > /* > * OPAL PNOR flash MTD abstraction > * > - * Copyright IBM 2015 > + * Copyright IBM 2015-2017 Not a big deal, but this copyright update is not really related to the change you describe in your commit message. Maybe this should be done in a separate commit. > * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > @@ -89,33 +89,46 @@ static int powernv_flash_async_op(struct mtd_info *mtd, > enum flash_op op, > return -EIO; > } > > - if (rc == OPAL_SUCCESS) > - goto out_success; > + if (rc == OPAL_ASYNC_COMPLETION) { > + rc = opal_async_wait_response_interruptible(token, &msg); > + if (rc) { > + /* > + * If we return the mtd core will free the > + * buffer we've just passed to OPAL but OPAL > + * will continue to read or write from that > + * memory. > + * It may be tempting to ultimately return 0 > + * if we're doing a read or a write since we > + * are going to end up waiting until OPAL is > + * done. 
However, because the MTD core sends > + * us the userspace request in chunks, we need > + * to it know we've been interrupted. ^ 'it to' ? > + */ > + rc = -EINTR; > + if (opal_async_wait_response(token, &msg)) > + dev_err(dev, "opal_async_wait_response() > failed\n"); > + goto out; > + } > + rc = opal_get_async_rc(msg); > + } > > - if (rc != OPAL_ASYNC_COMPLETION) { > + /* > + * OPAL does mutual exclusion on the flash, it will return > + * OPAL_BUSY. > + * During firmware updates by the service processor OPAL may > + * be (temporarily) prevented from accessing the flash, in > + * this case OPAL will also return OPAL_BUSY. > + * Both cases aren't errors exactly but the flash could have > + * changed, userspace should be informed. > + */ > + if (rc != OPAL_SUCCESS && rc != OPAL_BUSY) > dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n", > op, rc); > - rc = -EIO; > - goto out; > - } > > - rc = opal_async_wait_response(token, &msg); > - if (rc) { > - dev_err(dev, "opal async wait failed (rc %d)\n", rc); > - rc = -EIO; > - goto out; > - } > - > - rc = opal_get_async_rc(msg); > -out_success: > - if (rc == OPAL_SUCCESS) { > - rc = 0; > - if (retlen) > + if (rc == OPAL_SUCCESS && retlen) > *retlen = len; There's one too many tabs here. > - } else { > - rc = -EIO; > - } > > + rc = opal_error_code(rc); > out: > opal_async_release_token(token); > return rc;
Re: [PATCH v4 04/10] mtd: powernv_flash: Don't return -ERESTARTSYS on interrupted token acquisition
On Tue, 10 Oct 2017 14:32:56 +1100 Cyril Bur wrote: > Because the MTD core might split up a read() or write() from userspace > into several calls to the driver, we may fail to get a token but already > have done some work, best to return -EINTR back to userspace and have > them decide what to do. > > Signed-off-by: Cyril Bur Acked-by: Boris Brezillon > --- > drivers/mtd/devices/powernv_flash.c | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/drivers/mtd/devices/powernv_flash.c > b/drivers/mtd/devices/powernv_flash.c > index 4dd3b5d2feb2..3343d4f5c4f3 100644 > --- a/drivers/mtd/devices/powernv_flash.c > +++ b/drivers/mtd/devices/powernv_flash.c > @@ -47,6 +47,11 @@ enum flash_op { > FLASH_OP_ERASE, > }; > > +/* > + * Don't return -ERESTARTSYS if we can't get a token, the MTD core > + * might have split up the call from userspace and called into the > + * driver more than once, we'll already have done some amount of work. > + */ > static int powernv_flash_async_op(struct mtd_info *mtd, enum flash_op op, > loff_t offset, size_t len, size_t *retlen, u_char *buf) > { > @@ -63,6 +68,8 @@ static int powernv_flash_async_op(struct mtd_info *mtd, > enum flash_op op, > if (token < 0) { > if (token != -ERESTARTSYS) > dev_err(dev, "Failed to get an async token\n"); > + else > + token = -EINTR; > return token; > } >
Re: [PATCH v4 03/10] mtd: powernv_flash: Remove pointless goto in driver init
On Tue, 10 Oct 2017 14:32:55 +1100 Cyril Bur wrote: Can you please add a short description here? Once done you can add Acked-by: Boris Brezillon > Signed-off-by: Cyril Bur > --- > drivers/mtd/devices/powernv_flash.c | 16 ++-- > 1 file changed, 6 insertions(+), 10 deletions(-) > > diff --git a/drivers/mtd/devices/powernv_flash.c > b/drivers/mtd/devices/powernv_flash.c > index ca3ca6adf71e..4dd3b5d2feb2 100644 > --- a/drivers/mtd/devices/powernv_flash.c > +++ b/drivers/mtd/devices/powernv_flash.c > @@ -227,21 +227,20 @@ static int powernv_flash_probe(struct platform_device > *pdev) > int ret; > > data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL); > - if (!data) { > - ret = -ENOMEM; > - goto out; > - } > + if (!data) > + return -ENOMEM; > + > data->mtd.priv = data; > > ret = of_property_read_u32(dev->of_node, "ibm,opal-id", &(data->id)); > if (ret) { > dev_err(dev, "no device property 'ibm,opal-id'\n"); > - goto out; > + return ret; > } > > ret = powernv_flash_set_driver_info(dev, &data->mtd); > if (ret) > - goto out; > + return ret; > > dev_set_drvdata(dev, data); > > @@ -250,10 +249,7 @@ static int powernv_flash_probe(struct platform_device > *pdev) >* with an ffs partition at the start, it should prove easier for users >* to deal with partitions or not as they see fit >*/ > - ret = mtd_device_register(&data->mtd, NULL, 0); > - > -out: > - return ret; > + return mtd_device_register(&data->mtd, NULL, 0); > } > > /**
Re: [PATCH v4 02/10] mtd: powernv_flash: Don't treat OPAL_SUCCESS as an error
On Tue, 10 Oct 2017 14:32:54 +1100 Cyril Bur wrote: > While this driver expects to interact asynchronously, OPAL is well > within its rights to return OPAL_SUCCESS to indicate that the operation > completed without the need for a callback. We shouldn't treat > OPAL_SUCCESS as an error rather we should wrap up and return promptly to > the caller. > > Signed-off-by: Cyril Bur Acked-by: Boris Brezillon > --- > I'll note here that currently no OPAL exists that will return > OPAL_SUCCESS so there isn't the possibility of a bug today. > --- > drivers/mtd/devices/powernv_flash.c | 15 ++- > 1 file changed, 10 insertions(+), 5 deletions(-) > > diff --git a/drivers/mtd/devices/powernv_flash.c > b/drivers/mtd/devices/powernv_flash.c > index f9ec38281ff2..ca3ca6adf71e 100644 > --- a/drivers/mtd/devices/powernv_flash.c > +++ b/drivers/mtd/devices/powernv_flash.c > @@ -63,7 +63,6 @@ static int powernv_flash_async_op(struct mtd_info *mtd, > enum flash_op op, > if (token < 0) { > if (token != -ERESTARTSYS) > dev_err(dev, "Failed to get an async token\n"); > - > return token; > } > > @@ -83,21 +82,25 @@ static int powernv_flash_async_op(struct mtd_info *mtd, > enum flash_op op, > return -EIO; > } > > + if (rc == OPAL_SUCCESS) > + goto out_success; > + > if (rc != OPAL_ASYNC_COMPLETION) { > dev_err(dev, "opal_flash_async_op(op=%d) failed (rc %d)\n", > op, rc); > - opal_async_release_token(token); > - return -EIO; > + rc = -EIO; > + goto out; > } > > rc = opal_async_wait_response(token, &msg); > - opal_async_release_token(token); > if (rc) { > dev_err(dev, "opal async wait failed (rc %d)\n", rc); > - return -EIO; > + rc = -EIO; > + goto out; > } > > rc = opal_get_async_rc(msg); > +out_success: > if (rc == OPAL_SUCCESS) { > rc = 0; > if (retlen) > @@ -106,6 +109,8 @@ static int powernv_flash_async_op(struct mtd_info *mtd, > enum flash_op op, > rc = -EIO; > } > > +out: > + opal_async_release_token(token); > return rc; > } >
Re: [PATCH v4 01/10] mtd: powernv_flash: Use WARN_ON_ONCE() rather than BUG_ON()
On Tue, 10 Oct 2017 14:32:53 +1100 Cyril Bur wrote: > BUG_ON() should be reserved in situations where we can not longer > guarantee the integrity of the system. In the case where > powernv_flash_async_op() receives an impossible op, we can still > guarantee the integrity of the system. > > Signed-off-by: Cyril Bur Acked-by: Boris Brezillon > --- > drivers/mtd/devices/powernv_flash.c | 4 +++- > 1 file changed, 3 insertions(+), 1 deletion(-) > > diff --git a/drivers/mtd/devices/powernv_flash.c > b/drivers/mtd/devices/powernv_flash.c > index f5396f26ddb4..f9ec38281ff2 100644 > --- a/drivers/mtd/devices/powernv_flash.c > +++ b/drivers/mtd/devices/powernv_flash.c > @@ -78,7 +78,9 @@ static int powernv_flash_async_op(struct mtd_info *mtd, > enum flash_op op, > rc = opal_flash_erase(info->id, offset, len, token); > break; > default: > - BUG_ON(1); > + WARN_ON_ONCE(1); > + opal_async_release_token(token); > + return -EIO; > } > > if (rc != OPAL_ASYNC_COMPLETION) {
Re: [PATCH 00/16] Remove hash page table slot tracking from linux PTE
Paul Mackerras writes: > On Fri, Oct 27, 2017 at 10:57:13AM +0530, Aneesh Kumar K.V wrote: >> >> >> On 10/27/2017 10:04 AM, Paul Mackerras wrote: >> >How do we interpret these numbers? Are they times, or speed? Is >> >larger better or worse? >> >> Sorry for not including the details. They are time in seconds. Test case is >> a modified mmap_bench included in powerpc/selftest. >> >> > >> >Can you give us the mean and standard deviation for each set of 5 >> >please? >> > >> >> powernv without patch >> median= 51.432255 >> stdev = 0.370835 >> >> with patch >> median = 50.739922 >> stdev = 0.06419662 >> >> pseries without patch >> median = 116.617884 >> stdev = 3.04531023 >> >> with patch no hcall >> median = 119.42494 >> stdev = 0.85874552 >> >> with patch and hcall >> median = 117.735808 >> stdev = 2.7624151 > > So on powernv, the patch set *improves* performance by about 1.3% > (almost 2 standard deviations). Do we know why that is? I haven't looked at that closely. I was considering it within runtime variance (no impact with patch series). I will get perf record collected and will see if that points to any details. > > On pseries, performance is about 2.4% worse without new hcalls, but > that is less than 1 standard deviation. With new hcalls, performance > is 0.95% worse, only a third of a standard deviation. I think we need > to do more measurements to try to get a more accurate picture here. > > Were the pseries numbers done on KVM or PowerVM? Could you do a set > of measurements on the other one too please? (I assume the numbers > with the new hcall were done on KVM, and can't be done on PowerVM.) > The above pseries numbers were collected on KVM. PowerVM numbers on a different machine: Without patch 31.194165 31.372913 31.253494 31.416198 31.199180 MEDIAN = 31.253494 STDEV = 0.1018900 With patch series 31.538281 31.385996 31.492737 31.452514 31.259461 MEDIAN = 31.452514 STDEV = 0.108511 -aneesh
[mainline][ppc] sysfs: cannot create duplicate filename '/devices/hv_24x7/events/PM_CAPP1_APC_UOP_SEND_PB_CMD'
Hi, A warning is being triggered while booting mainline kernel on ppc machine. Machine Type: Power 9 Kernel : 4.14.0-rc6 gcc: 4.8.5 Test : Boot Boot logs: -- hv-24x7: found a duplicate event PM_CAPP1_XPT_MSG_SENT_GT_16_LE_64, ct=1 hv-24x7: found a duplicate event PM_CAPP1_XPT_MSG_SENT_TSIZE_GT_64_LE_128, ct=1 hv-24x7: read 1463 catalog entries, created 470 event attrs (0 failures), 253 descs audit: initializing netlink subsys (disabled) audit: type=2000 audit(1509236850.410:1): state=initialized audit_enabled=0 res=1 Kprobe smoke test: started Kprobe smoke test: passed successfully sysfs: cannot create duplicate filename '/devices/hv_24x7/events/PM_CAPP1_APC_UOP_SEND_PB_CMD' [ cut here ] WARNING: CPU: 1 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x80/0xb0 Modules linked in: CPU: 1 PID: 1 Comm: swapper/1 Not tainted 4.14.0-rc6-autotest-autotest #1 task: c000fe9c task.stack: c000fea0 NIP: c03f89c0 LR: c03f89bc CTR: 0073c874 REGS: c000fea03610 TRAP: 0700 Not tainted (4.14.0-rc6-autotest-autotest) MSR: 82029033 CR: 22022022 XER: 000f CFAR: c01692e8 SOFTE: 1 GPR00: c03f89bc c000fea03890 c10c5900 005e GPR04: 0080 000181207e2b5b77 00a2 GPR08: c103d410 c103d410 GPR12: 2000 cfac0a80 c000d168 GPR16: GPR20: GPR24: ee4b c000f0e0d590 c000f0df1c10 GPR28: c0f77090 c000f0e0d590 c000fe039c80 c000fe184000 NIP [c03f89c0] sysfs_warn_dup+0x80/0xb0 LR [c03f89bc] sysfs_warn_dup+0x7c/0xb0 Call Trace: [c000fea03890] [c03f89bc] sysfs_warn_dup+0x7c/0xb0 (unreliable) [c000fea03910] [c03f85b8] sysfs_add_file_mode_ns+0x1f8/0x210 [c000fea03990] [c03f998c] internal_create_group+0x12c/0x3a0 [c000fea03a30] [c03f9e10] sysfs_create_groups+0x70/0x120 [c000fea03a70] [c05fcaa0] device_add+0x3f0/0x760 [c000fea03b30] [c0246c48] pmu_dev_alloc+0xb8/0x140 [c000fea03bb0] [c0c9380c] perf_event_sysfs_init+0x90/0xf4 [c000fea03c40] [c000cf00] do_one_initcall+0x60/0x1c0 [c000fea03d00] [c0c6441c] kernel_init_freeable+0x278/0x358 [c000fea03dc0] [c000d184] kernel_init+0x24/0x140 [c000fea03e30] [c000b4e8] 
ret_from_kernel_thread+0x5c/0x74 Instruction dump: 7fa3eb78 3880 7fe5fb78 38c01000 4bffa529 6000 3c62ffad 7fe4fb78 38630e38 7fc5f378 4bd708f1 6000 <0fe0> 7fe3fb78 4bf0b1e1 6000 ---[ end trace 537ba3bf9fbd1b58 ]--- Failed to register pmu: hv_24x7, reason -17 [ cut here ] WARNING: CPU: 1 PID: 1 at kernel/events/core.c:11238 perf_event_sysfs_init+0xa8/0xf4 Modules linked in: CPU: 1 PID: 1 Comm: swapper/1 Tainted: GW 4.14.0-rc6-autotest-autotest #1 task: c000fe9c task.stack: c000fea0 NIP: c0c93824 LR: c0c93820 CTR: 0073c874 REGS: c000fea03930 TRAP: 0700 Tainted: GW (4.14.0-rc6-autotest-autotest) MSR: 82029033 CR: 2222 XER: 000c CFAR: c01692e8 SOFTE: 1 GPR00: c0c93820 c000fea03bb0 c10c5900 002b GPR04: 0080 000181207e2ff5a3 00a3 GPR08: c103d410 c103d410 GPR12: cfac0a80 c000d168 GPR16: GPR20: GPR24: c0c637ac c0c4d280 c0b83e00 GPR28: c0f8f8c8 c0f8f8e8 c0f77108 NIP [c0c93824] perf_event_sysfs_init+0xa8/0xf4 LR [c0c93820] perf_event_sysfs_init+0xa4/0xf4 Call Trace: [c000fea03bb0] [c0c93820] perf_event_sysfs_init+0xa4/0xf4 (unreliable) [c000fea03c40] [c000cf00] do_one_initcall+0x60/0x1c0 [c000fea03d00] [c0c6441c] kernel_init_freeable+0x278/0x358 [c000fea03dc0] [c000d184] kernel_init+0x24/0x140 [c000fea03e30] [c000b4e8] ret_from_kernel_thread+0x5c/0x74 Instruction dump: 2fa9 419e0030 813f0030 2f89 419c0024 4b5b3391 7c651b79 41e20018 e89f0028 7f63db78 4b4d5a8d 6000 <0fe0> ebff 4bb8 3921 ---[ end trace 537ba3bf9fbd1b59 ]--- Initialise system trusted keyrings A similar issue was reported on 4.11.0 https://lkml.org/lkml/2017/3/7/763 -- Regard's Abdul Haleem IBM Linux Technology Centre