[PATCH kernel] powerpc/powernv: Fix compile without CONFIG_TRACEPOINTS
The function returns s64 but the return statement is missing. This adds the missing return statement. Signed-off-by: Alexey Kardashevskiy --- Just in case if this has not been caught just yet :) --- arch/powerpc/platforms/powernv/opal-call.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powernv/opal-call.c b/arch/powerpc/platforms/powernv/opal-call.c index 578757d403ab..daad8c45c8e7 100644 --- a/arch/powerpc/platforms/powernv/opal-call.c +++ b/arch/powerpc/platforms/powernv/opal-call.c @@ -86,6 +86,7 @@ static s64 __opal_call_trace(s64 a0, s64 a1, s64 a2, s64 a3, s64 a4, s64 a5, s64 a6, s64 a7, unsigned long opcode, unsigned long msr) { + return 0; } #define DO_TRACE false -- 2.17.1
Re: [PATCH 3/6] x86: clean up _TIF_SYSCALL_EMU handling using ptrace_syscall_enter hook
On Mon, Mar 11, 2019 at 6:35 PM Haibo Xu (Arm Technology China) wrote: > > On 2019/3/12 2:34, Sudeep Holla wrote: > > (I thought I had sent this email, last Tuesday itself, but saw this in my > > draft today, something went wrong, sorry for the delay) > > > > On Tue, Mar 05, 2019 at 02:14:47AM +, Haibo Xu (Arm Technology China) > > wrote: > >> On 2019/3/4 18:12, Sudeep Holla wrote: > >>> On Mon, Mar 04, 2019 at 08:25:28AM +, Haibo Xu (Arm Technology China) > >>> wrote: > On 2019/3/1 2:32, Sudeep Holla wrote: > > Now that we have a new hook ptrace_syscall_enter that can be called from > > syscall entry code and it handles PTRACE_SYSEMU in generic code, we > > can do some cleanup using the same in syscall_trace_enter. > > > > Further the extra logic to find single stepping PTRACE_SYSEMU_SINGLESTEP > > in syscall_slow_exit_work seems unnecessary. Let's remove the same. > > I think we should not change the logic here. Is so, it will double the > report of syscall > when PTRACE_SYSEMU_SINGLESTEP is enabled. > > >>> > >>> I don't think that should happen, but I may be missing something. > >>> Can you explain how ? > >>> > >> > >> When PTRACE_SYSEMU_SINGLESTEP is enabled, both the _TIF_SYSCALL_EMU and > >> _TIF_SINGLESTEP flags are set, but ptrace only need to report(send SIGTRAP) > >> at the entry of a system call, no need to report at the exit of a system > >> call. > >> > > Sorry, but I still not get it, we have: > > > > step = ((flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) == _TIF_SINGLESTEP); > > > > For me, this is same as: > > step = ((flags & _TIF_SINGLESTEP) == _TIF_SINGLESTEP) > > or > > if (flags & _TIF_SINGLESTEP) > > step = true; > > > > I don't think so! As I mentioned in the last email loop, when > PTRACE_SYSEMU_SINGLESTEP > is enabled, both the _TIF_SYSCALL_EMU and _TIF_SINGLESTEP flags are set, in > which case > the step should be "false" for the old logic. But with the new logic, the > step is "true". 
> > > So when PTRACE_SYSEMU_SINGLESTEP, _TIF_SYSCALL_EMU and _TIF_SINGLESTEP > > are set and step evaluates to true. > > > > So dropping _TIF_SYSCALL_EMU here should be fine. Am I still missing > > something ? > > > > -- > > Regards, > > Sudeep > > > > For the PTRACE_SYSEMU_SINGLESTEP request, ptrace only need to report(send > SIGTRAP) > at the entry of a system call, no need to report at the exit of a system > call.That's > why the old logic-{step = ((flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) == > _TIF_SINGLESTEP)} > here try to filter out the special case(PTRACE_SYSEMU_SINGLESTEP). > > Another way to make sure the logic is fine, you can run some tests with > respect to both logic, > and to check whether they have the same behavior. tools/testing/selftests/x86/ptrace_syscall.c has a test intended to exercise this. Can one of you either confirm that it does exercise it and that it still passes or can you improve the test? Thanks, Andy
Re: [PATCH 3/6] x86: clean up _TIF_SYSCALL_EMU handling using ptrace_syscall_enter hook
On 2019/3/12 2:34, Sudeep Holla wrote: > (I thought I had sent this email, last Tuesday itself, but saw this in my > draft today, something went wrong, sorry for the delay) > > On Tue, Mar 05, 2019 at 02:14:47AM +, Haibo Xu (Arm Technology China) > wrote: >> On 2019/3/4 18:12, Sudeep Holla wrote: >>> On Mon, Mar 04, 2019 at 08:25:28AM +, Haibo Xu (Arm Technology China) >>> wrote: On 2019/3/1 2:32, Sudeep Holla wrote: > Now that we have a new hook ptrace_syscall_enter that can be called from > syscall entry code and it handles PTRACE_SYSEMU in generic code, we > can do some cleanup using the same in syscall_trace_enter. > > Further the extra logic to find single stepping PTRACE_SYSEMU_SINGLESTEP > in syscall_slow_exit_work seems unnecessary. Let's remove the same. I think we should not change the logic here. Is so, it will double the report of syscall when PTRACE_SYSEMU_SINGLESTEP is enabled. >>> >>> I don't think that should happen, but I may be missing something. >>> Can you explain how ? >>> >> >> When PTRACE_SYSEMU_SINGLESTEP is enabled, both the _TIF_SYSCALL_EMU and >> _TIF_SINGLESTEP flags are set, but ptrace only need to report(send SIGTRAP) >> at the entry of a system call, no need to report at the exit of a system >> call. >> > Sorry, but I still not get it, we have: > > step = ((flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) == _TIF_SINGLESTEP); > > For me, this is same as: > step = ((flags & _TIF_SINGLESTEP) == _TIF_SINGLESTEP) > or > if (flags & _TIF_SINGLESTEP) > step = true; > I don't think so! As I mentioned in the last email loop, when PTRACE_SYSEMU_SINGLESTEP is enabled, both the _TIF_SYSCALL_EMU and _TIF_SINGLESTEP flags are set, in which case the step should be "false" for the old logic. But with the new logic, the step is "true". > So when PTRACE_SYSEMU_SINGLESTEP, _TIF_SYSCALL_EMU and _TIF_SINGLESTEP > are set and step evaluates to true. > > So dropping _TIF_SYSCALL_EMU here should be fine. Am I still missing > something ? 
> > -- > Regards, > Sudeep > For the PTRACE_SYSEMU_SINGLESTEP request, ptrace only needs to report (send SIGTRAP) at the entry of a system call, no need to report at the exit of a system call. That's why the old logic-{step = ((flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) == _TIF_SINGLESTEP)} here tries to filter out the special case(PTRACE_SYSEMU_SINGLESTEP). Another way to make sure the logic is fine, you can run some tests with respect to both logic, and to check whether they have the same behavior. Regards, Haibo IMPORTANT NOTICE: The contents of this email and any attachments are confidential and may also be privileged. If you are not the intended recipient, please notify the sender immediately and do not disclose the contents to any other person, use it for any purpose, or store or copy the information in any medium. Thank you.
[RFCv2 PATCH 4/4] powerpc: KASAN for 64bit Book3E
Wire up KASAN. Only outline instrumentation is supported. The KASAN shadow area is mapped into vmemmap space: 0x8000 0400 to 0x8000 0600 . To do this we require that vmemmap be disabled. (This is the default in the kernel config that QorIQ provides for the machine in their SDK anyway - they use flat memory.) Only the kernel linear mapping (0xc000...) is checked. The vmalloc and ioremap areas (also in 0x800...) are all mapped to the zero page. As with the Book3S hash series, this requires overriding the memory <-> shadow mapping. Also, as with both previous 64-bit series, early instrumentation is not supported. It would allow us to drop the check_return_arch_not_ready() hook in the KASAN core, but it's tricky to get it set up early enough: we need it setup before the first call to instrumented code like printk(). Perhaps in the future. Only KASAN_MINIMAL works. Tested on e6500. KVM, kexec and xmon have not been tested. The test_kasan module fires warnings as expected, except for the following tests: - Expected/by design: kasan test: memcg_accounted_kmem_cache allocate memcg accounted object - Due to only supporting KASAN_MINIMAL: kasan test: kasan_stack_oob out-of-bounds on stack kasan test: kasan_global_oob out-of-bounds global variable kasan test: kasan_alloca_oob_left out-of-bounds to left on alloca kasan test: kasan_alloca_oob_right out-of-bounds to right on alloca kasan test: use_after_scope_test use-after-scope on int kasan test: use_after_scope_test use-after-scope on array Thanks to those who have done the heavy lifting over the past several years: - Christophe's 32 bit series: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-February/185379.html - Aneesh's Book3S hash series: https://lwn.net/Articles/655642/ - Balbir's Book3S radix series: https://patchwork.ozlabs.org/patch/795211/ Cc: Christophe Leroy Cc: Aneesh Kumar K.V Cc: Balbir Singh Signed-off-by: Daniel Axtens --- While useful if you have a book3e device, this is mostly intended as a warm-up 
exercise for reviving Aneesh's series for book3s hash. In particular, changes to the kasan core are going to be required for hash and radix as well. --- arch/powerpc/Kconfig | 1 + arch/powerpc/Kconfig.debug | 2 +- arch/powerpc/include/asm/kasan.h | 73 +++- arch/powerpc/mm/Makefile | 2 + arch/powerpc/mm/kasan/Makefile | 1 + arch/powerpc/mm/kasan/kasan_init_book3e_64.c | 53 ++ 6 files changed, 129 insertions(+), 3 deletions(-) create mode 100644 arch/powerpc/mm/kasan/kasan_init_book3e_64.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8d6108c83299..01540873a79f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -173,6 +173,7 @@ config PPC select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if PPC32 + select HAVE_ARCH_KASAN if PPC_BOOK3E_64 && !SPARSEMEM_VMEMMAP select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 61febbbdd02b..fc1f5fa7554e 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -369,5 +369,5 @@ config PPC_FAST_ENDIAN_SWITCH config KASAN_SHADOW_OFFSET hex - depends on KASAN + depends on KASAN && PPC32 default 0xe000 diff --git a/arch/powerpc/include/asm/kasan.h b/arch/powerpc/include/asm/kasan.h index e4adc6bc1e8f..661a5700869b 100644 --- a/arch/powerpc/include/asm/kasan.h +++ b/arch/powerpc/include/asm/kasan.h @@ -15,14 +15,16 @@ #ifndef __ASSEMBLY__ #include +#include #define KASAN_SHADOW_SCALE_SHIFT 3 -#define KASAN_SHADOW_OFFSETASM_CONST(CONFIG_KASAN_SHADOW_OFFSET) - #define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ (PAGE_OFFSET >> KASAN_SHADOW_SCALE_SHIFT)) +#ifdef CONFIG_PPC32 +#define KASAN_SHADOW_OFFSETASM_CONST(CONFIG_KASAN_SHADOW_OFFSET) + #define KASAN_SHADOW_END 0UL #define KASAN_SHADOW_SIZE (KASAN_SHADOW_END - KASAN_SHADOW_START) @@ -30,6 +32,73 @@ #ifdef CONFIG_KASAN void kasan_early_init(void); void kasan_mmu_init(void); +#endif +#endif 
/* CONFIG_PPC32 */ + +#ifdef CONFIG_PPC_BOOK3E_64 + +/* we don't put this in Kconfig as we only support KASAN_MINIMAL, and + * that will be disabled if the symbol is available in Kconfig */ +#define KASAN_SHADOW_OFFSET ASM_CONST(0x68000400) + +#define KASAN_SHADOW_SIZE (KERN_VIRT_SIZE >> KASAN_SHADOW_SCALE_SHIFT) + +extern struct static_key_false powerpc_kasan_enabled_key; +static inline bool kasan_arch_is_ready_book3e(void) { + if (static_branch_likely(_kasan_enabled_key)) + return true; + return false; +} +#define kasan_arch_is_ready kasan_arch_is_ready_book3e
[RFCv2 PATCH 3/4] kasan: allow architectures to provide an outline readiness check
In powerpc (as I understand it), we spend a lot of time in boot running in real mode before MMU paging is initalised. During this time we call a lot of generic code, including printk(). If we try to access the shadow region during this time, things fail. My attempts to move early init before the first printk have not been successful. (Both previous RFCs for ppc64 - by 2 different people - have needed this trick too!) So, allow architectures to define a kasan_arch_is_ready() hook that bails out of check_memory_region_inline() unless the arch has done all of the init. Link: https://lore.kernel.org/patchwork/patch/592820/ # ppc64 hash series Link: https://patchwork.ozlabs.org/patch/795211/ # ppc radix series Originally-by: Balbir Singh Cc: Aneesh Kumar K.V Signed-off-by: Daniel Axtens [check_return_arch_not_ready() ==> static inline kasan_arch_is_ready()] Signed-off-by: Christophe Leroy --- include/linux/kasan.h | 4 mm/kasan/generic.c| 3 +++ 2 files changed, 7 insertions(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index f6261840f94c..a630d53f1a36 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -14,6 +14,10 @@ struct task_struct; #include #include +#ifndef kasan_arch_is_ready +static inline bool kasan_arch_is_ready(void) { return true; } +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index bafa2f986660..6c6c30643d51 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -170,6 +170,9 @@ static __always_inline void check_memory_region_inline(unsigned long addr, size_t size, bool write, unsigned long ret_ip) { + if (!kasan_arch_is_ready()) + return; + if (unlikely(size == 0)) return; -- 2.19.1
[RFCv2 PATCH 2/4] kasan: allow architectures to manage the memory-to-shadow mapping
Currently, shadow addresses are always addr >> shift + offset. However, for powerpc, the virtual address space is fragmented in ways that make this simple scheme impractical. Allow architectures to override: - kasan_shadow_to_mem - kasan_mem_to_shadow - addr_has_shadow Rename addr_has_shadow to kasan_addr_has_shadow as if it is overridden it will be available in more places, increasing the risk of collisions. If architectures do not #define their own versions, the generic code will continue to run as usual. Reviewed-by: Dmitry Vyukov Signed-off-by: Daniel Axtens --- include/linux/kasan.h | 2 ++ mm/kasan/generic.c| 2 +- mm/kasan/generic_report.c | 2 +- mm/kasan/kasan.h | 6 +- mm/kasan/report.c | 6 +++--- mm/kasan/tags.c | 2 +- 6 files changed, 13 insertions(+), 7 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b40ea104dd36..f6261840f94c 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -23,11 +23,13 @@ extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; int kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end); +#ifndef kasan_mem_to_shadow static inline void *kasan_mem_to_shadow(const void *addr) { return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET; } +#endif /* Enable reporting bugs after kasan_disable_current() */ extern void kasan_enable_current(void); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index ffc64a9a97a5..bafa2f986660 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -173,7 +173,7 @@ static __always_inline void check_memory_region_inline(unsigned long addr, if (unlikely(size == 0)) return; - if (unlikely(!addr_has_shadow((void *)addr))) { + if (unlikely(!kasan_addr_has_shadow((void *)addr))) { kasan_report(addr, size, write, ret_ip); return; } diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c index 5e12035888f2..854f4de1fe10 100644 --- a/mm/kasan/generic_report.c +++ b/mm/kasan/generic_report.c @@ -110,7 +110,7 
@@ static const char *get_wild_bug_type(struct kasan_access_info *info) const char *get_bug_type(struct kasan_access_info *info) { - if (addr_has_shadow(info->access_addr)) + if (kasan_addr_has_shadow(info->access_addr)) return get_shadow_bug_type(info); return get_wild_bug_type(info); } diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ea51b2d898ec..57ec24cf7bd1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -111,16 +111,20 @@ struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache, struct kasan_free_meta *get_free_info(struct kmem_cache *cache, const void *object); +#ifndef kasan_shadow_to_mem static inline const void *kasan_shadow_to_mem(const void *shadow_addr) { return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET) << KASAN_SHADOW_SCALE_SHIFT); } +#endif -static inline bool addr_has_shadow(const void *addr) +#ifndef kasan_addr_has_shadow +static inline bool kasan_addr_has_shadow(const void *addr) { return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } +#endif void kasan_poison_shadow(const void *address, size_t size, u8 value); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ca9418fe9232..bc3355ee2dd0 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -298,7 +298,7 @@ void kasan_report(unsigned long addr, size_t size, untagged_addr = reset_tag(tagged_addr); info.access_addr = tagged_addr; - if (addr_has_shadow(untagged_addr)) + if (kasan_addr_has_shadow(untagged_addr)) info.first_bad_addr = find_first_bad_addr(tagged_addr, size); else info.first_bad_addr = untagged_addr; @@ -309,11 +309,11 @@ void kasan_report(unsigned long addr, size_t size, start_report(); print_error_description(); - if (addr_has_shadow(untagged_addr)) + if (kasan_addr_has_shadow(untagged_addr)) print_tags(get_tag(tagged_addr), info.first_bad_addr); pr_err("\n"); - if (addr_has_shadow(untagged_addr)) { + if (kasan_addr_has_shadow(untagged_addr)) { print_address_description(untagged_addr); pr_err("\n"); 
print_shadow_for_address(info.first_bad_addr); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index bc759f8f1c67..cdefd0fe1f5d 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -109,7 +109,7 @@ void check_memory_region(unsigned long addr, size_t size, bool write, return; untagged_addr = reset_tag((const void *)addr); - if (unlikely(!addr_has_shadow(untagged_addr))) { + if (unlikely(!kasan_addr_has_shadow(untagged_addr))) {
[RFCv2 PATCH 1/4] kasan: do not open-code addr_has_shadow
We have a couple of places checking for the existence of a shadow mapping for an address by open-coding the inverse of the check in addr_has_shadow. Replace the open-coded versions with the helper. This will be needed in future to allow architectures to override the layout of the shadow mapping. Reviewed-by: Andrew Donnellan Reviewed-by: Dmitry Vyukov Signed-off-by: Daniel Axtens --- mm/kasan/generic.c | 3 +-- mm/kasan/tags.c| 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index ccb6207276e3..ffc64a9a97a5 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -173,8 +173,7 @@ static __always_inline void check_memory_region_inline(unsigned long addr, if (unlikely(size == 0)) return; - if (unlikely((void *)addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_shadow((void *)addr))) { kasan_report(addr, size, write, ret_ip); return; } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 0777649e07c4..bc759f8f1c67 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -109,8 +109,7 @@ void check_memory_region(unsigned long addr, size_t size, bool write, return; untagged_addr = reset_tag((const void *)addr); - if (unlikely(untagged_addr < - kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { + if (unlikely(!addr_has_shadow(untagged_addr))) { kasan_report(addr, size, write, ret_ip); return; } -- 2.19.1
[RFCv2 PATCH 0/4] powerpc: KASAN for 64-bit Book3E
Building on the work of Christophe, Aneesh and Balbir, I've ported KASAN to the e6500, a 64-bit Book3E processor which doesn't have a hashed page table. It applies on top of Christophe's series, v9, plus the proposed new version of patch 2. It requires some changes to the KASAN core; we use the less ugly outline readiness check patch proposed by Christophe. The KASAN shadow area is mapped into vmemmap space: 0x8000 0400 to 0x8000 0600 . To do this we require that vmemmap be disabled. (This is the default in the kernel config that QorIQ provides for the machine in their SDK anyway - they use flat memory.) Only outline instrumentation is supported and only KASAN_MINIMAL works. Only the kernel linear mapping (0xc000...) is checked. The vmalloc and ioremap areas (also in 0x800...) are all mapped to a zero page. As with the Book3S hash series, this requires overriding the memory <-> shadow mapping. Also, as with both previous 64-bit series, early instrumentation is not supported. KVM, kexec and xmon have not been tested. Thanks to those who have done the heavy lifting over the past several years: - Christophe's 32 bit series: https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-February/185379.html - Aneesh's Book3S hash series: https://lwn.net/Articles/655642/ - Balbir's Book3S radix series: https://patchwork.ozlabs.org/patch/795211/ While useful if you have a Book3E device, this is mostly intended as a warm-up exercise for reviving Aneesh's series for book3s hash. I expect that the changes to the KASAN core will be required for that too, but I'll check against the book3s version before I send a non-RFC version. Once I do that I'll revisit the vmemmap decision as well. 
Changes from RFCv1: - Use Christophe's new version of outline readiness check - Rebase on top of Christophe's v9 + the proposed changes to string/memory functions Regards, Daniel Daniel Axtens (4): kasan: do not open-code addr_has_shadow kasan: allow architectures to manage the memory-to-shadow mapping kasan: allow architectures to provide an outline readiness check powerpc: KASAN for 64bit Book3E arch/powerpc/Kconfig | 1 + arch/powerpc/Kconfig.debug | 2 +- arch/powerpc/include/asm/kasan.h | 73 +++- arch/powerpc/mm/Makefile | 2 + arch/powerpc/mm/kasan/Makefile | 1 + arch/powerpc/mm/kasan/kasan_init_book3e_64.c | 53 ++ include/linux/kasan.h| 6 ++ mm/kasan/generic.c | 6 +- mm/kasan/generic_report.c| 2 +- mm/kasan/kasan.h | 6 +- mm/kasan/report.c| 6 +- mm/kasan/tags.c | 3 +- 12 files changed, 149 insertions(+), 12 deletions(-) create mode 100644 arch/powerpc/mm/kasan/kasan_init_book3e_64.c -- 2.19.1
[PATCH 19/20] [Bionic] (upstream) KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size
From: Nicholas Piggin BugLink: https://bugs.launchpad.net/bugs/1788098 THP paths can defer splitting compound pages until after the actual remap and TLB flushes to split a huge PMD/PUD. This causes radix partition scope page table mappings to get out of synch with the host qemu page table mappings. This results in random memory corruption in the guest when running with THP. The easiest way to reproduce is use KVM balloon to free up a lot of memory in the guest and then shrink the balloon to give the memory back, while some work is being done in the guest. Cc: David Gibson Cc: "Aneesh Kumar K.V" Cc: kvm-...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras (cherry picked from commit 71d29f43b6332badc5598c656616a62575e83342 v4.19) Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 91 +++--- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 7efc42538ccf..ae023d2256ef 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -538,8 +538,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long mmu_seq, pte_size; - unsigned long gpa, gfn, hva, pfn; + unsigned long mmu_seq; + unsigned long gpa, gfn, hva; struct kvm_memory_slot *memslot; struct page *page = NULL; long ret; @@ -636,9 +636,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ hva = gfn_to_hva_memslot(memslot, gfn); if (upgrade_p && __get_user_pages_fast(hva, 1, 1, ) == 1) { - pfn = page_to_pfn(page); upgrade_write = true; } else { + unsigned long pfn; + /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, upgrade_p); @@ -652,63 +653,45 @@ int kvmppc_book3s_radix_page_fault(struct 
kvm_run *run, struct kvm_vcpu *vcpu, } } - /* See if we can insert a 1GB or 2MB large PTE here */ - level = 0; - if (page && PageCompound(page)) { - pte_size = PAGE_SIZE << compound_order(compound_head(page)); - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); - } - } - /* -* Compute the PTE value that we need to insert. +* Read the PTE from the process' radix tree and use that +* so we get the shift and attribute bits. */ - if (page) { - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | - _PAGE_ACCESSED; - if (writing || upgrade_write) - pgflags |= _PAGE_WRITE | _PAGE_DIRTY; - pte = pfn_pte(pfn, __pgprot(pgflags)); + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, ); + pte = *ptep; + local_irq_enable(); + + /* Get pte level from shift/size */ + if (shift == PUD_SHIFT && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + } else if (shift == PMD_SHIFT && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; } else { - /* -* Read the PTE from the process' radix tree and use that -* so we get the attribute bits. -*/ - local_irq_disable(); - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, ); - pte = *ptep; - local_irq_enable(); - if (shift == PUD_SHIFT && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - } else if (shift == PMD_SHIFT && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - } else if (shift && shift != PAGE_SHIFT) { - /* Adjust PFN */ - unsigned long
[PATCH] arch: powerpc: Kconfig: pedantic formatting
Formatting of Kconfig files doesn't look so pretty, so let the Great White Handkerchief come around and clean it up. Signed-off-by: Enrico Weigelt, metux IT consult --- arch/powerpc/Kconfig | 28 ++-- arch/powerpc/kvm/Kconfig | 6 +++--- arch/powerpc/platforms/40x/Kconfig | 7 +++ arch/powerpc/platforms/44x/Kconfig | 10 +- arch/powerpc/platforms/85xx/Kconfig| 8 arch/powerpc/platforms/86xx/Kconfig| 6 +++--- arch/powerpc/platforms/maple/Kconfig | 2 +- arch/powerpc/platforms/pseries/Kconfig | 18 +- arch/powerpc/sysdev/xics/Kconfig | 13 ++--- 9 files changed, 48 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82..ea29d94 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -48,7 +48,7 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX # Allow randomisation to consume up to 512MB of address space (2^29). default 11 if PPC_256K_PAGES# 11 = 29 (512MB) - 18 (256K) default 13 if PPC_64K_PAGES # 13 = 29 (512MB) - 16 (64K) - default 15 if PPC_16K_PAGES # 15 = 29 (512MB) - 14 (16K) + default 15 if PPC_16K_PAGES # 15 = 29 (512MB) - 14 (16K) default 17 # 17 = 29 (512MB) - 12 (4K) config ARCH_MMAP_RND_COMPAT_BITS_MIN @@ -246,9 +246,9 @@ config PPC # config PPC_BARRIER_NOSPEC -bool -default y -depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E + bool + default y + depends on PPC_BOOK3S_64 || PPC_FSL_BOOK3E config EARLY_PRINTK bool @@ -430,13 +430,13 @@ config MATH_EMULATION_HW_UNIMPLEMENTED endchoice config PPC_TRANSACTIONAL_MEM - bool "Transactional Memory support for POWERPC" - depends on PPC_BOOK3S_64 - depends on SMP - select ALTIVEC - select VSX - ---help--- - Support user-mode Transactional Memory on POWERPC. + bool "Transactional Memory support for POWERPC" + depends on PPC_BOOK3S_64 + depends on SMP + select ALTIVEC + select VSX + ---help--- + Support user-mode Transactional Memory on POWERPC. 
config LD_HEAD_STUB_CATCH bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT @@ -937,7 +937,7 @@ config FSL_SOC bool config FSL_PCI - bool + bool select ARCH_HAS_DMA_SET_MASK select PPC_INDIRECT_PCI select PCI_QUIRKS @@ -1049,14 +1049,14 @@ config DYNAMIC_MEMSTART select NONSTATIC_KERNEL help This option enables the kernel to be loaded at any page aligned - physical address. The kernel creates a mapping from KERNELBASE to + physical address. The kernel creates a mapping from KERNELBASE to the address where the kernel is loaded. The page size here implies the TLB page size of the mapping for kernel on the particular platform. Please refer to the init code for finding the TLB page size. DYNAMIC_MEMSTART is an easy way of implementing pseudo-RELOCATABLE kernel image, where the only restriction is the page aligned kernel - load address. When this option is enabled, the compile time physical + load address. When this option is enabled, the compile time physical address CONFIG_PHYSICAL_START is ignored. This option is overridden by CONFIG_RELOCATABLE diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index bfdde04..cfc4cbe 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -184,9 +184,9 @@ config KVM_MPIC select HAVE_KVM_MSI help Enable support for emulating MPIC devices inside the - host kernel, rather than relying on userspace to emulate. - Currently, support is limited to certain versions of - Freescale's MPIC implementation. + host kernel, rather than relying on userspace to emulate. + Currently, support is limited to certain versions of + Freescale's MPIC implementation. config KVM_XICS bool "KVM in-kernel XICS emulation" diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index ad2bb14..6da813b 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -16,12 +16,12 @@ config EP405 This option enables support for the EP405/EP405PC boards. 
config HOTFOOT -bool "Hotfoot" + bool "Hotfoot" depends on 40x select PPC40x_SIMPLE select FORCE_PCI -help -This option enables support for the ESTEEM 195E Hotfoot board. + help + This option enables support for the ESTEEM 195E Hotfoot board. config KILAUEA bool "Kilauea" @@ -80,7 +80,6 @@ config OBS600 help This option enables support for PlatHome OpenBlockS 600 server -
[PATCH] sound: oao: Kconfig: pedantic formatting
Formatting of Kconfig files doesn't look so pretty, so let the Great White Handkerchief come around and clean it up. Signed-off-by: Enrico Weigelt, metux IT consult --- sound/aoa/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/aoa/Kconfig b/sound/aoa/Kconfig index c081e18..a1e3231 100644 --- a/sound/aoa/Kconfig +++ b/sound/aoa/Kconfig @@ -3,8 +3,8 @@ menuconfig SND_AOA depends on PPC_PMAC select SND_PCM ---help--- - This option enables the new driver for the various - Apple Onboard Audio components. + This option enables the new driver for the various + Apple Onboard Audio components. if SND_AOA @@ -14,4 +14,4 @@ source "sound/aoa/codecs/Kconfig" source "sound/aoa/soundbus/Kconfig" -endif # SND_AOA +endif # SND_AOA -- 1.9.1
[PATCH v3] powerpc/pseries: Only wait for dying CPU after call to rtas_stop_self()
When testing DLPAR CPU add/remove on a system under stress, pseries_cpu_die() doesn't wait long enough for a CPU to die: [ 446.983944] cpu 148 (hwid 148) Ready to die... [ 446.984062] cpu 149 (hwid 149) Ready to die... [ 446.993518] cpu 150 (hwid 150) Ready to die... [ 446.993543] Querying DEAD? cpu 150 (150) shows 2 [ 446.994098] cpu 151 (hwid 151) Ready to die... [ 447.133726] cpu 136 (hwid 136) Ready to die... [ 447.403532] cpu 137 (hwid 137) Ready to die... [ 447.403772] cpu 138 (hwid 138) Ready to die... [ 447.403839] cpu 139 (hwid 139) Ready to die... [ 447.403887] cpu 140 (hwid 140) Ready to die... [ 447.403937] cpu 141 (hwid 141) Ready to die... [ 447.403979] cpu 142 (hwid 142) Ready to die... [ 447.404038] cpu 143 (hwid 143) Ready to die... [ 447.513546] cpu 128 (hwid 128) Ready to die... [ 447.693533] cpu 129 (hwid 129) Ready to die... [ 447.693999] cpu 130 (hwid 130) Ready to die... [ 447.703530] cpu 131 (hwid 131) Ready to die... [ 447.704087] Querying DEAD? cpu 132 (132) shows 2 [ 447.704102] cpu 132 (hwid 132) Ready to die... [ 447.713534] cpu 133 (hwid 133) Ready to die... [ 447.714064] Querying DEAD? cpu 134 (134) shows 2 This is a race between one CPU stopping and another one calling pseries_cpu_die() to wait for it to stop. That function does a short busy loop calling RTAS query-cpu-stopped-state on the stopping CPU to verify that it is stopped, but I think there's a lot for the stopping CPU to do which may take longer than this loop allows. As can be seen in the dmesg right before or after the "Querying DEAD?" messages, if pseries_cpu_die() waited a little longer it would have seen the CPU in the stopped state. What I think is going on is that CPU 134 was inactive at the time it was unplugged. In that case, dlpar_offline_cpu() calls H_PROD on that CPU and immediately calls pseries_cpu_die(). Meanwhile, the prodded CPU activates and start the process of stopping itself. 
The busy loop is not long enough to allow for the CPU to wake up and complete the stopping process. This can be a problem because if the busy loop finishes too early, then the kernel may offline another CPU before the previous one finished dying, which would lead to two concurrent calls to rtas-stop-self, which is prohibited by the PAPR. We can make the race a lot more even if we only start querying if the CPU is stopped when the stopping CPU is close to calling rtas_stop_self(). Since pseries_mach_cpu_die() sets the CPU current state to offline almost immediately before calling rtas_stop_self(), we use that as a signal that it is either already stopped or very close to that point, and we can start the busy loop. As suggested by Michael Ellerman, this patch also changes the busy loop to wait for a fixed amount of wall time. Based on the measurements that Gautham did on a POWER9 system, in successful cases of smp_query_cpu_stopped(cpu) returning affirmative, the maximum time spent inside the loop was 10 ms. This patch loops for 20 ms just to be sure. Signed-off-by: Thiago Jung Bauermann Analyzed-by: Gautham R Shenoy --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) I have seen this problem since v4.8. Should this patch go to stable as well? Changes since v2: - Increased busy loop to 200 iterations so that it can last up to 20 ms (suggested by Gautham). - Changed commit message to include Gautham's remarks. 
diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 97feb6e79f1a..ac6dc35ab829 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -214,13 +214,22 @@ static void pseries_cpu_die(unsigned int cpu) msleep(1); } } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + /* +* If the current state is not offline yet, it means that the +* dying CPU (which is either in pseries_mach_cpu_die() or in +* the process of getting there) didn't have a chance yet to +* call rtas_stop_self() and therefore it's too early to query +* if the CPU is stopped. +*/ + spin_event_timeout(get_cpu_current_state(cpu) == CPU_STATE_OFFLINE, + 10, 100); - for (tries = 0; tries < 25; tries++) { + for (tries = 0; tries < 200; tries++) { cpu_status = smp_query_cpu_stopped(pcpu); if (cpu_status == QCSS_STOPPED || cpu_status == QCSS_HARDWARE_ERROR) break; - cpu_relax(); + udelay(100); } }
Re: [PATCH v2] powerpc/pseries: Only wait for dying CPU after call to rtas_stop_self()
Hello Gautham, Thanks for your review. Gautham R Shenoy writes: > Hello Thiago, > > On Fri, Feb 22, 2019 at 07:57:52PM -0300, Thiago Jung Bauermann wrote: >> I see two cases that can be causing this race: >> >> 1. It's possible that CPU 134 was inactive at the time it was unplugged. In >>that case, dlpar_offline_cpu() calls H_PROD on that CPU and immediately >>calls pseries_cpu_die(). Meanwhile, the prodded CPU activates and start >>the process of stopping itself. It's possible that the busy loop is not >>long enough to allow for the CPU to wake up and complete the stopping >>process. > > The problem is a bit more severe since, after printing "Querying > DEAD?" for CPU X, this CPU can prod another offline CPU Y on the same > core which, on waking up, will call rtas_stop_self. Thus we can have two > concurrent calls to rtas-stop-self, which is prohibited by the PAPR. Indeed, very good point. I added this information to the patch description. >> 2. If CPU 134 was online at the time it was unplugged, it would have gone >>through the new CPU hotplug state machine in kernel/cpu.c that was >>introduced in v4.6 to get itself stopped. It's possible that the busy >>loop in pseries_cpu_die() was long enough for the older hotplug code but >>not for the new hotplug state machine. > > I haven't been able to observe the "Querying DEAD?" messages for the > online CPU which was being offlined and dlpar'ed out. Ah, thanks for pointing this out. That was a scenario I thought could happen when I was investigating this issue but I never confirmed whether it could really happen. I removed it from the patch description. >> I don't know if this race condition has any ill effects, but we can make >> the race a lot more even if we only start querying if the CPU is stopped >> when the stopping CPU is close to call rtas_stop_self(). 
>> >> Since pseries_mach_cpu_die() sets the CPU current state to offline almost >> immediately before calling rtas_stop_self(), we use that as a signal that >> it is either already stopped or very close to that point, and we can start >> the busy loop. >> >> As suggested by Michael Ellerman, this patch also changes the busy loop to >> wait for a fixed amount of wall time. >> >> Signed-off-by: Thiago Jung Bauermann >> --- >> arch/powerpc/platforms/pseries/hotplug-cpu.c | 10 +- >> 1 file changed, 9 insertions(+), 1 deletion(-) >> >> I tried to estimate good amounts for the timeout and loop delays, but >> I'm not sure how reasonable my numbers are. The busy loops will wait for >> 100 µs between each try, and spin_event_timeout() will timeout after >> 100 ms. I'll be happy to change these values if you have better >> suggestions. > > Based on the measurements that I did on a POWER9 system, in successful > cases of smp_query_cpu_stopped(cpu) returning affirmative, the maximum > time spent inside the loop was was 10ms. That's very good to know. I added this information to the patch description. I also added you in an Analyzed-by tag, I hope it's fine with you. >> Gautham was able to test this patch and it solved the race condition. >> >> v1 was a cruder patch which just increased the number of loops: >> https://lists.ozlabs.org/pipermail/linuxppc-dev/2017-February/153734.html >> >> v1 also mentioned a kernel crash but Gautham narrowed it down to a bug >> in RTAS, which is in the process of being fixed. 
>> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c >> b/arch/powerpc/platforms/pseries/hotplug-cpu.c >> index 97feb6e79f1a..424146cc752e 100644 >> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c >> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c >> @@ -214,13 +214,21 @@ static void pseries_cpu_die(unsigned int cpu) >> msleep(1); >> } >> } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { >> +/* >> + * If the current state is not offline yet, it means that the >> + * dying CPU (which is in pseries_mach_cpu_die) didn't have a >> + * chance to call rtas_stop_self yet and therefore it's too >> + * early to query if the CPU is stopped. >> + */ >> +spin_event_timeout(get_cpu_current_state(cpu) == >> CPU_STATE_OFFLINE, >> + 10, 100); >> >> for (tries = 0; tries < 25; tries++) { > > Can we bumped up the tries to 100, so that we wait for 10ms before > printing the warning message ? Good idea. I increased the loop to 200 iterations so that it can take up to 20 ms, just to be sure. >> cpu_status = smp_query_cpu_stopped(pcpu); >> if (cpu_status == QCSS_STOPPED || >> cpu_status == QCSS_HARDWARE_ERROR) >> break; >> -cpu_relax(); >> +udelay(100); >> } >> } >> -- Thiago Jung Bauermann IBM Linux Technology
Re: [PATCH 3/6] x86: clean up _TIF_SYSCALL_EMU handling using ptrace_syscall_enter hook
(I thought I had sent this email, last Tuesday itself, but saw this in my draft today, something went wrong, sorry for the delay) On Tue, Mar 05, 2019 at 02:14:47AM +, Haibo Xu (Arm Technology China) wrote: > On 2019/3/4 18:12, Sudeep Holla wrote: > > On Mon, Mar 04, 2019 at 08:25:28AM +, Haibo Xu (Arm Technology China) > > wrote: > >> On 2019/3/1 2:32, Sudeep Holla wrote: > >>> Now that we have a new hook ptrace_syscall_enter that can be called from > >>> syscall entry code and it handles PTRACE_SYSEMU in generic code, we > >>> can do some cleanup using the same in syscall_trace_enter. > >>> > >>> Further the extra logic to find single stepping PTRACE_SYSEMU_SINGLESTEP > >>> in syscall_slow_exit_work seems unnecessary. Let's remove the same. > >> > >> I think we should not change the logic here. Is so, it will double the > >> report of syscall > >> when PTRACE_SYSEMU_SINGLESTEP is enabled. > >> > > > > I don't think that should happen, but I may be missing something. > > Can you explain how ? > > > > When PTRACE_SYSEMU_SINGLESTEP is enabled, both the _TIF_SYSCALL_EMU and > _TIF_SINGLESTEP flags are set, but ptrace only need to report(send SIGTRAP) > at the entry of a system call, no need to report at the exit of a system > call. > Sorry, but I still not get it, we have: step = ((flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) == _TIF_SINGLESTEP); For me, this is same as: step = ((flags & _TIF_SINGLESTEP) == _TIF_SINGLESTEP) or if (flags & _TIF_SINGLESTEP) step = true; So when PTRACE_SYSEMU_SINGLESTEP, _TIF_SYSCALL_EMU and _TIF_SINGLESTEP are set and step evaluates to true. So dropping _TIF_SYSCALL_EMU here should be fine. Am I still missing something ? -- Regards, Sudeep
[PATCH 19/20] KVM: PPC: Book3S HV: Don't use compound_order to determine host mapping size
From: Nicholas Piggin THP paths can defer splitting compound pages until after the actual remap and TLB flushes to split a huge PMD/PUD. This causes radix partition scope page table mappings to get out of synch with the host qemu page table mappings. This results in random memory corruption in the guest when running with THP. The easiest way to reproduce is use KVM balloon to free up a lot of memory in the guest and then shrink the balloon to give the memory back, while some work is being done in the guest. Cc: David Gibson Cc: "Aneesh Kumar K.V" Cc: kvm-...@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Nicholas Piggin Signed-off-by: Paul Mackerras Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 91 +++--- 1 file changed, 37 insertions(+), 54 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 7efc42538ccf..ae023d2256ef 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -538,8 +538,8 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long ea, unsigned long dsisr) { struct kvm *kvm = vcpu->kvm; - unsigned long mmu_seq, pte_size; - unsigned long gpa, gfn, hva, pfn; + unsigned long mmu_seq; + unsigned long gpa, gfn, hva; struct kvm_memory_slot *memslot; struct page *page = NULL; long ret; @@ -636,9 +636,10 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ hva = gfn_to_hva_memslot(memslot, gfn); if (upgrade_p && __get_user_pages_fast(hva, 1, 1, ) == 1) { - pfn = page_to_pfn(page); upgrade_write = true; } else { + unsigned long pfn; + /* Call KVM generic code to do the slow-path check */ pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, writing, upgrade_p); @@ -652,63 +653,45 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } } - /* See if we can insert a 1GB or 2MB large PTE here */ - level = 0; - if (page && 
PageCompound(page)) { - pte_size = PAGE_SIZE << compound_order(compound_head(page)); - if (pte_size >= PUD_SIZE && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - pfn &= ~((PUD_SIZE >> PAGE_SHIFT) - 1); - } else if (pte_size >= PMD_SIZE && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); - } - } - /* -* Compute the PTE value that we need to insert. +* Read the PTE from the process' radix tree and use that +* so we get the shift and attribute bits. */ - if (page) { - pgflags = _PAGE_READ | _PAGE_EXEC | _PAGE_PRESENT | _PAGE_PTE | - _PAGE_ACCESSED; - if (writing || upgrade_write) - pgflags |= _PAGE_WRITE | _PAGE_DIRTY; - pte = pfn_pte(pfn, __pgprot(pgflags)); + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, ); + pte = *ptep; + local_irq_enable(); + + /* Get pte level from shift/size */ + if (shift == PUD_SHIFT && + (gpa & (PUD_SIZE - PAGE_SIZE)) == + (hva & (PUD_SIZE - PAGE_SIZE))) { + level = 2; + } else if (shift == PMD_SHIFT && + (gpa & (PMD_SIZE - PAGE_SIZE)) == + (hva & (PMD_SIZE - PAGE_SIZE))) { + level = 1; } else { - /* -* Read the PTE from the process' radix tree and use that -* so we get the attribute bits. -*/ - local_irq_disable(); - ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, ); - pte = *ptep; - local_irq_enable(); - if (shift == PUD_SHIFT && - (gpa & (PUD_SIZE - PAGE_SIZE)) == - (hva & (PUD_SIZE - PAGE_SIZE))) { - level = 2; - } else if (shift == PMD_SHIFT && - (gpa & (PMD_SIZE - PAGE_SIZE)) == - (hva & (PMD_SIZE - PAGE_SIZE))) { - level = 1; - } else if (shift && shift != PAGE_SHIFT) { - /* Adjust PFN */ - unsigned long mask = (1ul << shift) - PAGE_SIZE; - pte = __pte(pte_val(pte) | (hva & mask)); - } -
[PATCH RFC v4 21/21] powerpc/pci: Fix crash with enabled movable BARs
Check a resource for the UNSET flags. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fa6af52b5219..353b36727f6a 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2977,7 +2977,8 @@ static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, int index; int64_t rc; - if (!res || !res->flags || res->start > res->end) + if (!res || !res->flags || res->start > res->end || + (res->flags & IORESOURCE_UNSET)) return; if (res->flags & IORESOURCE_IO) { -- 2.20.1
[PATCH RFC v4 19/21] PCI: Prioritize fixed BAR assigning over the movable ones
The allocated bridge windows are big enough to house all the children bridges and BARs, but the fixed resources must be assigned first, so the movable ones later divide the rest of the window. That's the assignment order: 1. Bridge windows with fixed areas; 2. Fixed BARs; 3. The rest of BARs and bridge windows. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 69 - 1 file changed, 55 insertions(+), 14 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f4737339d5ec..932a6c020d10 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -272,31 +272,54 @@ static void reassign_resources_sorted(struct list_head *realloc_head, } } -/** - * assign_requested_resources_sorted() - satisfy resource requests - * - * @head : head of the list tracking requests for resources - * @fail_head : head of the list tracking requests that could - * not be allocated - * - * Satisfy resource requests of each element in the list. Add - * requests that could not satisfied to the failed_list. 
- */ -static void assign_requested_resources_sorted(struct list_head *head, -struct list_head *fail_head) +enum assign_step { + assign_fixed_bridge_windows, + assign_fixed_resources, + assign_float_resources, +}; + +static void _assign_requested_resources_sorted(struct list_head *head, + struct list_head *fail_head, + enum assign_step step) { struct resource *res; struct pci_dev_resource *dev_res; int idx; list_for_each_entry(dev_res, head, list) { + bool is_fixed; + bool is_fixed_bridge; + bool is_bridge; + if (pci_dev_is_ignored(dev_res->dev)) continue; res = dev_res->res; + if (!resource_size(res)) + continue; + idx = res - _res->dev->resource[0]; - if (resource_size(res) && - pci_assign_resource(dev_res->dev, idx)) { + is_fixed = res->flags & IORESOURCE_PCI_FIXED; + is_bridge = dev_res->dev->subordinate && idx >= PCI_BRIDGE_RESOURCES; + + if (is_bridge) { + struct pci_bus *child = dev_res->dev->subordinate; + int b_res_idx = pci_get_bridge_resource_idx(res); + struct resource *fixed_res = >fixed_range_hard[b_res_idx]; + + is_fixed_bridge = fixed_res->start < fixed_res->end; + } else { + is_fixed_bridge = false; + } + + if (assign_fixed_bridge_windows == step && !is_fixed_bridge) + continue; + else if (assign_fixed_resources == step && (!is_fixed || is_bridge)) + continue; + else if (assign_float_resources == step && (is_fixed || is_fixed_bridge)) + continue; + + if (pci_assign_resource(dev_res->dev, idx)) { if (fail_head) { /* * if the failed res is for ROM BAR, and it will @@ -315,6 +338,24 @@ static void assign_requested_resources_sorted(struct list_head *head, } } +/** + * assign_requested_resources_sorted() - satisfy resource requests + * + * @head : head of the list tracking requests for resources + * @fail_head : head of the list tracking requests that could + * not be allocated + * + * Satisfy resource requests of each element in the list. Add + * requests that could not satisfied to the failed_list. 
+ */ +static void assign_requested_resources_sorted(struct list_head *head, + struct list_head *fail_head) +{ + _assign_requested_resources_sorted(head, fail_head, assign_fixed_bridge_windows); + _assign_requested_resources_sorted(head, fail_head, assign_fixed_resources); + _assign_requested_resources_sorted(head, fail_head, assign_float_resources); +} + static unsigned long pci_fail_res_type_mask(struct list_head *fail_head) { struct pci_dev_resource *fail_res; -- 2.20.1
[PATCH RFC v4 18/21] PCI: Make sure bridge windows include their fixed BARs
Consider previously calculated boundaries when allocating a bridge window, setting the lowest allowed address and checking the result. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/bus.c | 2 +- drivers/pci/setup-res.c | 31 +-- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index a9784144d6f2..ce2d2aeedbd3 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -192,7 +192,7 @@ static int pci_bus_alloc_from_region(struct pci_bus *bus, struct resource *res, * this is an already-configured bridge window, its start * overrides "min". */ - if (avail.start) + if (min_used < avail.start) min_used = avail.start; max = avail.end; diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index 732d18f60f1b..04442339548d 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -248,9 +248,22 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, struct resource *res = dev->resource + resno; resource_size_t min; int ret; + resource_size_t start = (resource_size_t)-1; + resource_size_t end = 0; min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM; + if (dev->subordinate && resno >= PCI_BRIDGE_RESOURCES) { + struct pci_bus *child_bus = dev->subordinate; + int b_resno = resno - PCI_BRIDGE_RESOURCES; + resource_size_t soft_start = child_bus->fixed_range_soft[b_resno].start; + + start = child_bus->fixed_range_hard[b_resno].start; + end = child_bus->fixed_range_hard[b_resno].end; + if (start < end) + min = soft_start; + } + /* * First, try exact prefetching match. 
Even if a 64-bit * prefetchable bridge window is below 4GB, we can't put a 32-bit @@ -262,7 +275,7 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, IORESOURCE_PREFETCH | IORESOURCE_MEM_64, pcibios_align_resource, dev); if (ret == 0) - return 0; + goto check_fixed; /* * If the prefetchable window is only 32 bits wide, we can put @@ -274,7 +287,7 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, IORESOURCE_PREFETCH, pcibios_align_resource, dev); if (ret == 0) - return 0; + goto check_fixed; } /* @@ -287,6 +300,20 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev, ret = pci_bus_alloc_resource(bus, res, size, align, min, 0, pcibios_align_resource, dev); +check_fixed: + if (ret == 0 && start < end) { + if (res->start > start || res->end < end) { + dev_err(>dev, "%s: fixed area 0x%llx-0x%llx for %s doesn't fit in the allocated %pR (0x%llx-0x%llx)", + __func__, + (unsigned long long)start, (unsigned long long)end, + dev_name(>dev), + res, (unsigned long long)res->start, + (unsigned long long)res->end); + release_resource(res); + return -1; + } + } + return ret; } -- 2.20.1
[PATCH RFC v4 20/21] PCI: pciehp: Add support for the movable BARs feature
With movable BARs, adding a hotplugged device may affect all the PCIe domain starting from the root, so use a pci_rescan_bus() function which handles the rearrangement of existing BARs and bridge windows. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/hotplug/pciehp_pci.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c index b9c1396db6fe..7c0871db5bae 100644 --- a/drivers/pci/hotplug/pciehp_pci.c +++ b/drivers/pci/hotplug/pciehp_pci.c @@ -56,12 +56,16 @@ int pciehp_configure_device(struct controller *ctrl) goto out; } - for_each_pci_bridge(dev, parent) - pci_hp_add_bridge(dev); + if (pci_movable_bars_enabled()) { + pci_rescan_bus(parent); + } else { + for_each_pci_bridge(dev, parent) + pci_hp_add_bridge(dev); - pci_assign_unassigned_bridge_resources(bridge); - pcie_bus_configure_settings(parent); - pci_bus_add_devices(parent); + pci_assign_unassigned_bridge_resources(bridge); + pcie_bus_configure_settings(parent); + pci_bus_add_devices(parent); + } out: pci_unlock_rescan_remove(); -- 2.20.1
[PATCH RFC v4 12/21] PCI: Don't allow hotplugged devices to steal resources
When movable BARs are enabled, the PCI subsystem at first releases all the bridge windows and then performs an attempt to assign new requested resources and re-assign the existing ones. If a hotplugged device gets its resources first, there could be no space left to re-assign resources of already working devices, which is unacceptable. If this happens, this patch marks one of the new devices with the new introduced flag PCI_DEV_IGNORE and retries the resource assignment. This patch adds a new res_mask bitmask to the struct pci_dev for storing the indices of assigned resources. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/bus.c | 5 ++ drivers/pci/pci.h | 11 + drivers/pci/probe.c | 100 +++- drivers/pci/setup-bus.c | 15 ++ include/linux/pci.h | 1 + 5 files changed, 130 insertions(+), 2 deletions(-) diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 5cb40b2518f9..a9784144d6f2 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -311,6 +311,11 @@ void pci_bus_add_device(struct pci_dev *dev) { int retval; + if (pci_dev_is_ignored(dev)) { + pci_warn(dev, "%s: don't enable the ignored device\n", __func__); + return; + } + /* * Can not put in pci_device_add yet because resources * are not assigned yet for some devices. 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index e06e8692a7b1..56b905068ac5 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -366,6 +366,7 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 +#define PCI_DEV_IGNORE 1 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) { @@ -377,6 +378,16 @@ static inline bool pci_dev_is_added(const struct pci_dev *dev) return test_bit(PCI_DEV_ADDED, >priv_flags); } +static inline void pci_dev_ignore(struct pci_dev *dev, bool ignore) +{ + assign_bit(PCI_DEV_IGNORE, >priv_flags, ignore); +} + +static inline bool pci_dev_is_ignored(const struct pci_dev *dev) +{ + return test_bit(PCI_DEV_IGNORE, >priv_flags); +} + #ifdef CONFIG_PCIEAER #include diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 692752c71f71..62f4058a001f 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -3248,6 +3248,23 @@ unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge) return max; } +static unsigned int pci_dev_res_mask(struct pci_dev *dev) +{ + unsigned int res_mask = 0; + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) { + struct resource *r = >resource[i]; + + if (!r->flags || (r->flags & IORESOURCE_UNSET) || !r->parent) + continue; + + res_mask |= (1 << i); + } + + return res_mask; +} + static void pci_bus_rescan_prepare(struct pci_bus *bus) { struct pci_dev *dev; @@ -3257,6 +3274,8 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus) list_for_each_entry(dev, >devices, bus_list) { struct pci_bus *child = dev->subordinate; + dev->res_mask = pci_dev_res_mask(dev); + if (child) { pci_bus_rescan_prepare(child); } else if (dev->driver && @@ -3318,6 +3337,84 @@ static void pci_setup_bridges(struct pci_bus *bus) pci_setup_bridge(bus); } +static struct pci_dev *pci_find_next_new_device(struct pci_bus *bus) +{ + struct pci_dev *dev; + + if (!bus) + return NULL; + + list_for_each_entry(dev, >devices, bus_list) { + 
struct pci_bus *child_bus = dev->subordinate; + + if (!pci_dev_is_added(dev) && !pci_dev_is_ignored(dev)) + return dev; + + if (child_bus) { + struct pci_dev *next_new_dev; + + next_new_dev = pci_find_next_new_device(child_bus); + if (next_new_dev) + return next_new_dev; + } + } + + return NULL; +} + +static bool pci_bus_validate_resources(struct pci_bus *bus) +{ + struct pci_dev *dev; + bool ret = true; + + if (!bus) + return false; + + list_for_each_entry(dev, >devices, bus_list) { + struct pci_bus *child = dev->subordinate; + unsigned int res_mask = pci_dev_res_mask(dev); + + if (pci_dev_is_ignored(dev)) + continue; + + if (dev->res_mask & ~res_mask) { + pci_err(dev, "%s: Non-re-enabled resources found: 0x%x -> 0x%x\n", + __func__, dev->res_mask, res_mask); + ret = false; + } + + if (child && !pci_bus_validate_resources(child)) + ret = false; +
[PATCH RFC v4 16/21] PCI: Calculate fixed areas of bridge windows based on fixed BARs
For every (IO, MEM, MEM64) bridge window, count the fixed resources of its children endpoints and children bridge windows: | <- BAR -> || <- child bus fixed_range_hard -> | | <- fixed BAR -> | | <-bus's fixed_range_hard-> | | <- bus's bridge window -> | These ranges will be later used to arrange bridge windows in a way which covers every immovable BAR as well as the movable ones during hotplug. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/pci.h | 14 +++ drivers/pci/probe.c | 82 + drivers/pci/setup-bus.c | 17 + include/linux/pci.h | 6 +++ 4 files changed, 119 insertions(+) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 56b905068ac5..14e3ebe68010 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -364,6 +364,20 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) return dev->error_state == pci_channel_io_perm_failure; } +static inline int pci_get_bridge_resource_idx(struct resource *r) +{ + int idx = 1; + + if (r->flags & IORESOURCE_IO) + idx = 0; + else if (!(r->flags & IORESOURCE_PREFETCH)) + idx = 1; + else if (r->flags & IORESOURCE_MEM_64) + idx = 2; + + return idx; +} + /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 #define PCI_DEV_IGNORE 1 diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 62f4058a001f..70b15654f253 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -551,6 +551,7 @@ void pci_read_bridge_bases(struct pci_bus *child) static struct pci_bus *pci_alloc_bus(struct pci_bus *parent) { struct pci_bus *b; + int idx; b = kzalloc(sizeof(*b), GFP_KERNEL); if (!b) @@ -567,6 +568,11 @@ static struct pci_bus *pci_alloc_bus(struct pci_bus *parent) if (parent) b->domain_nr = parent->domain_nr; #endif + for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) { + b->fixed_range_hard[idx].start = (resource_size_t)-1; + b->fixed_range_hard[idx].end = 0; + } + return b; } @@ -3337,6 +3343,81 @@ static void pci_setup_bridges(struct pci_bus *bus) pci_setup_bridge(bus); } +static void 
pci_bus_update_fixed_range_hard(struct pci_bus *bus) +{ + struct pci_dev *dev; + int idx; + resource_size_t start, end; + + for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) { + bus->fixed_range_hard[idx].start = (resource_size_t)-1; + bus->fixed_range_hard[idx].end = 0; + } + + list_for_each_entry(dev, >devices, bus_list) + if (dev->subordinate) + pci_bus_update_fixed_range_hard(dev->subordinate); + + list_for_each_entry(dev, >devices, bus_list) { + int i; + + for (i = 0; i < PCI_BRIDGE_RESOURCES; ++i) { + struct resource *r = >resource[i]; + + if (!r->flags || (r->flags & IORESOURCE_UNSET) || !r->parent) + continue; + + if (r->flags & IORESOURCE_PCI_FIXED) { + idx = pci_get_bridge_resource_idx(r); + start = bus->fixed_range_hard[idx].start; + end = bus->fixed_range_hard[idx].end; + + if (start > r->start) + start = r->start; + if (end < r->end) + end = r->end; + + if (bus->fixed_range_hard[idx].start != start || + bus->fixed_range_hard[idx].end != end) { + dev_dbg(>dev, "%s: Found fixed 0x%llx-0x%llx in %s, expand the fixed bridge window %d to 0x%llx-0x%llx\n", + __func__, + (unsigned long long)r->start, + (unsigned long long)r->end, + dev_name(>dev), idx, + (unsigned long long)start, + (unsigned long long)end); + bus->fixed_range_hard[idx].start = start; + bus->fixed_range_hard[idx].end = end; + } + } + } + + if (dev->subordinate) { + struct pci_bus *child = dev->subordinate; + + for (idx = 0; idx < PCI_BRIDGE_RESOURCE_NUM; ++idx) { + start = bus->fixed_range_hard[idx].start; + end =
[PATCH RFC v4 17/21] PCI: Calculate boundaries for bridge windows
If a bridge window contains fixed areas (there are PCIe devices with immovable BARs located on this bus), this window must be allocated within the bound memory area, limited by windows size and by address range of fixed resources, calculated as follows: | <-- bus's fixed_range_hard --> | | <-- fixed_range_hard.end - window size --> | | <-- fixed_range_hard.start + window size --> | | <--bus's fixed_range_soft--> | Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 56 + include/linux/pci.h | 4 ++- 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index a1fd7f3c5ea8..f4737339d5ec 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1809,6 +1809,61 @@ static enum enable_type pci_realloc_detect(struct pci_bus *bus, } #endif +static void pci_bus_update_fixed_range_soft(struct pci_bus *bus) +{ + struct pci_dev *dev; + struct pci_bus *parent = bus->parent; + int idx; + + list_for_each_entry(dev, >devices, bus_list) + if (dev->subordinate) + pci_bus_update_fixed_range_soft(dev->subordinate); + + if (!parent || !bus->self) + return; + + for (idx = 0; idx < ARRAY_SIZE(bus->fixed_range_hard); ++idx) { + struct resource *r; + resource_size_t soft_start, soft_end; + resource_size_t hard_start = bus->fixed_range_hard[idx].start; + resource_size_t hard_end = bus->fixed_range_hard[idx].end; + + if (hard_start > hard_end) + continue; + + r = bus->resource[idx]; + + soft_start = hard_end - resource_size(r) + 1; + soft_end = hard_start + resource_size(r) - 1; + + if (soft_start > hard_start) + soft_start = hard_start; + + if (soft_end < hard_end) + soft_end = hard_end; + + list_for_each_entry(dev, >devices, bus_list) { + struct pci_bus *sibling = dev->subordinate; + resource_size_t s_start, s_end; + + if (!sibling || sibling == bus) + continue; + + s_start = sibling->fixed_range_hard[idx].start; + s_end = sibling->fixed_range_hard[idx].end; + + if (s_start > s_end) + continue; + + if 
(s_end < hard_start && s_end > soft_start) + soft_start = s_end; + } + + bus->fixed_range_soft[idx].start = soft_start; + bus->fixed_range_soft[idx].end = soft_end; + } +} + /* * first try will not touch pci bridge res * second and later try will clear small leaf bridge res @@ -1847,6 +1902,7 @@ void pci_assign_unassigned_root_bus_resources(struct pci_bus *bus) /* Depth first, calculate sizes and alignments of all subordinate buses. */ __pci_bus_size_bridges(bus, add_list); + pci_bus_update_fixed_range_soft(bus); /* Depth last, allocate resources and update the hardware. */ __pci_bus_assign_resources(bus, add_list, _head); diff --git a/include/linux/pci.h b/include/linux/pci.h index 7a4d62d84bc1..75a56db73ad4 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -574,9 +574,11 @@ struct pci_bus { /* * If there are fixed resources in the bridge window, the hard range -* contains the lowest and the highest addresses of them. +* contains the lowest and the highest addresses of them, and this +* bridge window must reside within the soft range. */ struct resource fixed_range_hard[PCI_BRIDGE_RESOURCE_NUM]; + struct resource fixed_range_soft[PCI_BRIDGE_RESOURCE_NUM]; struct pci_ops *ops; /* Configuration access functions */ struct msi_controller *msi; /* MSI controller */ -- 2.20.1
[PATCH RFC v4 00/21] PCI: Allow BAR movement during hotplug
If the firmware or kernel has arranged memory for PCIe devices in a way that doesn't provide enough space for BARs of a new hotplugged device, the kernel can pause the drivers of the "obstructing" devices and move their BARs, so new BARs can fit into the freed spaces. When a driver is un-paused by the kernel after the PCIe rescan, it should check if its BARs had moved, and ioremap() them if needed. Drivers indicate their support of the feature by implementing the new rescan_prepare() and rescan_done() hooks in the struct pci_driver. If a driver doesn't yet support the feature, BARs of its devices will be marked as immovable by the IORESOURCE_PCI_FIXED flag. To re-arrange the BARs and bridge windows this patch releases all of them after a rescan and re-assigns in the same way as during the initial PCIe topology scan at system boot. Tested on: - x86_64 with "pci=realloc,assign-busses,use_crs pcie_movable_bars=force" - POWER8 PowerNV+PHB3 ppc64le with [1] and [2] applied and the following: "pci=realloc pcie_movable_bars=force" Not so many platforms and test cases were covered, so all who are interested are highly welcome to test on your setups - the more exotic the better! This patchset is a part of our work on adding support for hotplugging bridges full of NVME and GPU devices without special requirements such as Hot-Plug Controller, reservation of bus numbers or memory regions by firmware, etc. Future work will be devoted to implementing the movable bus numbers. [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2019-March/186618.html [2] https://lists.ozlabs.org/pipermail/skiboot/2019-March/013571.html Changes since v3: - Rebased to the upstream, so the patches apply cleanly again. 
Changes since v2: - Fixed double-assignment of bridge windows; - Fixed assignment of fixed prefetched resources; - Fixed releasing of fixed resources; - Fixed a debug message; - Removed auto-enabling the movable BARs for x86 - let's rely on the "pcie_movable_bars=force" option for now; - Reordered the patches - bugfixes first. Changes since v1: - Add a "pcie_movable_bars={ off | force }" command line argument; - Handle the IORESOURCE_PCI_FIXED flag properly; - Don't move BARs of devices which don't support the feature; - Guarantee that new hotplugged devices will not steal memory from working devices by ignoring the failing new devices with the new PCI_DEV_IGNORE flag; - Add rescan_prepare()+rescan_done() to the struct pci_driver instead of using the reset_prepare()+reset_done() from struct pci_error_handlers; - Add a bugfix of a race condition; - Fixed hotplug in a non-pre-enabled (by BIOS/firmware) bridge; - Fix the compatibility of the feature with pm_runtime and D3-state; - Hotplug events from pciehp also can move BARs; - Add support of the feature to the NVME driver. 
Sergey Miroshnichenko (21): PCI: Fix writing invalid BARs during pci_restore_state() PCI: Fix race condition in pci_enable/disable_device() PCI: Enable bridge's I/O and MEM access for hotplugged devices PCI: Define PCI-specific version of the release_child_resources() PCI: hotplug: Add a flag for the movable BARs feature PCI: Pause the devices with movable BARs during rescan PCI: Wake up bridges during rescan when movable BARs enabled nvme-pci: Handle movable BARs PCI: Mark immovable BARs with PCI_FIXED PCI: Fix assigning of fixed prefetchable resources PCI: Release and reassign the root bridge resources during rescan PCI: Don't allow hotplugged devices to steal resources PCI: Include fixed BARs into the bus size calculating PCI: Don't reserve memory for hotplug when enabled movable BARs PCI: Allow the failed resources to be reassigned later PCI: Calculate fixed areas of bridge windows based on fixed BARs PCI: Calculate boundaries for bridge windows PCI: Make sure bridge windows include their fixed BARs PCI: Prioritize fixed BAR assigning over the movable ones PCI: pciehp: Add support for the movable BARs feature powerpc/pci: Fix crash with enabled movable BARs .../admin-guide/kernel-parameters.txt | 7 + arch/powerpc/platforms/powernv/pci-ioda.c | 3 +- drivers/nvme/host/pci.c | 29 +- drivers/pci/bus.c | 7 +- drivers/pci/hotplug/pciehp_pci.c | 14 +- drivers/pci/pci.c | 60 +++- drivers/pci/pci.h | 26 ++ drivers/pci/probe.c | 271 +- drivers/pci/setup-bus.c | 245 ++-- drivers/pci/setup-res.c | 43 ++- include/linux/pci.h | 14 + 11 files changed, 678 insertions(+), 41 deletions(-) -- 2.20.1
[PATCH RFC v4 15/21] PCI: Allow the failed resources to be reassigned later
Don't lose the size of the requested EP's BAR if it can't be fit in a current trial, so this can be retried. But a failed bridge window must be dropped and recalculated in the next trial. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 3 ++- drivers/pci/setup-res.c | 12 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index f9d605cd1725..c1559a4a8564 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -309,7 +309,8 @@ static void assign_requested_resources_sorted(struct list_head *head, 0 /* don't care */, 0 /* don't care */); } - reset_resource(res); + if (!pci_movable_bars_enabled()) + reset_resource(res); } } } diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index d8ca40a97693..732d18f60f1b 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -298,6 +298,18 @@ static int _pci_assign_resource(struct pci_dev *dev, int resno, bus = dev->bus; while ((ret = __pci_assign_resource(bus, dev, resno, size, min_align))) { + if (pci_movable_bars_enabled()) { + if (resno >= PCI_BRIDGE_RESOURCES && + resno <= PCI_BRIDGE_RESOURCE_END) { + struct resource *res = dev->resource + resno; + + res->start = 0; + res->end = 0; + res->flags = 0; + } + break; + } + if (!bus->parent || !bus->self->transparent) break; bus = bus->parent; -- 2.20.1
[PATCH RFC v4 14/21] PCI: Don't reserve memory for hotplug when enabled movable BARs
pbus_size_mem() returns a precise amount of memory required to fit all the requested BARs and windows of children bridges. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 9d93f2b32bf1..f9d605cd1725 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1229,7 +1229,7 @@ void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head) case PCI_HEADER_TYPE_BRIDGE: pci_bridge_check_ranges(bus); - if (bus->self->is_hotplug_bridge) { + if (bus->self->is_hotplug_bridge && !pci_movable_bars_enabled()) { additional_io_size = pci_hotplug_io_size; additional_mem_size = pci_hotplug_mem_size; } -- 2.20.1
[PATCH RFC v4 11/21] PCI: Release and reassign the root bridge resources during rescan
When the movable BARs feature is enabled, don't rely on the memory gaps reserved by the BIOS/bootloader/firmware, but instead rearrange the BARs and bridge windows starting from the root. Endpoint device's BARs, after being released, are resorted and written back by the pci_assign_unassigned_root_bus_resources(). The last step of writing the recalculated windows to the bridges is done by the new pci_setup_bridges() function. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/pci.h | 1 + drivers/pci/probe.c | 22 ++ drivers/pci/setup-bus.c | 11 ++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 224d88634115..e06e8692a7b1 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -248,6 +248,7 @@ void __pci_bus_assign_resources(const struct pci_bus *bus, struct list_head *realloc_head, struct list_head *fail_head); bool pci_bus_clip_resource(struct pci_dev *dev, int idx); +void pci_bus_release_root_bridge_resources(struct pci_bus *bus); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 1cf6ec960236..692752c71f71 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -3299,6 +3299,25 @@ static void pci_bus_rescan_done(struct pci_bus *bus) pm_runtime_put(>dev); } +static void pci_setup_bridges(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, >devices, bus_list) { + struct pci_bus *child; + + if (!pci_dev_is_added(dev) || pci_dev_is_ignored(dev)) + continue; + + child = dev->subordinate; + if (child) + pci_setup_bridges(child); + } + + if (bus->self) + pci_setup_bridge(bus); +} + /** * pci_rescan_bus - Scan a PCI bus for devices * @bus: PCI bus to scan @@ -3321,8 +3340,11 @@ unsigned int pci_rescan_bus(struct pci_bus *bus) pci_bus_rescan_prepare(root); max = pci_scan_child_bus(root); + + pci_bus_release_root_bridge_resources(root); 
pci_assign_unassigned_root_bus_resources(root); + pci_setup_bridges(root); pci_bus_rescan_done(root); } else { max = pci_scan_child_bus(bus); diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index be7d4e6d7b65..36a1907d9509 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1584,7 +1584,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus, pci_printk(KERN_DEBUG, dev, "resource %d %pR released\n", PCI_BRIDGE_RESOURCES + idx, r); /* keep the old size */ - r->end = resource_size(r) - 1; + r->end = pci_movable_bars_enabled() ? 0 : (resource_size(r) - 1); r->start = 0; r->flags = 0; @@ -1637,6 +1637,15 @@ static void pci_bus_release_bridge_resources(struct pci_bus *bus, pci_bridge_release_resources(bus, type); } +void pci_bus_release_root_bridge_resources(struct pci_bus *root_bus) +{ + pci_bus_release_bridge_resources(root_bus, IORESOURCE_IO, whole_subtree); + pci_bus_release_bridge_resources(root_bus, IORESOURCE_MEM, whole_subtree); + pci_bus_release_bridge_resources(root_bus, +IORESOURCE_MEM_64 | IORESOURCE_PREFETCH, +whole_subtree); +} + static void pci_bus_dump_res(struct pci_bus *bus) { struct resource *res; -- 2.20.1
[PATCH RFC v4 13/21] PCI: Include fixed BARs into the bus size calculating
The only difference between the fixed and movable BARs is an offset preservation during the release+reassign procedure on PCIe rescan. When fixed BARs are included into the result of pbus_size_mem(), these BARs can be restricted: assign them to direct parents only. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 551108f48df7..9d93f2b32bf1 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1007,12 +1007,20 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask, struct resource *r = &dev->resource[i]; resource_size_t r_size; - if (r->parent || (r->flags & IORESOURCE_PCI_FIXED) || + if (r->parent || ((r->flags & mask) != type && (r->flags & mask) != type2 && (r->flags & mask) != type3)) continue; r_size = resource_size(r); + + if (r->flags & IORESOURCE_PCI_FIXED) { + if (pci_movable_bars_enabled()) + size += r_size; + + continue; + } + #ifdef CONFIG_PCI_IOV /* put SRIOV requested res to the optional list */ if (realloc_head && i >= PCI_IOV_RESOURCES && @@ -1351,6 +1359,8 @@ static void pdev_assign_fixed_resources(struct pci_dev *dev) while (b && !r->parent) { assign_fixed_resource_on_bus(b, r); b = b->parent; + if (!r->parent && pci_movable_bars_enabled()) + break; } } } -- 2.20.1
[PATCH RFC v4 10/21] PCI: Fix assigning of fixed prefetchable resources
Allow matching them to non-prefetchable windows, as it is done for movable resources. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 3644feb13179..be7d4e6d7b65 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1301,15 +1301,20 @@ static void assign_fixed_resource_on_bus(struct pci_bus *b, struct resource *r) { int i; struct resource *parent_r; - unsigned long mask = IORESOURCE_IO | IORESOURCE_MEM | -IORESOURCE_PREFETCH; + unsigned long mask = IORESOURCE_TYPE_BITS; pci_bus_for_each_resource(b, parent_r, i) { if (!parent_r) continue; - if ((r->flags & mask) == (parent_r->flags & mask) && - resource_contains(parent_r, r)) + if ((r->flags & mask) != (parent_r->flags & mask)) + continue; + + if (parent_r->flags & IORESOURCE_PREFETCH && + !(r->flags & IORESOURCE_PREFETCH)) + continue; + + if (resource_contains(parent_r, r)) request_resource(parent_r, r); } } -- 2.20.1
[PATCH RFC v4 09/21] PCI: Mark immovable BARs with PCI_FIXED
If a PCIe device driver doesn't yet have support for movable BARs, mark device's BARs with IORESOURCE_PCI_FIXED. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/probe.c | 15 +++ 1 file changed, 15 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index dc935f82a595..1cf6ec960236 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -3262,6 +3262,21 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus) } else if (dev->driver && dev->driver->rescan_prepare) { dev->driver->rescan_prepare(dev); + } else if (dev->driver || ((dev->class >> 8) == PCI_CLASS_DISPLAY_VGA)) { + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + struct resource *r = >resource[i]; + + if (!r->flags || !r->parent || + (r->flags & IORESOURCE_UNSET) || + (r->flags & IORESOURCE_PCI_FIXED)) + continue; + + r->flags |= IORESOURCE_PCI_FIXED; + pci_warn(dev, "%s: no support for movable BARs, mark BAR %d (%pR) as fixed\n", +__func__, i, r); + } } } } -- 2.20.1
[PATCH RFC v4 08/21] nvme-pci: Handle movable BARs
Hotplugged devices can affect the existing ones by moving their BARs. PCI subsystem will inform the NVME driver about this by invoking reset_prepare()+reset_done(), then iounmap()+ioremap() must be called. Signed-off-by: Sergey Miroshnichenko --- drivers/nvme/host/pci.c | 29 +++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 92bad1c810ac..ccea3033a67a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -106,6 +106,7 @@ struct nvme_dev { unsigned int num_vecs; int q_depth; u32 db_stride; + resource_size_t current_phys_bar; void __iomem *bar; unsigned long bar_mapped_size; struct work_struct remove_work; @@ -1672,13 +1673,16 @@ static int nvme_remap_bar(struct nvme_dev *dev, unsigned long size) { struct pci_dev *pdev = to_pci_dev(dev->dev); - if (size <= dev->bar_mapped_size) + if (dev->bar && + dev->current_phys_bar == pci_resource_start(pdev, 0) && + size <= dev->bar_mapped_size) return 0; if (size > pci_resource_len(pdev, 0)) return -ENOMEM; if (dev->bar) iounmap(dev->bar); - dev->bar = ioremap(pci_resource_start(pdev, 0), size); + dev->current_phys_bar = pci_resource_start(pdev, 0); + dev->bar = ioremap(dev->current_phys_bar, size); if (!dev->bar) { dev->bar_mapped_size = 0; return -ENOMEM; @@ -2504,6 +2508,8 @@ static void nvme_reset_work(struct work_struct *work) if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) goto out; + nvme_remap_bar(dev, db_bar_size(dev, 0)); + /* * If we're called to reset a live controller first shut it down before * moving on. 
@@ -2910,6 +2916,23 @@ static void nvme_error_resume(struct pci_dev *pdev) flush_work(>ctrl.reset_work); } +void nvme_rescan_prepare(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + nvme_dev_disable(dev, false); + nvme_dev_unmap(dev); + dev->bar = NULL; +} + +void nvme_rescan_done(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + nvme_dev_map(dev); + nvme_reset_ctrl_sync(>ctrl); +} + static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, .slot_reset = nvme_slot_reset, @@ -2974,6 +2997,8 @@ static struct pci_driver nvme_driver = { }, .sriov_configure = pci_sriov_configure_simple, .err_handler= _err_handler, + .rescan_prepare = nvme_rescan_prepare, + .rescan_done= nvme_rescan_done, }; static int __init nvme_init(void) -- 2.20.1
[PATCH RFC v4 07/21] PCI: Wake up bridges during rescan when movable BARs enabled
Use the PM runtime methods to wake up the bridges before accessing their config space. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/probe.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 88350dd56344..dc935f82a595 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -3252,6 +3252,8 @@ static void pci_bus_rescan_prepare(struct pci_bus *bus) { struct pci_dev *dev; + pm_runtime_get_sync(&bus->dev); + list_for_each_entry(dev, &bus->devices, bus_list) { struct pci_bus *child = dev->subordinate; @@ -3278,6 +3280,8 @@ static void pci_bus_rescan_done(struct pci_bus *bus) dev->driver->rescan_done(dev); } } + + pm_runtime_put(&bus->dev); } /** -- 2.20.1
[PATCH RFC v4 04/21] PCI: Define PCI-specific version of the release_child_resources()
Make the released resources of a bridge valid for later re-assignment: clear the STARTALIGN flag. Resources marked with PCI_FIXED must preserve their offset and size. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/setup-bus.c | 47 - 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index ec44a0f3a7ac..3644feb13179 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -1483,6 +1483,51 @@ static void __pci_bridge_assign_resources(const struct pci_dev *bridge, (IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH |\ IORESOURCE_MEM_64) +/* + * Similar to release_child_resources(), but aware of PCI_FIXED and STARTALIGN flags + */ +static void pci_release_child_resources(struct resource *r) +{ + struct resource *tmp, *p; + + if (!r) + return; + + if (r->flags & IORESOURCE_PCI_FIXED) + return; + + p = r->child; + r->child = NULL; + while (p) { + resource_size_t size = resource_size(p); + + tmp = p; + p = p->sibling; + + tmp->parent = NULL; + tmp->sibling = NULL; + pci_release_child_resources(tmp); + + if (!tmp->flags) + continue; + + if (tmp->flags & IORESOURCE_PCI_FIXED) { + pr_debug("PCI: release fixed %pR (%s), keep its flags, base and size\n", +tmp, tmp->name); + continue; + } + + pr_debug("PCI: release %pR (%s)\n", tmp, tmp->name); + + /* need to restore size, and keep all the flags but STARTALIGN */ + tmp->start = 0; + tmp->end = size - 1; + + tmp->flags &= ~IORESOURCE_STARTALIGN; + tmp->flags |= IORESOURCE_SIZEALIGN; + } +} + static void pci_bridge_release_resources(struct pci_bus *bus, unsigned long type) { @@ -1528,7 +1573,7 @@ static void pci_bridge_release_resources(struct pci_bus *bus, * if there are children under that, we should release them * all */ - release_child_resources(r); + pci_release_child_resources(r); if (!release_resource(r)) { type = old_flags = r->flags & PCI_RES_TYPE_MASK; pci_printk(KERN_DEBUG, dev, "resource %d %pR released\n", -- 2.20.1
[PATCH RFC v4 05/21] PCI: hotplug: Add a flag for the movable BARs feature
If a new PCIe device has been hot-plugged between the two active ones without big enough gap between their BARs, these BARs should be moved if their drivers support this feature. The drivers should be notified and paused during the procedure: 1) dev 8 (new) | v .. | dev 3 | dev 3 | dev 5 | dev 7 | .. | BAR 0 | BAR 1 | BAR 0 | BAR 0 | 2) dev 8 | v .. | dev 3 | dev 3 | --> --> | dev 5 | dev 7 | .. | BAR 0 | BAR 1 | --> --> | BAR 0 | BAR 0 | 3) .. | dev 3 | dev 3 | dev 8 | dev 8 | dev 5 | dev 7 | .. | BAR 0 | BAR 1 | BAR 0 | BAR 1 | BAR 0 | BAR 0 | Thus, prior reservation of memory regions by BIOS/bootloader/firmware is not required anymore for the PCIe hotplug. The PCI_MOVABLE_BARS flag is set by the platform is this feature is supported and tested, but can be overridden by the following command line option: pcie_movable_bars={ off | force } Signed-off-by: Sergey Miroshnichenko --- .../admin-guide/kernel-parameters.txt | 7 ++ drivers/pci/pci.c | 24 +++ include/linux/pci.h | 2 ++ 3 files changed, 33 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2b8ee90bb644..d40eaf993f80 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3417,6 +3417,13 @@ nomsi Do not use MSI for native PCIe PME signaling (this makes all PCIe root ports use INTx for all services). 
+ pcie_movable_bars=[PCIE] + Override the movable BARs support detection: + off + Disable even if supported by the platform + force + Enable even if not explicitly declared as supported + pcmv= [HW,PCMCIA] BadgePAD 4 pd_ignore_unused diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 69898fe5255e..4dac49a887ec 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -139,6 +139,30 @@ static int __init pcie_port_pm_setup(char *str) } __setup("pcie_port_pm=", pcie_port_pm_setup); +static bool pcie_movable_bars_off; +static bool pcie_movable_bars_force; +static int __init pcie_movable_bars_setup(char *str) +{ + if (!strcmp(str, "off")) + pcie_movable_bars_off = true; + else if (!strcmp(str, "force")) + pcie_movable_bars_force = true; + return 1; +} +__setup("pcie_movable_bars=", pcie_movable_bars_setup); + +bool pci_movable_bars_enabled(void) +{ + if (pcie_movable_bars_off) + return false; + + if (pcie_movable_bars_force) + return true; + + return pci_has_flag(PCI_MOVABLE_BARS); +} +EXPORT_SYMBOL(pci_movable_bars_enabled); + /* Time to wait after a reset for device to become responsive */ #define PCIE_RESET_READY_POLL_MS 6 diff --git a/include/linux/pci.h b/include/linux/pci.h index cb2760a31fe2..cbe661aff9f5 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -866,6 +866,7 @@ enum { PCI_ENABLE_PROC_DOMAINS = 0x0010, /* Enable domains in /proc */ PCI_COMPAT_DOMAIN_0 = 0x0020, /* ... except domain 0 */ PCI_SCAN_ALL_PCIE_DEVS = 0x0040, /* Scan all, not just dev 0 */ + PCI_MOVABLE_BARS= 0x0080, /* Runtime BAR reassign after hotplug */ }; /* These external functions are only available when PCI support is enabled */ @@ -1345,6 +1346,7 @@ unsigned char pci_bus_max_busnr(struct pci_bus *bus); void pci_setup_bridge(struct pci_bus *bus); resource_size_t pcibios_window_alignment(struct pci_bus *bus, unsigned long type); +bool pci_movable_bars_enabled(void); #define PCI_VGA_STATE_CHANGE_BRIDGE (1 << 0) #define PCI_VGA_STATE_CHANGE_DECODES (1 << 1) -- 2.20.1
[PATCH RFC v4 03/21] PCI: Enable bridge's I/O and MEM access for hotplugged devices
After updating the bridge window resources, the PCI_COMMAND_IO and PCI_COMMAND_MEMORY bits of the bridge must be addressed as well. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/pci.c | 8 1 file changed, 8 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 895201d4c9e6..69898fe5255e 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1622,6 +1622,14 @@ static void pci_enable_bridge(struct pci_dev *dev) pci_enable_bridge(bridge); if (pci_is_enabled(dev)) { + int i, bars = 0; + + for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) { + if (dev->resource[i].flags & (IORESOURCE_MEM | IORESOURCE_IO)) + bars |= (1 << i); + } + do_pci_enable_device(dev, bars); + if (!dev->is_busmaster) pci_set_master(dev); mutex_unlock(>enable_mutex); -- 2.20.1
[PATCH RFC v4 06/21] PCI: Pause the devices with movable BARs during rescan
Drivers indicate their support of movable BARs by implementing the new rescan_prepare() and rescan_done() hooks in the struct pci_driver. All device's activity must be stopped during a rescan, and iounmap() +ioremap() must be applied to every used BAR. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/probe.c | 51 +++-- include/linux/pci.h | 2 ++ 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 977a127ce791..88350dd56344 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -3248,6 +3248,38 @@ unsigned int pci_rescan_bus_bridge_resize(struct pci_dev *bridge) return max; } +static void pci_bus_rescan_prepare(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, >devices, bus_list) { + struct pci_bus *child = dev->subordinate; + + if (child) { + pci_bus_rescan_prepare(child); + } else if (dev->driver && + dev->driver->rescan_prepare) { + dev->driver->rescan_prepare(dev); + } + } +} + +static void pci_bus_rescan_done(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, >devices, bus_list) { + struct pci_bus *child = dev->subordinate; + + if (child) { + pci_bus_rescan_done(child); + } else if (dev->driver && + dev->driver->rescan_done) { + dev->driver->rescan_done(dev); + } + } +} + /** * pci_rescan_bus - Scan a PCI bus for devices * @bus: PCI bus to scan @@ -3261,8 +3293,23 @@ unsigned int pci_rescan_bus(struct pci_bus *bus) { unsigned int max; - max = pci_scan_child_bus(bus); - pci_assign_unassigned_bus_resources(bus); + if (pci_movable_bars_enabled()) { + struct pci_bus *root = bus; + + while (!pci_is_root_bus(root)) + root = root->parent; + + pci_bus_rescan_prepare(root); + + max = pci_scan_child_bus(root); + pci_assign_unassigned_root_bus_resources(root); + + pci_bus_rescan_done(root); + } else { + max = pci_scan_child_bus(bus); + pci_assign_unassigned_bus_resources(bus); + } + pci_bus_add_devices(bus); return max; diff --git a/include/linux/pci.h 
b/include/linux/pci.h index cbe661aff9f5..3d52f5538282 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -780,6 +780,8 @@ struct pci_driver { int (*resume)(struct pci_dev *dev);/* Device woken up */ void (*shutdown)(struct pci_dev *dev); int (*sriov_configure)(struct pci_dev *dev, int num_vfs); /* On PF */ + void (*rescan_prepare)(struct pci_dev *dev); + void (*rescan_done)(struct pci_dev *dev); const struct pci_error_handlers *err_handler; const struct attribute_group **groups; struct device_driverdriver; -- 2.20.1
[PATCH RFC v4 02/21] PCI: Fix race condition in pci_enable/disable_device()
CPU0 CPU1 pci_enable_device_mem() pci_enable_device_mem() pci_enable_bridge() pci_enable_bridge() pci_is_enabled() return false; atomic_inc_return(enable_cnt) Start actual enabling the bridge ... pci_is_enabled() ... return true; ... Start memory requests <-- FAIL ... Set the PCI_COMMAND_MEMORY bit <-- Must wait for this This patch protects the pci_enable/disable_device() and pci_enable_bridge() with mutexes. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/pci.c | 26 ++ drivers/pci/probe.c | 1 + include/linux/pci.h | 1 + 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index f006068be209..895201d4c9e6 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1615,6 +1615,8 @@ static void pci_enable_bridge(struct pci_dev *dev) struct pci_dev *bridge; int retval; + mutex_lock(>enable_mutex); + bridge = pci_upstream_bridge(dev); if (bridge) pci_enable_bridge(bridge); @@ -1622,6 +1624,7 @@ static void pci_enable_bridge(struct pci_dev *dev) if (pci_is_enabled(dev)) { if (!dev->is_busmaster) pci_set_master(dev); + mutex_unlock(>enable_mutex); return; } @@ -1630,11 +1633,14 @@ static void pci_enable_bridge(struct pci_dev *dev) pci_err(dev, "Error enabling bridge (%d), continuing\n", retval); pci_set_master(dev); + mutex_unlock(>enable_mutex); } static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) { struct pci_dev *bridge; + /* Enable-locking of bridges is performed within the pci_enable_bridge() */ + bool need_lock = !dev->subordinate; int err; int i, bars = 0; @@ -1650,8 +1656,13 @@ static int pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) dev->current_state = (pmcsr & PCI_PM_CTRL_STATE_MASK); } - if (atomic_inc_return(>enable_cnt) > 1) + if (need_lock) + mutex_lock(>enable_mutex); + if (pci_is_enabled(dev)) { + if (need_lock) + mutex_unlock(>enable_mutex); return 0; /* already enabled */ + } bridge = pci_upstream_bridge(dev); if (bridge) @@ -1666,8 +1677,10 @@ static int 
pci_enable_device_flags(struct pci_dev *dev, unsigned long flags) bars |= (1 << i); err = do_pci_enable_device(dev, bars); - if (err < 0) - atomic_dec(>enable_cnt); + if (err >= 0) + atomic_inc(>enable_cnt); + if (need_lock) + mutex_unlock(>enable_mutex); return err; } @@ -1910,15 +1923,20 @@ void pci_disable_device(struct pci_dev *dev) if (dr) dr->enabled = 0; + mutex_lock(>enable_mutex); dev_WARN_ONCE(>dev, atomic_read(>enable_cnt) <= 0, "disabling already-disabled device"); - if (atomic_dec_return(>enable_cnt) != 0) + if (atomic_dec_return(>enable_cnt) != 0) { + mutex_unlock(>enable_mutex); return; + } do_pci_disable_device(dev); dev->is_busmaster = 0; + + mutex_unlock(>enable_mutex); } EXPORT_SYMBOL(pci_disable_device); diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 2ec0df04e0dc..977a127ce791 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2267,6 +2267,7 @@ struct pci_dev *pci_alloc_dev(struct pci_bus *bus) INIT_LIST_HEAD(>bus_list); dev->dev.type = _dev_type; dev->bus = pci_bus_get(bus); + mutex_init(>enable_mutex); return dev; } diff --git a/include/linux/pci.h b/include/linux/pci.h index 77448215ef5b..cb2760a31fe2 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -419,6 +419,7 @@ struct pci_dev { unsigned intno_vf_scan:1; /* Don't scan for VFs after IOV enablement */ pci_dev_flags_t dev_flags; atomic_tenable_cnt; /* pci_enable_device has been called */ + struct mutexenable_mutex; u32 saved_config_space[16]; /* Config space saved at suspend time */ struct hlist_head saved_cap_space; -- 2.20.1
[PATCH RFC v4 01/21] PCI: Fix writing invalid BARs during pci_restore_state()
If BAR movement has happened (due to PCIe hotplug) after pci_save_state(), the saved addresses will become outdated. Restore them the most recently calculated values, not the ones stored in an arbitrary moment. Signed-off-by: Sergey Miroshnichenko --- drivers/pci/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 7c1b362f599a..f006068be209 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1376,7 +1376,7 @@ static void pci_restore_config_space(struct pci_dev *pdev) if (pdev->hdr_type == PCI_HEADER_TYPE_NORMAL) { pci_restore_config_space_range(pdev, 10, 15, 0, false); /* Restore BARs before the command register. */ - pci_restore_config_space_range(pdev, 4, 9, 10, false); + pci_restore_bars(pdev); pci_restore_config_space_range(pdev, 0, 3, 0, false); } else if (pdev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { pci_restore_config_space_range(pdev, 12, 15, 0, false); -- 2.20.1
[PATCH] powerpc: fix mistake in arch_get_random_seed_int()
Way back in v4.3 the PPC hardware RNG stuff was changed to only provide the RNG seed functions, but this line was missed during the change. Fixes: 01c9348c7620 ("powerpc: Use hardware RNG for arch_get_random_seed_* not arch_get_random_*") Signed-off-by: Jon DeVree --- arch/powerpc/include/asm/archrandom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h index 9c63b596e6ce..a09595f00cab 100644 --- a/arch/powerpc/include/asm/archrandom.h +++ b/arch/powerpc/include/asm/archrandom.h @@ -28,7 +28,7 @@ static inline int arch_get_random_seed_int(unsigned int *v) unsigned long val; int rc; - rc = arch_get_random_long(&val); + rc = arch_get_random_seed_long(&val); if (rc) *v = val; -- 2.20.1
Re: [PATCH v2 5/7] counter: add FlexTimer Module Quadrature decoder counter driver
On Wed, 6 Mar 2019 12:12:06 +0100 Patrick Havelange wrote: > This driver exposes the counter for the quadrature decoder of the > FlexTimer Module, present in the LS1021A soc. > > Signed-off-by: Patrick Havelange A few really trivial bits inline to add to William's feedback. Otherwise I'm happy enough, Reviewed-by: Jonathan Cameron > --- > Changes v2 > - Rebased on new counter subsystem > - Cleaned up included headers > - Use devm_ioremap() > - Correct order of devm_ and unmanaged resources > --- > drivers/counter/Kconfig | 9 + > drivers/counter/Makefile | 1 + > drivers/counter/ftm-quaddec.c | 356 ++ > 3 files changed, 366 insertions(+) > create mode 100644 drivers/counter/ftm-quaddec.c > > diff --git a/drivers/counter/Kconfig b/drivers/counter/Kconfig > index 87c491a19c63..233ac305d878 100644 > --- a/drivers/counter/Kconfig > +++ b/drivers/counter/Kconfig > @@ -48,4 +48,13 @@ config STM32_LPTIMER_CNT > To compile this driver as a module, choose M here: the > module will be called stm32-lptimer-cnt. > > +config FTM_QUADDEC > + tristate "Flex Timer Module Quadrature decoder driver" > + help > + Select this option to enable the Flex Timer Quadrature decoder > + driver. > + > + To compile this driver as a module, choose M here: the > + module will be called ftm-quaddec. 
> + > endif # COUNTER > diff --git a/drivers/counter/Makefile b/drivers/counter/Makefile > index 5589976d37f8..0c9e622a6bea 100644 > --- a/drivers/counter/Makefile > +++ b/drivers/counter/Makefile > @@ -7,3 +7,4 @@ obj-$(CONFIG_COUNTER) += counter.o > obj-$(CONFIG_104_QUAD_8) += 104-quad-8.o > obj-$(CONFIG_STM32_TIMER_CNT)+= stm32-timer-cnt.o > obj-$(CONFIG_STM32_LPTIMER_CNT) += stm32-lptimer-cnt.o > +obj-$(CONFIG_FTM_QUADDEC)+= ftm-quaddec.o > diff --git a/drivers/counter/ftm-quaddec.c b/drivers/counter/ftm-quaddec.c > new file mode 100644 > index ..1bc9e075a386 > --- /dev/null > +++ b/drivers/counter/ftm-quaddec.c > @@ -0,0 +1,356 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Flex Timer Module Quadrature decoder > + * > + * This module implements a driver for decoding the FTM quadrature > + * of ex. a LS1021A > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +struct ftm_quaddec { > + struct counter_device counter; > + struct platform_device *pdev; > + void __iomem *ftm_base; > + bool big_endian; > + struct mutex ftm_quaddec_mutex; > +}; > + > +static void ftm_read(struct ftm_quaddec *ftm, uint32_t offset, uint32_t > *data) > +{ > + if (ftm->big_endian) > + *data = ioread32be(ftm->ftm_base + offset); > + else > + *data = ioread32(ftm->ftm_base + offset); > +} > + > +static void ftm_write(struct ftm_quaddec *ftm, uint32_t offset, uint32_t > data) > +{ > + if (ftm->big_endian) > + iowrite32be(data, ftm->ftm_base + offset); > + else > + iowrite32(data, ftm->ftm_base + offset); > +} > + > +/* > + * take mutex > + * call ftm_clear_write_protection > + * update settings > + * call ftm_set_write_protection > + * release mutex > + */ > +static void ftm_clear_write_protection(struct ftm_quaddec *ftm) > +{ > + uint32_t flag; > + > + /* First see if it is enabled */ > + ftm_read(ftm, FTM_FMS, ); > + > + if (flag & FTM_FMS_WPEN) { > + ftm_read(ftm, FTM_MODE, ); > + ftm_write(ftm, FTM_MODE, flag | FTM_MODE_WPDIS); > + 
} > +} > + > +static void ftm_set_write_protection(struct ftm_quaddec *ftm) > +{ > + ftm_write(ftm, FTM_FMS, FTM_FMS_WPEN); > +} > + > +static void ftm_reset_counter(struct ftm_quaddec *ftm) > +{ > + /* Reset hardware counter to CNTIN */ > + ftm_write(ftm, FTM_CNT, 0x0); > +} > + > +static void ftm_quaddec_init(struct ftm_quaddec *ftm) > +{ > + ftm_clear_write_protection(ftm); > + > + /* > + * Do not write in the region from the CNTIN register through the > + * PWMLOAD register when FTMEN = 0. > + */ > + ftm_write(ftm, FTM_MODE, FTM_MODE_FTMEN); > + ftm_write(ftm, FTM_CNTIN, 0x); > + ftm_write(ftm, FTM_MOD, 0x); > + ftm_write(ftm, FTM_CNT, 0x0); > + ftm_write(ftm, FTM_SC, FTM_SC_PS_1); > + > + /* Select quad mode */ > + ftm_write(ftm, FTM_QDCTRL, FTM_QDCTRL_QUADEN); > + > + /* Unused features and reset to default section */ > + ftm_write(ftm, FTM_POL, 0x0); > + ftm_write(ftm, FTM_FLTCTRL, 0x0); > + ftm_write(ftm, FTM_SYNCONF, 0x0); > + ftm_write(ftm, FTM_SYNC, 0x); > + > + /* Lock the FTM */ > + ftm_set_write_protection(ftm); > +} > + > +static void ftm_quaddec_disable(struct ftm_quaddec *ftm) > +{ > + ftm_write(ftm, FTM_MODE, 0); > +} > + > +static int ftm_quaddec_get_prescaler(struct counter_device *counter, > + struct counter_count *count, > +
Re: [PATCH v2 6/7] counter: ftm-quaddec: Documentation: Add specific counter sysfs documentation
On Thu, 7 Mar 2019 20:42:16 +0900 William Breathitt Gray wrote: > On Wed, Mar 06, 2019 at 12:12:07PM +0100, Patrick Havelange wrote: > > This adds documentation for the specific prescaler entry. > > > > Signed-off-by: Patrick Havelange > > --- > > Changes v2 > > - Add doc for prescaler entry > > --- > > .../ABI/testing/sysfs-bus-counter-ftm-quaddec| 16 > > 1 file changed, 16 insertions(+) > > create mode 100644 Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec > > > > diff --git a/Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec > > b/Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec > > new file mode 100644 > > index ..2da629d6d485 > > --- /dev/null > > +++ b/Documentation/ABI/testing/sysfs-bus-counter-ftm-quaddec > > @@ -0,0 +1,16 @@ > > +What: > > /sys/bus/counter/devices/counterX/countY/prescaler_available > > +KernelVersion: 5.1 > > +Contact: linux-...@vger.kernel.org > > +Description: > > + Discrete set of available values for the respective Count Y > > + configuration are listed in this file. Values are delimited by > > + newline characters. > > + > > +What: /sys/bus/counter/devices/counterX/countY/prescaler > > +KernelVersion: 5.1 > > +Contact: linux-...@vger.kernel.org > > +Description: > > + Configure the prescaler value associated with Count Y. > > + On the FlexTimer, the counter clock source passes through a > > + prescaler that is a 7-bit counter. This acts like a clock > > + divider. > > -- > > 2.19.1 > > Hmm, prescalers seem common enough among counter devices to permit these > attributes to be listed in the sysfs-bus-counter documentation file. > However, I'd like to wait until we get another counter driver for a > device with a prescaler before we make that move. From there, we'll have > a better vantage point to determine a fitting standard prescaler > attribute behavior. > > So for now, we'll keep these attributes documented here in the > sysfs-bus-counter-ftm-quaddec file, until the time comes to broach the > discussion again. 
Agreed. As long as the definition is sufficiently non-specific so it can be moved later. I'm not sure for example that the docs need to say that it is a 7 bit counter. That should be apparent from prescaler_available - or at least possible values should be which is all we need to know. Jonathan > > William Breathitt Gray
[PATCH v5 5/8] powerpc/pci/IOV: Add support for runtime enabling the VFs
When called within pcibios_sriov_enable(), the pci_sriov_get_totalvfs(pdev) returns zero, because the device is yet preparing to enable the VFs. With this patch it becomes possible to enable VFs via sysfs "sriov_numvfs" on PowerNV. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/include/asm/pci-bridge.h | 4 +-- arch/powerpc/kernel/pci_dn.c | 32 ++- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +-- arch/powerpc/platforms/pseries/pci.c | 4 +-- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index fc188e0e9179..6479bc96e0b6 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -225,8 +225,8 @@ struct pci_dn { extern struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, int devfn); extern struct pci_dn *pci_get_pdn(struct pci_dev *pdev); -extern struct pci_dn *add_dev_pci_data(struct pci_dev *pdev); -extern void remove_dev_pci_data(struct pci_dev *pdev); +extern struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs); +extern void pci_destroy_vf_pdns(struct pci_dev *pdev); extern struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, struct device_node *dn); extern void pci_remove_device_node_info(struct device_node *dn); diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 7f12882d8882..7fa362f8038d 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -222,18 +222,19 @@ static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev, return pdn; } -struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) +struct pci_dn *pci_create_vf_pdns(struct pci_dev *pdev, int num_vfs) { + struct pci_dn *pdn = pci_get_pdn(pdev); + #ifdef CONFIG_PCI_IOV - struct pci_dn *parent, *pdn; + struct pci_dn *parent; int i; /* Only support IOV for now */ if (!pdev->is_physfn) - return pci_get_pdn(pdev); + return pdn; /* Check if VFs have been populated */ - pdn = 
pci_get_pdn(pdev); if (!pdn || (pdn->flags & PCI_DN_FLAG_IOV_VF)) return NULL; @@ -242,33 +243,38 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) if (!parent) return NULL; - for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { + for (i = 0; i < num_vfs; i++) { struct eeh_dev *edev __maybe_unused; + struct pci_dn *vpdn; - pdn = pci_alloc_pdn(parent, - pci_iov_virtfn_bus(pdev, i), - pci_iov_virtfn_devfn(pdev, i)); - if (!pdn) { + vpdn = pci_alloc_pdn(parent, +pci_iov_virtfn_bus(pdev, i), +pci_iov_virtfn_devfn(pdev, i)); + if (!vpdn) { dev_warn(>dev, "%s: Cannot create firmware data for VF#%d\n", __func__, i); return NULL; } - pdn->vf_index = i; + vpdn->vf_index = i; + vpdn->vendor_id = pdn->vendor_id; + vpdn->device_id = pdn->device_id; + vpdn->class_code = pdn->class_code; + vpdn->pci_ext_config_space = 0; #ifdef CONFIG_EEH /* Create the EEH device for the VF */ - edev = eeh_dev_init(pdn); + edev = eeh_dev_init(vpdn); BUG_ON(!edev); edev->physfn = pdev; #endif /* CONFIG_EEH */ } #endif /* CONFIG_PCI_IOV */ - return pci_get_pdn(pdev); + return pdn; } -void remove_dev_pci_data(struct pci_dev *pdev) +void pci_destroy_vf_pdns(struct pci_dev *pdev) { #ifdef CONFIG_PCI_IOV struct pci_dn *parent; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index ed500f51d449..979c901535f2 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1720,14 +1720,14 @@ int pnv_pcibios_sriov_disable(struct pci_dev *pdev) pnv_pci_sriov_disable(pdev); /* Release PCI data */ - remove_dev_pci_data(pdev); + pci_destroy_vf_pdns(pdev); return 0; } int pnv_pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs) { /* Allocate PCI data */ - add_dev_pci_data(pdev); + pci_create_vf_pdns(pdev, num_vfs); return pnv_pci_sriov_enable(pdev, num_vfs); } diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 37a77e57893e..5e87596903a6 100644 --- 
a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -205,7 +205,7 @@
[PATCH v5 7/8] powerpc/powernv/pci: Hook up the writes to PCI_SECONDARY_BUS register
Writing a new value to the PCI_SECONDARY_BUS register of the bridge means that its children will become addressable on another address (new B in BDF) or even un-addressable if the secondary bus is set to zero. On PowerNV, device PEs are heavily BDF-dependent, so they must be updated on every such change of its address. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/platforms/powernv/pci.c | 118 ++- 1 file changed, 116 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 8cc6661781e2..40f68955f34f 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -722,13 +722,127 @@ int pnv_pci_cfg_read(struct pci_dn *pdn, where, size, val); } +static void invalidate_children_pes(struct pci_dn *pdn) +{ + struct pnv_phb *phb = pdn->phb->private_data; + struct pci_dn *child; + bool found_pe = false; + int pe_num; + int pe_bus; + + list_for_each_entry(child, >child_list, list) { + struct pnv_ioda_pe *pe = (child->pe_number != IODA_INVALID_PE) ? 
+ >ioda.pe_array[child->pe_number] : + NULL; + + if (!child->busno) + continue; + + if ((child->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) + invalidate_children_pes(child); + + if (pe) { + u8 rid_bus = (pe->rid >> 8) & 0xff; + + if (rid_bus) { + pe_num = child->pe_number; + pe_bus = rid_bus; + found_pe = true; + } + + pe->rid &= 0xff; + } + + child->busno = 0; + } + + if (found_pe) { + u16 rid = pe_bus << 8; + + opal_pci_set_pe(phb->opal_id, pe_num, rid, 7, 0, 0, OPAL_UNMAP_PE); + } +} + +static u8 pre_hook_new_sec_bus(struct pci_dn *pdn, u8 new_secondary_bus) +{ + u32 old_secondary_bus = 0; + + if ((pdn->class_code >> 8) != PCI_CLASS_BRIDGE_PCI) + return 0; + + pnv_pci_cfg_read(pdn, PCI_SECONDARY_BUS, 1, _secondary_bus); + old_secondary_bus &= 0xff; + + if (old_secondary_bus != new_secondary_bus) + invalidate_children_pes(pdn); + + return old_secondary_bus; +} + +static void update_children_pes(struct pci_dn *pdn, u8 new_secondary_bus) +{ + struct pnv_phb *phb = pdn->phb->private_data; + struct pci_dn *child; + bool found_pe = false; + int pe_num; + + if (!new_secondary_bus) + return; + + list_for_each_entry(child, >child_list, list) { + struct pnv_ioda_pe *pe = (child->pe_number != IODA_INVALID_PE) ? 
+ >ioda.pe_array[child->pe_number] : + NULL; + + if (child->busno) + continue; + + child->busno = new_secondary_bus; + + if (pe) { + pe->rid |= (child->busno << 8); + pe_num = child->pe_number; + found_pe = true; + } + } + + if (found_pe) { + u16 rid = new_secondary_bus << 8; + + opal_pci_set_pe(phb->opal_id, pe_num, rid, 7, 0, 0, OPAL_MAP_PE); + } +} + +static void post_hook_new_sec_bus(struct pci_dn *pdn, u8 new_secondary_bus) +{ + if ((pdn->class_code >> 8) != PCI_CLASS_BRIDGE_PCI) + return; + + update_children_pes(pdn, new_secondary_bus); +} + int pnv_pci_cfg_write(struct pci_dn *pdn, int where, int size, u32 val) { struct pnv_phb *phb = pdn->phb->private_data; + u8 old_secondary_bus = 0, new_secondary_bus = 0; + int rc; + + if (where == PCI_SECONDARY_BUS) { + new_secondary_bus = val & 0xff; + old_secondary_bus = pre_hook_new_sec_bus(pdn, new_secondary_bus); + } else if (where == PCI_PRIMARY_BUS && size > 1) { + new_secondary_bus = (val >> 8) & 0xff; + old_secondary_bus = pre_hook_new_sec_bus(pdn, new_secondary_bus); + } - return pnv_pci_cfg_write_raw(phb->opal_id, pdn->busno, pdn->devfn, -where, size, val); + rc = pnv_pci_cfg_write_raw(phb->opal_id, pdn->busno, pdn->devfn, + where, size, val); + + if (new_secondary_bus && old_secondary_bus != new_secondary_bus) + post_hook_new_sec_bus(pdn, new_secondary_bus); + + return rc; } #if CONFIG_EEH -- 2.20.1
[PATCH v5 6/8] powerpc/pci: Don't rely on DT if the PCI_REASSIGN_ALL_BUS is set
If supported by the platform, endpoint's pci_dn can be created dynamically, without need to wait for DT updates from the firmware. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/kernel/pci_dn.c | 6 -- arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 7fa362f8038d..17362a9b4678 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -555,8 +555,10 @@ void pci_devs_phb_init_dynamic(struct pci_controller *phb) phb->pci_data = pdn; } - /* Update dn->phb ptrs for new phb and children devices */ - pci_traverse_device_nodes(dn, add_pdn, phb); + if (!pci_has_flag(PCI_REASSIGN_ALL_BUS)) { + /* Update dn->phb ptrs for new phb and children devices */ + pci_traverse_device_nodes(dn, add_pdn, phb); + } } /** diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index f38078976c5d..40feff2653a0 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -47,7 +47,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - if (!pdev->is_virtfn) + if (!pci_has_flag(PCI_REASSIGN_ALL_BUS) && !pdev->is_virtfn) return; /* -- 2.20.1
[PATCH v5 8/8] powerpc/powernv/pci: Enable reassigning the bus numbers
When the pci=realloc command line switch is enabled (which should only be set when working on on top of the skiboot with the "core/pci: Sync VFs and the changes of bdfns between the firmware and the OS" patch serie applied), PowerNV will not depend on PCIe topology info from DT anymore. This makes possible to re-enumerate the fabric, assign the new bus numbers and switch from the pnv_php module to the standard pciehp driver for PCIe hotplug functionality. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/kernel/pci_dn.c | 12 1 file changed, 12 insertions(+) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 17362a9b4678..9437af1a3b20 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -595,3 +595,15 @@ static void pci_dev_pdn_setup(struct pci_dev *pdev) pdev->dev.archdata.pci_data = pdn; } DECLARE_PCI_FIXUP_EARLY(PCI_ANY_ID, PCI_ANY_ID, pci_dev_pdn_setup); + +char * __init pcibios_setup(char *str) +{ + if (!strncmp(str, "realloc=", 8)) { + if (!strncmp(str + 8, "on", 2)) + pci_add_flags(PCI_REASSIGN_ALL_BUS); + } else if (!strncmp(str, "realloc", 7)) { + pci_add_flags(PCI_REASSIGN_ALL_BUS); + } + + return str; +} -- 2.20.1
[PATCH v5 2/8] powerpc/powernv/pci: Suppress an EEH error when reading an empty slot
Reading an empty slot returns all ones, which triggers a false EEH error event on PowerNV. This patch unfreezes the bus where it has happened. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/include/asm/ppc-pci.h | 1 + arch/powerpc/kernel/pci_dn.c | 2 +- arch/powerpc/platforms/powernv/pci.c | 31 +--- 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-pci.h b/arch/powerpc/include/asm/ppc-pci.h index f191ef0d2a0a..a22d52a9bb1f 100644 --- a/arch/powerpc/include/asm/ppc-pci.h +++ b/arch/powerpc/include/asm/ppc-pci.h @@ -40,6 +40,7 @@ void *traverse_pci_dn(struct pci_dn *root, void *(*fn)(struct pci_dn *, void *), void *data); extern void pci_devs_phb_init_dynamic(struct pci_controller *phb); +struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus); /* From rtas_pci.h */ extern void init_pci_config_tokens (void); diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index ab147a1909c8..341ed71250f1 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -40,7 +40,7 @@ * one of PF's bridge. For other devices, their firmware * data is linked to that of their bridge. 
*/ -static struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus) +struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus) { struct pci_bus *pbus; struct device_node *dn; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 41a381dfc2a1..8cc6661781e2 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -761,6 +761,21 @@ static inline pnv_pci_cfg_check(struct pci_dn *pdn) } #endif /* CONFIG_EEH */ +static int get_bus_pe_number(struct pci_bus *bus) +{ + struct pci_dn *pdn = pci_bus_to_pdn(bus); + struct pci_dn *child; + + if (!pdn) + return IODA_INVALID_PE; + + list_for_each_entry(child, >child_list, list) + if (child->pe_number != IODA_INVALID_PE) + return child->pe_number; + + return IODA_INVALID_PE; +} + static int pnv_pci_read_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) @@ -772,9 +787,19 @@ static int pnv_pci_read_config(struct pci_bus *bus, *val = 0x; pdn = pci_get_pdn_by_devfn(bus, devfn); - if (!pdn) - return pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn, - where, size, val); + if (!pdn) { + int pe_number = get_bus_pe_number(bus); + + ret = pnv_pci_cfg_read_raw(phb->opal_id, bus->number, devfn, + where, size, val); + + if (!ret && (*val == EEH_IO_ERROR_VALUE(size)) && phb->unfreeze_pe) + phb->unfreeze_pe(phb, (pe_number == IODA_INVALID_PE) ? +phb->ioda.reserved_pe_idx : pe_number, +OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); + + return ret; + } if (!pnv_pci_cfg_check(pdn)) return PCIBIOS_DEVICE_NOT_FOUND; -- 2.20.1
[PATCH v5 4/8] powerpc/pci: Reduce code duplication in pci_add_device_node_info
It is possible now to allocate and fill a new pdn with add_one_dev_pci_data Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/kernel/pci_dn.c | 38 +++- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index b594b055b2cf..7f12882d8882 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -159,22 +159,20 @@ static struct pci_dn *pci_alloc_pdn(struct pci_dn *parent, { struct pci_dn *pdn; - /* Except PHB, we always have the parent */ - if (!parent) - return NULL; - pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); if (!pdn) return NULL; - pdn->phb = parent->phb; pdn->parent = parent; pdn->busno = busno; pdn->devfn = devfn; pdn->pe_number = IODA_INVALID_PE; INIT_LIST_HEAD(>child_list); INIT_LIST_HEAD(>list); - list_add_tail(>list, >child_list); + if (parent) { + pdn->phb = parent->phb; + list_add_tail(>list, >child_list); + } return pdn; } @@ -341,25 +339,29 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, const __be32 *regs; struct device_node *parent; struct pci_dn *pdn; + int busno = 0, devfn = 0; #ifdef CONFIG_EEH struct eeh_dev *edev; #endif - pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); - if (pdn == NULL) - return NULL; - dn->data = pdn; - pdn->phb = hose; - pdn->pe_number = IODA_INVALID_PE; regs = of_get_property(dn, "reg", NULL); if (regs) { u32 addr = of_read_number(regs, 1); /* First register entry is addr (00BBSS00) */ - pdn->busno = (addr >> 16) & 0xff; - pdn->devfn = (addr >> 8) & 0xff; + busno = (addr >> 16) & 0xff; + devfn = (addr >> 8) & 0xff; } + parent = of_get_parent(dn); + pdn = pci_alloc_pdn(parent ? PCI_DN(parent) : NULL, + busno, devfn); + if (!pdn) + return NULL; + + dn->data = pdn; + pdn->phb = hose; + /* vendor/device IDs and class code */ regs = of_get_property(dn, "vendor-id", NULL); pdn->vendor_id = regs ? 
of_read_number(regs, 1) : 0; @@ -380,14 +382,6 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, } #endif - /* Attach to parent node */ - INIT_LIST_HEAD(>child_list); - INIT_LIST_HEAD(>list); - parent = of_get_parent(dn); - pdn->parent = parent ? PCI_DN(parent) : NULL; - if (pdn->parent) - list_add_tail(>list, >parent->child_list); - return pdn; } EXPORT_SYMBOL_GPL(pci_add_device_node_info); -- 2.20.1
[PATCH v5 3/8] powerpc/pci: Create pci_dn on demand
If a struct pci_dn hasn't yet been created for the PCIe device (there was no DT node for it), allocate this structure and fill with info read from the device directly. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/kernel/pci_dn.c | 88 ++-- 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 341ed71250f1..b594b055b2cf 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -33,6 +33,9 @@ #include #include +static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev, + struct pci_dn *parent); + /* * The function is used to find the firmware data of one * specific PCI device, which is attached to the indicated @@ -65,6 +68,9 @@ struct pci_dn *pci_bus_to_pdn(struct pci_bus *bus) dn = pci_bus_to_OF_node(pbus); pdn = dn ? PCI_DN(dn) : NULL; + if (!pdn && pbus->self) + pdn = pbus->self->dev.archdata.pci_data; + return pdn; } @@ -74,10 +80,13 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, struct device_node *dn = NULL; struct pci_dn *parent, *pdn; struct pci_dev *pdev = NULL; + bool pdev_found = false; /* Fast path: fetch from PCI device */ list_for_each_entry(pdev, >devices, bus_list) { if (pdev->devfn == devfn) { + pdev_found = true; + if (pdev->dev.archdata.pci_data) return pdev->dev.archdata.pci_data; @@ -86,6 +95,9 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, } } + if (!pdev_found) + pdev = NULL; + /* Fast path: fetch from device node */ pdn = dn ? 
PCI_DN(dn) : NULL; if (pdn) @@ -98,9 +110,12 @@ struct pci_dn *pci_get_pdn_by_devfn(struct pci_bus *bus, list_for_each_entry(pdn, >child_list, list) { if (pdn->busno == bus->number && -pdn->devfn == devfn) -return pdn; -} + pdn->devfn == devfn) { + if (pdev) + pdev->dev.archdata.pci_data = pdn; + return pdn; + } + } return NULL; } @@ -130,17 +145,17 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) list_for_each_entry(pdn, >child_list, list) { if (pdn->busno == pdev->bus->number && - pdn->devfn == pdev->devfn) + pdn->devfn == pdev->devfn) { + pdev->dev.archdata.pci_data = pdn; return pdn; + } } - return NULL; + return pci_create_pdn_from_dev(pdev, parent); } -#ifdef CONFIG_PCI_IOV -static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, - int vf_index, - int busno, int devfn) +static struct pci_dn *pci_alloc_pdn(struct pci_dn *parent, + int busno, int devfn) { struct pci_dn *pdn; @@ -156,7 +171,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, pdn->parent = parent; pdn->busno = busno; pdn->devfn = devfn; - pdn->vf_index = vf_index; pdn->pe_number = IODA_INVALID_PE; INIT_LIST_HEAD(>child_list); INIT_LIST_HEAD(>list); @@ -164,7 +178,51 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return pdn; } -#endif + +static struct pci_dn *pci_create_pdn_from_dev(struct pci_dev *pdev, + struct pci_dn *parent) +{ + struct pci_dn *pdn = NULL; + u32 class_code; + u16 device_id; + u16 vendor_id; + + if (!parent) + return NULL; + + pdn = pci_alloc_pdn(parent, pdev->bus->busn_res.start, pdev->devfn); + pci_info(pdev, "Create a new pdn for devfn %2x\n", pdev->devfn / 8); + + if (!pdn) { + pci_err(pdev, "%s: Failed to allocate pdn\n", __func__); + return NULL; + } + + #ifdef CONFIG_EEH + if (!eeh_dev_init(pdn)) { + kfree(pdn); + pci_err(pdev, "%s: Failed to allocate edev\n", __func__); + return NULL; + } + #endif /* CONFIG_EEH */ + + pci_bus_read_config_word(pdev->bus, pdev->devfn, +PCI_VENDOR_ID, _id); + pdn->vendor_id = vendor_id; 
+ + pci_bus_read_config_word(pdev->bus, pdev->devfn, +PCI_DEVICE_ID, _id); + pdn->device_id = device_id; + + pci_bus_read_config_dword(pdev->bus, pdev->devfn, + PCI_CLASS_REVISION, _code); + class_code >>= 8; + pdn->class_code = class_code; + +
[PATCH v5 0/8] powerpc/powernv/pci: Make hotplug self-sufficient, independent of FW and DT
This patchset allows switching from the pnv_php module to the standard pciehp driver for PCIe hotplug functionality, if the platform supports it: PowerNV working on on top of the skiboot with the "core/pci: Sync VFs and the changes of bdfns between the firmware and the OS" [1] patch serie applied. The feature is activated by the "pci=realloc" command line argument. The goal is ability to hotplug bridges full of devices in the future. The "Movable BARs" [2] is a platform-independent part of our work in this. The final part will be movable bus numbers to support inserting a bridge in the middle of an existing PCIe tree. Tested on POWER8 PowerNV+PHB3 ppc64le (our Vesnin server) with: - the pciehp driver active; - the pnv_php driver disabled; - The "pci=realloc" argument is passed; - surprise hotplug of an NVME disk works; - controlled hotplug of a network card with SR-IOV works; - activating of SR-IOV on a network card works; - [with extra patches] manually initiated (via sysfs) rescan has found and turned on a hotplugged bridge; - Without "pci=realloc" works just as before. Changes since v4: - Fixed failing build when EEH is disabled in a kernel config; - Unfreeze the bus on EEH_IO_ERROR_VALUE(size), not only 0x; - Replaced the 0xff magic constant with phb->ioda.reserved_pe_idx; - Renamed create_pdn() -> pci_create_pdn_from_dev(); - Renamed add_one_dev_pci_data(..., vf_index, ...) -> pci_alloc_pdn(); - Renamed add_dev_pci_data() -> pci_create_vf_pdns(); - Renamed remove_dev_pci_data() -> pci_destroy_vf_pdns(); - Removed the patch fixing uninitialized IOMMU group - now it is fixed in commit 8f5b27347e88 ("powerpc/powernv/sriov: Register IOMMU groups for VFs") Changes since v3 [3]: - Subject changed; - Don't disable EEH during rescan anymore - instead just unfreeze the target buses deliberately; - Add synchronization with the firmware when changing the PCIe topology; - Fixed for VFs; - Code cleanup. 
Changes since v2: - Don't reassign bus numbers on PowerNV by default (to retain the default behavior), but only when pci=realloc is passed; - Less code affected; - pci_add_device_node_info is refactored with add_one_dev_pci_data; - Minor code cleanup. Changes since v1: - Fixed build for ppc64le and ppc64be when CONFIG_PCI_IOV is disabled; - Fixed build for ppc64e when CONFIG_EEH is disabled; - Fixed code style warnings. [1] https://lists.ozlabs.org/pipermail/skiboot/2019-March/013571.html [2] https://www.spinics.net/lists/linux-pci/msg79995.html [3] https://lists.ozlabs.org/pipermail/linuxppc-dev/2018-September/178053.html Sergey Miroshnichenko (8): powerpc/pci: Access PCI config space directly w/o pci_dn powerpc/powernv/pci: Suppress an EEH error when reading an empty slot powerpc/pci: Create pci_dn on demand powerpc/pci: Reduce code duplication in pci_add_device_node_info powerpc/pci/IOV: Add support for runtime enabling the VFs powerpc/pci: Don't rely on DT is the PCI_REASSIGN_ALL_BUS is set powerpc/powernv/pci: Hook up the writes to PCI_SECONDARY_BUS register powerpc/powernv/pci: Enable reassigning the bus numbers arch/powerpc/include/asm/pci-bridge.h| 4 +- arch/powerpc/include/asm/ppc-pci.h | 1 + arch/powerpc/kernel/pci_dn.c | 170 ++- arch/powerpc/kernel/rtas_pci.c | 97 ++--- arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c| 4 +- arch/powerpc/platforms/powernv/pci.c | 205 +-- arch/powerpc/platforms/pseries/pci.c | 4 +- 8 files changed, 379 insertions(+), 108 deletions(-) -- 2.20.1
[PATCH v5 1/8] powerpc/pci: Access PCI config space directly w/o pci_dn
To fetch an updated DT for the newly hotplugged device, OS must explicitly request it from the firmware via the pnv_php driver. If pnv_php wasn't triggered/loaded, it is still possible to discover new devices if PCIe I/O will not stop in absence of the pci_dn structure. Signed-off-by: Sergey Miroshnichenko --- arch/powerpc/kernel/rtas_pci.c | 97 +++- arch/powerpc/platforms/powernv/pci.c | 64 -- 2 files changed, 109 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index c2b148b1634a..f675b5ecb5bc 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -55,10 +55,26 @@ static inline int config_access_valid(struct pci_dn *dn, int where) return 0; } -int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +static int rtas_read_raw_config(unsigned long buid, int busno, unsigned int devfn, + int where, int size, u32 *val) { int returnval = -1; - unsigned long buid, addr; + unsigned long addr = rtas_config_addr(busno, devfn, where); + int ret; + + if (buid) { + ret = rtas_call(ibm_read_pci_config, 4, 2, , + addr, BUID_HI(buid), BUID_LO(buid), size); + } else { + ret = rtas_call(read_pci_config, 2, 2, , addr, size); + } + *val = returnval; + + return ret; +} + +int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) +{ int ret; if (!pdn) @@ -71,16 +87,8 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) return PCIBIOS_SET_FAILED; #endif - addr = rtas_config_addr(pdn->busno, pdn->devfn, where); - buid = pdn->phb->buid; - if (buid) { - ret = rtas_call(ibm_read_pci_config, 4, 2, , - addr, BUID_HI(buid), BUID_LO(buid), size); - } else { - ret = rtas_call(read_pci_config, 2, 2, , addr, size); - } - *val = returnval; - + ret = rtas_read_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn, + where, size, val); if (ret) return PCIBIOS_DEVICE_NOT_FOUND; @@ -98,18 +106,44 @@ static int rtas_pci_read_config(struct pci_bus *bus, pdn = 
pci_get_pdn_by_devfn(bus, devfn); - /* Validity of pdn is checked in here */ - ret = rtas_read_config(pdn, where, size, val); - if (*val == EEH_IO_ERROR_VALUE(size) && - eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) - return PCIBIOS_DEVICE_NOT_FOUND; + if (pdn) { + /* Validity of pdn is checked in here */ + ret = rtas_read_config(pdn, where, size, val); + + if (*val == EEH_IO_ERROR_VALUE(size) && + eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) + ret = PCIBIOS_DEVICE_NOT_FOUND; + } else { + struct pci_controller *phb = pci_bus_to_host(bus); + + ret = rtas_read_raw_config(phb->buid, bus->number, devfn, + where, size, val); + } return ret; } +static int rtas_write_raw_config(unsigned long buid, int busno, unsigned int devfn, +int where, int size, u32 val) +{ + unsigned long addr = rtas_config_addr(busno, devfn, where); + int ret; + + if (buid) { + ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, + BUID_HI(buid), BUID_LO(buid), size, (ulong)val); + } else { + ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); + } + + if (ret) + return PCIBIOS_DEVICE_NOT_FOUND; + + return PCIBIOS_SUCCESSFUL; +} + int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) { - unsigned long buid, addr; int ret; if (!pdn) @@ -122,15 +156,8 @@ int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) return PCIBIOS_SET_FAILED; #endif - addr = rtas_config_addr(pdn->busno, pdn->devfn, where); - buid = pdn->phb->buid; - if (buid) { - ret = rtas_call(ibm_write_pci_config, 5, 1, NULL, addr, - BUID_HI(buid), BUID_LO(buid), size, (ulong) val); - } else { - ret = rtas_call(write_pci_config, 3, 1, NULL, addr, size, (ulong)val); - } - + ret = rtas_write_raw_config(pdn->phb->buid, pdn->busno, pdn->devfn, + where, size, val); if (ret) return PCIBIOS_DEVICE_NOT_FOUND; @@ -141,12 +168,20 @@ static int rtas_pci_write_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct
Re: [PATCH v2 5/7] counter: add FlexTimer Module Quadrature decoder counter driver
On Thu, Mar 7, 2019 at 12:32 PM William Breathitt Gray wrote: > > +/* > > + * take mutex > > + * call ftm_clear_write_protection > > + * update settings > > + * call ftm_set_write_protection > > + * release mutex > > + */ > > Jonathan mentioned it before in the previous review, and I think I agree > too, that this comment block is superfluous: the context of this code is > simple enough that the function call order is naturally obvious (i.e. > write protection must be cleared before settings are modified). > > The only important thing to mention here is that the mutex must be held > before the write protection state is modified so a comment along the > following lines should suffice: > > /* hold mutex before modifying write protection state */ I think that keeping the more verbose comments is better. You directly see what operations are needed, and is a good reminder, especially if you are not familiar with the driver. I'll use your comment on the next version if you insist (see below for why new version). > > +static void ftm_quaddec_disable(struct ftm_quaddec *ftm) > > +{ > > + ftm_write(ftm, FTM_MODE, 0); > > +} > > The ftm_quaddec_disable function is only used for cleanup when the > driver is being removed. Is disabling the FTM counter on removal > actually something we need to do? It might provide some power-saving, so I would keep that function. > > While it's true that the register will keep updating, since the driver > is no longer loaded, we don't care about that register value. Once we > take control of the hardware again (by reloading our driver or via a new > one), we reinitialize the counter and set the count value back to 0 > anyway -- so whatever value the register had no longer matters. > Indeed the previous values at start do not matter. It's there just to shut down the device properly. 
This discussion made me verify again the specs and in its current form the disable doesn't even work at all : - That register should be written with write protection disabled (duh!) - It doesn't even stop the FTM from running, the clock must be disabled for that. So I'll probably provide a fix for that (in some days/weeks). > > + > > +enum ftm_quaddec_count_function { > > + FTM_QUADDEC_COUNT_ENCODER_MODE_1, > > +}; > > The FlexTimer Module supports more than just a quadrature counter mode > doesn't it? > > We should keep this initial patch simple since we are still introducing > the Counter subsystem, but it'll be nice to add support in the future > for the other counter modes such as single-edge capture. yes it provides more features, those are in a backlog ;). I would prefer if this simple version(I mean, with the disable/shutdown fixed) of the driver could be merged already before extending support. > > > + > > +static struct counter_signal ftm_quaddec_signals[] = { > > + { > > + .id = 0, > > + .name = "Channel 1 Quadrature A" > > + }, > > + { > > + .id = 1, > > + .name = "Channel 1 Quadrature B" > > + } > > +}; > > If possible, these names should match the FTM datasheet naming > convention. The reason is to make it easier for users to match the > input signals described in the datasheet with the Signal data provided > by the Generic Counter interface. > > I think the FTM datasheet describes these signals as "Phase A" and > "Phase B", so perhaps "Channel 1 Phase A" and "Channel 1 Phase B" may be > more appropriate names in this case. 
I'll verify those, > > +static int ftm_quaddec_remove(struct platform_device *pdev) > > +{ > > + struct ftm_quaddec *ftm = platform_get_drvdata(pdev); > > + > > + counter_unregister(>counter); > > + > > + ftm_quaddec_disable(ftm); > > + > > + return 0; > > +} > > If the ftm_quaddec_disable is not necessary, then we can eliminate the > ftm_quaddec_remove function as well by replacing the counter_register > call with a devm_counter_register call. yes, but as stated before, I would keep it for potential energy saving. Thanks for your feedback :)
Re: [PATCH v5 05/10] powerpc: Add a framework for Kernel Userspace Access Protection
Le 08/03/2019 à 02:16, Michael Ellerman a écrit : From: Christophe Leroy This patch implements a framework for Kernel Userspace Access Protection. Then subarches will have the possibility to provide their own implementation by providing setup_kuap() and allow/prevent_user_access(). Some platforms will need to know the area accessed and whether it is accessed from read, write or both. Therefore source, destination and size are handed over to the two functions. mpe: Rename to allow/prevent rather than unlock/lock, and add read/write wrappers. Drop the 32-bit code for now until we have an implementation for it. Add kuap to pt_regs for 64-bit as well as 32-bit. Don't split strings, use pr_crit_ratelimited(). Signed-off-by: Christophe Leroy Signed-off-by: Russell Currey Signed-off-by: Michael Ellerman --- v5: Futex ops need read/write so use allow_user_access() there. Use #ifdef CONFIG_PPC64 in kup.h to fix build errors. Allow subarch to override allow_read/write_from/to_user(). v4: mpe: Rename to allow/prevent rather than unlock/lock, and add read/write wrappers. Drop the 32-bit code for now until we have an implementation for it. Add kuap to pt_regs for 64-bit as well as 32-bit. Don't split strings, use pr_crit_ratelimited(). We now have on top of v5 an implementation for 32-bit 8xx and book3s/32 that works (tested on 8xx, 83xx and QEMU MAC99). 
Christophe .../admin-guide/kernel-parameters.txt | 2 +- arch/powerpc/include/asm/futex.h | 4 ++ arch/powerpc/include/asm/kup.h| 24 arch/powerpc/include/asm/ptrace.h | 11 +- arch/powerpc/include/asm/uaccess.h| 38 +++ arch/powerpc/kernel/asm-offsets.c | 4 ++ arch/powerpc/lib/checksum_wrappers.c | 4 ++ arch/powerpc/mm/fault.c | 19 -- arch/powerpc/mm/init-common.c | 10 + arch/powerpc/platforms/Kconfig.cputype| 12 ++ 10 files changed, 113 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f81d79de4de0..16883f2a05fd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2809,7 +2809,7 @@ noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings - nosmap [X86] + nosmap [X86,PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. diff --git a/arch/powerpc/include/asm/futex.h b/arch/powerpc/include/asm/futex.h index 88b38b37c21b..3a6aa57b9d90 100644 --- a/arch/powerpc/include/asm/futex.h +++ b/arch/powerpc/include/asm/futex.h @@ -35,6 +35,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, { int oldval = 0, ret; + allow_write_to_user(uaddr, sizeof(*uaddr)); pagefault_disable(); switch (op) { @@ -62,6 +63,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, if (!ret) *oval = oldval; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } @@ -75,6 +77,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(uaddr, sizeof(u32))) return -EFAULT; + allow_write_to_user(uaddr, sizeof(*uaddr)); __asm__ __volatile__ ( PPC_ATOMIC_ENTRY_BARRIER "1: lwarx %1,0,%3 # futex_atomic_cmpxchg_inatomic\n\ @@ -95,6 +98,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "cc", "memory"); *uval = prev; + prevent_write_to_user(uaddr, sizeof(*uaddr)); return ret; } 
diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index a2a959cb4e36..4410625f4364 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -4,6 +4,8 @@ #ifndef __ASSEMBLY__ +#include + void setup_kup(void); #ifdef CONFIG_PPC_KUEP @@ -12,6 +14,28 @@ void setup_kuep(bool disabled); static inline void setup_kuep(bool disabled) { } #endif /* CONFIG_PPC_KUEP */ +#ifdef CONFIG_PPC_KUAP +void setup_kuap(bool disabled); +#else +static inline void setup_kuap(bool disabled) { } +static inline void allow_user_access(void __user *to, const void __user *from, +unsigned long size) { } +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) { } +static inline void allow_read_from_user(const void __user *from, unsigned long size) {} +static inline void allow_write_to_user(void __user *to, unsigned long size) {} +#endif /* CONFIG_PPC_KUAP */ + +static inline void
Re: [PATCH v2 0/2] Append new variables to vmcoreinfo (PTRS_PER_PGD for arm64 and MAX_PHYSMEM_BITS for all archs)
Hi Bhupesh, On 03/10/19 at 03:34pm, Bhupesh Sharma wrote: > Changes since v1: > > - v1 was sent out as a single patch which can be seen here: > http://lists.infradead.org/pipermail/kexec/2019-February/022411.html > > - v2 breaks the single patch into two independent patches: > [PATCH 1/2] appends 'PTRS_PER_PGD' to vmcoreinfo for arm64 arch, whereas > [PATCH 2/2] appends 'MAX_PHYSMEM_BITS' to vmcoreinfo in core kernel code > (all archs) > > This patchset primarily fixes the regression reported in user-space > utilities like 'makedumpfile' and 'crash-utility' on arm64 architecture > with the availability of 52-bit address space feature in underlying > kernel. These regressions have been reported both on CPUs which don't > support ARMv8.2 extensions (i.e. LVA, LPA) and are running newer kernels > and also on prototype platforms (like ARMv8 FVP simulator model) which > support ARMv8.2 extensions and are running newer kernels. > > The reason for these regressions is that right now user-space tools > have no direct access to these values (since these are not exported > from the kernel) and hence need to rely on a best-guess method of > determining value of 'PTRS_PER_PGD' and 'MAX_PHYSMEM_BITS' supported > by underlying kernel. > > Exporting these values via vmcoreinfo will help user-land in such cases. > In addition, as per suggestion from makedumpfile maintainer (Kazu) > during v1 review, it makes more sense to append 'MAX_PHYSMEM_BITS' to > vmcoreinfo in the core code itself rather than in arm64 arch-specific > code, so that the user-space code for other archs can also benefit from > this addition to the vmcoreinfo and use it as a standard way of > determining 'SECTIONS_SHIFT' value in user-land. 
> > Cc: Mark Rutland > Cc: James Morse > Cc: Will Deacon > Cc: Boris Petkov > Cc: Ingo Molnar > Cc: Thomas Gleixner > Cc: Michael Ellerman > Cc: Paul Mackerras > Cc: Benjamin Herrenschmidt > Cc: Dave Anderson > Cc: Kazuhito Hagio > Cc: x...@kernel.org > Cc: linuxppc-dev@lists.ozlabs.org > Cc: linux-arm-ker...@lists.infradead.org > Cc: linux-ker...@vger.kernel.org > Cc: ke...@lists.infradead.org > > Bhupesh Sharma (2): > arm64, vmcoreinfo : Append 'PTRS_PER_PGD' to vmcoreinfo > crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to vmcoreinfo > > arch/arm64/kernel/crash_core.c | 1 + > kernel/crash_core.c| 1 + > 2 files changed, 2 insertions(+) > Lianbo's document patch has been merged, would you mind to add vmcoreinfo doc patch as well in your series? Thanks Dave
[PATCH v2 10/10] powerpc/32s: Implement Kernel Userspace Access Protection
This patch implements Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing. Read protection cannot be achieved using page protection. The previous patch modifies the page protection so that RW user pages are RW for Key 0 and RO for Key 1, and it sets Key 0 for both user and kernel. With this patch, userspace segment registers are set to Ku 0 and Ks 1. When kernel needs to write to RW pages, the associated segment register is then changed to Ks 0 in order to allow write access to the kernel. In order to avoid having to read all segment registers when locking/unlocking the access, some data is kept in the thread_struct and saved on stack on exceptions. The field identifies both the first unlocked segment and the first segment following the last unlocked one. When no segment is unlocked, it contains value 0. As the hash_page() function is not able to easily determine if a protfault is due to a bad kernel access to userspace, protfaults need to be handled by handle_page_fault when KUAP is set. 
Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/book3s/32/kup.h | 107 +++ arch/powerpc/include/asm/processor.h | 3 + arch/powerpc/kernel/asm-offsets.c| 3 + arch/powerpc/kernel/head_32.S| 11 arch/powerpc/mm/ppc_mmu_32.c | 10 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + 6 files changed, 135 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h index 5f97c742ca71..b3560b2de435 100644 --- a/arch/powerpc/include/asm/book3s/32/kup.h +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -37,6 +37,113 @@ #endif .endm +#ifdef CONFIG_PPC_KUAP + +.macro kuap_update_sr gpr1, gpr2, gpr3/* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi\gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000/* address of next segment */ + cmplw \gpr2, \gpr3 + blt-101b + isync +.endm + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lwz \gpr2, KUAP(\thread) + rlwinm. \gpr3, \gpr2, 28, 0xf000 + stw \gpr2, STACK_REGS_KUAP(\sp) + beq+102f + li \gpr1, 0 + stw \gpr1, KUAP(\thread) + mfsrin \gpr1, \gpr2 + oris\gpr1, \gpr1, SR_KS@h /* set Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_restoresp, current, gpr1, gpr2, gpr3 + lwz \gpr2, STACK_REGS_KUAP(\sp) + rlwinm. 
\gpr3, \gpr2, 28, 0xf000 + stw \gpr2, THREAD + KUAP(\current) + beq+102f + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_KS /* Clear Ks */ + kuap_update_sr \gpr1, \gpr2, \gpr3 +102: +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + lwz \gpr2, KUAP(thread) +999: twnei \gpr, 0 + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#endif /* CONFIG_PPC_KUAP */ + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +static inline void kuap_update_sr(u32 sr, u32 addr, u32 end) +{ + barrier(); /* make sure thread.kuap is updated before playing with SRs */ + while (addr < end) { + mtsrin(sr, addr); + sr += 0x111;/* next VSID */ + sr &= 0xf0ff; /* clear VSID overflow */ + addr += 0x1000; /* address of next segment */ + } + isync();/* Context sync required after mtsrin() */ +} + +static inline void allow_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr = (__force u32)to; + u32 end = min(addr + size, TASK_SIZE); + + if (!addr || addr >= TASK_SIZE || !size) + return; + + current->thread.kuap = (addr & 0xf000) | end - 1) >> 28) + 1) & 0xf); + kuap_update_sr(mfsrin(addr) & ~SR_KS, addr, end); /* Clear Ks */ +} + +static inline void prevent_user_access(void __user *to, const void __user *from, u32 size) +{ + u32 addr = (__force u32)to; + u32 end = min(addr + size, TASK_SIZE); + + if (!addr || addr >= TASK_SIZE || !size) + return; + + current->thread.kuap = 0; + kuap_update_sr(mfsrin(addr) | SR_KS, addr, end);/* set Ks */ +} + +static inline void allow_read_from_user(const void __user *from, unsigned long size) +{ +} + +static inline void allow_write_to_user(void __user *to, unsigned long size) +{ + allow_user_access(to, NULL, size); +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + if (!is_write) + return false; + + return WARN(!regs->kuap, "Bug:
[PATCH v2 05/10] powerpc/8xx: Only define APG0 and APG1
Since the 8xx implements hardware page table walk assistance, the PGD entries always point to a 4k aligned page, so the 2 upper bits of the APG are not clobbered anymore and remain 0. Therefore only APG0 and APG1 are used and need a definition. We set the other APG to the lowest permission level. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 0a1a3fc54e54..fc5a653d5dd2 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -35,11 +35,11 @@ * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) * Therefore, we define 2 APG groups. lsb is _PMD_USER - * 0 => No user => 01 (all accesses performed according to page definition) + * 0 => Kernel => 01 (all accesses performed according to page definition) * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * We define all 16 groups so that all other bits of APG can take any value + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) */ -#define MI_APG_INIT0x +#define MI_APG_INIT0x4fff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in @@ -108,11 +108,11 @@ * Then we use the APG to say whether accesses are according to Page rules or * "all Supervisor" rules (Access to all) * Therefore, we define 2 APG groups. 
lsb is _PMD_USER - * 0 => No user => 01 (all accesses performed according to page definition) + * 0 => Kernel => 01 (all accesses performed according to page definition) * 1 => User => 00 (all accesses performed as supervisor iaw page definition) - * We define all 16 groups so that all other bits of APG can take any value + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) */ -#define MD_APG_INIT0x +#define MD_APG_INIT0x4fff /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in -- 2.13.3
[PATCH v2 09/10] powerpc/32s: Prepare Kernel Userspace Access Protection
This patch prepares Kernel Userspace Access Protection for book3s/32. Due to limitations of the processor page protection capabilities, the protection is only against writing. Read protection cannot be achieved using page protection. book3s/32 provides the following values for PP bits: PP00 provides RW for Key 0 and NA for Key 1 PP01 provides RW for Key 0 and RO for Key 1 PP10 provides RW for all PP11 provides RO for all Today PP10 is used for RW pages and PP11 for RO pages, and user segment register's Kp and Ks are set to 1. This patch modifies page protection to use PP01 for RW pages and sets user segment registers to Kp 0 and Ks 0. This will allow to setup Userspace write access protection by setting Ks to 1 in the following patch. Kernel space segment registers remain unchanged. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/book3s/32/mmu-hash.h | 2 ++ arch/powerpc/kernel/head_32.S | 22 +++--- arch/powerpc/mm/hash_low_32.S | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 8c5727a322b1..f9eae105a9f4 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -65,6 +65,8 @@ typedef pte_t *pgtable_t; /* Values for Segment Registers */ #define SR_NX 0x1000 /* No Execute */ +#define SR_KP 0x2000 /* User key */ +#define SR_KS 0x4000 /* Supervisor key */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 5e792f2634fc..dfc1a68fc647 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -522,9 +522,9 @@ InstructionTLBMiss: andc. 
r1,r1,r0/* check access & ~permission */ bne-InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30/* _PAGE_USER -> PP msb */ - ori r1, r1, 0xe05 /* clear out reserved bits */ - andcr1, r0, r1 /* PP = user? 2 : 0 */ + rlwimi r0,r0,32-2,31,31/* _PAGE_USER -> PP lsb */ + ori r1, r1, 0xe06 /* clear out reserved bits */ + andcr1, r0, r1 /* PP = user? 1 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -590,11 +590,11 @@ DataLoadTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-10,31,31 /* _PAGE_RW -> PP lsb */ + rlwinm r1,r0,32-9,30,30/* _PAGE_RW -> PP msb */ rlwimi r0,r0,32-1,30,30/* _PAGE_USER -> PP msb */ rlwimi r0,r0,32-1,31,31/* _PAGE_USER -> PP lsb */ ori r1,r1,0xe04 /* clear out reserved bits */ - andcr1,r0,r1/* PP = user? rw? 2: 3: 0 */ + andcr1,r0,r1/* PP = user? rw? 1: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -670,9 +670,9 @@ DataStoreTLBMiss: * we would need to update the pte atomically with lwarx/stwcx. */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-1,30,30/* _PAGE_USER -> PP msb */ - li r1,0xe05/* clear out reserved bits & PP lsb */ - andcr1,r0,r1/* PP = user? 2: 0 */ + rlwimi r0,r0,32-2,31,31/* _PAGE_USER -> PP lsb */ + li r1,0xe06/* clear out reserved bits & PP msb */ + andcr1,r0,r1/* PP = user? 
1: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -900,9 +900,9 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0, NUM_USER_SEGMENTS /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up user segment register values */ mtctr r0 /* for context 0 */ - lis r3,0x2000 /* Ku = 1, VSID = 0 */ + li r3, 0 /* Kp = 0, Ks = 0, VSID = 0 */ #ifdef CONFIG_PPC_KUEP orisr3, r3, SR_NX@h /* Set Nx */ #endif @@ -914,6 +914,7 @@ load_up_mmu: li r0, 16 - NUM_USER_SEGMENTS /* load up kernel segment registers */ mtctr r0 /* for context 0 */ rlwinm r3, r3, 0, ~SR_NX /* Nx = 0 */ + orisr3, r3, SR_KP@h /* Kp = 1 */ 3: mtsrin r3, r4 addir3, r3, 0x111 /*
[PATCH v2 07/10] powerpc/8xx: Add Kernel Userspace Access Protection
This patch adds Kernel Userspace Access Protection on the 8xx. When a page is RO or RW, it is set RO or RW for Key 0 and NA for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to access user data. At exception entry, the kernel saves SPRN_MD_AP in the regs struct, and reapply the protection. At exception exit it restores SPRN_MD_AP with the value saved on exception entry. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/kup.h | 3 ++ arch/powerpc/include/asm/nohash/32/kup-8xx.h | 68 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 +++ arch/powerpc/mm/8xx_mmu.c| 12 + arch/powerpc/platforms/Kconfig.cputype | 1 + 5 files changed, 91 insertions(+) create mode 100644 arch/powerpc/include/asm/nohash/32/kup-8xx.h diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 632b367b93f4..75ade5a54607 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -5,6 +5,9 @@ #ifdef CONFIG_PPC64 #include #endif +#ifdef CONFIG_PPC_8xx +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/include/asm/nohash/32/kup-8xx.h b/arch/powerpc/include/asm/nohash/32/kup-8xx.h new file mode 100644 index ..a44cc6c1b901 --- /dev/null +++ b/arch/powerpc/include/asm/nohash/32/kup-8xx.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_KUP_8XX_H_ +#define _ASM_POWERPC_KUP_8XX_H_ + +#include + +#ifdef CONFIG_PPC_KUAP + +#ifdef __ASSEMBLY__ + +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 + lis \gpr2, MD_APG_KUAP@h/* only APG0 and APG1 are used */ + mfspr \gpr1, SPRN_MD_AP + mtspr SPRN_MD_AP, \gpr2 + stw \gpr1, STACK_REGS_KUAP(\sp) +.endm + +.macro kuap_restoresp, current, gpr1, gpr2, gpr3 + lwz \gpr1, STACK_REGS_KUAP(\sp) + mtspr SPRN_MD_AP, \gpr1 +.endm + +.macro kuap_check current, gpr +#ifdef CONFIG_PPC_KUAP_DEBUG + 
mfspr \gpr, SPRN_MD_AP + rlwinm \gpr, \gpr, 16, 0x +999: twnei \gpr, MD_APG_KUAP@h + EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#include + +static inline void allow_user_access(void __user *to, const void __user *from, +unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_INIT); +} + +static inline void prevent_user_access(void __user *to, const void __user *from, + unsigned long size) +{ + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} + +static inline void allow_read_from_user(const void __user *from, unsigned long size) +{ + allow_user_access(NULL, from, size); +} + +static inline void allow_write_to_user(void __user *to, unsigned long size) +{ + allow_user_access(to, NULL, size); +} + +static inline bool bad_kuap_fault(struct pt_regs *regs, bool is_write) +{ + return WARN(!((regs->kuap ^ MD_APG_KUAP) & 0xf000), + "Bug: fault blocked by AP register !"); +} + +#endif /* !__ASSEMBLY__ */ + +#endif /* CONFIG_PPC_KUAP */ + +#endif /* _ASM_POWERPC_KUP_8XX_H_ */ diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 3cb743284e09..f620adef54fc 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -121,6 +121,13 @@ */ #define MD_APG_INIT0x4fff +/* + * 0 => No user => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MD_APG_KUAP0x6fff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MD_RPN is written, bits in * this register are used to create the TLB entry. 
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index e257a0c9bd08..87648b58d295 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -225,3 +225,15 @@ void __init setup_kuep(bool disabled) mtspr(SPRN_MI_AP, MI_APG_KUEP); } #endif + +#ifdef CONFIG_PPC_KUAP +void __init setup_kuap(bool disabled) +{ + pr_info("Activating Kernel Userspace Access Protection\n"); + + if (disabled) + pr_warn("KUAP cannot be disabled yet on 8xx when compiled in\n"); + + mtspr(SPRN_MD_AP, MD_APG_KUAP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 00fa0d110dcb..ab586963893a 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -35,6 +35,7 @@ config
[PATCH v2 08/10] powerpc/32s: Implement Kernel Userspace Execution Prevention.
To implement Kernel Userspace Execution Prevention, this patch sets NX bit on all user segments on kernel entry and clears NX bit on all user segments on kernel exit. Note that powerpc 601 doesn't have the NX bit, so KUEP will not work on it. A warning is displayed at startup. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/book3s/32/kup.h | 42 +++ arch/powerpc/include/asm/book3s/32/mmu-hash.h | 3 ++ arch/powerpc/include/asm/kup.h| 3 ++ arch/powerpc/kernel/entry_32.S| 9 ++ arch/powerpc/kernel/head_32.S | 15 +- arch/powerpc/mm/ppc_mmu_32.c | 13 + arch/powerpc/platforms/Kconfig.cputype| 1 + 7 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/book3s/32/kup.h diff --git a/arch/powerpc/include/asm/book3s/32/kup.h b/arch/powerpc/include/asm/book3s/32/kup.h new file mode 100644 index ..5f97c742ca71 --- /dev/null +++ b/arch/powerpc/include/asm/book3s/32/kup.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_32_KUP_H +#define _ASM_POWERPC_BOOK3S_32_KUP_H + +#include + +#ifdef __ASSEMBLY__ + +.macro kuep_update_sr gpr1, gpr2 /* NEVER use r0 as gpr2 due to addis */ +101: mtsrin \gpr1, \gpr2 + addi\gpr1, \gpr1, 0x111 /* next VSID */ + rlwinm \gpr1, \gpr1, 0, 0xf0ff /* clear VSID overflow */ + addis \gpr2, \gpr2, 0x1000/* address of next segment */ + bdnz101b + isync +.endm + +.macro kuep_lock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + oris\gpr1, \gpr1, SR_NX@h /* set Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +.macro kuep_unlock gpr1, gpr2 +#ifdef CONFIG_PPC_KUEP + li \gpr1, NUM_USER_SEGMENTS + li \gpr2, 0 + mtctr \gpr1 + mfsrin \gpr1, \gpr2 + rlwinm \gpr1, \gpr1, 0, ~SR_NX /* Clear Nx */ + kuep_update_sr \gpr1, \gpr2 +#endif +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_POWERPC_BOOK3S_32_KUP_H */ diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/32/mmu-hash.h index 5cb588395fdc..8c5727a322b1 100644 --- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h @@ -63,6 +63,9 @@ typedef pte_t *pgtable_t; #define PP_RWRW 2 /* Supervisor read/write, User read/write */ #define PP_RXRX 3 /* Supervisor read, User read */ +/* Values for Segment Registers */ +#define SR_NX 0x1000 /* No Execute */ + #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index 75ade5a54607..c34627967901 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -8,6 +8,9 @@ #ifdef CONFIG_PPC_8xx #include #endif +#ifdef CONFIG_PPC_BOOK3S_32 +#include +#endif #ifdef __ASSEMBLY__ #ifndef CONFIG_PPC_KUAP diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 1182bf603d3c..2f3d159c11d7 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -162,6 +162,9 @@ transfer_to_handler: andis. r12,r12,DBCR0_IDM@h #endif ACCOUNT_CPU_USER_ENTRY(r2, r11, r12) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_lock r11, r12 +#endif #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) beq+3f /* From user and task is ptraced - load up global dbcr0 */ @@ -427,6 +430,9 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. 
r0,0,r1 /* to clear the reservation */ ACCOUNT_CPU_USER_EXIT(r2, r5, r7) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r5, r7 +#endif kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) @@ -821,6 +827,9 @@ restore_user: bnel- load_dbcr0 #endif ACCOUNT_CPU_USER_EXIT(r2, r10, r11) +#ifdef CONFIG_PPC_BOOK3S_32 + kuep_unlock r10, r11 +#endif b restore diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 48051c8977c5..5e792f2634fc 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -900,14 +900,24 @@ load_up_mmu: tophys(r6,r6) lwz r6,_SDR1@l(r6) mtspr SPRN_SDR1,r6 - li r0,16 /* load up segment register values */ + li r0, NUM_USER_SEGMENTS /* load up segment register values */ mtctr r0 /* for context 0 */ lis r3,0x2000 /* Ku = 1, VSID = 0 */ +#ifdef CONFIG_PPC_KUEP + orisr3, r3, SR_NX@h /* Set Nx */ +#endif li r4,0 3: mtsrin r3,r4 addir3,r3,0x111 /* increment VSID */ addis
[PATCH v2 06/10] powerpc/8xx: Add Kernel Userspace Execution Prevention
This patch adds Kernel Userspace Execution Prevention on the 8xx. When a page is Executable, it is set Executable for Key 0 and NX for Key 1. Up to now, the User group is defined with Key 0 for both User and Supervisor. By changing the group to Key 0 for User and Key 1 for Supervisor, this patch prevents the Kernel from being able to execute user code. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 7 +++ arch/powerpc/mm/8xx_mmu.c| 12 arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index fc5a653d5dd2..3cb743284e09 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -41,6 +41,13 @@ */ #define MI_APG_INIT0x4fff +/* + * 0 => Kernel => 01 (all accesses performed according to page definition) + * 1 => User => 10 (all accesses performed according to swaped page definition) + * 2-16 => NA => 11 (all accesses performed as user iaw page definition) + */ +#define MI_APG_KUEP0x6fff + /* The effective page number register. When read, contains the information * about the last instruction TLB miss. When MI_RPN is written, bits in * this register are used to create the TLB entry. 
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c index fe1f6443d57f..e257a0c9bd08 100644 --- a/arch/powerpc/mm/8xx_mmu.c +++ b/arch/powerpc/mm/8xx_mmu.c @@ -213,3 +213,15 @@ void flush_instruction_cache(void) mtspr(SPRN_IC_CST, IDC_INVALL); isync(); } + +#ifdef CONFIG_PPC_KUEP +void __init setup_kuep(bool disabled) +{ + if (disabled) + return; + + pr_info("Activating Kernel Userspace Execution Prevention\n"); + + mtspr(SPRN_MI_AP, MI_APG_KUEP); +} +#endif diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 2e45a6e2bc99..00fa0d110dcb 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -34,6 +34,7 @@ config PPC_8xx bool "Freescale 8xx" select FSL_SOC select SYS_SUPPORTS_HUGETLBFS + select PPC_HAVE_KUEP config 40x bool "AMCC 40x" -- 2.13.3
[PATCH v2 03/10] powerpc/32: Remove MSR_PR test when returning from syscall
syscalls are from user only, so we can account time without checking whether returning to kernel or user as it will only be user. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/entry_32.S | 5 - 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index b61cfd29c76f..aaf7c5f44823 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -422,12 +422,7 @@ BEGIN_FTR_SECTION lwarx r7,0,r1 END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - andi. r4,r8,MSR_PR - beq 3f ACCOUNT_CPU_USER_EXIT(r2, r5, r7) -3: -#endif lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlrr4 -- 2.13.3
[PATCH v2 04/10] powerpc/32: Prepare for Kernel Userspace Access Protection
This patch adds ASM macros for saving, restoring and checking the KUAP state, and modifies setup_32 to call them on exceptions from kernel. The macros are defined as empty by default for when CONFIG_PPC_KUAP is not selected and/or for platforms which don't handle (yet) KUAP. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/kup.h | 15 ++- arch/powerpc/kernel/entry_32.S | 16 arch/powerpc/platforms/Kconfig.cputype | 2 +- 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kup.h b/arch/powerpc/include/asm/kup.h index ccbd2a249575..632b367b93f4 100644 --- a/arch/powerpc/include/asm/kup.h +++ b/arch/powerpc/include/asm/kup.h @@ -6,7 +6,20 @@ #include #endif -#ifndef __ASSEMBLY__ +#ifdef __ASSEMBLY__ +#ifndef CONFIG_PPC_KUAP +.macro kuap_save_and_lock sp, thread, gpr1, gpr2, gpr3 +.endm + +.macro kuap_restoresp, current, gpr1, gpr2, gpr3 +.endm + +.macro kuap_check current, gpr +.endm + +#endif + +#else /* !__ASSEMBLY__ */ #include diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index aaf7c5f44823..1182bf603d3c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -36,6 +36,7 @@ #include #include #include +#include /* * MSR_KERNEL is > 0x1 on 4xx/Book-E since it include MSR_CE. 
@@ -150,8 +151,8 @@ transfer_to_handler: stw r12,_CTR(r11) stw r2,_XER(r11) mfspr r12,SPRN_SPRG_THREAD - addir2,r12,-THREAD beq 2f /* if from user, fix up THREAD.regs */ + addir2, r12, -THREAD addir11,r1,STACK_FRAME_OVERHEAD stw r11,PT_REGS(r12) #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) @@ -186,6 +187,8 @@ transfer_to_handler: 2: /* if from kernel, check interrupted DOZE/NAP mode and * check for stack overflow */ + kuap_save_and_lock r11, r12, r9, r2, r0 + addir2, r12, -THREAD lwz r9,KSP_LIMIT(r12) cmplw r1,r9 /* if r1 <= ksp_limit */ ble-stack_ovf /* then the kernel stack overflowed */ @@ -272,6 +275,7 @@ reenable_mmu: /* re-enable mmu so we can */ lwz r9,_MSR(r11)/* if sleeping, clear MSR.EE */ rlwinm r9,r9,0,~MSR_EE lwz r12,_LINK(r11) /* and return to address in LR */ + kuap_restore r11, r2, r3, r4, r5 b fast_exception_return #endif @@ -423,6 +427,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_NEED_PAIRED_STWCX) stwcx. r0,0,r1 /* to clear the reservation */ ACCOUNT_CPU_USER_EXIT(r2, r5, r7) + kuap_check r2, r4 lwz r4,_LINK(r1) lwz r5,_CCR(r1) mtlrr4 @@ -673,6 +678,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE) stw r10,_CCR(r1) stw r1,KSP(r3) /* Set old stack pointer */ + kuap_check r2, r4 #ifdef CONFIG_SMP /* We need a sync somewhere here to make sure that if the * previous task gets rescheduled on another CPU, it sees all @@ -861,12 +867,12 @@ resume_kernel: /* check current_thread_info->preempt_count */ lwz r0,TI_PREEMPT(r2) cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ - bne restore + bne restore_kuap andi. r8,r8,_TIF_NEED_RESCHED - beq+restore + beq+restore_kuap lwz r3,_MSR(r1) andi. r0,r3,MSR_EE/* interrupts off? 
*/ - beq restore /* don't schedule if so */ + beq restore_kuap/* don't schedule if so */ #ifdef CONFIG_TRACE_IRQFLAGS /* Lockdep thinks irqs are enabled, we need to call * preempt_schedule_irq with IRQs off, so we inform lockdep @@ -885,6 +891,8 @@ resume_kernel: bl trace_hardirqs_on #endif #endif /* CONFIG_PREEMPT */ +restore_kuap: + kuap_restore r1, r2, r9, r10, r0 /* interrupts are hard-disabled at this point */ restore: diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 5e53b9fd62aa..2e45a6e2bc99 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -373,7 +373,7 @@ config PPC_KUAP config PPC_KUAP_DEBUG bool "Extra debugging for Kernel Userspace Access Protection" - depends on PPC_HAVE_KUAP && PPC_RADIX_MMU + depends on PPC_HAVE_KUAP && (PPC_RADIX_MMU || PPC_32) help Add extra debugging for Kernel Userspace Access Protection (KUAP) If you're unsure, say N. -- 2.13.3
[PATCH v2 02/10] powerpc/mm: Detect bad KUAP faults (Squash of v5 series)
This is a squash of the v5 series, not intended to be merged. Signed-off-by: Christophe Leroy --- Documentation/admin-guide/kernel-parameters.txt | 4 +- arch/powerpc/include/asm/book3s/64/kup-radix.h | 119 arch/powerpc/include/asm/exception-64s.h| 2 + arch/powerpc/include/asm/feature-fixups.h | 3 + arch/powerpc/include/asm/futex.h| 4 + arch/powerpc/include/asm/kup.h | 46 + arch/powerpc/include/asm/mmu.h | 10 +- arch/powerpc/include/asm/ptrace.h | 11 ++- arch/powerpc/include/asm/uaccess.h | 38 ++-- arch/powerpc/kernel/asm-offsets.c | 4 + arch/powerpc/kernel/entry_64.S | 27 +- arch/powerpc/kernel/exceptions-64s.S| 3 + arch/powerpc/kernel/idle_book3s.S | 39 arch/powerpc/kernel/setup_64.c | 10 ++ arch/powerpc/lib/checksum_wrappers.c| 4 + arch/powerpc/lib/code-patching.c| 4 +- arch/powerpc/mm/fault.c | 49 -- arch/powerpc/mm/init-common.c | 26 ++ arch/powerpc/mm/init_32.c | 3 + arch/powerpc/mm/pgtable-radix.c | 30 +- arch/powerpc/mm/pkeys.c | 1 + arch/powerpc/platforms/Kconfig.cputype | 33 +++ 22 files changed, 440 insertions(+), 30 deletions(-) create mode 100644 arch/powerpc/include/asm/book3s/64/kup-radix.h create mode 100644 arch/powerpc/include/asm/kup.h diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a422560fbc15..c0431a25c57b 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2823,11 +2823,11 @@ noexec=on: enable non-executable mappings (default) noexec=off: disable non-executable mappings - nosmap [X86] + nosmap [X86,PPC] Disable SMAP (Supervisor Mode Access Prevention) even if it is supported by processor. - nosmep [X86] + nosmep [X86,PPC] Disable SMEP (Supervisor Mode Execution Prevention) even if it is supported by processor. 
diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h b/arch/powerpc/include/asm/book3s/64/kup-radix.h new file mode 100644 index ..8d2ddc61e92e --- /dev/null +++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H +#define _ASM_POWERPC_BOOK3S_64_KUP_RADIX_H + +#include + +#define AMR_KUAP_BLOCK_READUL(0x4000) +#define AMR_KUAP_BLOCK_WRITE UL(0x8000) +#define AMR_KUAP_BLOCKED (AMR_KUAP_BLOCK_READ | AMR_KUAP_BLOCK_WRITE) +#define AMR_KUAP_SHIFT 62 + +#ifdef __ASSEMBLY__ + +.macro kuap_restore_amrgpr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + ld \gpr, STACK_REGS_KUAP(r1) + mtspr SPRN_AMR, \gpr + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_check_amr gpr1 gpr2 +#ifdef CONFIG_PPC_KUAP_DEBUG + BEGIN_MMU_FTR_SECTION_NESTED(67) + mfspr \gpr1, SPRN_AMR + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi\gpr2, \gpr2, AMR_KUAP_SHIFT +999: tdne\gpr1, \gpr2 + EMIT_BUG_ENTRY 999b,__FILE__,__LINE__, \ + (BUGFLAG_WARNING|BUGFLAG_ONCE) + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +.macro kuap_save_amr_and_lock gpr1, gpr2, use_cr, msr_pr_cr +#ifdef CONFIG_PPC_KUAP + BEGIN_MMU_FTR_SECTION_NESTED(67) + .ifnb \msr_pr_cr + bne \msr_pr_cr, 99f + .endif + mfspr \gpr1, SPRN_AMR + std \gpr1, STACK_REGS_KUAP(r1) + li \gpr2, (AMR_KUAP_BLOCKED >> AMR_KUAP_SHIFT) + sldi\gpr2, \gpr2, AMR_KUAP_SHIFT + cmpd\use_cr, \gpr1, \gpr2 + beq \use_cr, 99f + // We don't isync here because we very recently entered via rfid + mtspr SPRN_AMR, \gpr2 + isync +99: + END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67) +#endif +.endm + +#else /* !__ASSEMBLY__ */ + +#ifdef CONFIG_PPC_KUAP + +#include + +/* + * We support individually allowing read or write, but we don't support nesting + * because that would require an expensive read/modify write of the AMR. 
+ */ + +static inline void set_kuap(unsigned long value) +{ + if (!mmu_has_feature(MMU_FTR_RADIX_KUAP)) + return; + + /* +* ISA v3.0B says we need a CSI (Context Synchronising Instruction) both +* before and after the move to AMR. See table 6 on page 1134. +*/ + isync(); +
[PATCH v2 00/10] Kernel Userspace protection for PPC32
This series intends to implement Kernel Userspace protection for PPC32. It comes on top of the v5 series for Radix. The first patch of the series is a fix which is expected to be merged soon. The second patch is a squash of Russell/Michael series for Radix. Tested on: - 8xx - 83xx (ie book3s32 without hash table) - QEMU MAC99 (ie book3s32 with hash table) v2: - Rebased/adapted the series on top of the v5 series for Radix. - Reordered the patches so that we first have the ones common to 32 bits, then the 8xx, then book3s32 - Fixed lockup on bad data write (unauthorised write to user) on book3s32 hash. - Added KUEP for book3s32 Christophe Leroy (9): powerpc/6xx: fix setup and use of SPRN_SPRG_PGDIR for hash32 powerpc/32: Remove MSR_PR test when returning from syscall powerpc/32: Prepare for Kernel Userspace Access Protection powerpc/8xx: Only define APG0 and APG1 powerpc/8xx: Add Kernel Userspace Execution Prevention powerpc/8xx: Add Kernel Userspace Access Protection powerpc/32s: Implement Kernel Userspace Execution Prevention. 
powerpc/32s: Prepare Kernel Userspace Access Protection powerpc/32s: Implement Kernel Userspace Access Protection Russell Currey (1): powerpc/mm: Detect bad KUAP faults (Squash of v5 series) Documentation/admin-guide/kernel-parameters.txt | 4 +- arch/powerpc/include/asm/book3s/32/kup.h| 149 arch/powerpc/include/asm/book3s/32/mmu-hash.h | 5 + arch/powerpc/include/asm/book3s/64/kup-radix.h | 119 +++ arch/powerpc/include/asm/exception-64s.h| 2 + arch/powerpc/include/asm/feature-fixups.h | 3 + arch/powerpc/include/asm/futex.h| 4 + arch/powerpc/include/asm/kup.h | 65 +++ arch/powerpc/include/asm/mmu.h | 10 +- arch/powerpc/include/asm/nohash/32/kup-8xx.h| 68 +++ arch/powerpc/include/asm/nohash/32/mmu-8xx.h| 26 - arch/powerpc/include/asm/processor.h| 3 + arch/powerpc/include/asm/ptrace.h | 11 +- arch/powerpc/include/asm/uaccess.h | 38 -- arch/powerpc/kernel/asm-offsets.c | 7 ++ arch/powerpc/kernel/cpu_setup_6xx.S | 3 - arch/powerpc/kernel/entry_32.S | 28 +++-- arch/powerpc/kernel/entry_64.S | 27 - arch/powerpc/kernel/exceptions-64s.S| 3 + arch/powerpc/kernel/head_32.S | 52 +++-- arch/powerpc/kernel/idle_book3s.S | 39 +++ arch/powerpc/kernel/setup_64.c | 10 ++ arch/powerpc/lib/checksum_wrappers.c| 4 + arch/powerpc/lib/code-patching.c| 4 +- arch/powerpc/mm/8xx_mmu.c | 24 arch/powerpc/mm/fault.c | 49 ++-- arch/powerpc/mm/hash_low_32.S | 14 +-- arch/powerpc/mm/init-common.c | 26 + arch/powerpc/mm/init_32.c | 3 + arch/powerpc/mm/pgtable-radix.c | 30 - arch/powerpc/mm/pkeys.c | 1 + arch/powerpc/mm/ppc_mmu_32.c| 23 arch/powerpc/platforms/Kconfig.cputype | 37 ++ 33 files changed, 826 insertions(+), 65 deletions(-) create mode 100644 arch/powerpc/include/asm/book3s/32/kup.h create mode 100644 arch/powerpc/include/asm/book3s/64/kup-radix.h create mode 100644 arch/powerpc/include/asm/kup.h create mode 100644 arch/powerpc/include/asm/nohash/32/kup-8xx.h -- 2.13.3
[PATCH v2 01/10] powerpc/6xx: fix setup and use of SPRN_SPRG_PGDIR for hash32
Not only the 603 but all 6xx need SPRN_SPRG_PGDIR to be initialised at startup. This patch move it from __setup_cpu_603() to start_here() and __secondary_start(), close to the initialisation of SPRN_THREAD. Previously, virt addr of PGDIR was retrieved from thread struct. Now that it is the phys addr which is stored in SPRN_SPRG_PGDIR, hash_page() shall not convert it to phys anymore. This patch removes the conversion. Fixes: 93c4a162b014("powerpc/6xx: Store PGDIR physical address in a SPRG") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/cpu_setup_6xx.S | 3 --- arch/powerpc/kernel/head_32.S | 6 ++ arch/powerpc/mm/hash_low_32.S | 8 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kernel/cpu_setup_6xx.S b/arch/powerpc/kernel/cpu_setup_6xx.S index 6f1c11e0691f..7534ecff5e92 100644 --- a/arch/powerpc/kernel/cpu_setup_6xx.S +++ b/arch/powerpc/kernel/cpu_setup_6xx.S @@ -24,9 +24,6 @@ BEGIN_MMU_FTR_SECTION li r10,0 mtspr SPRN_SPRG_603_LRU,r10 /* init SW LRU tracking */ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) - lis r10, (swapper_pg_dir - PAGE_OFFSET)@h - ori r10, r10, (swapper_pg_dir - PAGE_OFFSET)@l - mtspr SPRN_SPRG_PGDIR, r10 BEGIN_FTR_SECTION bl __init_fpu_registers diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index ce6a972f2584..48051c8977c5 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -855,6 +855,9 @@ __secondary_start: li r3,0 stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ #endif + lis r4, (swapper_pg_dir - PAGE_OFFSET)@h + ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r4 /* enable MMU and jump to start_secondary */ li r4,MSR_KERNEL @@ -942,6 +945,9 @@ start_here: li r3,0 stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ #endif + lis r4, (swapper_pg_dir - PAGE_OFFSET)@h + ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l + mtspr SPRN_SPRG_PGDIR, r4 /* stack */ lis r1,init_thread_union@ha diff 
--git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 1f13494efb2b..a6c491f18a04 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -70,12 +70,12 @@ _GLOBAL(hash_page) lis r0,KERNELBASE@h /* check if kernel address */ cmplw 0,r4,r0 ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ - mfspr r5, SPRN_SPRG_PGDIR /* virt page-table root */ + mfspr r5, SPRN_SPRG_PGDIR /* phys page-table root */ blt+112f/* assume user more likely */ - lis r5,swapper_pg_dir@ha/* if kernel address, use */ - addir5,r5,swapper_pg_dir@l /* kernel page table */ + lis r5, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ + addir5 ,r5 ,(swapper_pg_dir - PAGE_OFFSET)@l/* kernel page table */ rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ -112: tophys(r5, r5) +112: #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ lwz r8,0(r5)/* get pmd entry */ -- 2.13.3