[PATCH v4 11/11] powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing
Skips slow part of serialize_against_pte_lookup if there is no running lockless pagetable walk. Signed-off-by: Leonardo Bras --- arch/powerpc/mm/book3s64/pgtable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 6ba6195bff1b..27966481f2a6 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -95,7 +95,8 @@ static void do_nothing(void *unused) void serialize_against_pte_lookup(struct mm_struct *mm) { smp_mb(); - smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); + if (running_lockless_pgtbl_walk(mm)) + smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); } /* -- 2.20.1
[PATCH v4 10/11] powerpc/book3s_64: Enables counting method to monitor lockless pgtbl walk
Enables count-based monitoring method for lockless pagetable walks on PowerPC book3s_64. Other architectures/platforms fallback to using generic dummy functions. Signed-off-by: Leonardo Bras --- arch/powerpc/include/asm/book3s/64/pgtable.h | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8308f32e9782..eb9b26a4a483 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1370,5 +1370,10 @@ static inline bool pgd_is_leaf(pgd_t pgd) return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE)); } +#define __HAVE_ARCH_LOCKLESS_PGTBL_WALK_COUNTER +void start_lockless_pgtbl_walk(struct mm_struct *mm); +void end_lockless_pgtbl_walk(struct mm_struct *mm); +int running_lockless_pgtbl_walk(struct mm_struct *mm); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ -- 2.20.1
[PATCH v4 09/11] powerpc/kvm/book3s_64: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring all book3s_64-related functions that do lockless pagetable walks. Adds comments explaining that some lockless pagetable walks don't need protection due to guest pgd not being a target of THP collapse/split, or due to being called from Realmode + MSR_EE = 0. Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_64_mmu_hv.c| 2 ++ arch/powerpc/kvm/book3s_64_mmu_radix.c | 30 ++ arch/powerpc/kvm/book3s_64_vio_hv.c| 3 +++ 3 files changed, 35 insertions(+) diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 9a75f0e1933b..fcd3dad1297f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -620,6 +620,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * We need to protect against page table destruction * hugepage split and collapse. */ + start_lockless_pgtbl_walk(kvm->mm); local_irq_save(flags); ptep = find_current_mm_pte(current->mm->pgd, hva, NULL, NULL); @@ -629,6 +630,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, write_ok = 1; } local_irq_restore(flags); + end_lockless_pgtbl_walk(kvm->mm); } } diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 2d415c36a61d..9b374b9838fa 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -813,6 +813,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * Read the PTE from the process' radix tree and use that * so we get the shift and attribute bits. 
*/ + start_lockless_pgtbl_walk(kvm->mm); local_irq_disable(); ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); /* @@ -821,12 +822,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, */ if (!ptep) { local_irq_enable(); + end_lockless_pgtbl_walk(kvm->mm); if (page) put_page(page); return RESUME_GUEST; } pte = *ptep; local_irq_enable(); + end_lockless_pgtbl_walk(kvm->mm); /* If we're logging dirty pages, always map single pages */ large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES); @@ -972,10 +975,16 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gpa = gfn << PAGE_SHIFT; unsigned int shift; + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep)) kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot, kvm->arch.lpid); + return 0; } @@ -989,6 +998,11 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, int ref = 0; unsigned long old, *rmapp; + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) { old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, @@ -1013,6 +1027,11 @@ int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned int shift; int ref = 0; + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. 
+*/ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep && pte_present(*ptep) && pte_young(*ptep)) ref = 1; @@ -1030,6 +1049,11 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, int ret = 0; unsigned long old, *rmapp; + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (ptep &&
[PATCH v4 08/11] powerpc/kvm/book3s_hv: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring all book3s_hv related functions that do lockless pagetable walks. Adds comments explaining that some lockless pagetable walks don't need protection due to guest pgd not being a target of THP collapse/split, or due to being called from Realmode + MSR_EE = 0 kvmppc_do_h_enter: Fixes where local_irq_restore() must be placed (after the last usage of ptep). Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/book3s_hv_nested.c | 22 -- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 ++ 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 735e0ac6f5b2..5a641b559de7 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -803,7 +803,11 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap, if (!gp) return; - /* Find the pte */ + /* Find the pte: +* We are walking the nested guest (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); /* * If the pte is present and the pfn is still the same, update the pte. @@ -853,7 +857,11 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap, if (!gp) return; - /* Find and invalidate the pte */ + /* Find and invalidate the pte: +* We are walking the nested guest (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. 
+*/ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); /* Don't spuriously invalidate ptes if the pfn has changed */ if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) @@ -921,6 +929,11 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu, int shift; spin_lock(&kvm->mmu_lock); + /* +* We are walking the nested guest (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift); if (!shift) shift = PAGE_SHIFT; @@ -1362,6 +1375,11 @@ static long int __kvmhv_nested_page_fault(struct kvm_run *run, /* See if can find translation in our partition scoped tables for L1 */ pte = __pte(0); spin_lock(&kvm->mmu_lock); + /* +* We are walking the secondary (partition-scoped) page table here. +* We can do this without disabling irq because the Linux MM +* subsystem doesn't do THP splits and collapses on this tree. +*/ pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); if (!shift) shift = PAGE_SHIFT; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 63e0ce91e29d..2076a7ac230a 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -252,6 +252,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * If we had a page table table change after lookup, we would * retry via mmu_notifier_retry. 
*/ + start_lockless_pgtbl_walk(kvm->mm); if (!realmode) local_irq_save(irq_flags); /* @@ -287,8 +288,6 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, pa |= gpa & ~PAGE_MASK; } } - if (!realmode) - local_irq_restore(irq_flags); ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1); ptel |= pa; @@ -311,6 +310,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); ptel |= HPTE_R_M; } + if (!realmode) + local_irq_restore(irq_flags); + end_lockless_pgtbl_walk(kvm->mm); /* Find and lock the HPTEG slot to use */ do_insert: @@ -885,11 +887,19 @@ static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa, /* Translate to host virtual address */ hva = __gfn_to_hva_memslot(memslot, gfn); - /* Try to find the host pte for that virtual address */ + /* Try to find the host pte for that virtual address : +* Called by hcall_real_table (real mode + MSR_EE=0) +* Interrupts are disabled here. +*/ + start_lockless_pgtbl_walk(kvm->mm); ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); - if (!ptep) + if (!ptep) { + end_lockless_pgtbl_walk(kvm->m
[PATCH v4 07/11] powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring lockless pgtable walks on kvmppc_e500_shadow_map(). Fixes the place where local_irq_restore() is called: previously, if ptep was NULL, local_irq_restore() would never be called. Signed-off-by: Leonardo Bras --- arch/powerpc/kvm/e500_mmu_host.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 321db0fdb9db..a0be76ad8d14 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -473,6 +473,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * We are holding kvm->mmu_lock so a notifier invalidate * can't run hence pfn won't change. */ + start_lockless_pgtbl_walk(kvm->mm); local_irq_save(flags); ptep = find_linux_pte(pgdir, hva, NULL, NULL); if (ptep) { @@ -481,15 +482,18 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, if (pte_present(pte)) { wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & MAS2_WIMGE_MASK; - local_irq_restore(flags); } else { local_irq_restore(flags); + end_lockless_pgtbl_walk(kvm->mm); pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", __func__, (long)gfn, pfn); ret = -EINVAL; goto out; } } + local_irq_restore(flags); + end_lockless_pgtbl_walk(kvm->mm); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, -- 2.20.1
[PATCH v4 06/11] powerpc/mm/book3s64/hash: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring all hash-related functions that do lockless pagetable walks. hash_page_mm: Adds comment that explain that there is no need to local_int_disable/save given that it is only called from DataAccess interrupt, so interrupts are already disabled. Signed-off-by: Leonardo Bras --- arch/powerpc/mm/book3s64/hash_tlb.c | 2 ++ arch/powerpc/mm/book3s64/hash_utils.c | 12 +++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c b/arch/powerpc/mm/book3s64/hash_tlb.c index 4a70d8dd39cd..5e5213c3f7c4 100644 --- a/arch/powerpc/mm/book3s64/hash_tlb.c +++ b/arch/powerpc/mm/book3s64/hash_tlb.c @@ -209,6 +209,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, * to being hashed). This is not the most performance oriented * way to do things but is fine for our needs here. */ + start_lockless_pgtbl_walk(mm); local_irq_save(flags); arch_enter_lazy_mmu_mode(); for (; start < end; start += PAGE_SIZE) { @@ -230,6 +231,7 @@ void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, } arch_leave_lazy_mmu_mode(); local_irq_restore(flags); + end_lockless_pgtbl_walk(mm); } void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr) diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index b8ad14bb1170..8615fab87c43 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1321,7 +1321,11 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1); #endif /* CONFIG_PPC_64K_PAGES */ - /* Get PTE and page size from page tables */ + /* Get PTE and page size from page tables : +* Called in from DataAccess interrupt (data_access_common: 0x300), +* interrupts are disabled here. 
+*/ + start_lockless_pgtbl_walk(mm); ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift); if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW(" no PTE !\n"); @@ -1438,6 +1442,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea, DBG_LOW(" -> rc=%d\n", rc); bail: + end_lockless_pgtbl_walk(mm); exception_exit(prev_state); return rc; } @@ -1547,10 +1552,12 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, vsid = get_user_vsid(&mm->context, ea, ssize); if (!vsid) return; + /* * Hash doesn't like irqs. Walking linux page table with irq disabled * saves us from holding multiple locks. */ + start_lockless_pgtbl_walk(mm); local_irq_save(flags); /* @@ -1597,6 +1604,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea, pte_val(*ptep)); out_exit: local_irq_restore(flags); + end_lockless_pgtbl_walk(mm); } #ifdef CONFIG_PPC_MEM_KEYS @@ -1613,11 +1621,13 @@ u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address) if (!mm || !mm->pgd) return 0; + start_lockless_pgtbl_walk(mm); local_irq_save(flags); ptep = find_linux_pte(mm->pgd, address, NULL, NULL); if (ptep) pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep))); local_irq_restore(flags); + end_lockless_pgtbl_walk(mm); return pkey; } -- 2.20.1
[PATCH v4 04/11] powerpc/mce_power: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring lockless pgtable walks on addr_to_pfn(). Signed-off-by: Leonardo Bras --- arch/powerpc/kernel/mce_power.c | 13 ++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index a814d2dfb5b0..0f2f87da4cd1 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -27,6 +27,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) { pte_t *ptep; unsigned long flags; + unsigned long pfn; struct mm_struct *mm; if (user_mode(regs)) @@ -34,15 +35,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long addr) else mm = &init_mm; + start_lockless_pgtbl_walk(mm); local_irq_save(flags); if (mm == current->mm) ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL); else ptep = find_init_mm_pte(addr, NULL); - local_irq_restore(flags); + if (!ptep || pte_special(*ptep)) - return ULONG_MAX; - return pte_pfn(*ptep); + pfn = ULONG_MAX; + else + pfn = pte_pfn(*ptep); + + local_irq_restore(flags); + end_lockless_pgtbl_walk(mm); + return pfn; } /* flush SLBs and reload */ -- 2.20.1
[PATCH v4 05/11] powerpc/perf: Applies counting method to monitor lockless pgtbl walks
Applies the counting-based method for monitoring lockless pgtable walks on read_user_stack_slow. Signed-off-by: Leonardo Bras --- arch/powerpc/perf/callchain.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c index c84bbd4298a0..9d76194a2a8f 100644 --- a/arch/powerpc/perf/callchain.c +++ b/arch/powerpc/perf/callchain.c @@ -113,16 +113,18 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) int ret = -EFAULT; pgd_t *pgdir; pte_t *ptep, pte; + struct mm_struct *mm = current->mm; unsigned shift; unsigned long addr = (unsigned long) ptr; unsigned long offset; unsigned long pfn, flags; void *kaddr; - pgdir = current->mm->pgd; + pgdir = mm->pgd; if (!pgdir) return -EFAULT; + start_lockless_pgtbl_walk(mm); local_irq_save(flags); ptep = find_current_mm_pte(pgdir, addr, NULL, &shift); if (!ptep) @@ -146,6 +148,7 @@ static int read_user_stack_slow(void __user *ptr, void *buf, int nb) ret = 0; err_out: local_irq_restore(flags); + end_lockless_pgtbl_walk(mm); return ret; } -- 2.20.1
[PATCH v4 03/11] mm/gup: Applies counting method to monitor gup_pgd_range
As described, gup_pgd_range is a lockless pagetable walk. So, in order to monitor against THP split/collapse with the counting method, it's necessary to bound it with {start,end}_lockless_pgtbl_walk. There are dummy functions, so it is not going to add any overhead on archs that don't use this method. Signed-off-by: Leonardo Bras --- mm/gup.c | 8 1 file changed, 8 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index 98f13ab37bac..7105c829cf44 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2325,6 +2325,7 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { + struct mm_struct *mm; unsigned long len, end; unsigned long flags; int nr = 0; @@ -2352,9 +2353,12 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { + mm = current->mm; + start_lockless_pgtbl_walk(mm); local_irq_save(flags); gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr); local_irq_restore(flags); + end_lockless_pgtbl_walk(mm); } return nr; @@ -2404,6 +2408,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) { unsigned long addr, len, end; + struct mm_struct *mm; int nr = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM))) @@ -2421,9 +2426,12 @@ int get_user_pages_fast(unsigned long start, int nr_pages, if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) { + mm = current->mm; + start_lockless_pgtbl_walk(mm); local_irq_disable(); gup_pgd_range(addr, end, gup_flags, pages, &nr); local_irq_enable(); + end_lockless_pgtbl_walk(mm); ret = nr; } -- 2.20.1
[PATCH v4 02/11] asm-generic/pgtable: Adds dummy functions to monitor lockless pgtable walks
There is a need to monitor lockless pagetable walks, in order to avoid doing THP splitting/collapsing during them. Some methods rely on local_irq_{save,restore}, but that can be slow in cases where a lot of cpus are used by the process. In order to speed up these cases, I propose a refcount-based approach, that counts the number of lockless pagetable walks happening on the process. Given that there are lockless pagetable walks on generic code, it's necessary to create dummy functions for archs that won't use the approach. Signed-off-by: Leonardo Bras --- include/asm-generic/pgtable.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 75d9d68a6de7..0831475e72d3 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1172,6 +1172,21 @@ static inline bool arch_has_pfn_modify_check(void) #endif #endif +#ifndef __HAVE_ARCH_LOCKLESS_PGTBL_WALK_COUNTER +static inline void start_lockless_pgtbl_walk(struct mm_struct *mm) +{ +} + +static inline void end_lockless_pgtbl_walk(struct mm_struct *mm) +{ +} + +static inline int running_lockless_pgtbl_walk(struct mm_struct *mm) +{ + return 0; +} +#endif + /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. -- 2.20.1
[PATCH v4 00/11] Introduces new count-based method for monitoring lockless pagetable walks
If a process (qemu) with a lot of CPUs (128) try to munmap() a large chunk of memory (496GB) mapped with THP, it takes an average of 275 seconds, which can cause a lot of problems to the load (in qemu case, the guest will lock for this time). Trying to find the source of this bug, I found out most of this time is spent on serialize_against_pte_lookup(). This function will take a lot of time in smp_call_function_many() if there is more than a couple CPUs running the user process. Since it has to happen to all THP mapped, it will take a very long time for large amounts of memory. By the docs, serialize_against_pte_lookup() is needed in order to avoid pmd_t to pte_t casting inside find_current_mm_pte(), or any lockless pagetable walk, to happen concurrently with THP splitting/collapsing. It does so by calling a do_nothing() on each CPU in mm->cpu_bitmap[], after interrupts are re-enabled. Since, interrupts are (usually) disabled during lockless pagetable walk, and serialize_against_pte_lookup will only return after interrupts are enabled, it is protected. So, by what I could understand, if there is no lockless pagetable walk running, there is no need to call serialize_against_pte_lookup(). So, to avoid the cost of running serialize_against_pte_lookup(), I propose a counter that keeps track of how many find_current_mm_pte() are currently running, and if there is none, just skip smp_call_function_many(). The related functions are: start_lockless_pgtbl_walk(mm) Insert before starting any lockless pgtable walk end_lockless_pgtbl_walk(mm) Insert after the end of any lockless pgtable walk (Mostly after the ptep is last used) running_lockless_pgtbl_walk(mm) Returns the number of lockless pgtable walks running On my workload (qemu), I could see munmap's time reduction from 275 seconds to 418ms. Also, I documented some lockless pagetable walks in which it's not necessary to keep track, given they work on init_mm or guest pgd. 
Changes since v3: Adds memory barrier to {start,end}_lockless_pgtbl_walk() Explain (comments) why some lockless pgtbl walks don't need local_irq_disable (real mode + MSR_EE=0) Explain (comments) places where counting method is not needed (guest pgd, which is not touched by THP) Fixes some misplaced local_irq_restore() Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=132417 Changes since v2: Rebased to v5.3 Adds support on __get_user_pages_fast Adds usage decription to *_lockless_pgtbl_walk() Better style to dummy functions Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=131839 Changes since v1: Isolated atomic operations in functions *_lockless_pgtbl_walk() Fixed behavior of decrementing before last ptep was used Link: http://patchwork.ozlabs.org/patch/1163093/ Leonardo Bras (11): powerpc/mm: Adds counting method to monitor lockless pgtable walks asm-generic/pgtable: Adds dummy functions to monitor lockless pgtable walks mm/gup: Applies counting method to monitor gup_pgd_range powerpc/mce_power: Applies counting method to monitor lockless pgtbl walks powerpc/perf: Applies counting method to monitor lockless pgtbl walks powerpc/mm/book3s64/hash: Applies counting method to monitor lockless pgtbl walks powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl walks powerpc/kvm/book3s_hv: Applies counting method to monitor lockless pgtbl walks powerpc/kvm/book3s_64: Applies counting method to monitor lockless pgtbl walks powerpc/book3s_64: Enables counting method to monitor lockless pgtbl walk powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing arch/powerpc/include/asm/book3s/64/mmu.h | 3 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 5 ++ arch/powerpc/kernel/mce_power.c | 13 -- arch/powerpc/kvm/book3s_64_mmu_hv.c | 2 + arch/powerpc/kvm/book3s_64_mmu_radix.c | 30 arch/powerpc/kvm/book3s_64_vio_hv.c | 3 ++ arch/powerpc/kvm/book3s_hv_nested.c | 22 - arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 ++-- 
arch/powerpc/kvm/e500_mmu_host.c | 6 ++- arch/powerpc/mm/book3s64/hash_tlb.c | 2 + arch/powerpc/mm/book3s64/hash_utils.c| 12 - arch/powerpc/mm/book3s64/mmu_context.c | 1 + arch/powerpc/mm/book3s64/pgtable.c | 48 +++- arch/powerpc/perf/callchain.c| 5 +- include/asm-generic/pgtable.h| 15 ++ mm/gup.c | 8 16 files changed, 180 insertions(+), 13 deletions(-) -- 2.20.1
[PATCH v4 01/11] powerpc/mm: Adds counting method to monitor lockless pgtable walks
It's necessary to monitor lockless pagetable walks, in order to avoid doing THP splitting/collapsing during them. Some methods rely on local_irq_{save,restore}, but that can be slow on cases with a lot of cpus are used for the process. In order to speedup some cases, I propose a refcount-based approach, that counts the number of lockless pagetable walks happening on the process. This method does not exclude the current irq-oriented method. It works as a complement to skip unnecessary waiting. start_lockless_pgtbl_walk(mm) Insert before starting any lockless pgtable walk end_lockless_pgtbl_walk(mm) Insert after the end of any lockless pgtable walk (Mostly after the ptep is last used) running_lockless_pgtbl_walk(mm) Returns the number of lockless pgtable walks running Signed-off-by: Leonardo Bras --- arch/powerpc/include/asm/book3s/64/mmu.h | 3 ++ arch/powerpc/mm/book3s64/mmu_context.c | 1 + arch/powerpc/mm/book3s64/pgtable.c | 45 3 files changed, 49 insertions(+) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 23b83d3593e2..13b006e7dde4 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -116,6 +116,9 @@ typedef struct { /* Number of users of the external (Nest) MMU */ atomic_t copros; + /* Number of running instances of lockless pagetable walk*/ + atomic_t lockless_pgtbl_walk_count; + struct hash_mm_context *hash_context; unsigned long vdso_base; diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 2d0cb5ba9a47..3dd01c0ca5be 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -200,6 +200,7 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm) #endif atomic_set(&mm->context.active_cpus, 0); atomic_set(&mm->context.copros, 0); + atomic_set(&mm->context.lockless_pgtbl_walk_count, 0); return 0; } diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c index 7d0e0d0d22c4..6ba6195bff1b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -98,6 +98,51 @@ void serialize_against_pte_lookup(struct mm_struct *mm) smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1); } +/* + * Counting method to monitor lockless pagetable walks: + * Uses start_lockless_pgtbl_walk and end_lockless_pgtbl_walk to track the + * number of lockless pgtable walks happening, and + * running_lockless_pgtbl_walk to return this value. + */ + +/* start_lockless_pgtbl_walk: Must be inserted before a function call that does + * lockless pagetable walks, such as __find_linux_pte() + */ +void start_lockless_pgtbl_walk(struct mm_struct *mm) +{ + atomic_inc(&mm->context.lockless_pgtbl_walk_count); + /* Avoid reorder to guarantee that the increment will happen before any +* part of the lockless pagetable walk after it. +*/ + smp_mb(); +} +EXPORT_SYMBOL(start_lockless_pgtbl_walk); + +/* + * end_lockless_pgtbl_walk: Must be inserted after the last use of a pointer + * returned by a lockless pagetable walk, such as __find_linux_pte() +*/ +void end_lockless_pgtbl_walk(struct mm_struct *mm) +{ + /* Avoid reorder to guarantee that it will only decrement after the last +* use of the returned ptep from the lockless pagetable walk. +*/ + smp_mb(); + atomic_dec(&mm->context.lockless_pgtbl_walk_count); +} +EXPORT_SYMBOL(end_lockless_pgtbl_walk); + +/* + * running_lockless_pgtbl_walk: Returns the number of lockless pagetable walks + * currently running. If it returns 0, there is no running pagetable walk, and + * THP split/collapse can be safely done. This can be used to avoid more + * expensive approaches like serialize_against_pte_lookup() + */ +int running_lockless_pgtbl_walk(struct mm_struct *mm) +{ + return atomic_read(&mm->context.lockless_pgtbl_walk_count); +} + /* * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. -- 2.20.1
Re: [PATCH v3 00/11] Introduces new count-based method for monitoring lockless pagetable walks
On Fri, 2019-09-27 at 11:46 -0300, Leonardo Bras wrote: > I am not sure if it would be ok to use irq_{save,restore} in real mode, > I will do some more reading of the docs before addressing this. It looks like it's unsafe to merge irq_{save,restore} in {start,end}_lockless_pgtbl_walk(), due to a possible access of code that is not accessible in real mode. I am sending a v4 for the changes so far. I will look forward for your feedback. signature.asc Description: This is a digitally signed message part
Re: [PATCH] powerpc: use
ping? On Wed, Aug 07, 2019 at 06:07:52PM +0300, Christoph Hellwig wrote: > The powerpc version of dma-mapping.h only contains a version of > get_arch_dma_ops that always return NULL. Replace it with the > asm-generic version that does the same. > > Signed-off-by: Christoph Hellwig > --- > arch/powerpc/include/asm/Kbuild| 1 + > arch/powerpc/include/asm/dma-mapping.h | 18 -- > 2 files changed, 1 insertion(+), 18 deletions(-) > delete mode 100644 arch/powerpc/include/asm/dma-mapping.h > > diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild > index 9a1d2fc6ceb7..15bb09ad5dc2 100644 > --- a/arch/powerpc/include/asm/Kbuild > +++ b/arch/powerpc/include/asm/Kbuild > @@ -4,6 +4,7 @@ generated-y += syscall_table_64.h > generated-y += syscall_table_c32.h > generated-y += syscall_table_spu.h > generic-y += div64.h > +generic-y += dma-mapping.h > generic-y += export.h > generic-y += irq_regs.h > generic-y += local64.h > diff --git a/arch/powerpc/include/asm/dma-mapping.h > b/arch/powerpc/include/asm/dma-mapping.h > deleted file mode 100644 > index 565d6f74b189.. > --- a/arch/powerpc/include/asm/dma-mapping.h > +++ /dev/null > @@ -1,18 +0,0 @@ > -/* SPDX-License-Identifier: GPL-2.0 */ > -/* > - * Copyright (C) 2004 IBM > - */ > -#ifndef _ASM_DMA_MAPPING_H > -#define _ASM_DMA_MAPPING_H > - > -static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type > *bus) > -{ > - /* We don't handle the NULL dev case for ISA for now. We could > - * do it via an out of line call but it is not needed for now. The > - * only ISA DMA device we support is the floppy and we have a hack > - * in the floppy driver directly to get a device for us. > - */ > - return NULL; > -} > - > -#endif /* _ASM_DMA_MAPPING_H */ > -- > 2.20.1 ---end quoted text---
Re: [PATCH v5 03/23] PCI: hotplug: Add a flag for the movable BARs feature
On Fri, Aug 16, 2019 at 07:50:41PM +0300, Sergey Miroshnichenko wrote: > When hot-adding a device, the bridge may have windows not big enough (or > fragmented too much) for newly requested BARs to fit in. And expanding > these bridge windows may be impossible because blocked by "neighboring" > BARs and bridge windows. > > Still, it may be possible to allocate a memory region for new BARs with the > following procedure: > > 1) notify all the drivers which support movable BARs to pause and release >the BARs; the rest of the drivers are guaranteed that their devices will >not get BARs moved; > > 2) release all the bridge windows except of root bridges; > > 3) try to recalculate new bridge windows that will fit all the BAR types: >- fixed; >- immovable; >- movable; >- newly requested by hot-added devices; > > 4) if the previous step fails, disable BARs for one of the hot-added >devices and retry from step 3; > > 5) notify the drivers, so they remap BARs and resume. You don't do the actual recalculation in *this* patch, but since you mention the procedure here, are we confident that we never make things worse? It's possible that a hot-add will trigger this attempt to move things around, and it's possible that we won't find space for the new device even if we move things around. But are we certain that every device that worked *before* the hot-add will still work *afterwards*? Much of the assignment was probably done by the BIOS using different algorithms than Linux has, so I think there's some chance that the BIOS did a better job and if we lose that BIOS assignment, we might not be able to recreate it. > This makes the prior reservation of memory by BIOS/bootloader/firmware not > required anymore for the PCI hotplug. > > Drivers indicate their support of movable BARs by implementing the new > .rescan_prepare() and .rescan_done() hooks in the struct pci_driver. 
All > device's activity must be paused during a rescan, and iounmap()+ioremap() > must be applied to every used BAR. > > The platform also may need to prepare to BAR movement, so new hooks added: > pcibios_rescan_prepare(pci_dev) and pcibios_rescan_done(pci_dev). > > This patch is a preparation for future patches with actual implementation, > and for now it just does the following: > - declares the feature; > - defines pci_movable_bars_enabled(), pci_dev_movable_bars_supported(dev); > - invokes the .rescan_prepare() and .rescan_done() driver notifiers; > - declares and invokes the pcibios_rescan_prepare()/_done() hooks; > - adds the PCI_IMMOVABLE_BARS flag. > > The feature is disabled by default (via PCI_IMMOVABLE_BARS) until the final > patch of the series. It can be overridden per-arch using this flag or by > the following command line option: > > pcie_movable_bars={ off | force } > > CC: Sam Bobroff > CC: Rajat Jain > CC: Lukas Wunner > CC: Oliver O'Halloran > CC: David Laight > Signed-off-by: Sergey Miroshnichenko > --- > .../admin-guide/kernel-parameters.txt | 7 ++ > drivers/pci/pci-driver.c | 2 + > drivers/pci/pci.c | 24 ++ > drivers/pci/pci.h | 2 + > drivers/pci/probe.c | 86 ++- > include/linux/pci.h | 7 ++ > 6 files changed, 126 insertions(+), 2 deletions(-) > > diff --git a/Documentation/admin-guide/kernel-parameters.txt > b/Documentation/admin-guide/kernel-parameters.txt > index 47d981a86e2f..e2274ee87a35 100644 > --- a/Documentation/admin-guide/kernel-parameters.txt > +++ b/Documentation/admin-guide/kernel-parameters.txt > @@ -3526,6 +3526,13 @@ > nomsi Do not use MSI for native PCIe PME signaling (this makes > all PCIe root ports use INTx for all services). > > + pcie_movable_bars=[PCIE] This isn't a PCIe-specific feature, it's just a function of whether drivers are smart enough, so we shouldn't tie it specifically to PCIe. We could eventually do this for conventional PCI as well. 
> + Override the movable BARs support detection: > + off > + Disable even if supported by the platform > + force > + Enable even if not explicitly declared as supported What's the need for "force"? If it's possible, I think we should enable this functionality all the time and just have a disable switch in case we trip over cases where it doesn't work, e.g., something like: pci=no_movable_bars > pcmv= [HW,PCMCIA] BadgePAD 4 > > pd_ignore_unused > diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c > index a8124e47bf6e..d11909e79263 100644 > --- a/drivers/pci/pci-driver.c > +++ b/drivers/pci/pci-driver.c > @@ -1688,6 +1688,8 @@ static int __init pci_driver_init(void) > { > int ret; > > + pci_add_flags(PCI_IMMOVABLE_BARS); > + > ret = bus_register(&pci
Re: [PATCH v5 02/23] PCI: Enable bridge's I/O and MEM access for hotplugged devices
On Fri, Aug 16, 2019 at 07:50:40PM +0300, Sergey Miroshnichenko wrote: > The PCI_COMMAND_IO and PCI_COMMAND_MEMORY bits of the bridge must be > updated not only when enabling the bridge for the first time, but also if a > hotplugged device requests these types of resources. Yeah, this assumption that pci_is_enabled() means PCI_COMMAND_IO and PCI_COMMAND_MEMORY are set correctly even though we may now need *different* settings than when we incremented pdev->enable_cnt is quite broken. > Originally these bits were set by the pci_enable_device_flags() only, which > exits early if the bridge is already pci_is_enabled(). So if the bridge was > empty initially (an edge case), then hotplugged devices fail to IO/MEM. > > Signed-off-by: Sergey Miroshnichenko > --- > drivers/pci/pci.c | 8 > 1 file changed, 8 insertions(+) > > diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c > index e7f8c354e644..61d951766087 100644 > --- a/drivers/pci/pci.c > +++ b/drivers/pci/pci.c > @@ -1652,6 +1652,14 @@ static void pci_enable_bridge(struct pci_dev *dev) > pci_enable_bridge(bridge); > > if (pci_is_enabled(dev)) { > + int i, bars = 0; > + > + for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) { > + if (dev->resource[i].flags & (IORESOURCE_MEM | > IORESOURCE_IO)) > + bars |= (1 << i); > + } > + do_pci_enable_device(dev, bars); > + > if (!dev->is_busmaster) > pci_set_master(dev); > mutex_unlock(&dev->enable_mutex); > -- > 2.21.0 >
Re: [PATCH v5 01/23] PCI: Fix race condition in pci_enable/disable_device()
On Fri, Aug 16, 2019 at 07:50:39PM +0300, Sergey Miroshnichenko wrote: > This is yet another approach to fix an old [1-2] concurrency issue, when: > - two or more devices are being hot-added into a bridge which was >initially empty; > - a bridge with two or more devices is being hot-added; > - during boot, if BIOS/bootloader/firmware doesn't pre-enable bridges. > > The problem is that a bridge is reported as enabled before the MEM/IO bits > are actually written to the PCI_COMMAND register, so another driver thread > starts memory requests through the not-yet-enabled bridge: > > CPU0CPU1 > > pci_enable_device_mem() pci_enable_device_mem() >pci_enable_bridge() pci_enable_bridge() > pci_is_enabled() >return false; > atomic_inc_return(enable_cnt) > Start actual enabling the bridge > ... pci_is_enabled() > ... return true; > ... Start memory requests <-- FAIL > ... > Set the PCI_COMMAND_MEMORY bit <-- Must wait for this > > Protect the pci_enable/disable_device() and pci_enable_bridge(), which is > similar to the previous solution from commit 40f11adc7cd9 ("PCI: Avoid race > while enabling upstream bridges"), but adding per-device mutexes and > preventing the dev->enable_cnt from incrementing early. This isn't directly related to the movable BARs functionality; is it here because you see the problem more frequently when moving BARs?
Re: [PATCH] powerpc/prom_init: Undo relocation before entering secure mode
Thiago Jung Bauermann writes: > Thiago Jung Bauermann writes: > >> The ultravisor will do an integrity check of the kernel image but we >> relocated it so the check will fail. Restore the original image by >> relocating it back to the kernel virtual base address. >> >> This works because during build vmlinux is linked with an expected virtual >> runtime address of KERNELBASE. >> >> Fixes: 6a9c930bd775 ("powerpc/prom_init: Add the ESM call to prom_init") >> Signed-off-by: Thiago Jung Bauermann > > I meant to put a Suggested-by: Paul Mackerras > > Sorry. Will add it if there's a v2. Ping? -- Thiago Jung Bauermann IBM Linux Technology Center
Re: [RFC] powerpc/pseries/hcall: remove the save/restore of CR
On 9/26/19 10:48 PM, Lijun Pan wrote: > According to the PAPR, hcalls should not modify the Condition > Register fields, hence save/restore the CR is not necessary. Just curious: could you provide a more specific reference? PC
Re: [PATCH v2 1/1] powerpc/pci: Fix pcibios_setup_device() ordering
On 9/9/19 2:59 AM, Alexey Kardashevskiy wrote: On 06/09/2019 05:13, Shawn Anastasio wrote: Move PCI device setup from pcibios_add_device() and pcibios_fixup_bus() to pcibios_bus_add_device(). This ensures that platform-specific DMA and IOMMU setup occurs after the device has been registered in sysfs, which is a requirement for IOMMU group assignment to work This fixes IOMMU group assignment for hotplugged devices on pseries, where the existing behavior results in IOMMU assignment before registration. Although this is a correct approach which we should proceed with, this breaks adding of SRIOV VFs from pnv_tce_iommu_bus_notifier (and possibly the bare metal PCI hotplug), I am trying to fix that now... Were you able to make any progress? I can think of a couple of ways to fix SRIOV, but they're not particularly elegant and involve duplication.
Re: [v3,3/3] Documentation: dt: binding: fsl: Add 'fsl,ippdexpcr-alt-addr' property
On Wed, Sep 25, 2019 at 03:34:47AM +, Leo Li wrote: > > > > -Original Message- > > From: Biwen Li > > Sent: Tuesday, September 24, 2019 10:30 PM > > To: Leo Li ; shawn...@kernel.org; > > robh...@kernel.org; mark.rutl...@arm.com; Ran Wang > > > > Cc: linuxppc-dev@lists.ozlabs.org; linux-arm-ker...@lists.infradead.org; > > linux-ker...@vger.kernel.org; devicet...@vger.kernel.org > > Subject: RE: [v3,3/3] Documentation: dt: binding: fsl: Add > > 'fsl,ippdexpcr-alt- > > addr' property > > > > > > > > > > > > > > The 'fsl,ippdexpcr-alt-addr' property is used to handle an > > > > > > errata > > > > > > A-008646 on LS1021A > > > > > > > > > > > > Signed-off-by: Biwen Li > > > > > > --- > > > > > > Change in v3: > > > > > > - rename property name > > > > > > fsl,rcpm-scfg -> fsl,ippdexpcr-alt-addr > > > > > > > > > > > > Change in v2: > > > > > > - update desc of the property 'fsl,rcpm-scfg' > > > > > > > > > > > > Documentation/devicetree/bindings/soc/fsl/rcpm.txt | 14 > > > > > > ++ > > > > > > 1 file changed, 14 insertions(+) > > > > > > > > > > > > diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > > > > > > b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > > > > > > index 5a33619d881d..157dcf6da17c 100644 > > > > > > --- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > > > > > > +++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > > > > > > @@ -34,6 +34,11 @@ Chassis Version Example Chips > > > > > > Optional properties: > > > > > > - little-endian : RCPM register block is Little Endian. Without > > > > > > it RCPM > > > > > > will be Big Endian (default case). > > > > > > + - fsl,ippdexpcr-alt-addr : Must add the property for SoC > > > > > > + LS1021A, > > > > > > > > > > You probably should mention this is related to a hardware issue on > > > > > LS1021a and only needed on LS1021a. > > > > Okay, got it, thanks, I will add this in v4. 
> > > > > > > > > > > + Must include n + 1 entries (n = #fsl,rcpm-wakeup-cells, such as: > > > > > > + #fsl,rcpm-wakeup-cells equal to 2, then must include 2 + 1 > > > > > > entries). > > > > > > > > > > #fsl,rcpm-wakeup-cells is the number of IPPDEXPCR registers on an SoC. > > > > > However you are defining an offset to scfg registers here. Why > > > > > these two are related? The length here should actually be related > > > > > to the #address-cells of the soc/. But since this is only needed > > > > > for LS1021, you can > > > > just make it 3. > > > > I need set the value of IPPDEXPCR resgiters from ftm_alarm0 device > > > > node(fsl,rcpm-wakeup = <&rcpm 0x0 0x2000>; > > > > 0x0 is a value for IPPDEXPCR0, 0x2000 is a value for IPPDEXPCR1). > > > > But because of the hardware issue on LS1021A, I need store the value > > > > of IPPDEXPCR registers to an alt address. So I defining an offset to > > > > scfg registers, then RCPM driver get an abosolute address from > > > > offset, RCPM driver write the value of IPPDEXPCR registers to these > > > > abosolute addresses(backup the value of IPPDEXPCR registers). > > > > > > I understand what you are trying to do. The problem is that the new > > > fsl,ippdexpcr-alt-addr property contains a phandle and an offset. The > > > size of it shouldn't be related to #fsl,rcpm-wakeup-cells. > > You maybe like this: fsl,ippdexpcr-alt-addr = <&scfg 0x51c>;/* > > SCFG_SPARECR8 */ > > No. The #address-cell for the soc/ is 2, so the offset to scfg > should be 0x0 0x51c. The total size should be 3, but it shouldn't be > coming from #fsl,rcpm-wakeup-cells like you mentioned in the binding. Um, no. #address-cells applies to reg and ranges, not some vendor specific property. You can just define how many cells you need if that's fixed. However, I suggested doing this another way in the next version of this. Rob
Re: [PATCH] powerpc/vio: use simple dummy struct device as bus parent
On Fri, Sep 27, 2019 at 09:04:02AM -0400, Dan Streetman wrote: > The dummy vio_bus_device creates the /sys/devices/vio directory, which > contains real vio devices under it; since it represents itself as having > a bus = &vio_bus_type, its /sys/devices/vio/uevent does call the bus's > .uevent function, vio_hotplug(), and as that function won't find a real > device for the dummy vio_dev, it will return -ENODEV. > > One of the main users of the uevent node is udevadm, e.g. when it is called > with 'udevadm trigger --devices'. Up until recently, it would ignore any > errors returned when writing to devices' uevent file, but it was recently > changed to start returning error if it gets an error writing to any uevent > file: > https://github.com/systemd/systemd/commit/97afc0351a96e0daa83964df33937967c75c644f > > since the /sys/devices/vio/uevent file has always returned ENODEV from > any write to it, this now causes the udevadm trigger command to return > an error. This may be fixed in udevadm to ignore ENODEV errors, but the > vio driver should still be fixed. > > This patch changes the arch/powerpc/platform/pseries/vio.c 'dummy' > parent device into a real dummy device with no .bus, so its uevent > file will stop returning ENODEV and simply do nothing and return 0. > > Signed-off-by: Dan Streetman > --- > arch/powerpc/platforms/pseries/vio.c | 11 --- > 1 file changed, 4 insertions(+), 7 deletions(-) > > diff --git a/arch/powerpc/platforms/pseries/vio.c > b/arch/powerpc/platforms/pseries/vio.c > index 79e2287991db..63bc16631680 100644 > --- a/arch/powerpc/platforms/pseries/vio.c > +++ b/arch/powerpc/platforms/pseries/vio.c > @@ -32,11 +32,8 @@ > #include > #include > > -static struct vio_dev vio_bus_device = { /* fake "parent" device */ > - .name = "vio", > - .type = "", > - .dev.init_name = "vio", > - .dev.bus = &vio_bus_type, > +static struct device vio_bus = { > + .init_name = "vio", Eeek, no! Why are you creating a static device that will then be reference counted? 
Not nice :( What's wrong with a simple call to device_create() for your "fake" device you want to make here? That's what it is there for :) thanks, greg k-h
Re: [v4,3/3] Documentation: dt: binding: fsl: Add 'fsl,ippdexpcr1-alt-addr' property
On Thu, Sep 26, 2019 at 10:41:18AM +0800, Biwen Li wrote: > The 'fsl,ippdexpcr1-alt-addr' property is used to handle an errata A-008646 > on LS1021A > > Signed-off-by: Biwen Li > --- > Change in v4: > - rename property name > fsl,ippdexpcr-alt-addr -> fsl,ippdexpcr1-alt-addr > > Change in v3: > - rename property name > fsl,rcpm-scfg -> fsl,ippdexpcr-alt-addr > > Change in v2: > - update desc of the property 'fsl,rcpm-scfg' > > .../devicetree/bindings/soc/fsl/rcpm.txt | 21 +++ > 1 file changed, 21 insertions(+) > > diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > index 5a33619d881d..751a7655b694 100644 > --- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > +++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt > @@ -34,6 +34,13 @@ Chassis VersionExample Chips > Optional properties: > - little-endian : RCPM register block is Little Endian. Without it RCPM > will be Big Endian (default case). > + - fsl,ippdexpcr1-alt-addr : The property is related to a hardware issue > + on SoC LS1021A and only needed on SoC LS1021A. > + Must include 1 + 2 entries. > + The first entry must be a link to the SCFG device node. > + The non-first entry must be offset of registers of SCFG. > + The second and third entry compose an alt offset address > + for IPPDEXPCR1(SCFG_SPARECR8) If only on 1 SoC, can't all this be implied by "fsl,ls1021a-rcpm"? Adding a property means you need both a new dtb and kernel to fix the errata. Using the compatible string means you only need a new kernel. > > Example: > The RCPM node for T4240: > @@ -43,6 +50,20 @@ The RCPM node for T4240: > #fsl,rcpm-wakeup-cells = <2>; > }; > > +The RCPM node for LS1021A: > + rcpm: rcpm@1ee2140 { > + compatible = "fsl,ls1021a-rcpm", "fsl,qoriq-rcpm-2.1+"; Both of these compatible strings aren't documented. 
> + reg = <0x0 0x1ee2140 0x0 0x8>; > + #fsl,rcpm-wakeup-cells = <2>; > + > + /* > + * The second and third entry compose an alt offset > + * address for IPPDEXPCR1(SCFG_SPARECR8) > + */ > + fsl,ippdexpcr1-alt-addr = <&scfg 0x0 0x51c>; > + }; > + > + > * Freescale RCPM Wakeup Source Device Tree Bindings > --- > Required fsl,rcpm-wakeup property should be added to a device node if the > device > -- > 2.17.1 >
Re: [PATCH v4 05/11] dt-bindings: pci: layerscape-pci: Add compatible strings for ls1088a and ls2088a
On Tue, 24 Sep 2019 10:18:43 +0800, Xiaowei Bao wrote: > Add compatible strings for ls1088a and ls2088a. > > Signed-off-by: Xiaowei Bao > --- > v2: > - No change. > v3: > - Use one valid combination of compatible strings. > v4: > - Add the comma between the two compatible. > > Documentation/devicetree/bindings/pci/layerscape-pci.txt | 2 ++ > 1 file changed, 2 insertions(+) > Acked-by: Rob Herring
Re: [PATCH 03/11] powerpc/powernv/ioda: set up PE on opencapi device when enabling
On 9/9/19 5:45 pm, Frederic Barrat wrote: The PE for an opencapi device was set as part of a late PHB fixup operation, when creating the PHB. To use the PCI hotplug framework, this is not going to work, as the PHB stays the same, it's only the devices underneath which are updated. For regular PCI devices, it is done as part of the reconfiguration of the bridge, but for opencapi PHBs, we don't have an intermediate bridge. So let's define the PE when the device is enabled. PEs are meaningless for opencapi, the NPU doesn't define them and opal is not doing anything with them. Signed-off-by: Frederic Barrat Reviewed-by: Andrew Donnellan --- arch/powerpc/platforms/powernv/pci-ioda.c | 31 +-- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3dbbf5365c1c..06ce7ddaa0cf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1260,8 +1260,6 @@ static void pnv_pci_ioda_setup_PEs(void) { struct pci_controller *hose; struct pnv_phb *phb; - struct pci_bus *bus; - struct pci_dev *pdev; struct pnv_ioda_pe *pe; list_for_each_entry(hose, &hose_list, list_node) { @@ -1273,11 +1271,6 @@ static void pnv_pci_ioda_setup_PEs(void) if (phb->model == PNV_PHB_MODEL_NPU2) WARN_ON_ONCE(pnv_npu2_init(hose)); } - if (phb->type == PNV_PHB_NPU_OCAPI) { - bus = hose->bus; - list_for_each_entry(pdev, &bus->devices, bus_list) - pnv_ioda_setup_dev_PE(pdev); - } } list_for_each_entry(hose, &hose_list, list_node) { phb = hose->private_data; @@ -3385,6 +3378,28 @@ static bool pnv_pci_enable_device_hook(struct pci_dev *dev) return true; } +static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev) +{ + struct pci_controller *hose = pci_bus_to_host(dev->bus); + struct pnv_phb *phb = hose->private_data; + struct pci_dn *pdn; + struct pnv_ioda_pe *pe; + + if (!phb->initialized) + return true; + + pdn = pci_get_pdn(dev); + if (!pdn) + return false; + + 
if (pdn->pe_number == IODA_INVALID_PE) { + pe = pnv_ioda_setup_dev_PE(dev); + if (!pe) + return false; + } + return true; +} + static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group, int num) { @@ -3625,7 +3640,7 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = { }; static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = { - .enable_device_hook = pnv_pci_enable_device_hook, + .enable_device_hook = pnv_ocapi_enable_device_hook, .window_alignment = pnv_pci_window_alignment, .reset_secondary_bus= pnv_pci_reset_secondary_bus, .shutdown = pnv_pci_ioda_shutdown, -- Andrew Donnellan OzLabs, ADL Canberra a...@linux.ibm.com IBM Australia Limited
Re: [PATCH 09/11] pci/hotplug/pnv-php: Relax check when disabling slot
On 9/9/19 5:45 pm, Frederic Barrat wrote: The driver only allows to disable a slot in the POPULATED state. However, if an error occurs while enabling the slot, say because the link couldn't be trained, then the POPULATED state may not be reached, yet the power state of the slot is on. So allow to disable a slot in the REGISTERED state. Removing the devices will do nothing since it's not populated, and we'll set the power state of the slot back to off. Signed-off-by: Frederic Barrat Makes sense Reviewed-by: Andrew Donnellan --- drivers/pci/hotplug/pnv_php.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c index f0a7360154e7..5ca51d67db4b 100644 --- a/drivers/pci/hotplug/pnv_php.c +++ b/drivers/pci/hotplug/pnv_php.c @@ -523,7 +523,13 @@ static int pnv_php_disable_slot(struct hotplug_slot *slot) struct pnv_php_slot *php_slot = to_pnv_php_slot(slot); int ret; - if (php_slot->state != PNV_PHP_STATE_POPULATED) + /* +* Allow to disable a slot already in the registered state to +* cover cases where the slot couldn't be enabled and never +* reached the populated state +*/ + if (php_slot->state != PNV_PHP_STATE_POPULATED && + php_slot->state != PNV_PHP_STATE_REGISTERED) return 0; /* Remove all devices behind the slot */ -- Andrew Donnellan OzLabs, ADL Canberra a...@linux.ibm.com IBM Australia Limited
Re: [PATCH v3 00/11] Introduces new count-based method for monitoring lockless pagetable walks
John Hubbard writes: > Hi Leonardo, > > Thanks for adding linux-mm to CC for this next round of reviews. For the > benefit > of any new reviewers, I'd like to add that there are some issues that were > discovered > while reviewing the v2 patchset, that are not (yet) addressed in this v3 > series. > Since those issues are not listed in the cover letter above, I'll list them > here Thanks for bringing that. The cover letter is a great place to put this info, I will keep that in mind for future patchsets. > > 1. The locking model requires a combination of disabling interrupts and > atomic counting and memory barriers, but > > a) some memory barriers are missing > (start/end_lockless_pgtbl_walk), and It seems that it works fine today because of the amount of instructions executed between the irq_disable / start_lockless_pgtbl_walk and where the THP collapse/split can happen. (It's very unlikely that it reorders that much). But I don't think it would be so bad to put a memory barrier after irq_disable just in case. > b) some cases (patch #8) fail to disable interrupts I have done some looking into that, and it seems that some uses of {start,end}_lockless_pgtbl_walk are unneeded, because they operate in (nested) guest pgd and I was told it's safe against THP split/collapse. In other uses, there is no interrupt disable because the function is called in real mode, with MSR_EE=0, and there we have interrupts disabled, so there is no need to disable them again. > > ...so the synchronization appears to be inadequate. (And if it *is* adequate, > then > definitely we need the next item, to explain it.) > > 2. Documentation of the synchronization/locking model needs to exist, once we > figure out the exact details of (1). I will add the missing doc in the code, so it may be easier to understand in the future. > > 3. 
Related to (1), I've asked to change things so that interrupt controls and > atomic inc/dec are in the same start/end calls--assuming, of course, that the > caller can tolerate that. I am not sure if it would be ok to use irq_{save,restore} in real mode, I will do some more reading of the docs before addressing this. > > 4. Please see the v2 series for any other details I've missed. > > thanks, > -- > John Hubbard > NVIDIA > Thank you for helping, John! Best regards, Leonardo Bras signature.asc Description: This is a digitally signed message part
[PATCH] powerpc/vio: drop bus_type from parent device
Commit df44b479654f62b478c18ee4d8bc4e9f897a9844 ("kobject: return error code if writing /sys/.../uevent fails") started returning failure when writing to /sys/devices/vio/uevent. This causes an early udevadm trigger to fail. On some installer versions of Ubuntu, this will cause init to exit, thus panicing the system very early during boot. Removing the bus_type from the parent device will remove some of the extra empty files from /sys/devices/vio/, but will keep the rest of the layout for vio devices, keeping them under /sys/devices/vio/. It has been tested that uevents for vio devices don't change after this fix, they still contain MODALIAS. Signed-off-by: Thadeu Lima de Souza Cascardo --- arch/powerpc/platforms/pseries/vio.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c index 6601b9d404dc..d570d5494f30 100644 --- a/arch/powerpc/platforms/pseries/vio.c +++ b/arch/powerpc/platforms/pseries/vio.c @@ -36,7 +36,6 @@ static struct vio_dev vio_bus_device = { /* fake "parent" device */ .name = "vio", .type = "", .dev.init_name = "vio", - .dev.bus = &vio_bus_type, }; #ifdef CONFIG_PPC_SMLPAR -- 2.20.1
[PATCH v6 9/9] powerpc/ima: update ima arch policy to check for blacklist
This patch updates the arch specific policies for PowernV systems to add check against blacklisted hashes before doing the verification. Signed-off-by: Nayna Jain --- arch/powerpc/kernel/ima_arch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c index 77c61b142042..3f57433c0824 100644 --- a/arch/powerpc/kernel/ima_arch.c +++ b/arch/powerpc/kernel/ima_arch.c @@ -24,9 +24,9 @@ bool arch_ima_get_secureboot(void) static const char *const arch_rules[] = { "measure func=KEXEC_KERNEL_CHECK template=ima-modsig", "measure func=MODULE_CHECK template=ima-modsig", - "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig", + "appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", #if !IS_ENABLED(CONFIG_MODULE_SIG) - "appraise func=MODULE_CHECK appraise_type=imasig|modsig", + "appraise func=MODULE_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig", #endif NULL }; -- 2.20.1
[PATCH v6 8/9] ima: deprecate permit_directio, instead use appraise_flag
This patch deprecates the existing permit_directio flag, instead adds it as possible value to appraise_flag parameter. For eg. appraise_flag=permit_directio Signed-off-by: Nayna Jain --- Documentation/ABI/testing/ima_policy | 4 ++-- security/integrity/ima/ima_policy.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index 4c97afcc0f3c..9a2a140dc561 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -24,8 +24,8 @@ Description: [euid=] [fowner=] [fsname=]] lsm:[[subj_user=] [subj_role=] [subj_type=] [obj_user=] [obj_role=] [obj_type=]] - option: [[appraise_type=]] [template=] [permit_directio] - [appraise_flag=[check_blacklist]] + option: [[appraise_type=]] [template=] [permit_directio(deprecated)] + [appraise_flag=[check_blacklist]|[permit_directio]] base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK] [FIRMWARE_CHECK] [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK] diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index ad3b3af69460..d9df54c75d46 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -1177,6 +1177,8 @@ static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) ima_log_string(ab, "appraise_flag", args[0].from); if (strstr(args[0].from, "blacklist")) entry->flags |= IMA_CHECK_BLACKLIST; + if (strstr(args[0].from, "permit_directio")) + entry->flags |= IMA_PERMIT_DIRECTIO; break; case Opt_permit_directio: entry->flags |= IMA_PERMIT_DIRECTIO; -- 2.20.1
[PATCH v6 7/9] ima: check against blacklisted hashes for files with modsig
Asymmetric private keys are used to sign multiple files. The kernel currently support checking against the blacklisted keys. However, if the public key is blacklisted, any file signed by the blacklisted key will automatically fail signature verification. We might not want to blacklist all the files signed by a particular key, but just a single file. Blacklisting the public key is not fine enough granularity. This patch adds support for blacklisting binaries with appended signatures, based on the IMA policy. Defined is a new policy option "appraise_flag=check_blacklist". Signed-off-by: Nayna Jain --- Documentation/ABI/testing/ima_policy | 1 + security/integrity/ima/ima.h | 12 + security/integrity/ima/ima_appraise.c | 35 +++ security/integrity/ima/ima_main.c | 8 -- security/integrity/ima/ima_policy.c | 10 ++-- security/integrity/integrity.h| 1 + 6 files changed, 63 insertions(+), 4 deletions(-) diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index 29ebe9afdac4..4c97afcc0f3c 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -25,6 +25,7 @@ Description: lsm:[[subj_user=] [subj_role=] [subj_type=] [obj_user=] [obj_role=] [obj_type=]] option: [[appraise_type=]] [template=] [permit_directio] + [appraise_flag=[check_blacklist]] base: func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK] [FIRMWARE_CHECK] [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK] diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 9bf509217e8e..2c034728b239 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -254,6 +254,9 @@ int ima_policy_show(struct seq_file *m, void *v); #define IMA_APPRAISE_KEXEC 0x40 #ifdef CONFIG_IMA_APPRAISE +int ima_blacklist_measurement(struct integrity_iint_cache *iint, + const struct modsig *modsig, int action, + int pcr, struct ima_template_desc *template_desc); int ima_appraise_measurement(enum ima_hooks func, struct 
integrity_iint_cache *iint, struct file *file, const unsigned char *filename, @@ -269,6 +272,15 @@ int ima_read_xattr(struct dentry *dentry, struct evm_ima_xattr_data **xattr_value); #else +static inline int ima_blacklist_measurement(struct integrity_iint_cache *iint, + const struct modsig *modsig, + int action, int pcr, + struct ima_template_desc + *template_desc) +{ + return 0; +} + static inline int ima_appraise_measurement(enum ima_hooks func, struct integrity_iint_cache *iint, struct file *file, diff --git a/security/integrity/ima/ima_appraise.c b/security/integrity/ima/ima_appraise.c index 136ae4e0ee92..a5a82e870e24 100644 --- a/security/integrity/ima/ima_appraise.c +++ b/security/integrity/ima/ima_appraise.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "ima.h" @@ -303,6 +304,40 @@ static int modsig_verify(enum ima_hooks func, const struct modsig *modsig, return rc; } +/* + * ima_blacklist_measurement - checks if the file measurement is blacklisted + * + * Returns -EKEYREJECTED if the hash is blacklisted. + */ +int ima_blacklist_measurement(struct integrity_iint_cache *iint, + const struct modsig *modsig, int action, int pcr, + struct ima_template_desc *template_desc) +{ + enum hash_algo hash_algo; + const u8 *digest = NULL; + u32 digestsize = 0; + u32 secid; + int rc = 0; + + if (!(iint->flags & IMA_CHECK_BLACKLIST)) + return 0; + + if (iint->flags & IMA_MODSIG_ALLOWED) { + security_task_getsecid(current, &secid); + ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize); + + rc = is_hash_blacklisted(digest, digestsize, "bin"); + + /* Returns -EKEYREJECTED on blacklisted hash found */ + if ((rc == -EKEYREJECTED) && (iint->flags & IMA_MEASURE)) + process_buffer_measurement(digest, digestsize, + "blacklisted-hash", pcr, + template_desc); + } + + return rc; +} + /* * ima_appraise_measurement - appraise file measurement * diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index
[PATCH v6 6/9] ima: make process_buffer_measurement() non static
To add the support for checking against blacklist, it would be needed to add an additional measurement record that identifies the record as blacklisted. This patch modifies the process_buffer_measurement() and makes it non static to be used by blacklist functionality. It modifies the function to handle more than just the KEXEC_CMDLINE. Signed-off-by: Nayna Jain --- security/integrity/ima/ima.h | 3 +++ security/integrity/ima/ima_main.c | 29 ++--- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h index 19769bf5f6ab..9bf509217e8e 100644 --- a/security/integrity/ima/ima.h +++ b/security/integrity/ima/ima.h @@ -215,6 +215,9 @@ void ima_store_measurement(struct integrity_iint_cache *iint, struct file *file, struct evm_ima_xattr_data *xattr_value, int xattr_len, const struct modsig *modsig, int pcr, struct ima_template_desc *template_desc); +void process_buffer_measurement(const void *buf, int size, + const char *eventname, int pcr, + struct ima_template_desc *template_desc); void ima_audit_measurement(struct integrity_iint_cache *iint, const unsigned char *filename); int ima_alloc_init_template(struct ima_event_data *event_data, diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 79c01516211b..ae0c1bdc4eaf 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -626,14 +626,14 @@ int ima_load_data(enum kernel_load_data_id id) * @buf: pointer to the buffer that needs to be added to the log. * @size: size of buffer(in bytes). * @eventname: event name to be used for the buffer entry. - * @cred: a pointer to a credentials structure for user validation. - * @secid: the secid of the task to be validated. + * @pcr: pcr to extend the measurement + * @template_desc: template description * * Based on policy, the buffer is measured into the ima log. 
*/ -static void process_buffer_measurement(const void *buf, int size, - const char *eventname, - const struct cred *cred, u32 secid) +void process_buffer_measurement(const void *buf, int size, + const char *eventname, int pcr, + struct ima_template_desc *template_desc) { int ret = 0; struct ima_template_entry *entry = NULL; @@ -642,19 +642,11 @@ static void process_buffer_measurement(const void *buf, int size, .filename = eventname, .buf = buf, .buf_len = size}; - struct ima_template_desc *template_desc = NULL; struct { struct ima_digest_data hdr; char digest[IMA_MAX_DIGEST_SIZE]; } hash = {}; int violation = 0; - int pcr = CONFIG_IMA_MEASURE_PCR_IDX; - int action = 0; - - action = ima_get_action(NULL, cred, secid, 0, KEXEC_CMDLINE, &pcr, - &template_desc); - if (!(action & IMA_MEASURE)) - return; iint.ima_hash = &hash.hdr; iint.ima_hash->algo = ima_hash_algo; @@ -686,12 +678,19 @@ static void process_buffer_measurement(const void *buf, int size, */ void ima_kexec_cmdline(const void *buf, int size) { + int pcr = CONFIG_IMA_MEASURE_PCR_IDX; + struct ima_template_desc *template_desc = NULL; + int action; u32 secid; if (buf && size != 0) { security_task_getsecid(current, &secid); - process_buffer_measurement(buf, size, "kexec-cmdline", - current_cred(), secid); + action = ima_get_action(NULL, current_cred(), secid, 0, + KEXEC_CMDLINE, &pcr, &template_desc); + if (!(action & IMA_MEASURE)) + return; + process_buffer_measurement(buf, size, "kexec-cmdline", pcr, + template_desc); } } -- 2.20.1
[PATCH v6 5/9] powerpc/ima: add measurement rules to ima arch specific policy
This patch adds the measurement rules to the arch specific policies for the systems with trusted boot. Signed-off-by: Nayna Jain --- arch/powerpc/kernel/ima_arch.c | 44 +++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c index 39401b67f19e..77c61b142042 100644 --- a/arch/powerpc/kernel/ima_arch.c +++ b/arch/powerpc/kernel/ima_arch.c @@ -12,8 +12,18 @@ bool arch_ima_get_secureboot(void) return is_powerpc_os_secureboot_enabled(); } -/* Defines IMA appraise rules for secureboot */ +/* + * The "arch_rules" contains both the securebot and trustedboot rules for adding + * the kexec kernel image and kernel modules file hashes to the IMA measurement + * list and verifying the file signatures against known good values. + * + * The "appraise_type=imasig|modsig" option allows the good signature to be + * stored as an xattr or as an appended signature. The "template=ima-modsig" + * option includes the appended signature in the IMA measurement list. + */ static const char *const arch_rules[] = { + "measure func=KEXEC_KERNEL_CHECK template=ima-modsig", + "measure func=MODULE_CHECK template=ima-modsig", "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig", #if !IS_ENABLED(CONFIG_MODULE_SIG) "appraise func=MODULE_CHECK appraise_type=imasig|modsig", @@ -22,12 +32,40 @@ static const char *const arch_rules[] = { }; /* - * Returns the relevant IMA arch policies based on the system secureboot state. + * The "measure_rules" are enabled only on "trustedboot" enabled systems. + * These rules add the kexec kernel image and kernel modules file hashes to + * the IMA measurement list. + */ +static const char *const measure_rules[] = { + "measure func=KEXEC_KERNEL_CHECK", + "measure func=MODULE_CHECK", + NULL +}; + +/* + * Returns the relevant IMA arch policies based on the system secureboot + * and trustedboot state. 
*/ const char *const *arch_get_ima_policy(void) { - if (is_powerpc_os_secureboot_enabled()) + const char *const *rules; + int offset = 0; + + for (rules = arch_rules; *rules != NULL; rules++) { + if (strncmp(*rules, "appraise", 8) == 0) + break; + offset++; + } + + if (is_powerpc_os_secureboot_enabled() + && is_powerpc_trustedboot_enabled()) return arch_rules; + if (is_powerpc_os_secureboot_enabled()) + return arch_rules + offset; + + if (is_powerpc_trustedboot_enabled()) + return measure_rules; + return NULL; } -- 2.20.1
[PATCH v6 4/9] powerpc: detect the trusted boot state of the system
PowerNV systems enables the IMA measurement rules only if the trusted boot is enabled on the system. This patch adds the function to detect if the system has trusted boot enabled. Signed-off-by: Nayna Jain --- arch/powerpc/include/asm/secure_boot.h | 6 ++ arch/powerpc/kernel/secure_boot.c | 14 ++ 2 files changed, 20 insertions(+) diff --git a/arch/powerpc/include/asm/secure_boot.h b/arch/powerpc/include/asm/secure_boot.h index 4e8e2b08a993..192caaedbe7a 100644 --- a/arch/powerpc/include/asm/secure_boot.h +++ b/arch/powerpc/include/asm/secure_boot.h @@ -14,6 +14,7 @@ bool is_powerpc_os_secureboot_enabled(void); int get_powerpc_os_sb_node(struct device_node **node); +bool is_powerpc_trustedboot_enabled(void); #else @@ -27,5 +28,10 @@ static inline int get_powerpc_os_sb_node(struct device_node **node) return -ENOENT; } +static inline bool is_powerpc_os_trustedboot_enabled(void) +{ + return false; +} + #endif #endif diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c index 45ca19f5e836..9d452e1550ae 100644 --- a/arch/powerpc/kernel/secure_boot.c +++ b/arch/powerpc/kernel/secure_boot.c @@ -80,3 +80,17 @@ bool is_powerpc_os_secureboot_enabled(void) pr_info("secureboot mode disabled\n"); return false; } + +bool is_powerpc_trustedboot_enabled(void) +{ + struct device_node *node; + + node = get_powerpc_fw_sb_node(); + if (node && (of_find_property(node, "trusted-enabled", NULL))) { + pr_info("trustedboot mode enabled\n"); + return true; + } + + pr_info("trustedboot mode disabled\n"); + return false; +} -- 2.20.1
[PATCH v6 3/9] powerpc: add support to initialize ima policy rules
PowerNV systems uses kernel based bootloader, thus its secure boot implementation uses kernel IMA security subsystem to verify the kernel before kexec. Since the verification policy might differ based on the secure boot mode of the system, the policies are defined at runtime. This patch implements the arch-specific support to define the IMA policy rules based on the runtime secure boot mode of the system. This patch provides arch-specific IMA policies if PPC_SECURE_BOOT config is enabled. Signed-off-by: Nayna Jain --- arch/powerpc/Kconfig | 2 ++ arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/ima_arch.c | 33 + include/linux/ima.h| 3 ++- 4 files changed, 38 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/kernel/ima_arch.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2c54beb29f1a..54eda07c74e5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -916,6 +916,8 @@ config PPC_SECURE_BOOT prompt "Enable secure boot support" bool depends on PPC_POWERNV + depends on IMA + depends on IMA_ARCH_POLICY help Systems with firmware secure boot enabled needs to define security policies to extend secure boot to the OS. 
This config allows user diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 875b0785a20e..7156ac1fc956 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -157,7 +157,7 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o -obj-$(CONFIG_PPC_SECURE_BOOT) += secure_boot.o +obj-$(CONFIG_PPC_SECURE_BOOT) += secure_boot.o ima_arch.o # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c new file mode 100644 index ..39401b67f19e --- /dev/null +++ b/arch/powerpc/kernel/ima_arch.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Nayna Jain + */ + +#include +#include + +bool arch_ima_get_secureboot(void) +{ + return is_powerpc_os_secureboot_enabled(); +} + +/* Defines IMA appraise rules for secureboot */ +static const char *const arch_rules[] = { + "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig", +#if !IS_ENABLED(CONFIG_MODULE_SIG) + "appraise func=MODULE_CHECK appraise_type=imasig|modsig", +#endif + NULL +}; + +/* + * Returns the relevant IMA arch policies based on the system secureboot state. 
+ */ +const char *const *arch_get_ima_policy(void) +{ + if (is_powerpc_os_secureboot_enabled()) + return arch_rules; + + return NULL; +} diff --git a/include/linux/ima.h b/include/linux/ima.h index a20ad398d260..10af09b5b478 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -29,7 +29,8 @@ extern void ima_kexec_cmdline(const void *buf, int size); extern void ima_add_kexec_buffer(struct kimage *image); #endif -#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) +#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) \ + || defined(CONFIG_PPC_SECURE_BOOT) extern bool arch_ima_get_secureboot(void); extern const char * const *arch_get_ima_policy(void); #else -- 2.20.1
[PATCH v6 2/9] powerpc: detect the secure boot mode of the system
Secure boot on PowerNV defines different IMA policies based on the secure boot state of the system. This patch defines a function to detect the secure boot state of the system. The PPC_SECURE_BOOT config represents the base enablement of secureboot on POWER. Signed-off-by: Nayna Jain --- arch/powerpc/Kconfig | 10 arch/powerpc/include/asm/secure_boot.h | 31 ++ arch/powerpc/kernel/Makefile | 2 + arch/powerpc/kernel/secure_boot.c | 82 ++ 4 files changed, 125 insertions(+) create mode 100644 arch/powerpc/include/asm/secure_boot.h create mode 100644 arch/powerpc/kernel/secure_boot.c diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 77f6ebf97113..2c54beb29f1a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -912,6 +912,16 @@ config PPC_MEM_KEYS If unsure, say y. +config PPC_SECURE_BOOT + prompt "Enable secure boot support" + bool + depends on PPC_POWERNV + help + Systems with firmware secure boot enabled needs to define security + policies to extend secure boot to the OS. This config allows user + to enable OS secure boot on systems that have firmware support for + it. If in doubt say N. 
+ endmenu config ISA_DMA_API diff --git a/arch/powerpc/include/asm/secure_boot.h b/arch/powerpc/include/asm/secure_boot.h new file mode 100644 index ..4e8e2b08a993 --- /dev/null +++ b/arch/powerpc/include/asm/secure_boot.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Secure boot definitions + * + * Copyright (C) 2019 IBM Corporation + * Author: Nayna Jain + */ +#ifndef _ASM_POWER_SECURE_BOOT_H +#define _ASM_POWER_SECURE_BOOT_H + +#ifdef CONFIG_PPC_SECURE_BOOT + +#define SECURE_BOOT_MASK 0x + +bool is_powerpc_os_secureboot_enabled(void); +int get_powerpc_os_sb_node(struct device_node **node); + +#else + +static inline bool is_powerpc_os_secureboot_enabled(void) +{ + return false; +} + +static inline int get_powerpc_os_sb_node(struct device_node **node) +{ + return -ENOENT; +} + +#endif +#endif diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index ea0c69236789..875b0785a20e 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -157,6 +157,8 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o +obj-$(CONFIG_PPC_SECURE_BOOT) += secure_boot.o + # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n KCOV_INSTRUMENT_prom_init.o := n diff --git a/arch/powerpc/kernel/secure_boot.c b/arch/powerpc/kernel/secure_boot.c new file mode 100644 index ..45ca19f5e836 --- /dev/null +++ b/arch/powerpc/kernel/secure_boot.c @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Nayna Jain + */ +#include +#include +#include + +static struct device_node *get_powerpc_fw_sb_node(void) +{ + return of_find_node_by_name(NULL, "ibm,secureboot"); +} + +bool is_powerpc_os_sb_supported(void) +{ + struct device_node *node = NULL; + + node = get_powerpc_fw_sb_node(); + if (node && of_device_is_compatible(node, "ibm,secureboot-v3")) + return true; + + return false; +} + +int 
get_powerpc_os_sb_node(struct device_node **node) +{ + struct device_node *fwsbnode; + + if (!is_powerpc_os_sb_supported()) + return -ENOTSUPP; + + fwsbnode = get_powerpc_fw_sb_node(); + if (!fwsbnode) + return -ENOENT; + + *node = of_find_node_by_name(fwsbnode, "secvar"); + if (*node) + return 0; + + return -ENOENT; +} + +bool is_powerpc_os_secureboot_enabled(void) +{ + struct device_node *node; + u64 sbmode = 0; + int rc; + + rc = get_powerpc_os_sb_node(&node); + if (rc == -ENOTSUPP) + goto disabled; + + /* Fail secure for any failure related to secvar */ + if (rc) { + pr_err("Expected secure variables support, fail secure\n"); + goto enabled; + } + + if (!of_device_is_available(node)) { + pr_err("Secure variables support is in error state, fail secure\n"); + goto enabled; + } + + rc = of_property_read_u64(node, "os-secure-mode", &sbmode); + if (rc) + goto enabled; + + sbmode = be64_to_cpu(sbmode); + + /* checks for the secure mode enforcing bit */ + if (!(sbmode & SECURE_BOOT_MASK)) + goto disabled; + +enabled: + pr_info("secureboot mode enabled\n"); + return true; + +disabled: + pr_info("secureboot mode disabled\n"); + return false; +} -- 2.20.1
[PATCH v6 1/9] dt-bindings: ibm, secureboot: secure boot specific properties for PowerNV
PowerNV represents both the firmware and Host OS secureboot state of the system via device tree. This patch adds the documentation to give the definition of the nodes and the properties. Signed-off-by: Nayna Jain --- .../bindings/powerpc/ibm,secureboot.rst | 76 .../devicetree/bindings/powerpc/secvar.rst| 89 +++ 2 files changed, 165 insertions(+) create mode 100644 Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst create mode 100644 Documentation/devicetree/bindings/powerpc/secvar.rst diff --git a/Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst b/Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst new file mode 100644 index ..03d32099d2eb --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: GPL-2.0 +*** NOTE *** +This document is copied from OPAL firmware +(skiboot/doc/device-tree/ibm,secureboot.rst) + +.. _device-tree/ibm,secureboot: + +ibm,secureboot +== + +The ``??bm,secureboot`` node provides secure boot and trusted boot information +up to the target OS. Further information can be found in :ref:`stb-overview`. + +Required properties +--- + +.. code-block:: none + +compatible: Either one of the following values: + +ibm,secureboot-v1 : The container-verification-code + is stored in a secure ROM memory. + +ibm,secureboot-v2 : The container-verification-code + is stored in a reserved memory. + It described by the ibm,cvc child + node. + +ibm,secureboot-v3 : The container-verification-code + is stored in a reserved memory. + It described by the ibm,cvc child + node. Secure variables are + supported. `secvar` node should + be created. + +secure-enabled: this property exists when the firmware stack is booting +in secure mode (hardware secure boot jumper asserted). + +trusted-enabled:this property exists when the firmware stack is booting +in trusted mode. + +hw-key-hash:hash of the three hardware public keys trusted by the +platformw owner. 
This is used to verify if a firmware +code is signed with trusted keys. + +hw-key-hash-size: hw-key-hash size + +secvar: this node is created if the platform supports secure +variables. Contains information about the current +secvar status, see 'secvar.rst'. + +Obsolete properties +--- + +.. code-block:: none + +hash-algo: Superseded by the hw-key-hash-size property in +'ibm,secureboot-v2'. + +Example +--- + +.. code-block:: dts + +ibm,secureboot { +compatible = "ibm,secureboot-v2"; +secure-enabled; +trusted-enabled; +hw-key-hash-size = <0x40>; +hw-key-hash = <0x40d487ff 0x7380ed6a 0xd54775d5 0x795fea0d 0xe2f541fe + 0xa9db06b8 0x466a42a3 0x20e65f75 0xb4866546 0x0017d907 + 0x515dc2a5 0xf9fc5095 0x4d6ee0c9 0xb67d219d 0xfb708535 + 0x1d01d6d1>; +phandle = <0x10fd>; +linux,phandle = <0x10fd>; +}; diff --git a/Documentation/devicetree/bindings/powerpc/secvar.rst b/Documentation/devicetree/bindings/powerpc/secvar.rst new file mode 100644 index ..47793ab9c2a7 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/secvar.rst @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: GPL-2.0 +*** NOTE *** +This document is copied from OPAL firmware +(skiboot/doc/device-tree/secvar.rst) + +.. _device-tree/ibm,secureboot/secvar: + +secvar +== + +The ``secvar`` node provides secure variable information for the secure +boot of the target OS. + +Required properties +--- + +.. code-block:: none + +compatible: this property is set based on the current secure +variable scheme as set by the platform. + +status: set to "fail" if the secure variables could not +be initialized, validated, or some other +catastrophic failure. + +update-status: contains the return code of the update queue +process run during initialization. Signifies if +updates were processed or not, and if there was +an error. See table belo
[PATCH v6 0/9] powerpc: Enabling IMA arch specific secure boot policies
This patchset extends the previous version of the patchset[1] by adding the support for checking against the binary blacklisted hashes. IMA subsystem supports custom, built-in, arch-specific policies to define the files to be measured and appraised. These policies are honored based on the priority where arch-specific policies is the highest and custom is the lowest. PowerNV systems uses the linux based bootloader and kexec the Host OS. It rely on IMA for signature verification of the kernel before doing the kexec. This patchset adds support for powerpc arch specific ima policies that are defined based on system's OS secureboot and trustedboot state. The OS secureboot and trustedboot state are determined via device-tree properties. The verification needs to be done only for the binaries which are not blacklisted. The kernel currently checks against the blacklisted keys. However that results in blacklisting all the binaries that are signed by that key. In order to prevent single binary from loading, it is required to support checking against blacklisting of the binary hash. This patchset adds the support in IMA to check against blacklisted hashes for the files signed by appended signature. [1] http://patchwork.ozlabs.org/cover/1149262/ Changelog: v6: * includes feedbacks from Michael Ellerman on the patchset v5 * removed email ids from comments * add the doc for the device-tree * renames the secboot.c to secure_boot.c and secboot.h to secure_boot.h * other code specific fixes * split the patches to differentiate between secureboot and trustedboot state of the system * adds the patches to support the blacklisting of the binary hash. v5: * secureboot state is now read via device tree entry rather than OPAL secure variables * ima arch policies are updated to use policy based template for measurement rules v4: * Fixed the build issue as reported by Satheesh Rajendran. v3: * OPAL APIs in Patch 1 are updated to provide generic interface based on key/keylen. 
This patchset updates kernel OPAL APIs to be compatible with generic interface. * Patch 2 is cleaned up to use new OPAL APIs. * Since OPAL can support different types of backend which can vary in the variable interpretation, the Patch 2 is updated to add a check for the backend version * OPAL API now expects consumer to first check the supported backend version before calling other secvar OPAL APIs. This check is now added in patch 2. * IMA policies in Patch 3 is updated to specify appended signature and per policy template. * The patches now are free of any EFIisms. v2: * Removed Patch 1: powerpc/include: Override unneeded early ioremap functions * Updated Subject line and patch description of the Patch 1 of this series * Removed dependency of OPAL_SECVAR on EFI, CPU_BIG_ENDIAN and UCS2_STRING * Changed OPAL APIs from static to non-static. Added opal-secvar.h for the same * Removed EFI hooks from opal_secvar.c * Removed opal_secvar_get_next(), opal_secvar_enqueue() and opal_query_variable_info() function * get_powerpc_sb_mode() in secboot.c now directly calls OPAL Runtime API rather than via EFI hooks. * Fixed log messages in get_powerpc_sb_mode() function. 
* Added dependency for PPC_SECURE_BOOT on configs PPC64 and OPAL_SECVAR * Replaced obj-$(CONFIG_IMA) with obj-$(CONFIG_PPC_SECURE_BOOT) in arch/powerpc/kernel/Makefile Nayna Jain (9): dt-bindings: ibm,secureboot: secure boot specific properties for PowerNV powerpc: detect the secure boot mode of the system powerpc: add support to initialize ima policy rules powerpc: detect the trusted boot state of the system powerpc/ima: add measurement rules to ima arch specific policy ima: make process_buffer_measurement() non-static ima: check against blacklisted hashes for files with modsig ima: deprecate permit_directio, instead use appraise_flag powerpc/ima: update ima arch policy to check for blacklist Documentation/ABI/testing/ima_policy | 3 +- .../bindings/powerpc/ibm,secureboot.rst | 76 +++ .../devicetree/bindings/powerpc/secvar.rst| 89 + arch/powerpc/Kconfig | 12 +++ arch/powerpc/include/asm/secure_boot.h| 37 +++ arch/powerpc/kernel/Makefile | 2 + arch/powerpc/kernel/ima_arch.c| 71 ++ arch/powerpc/kernel/secure_boot.c | 96 +++ include/linux/ima.h | 3 +- security/integrity/ima/ima.h | 15 +++ security/integrity/ima/ima_appraise.c | 35 +++ security/integrity/ima/ima_main.c | 37 +++ security/integrity/ima/ima_policy.c | 12 ++- security/integrity/integrity.h| 1 + 14 files changed, 468 insertions(+), 21 deletions(-) create mode 100644 Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst create mode 100644 Documentation/devicetree/bindings/powerpc/secvar.rst create mode 100644 arch/powerpc/include/
[RFC PATCH] powernv/eeh: Fix oops when probing cxl devices
Recent cleanup in the way EEH support is added to a device causes a kernel oops when the cxl driver probes a device and creates virtual devices discovered on the FPGA: BUG: Kernel NULL pointer dereference at 0x00a0 Faulting instruction address: 0xc0048070 Oops: Kernel access of bad area, sig: 7 [#1] ... NIP [c0048070] eeh_add_device_late.part.9+0x50/0x1e0 LR [c004805c] eeh_add_device_late.part.9+0x3c/0x1e0 Call Trace: [c000200e43983900] [c079e250] _dev_info+0x5c/0x6c (unreliable) [c000200e43983980] [c00d1ad0] pnv_pcibios_bus_add_device+0x60/0xb0 [c000200e439839f0] [c00606d0] pcibios_bus_add_device+0x40/0x60 [c000200e43983a10] [c06aa3a0] pci_bus_add_device+0x30/0x100 [c000200e43983a80] [c06aa4d4] pci_bus_add_devices+0x64/0xd0 [c000200e43983ac0] [c0081c429118] cxl_pci_vphb_add+0xe0/0x130 [cxl] [c000200e43983b00] [c0081c4242ac] cxl_probe+0x504/0x5b0 [cxl] [c000200e43983bb0] [c06bba1c] local_pci_probe+0x6c/0x110 [c000200e43983c30] [c0159278] work_for_cpu_fn+0x38/0x60 The root cause is that those cxl virtual devices don't have a representation in the device tree and therefore no associated pci_dn structure. In eeh_add_device_late(), pdn is NULL, so edev is NULL and we oops. We never had explicit support for EEH for those virtual devices. Instead, EEH events are reported to the (real) pci device and handled by the cxl driver. Which can then forward to the virtual devices and handle dependencies. The fact that we try adding EEH support for the virtual devices is new and a side-effect of the recent cleanup. This patch fixes it by skipping adding EEH support on powernv for devices which don't have a pci_dn structure. Fixes: b905f8cdca77 ("powerpc/eeh: EEH for pSeries hot plug") Signed-off-by: Frederic Barrat --- Sending as an RFC, as I'm afraid of hiding potential issues and would be interested in comments. The powernv eeh code expects a struct pci_dn, so the fix seems safe. 
I'm wondering if there could be cases (other than capi virtual devices) where we'd want to blow up and fix instead of going undetected with this patch. arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 6bc24a47e9ef..6f300ab7f0e9 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -42,7 +42,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev) { struct pci_dn *pdn = pci_get_pdn(pdev); - if (eeh_has_flag(EEH_FORCE_DISABLED)) + if (!pdn || eeh_has_flag(EEH_FORCE_DISABLED)) return; dev_dbg(&pdev->dev, "EEH: Setting up device\n"); -- 2.21.0
Re: [PATCH v1 1/2] PCI/AER: Use for_each_set_bit()
On Tue, Aug 27, 2019 at 06:18:22PM +0300, Andy Shevchenko wrote: > This simplifies and standardizes slot manipulation code > by using for_each_set_bit() library function. > > Signed-off-by: Andy Shevchenko > --- > drivers/pci/pcie/aer.c | 19 --- > 1 file changed, 8 insertions(+), 11 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index b45bc47d04fe..f883f81d759a 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -15,6 +15,7 @@ > #define pr_fmt(fmt) "AER: " fmt > #define dev_fmt pr_fmt > > +#include > #include > #include > #include > @@ -657,7 +658,8 @@ const struct attribute_group aer_stats_attr_group = { > static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > struct aer_err_info *info) > { > - int status, i, max = -1; > + unsigned long status = info->status & ~info->mask; > + int i, max = -1; > u64 *counter = NULL; > struct aer_stats *aer_stats = pdev->aer_stats; > > @@ -682,10 +684,8 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > break; > } > > - status = (info->status & ~info->mask); > - for (i = 0; i < max; i++) > - if (status & (1 << i)) > - counter[i]++; > + for_each_set_bit(i, &status, max) I applied this, but I confess to being a little ambivalent. It's arguably a little easier to read, but it's not nearly as efficient (not a great concern here) and more importantly much harder to verify that it's correct because you have to chase through for_each_set_bit(), find_first_bit(), _ffs(), etc, etc. No doubt it's great for bitmaps of arbitrary size, but for a simple 32-bit register I'm a little hesitant. But I applied it anyway. 
> + counter[i]++; > } > > static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, > @@ -717,14 +717,11 @@ static void __print_tlp_header(struct pci_dev *dev, > static void __aer_print_error(struct pci_dev *dev, > struct aer_err_info *info) > { > - int i, status; > + unsigned long status = info->status & ~info->mask; > const char *errmsg = NULL; > - status = (info->status & ~info->mask); > - > - for (i = 0; i < 32; i++) { > - if (!(status & (1 << i))) > - continue; > + int i; > > + for_each_set_bit(i, &status, 32) { > if (info->severity == AER_CORRECTABLE) > errmsg = i < ARRAY_SIZE(aer_correctable_error_string) ? > aer_correctable_error_string[i] : NULL; > -- > 2.23.0.rc1 >
[PATCH v2 6/6] KVM: PPC: Book3S HV: XIVE: Allow userspace to set the # of VPs
Add a new attribute to both XIVE and XICS-on-XIVE KVM devices so that userspace can tell how many interrupt servers it needs. If a VM needs less than the current default of KVM_MAX_VCPUS (2048), we can allocate less VPs in OPAL. Combined with a core stride (VSMT) that matches the number of guest threads per core, this may substantially increases the number of VMs that can run concurrently with an in-kernel XIVE device. Since the legacy XIVE KVM device is exposed to userspace through the XICS KVM API, a new attribute group is added to it for this purpose. While here, fix the syntax of the existing KVM_DEV_XICS_GRP_SOURCES in the XICS documentation. Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater --- v2: - changelog update --- Documentation/virt/kvm/devices/xics.txt | 14 -- Documentation/virt/kvm/devices/xive.txt |8 arch/powerpc/include/uapi/asm/kvm.h |3 +++ arch/powerpc/kvm/book3s_xive.c | 10 ++ arch/powerpc/kvm/book3s_xive_native.c |3 +++ 5 files changed, 36 insertions(+), 2 deletions(-) diff --git a/Documentation/virt/kvm/devices/xics.txt b/Documentation/virt/kvm/devices/xics.txt index 42864935ac5d..423332dda7bc 100644 --- a/Documentation/virt/kvm/devices/xics.txt +++ b/Documentation/virt/kvm/devices/xics.txt @@ -3,9 +3,19 @@ XICS interrupt controller Device type supported: KVM_DEV_TYPE_XICS Groups: - KVM_DEV_XICS_SOURCES + 1. KVM_DEV_XICS_GRP_SOURCES Attributes: One per interrupt source, indexed by the source number. + 2. KVM_DEV_XICS_GRP_CTRL + Attributes: +2.1 KVM_DEV_XICS_NR_SERVERS (write only) + The kvm_device_attr.addr points to a __u32 value which is the number of + interrupt server numbers (ie, highest possible vcpu id plus one). + Errors: +-EINVAL: Value greater than KVM_MAX_VCPU_ID. +-EFAULT: Invalid user pointer for attr->addr. +-EBUSY: A vcpu is already connected to the device. + This device emulates the XICS (eXternal Interrupt Controller Specification) defined in PAPR. 
The XICS has a set of interrupt sources, each identified by a 20-bit source number, and a set of @@ -38,7 +48,7 @@ least-significant end of the word: Each source has 64 bits of state that can be read and written using the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the -KVM_DEV_XICS_SOURCES attribute group, with the attribute number being +KVM_DEV_XICS_GRP_SOURCES attribute group, with the attribute number being the interrupt source number. The 64 bit state word has the following bitfields, starting from the least-significant end of the word: diff --git a/Documentation/virt/kvm/devices/xive.txt b/Documentation/virt/kvm/devices/xive.txt index 9a24a4525253..f5d1d6b5af61 100644 --- a/Documentation/virt/kvm/devices/xive.txt +++ b/Documentation/virt/kvm/devices/xive.txt @@ -78,6 +78,14 @@ the legacy interrupt mode, referred as XICS (POWER7/8). migrating the VM. Errors: none +1.3 KVM_DEV_XIVE_NR_SERVERS (write only) +The kvm_device_attr.addr points to a __u32 value which is the number of +interrupt server numbers (ie, highest possible vcpu id plus one). +Errors: + -EINVAL: Value greater than KVM_MAX_VCPU_ID. + -EFAULT: Invalid user pointer for attr->addr. + -EBUSY: A vCPU is already connected to the device. + 2. KVM_DEV_XIVE_GRP_SOURCE (write only) Initializes a new source in the XIVE device and mask it. 
Attributes: diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index b0f72dea8b11..264e266a85bf 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -667,6 +667,8 @@ struct kvm_ppc_cpu_char { /* PPC64 eXternal Interrupt Controller Specification */ #define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ +#define KVM_DEV_XICS_GRP_CTRL 2 +#define KVM_DEV_XICS_NR_SERVERS 1 /* Layout of 64-bit source attribute values */ #define KVM_XICS_DESTINATION_SHIFT0 @@ -683,6 +685,7 @@ struct kvm_ppc_cpu_char { #define KVM_DEV_XIVE_GRP_CTRL 1 #define KVM_DEV_XIVE_RESET 1 #define KVM_DEV_XIVE_EQ_SYNC 2 +#define KVM_DEV_XIVE_NR_SERVERS 3 #define KVM_DEV_XIVE_GRP_SOURCE2 /* 64-bit source identifier */ #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */ #define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */ diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6c35b3d95986..66858b7d3c6b 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1911,6 +1911,11 @@ static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) switch (attr->group) { case KVM_DEV_XICS_GRP_SOURCES: return xive_set_source(xive, attr->attr, attr->addr); + case KVM_DEV_XICS_GRP_CTRL: + switch (attr->attr) { +
[PATCH v2 5/6] KVM: PPC: Book3S HV: XIVE: Make VP block size configurable
The XIVE VP is an internal structure which allow the XIVE interrupt controller to maintain the interrupt context state of vCPUs non dispatched on HW threads. When a guest is started, the XIVE KVM device allocates a block of XIVE VPs in OPAL, enough to accommodate the highest possible vCPU id KVM_MAX_VCPU_ID (16384) packed down to KVM_MAX_VCPUS (2048). With a guest's core stride of 8 and a threading mode of 1 (QEMU's default), a VM must run at least 256 vCPUs to actually need such a range of VPs. A POWER9 system has a limited XIVE VP space : 512k and KVM is currently wasting this HW resource with large VP allocations, especially since a typical VM likely runs with a lot less vCPUs. Make the size of the VP block configurable. Add an nr_servers field to the XIVE structure and a function to set it for this purpose. Split VP allocation out of the device create function. Since the VP block isn't used before the first vCPU connects to the XIVE KVM device, allocation is now performed by kvmppc_xive_connect_vcpu(). This gives the opportunity to set nr_servers in between: kvmppc_xive_create() / kvmppc_xive_native_create() . . kvmppc_xive_set_nr_servers() . . kvmppc_xive_connect_vcpu() / kvmppc_xive_native_connect_vcpu() The connect_vcpu() functions check that the vCPU id is below nr_servers and if it is the first vCPU they allocate the VP block. This is protected against a concurrent update of nr_servers by kvmppc_xive_set_nr_servers() with the xive->lock mutex. Also, the block is allocated once for the device lifetime: nr_servers should stay constant otherwise connect_vcpu() could generate a boggus VP id and likely crash OPAL. It is thus forbidden to update nr_servers once the block is allocated. If the VP allocation fail, return ENOSPC which seems more appropriate to report the depletion of system wide HW resource than ENOMEM or ENXIO. A VM using a stride of 8 and 1 thread per core with 32 vCPUs would hence only need 256 VPs instead of 2048. 
If the stride is set to match the number of threads per core, this goes further down to 32. This will be exposed to userspace by a subsequent patch. Signed-off-by: Greg Kurz --- v2: - update nr_server check and clip down to KVM_MAX_VCPUS in kvmppc_xive_set_nr_servers() --- arch/powerpc/kvm/book3s_xive.c| 65 +++-- arch/powerpc/kvm/book3s_xive.h|4 ++ arch/powerpc/kvm/book3s_xive_native.c | 18 +++-- 3 files changed, 62 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index d84da9f6ee88..6c35b3d95986 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1213,13 +1213,13 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) { - /* We have a block of KVM_MAX_VCPUS VPs. We just need to check + /* We have a block of xive->nr_servers VPs. We just need to check * raw vCPU ids are below the expected limit for this guest's * core stride ; kvmppc_pack_vcpu_id() will pack them down to an * index that can be safely used to compute a VP id that belongs * to the VP block. 
*/ - return cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode; + return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode; } int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) @@ -1231,6 +1231,14 @@ int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) return -EINVAL; } + if (xive->vp_base == XIVE_INVALID_VP) { + xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers); + pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, xive->nr_servers); + + if (xive->vp_base == XIVE_INVALID_VP) + return -ENOSPC; + } + vp_id = kvmppc_xive_vp(xive, cpu); if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { pr_devel("Duplicate !\n"); @@ -1858,6 +1866,43 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, return 0; } +int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr) +{ + u32 __user *ubufp = (u32 __user *) addr; + u32 nr_servers; + int rc = 0; + + if (get_user(nr_servers, ubufp)) + return -EFAULT; + + pr_devel("%s nr_servers=%u\n", __func__, nr_servers); + + if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID) + return -EINVAL; + + mutex_lock(&xive->lock); + if (xive->vp_base != XIVE_INVALID_VP) + /* The VP block is allocated once and freed when the device +* is released. Better not allow to change its size since its +
[PATCH v2 4/6] KVM: PPC: Book3S HV: XIVE: Compute the VP id in a common helper
Reduce code duplication by consolidating the checking of vCPU ids and VP ids to a common helper used by both legacy and native XIVE KVM devices. And explain the magic with a comment. Signed-off-by: Greg Kurz --- arch/powerpc/kvm/book3s_xive.c| 42 ++--- arch/powerpc/kvm/book3s_xive.h|1 + arch/powerpc/kvm/book3s_xive_native.c | 11 ++--- 3 files changed, 36 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 0b7859e40f66..d84da9f6ee88 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1211,6 +1211,37 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) vcpu->arch.xive_vcpu = NULL; } +static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) +{ + /* We have a block of KVM_MAX_VCPUS VPs. We just need to check +* raw vCPU ids are below the expected limit for this guest's +* core stride ; kvmppc_pack_vcpu_id() will pack them down to an +* index that can be safely used to compute a VP id that belongs +* to the VP block. 
+*/ + return cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode; +} + +int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) +{ + u32 vp_id; + + if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) { + pr_devel("Out of bounds !\n"); + return -EINVAL; + } + + vp_id = kvmppc_xive_vp(xive, cpu); + if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { + pr_devel("Duplicate !\n"); + return -EEXIST; + } + + *vp = vp_id; + + return 0; +} + int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu) { @@ -1229,20 +1260,13 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { - pr_devel("Out of bounds !\n"); - return -EINVAL; - } /* We need to synchronize with queue provisioning */ mutex_lock(&xive->lock); - vp_id = kvmppc_xive_vp(xive, cpu); - if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { - pr_devel("Duplicate !\n"); - r = -EEXIST; + r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id); + if (r) goto bail; - } xc = kzalloc(sizeof(*xc), GFP_KERNEL); if (!xc) { diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index fe3ed50e0818..90cf6ec35a68 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -296,6 +296,7 @@ int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio, struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type); void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, struct kvmppc_xive_vcpu *xc, int irq); +int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 43a86858390a..5bb480b2aafd 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -118,19 +118,12 @@ int 
kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, return -EPERM; if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT) return -EBUSY; - if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) { - pr_devel("Out of bounds !\n"); - return -EINVAL; - } mutex_lock(&xive->lock); - vp_id = kvmppc_xive_vp(xive, server_num); - if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) { - pr_devel("Duplicate !\n"); - rc = -EEXIST; + rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id); + if (rc) goto bail; - } xc = kzalloc(sizeof(*xc), GFP_KERNEL); if (!xc) {
[PATCH v2 3/6] KVM: PPC: Book3S HV: XIVE: Show VP id in debugfs
Print out the VP id of each connected vCPU, this allow to see: - the VP block base in which OPAL encodes information that may be useful when debugging - the packed vCPU id which may differ from the raw vCPU id if the latter is >= KVM_MAX_VCPUS (2048) Signed-off-by: Greg Kurz --- arch/powerpc/kvm/book3s_xive.c|4 ++-- arch/powerpc/kvm/book3s_xive_native.c |4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index baa740815b3c..0b7859e40f66 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -2107,9 +2107,9 @@ static int xive_debug_show(struct seq_file *m, void *private) if (!xc) continue; - seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x" + seq_printf(m, "cpu server %#x VP:%#x CPPR:%#x HWCPPR:%#x" " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n", - xc->server_num, xc->cppr, xc->hw_cppr, + xc->server_num, xc->vp_id, xc->cppr, xc->hw_cppr, xc->mfrr, xc->pending, xc->stat_rm_h_xirr, xc->stat_vm_h_xirr); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index ebb4215baf45..43a86858390a 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1204,8 +1204,8 @@ static int xive_native_debug_show(struct seq_file *m, void *private) if (!xc) continue; - seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", - xc->server_num, + seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n", + xc->server_num, xc->vp_id, vcpu->arch.xive_saved_state.nsr, vcpu->arch.xive_saved_state.cppr, vcpu->arch.xive_saved_state.ipb,
[PATCH v2 2/6] KVM: PPC: Book3S HV: XIVE: Ensure VP isn't already in use
Connecting a vCPU to a XIVE KVM device means establishing a 1:1 association between a vCPU id and the offset (VP id) of a VP structure within a fixed size block of VPs. We currently try to enforce the 1:1 relationship by checking that a vCPU with the same id isn't already connected. This is good but unfortunately not enough because we don't map VP ids to raw vCPU ids but to packed vCPU ids, and the packing function kvmppc_pack_vcpu_id() isn't bijective by design. We got away with it because QEMU passes vCPU ids that fit well in the packing pattern. But nothing prevents userspace from coming up with a forged vCPU id resulting in a packed id collision which causes the KVM device to associate two vCPUs with the same VP. This greatly confuses the irq layer and ultimately crashes the kernel, as shown below. Example: a guest with 1 guest thread per core, a core stride of 8 and 300 vCPUs has vCPU ids 0,8,16...2392. If QEMU is patched to inject at some point an invalid vCPU id 348, which is the packed version of itself and 2392, we get: genirq: Flags mismatch irq 199. 0001 (kvm-2-2392) vs. 
0001 (kvm-2-348) CPU: 24 PID: 88176 Comm: qemu-system-ppc Not tainted 5.3.0-xive-nr-servers-5.3-gku+ #38 Call Trace: [c03f7f9937e0] [c0c0110c] dump_stack+0xb0/0xf4 (unreliable) [c03f7f993820] [c01cb480] __setup_irq+0xa70/0xad0 [c03f7f9938d0] [c01cb75c] request_threaded_irq+0x13c/0x260 [c03f7f993940] [c0080d44e7ac] kvmppc_xive_attach_escalation+0x104/0x270 [kvm] [c03f7f9939d0] [c0080d45013c] kvmppc_xive_connect_vcpu+0x424/0x620 [kvm] [c03f7f993ac0] [c0080d28] kvm_arch_vcpu_ioctl+0x260/0x448 [kvm] [c03f7f993b90] [c0080d43593c] kvm_vcpu_ioctl+0x154/0x7c8 [kvm] [c03f7f993d00] [c04840f0] do_vfs_ioctl+0xe0/0xc30 [c03f7f993db0] [c0484d44] ksys_ioctl+0x104/0x120 [c03f7f993e00] [c0484d88] sys_ioctl+0x28/0x80 [c03f7f993e20] [c000b278] system_call+0x5c/0x68 xive-kvm: Failed to request escalation interrupt for queue 0 of VCPU 2392 [ cut here ] remove_proc_entry: removing non-empty directory 'irq/199', leaking at least 'kvm-2-348' WARNING: CPU: 24 PID: 88176 at /home/greg/Work/linux/kernel-kvm-ppc/fs/proc/generic.c:684 remove_proc_entry+0x1ec/0x200 Modules linked in: kvm_hv kvm dm_mod vhost_net vhost tap xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter squashfs loop fuse i2c_dev sg ofpart ocxl powernv_flash at24 xts mtd uio_pdrv_genirq vmx_crypto opal_prd ipmi_powernv uio ipmi_devintf ipmi_msghandler ibmpowernv ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ip_tables ext4 mbcache jbd2 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq libcrc32c raid1 raid0 linear sd_mod ast i2c_algo_bit drm_vram_helper ttm drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci libata tg3 drm_panel_orientation_quirks [last unloaded: kvm] CPU: 24 PID: 88176 Comm: qemu-system-ppc Not tainted 5.3.0-xive-nr-servers-5.3-gku+ 
#38 NIP: c053b0cc LR: c053b0c8 CTR: c00ba3b0 REGS: c03f7f9934b0 TRAP: 0700 Not tainted (5.3.0-xive-nr-servers-5.3-gku+) MSR: 90029033 CR: 48228222 XER: 2004 CFAR: c0131a50 IRQMASK: 0 GPR00: c053b0c8 c03f7f993740 c15ec500 0057 GPR04: 0001 49fb98484262 1bcf GPR08: 0007 0007 0001 90001033 GPR12: 8000 c03eb800 00012f4ce5a1 GPR16: 00012ef5a0c8 00012f113bb0 GPR20: 00012f45d918 c03f863758b0 c03f86375870 0006 GPR24: c03f86375a30 0007 c0002039373d9020 c14c4a48 GPR28: 0001 c03fe62a4f6b c00020394b2e9fab c03fe62a4ec0 NIP [c053b0cc] remove_proc_entry+0x1ec/0x200 LR [c053b0c8] remove_proc_entry+0x1e8/0x200 Call Trace: [c03f7f993740] [c053b0c8] remove_proc_entry+0x1e8/0x200 (unreliable) [c03f7f9937e0] [c01d3654] unregister_irq_proc+0x114/0x150 [c03f7f993880] [c01c6284] free_desc+0x54/0xb0 [c03f7f9938c0] [c01c65ec] irq_free_descs+0xac/0x100 [c03f7f993910] [c01d1ff8] irq_dispose_mapping+0x68/0x80 [c03f7f993940] [c0080d44e8a4] kvmppc_xive_attach_escalation+0x1fc/0x270 [kvm] [c03f7f9939d0] [c0080d45013c] kvmppc_xive_connect_vcpu+0x424/0x620 [kvm] [c03f7f993ac0] [c0080d28] kvm_arch_vcpu_ioctl+0x260/0x448 [kvm] [c03f7f993b90] [c0080d43593c] kvm_vcpu_ioctl+0x154/0x7c8 [kvm] [c03f7f993d00] [c04840f0] do_vfs_ioctl+0xe0/0xc30 [c03f7f993db0] [c0484d44] ksys_ioctl+0x104/0x120 [
[PATCH v2 1/6] KVM: PPC: Book3S HV: XIVE: Set kvm->arch.xive when VPs are allocated
If we cannot allocate the XIVE VPs in OPAL, the creation of a XIVE or XICS-on-XIVE device is aborted as expected, but we leave kvm->arch.xive set forever since the release method isn't called in this case. Any subsequent tentative to create a XIVE or XICS-on-XIVE for this VM will thus always fail (DoS). This is a problem for QEMU since it destroys and re-creates these devices when the VM is reset: the VM would be restricted to using the much slower emulated XIVE or XICS forever. As an alternative to adding rollback, do not assign kvm->arch.xive before making sure the XIVE VPs are allocated in OPAL. Cc: sta...@vger.kernel.org # v5.2 Fixes: 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy' method by a 'release' method") Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater --- arch/powerpc/kvm/book3s_xive.c| 11 +-- arch/powerpc/kvm/book3s_xive_native.c |2 +- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 591bfb4bfd0f..99884662facb 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1997,6 +1997,10 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) pr_devel("Creating xive for partition\n"); + /* Already there ? */ + if (kvm->arch.xive) + return -EEXIST; + xive = kvmppc_xive_get_device(kvm, type); if (!xive) return -ENOMEM; @@ -2006,12 +2010,6 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) xive->kvm = kvm; mutex_init(&xive->lock); - /* Already there ? 
*/ - if (kvm->arch.xive) - ret = -EEXIST; - else - kvm->arch.xive = xive; - /* We use the default queue size set by the host */ xive->q_order = xive_native_default_eq_shift(); if (xive->q_order < PAGE_SHIFT) @@ -2031,6 +2029,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (ret) return ret; + kvm->arch.xive = xive; return 0; } diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 248c1ea9e788..6b91c74839d5 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -1079,7 +1079,6 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) dev->private = xive; xive->dev = dev; xive->kvm = kvm; - kvm->arch.xive = xive; mutex_init(&xive->mapping_lock); mutex_init(&xive->lock); @@ -1100,6 +1099,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) if (ret) return ret; + kvm->arch.xive = xive; return 0; }
[PATCH v2 0/6] KVM: PPC: Book3S: HV: XIVE: Allocate less VPs in OPAL
This brings some fixes and allows to start more VMs with an in-kernel XIVE or XICS-on-XIVE device. Changes since v1 (https://patchwork.ozlabs.org/cover/1166099/): - drop a useless patch - add a patch to show VP ids in debugfs - update some changelogs - fix buggy check in patch 5 - Cc: stable -- Greg --- Greg Kurz (6): KVM: PPC: Book3S HV: XIVE: Set kvm->arch.xive when VPs are allocated KVM: PPC: Book3S HV: XIVE: Ensure VP isn't already in use KVM: PPC: Book3S HV: XIVE: Show VP id in debugfs KVM: PPC: Book3S HV: XIVE: Compute the VP id in a common helper KVM: PPC: Book3S HV: XIVE: Make VP block size configurable KVM: PPC: Book3S HV: XIVE: Allow userspace to set the # of VPs Documentation/virt/kvm/devices/xics.txt | 14 +++ Documentation/virt/kvm/devices/xive.txt |8 ++ arch/powerpc/include/uapi/asm/kvm.h |3 + arch/powerpc/kvm/book3s_xive.c | 142 --- arch/powerpc/kvm/book3s_xive.h | 17 arch/powerpc/kvm/book3s_xive_native.c | 40 +++-- 6 files changed, 167 insertions(+), 57 deletions(-)
Re: [PATCH v1] powerpc/pseries: CMM: Drop page array
On 25.09.19 09:37, David Hildenbrand wrote: > On 10.09.19 18:39, David Hildenbrand wrote: >> We can simply store the pages in a list (page->lru), no need for a >> separate data structure (+ complicated handling). This is how most >> other balloon drivers store allocated pages without additional tracking >> data. >> >> For the notifiers, use page_to_pfn() to check if a page is in the >> applicable range. plpar_page_set_loaned()/plpar_page_set_active() were >> called with __pa(page_address()) for now, I assume we can simply switch >> to page_to_phys() here. The pfn_to_kaddr() handling is now mostly gone. >> >> Cc: Benjamin Herrenschmidt >> Cc: Paul Mackerras >> Cc: Michael Ellerman >> Cc: Arun KS >> Cc: Pavel Tatashin >> Cc: Thomas Gleixner >> Cc: Andrew Morton >> Cc: Vlastimil Babka >> Signed-off-by: David Hildenbrand >> --- >> >> Only compile-tested. I hope the page_to_phys() thingy is correct and I >> didn't mess up something else / ignoring something important why the array >> is needed. >> >> I stumbled over this while looking at how the memory isolation notifier is >> used - and wondered why the additional array is necessary. Also, I think >> by switching to the generic balloon compaction mechanism, we could get >> rid of the memory hotplug notifier and the memory isolation notifier in >> this code, as the migration capability of the inflated pages is the real >> requirement: >> commit 14b8a76b9d53346f2871bf419da2aaf219940c50 >> Author: Robert Jennings >> Date: Thu Dec 17 14:44:52 2009 + >> >> powerpc: Make the CMM memory hotplug aware >> >> The Collaborative Memory Manager (CMM) module allocates individual >> pages >> over time that are not migratable. On a long running system this >> can >> severely impact the ability to find enough pages to support a >> hotplug >> memory remove operation. >> [...] >> >> Thoughts? > > Ping, is still feature still used at all? > > If nobody can test, any advise on which HW I need and how to trigger it? > So ... 
if CMM is no longer alive I propose ripping it out completely. Does anybody know if this feature is still getting used? Getting rid of the memory isolation notifier sounds desirable - either by scrapping CMM or by properly wiring up balloon compaction. -- Thanks, David / dhildenb
Re: [PATCH v1 1/2] PCI/AER: Use for_each_set_bit()
On Tue, Aug 27, 2019 at 06:18:22PM +0300, Andy Shevchenko wrote: > This simplifies and standardizes slot manipulation code > by using for_each_set_bit() library function. > > Signed-off-by: Andy Shevchenko Applied both with Kuppuswamy's reviewed-by to pci/aer for v5.5, thanks! > --- > drivers/pci/pcie/aer.c | 19 --- > 1 file changed, 8 insertions(+), 11 deletions(-) > > diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c > index b45bc47d04fe..f883f81d759a 100644 > --- a/drivers/pci/pcie/aer.c > +++ b/drivers/pci/pcie/aer.c > @@ -15,6 +15,7 @@ > #define pr_fmt(fmt) "AER: " fmt > #define dev_fmt pr_fmt > > +#include > #include > #include > #include > @@ -657,7 +658,8 @@ const struct attribute_group aer_stats_attr_group = { > static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > struct aer_err_info *info) > { > - int status, i, max = -1; > + unsigned long status = info->status & ~info->mask; > + int i, max = -1; > u64 *counter = NULL; > struct aer_stats *aer_stats = pdev->aer_stats; > > @@ -682,10 +684,8 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev, > break; > } > > - status = (info->status & ~info->mask); > - for (i = 0; i < max; i++) > - if (status & (1 << i)) > - counter[i]++; > + for_each_set_bit(i, &status, max) > + counter[i]++; > } > > static void pci_rootport_aer_stats_incr(struct pci_dev *pdev, > @@ -717,14 +717,11 @@ static void __print_tlp_header(struct pci_dev *dev, > static void __aer_print_error(struct pci_dev *dev, > struct aer_err_info *info) > { > - int i, status; > + unsigned long status = info->status & ~info->mask; > const char *errmsg = NULL; > - status = (info->status & ~info->mask); > - > - for (i = 0; i < 32; i++) { > - if (!(status & (1 << i))) > - continue; > + int i; > > + for_each_set_bit(i, &status, 32) { > if (info->severity == AER_CORRECTABLE) > errmsg = i < ARRAY_SIZE(aer_correctable_error_string) ? > aer_correctable_error_string[i] : NULL; > -- > 2.23.0.rc1 >
Re: [PATCH v4 5/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory
Comment below... On Thu, 26 Sep 2019 at 12:18, Alastair D'Silva wrote: > > From: Alastair D'Silva > > When presented with large amounts of memory being hotplugged > (in my test case, ~890GB), the call to flush_dcache_range takes > a while (~50 seconds), triggering RCU stalls. > > This patch breaks up the call into 1GB chunks, calling > cond_resched() inbetween to allow the scheduler to run. > > Signed-off-by: Alastair D'Silva > --- > arch/powerpc/mm/mem.c | 27 +-- > 1 file changed, 25 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c > index cff947cb2a84..a2758e473d58 100644 > --- a/arch/powerpc/mm/mem.c > +++ b/arch/powerpc/mm/mem.c > @@ -104,6 +104,27 @@ int __weak remove_section_mapping(unsigned long start, > unsigned long end) > return -ENODEV; > } > > +#define FLUSH_CHUNK_SIZE SZ_1G > +/** > + * flush_dcache_range_chunked(): Write any modified data cache blocks out to > + * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE > + * Does not invalidate the corresponding instruction cache blocks. > + * > + * @start: the start address > + * @stop: the stop address (exclusive) > + * @chunk: the max size of the chunks > + */ > +static void flush_dcache_range_chunked(unsigned long start, unsigned long > stop, > + unsigned long chunk) > +{ > + unsigned long i; > + > + for (i = start; i < stop; i += FLUSH_CHUNK_SIZE) { Here you ignore the function parameter "chunk" and use the define FLUSH_CHUNK_SIZE. You should do one or the other; use the parameter or remove it. 
> + flush_dcache_range(i, min(stop, start + FLUSH_CHUNK_SIZE)); > + cond_resched(); > + } > +} > + > int __ref arch_add_memory(int nid, u64 start, u64 size, > struct mhp_restrictions *restrictions) > { > @@ -120,7 +141,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size, > start, start + size, rc); > return -EFAULT; > } > - flush_dcache_range(start, start + size); > + > + flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); > > return __add_pages(nid, start_pfn, nr_pages, restrictions); > } > @@ -137,7 +159,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 > size, > > /* Remove htab bolted mappings for this section of memory */ > start = (unsigned long)__va(start); > - flush_dcache_range(start, start + size); > + flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE); > + > ret = remove_section_mapping(start, start + size); > WARN_ON_ONCE(ret); > > -- > 2.21.0 >