[PATCH v4 11/11] powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing

2019-09-27 Thread Leonardo Bras
Skips the slow part of serialize_against_pte_lookup() if there is no
lockless pagetable walk currently running.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/pgtable.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 6ba6195bff1b..27966481f2a6 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -95,7 +95,8 @@ static void do_nothing(void *unused)
 void serialize_against_pte_lookup(struct mm_struct *mm)
 {
smp_mb();
-   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+   if (running_lockless_pgtbl_walk(mm))
+   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
 }
 
 /*
-- 
2.20.1
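
The skip is only safe because of barrier pairing with the helpers from patch
01. A minimal sketch of both sides, condensed from patches 01 and 11 (not
itself part of the diff):

	/* walker side (patch 01 helpers around a lockless walk) */
	start_lockless_pgtbl_walk(mm);	/* atomic_inc(), then smp_mb() */
	/* ... walk and dereference ptes, interrupts disabled ... */
	end_lockless_pgtbl_walk(mm);	/* smp_mb(), then atomic_dec() */

	/* serializer side (this patch) */
	smp_mb();			/* order pmd update vs. counter read */
	if (running_lockless_pgtbl_walk(mm))	/* non-zero: walkers inside */
		smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);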



[PATCH v4 10/11] powerpc/book3s_64: Enables counting method to monitor lockless pgtbl walk

2019-09-27 Thread Leonardo Bras
Enables count-based monitoring method for lockless pagetable walks on
PowerPC book3s_64.

Other architectures/platforms fall back to using generic dummy functions.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8308f32e9782..eb9b26a4a483 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1370,5 +1370,10 @@ static inline bool pgd_is_leaf(pgd_t pgd)
return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PTE));
 }
 
+#define __HAVE_ARCH_LOCKLESS_PGTBL_WALK_COUNTER
+void start_lockless_pgtbl_walk(struct mm_struct *mm);
+void end_lockless_pgtbl_walk(struct mm_struct *mm);
+int running_lockless_pgtbl_walk(struct mm_struct *mm);
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
-- 
2.20.1



[PATCH v4 09/11] powerpc/kvm/book3s_64: Applies counting method to monitor lockless pgtbl walks

2019-09-27 Thread Leonardo Bras
Applies the counting-based method for monitoring all book3s_64-related
functions that do lockless pagetable walks.

Adds comments explaining that some lockless pagetable walks don't need
protection, either because the guest pgd is not a target of THP
collapse/split, or because they are called from real mode with MSR_EE = 0.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/book3s_64_mmu_hv.c|  2 ++
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 30 ++
 arch/powerpc/kvm/book3s_64_vio_hv.c|  3 +++
 3 files changed, 35 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 9a75f0e1933b..fcd3dad1297f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -620,6 +620,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 * We need to protect against page table destruction
 * hugepage split and collapse.
 */
+   start_lockless_pgtbl_walk(kvm->mm);
local_irq_save(flags);
ptep = find_current_mm_pte(current->mm->pgd,
   hva, NULL, NULL);
@@ -629,6 +630,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
write_ok = 1;
}
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(kvm->mm);
}
}
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 2d415c36a61d..9b374b9838fa 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -813,6 +813,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 * Read the PTE from the process' radix tree and use that
 * so we get the shift and attribute bits.
 */
+   start_lockless_pgtbl_walk(kvm->mm);
local_irq_disable();
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
/*
@@ -821,12 +822,14 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 */
if (!ptep) {
local_irq_enable();
+   end_lockless_pgtbl_walk(kvm->mm);
if (page)
put_page(page);
return RESUME_GUEST;
}
pte = *ptep;
local_irq_enable();
+   end_lockless_pgtbl_walk(kvm->mm);
 
/* If we're logging dirty pages, always map single pages */
large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
@@ -972,10 +975,16 @@ int kvm_unmap_radix(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
 
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
 kvm->arch.lpid);
+
return 0;   
 }
 
@@ -989,6 +998,11 @@ int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot 
*memslot,
int ref = 0;
unsigned long old, *rmapp;
 
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
@@ -1013,6 +1027,11 @@ int kvm_test_age_radix(struct kvm *kvm, struct 
kvm_memory_slot *memslot,
unsigned int shift;
int ref = 0;
 
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep))
ref = 1;
@@ -1030,6 +1049,11 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
int ret = 0;
unsigned long old, *rmapp;
 
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep) && 

[PATCH v4 08/11] powerpc/kvm/book3s_hv: Applies counting method to monitor lockless pgtbl walks

2019-09-27 Thread Leonardo Bras
Applies the counting-based method for monitoring all book3s_hv related
functions that do lockless pagetable walks.

Adds comments explaining that some lockless pagetable walks don't need
protection, either because the guest pgd is not a target of THP
collapse/split, or because they are called from real mode with MSR_EE = 0.

kvmppc_do_h_enter: Fixes where local_irq_restore() must be placed (after
the last usage of ptep).

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/book3s_hv_nested.c | 22 --
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 18 ++
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 735e0ac6f5b2..5a641b559de7 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -803,7 +803,11 @@ static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 
n_rmap,
if (!gp)
return;
 
-   /* Find the pte */
+   /* Find the pte:
+* We are walking the nested guest (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
/*
 * If the pte is present and the pfn is still the same, update the pte.
@@ -853,7 +857,11 @@ static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 
n_rmap,
if (!gp)
return;
 
-   /* Find and invalidate the pte */
+   /* Find and invalidate the pte:
+* We are walking the nested guest (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
/* Don't spuriously invalidate ptes if the pfn has changed */
if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
@@ -921,6 +929,11 @@ static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu 
*vcpu,
int shift;
 
	spin_lock(&kvm->mmu_lock);
+   /*
+* We are walking the nested guest (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
@@ -1362,6 +1375,11 @@ static long int __kvmhv_nested_page_fault(struct kvm_run 
*run,
/* See if can find translation in our partition scoped tables for L1 */
pte = __pte(0);
	spin_lock(&kvm->mmu_lock);
+   /*
+* We are walking the secondary (partition-scoped) page table here.
+* We can do this without disabling irq because the Linux MM
+* subsystem doesn't do THP splits and collapses on this tree.
+*/
	pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 63e0ce91e29d..2076a7ac230a 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -252,6 +252,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 * If we had a page table table change after lookup, we would
 * retry via mmu_notifier_retry.
 */
+   start_lockless_pgtbl_walk(kvm->mm);
if (!realmode)
local_irq_save(irq_flags);
/*
@@ -287,8 +288,6 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
pa |= gpa & ~PAGE_MASK;
}
}
-   if (!realmode)
-   local_irq_restore(irq_flags);
 
ptel &= HPTE_R_KEY | HPTE_R_PP0 | (psize-1);
ptel |= pa;
@@ -311,6 +310,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
ptel |= HPTE_R_M;
}
+   if (!realmode)
+   local_irq_restore(irq_flags);
+   end_lockless_pgtbl_walk(kvm->mm);
 
/* Find and lock the HPTEG slot to use */
  do_insert:
@@ -885,11 +887,19 @@ static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned 
long gpa,
/* Translate to host virtual address */
hva = __gfn_to_hva_memslot(memslot, gfn);
 
-   /* Try to find the host pte for that virtual address */
+   /* Try to find the host pte for that virtual address:
+    * Called by hcall_real_table (real mode + MSR_EE=0),
+    * so interrupts are already disabled here.
+*/
+   start_lockless_pgtbl_walk(kvm->mm);
	ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
-   if (!ptep)
+   if (!ptep) {
+   end_lockless_pgtbl_walk(kvm->mm);
return H_TOO_HARD;

[PATCH v4 07/11] powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl walks

2019-09-27 Thread Leonardo Bras
Applies the counting-based method for monitoring lockless pgtable walks on
kvmppc_e500_shadow_map().

Fixes the place where local_irq_restore() is called: previously, if ptep
was NULL, local_irq_restore() would never be called.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kvm/e500_mmu_host.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 321db0fdb9db..a0be76ad8d14 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -473,6 +473,7 @@ static inline int kvmppc_e500_shadow_map(struct 
kvmppc_vcpu_e500 *vcpu_e500,
 * We are holding kvm->mmu_lock so a notifier invalidate
 * can't run hence pfn won't change.
 */
+   start_lockless_pgtbl_walk(kvm->mm);
local_irq_save(flags);
ptep = find_linux_pte(pgdir, hva, NULL, NULL);
if (ptep) {
@@ -481,15 +482,18 @@ static inline int kvmppc_e500_shadow_map(struct 
kvmppc_vcpu_e500 *vcpu_e500,
if (pte_present(pte)) {
wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) &
MAS2_WIMGE_MASK;
-   local_irq_restore(flags);
} else {
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(kvm->mm);
pr_err_ratelimited("%s: pte not present: gfn %lx,pfn 
%lx\n",
   __func__, (long)gfn, pfn);
ret = -EINVAL;
goto out;
}
}
+   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(kvm->mm);
+
kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg);
 
kvmppc_e500_setup_stlbe(_e500->vcpu, gtlbe, tsize,
-- 
2.20.1



[PATCH v4 06/11] powerpc/mm/book3s64/hash: Applies counting method to monitor lockless pgtbl walks

2019-09-27 Thread Leonardo Bras
Applies the counting-based method for monitoring all hash-related functions
that do lockless pagetable walks.

hash_page_mm: Adds a comment explaining that there is no need for
local_irq_{disable,save} given that it is only called from the DataAccess
interrupt, so interrupts are already disabled.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/mm/book3s64/hash_tlb.c   |  2 ++
 arch/powerpc/mm/book3s64/hash_utils.c | 12 +++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/book3s64/hash_tlb.c 
b/arch/powerpc/mm/book3s64/hash_tlb.c
index 4a70d8dd39cd..5e5213c3f7c4 100644
--- a/arch/powerpc/mm/book3s64/hash_tlb.c
+++ b/arch/powerpc/mm/book3s64/hash_tlb.c
@@ -209,6 +209,7 @@ void __flush_hash_table_range(struct mm_struct *mm, 
unsigned long start,
 * to being hashed). This is not the most performance oriented
 * way to do things but is fine for our needs here.
 */
+   start_lockless_pgtbl_walk(mm);
local_irq_save(flags);
arch_enter_lazy_mmu_mode();
for (; start < end; start += PAGE_SIZE) {
@@ -230,6 +231,7 @@ void __flush_hash_table_range(struct mm_struct *mm, 
unsigned long start,
}
arch_leave_lazy_mmu_mode();
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm);
 }
 
 void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index b8ad14bb1170..8615fab87c43 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1321,7 +1321,11 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
ea &= ~((1ul << mmu_psize_defs[psize].shift) - 1);
 #endif /* CONFIG_PPC_64K_PAGES */
 
-   /* Get PTE and page size from page tables */
+   /* Get PTE and page size from page tables:
+    * Called from the DataAccess interrupt (data_access_common: 0x300),
+    * so interrupts are already disabled here.
+*/
+   start_lockless_pgtbl_walk(mm);
	ptep = find_linux_pte(pgdir, ea, &is_thp, &hugeshift);
if (ptep == NULL || !pte_present(*ptep)) {
DBG_LOW(" no PTE !\n");
@@ -1438,6 +1442,7 @@ int hash_page_mm(struct mm_struct *mm, unsigned long ea,
DBG_LOW(" -> rc=%d\n", rc);
 
 bail:
+   end_lockless_pgtbl_walk(mm);
exception_exit(prev_state);
return rc;
 }
@@ -1547,10 +1552,12 @@ void hash_preload(struct mm_struct *mm, unsigned long 
ea,
	vsid = get_user_vsid(&mm->context, ea, ssize);
if (!vsid)
return;
+
/*
 * Hash doesn't like irqs. Walking linux page table with irq disabled
 * saves us from holding multiple locks.
 */
+   start_lockless_pgtbl_walk(mm);
local_irq_save(flags);
 
/*
@@ -1597,6 +1604,7 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
   pte_val(*ptep));
 out_exit:
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm);
 }
 
 #ifdef CONFIG_PPC_MEM_KEYS
@@ -1613,11 +1621,13 @@ u16 get_mm_addr_key(struct mm_struct *mm, unsigned long 
address)
if (!mm || !mm->pgd)
return 0;
 
+   start_lockless_pgtbl_walk(mm);
local_irq_save(flags);
ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
if (ptep)
pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm);
 
return pkey;
 }
-- 
2.20.1



[PATCH v4 04/11] powerpc/mce_power: Applies counting method to monitor lockless pgtbl walks

2019-09-27 Thread Leonardo Bras
Applies the counting-based method for monitoring lockless pgtable walks on
addr_to_pfn().

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/kernel/mce_power.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index a814d2dfb5b0..0f2f87da4cd1 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -27,6 +27,7 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned long 
addr)
 {
pte_t *ptep;
unsigned long flags;
+   unsigned long pfn;
struct mm_struct *mm;
 
if (user_mode(regs))
@@ -34,15 +35,21 @@ unsigned long addr_to_pfn(struct pt_regs *regs, unsigned 
long addr)
else
mm = _mm;
 
+   start_lockless_pgtbl_walk(mm);
local_irq_save(flags);
if (mm == current->mm)
ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
else
ptep = find_init_mm_pte(addr, NULL);
-   local_irq_restore(flags);
+
if (!ptep || pte_special(*ptep))
-   return ULONG_MAX;
-   return pte_pfn(*ptep);
+   pfn = ULONG_MAX;
+   else
+   pfn = pte_pfn(*ptep);
+
+   local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm);
+   return pfn;
 }
 
 /* flush SLBs and reload */
-- 
2.20.1



[PATCH v4 05/11] powerpc/perf: Applies counting method to monitor lockless pgtbl walks

2019-09-27 Thread Leonardo Bras
Applies the counting-based method for monitoring lockless pgtable walks on
read_user_stack_slow.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/perf/callchain.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/callchain.c b/arch/powerpc/perf/callchain.c
index c84bbd4298a0..9d76194a2a8f 100644
--- a/arch/powerpc/perf/callchain.c
+++ b/arch/powerpc/perf/callchain.c
@@ -113,16 +113,18 @@ static int read_user_stack_slow(void __user *ptr, void 
*buf, int nb)
int ret = -EFAULT;
pgd_t *pgdir;
pte_t *ptep, pte;
+   struct mm_struct *mm = current->mm;
unsigned shift;
unsigned long addr = (unsigned long) ptr;
unsigned long offset;
unsigned long pfn, flags;
void *kaddr;
 
-   pgdir = current->mm->pgd;
+   pgdir = mm->pgd;
if (!pgdir)
return -EFAULT;
 
+   start_lockless_pgtbl_walk(mm);
local_irq_save(flags);
	ptep = find_current_mm_pte(pgdir, addr, NULL, &shift);
if (!ptep)
@@ -146,6 +148,7 @@ static int read_user_stack_slow(void __user *ptr, void 
*buf, int nb)
ret = 0;
 err_out:
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm);
return ret;
 }
 
-- 
2.20.1



[PATCH v4 03/11] mm/gup: Applies counting method to monitor gup_pgd_range

2019-09-27 Thread Leonardo Bras
As described, gup_pgd_range() is a lockless pagetable walk. So, in order to
guard against THP split/collapse with the counting method, it's necessary to
bound it with {start,end}_lockless_pgtbl_walk.

There are dummy functions, so it is not going to add any overhead on archs
that don't use this method.

Signed-off-by: Leonardo Bras 
---
 mm/gup.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/mm/gup.c b/mm/gup.c
index 98f13ab37bac..7105c829cf44 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2325,6 +2325,7 @@ static bool gup_fast_permitted(unsigned long start, 
unsigned long end)
 int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
  struct page **pages)
 {
+   struct mm_struct *mm;
unsigned long len, end;
unsigned long flags;
int nr = 0;
@@ -2352,9 +2353,12 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
 
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
+   mm = current->mm;
+   start_lockless_pgtbl_walk(mm);
local_irq_save(flags);
	gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_restore(flags);
+   end_lockless_pgtbl_walk(mm);
}
 
return nr;
@@ -2404,6 +2408,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
 {
unsigned long addr, len, end;
+   struct mm_struct *mm;
int nr = 0, ret = 0;
 
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
@@ -2421,9 +2426,12 @@ int get_user_pages_fast(unsigned long start, int 
nr_pages,
 
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
+   mm = current->mm;
+   start_lockless_pgtbl_walk(mm);
local_irq_disable();
	gup_pgd_range(addr, end, gup_flags, pages, &nr);
local_irq_enable();
+   end_lockless_pgtbl_walk(mm);
ret = nr;
}
 
-- 
2.20.1



[PATCH v4 02/11] asm-generic/pgtable: Adds dummy functions to monitor lockless pgtable walks

2019-09-27 Thread Leonardo Bras
There is a need to monitor lockless pagetable walks, in order to avoid
doing THP splitting/collapsing during them.

Some methods rely on local_irq_{save,restore}, but that can be slow in
cases where a lot of CPUs are used by the process.

In order to speed up these cases, I propose a refcount-based approach that
counts the number of lockless pagetable walks happening in the process.

Given that there are lockless pagetable walks in generic code, it's
necessary to create dummy functions for archs that won't use the approach.

Signed-off-by: Leonardo Bras 
---
 include/asm-generic/pgtable.h | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 75d9d68a6de7..0831475e72d3 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -1172,6 +1172,21 @@ static inline bool arch_has_pfn_modify_check(void)
 #endif
 #endif
 
+#ifndef __HAVE_ARCH_LOCKLESS_PGTBL_WALK_COUNTER
+static inline void start_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+}
+
+static inline void end_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+}
+
+static inline int running_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   return 0;
+}
+#endif
+
 /*
  * On some architectures it depends on the mm if the p4d/pud or pmd
  * layer of the page table hierarchy is folded or not.
-- 
2.20.1
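
An arch opts out of these dummies by defining
__HAVE_ARCH_LOCKLESS_PGTBL_WALK_COUNTER and providing real implementations,
as patch 10 does for book3s_64. A minimal sketch of the override:

	/* arch header, e.g. arch/powerpc/include/asm/book3s/64/pgtable.h */
	#define __HAVE_ARCH_LOCKLESS_PGTBL_WALK_COUNTER
	void start_lockless_pgtbl_walk(struct mm_struct *mm);
	void end_lockless_pgtbl_walk(struct mm_struct *mm);
	int running_lockless_pgtbl_walk(struct mm_struct *mm);

Without the define, the generic no-op inlines are used, so the calls added
to generic code (e.g. mm/gup.c in patch 03) cost nothing on other archs.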



[PATCH v4 00/11] Introduces new count-based method for monitoring lockless pagetable walks

2019-09-27 Thread Leonardo Bras
If a process (qemu) with a lot of CPUs (128) tries to munmap() a large
chunk of memory (496GB) mapped with THP, it takes an average of 275
seconds, which can cause a lot of problems for the workload (in the qemu
case, the guest will lock up for this time).

Trying to find the source of this bug, I found out most of this time is
spent on serialize_against_pte_lookup(). This function will take a lot
of time in smp_call_function_many() if there are more than a couple of
CPUs running the user process. Since it has to happen for every mapped
THP, it will take a very long time for large amounts of memory.

By the docs, serialize_against_pte_lookup() is needed in order to prevent
the pmd_t to pte_t casting inside find_current_mm_pte(), or any other
lockless pagetable walk, from happening concurrently with THP
splitting/collapsing.

It does so by calling do_nothing() on each CPU in mm->cpu_bitmap[],
which only runs there after interrupts are re-enabled.
Since interrupts are (usually) disabled during a lockless pagetable
walk, and serialize_against_pte_lookup() will only return after
interrupts are enabled, the walk is protected.

So, by what I could understand, if there is no lockless pagetable walk
running, there is no need to call serialize_against_pte_lookup().

So, to avoid the cost of running serialize_against_pte_lookup(), I
propose a counter that keeps track of how many find_current_mm_pte()
calls are currently running, and if there are none, just skips
smp_call_function_many().

The related functions are:
start_lockless_pgtbl_walk(mm)
Insert before starting any lockless pgtable walk
end_lockless_pgtbl_walk(mm)
Insert after the end of any lockless pgtable walk
(Mostly after the ptep is last used)
running_lockless_pgtbl_walk(mm)
Returns the number of lockless pgtable walks running
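
A minimal sketch of how the three calls wrap a typical walker, condensed
from patches 03-09:

	start_lockless_pgtbl_walk(mm);
	local_irq_save(flags);
	ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
	if (ptep)
		pte = READ_ONCE(*ptep);	/* last use of ptep */
	local_irq_restore(flags);
	end_lockless_pgtbl_walk(mm);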

On my workload (qemu), I could see munmap's time reduction from 275
seconds to 418ms.

Also, I documented some lockless pagetable walks for which tracking is not
necessary, given that they work on init_mm or a guest pgd.

Changes since v3:
 Adds memory barrier to {start,end}_lockless_pgtbl_walk()
 Explain (comments) why some lockless pgtbl walks don't need
local_irq_disable (real mode + MSR_EE=0)
 Explain (comments) places where counting method is not needed (guest pgd,
which is not touched by THP)
 Fixes some misplaced local_irq_restore()
 Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=132417

Changes since v2:
 Rebased to v5.3
 Adds support on __get_user_pages_fast
 Adds usage description to *_lockless_pgtbl_walk()
 Better style to dummy functions
 Link: http://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=131839

Changes since v1:
 Isolated atomic operations in functions *_lockless_pgtbl_walk()
 Fixed behavior of decrementing before last ptep was used
 Link: http://patchwork.ozlabs.org/patch/1163093/


Leonardo Bras (11):
  powerpc/mm: Adds counting method to monitor lockless pgtable walks
  asm-generic/pgtable: Adds dummy functions to monitor lockless pgtable
walks
  mm/gup: Applies counting method to monitor gup_pgd_range
  powerpc/mce_power: Applies counting method to monitor lockless pgtbl
walks
  powerpc/perf: Applies counting method to monitor lockless pgtbl walks
  powerpc/mm/book3s64/hash: Applies counting method to monitor lockless
pgtbl walks
  powerpc/kvm/e500: Applies counting method to monitor lockless pgtbl
walks
  powerpc/kvm/book3s_hv: Applies counting method to monitor lockless
pgtbl walks
  powerpc/kvm/book3s_64: Applies counting method to monitor lockless
pgtbl walks
  powerpc/book3s_64: Enables counting method to monitor lockless pgtbl
walk
  powerpc/mm/book3s64/pgtable: Uses counting method to skip serializing

 arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
 arch/powerpc/include/asm/book3s/64/pgtable.h |  5 ++
 arch/powerpc/kernel/mce_power.c  | 13 --
 arch/powerpc/kvm/book3s_64_mmu_hv.c  |  2 +
 arch/powerpc/kvm/book3s_64_mmu_radix.c   | 30 
 arch/powerpc/kvm/book3s_64_vio_hv.c  |  3 ++
 arch/powerpc/kvm/book3s_hv_nested.c  | 22 -
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  | 18 ++--
 arch/powerpc/kvm/e500_mmu_host.c |  6 ++-
 arch/powerpc/mm/book3s64/hash_tlb.c  |  2 +
 arch/powerpc/mm/book3s64/hash_utils.c| 12 -
 arch/powerpc/mm/book3s64/mmu_context.c   |  1 +
 arch/powerpc/mm/book3s64/pgtable.c   | 48 +++-
 arch/powerpc/perf/callchain.c|  5 +-
 include/asm-generic/pgtable.h| 15 ++
 mm/gup.c |  8 
 16 files changed, 180 insertions(+), 13 deletions(-)

-- 
2.20.1



[PATCH v4 01/11] powerpc/mm: Adds counting method to monitor lockless pgtable walks

2019-09-27 Thread Leonardo Bras
It's necessary to monitor lockless pagetable walks, in order to avoid doing
THP splitting/collapsing during them.

Some methods rely on local_irq_{save,restore}, but that can be slow in
cases where a lot of CPUs are used by the process.

In order to speed up some cases, I propose a refcount-based approach that
counts the number of lockless pagetable walks happening in the process.

This method does not exclude the current irq-oriented method. It works as a
complement to skip unnecessary waiting.

start_lockless_pgtbl_walk(mm)
Insert before starting any lockless pgtable walk
end_lockless_pgtbl_walk(mm)
Insert after the end of any lockless pgtable walk
(Mostly after the ptep is last used)
running_lockless_pgtbl_walk(mm)
Returns the number of lockless pgtable walks running

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/book3s/64/mmu.h |  3 ++
 arch/powerpc/mm/book3s64/mmu_context.c   |  1 +
 arch/powerpc/mm/book3s64/pgtable.c   | 45 
 3 files changed, 49 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 23b83d3593e2..13b006e7dde4 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -116,6 +116,9 @@ typedef struct {
/* Number of users of the external (Nest) MMU */
atomic_t copros;
 
+   /* Number of running instances of lockless pagetable walk */
+   atomic_t lockless_pgtbl_walk_count;
+
struct hash_mm_context *hash_context;
 
unsigned long vdso_base;
diff --git a/arch/powerpc/mm/book3s64/mmu_context.c 
b/arch/powerpc/mm/book3s64/mmu_context.c
index 2d0cb5ba9a47..3dd01c0ca5be 100644
--- a/arch/powerpc/mm/book3s64/mmu_context.c
+++ b/arch/powerpc/mm/book3s64/mmu_context.c
@@ -200,6 +200,7 @@ int init_new_context(struct task_struct *tsk, struct 
mm_struct *mm)
 #endif
	atomic_set(&mm->context.active_cpus, 0);
	atomic_set(&mm->context.copros, 0);
+   atomic_set(&mm->context.lockless_pgtbl_walk_count, 0);
 
return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 7d0e0d0d22c4..6ba6195bff1b 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -98,6 +98,51 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
 }
 
+/*
+ * Counting method to monitor lockless pagetable walks:
+ * Uses start_lockless_pgtbl_walk and end_lockless_pgtbl_walk to track the
+ * number of lockless pgtable walks happening, and
+ * running_lockless_pgtbl_walk to return this value.
+ */
+
+/* start_lockless_pgtbl_walk: Must be inserted before a function call that does
+ *   lockless pagetable walks, such as __find_linux_pte()
+ */
+void start_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   atomic_inc(&mm->context.lockless_pgtbl_walk_count);
+   /* Avoid reordering, to guarantee that the increment happens before
+    * any part of the lockless pagetable walk that follows it.
+    */
+   smp_mb();
+}
+EXPORT_SYMBOL(start_lockless_pgtbl_walk);
+
+/*
+ * end_lockless_pgtbl_walk: Must be inserted after the last use of a pointer
+ *   returned by a lockless pagetable walk, such as __find_linux_pte()
+ */
+void end_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   /* Avoid reordering, to guarantee that the decrement only happens after
+    * the last use of the ptep returned by the lockless pagetable walk.
+    */
+   smp_mb();
+   atomic_dec(&mm->context.lockless_pgtbl_walk_count);
+}
+EXPORT_SYMBOL(end_lockless_pgtbl_walk);
+
+/*
+ * running_lockless_pgtbl_walk: Returns the number of lockless pagetable walks
+ *   currently running. If it returns 0, there is no running pagetable walk, and
+ *   THP split/collapse can be safely done. This can be used to avoid more
+ *   expensive approaches like serialize_against_pte_lookup()
+ */
+int running_lockless_pgtbl_walk(struct mm_struct *mm)
+{
+   return atomic_read(&mm->context.lockless_pgtbl_walk_count);
+}
+
 /*
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
-- 
2.20.1



Re: [PATCH v3 00/11] Introduces new count-based method for monitoring lockless pagetable walks

2019-09-27 Thread Leonardo Bras
On Fri, 2019-09-27 at 11:46 -0300, Leonardo Bras wrote:
> I am not sure if it would be ok to use irq_{save,restore} in real mode,
> I will do some more reading of the docs before addressing this. 

It looks like it's unsafe to merge irq_{save,restore} in
{start,end}_lockless_pgtbl_walk(), due to a possible access of code
that is not accessible in real mode.

I am sending a v4 for the changes so far.
I look forward to your feedback.




Re: [PATCH] powerpc: use <asm-generic/dma-mapping.h>

2019-09-27 Thread Christoph Hellwig
ping?

On Wed, Aug 07, 2019 at 06:07:52PM +0300, Christoph Hellwig wrote:
> The powerpc version of dma-mapping.h only contains a version of
> get_arch_dma_ops that always return NULL.  Replace it with the
> asm-generic version that does the same.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/powerpc/include/asm/Kbuild|  1 +
>  arch/powerpc/include/asm/dma-mapping.h | 18 --
>  2 files changed, 1 insertion(+), 18 deletions(-)
>  delete mode 100644 arch/powerpc/include/asm/dma-mapping.h
> 
> diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild
> index 9a1d2fc6ceb7..15bb09ad5dc2 100644
> --- a/arch/powerpc/include/asm/Kbuild
> +++ b/arch/powerpc/include/asm/Kbuild
> @@ -4,6 +4,7 @@ generated-y += syscall_table_64.h
>  generated-y += syscall_table_c32.h
>  generated-y += syscall_table_spu.h
>  generic-y += div64.h
> +generic-y += dma-mapping.h
>  generic-y += export.h
>  generic-y += irq_regs.h
>  generic-y += local64.h
> diff --git a/arch/powerpc/include/asm/dma-mapping.h 
> b/arch/powerpc/include/asm/dma-mapping.h
> deleted file mode 100644
> index 565d6f74b189..
> --- a/arch/powerpc/include/asm/dma-mapping.h
> +++ /dev/null
> @@ -1,18 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0 */
> -/*
> - * Copyright (C) 2004 IBM
> - */
> -#ifndef _ASM_DMA_MAPPING_H
> -#define _ASM_DMA_MAPPING_H
> -
> -static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type 
> *bus)
> -{
> - /* We don't handle the NULL dev case for ISA for now. We could
> -  * do it via an out of line call but it is not needed for now. The
> -  * only ISA DMA device we support is the floppy and we have a hack
> -  * in the floppy driver directly to get a device for us.
> -  */
> - return NULL;
> -}
> -
> -#endif   /* _ASM_DMA_MAPPING_H */
> -- 
> 2.20.1
---end quoted text---


Re: [PATCH v5 03/23] PCI: hotplug: Add a flag for the movable BARs feature

2019-09-27 Thread Bjorn Helgaas
On Fri, Aug 16, 2019 at 07:50:41PM +0300, Sergey Miroshnichenko wrote:
> When hot-adding a device, the bridge may have windows not big enough (or
> fragmented too much) for newly requested BARs to fit in. And expanding
> these bridge windows may be impossible because blocked by "neighboring"
> BARs and bridge windows.
> 
> Still, it may be possible to allocate a memory region for new BARs with the
> following procedure:
> 
> 1) notify all the drivers which support movable BARs to pause and release
>the BARs; the rest of the drivers are guaranteed that their devices will
>not get BARs moved;
> 
> 2) release all the bridge windows except of root bridges;
> 
> 3) try to recalculate new bridge windows that will fit all the BAR types:
>- fixed;
>- immovable;
>- movable;
>- newly requested by hot-added devices;
> 
> 4) if the previous step fails, disable BARs for one of the hot-added
>devices and retry from step 3;
> 
> 5) notify the drivers, so they remap BARs and resume.

You don't do the actual recalculation in *this* patch, but since you
mention the procedure here, are we confident that we never make things
worse?

It's possible that a hot-add will trigger this attempt to move things
around, and it's possible that we won't find space for the new device
even if we move things around.  But are we certain that every device
that worked *before* the hot-add will still work *afterwards*?

Much of the assignment was probably done by the BIOS using different
algorithms than Linux has, so I think there's some chance that the
BIOS did a better job and if we lose that BIOS assignment, we might
not be able to recreate it.

> This makes the prior reservation of memory by BIOS/bootloader/firmware not
> required anymore for the PCI hotplug.
> 
> Drivers indicate their support of movable BARs by implementing the new
> .rescan_prepare() and .rescan_done() hooks in the struct pci_driver. All
> device's activity must be paused during a rescan, and iounmap()+ioremap()
> must be applied to every used BAR.
> 
> The platform also may need to prepare for BAR movement, so new hooks are
> added: pcibios_rescan_prepare(pci_dev) and pcibios_rescan_done(pci_dev).
> 
> This patch is a preparation for future patches with actual implementation,
> and for now it just does the following:
>  - declares the feature;
>  - defines pci_movable_bars_enabled(), pci_dev_movable_bars_supported(dev);
>  - invokes the .rescan_prepare() and .rescan_done() driver notifiers;
>  - declares and invokes the pcibios_rescan_prepare()/_done() hooks;
>  - adds the PCI_IMMOVABLE_BARS flag.
> 
> The feature is disabled by default (via PCI_IMMOVABLE_BARS) until the final
> patch of the series. It can be overridden per-arch using this flag or by
> the following command line option:
> 
> pcie_movable_bars={ off | force }
> 
> CC: Sam Bobroff 
> CC: Rajat Jain 
> CC: Lukas Wunner 
> CC: Oliver O'Halloran 
> CC: David Laight 
> Signed-off-by: Sergey Miroshnichenko 
> ---
>  .../admin-guide/kernel-parameters.txt |  7 ++
>  drivers/pci/pci-driver.c  |  2 +
>  drivers/pci/pci.c | 24 ++
>  drivers/pci/pci.h |  2 +
>  drivers/pci/probe.c   | 86 ++-
>  include/linux/pci.h   |  7 ++
>  6 files changed, 126 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 47d981a86e2f..e2274ee87a35 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -3526,6 +3526,13 @@
>   nomsi   Do not use MSI for native PCIe PME signaling (this makes
>   all PCIe root ports use INTx for all services).
>  
> + pcie_movable_bars=[PCIE]

This isn't a PCIe-specific feature, it's just a function of whether
drivers are smart enough, so we shouldn't tie it specifically to PCIe.
We could eventually do this for conventional PCI as well.

> + Override the movable BARs support detection:
> + off
> + Disable even if supported by the platform
> + force
> + Enable even if not explicitly declared as supported

What's the need for "force"?  If it's possible, I think we should
enable this functionality all the time and just have a disable switch
in case we trip over cases where it doesn't work, e.g., something
like:

  pci=no_movable_bars

>   pcmv=   [HW,PCMCIA] BadgePAD 4
>  
>   pd_ignore_unused
> diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
> index a8124e47bf6e..d11909e79263 100644
> --- a/drivers/pci/pci-driver.c
> +++ b/drivers/pci/pci-driver.c
> @@ -1688,6 +1688,8 @@ static int __init pci_driver_init(void)
>  {
>   int ret;
>  
> + pci_add_flags(PCI_IMMOVABLE_BARS);
> +
>   ret = 
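
For illustration, a driver opting in would implement the two hooks roughly
as sketched below. The hook signatures and the mydrv_* helpers are
assumptions here, not quoted from the series:

	/* sketch only: mydrv_* names hypothetical */
	static void mydrv_rescan_prepare(struct pci_dev *pdev)
	{
		struct mydrv *priv = pci_get_drvdata(pdev);

		mydrv_pause_io(priv);		/* quiesce DMA and MMIO */
		iounmap(priv->regs);		/* release the mapped BAR */
	}

	static void mydrv_rescan_done(struct pci_dev *pdev)
	{
		struct mydrv *priv = pci_get_drvdata(pdev);

		priv->regs = pci_iomap(pdev, 0, 0);	/* BAR may have moved */
		mydrv_resume_io(priv);
	}

	static struct pci_driver mydrv_driver = {
		/* ... probe/remove ... */
		.rescan_prepare	= mydrv_rescan_prepare,
		.rescan_done	= mydrv_rescan_done,
	};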

Re: [PATCH v5 02/23] PCI: Enable bridge's I/O and MEM access for hotplugged devices

2019-09-27 Thread Bjorn Helgaas
On Fri, Aug 16, 2019 at 07:50:40PM +0300, Sergey Miroshnichenko wrote:
> The PCI_COMMAND_IO and PCI_COMMAND_MEMORY bits of the bridge must be
> updated not only when enabling the bridge for the first time, but also if a
> hotplugged device requests these types of resources.

Yeah, this assumption that pci_is_enabled() means PCI_COMMAND_IO and
PCI_COMMAND_MEMORY are set correctly even though we may now need
*different* settings than when we incremented pdev->enable_cnt is
quite broken.

> Originally these bits were set by the pci_enable_device_flags() only, which
> exits early if the bridge is already pci_is_enabled(). So if the bridge was
> empty initially (an edge case), then hotplugged devices fail to get IO/MEM
> access enabled.
> 
> Signed-off-by: Sergey Miroshnichenko 
> ---
>  drivers/pci/pci.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index e7f8c354e644..61d951766087 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -1652,6 +1652,14 @@ static void pci_enable_bridge(struct pci_dev *dev)
>   pci_enable_bridge(bridge);
>  
>   if (pci_is_enabled(dev)) {
> + int i, bars = 0;
> +
> + for (i = PCI_BRIDGE_RESOURCES; i < DEVICE_COUNT_RESOURCE; i++) {
> + if (dev->resource[i].flags & (IORESOURCE_MEM | 
> IORESOURCE_IO))
> + bars |= (1 << i);
> + }
> + do_pci_enable_device(dev, bars);
> +
>   if (!dev->is_busmaster)
>   pci_set_master(dev);
>   mutex_unlock(&dev->enable_mutex);
> -- 
> 2.21.0
> 


Re: [PATCH v5 01/23] PCI: Fix race condition in pci_enable/disable_device()

2019-09-27 Thread Bjorn Helgaas
On Fri, Aug 16, 2019 at 07:50:39PM +0300, Sergey Miroshnichenko wrote:
> This is a yet another approach to fix an old [1-2] concurrency issue, when:
>  - two or more devices are being hot-added into a bridge which was
>initially empty;
>  - a bridge with two or more devices is being hot-added;
>  - during boot, if BIOS/bootloader/firmware doesn't pre-enable bridges.
> 
> The problem is that a bridge is reported as enabled before the MEM/IO bits
> are actually written to the PCI_COMMAND register, so another driver thread
> starts memory requests through the not-yet-enabled bridge:
> 
>  CPU0CPU1
> 
>  pci_enable_device_mem() pci_enable_device_mem()
>pci_enable_bridge() pci_enable_bridge()
>  pci_is_enabled()
>return false;
>  atomic_inc_return(enable_cnt)
>  Start actual enabling the bridge
>  ... pci_is_enabled()
>  ...   return true;
>  ... Start memory requests <-- FAIL
>  ...
>  Set the PCI_COMMAND_MEMORY bit <-- Must wait for this
> 
> Protect the pci_enable/disable_device() and pci_enable_bridge(), which is
> similar to the previous solution from commit 40f11adc7cd9 ("PCI: Avoid race
> while enabling upstream bridges"), but adding a per-device mutex and
> preventing the dev->enable_cnt from incrementing early.
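
A condensed sketch of the resulting flow (using the per-device enable_mutex
the series adds; error handling omitted):

	mutex_lock(&dev->enable_mutex);
	if (!pci_is_enabled(dev))
		do_pci_enable_device(dev, bars);	/* COMMAND bits first */
	atomic_inc(&dev->enable_cnt);	/* only now visible as enabled */
	mutex_unlock(&dev->enable_mutex);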

This isn't directly related to the movable BARs functionality; is it
here because you see the problem more frequently when moving BARs?


Re: [PATCH] powerpc/prom_init: Undo relocation before entering secure mode

2019-09-27 Thread Thiago Jung Bauermann


Thiago Jung Bauermann  writes:

> Thiago Jung Bauermann  writes:
>
>> The ultravisor will do an integrity check of the kernel image but we
>> relocated it so the check will fail. Restore the original image by
>> relocating it back to the kernel virtual base address.
>>
>> This works because during build vmlinux is linked with an expected virtual
>> runtime address of KERNELBASE.
>>
>> Fixes: 6a9c930bd775 ("powerpc/prom_init: Add the ESM call to prom_init")
>> Signed-off-by: Thiago Jung Bauermann 
>
> I meant to put a Suggested-by: Paul Mackerras 
>
> Sorry. Will add it if there's a v2.

Ping?

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


Re: [RFC] powerpc/pseries/hcall: remove the save/restore of CR

2019-09-27 Thread Paul Clarke
On 9/26/19 10:48 PM, Lijun Pan wrote:
> According to the PAPR, hcalls should not modify the Condition
> Register fields, hence saving/restoring the CR is not necessary.

Just curious: could you provide a more specific reference?

PC


Re: [PATCH v2 1/1] powerpc/pci: Fix pcibios_setup_device() ordering

2019-09-27 Thread Shawn Anastasio

On 9/9/19 2:59 AM, Alexey Kardashevskiy wrote:



On 06/09/2019 05:13, Shawn Anastasio wrote:

Move PCI device setup from pcibios_add_device() and pcibios_fixup_bus() to
pcibios_bus_add_device(). This ensures that platform-specific DMA and IOMMU
setup occurs after the device has been registered in sysfs, which is a
requirement for IOMMU group assignment to work.

This fixes IOMMU group assignment for hotplugged devices on pseries, where
the existing behavior results in IOMMU assignment before registration.



Although this is a correct approach which we should proceed with, this
breaks adding of SRIOV VFs from pnv_tce_iommu_bus_notifier (and possibly
the bare metal PCI hotplug), I am trying to fix that now...


Were you able to make any progress? I can think of a couple of ways
to fix SRIOV, but they're not particularly elegant and involve
duplication.


Re: [v3,3/3] Documentation: dt: binding: fsl: Add 'fsl,ippdexpcr-alt-addr' property

2019-09-27 Thread Rob Herring
On Wed, Sep 25, 2019 at 03:34:47AM +, Leo Li wrote:
> 
> 
> > -Original Message-
> > From: Biwen Li
> > Sent: Tuesday, September 24, 2019 10:30 PM
> > To: Leo Li ; shawn...@kernel.org;
> > robh...@kernel.org; mark.rutl...@arm.com; Ran Wang
> > 
> > Cc: linuxppc-dev@lists.ozlabs.org; linux-arm-ker...@lists.infradead.org;
> > linux-ker...@vger.kernel.org; devicet...@vger.kernel.org
> > Subject: RE: [v3,3/3] Documentation: dt: binding: fsl: Add 
> > 'fsl,ippdexpcr-alt-
> > addr' property
> > 
> > > > > >
> > > > > > The 'fsl,ippdexpcr-alt-addr' property is used to handle an
> > > > > > errata
> > > > > > A-008646 on LS1021A
> > > > > >
> > > > > > Signed-off-by: Biwen Li 
> > > > > > ---
> > > > > > Change in v3:
> > > > > > - rename property name
> > > > > >   fsl,rcpm-scfg -> fsl,ippdexpcr-alt-addr
> > > > > >
> > > > > > Change in v2:
> > > > > > - update desc of the property 'fsl,rcpm-scfg'
> > > > > >
> > > > > >  Documentation/devicetree/bindings/soc/fsl/rcpm.txt | 14
> > > > > > ++
> > > > > >  1 file changed, 14 insertions(+)
> > > > > >
> > > > > > diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> > > > > > b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> > > > > > index 5a33619d881d..157dcf6da17c 100644
> > > > > > --- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> > > > > > +++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> > > > > > @@ -34,6 +34,11 @@ Chassis Version  Example Chips
> > > > > >  Optional properties:
> > > > > >   - little-endian : RCPM register block is Little Endian. Without 
> > > > > > it RCPM
> > > > > > will be Big Endian (default case).
> > > > > > + - fsl,ippdexpcr-alt-addr : Must add the property for SoC
> > > > > > + LS1021A,
> > > > >
> > > > > You probably should mention this is related to a hardware issue on
> > > > > LS1021a and only needed on LS1021a.
> > > > Okay, got it, thanks, I will add this in v4.
> > > > >
> > > > > > +   Must include n + 1 entries (n = #fsl,rcpm-wakeup-cells, such as:
> > > > > > +   #fsl,rcpm-wakeup-cells equal to 2, then must include 2 + 1 
> > > > > > entries).
> > > > >
> > > > > #fsl,rcpm-wakeup-cells is the number of IPPDEXPCR registers on an SoC.
> > > > > However you are defining an offset to scfg registers here.  Why
> > > > > these two are related?  The length here should actually be related
> > > > > to the #address-cells of the soc/.  But since this is only needed
> > > > > for LS1021, you can
> > > > just make it 3.
> > > > I need to set the value of the IPPDEXPCR registers from the ftm_alarm0
> > > > device node (fsl,rcpm-wakeup = <&rcpm 0x0 0x2000>;
> > > > 0x0 is a value for IPPDEXPCR0, 0x2000 is a value for IPPDEXPCR1).
> > > > But because of the hardware issue on LS1021A, I need to store the value
> > > > of the IPPDEXPCR registers to an alt address. So I define an offset to
> > > > the scfg registers, then the RCPM driver gets an absolute address from
> > > > the offset, and writes the value of the IPPDEXPCR registers to these
> > > > absolute addresses (backing up the value of the IPPDEXPCR registers).
> > >
> > > I understand what you are trying to do.  The problem is that the new
> > > fsl,ippdexpcr-alt-addr property contains a phandle and an offset.  The
> > > size of it shouldn't be related to #fsl,rcpm-wakeup-cells.
> > Maybe like this: fsl,ippdexpcr-alt-addr = <&scfg 0x51c>; /*
> > SCFG_SPARECR8 */
> 
> No.  The #address-cell for the soc/ is 2, so the offset to scfg 
> should be 0x0 0x51c.  The total size should be 3, but it shouldn't be 
> coming from #fsl,rcpm-wakeup-cells like you mentioned in the binding.

Um, no. #address-cells applies to reg and ranges, not some vendor 
specific property. You can just define how many cells you need if that's 
fixed. 

However, I suggested doing this another way in the next version of this.

Rob


Re: [PATCH] powerpc/vio: use simple dummy struct device as bus parent

2019-09-27 Thread Greg Kroah-Hartman
On Fri, Sep 27, 2019 at 09:04:02AM -0400, Dan Streetman wrote:
> The dummy vio_bus_device creates the /sys/devices/vio directory, which
> contains real vio devices under it; since it represents itself as having
> a bus = &vio_bus_type, its /sys/devices/vio/uevent does call the bus's
> .uevent function, vio_hotplug(), and as that function won't find a real
> device for the dummy vio_dev, it will return -ENODEV.
> 
> One of the main users of the uevent node is udevadm, e.g. when it is called
> with 'udevadm trigger --devices'.  Up until recently, it would ignore any
> errors returned when writing to devices' uevent file, but it was recently
> changed to start returning error if it gets an error writing to any uevent
> file:
> https://github.com/systemd/systemd/commit/97afc0351a96e0daa83964df33937967c75c644f
> 
> since the /sys/devices/vio/uevent file has always returned ENODEV from
> any write to it, this now causes the udevadm trigger command to return
> an error.  This may be fixed in udevadm to ignore ENODEV errors, but the
> vio driver should still be fixed.
> 
> This patch changes the arch/powerpc/platform/pseries/vio.c 'dummy'
> parent device into a real dummy device with no .bus, so its uevent
> file will stop returning ENODEV and simply do nothing and return 0.
> 
> Signed-off-by: Dan Streetman 
> ---
>  arch/powerpc/platforms/pseries/vio.c | 11 ---
>  1 file changed, 4 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/vio.c 
> b/arch/powerpc/platforms/pseries/vio.c
> index 79e2287991db..63bc16631680 100644
> --- a/arch/powerpc/platforms/pseries/vio.c
> +++ b/arch/powerpc/platforms/pseries/vio.c
> @@ -32,11 +32,8 @@
>  #include 
>  #include 
>  
> -static struct vio_dev vio_bus_device  = { /* fake "parent" device */
> - .name = "vio",
> - .type = "",
> - .dev.init_name = "vio",
> - .dev.bus = &vio_bus_type,
> +static struct device vio_bus = {
> + .init_name  = "vio",

Eeek, no!  Why are you creating a static device that will then be
reference counted?  Not nice :(

What's wrong with a simple call to device_create() for your "fake"
device you want to make here?  That's what it is there for :)

thanks,

greg k-h
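
For reference, a minimal sketch of the device_create() route Greg points at
(hypothetical: vio has no class today, so one would need to be created, and
the device would then live under /sys/devices/virtual/):

	static struct class *vio_class;
	static struct device *vio_parent;

	static int __init vio_bus_init(void)
	{
		vio_class = class_create(THIS_MODULE, "vio");
		if (IS_ERR(vio_class))
			return PTR_ERR(vio_class);

		/* dynamically allocated and refcounted, unlike a static struct */
		vio_parent = device_create(vio_class, NULL, MKDEV(0, 0),
					   NULL, "vio");
		return PTR_ERR_OR_ZERO(vio_parent);
	}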


Re: [v4,3/3] Documentation: dt: binding: fsl: Add 'fsl,ippdexpcr1-alt-addr' property

2019-09-27 Thread Rob Herring
On Thu, Sep 26, 2019 at 10:41:18AM +0800, Biwen Li wrote:
> The 'fsl,ippdexpcr1-alt-addr' property is used to handle an errata A-008646
> on LS1021A
> 
> Signed-off-by: Biwen Li 
> ---
> Change in v4:
>   - rename property name
> fsl,ippdexpcr-alt-addr -> fsl,ippdexpcr1-alt-addr
> 
> Change in v3:
>   - rename property name
> fsl,rcpm-scfg -> fsl,ippdexpcr-alt-addr
> 
> Change in v2:
>   - update desc of the property 'fsl,rcpm-scfg'
> 
>  .../devicetree/bindings/soc/fsl/rcpm.txt  | 21 +++
>  1 file changed, 21 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt 
> b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> index 5a33619d881d..751a7655b694 100644
> --- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> +++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt
> @@ -34,6 +34,13 @@ Chassis VersionExample Chips
>  Optional properties:
>   - little-endian : RCPM register block is Little Endian. Without it RCPM
> will be Big Endian (default case).
> + - fsl,ippdexpcr1-alt-addr : The property is related to a hardware issue
> +   on SoC LS1021A and only needed on SoC LS1021A.
> +   Must include 1 + 2 entries.
> +   The first entry must be a link to the SCFG device node.
> +   The remaining entries must be offsets of registers in SCFG.
> +   The second and third entries compose an alt offset address
> +   for IPPDEXPCR1 (SCFG_SPARECR8).

If only on 1 SoC, can't all this be implied by "fsl,ls1021a-rcpm"?

Adding a property means you need both a new dtb and kernel to fix the 
errata. Using the compatible string means you only need a new kernel.
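
In driver terms, the errata handling would then be keyed off the compatible
rather than a new property; a sketch (the rcpm private structure and flag
name are assumed here):

	/* imply the A-008646 workaround from the SoC-specific compatible */
	if (of_device_is_compatible(np, "fsl,ls1021a-rcpm"))
		rcpm->ippdexpcr1_alt = true;	/* shadow to SCFG_SPARECR8 */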

>  
>  Example:
>  The RCPM node for T4240:
> @@ -43,6 +50,20 @@ The RCPM node for T4240:
>   #fsl,rcpm-wakeup-cells = <2>;
>   };
>  
> +The RCPM node for LS1021A:
> + rcpm: rcpm@1ee2140 {
> + compatible = "fsl,ls1021a-rcpm", "fsl,qoriq-rcpm-2.1+";

Both of these compatible strings aren't documented.

> + reg = <0x0 0x1ee2140 0x0 0x8>;
> + #fsl,rcpm-wakeup-cells = <2>;
> +
> + /*
> +  * The second and third entry compose an alt offset
> +  * address for IPPDEXPCR1(SCFG_SPARECR8)
> +  */
> + fsl,ippdexpcr1-alt-addr = <&scfg 0x0 0x51c>;
> + };
> +
> +
>  * Freescale RCPM Wakeup Source Device Tree Bindings
>  ---
>  Required fsl,rcpm-wakeup property should be added to a device node if the 
> device
> -- 
> 2.17.1
> 


Re: [PATCH v4 05/11] dt-bindings: pci: layerscape-pci: Add compatible strings for ls1088a and ls2088a

2019-09-27 Thread Rob Herring
On Tue, 24 Sep 2019 10:18:43 +0800, Xiaowei Bao wrote:
> Add compatible strings for ls1088a and ls2088a.
> 
> Signed-off-by: Xiaowei Bao 
> ---
> v2:
>  - No change.
> v3:
>  - Use one valid combination of compatible strings.
> v4:
>  - Add the comma between the two compatible.
> 
>  Documentation/devicetree/bindings/pci/layerscape-pci.txt | 2 ++
>  1 file changed, 2 insertions(+)
> 

Acked-by: Rob Herring 


Re: [PATCH 03/11] powerpc/powernv/ioda: set up PE on opencapi device when enabling

2019-09-27 Thread Andrew Donnellan

On 9/9/19 5:45 pm, Frederic Barrat wrote:

The PE for an opencapi device was set as part of a late PHB fixup
operation, when creating the PHB. To use the PCI hotplug framework,
this is not going to work, as the PHB stays the same, it's only the
devices underneath which are updated. For regular PCI devices, it is
done as part of the reconfiguration of the bridge, but for opencapi
PHBs, we don't have an intermediate bridge. So let's define the PE
when the device is enabled. PEs are meaningless for opencapi, the NPU
doesn't define them and opal is not doing anything with them.

Signed-off-by: Frederic Barrat 


Reviewed-by: Andrew Donnellan 



---
  arch/powerpc/platforms/powernv/pci-ioda.c | 31 +--
  1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 3dbbf5365c1c..06ce7ddaa0cf 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1260,8 +1260,6 @@ static void pnv_pci_ioda_setup_PEs(void)
  {
struct pci_controller *hose;
struct pnv_phb *phb;
-   struct pci_bus *bus;
-   struct pci_dev *pdev;
struct pnv_ioda_pe *pe;
  
	list_for_each_entry(hose, &hose_list, list_node) {

@@ -1273,11 +1271,6 @@ static void pnv_pci_ioda_setup_PEs(void)
if (phb->model == PNV_PHB_MODEL_NPU2)
WARN_ON_ONCE(pnv_npu2_init(hose));
}
-   if (phb->type == PNV_PHB_NPU_OCAPI) {
-   bus = hose->bus;
-   list_for_each_entry(pdev, &bus->devices, bus_list)
-   pnv_ioda_setup_dev_PE(pdev);
-   }
}
	list_for_each_entry(hose, &hose_list, list_node) {
phb = hose->private_data;
@@ -3385,6 +3378,28 @@ static bool pnv_pci_enable_device_hook(struct pci_dev 
*dev)
return true;
  }
  
+static bool pnv_ocapi_enable_device_hook(struct pci_dev *dev)
+{
+   struct pci_controller *hose = pci_bus_to_host(dev->bus);
+   struct pnv_phb *phb = hose->private_data;
+   struct pci_dn *pdn;
+   struct pnv_ioda_pe *pe;
+
+   if (!phb->initialized)
+   return true;
+
+   pdn = pci_get_pdn(dev);
+   if (!pdn)
+   return false;
+
+   if (pdn->pe_number == IODA_INVALID_PE) {
+   pe = pnv_ioda_setup_dev_PE(dev);
+   if (!pe)
+   return false;
+   }
+   return true;
+}
+
  static long pnv_pci_ioda1_unset_window(struct iommu_table_group *table_group,
   int num)
  {
@@ -3625,7 +3640,7 @@ static const struct pci_controller_ops 
pnv_npu_ioda_controller_ops = {
  };
  
  static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {

-   .enable_device_hook = pnv_pci_enable_device_hook,
+   .enable_device_hook = pnv_ocapi_enable_device_hook,
.window_alignment   = pnv_pci_window_alignment,
.reset_secondary_bus= pnv_pci_reset_secondary_bus,
.shutdown   = pnv_pci_ioda_shutdown,



--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited



Re: [PATCH 09/11] pci/hotplug/pnv-php: Relax check when disabling slot

2019-09-27 Thread Andrew Donnellan

On 9/9/19 5:45 pm, Frederic Barrat wrote:

The driver only allows disabling a slot in the POPULATED
state. However, if an error occurs while enabling the slot, say
because the link couldn't be trained, then the POPULATED state may not
be reached, yet the power state of the slot is on. So allow disabling
a slot in the REGISTERED state. Removing the devices will do nothing
since it's not populated, and we'll set the power state of the slot
back to off.

Signed-off-by: Frederic Barrat 


Makes sense

Reviewed-by: Andrew Donnellan 


---
  drivers/pci/hotplug/pnv_php.c | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
index f0a7360154e7..5ca51d67db4b 100644
--- a/drivers/pci/hotplug/pnv_php.c
+++ b/drivers/pci/hotplug/pnv_php.c
@@ -523,7 +523,13 @@ static int pnv_php_disable_slot(struct hotplug_slot *slot)
struct pnv_php_slot *php_slot = to_pnv_php_slot(slot);
int ret;
  
-	if (php_slot->state != PNV_PHP_STATE_POPULATED)

+   /*
+* Allow to disable a slot already in the registered state to
+* cover cases where the slot couldn't be enabled and never
+* reached the populated state
+*/
+   if (php_slot->state != PNV_PHP_STATE_POPULATED &&
+   php_slot->state != PNV_PHP_STATE_REGISTERED)
return 0;
  
  	/* Remove all devices behind the slot */




--
Andrew Donnellan  OzLabs, ADL Canberra
a...@linux.ibm.com IBM Australia Limited



Re: [PATCH v3 00/11] Introduces new count-based method for monitoring lockless pagetable walks

2019-09-27 Thread Leonardo Bras
John Hubbard  writes:

> Hi Leonardo,
>
> Thanks for adding linux-mm to CC for this next round of reviews. For the 
> benefit
> of any new reviewers, I'd like to add that there are some issues that were 
> discovered
> while reviewing the v2 patchset, that are not (yet) addressed in this v3 
> series.

> Since those issues are not listed in the cover letter above, I'll list them 
> here

Thanks for bringing that.
The cover letter is a great place to put this info, I will keep that in
mind for future patchsets.

>
> 1. The locking model requires a combination of disabling interrupts and
> atomic counting and memory barriers, but
>
>   a) some memory barriers are missing
>   (start/end_lockless_pgtbl_walk), and

It seems that it works fine today because of the number of instructions
executed between the irq_disable / start_lockless_pgtbl_walk and where
the THP collapse/split can happen. (It's very unlikely that it reorders
that much.)

But I don't think it would be so bad to put a memory barrier after
irq_disable just in case.
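
To make that concrete, a minimal sketch of where the barrier would go
(assuming the counter-based start helper from this series; the field
name is illustrative, not necessarily what the series uses):

void start_lockless_pgtbl_walk(struct mm_struct *mm)
{
	atomic_inc(&mm->context.lockless_pgtbl_walk_count);
	/*
	 * Pairs with the smp_mb() in serialize_against_pte_lookup():
	 * make the counter increment visible before this walker starts
	 * dereferencing page table entries.
	 */
	smp_mb__after_atomic();
}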

>   b) some cases (patch #8) fail to disable interrupts

I have done some looking into that, and it seems that some uses of
{start,end}_lockless_pgtbl_walk are unneeded, because they operate on a
(nested) guest pgd and I was told it's safe against THP split/collapse.

In other uses, there is no interrupt disable because the function is
called in real mode, with MSR_EE=0, and there we already have interrupts
disabled, so there is no need to disable them again.

>
> ...so the synchronization appears to be inadequate. (And if it *is* adequate, 
> then
> definitely we need the next item, to explain it.)


>
> 2. Documentation of the synchronization/locking model needs to exist, once we
> figure out the exact details of (1).

I will add the missing doc in the code, so it may be easier to
understand in the future.

>
> 3. Related to (1), I've asked to change things so that interrupt controls and 
> atomic inc/dec are in the same start/end calls--assuming, of course, that the
> caller can tolerate that. 

I am not sure if it would be ok to use irq_{save,restore} in real mode;
I will do some more reading of the docs before addressing this.
>
> 4. Please see the v2 series for any other details I've missed.
>
> thanks,
> -- 
> John Hubbard
> NVIDIA
>

Thank you for helping, John!

Best regards,
Leonardo Bras


signature.asc
Description: This is a digitally signed message part


[PATCH] powerpc/vio: drop bus_type from parent device

2019-09-27 Thread Thadeu Lima de Souza Cascardo
Commit df44b479654f62b478c18ee4d8bc4e9f897a9844 ("kobject: return error code if
writing /sys/.../uevent fails") started returning failure when writing to
/sys/devices/vio/uevent.

This causes an early udevadm trigger to fail. On some installer versions of
Ubuntu, this will cause init to exit, thus panicking the system very early
during boot.

Removing the bus_type from the parent device will remove some of the extra
empty files from /sys/devices/vio/, but will keep the rest of the layout for
vio devices, keeping them under /sys/devices/vio/.

It has been tested that uevents for vio devices don't change after this fix,
they still contain MODALIAS.

Signed-off-by: Thadeu Lima de Souza Cascardo 
---
 arch/powerpc/platforms/pseries/vio.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 6601b9d404dc..d570d5494f30 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -36,7 +36,6 @@ static struct vio_dev vio_bus_device  = { /* fake "parent" 
device */
.name = "vio",
.type = "",
.dev.init_name = "vio",
-   .dev.bus = &vio_bus_type,
 };
 
 #ifdef CONFIG_PPC_SMLPAR
-- 
2.20.1



[PATCH v6 9/9] powerpc/ima: update ima arch policy to check for blacklist

2019-09-27 Thread Nayna Jain
This patch updates the arch specific policies for PowerNV systems
to add a check against blacklisted hashes before doing the verification.

Signed-off-by: Nayna Jain 
---
 arch/powerpc/kernel/ima_arch.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c
index 77c61b142042..3f57433c0824 100644
--- a/arch/powerpc/kernel/ima_arch.c
+++ b/arch/powerpc/kernel/ima_arch.c
@@ -24,9 +24,9 @@ bool arch_ima_get_secureboot(void)
 static const char *const arch_rules[] = {
"measure func=KEXEC_KERNEL_CHECK template=ima-modsig",
"measure func=MODULE_CHECK template=ima-modsig",
-   "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig",
+   "appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist 
appraise_type=imasig|modsig",
 #if !IS_ENABLED(CONFIG_MODULE_SIG)
-   "appraise func=MODULE_CHECK appraise_type=imasig|modsig",
+   "appraise func=MODULE_CHECK appraise_flag=check_blacklist 
appraise_type=imasig|modsig",
 #endif
NULL
 };
-- 
2.20.1



[PATCH v6 8/9] ima: deprecate permit_directio, instead use appraise_flag

2019-09-27 Thread Nayna Jain
This patch deprecates the existing permit_directio flag and instead adds
it as a possible value of the appraise_flag parameter.
For example:
appraise_flag=permit_directio
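
A hypothetical rule using the new syntax (illustrative only, not taken
from the patch):

appraise func=FILE_CHECK fowner=0 appraise_flag=permit_directio appraise_type=imasig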

Signed-off-by: Nayna Jain 
---
 Documentation/ABI/testing/ima_policy | 4 ++--
 security/integrity/ima/ima_policy.c  | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/ABI/testing/ima_policy 
b/Documentation/ABI/testing/ima_policy
index 4c97afcc0f3c..9a2a140dc561 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -24,8 +24,8 @@ Description:
[euid=] [fowner=] [fsname=]]
lsm:[[subj_user=] [subj_role=] [subj_type=]
 [obj_user=] [obj_role=] [obj_type=]]
-   option: [[appraise_type=]] [template=] [permit_directio]
-   [appraise_flag=[check_blacklist]]
+   option: [[appraise_type=]] [template=] 
[permit_directio(deprecated)]
+   
[appraise_flag=[check_blacklist]|[permit_directio]]
base:   func:= 
[BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK]
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
diff --git a/security/integrity/ima/ima_policy.c 
b/security/integrity/ima/ima_policy.c
index ad3b3af69460..d9df54c75d46 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -1177,6 +1177,8 @@ static int ima_parse_rule(char *rule, struct 
ima_rule_entry *entry)
ima_log_string(ab, "appraise_flag", args[0].from);
if (strstr(args[0].from, "blacklist"))
entry->flags |= IMA_CHECK_BLACKLIST;
+   if (strstr(args[0].from, "permit_directio"))
+   entry->flags |= IMA_PERMIT_DIRECTIO;
break;
case Opt_permit_directio:
entry->flags |= IMA_PERMIT_DIRECTIO;
-- 
2.20.1



[PATCH v6 7/9] ima: check against blacklisted hashes for files with modsig

2019-09-27 Thread Nayna Jain
Asymmetric private keys are used to sign multiple files. The kernel
currently supports checking against blacklisted keys. However, if the
public key is blacklisted, any file signed by the blacklisted key will
automatically fail signature verification. We might not want to blacklist
all the files signed by a particular key, but just a single file.
Blacklisting the public key is not fine enough granularity.

This patch adds support for blacklisting binaries with appended signatures,
based on the IMA policy. A new policy option,
"appraise_flag=check_blacklist", is defined.

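For example, the powerpc arch policy updated in patch 9/9 of this
series uses it as:

appraise func=KEXEC_KERNEL_CHECK appraise_flag=check_blacklist appraise_type=imasig|modsig
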
Signed-off-by: Nayna Jain 
---
 Documentation/ABI/testing/ima_policy  |  1 +
 security/integrity/ima/ima.h  | 12 +
 security/integrity/ima/ima_appraise.c | 35 +++
 security/integrity/ima/ima_main.c |  8 --
 security/integrity/ima/ima_policy.c   | 10 ++--
 security/integrity/integrity.h|  1 +
 6 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/Documentation/ABI/testing/ima_policy 
b/Documentation/ABI/testing/ima_policy
index 29ebe9afdac4..4c97afcc0f3c 100644
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -25,6 +25,7 @@ Description:
lsm:[[subj_user=] [subj_role=] [subj_type=]
 [obj_user=] [obj_role=] [obj_type=]]
option: [[appraise_type=]] [template=] [permit_directio]
+   [appraise_flag=[check_blacklist]]
base:   func:= 
[BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK]
[FIRMWARE_CHECK]
[KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK]
diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 9bf509217e8e..2c034728b239 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -254,6 +254,9 @@ int ima_policy_show(struct seq_file *m, void *v);
 #define IMA_APPRAISE_KEXEC 0x40
 
 #ifdef CONFIG_IMA_APPRAISE
+int ima_blacklist_measurement(struct integrity_iint_cache *iint,
+ const struct modsig *modsig, int action,
+ int pcr, struct ima_template_desc *template_desc);
 int ima_appraise_measurement(enum ima_hooks func,
 struct integrity_iint_cache *iint,
 struct file *file, const unsigned char *filename,
@@ -269,6 +272,15 @@ int ima_read_xattr(struct dentry *dentry,
   struct evm_ima_xattr_data **xattr_value);
 
 #else
+static inline int ima_blacklist_measurement(struct integrity_iint_cache *iint,
+   const struct modsig *modsig,
+   int action, int pcr,
+   struct ima_template_desc
+   *template_desc)
+{
+   return 0;
+}
+
 static inline int ima_appraise_measurement(enum ima_hooks func,
   struct integrity_iint_cache *iint,
   struct file *file,
diff --git a/security/integrity/ima/ima_appraise.c 
b/security/integrity/ima/ima_appraise.c
index 136ae4e0ee92..a5a82e870e24 100644
--- a/security/integrity/ima/ima_appraise.c
+++ b/security/integrity/ima/ima_appraise.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ima.h"
 
@@ -303,6 +304,40 @@ static int modsig_verify(enum ima_hooks func, const struct 
modsig *modsig,
return rc;
 }
 
+/*
+ * ima_blacklist_measurement - checks if the file measurement is blacklisted
+ *
+ * Returns -EKEYREJECTED if the hash is blacklisted.
+ */
+int ima_blacklist_measurement(struct integrity_iint_cache *iint,
+ const struct modsig *modsig, int action, int pcr,
+ struct ima_template_desc *template_desc)
+{
+   enum hash_algo hash_algo;
+   const u8 *digest = NULL;
+   u32 digestsize = 0;
+   u32 secid;
+   int rc = 0;
+
+   if (!(iint->flags & IMA_CHECK_BLACKLIST))
+   return 0;
+
+   if (iint->flags & IMA_MODSIG_ALLOWED) {
+   security_task_getsecid(current, &secid);
+   ima_get_modsig_digest(modsig, &hash_algo, &digest, &digestsize);
+
+   rc = is_hash_blacklisted(digest, digestsize, "bin");
+
+   /* Returns -EKEYREJECTED on blacklisted hash found */
+   if ((rc == -EKEYREJECTED) && (iint->flags & IMA_MEASURE))
+   process_buffer_measurement(digest, digestsize,
+  "blacklisted-hash", pcr,
+  template_desc);
+   }
+
+   return rc;
+}
+
 /*
  * ima_appraise_measurement - appraise file measurement
  *
diff --git a/security/integrity/ima/ima_main.c 
b/security/integrity/ima/ima_main.c
index ae0c1bdc4eaf..92c446045637 

[PATCH v6 6/9] ima: make process_buffer_measurement() non static

2019-09-27 Thread Nayna Jain
To add support for checking against the blacklist, an additional
measurement record is needed that identifies the record
as blacklisted.

This patch modifies process_buffer_measurement() and makes it
non-static so it can be used by the blacklist functionality. It also
modifies the function to handle more than just the KEXEC_CMDLINE.

Signed-off-by: Nayna Jain 
---
 security/integrity/ima/ima.h  |  3 +++
 security/integrity/ima/ima_main.c | 29 ++---
 2 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/security/integrity/ima/ima.h b/security/integrity/ima/ima.h
index 19769bf5f6ab..9bf509217e8e 100644
--- a/security/integrity/ima/ima.h
+++ b/security/integrity/ima/ima.h
@@ -215,6 +215,9 @@ void ima_store_measurement(struct integrity_iint_cache 
*iint, struct file *file,
   struct evm_ima_xattr_data *xattr_value,
   int xattr_len, const struct modsig *modsig, int pcr,
   struct ima_template_desc *template_desc);
+void process_buffer_measurement(const void *buf, int size,
+   const char *eventname, int pcr,
+   struct ima_template_desc *template_desc);
 void ima_audit_measurement(struct integrity_iint_cache *iint,
   const unsigned char *filename);
 int ima_alloc_init_template(struct ima_event_data *event_data,
diff --git a/security/integrity/ima/ima_main.c 
b/security/integrity/ima/ima_main.c
index 79c01516211b..ae0c1bdc4eaf 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -626,14 +626,14 @@ int ima_load_data(enum kernel_load_data_id id)
  * @buf: pointer to the buffer that needs to be added to the log.
  * @size: size of buffer(in bytes).
  * @eventname: event name to be used for the buffer entry.
- * @cred: a pointer to a credentials structure for user validation.
- * @secid: the secid of the task to be validated.
+ * @pcr: pcr to extend the measurement
+ * @template_desc: template description
  *
  * Based on policy, the buffer is measured into the ima log.
  */
-static void process_buffer_measurement(const void *buf, int size,
-  const char *eventname,
-  const struct cred *cred, u32 secid)
+void process_buffer_measurement(const void *buf, int size,
+   const char *eventname, int pcr,
+   struct ima_template_desc *template_desc)
 {
int ret = 0;
struct ima_template_entry *entry = NULL;
@@ -642,19 +642,11 @@ static void process_buffer_measurement(const void *buf, 
int size,
.filename = eventname,
.buf = buf,
.buf_len = size};
-   struct ima_template_desc *template_desc = NULL;
struct {
struct ima_digest_data hdr;
char digest[IMA_MAX_DIGEST_SIZE];
} hash = {};
int violation = 0;
-   int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
-   int action = 0;
-
-   action = ima_get_action(NULL, cred, secid, 0, KEXEC_CMDLINE, &pcr,
-   &template_desc);
-   if (!(action & IMA_MEASURE))
-   return;
 
	iint.ima_hash = &hash.hdr;
iint.ima_hash->algo = ima_hash_algo;
@@ -686,12 +678,19 @@ static void process_buffer_measurement(const void *buf, 
int size,
  */
 void ima_kexec_cmdline(const void *buf, int size)
 {
+   int pcr = CONFIG_IMA_MEASURE_PCR_IDX;
+   struct ima_template_desc *template_desc = NULL;
+   int action;
u32 secid;
 
if (buf && size != 0) {
	security_task_getsecid(current, &secid);
-   process_buffer_measurement(buf, size, "kexec-cmdline",
-  current_cred(), secid);
+   action = ima_get_action(NULL, current_cred(), secid, 0,
+   KEXEC_CMDLINE, &pcr, &template_desc);
+   if (!(action & IMA_MEASURE))
+   return;
+   process_buffer_measurement(buf, size, "kexec-cmdline", pcr,
+  template_desc);
}
 }
 
-- 
2.20.1



[PATCH v6 5/9] powerpc/ima: add measurement rules to ima arch specific policy

2019-09-27 Thread Nayna Jain
This patch adds the measurement rules to the arch-specific policies for
systems with trusted boot enabled.

Signed-off-by: Nayna Jain 
---
 arch/powerpc/kernel/ima_arch.c | 44 +++---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c
index 39401b67f19e..77c61b142042 100644
--- a/arch/powerpc/kernel/ima_arch.c
+++ b/arch/powerpc/kernel/ima_arch.c
@@ -12,8 +12,18 @@ bool arch_ima_get_secureboot(void)
return is_powerpc_os_secureboot_enabled();
 }
 
-/* Defines IMA appraise rules for secureboot */
+/*
+ * The "arch_rules" contains both the secureboot and trustedboot rules for adding
+ * the kexec kernel image and kernel modules file hashes to the IMA measurement
+ * list and verifying the file signatures against known good values.
+ *
+ * The "appraise_type=imasig|modsig" option allows the good signature to be
+ * stored as an xattr or as an appended signature. The "template=ima-modsig"
+ * option includes the appended signature in the IMA measurement list.
+ */
 static const char *const arch_rules[] = {
+   "measure func=KEXEC_KERNEL_CHECK template=ima-modsig",
+   "measure func=MODULE_CHECK template=ima-modsig",
"appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig",
 #if !IS_ENABLED(CONFIG_MODULE_SIG)
"appraise func=MODULE_CHECK appraise_type=imasig|modsig",
@@ -22,12 +32,40 @@ static const char *const arch_rules[] = {
 };
 
 /*
- * Returns the relevant IMA arch policies based on the system secureboot state.
+ * The "measure_rules" are enabled only on "trustedboot" enabled systems.
+ * These rules add the kexec kernel image and kernel modules file hashes to
+ * the IMA measurement list.
+ */
+static const char *const measure_rules[] = {
+   "measure func=KEXEC_KERNEL_CHECK",
+   "measure func=MODULE_CHECK",
+   NULL
+};
+
+/*
+ * Returns the relevant IMA arch policies based on the system secureboot
+ * and trustedboot state.
  */
 const char *const *arch_get_ima_policy(void)
 {
-   if (is_powerpc_os_secureboot_enabled())
+   const char *const *rules;
+   int offset = 0;
+
+   for (rules = arch_rules; *rules != NULL; rules++) {
+   if (strncmp(*rules, "appraise", 8) == 0)
+   break;
+   offset++;
+   }
+
+   if (is_powerpc_os_secureboot_enabled()
+   && is_powerpc_trustedboot_enabled())
return arch_rules;
 
+   if (is_powerpc_os_secureboot_enabled())
+   return arch_rules + offset;
+
+   if (is_powerpc_trustedboot_enabled())
+   return measure_rules;
+
return NULL;
 }
-- 
2.20.1



[PATCH v6 4/9] powerpc: detect the trusted boot state of the system

2019-09-27 Thread Nayna Jain
PowerNV systems enable the IMA measurement rules only if
trusted boot is enabled on the system.

This patch adds the function to detect if the system has trusted
boot enabled.
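
The property checked here lives under the ibm,secureboot device tree
node described in patch 1/9, e.g. (shortened from the binding example):

    ibm,secureboot {
            compatible = "ibm,secureboot-v2";
            secure-enabled;
            trusted-enabled;
            ...
    };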

Signed-off-by: Nayna Jain 
---
 arch/powerpc/include/asm/secure_boot.h |  6 ++
 arch/powerpc/kernel/secure_boot.c  | 14 ++
 2 files changed, 20 insertions(+)

diff --git a/arch/powerpc/include/asm/secure_boot.h 
b/arch/powerpc/include/asm/secure_boot.h
index 4e8e2b08a993..192caaedbe7a 100644
--- a/arch/powerpc/include/asm/secure_boot.h
+++ b/arch/powerpc/include/asm/secure_boot.h
@@ -14,6 +14,7 @@
 
 bool is_powerpc_os_secureboot_enabled(void);
 int get_powerpc_os_sb_node(struct device_node **node);
+bool is_powerpc_trustedboot_enabled(void);
 
 #else
 
@@ -27,5 +28,10 @@ static inline int get_powerpc_os_sb_node(struct device_node 
**node)
return -ENOENT;
 }
 
+static inline bool is_powerpc_trustedboot_enabled(void)
+{
+   return false;
+}
+
 #endif
 #endif
diff --git a/arch/powerpc/kernel/secure_boot.c 
b/arch/powerpc/kernel/secure_boot.c
index 45ca19f5e836..9d452e1550ae 100644
--- a/arch/powerpc/kernel/secure_boot.c
+++ b/arch/powerpc/kernel/secure_boot.c
@@ -80,3 +80,17 @@ bool is_powerpc_os_secureboot_enabled(void)
pr_info("secureboot mode disabled\n");
return false;
 }
+
+bool is_powerpc_trustedboot_enabled(void)
+{
+   struct device_node *node;
+
+   node = get_powerpc_fw_sb_node();
+   if (node && (of_find_property(node, "trusted-enabled", NULL))) {
+   pr_info("trustedboot mode enabled\n");
+   return true;
+   }
+
+   pr_info("trustedboot mode disabled\n");
+   return false;
+}
-- 
2.20.1



[PATCH v6 3/9] powerpc: add support to initialize ima policy rules

2019-09-27 Thread Nayna Jain
PowerNV systems use a kernel-based bootloader, so their secure boot
implementation uses the kernel's IMA security subsystem to verify the
kernel before kexec. Since the verification policy might differ based on
the secure boot mode of the system, the policies are defined at runtime.

This patch implements the arch-specific support to define the IMA policy
rules based on the runtime secure boot mode of the system.

This patch provides arch-specific IMA policies if the PPC_SECURE_BOOT
config is enabled.

Signed-off-by: Nayna Jain 
---
 arch/powerpc/Kconfig   |  2 ++
 arch/powerpc/kernel/Makefile   |  2 +-
 arch/powerpc/kernel/ima_arch.c | 33 +
 include/linux/ima.h|  3 ++-
 4 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kernel/ima_arch.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 2c54beb29f1a..54eda07c74e5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -916,6 +916,8 @@ config PPC_SECURE_BOOT
prompt "Enable secure boot support"
bool
depends on PPC_POWERNV
+   depends on IMA
+   depends on IMA_ARCH_POLICY
help
	  Systems with firmware secure boot enabled need to define security
	  policies to extend secure boot to the OS. This config allows the user
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 875b0785a20e..7156ac1fc956 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -157,7 +157,7 @@ endif
 obj-$(CONFIG_EPAPR_PARAVIRT)   += epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o
 
-obj-$(CONFIG_PPC_SECURE_BOOT)  += secure_boot.o
+obj-$(CONFIG_PPC_SECURE_BOOT)  += secure_boot.o ima_arch.o
 
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
diff --git a/arch/powerpc/kernel/ima_arch.c b/arch/powerpc/kernel/ima_arch.c
new file mode 100644
index ..39401b67f19e
--- /dev/null
+++ b/arch/powerpc/kernel/ima_arch.c
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Nayna Jain
+ */
+
+#include 
+#include 
+
+bool arch_ima_get_secureboot(void)
+{
+   return is_powerpc_os_secureboot_enabled();
+}
+
+/* Defines IMA appraise rules for secureboot */
+static const char *const arch_rules[] = {
+   "appraise func=KEXEC_KERNEL_CHECK appraise_type=imasig|modsig",
+#if !IS_ENABLED(CONFIG_MODULE_SIG)
+   "appraise func=MODULE_CHECK appraise_type=imasig|modsig",
+#endif
+   NULL
+};
+
+/*
+ * Returns the relevant IMA arch policies based on the system secureboot state.
+ */
+const char *const *arch_get_ima_policy(void)
+{
+   if (is_powerpc_os_secureboot_enabled())
+   return arch_rules;
+
+   return NULL;
+}
diff --git a/include/linux/ima.h b/include/linux/ima.h
index a20ad398d260..10af09b5b478 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -29,7 +29,8 @@ extern void ima_kexec_cmdline(const void *buf, int size);
 extern void ima_add_kexec_buffer(struct kimage *image);
 #endif
 
-#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390)
+#if (defined(CONFIG_X86) && defined(CONFIG_EFI)) || defined(CONFIG_S390) \
+   || defined(CONFIG_PPC_SECURE_BOOT)
 extern bool arch_ima_get_secureboot(void);
 extern const char * const *arch_get_ima_policy(void);
 #else
-- 
2.20.1



[PATCH v6 2/9] powerpc: detect the secure boot mode of the system

2019-09-27 Thread Nayna Jain
Secure boot on PowerNV defines different IMA policies based on the secure
boot state of the system.

This patch defines a function to detect the secure boot state of the
system.

The PPC_SECURE_BOOT config represents the base enablement of secureboot
on POWER.

Signed-off-by: Nayna Jain 
---
 arch/powerpc/Kconfig   | 10 
 arch/powerpc/include/asm/secure_boot.h | 31 ++
 arch/powerpc/kernel/Makefile   |  2 +
 arch/powerpc/kernel/secure_boot.c  | 82 ++
 4 files changed, 125 insertions(+)
 create mode 100644 arch/powerpc/include/asm/secure_boot.h
 create mode 100644 arch/powerpc/kernel/secure_boot.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 77f6ebf97113..2c54beb29f1a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -912,6 +912,16 @@ config PPC_MEM_KEYS
 
  If unsure, say y.
 
+config PPC_SECURE_BOOT
+   prompt "Enable secure boot support"
+   bool
+   depends on PPC_POWERNV
+   help
+ Systems with firmware secure boot enabled need to define security
+ policies to extend secure boot to the OS. This config allows the user
+ to enable OS secure boot on systems that have firmware support for
+ it. If in doubt say N.
+
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/powerpc/include/asm/secure_boot.h 
b/arch/powerpc/include/asm/secure_boot.h
new file mode 100644
index ..4e8e2b08a993
--- /dev/null
+++ b/arch/powerpc/include/asm/secure_boot.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Secure boot definitions
+ *
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Nayna Jain
+ */
+#ifndef _ASM_POWER_SECURE_BOOT_H
+#define _ASM_POWER_SECURE_BOOT_H
+
+#ifdef CONFIG_PPC_SECURE_BOOT
+
+#define SECURE_BOOT_MASK   0x
+
+bool is_powerpc_os_secureboot_enabled(void);
+int get_powerpc_os_sb_node(struct device_node **node);
+
+#else
+
+static inline bool is_powerpc_os_secureboot_enabled(void)
+{
+   return false;
+}
+
+static inline int get_powerpc_os_sb_node(struct device_node **node)
+{
+   return -ENOENT;
+}
+
+#endif
+#endif
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index ea0c69236789..875b0785a20e 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -157,6 +157,8 @@ endif
 obj-$(CONFIG_EPAPR_PARAVIRT)   += epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o
 
+obj-$(CONFIG_PPC_SECURE_BOOT)  += secure_boot.o
+
 # Disable GCOV, KCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 KCOV_INSTRUMENT_prom_init.o := n
diff --git a/arch/powerpc/kernel/secure_boot.c 
b/arch/powerpc/kernel/secure_boot.c
new file mode 100644
index ..45ca19f5e836
--- /dev/null
+++ b/arch/powerpc/kernel/secure_boot.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 IBM Corporation
+ * Author: Nayna Jain
+ */
+#include 
+#include 
+#include 
+
+static struct device_node *get_powerpc_fw_sb_node(void)
+{
+   return of_find_node_by_name(NULL, "ibm,secureboot");
+}
+
+bool is_powerpc_os_sb_supported(void)
+{
+   struct device_node *node = NULL;
+
+   node = get_powerpc_fw_sb_node();
+   if (node && of_device_is_compatible(node, "ibm,secureboot-v3"))
+   return true;
+
+   return false;
+}
+
+int get_powerpc_os_sb_node(struct device_node **node)
+{
+   struct device_node *fwsbnode;
+
+   if (!is_powerpc_os_sb_supported())
+   return -ENOTSUPP;
+
+   fwsbnode = get_powerpc_fw_sb_node();
+   if (!fwsbnode)
+   return -ENOENT;
+
+   *node = of_find_node_by_name(fwsbnode, "secvar");
+   if (*node)
+   return 0;
+
+   return -ENOENT;
+}
+
+bool is_powerpc_os_secureboot_enabled(void)
+{
+   struct device_node *node;
+   u64 sbmode = 0;
+   int rc;
+
+   rc = get_powerpc_os_sb_node(&node);
+   if (rc == -ENOTSUPP)
+   goto disabled;
+
+   /* Fail secure for any failure related to secvar */
+   if (rc) {
+   pr_err("Expected secure variables support, fail secure\n");
+   goto enabled;
+   }
+
+   if (!of_device_is_available(node)) {
+   pr_err("Secure variables support is in error state, fail secure\n");
+   goto enabled;
+   }
+
+   rc = of_property_read_u64(node, "os-secure-mode", &sbmode);
+   if (rc)
+   goto enabled;
+
+   sbmode = be64_to_cpu(sbmode);
+
+   /* checks for the secure mode enforcing bit */
+   if (!(sbmode & SECURE_BOOT_MASK))
+   goto disabled;
+
+enabled:
+   pr_info("secureboot mode enabled\n");
+   return true;
+
+disabled:
+   pr_info("secureboot mode disabled\n");
+   return false;
+}
-- 
2.20.1



[PATCH v6 1/9] dt-bindings: ibm, secureboot: secure boot specific properties for PowerNV

2019-09-27 Thread Nayna Jain
PowerNV represents both the firmware and host OS secureboot state of the
system via the device tree. This patch adds documentation giving
the definition of the nodes and the properties.

Signed-off-by: Nayna Jain 
---
 .../bindings/powerpc/ibm,secureboot.rst   | 76 
 .../devicetree/bindings/powerpc/secvar.rst| 89 +++
 2 files changed, 165 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst
 create mode 100644 Documentation/devicetree/bindings/powerpc/secvar.rst

diff --git a/Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst 
b/Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst
new file mode 100644
index ..03d32099d2eb
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: GPL-2.0
+*** NOTE ***
+This document is copied from OPAL firmware
+(skiboot/doc/device-tree/ibm,secureboot.rst)
+
+.. _device-tree/ibm,secureboot:
+
+ibm,secureboot
+==
+
+The ``ibm,secureboot`` node provides secure boot and trusted boot information
+up to the target OS. Further information can be found in :ref:`stb-overview`.
+
+Required properties
+---
+
+.. code-block:: none
+
+compatible: Either one of the following values:
+
+ibm,secureboot-v1  :  The container-verification-code
+  is stored in a secure ROM memory.
+
+ibm,secureboot-v2  :  The container-verification-code
+  is stored in a reserved memory.
+  It is described by the ibm,cvc child
+  node.
+
+ibm,secureboot-v3  :  The container-verification-code
+  is stored in a reserved memory.
+  It is described by the ibm,cvc child
+  node. Secure variables are
+  supported. `secvar` node should
+  be created.
+
+secure-enabled: this property exists when the firmware stack is booting
+in secure mode (hardware secure boot jumper asserted).
+
+trusted-enabled:this property exists when the firmware stack is booting
+in trusted mode.
+
+hw-key-hash:hash of the three hardware public keys trusted by the
+platform owner. This is used to verify that firmware
+code is signed with trusted keys.
+
+hw-key-hash-size:   hw-key-hash size
+
+secvar: this node is created if the platform supports secure
+variables. Contains information about the current
+secvar status, see 'secvar.rst'.
+
+Obsolete properties
+---
+
+.. code-block:: none
+
+hash-algo:  Superseded by the hw-key-hash-size property in
+'ibm,secureboot-v2'.
+
+Example
+---
+
+.. code-block:: dts
+
+ibm,secureboot {
+compatible = "ibm,secureboot-v2";
+secure-enabled;
+trusted-enabled;
+hw-key-hash-size = <0x40>;
+hw-key-hash = <0x40d487ff 0x7380ed6a 0xd54775d5 0x795fea0d 0xe2f541fe
+   0xa9db06b8 0x466a42a3 0x20e65f75 0xb4866546 0x0017d907
+   0x515dc2a5 0xf9fc5095 0x4d6ee0c9 0xb67d219d 0xfb708535
+   0x1d01d6d1>;
+phandle = <0x10fd>;
+linux,phandle = <0x10fd>;
+};
diff --git a/Documentation/devicetree/bindings/powerpc/secvar.rst 
b/Documentation/devicetree/bindings/powerpc/secvar.rst
new file mode 100644
index ..47793ab9c2a7
--- /dev/null
+++ b/Documentation/devicetree/bindings/powerpc/secvar.rst
@@ -0,0 +1,89 @@
+# SPDX-License-Identifier: GPL-2.0
+*** NOTE ***
+This document is copied from OPAL firmware
+(skiboot/doc/device-tree/secvar.rst)
+
+.. _device-tree/ibm,secureboot/secvar:
+
+secvar
+==
+
+The ``secvar`` node provides secure variable information for the secure
+boot of the target OS.
+
+Required properties
+---
+
+.. code-block:: none
+
+compatible: this property is set based on the current secure
+variable scheme as set by the platform.
+
+status: set to "fail" if the secure variables could not
+be initialized, validated, or some other
+catastrophic failure.
+
+update-status:  contains the return code of the update queue
+process run during initialization. Signifies if
+updates were processed or not, and if there was
+an error. See table 

[PATCH v6 0/9] powerpc: Enabling IMA arch specific secure boot policies

2019-09-27 Thread Nayna Jain
This patchset extends the previous version of the patchset[1] by adding
support for checking against blacklisted binary hashes.

The IMA subsystem supports custom, built-in, and arch-specific policies
to define the files to be measured and appraised. These policies are
honored by priority, where arch-specific policies are the highest and
custom the lowest.

PowerNV systems use a Linux-based bootloader and kexec the host OS.
They rely on IMA for signature verification of the kernel before doing
the kexec. This patchset adds support for powerpc arch-specific IMA
policies that are defined based on the system's OS secureboot and
trustedboot state. The OS secureboot and trustedboot state are
determined via device-tree properties.

The verification needs to be done only for binaries which are not
blacklisted. The kernel currently checks against blacklisted keys.
However, that results in blacklisting all the binaries signed by that
key. In order to prevent a single binary from loading, it is necessary
to support checking against a blacklisted binary hash. This patchset
adds support in IMA to check against blacklisted hashes for files
signed with an appended signature.

[1] http://patchwork.ozlabs.org/cover/1149262/ 

Changelog:
v6:
* includes feedbacks from Michael Ellerman on the patchset v5
  * removed email ids from comments
  * add the doc for the device-tree
  * renames the secboot.c to secure_boot.c and secboot.h to secure_boot.h
  * other code specific fixes
* split the patches to differentiate between secureboot and trustedboot
state of the system
* adds the patches to support the blacklisting of the binary hash.

v5:
* secureboot state is now read via device tree entry rather than OPAL
secure variables
* ima arch policies are updated to use policy based template for
measurement rules

v4:
* Fixed the build issue as reported by Satheesh Rajendran.

v3:
* OPAL APIs in Patch 1 are updated to provide generic interface based on
key/keylen. This patchset updates kernel OPAL APIs to be compatible with
generic interface.
* Patch 2 is cleaned up to use new OPAL APIs.
* Since OPAL can support different types of backend which can vary in the
variable interpretation, the Patch 2 is updated to add a check for the
backend version
* OPAL API now expects consumer to first check the supported backend version
before calling other secvar OPAL APIs. This check is now added in patch 2.
* IMA policies in Patch 3 is updated to specify appended signature and
per policy template.
* The patches now are free of any EFIisms.

v2:

* Removed Patch 1: powerpc/include: Override unneeded early ioremap
functions
* Updated Subject line and patch description of the Patch 1 of this series
* Removed dependency of OPAL_SECVAR on EFI, CPU_BIG_ENDIAN and UCS2_STRING
* Changed OPAL APIs from static to non-static. Added opal-secvar.h for the
same
* Removed EFI hooks from opal_secvar.c
* Removed opal_secvar_get_next(), opal_secvar_enqueue() and
opal_query_variable_info() function
* get_powerpc_sb_mode() in secboot.c now directly calls OPAL Runtime API
rather than via EFI hooks.
* Fixed log messages in get_powerpc_sb_mode() function.
* Added dependency for PPC_SECURE_BOOT on configs PPC64 and OPAL_SECVAR
* Replaced obj-$(CONFIG_IMA) with obj-$(CONFIG_PPC_SECURE_BOOT) in
arch/powerpc/kernel/Makefile

Nayna Jain (9):
  dt-bindings: ibm,secureboot: secure boot specific properties for
PowerNV
  powerpc: detect the secure boot mode of the system
  powerpc: add support to initialize ima policy rules
  powerpc: detect the trusted boot state of the system
  powerpc/ima: add measurement rules to ima arch specific policy
  ima: make process_buffer_measurement() non-static
  ima: check against blacklisted hashes for files with modsig
  ima: deprecate permit_directio, instead use appraise_flag
  powerpc/ima: update ima arch policy to check for blacklist

 Documentation/ABI/testing/ima_policy  |  3 +-
 .../bindings/powerpc/ibm,secureboot.rst   | 76 +++
 .../devicetree/bindings/powerpc/secvar.rst| 89 +
 arch/powerpc/Kconfig  | 12 +++
 arch/powerpc/include/asm/secure_boot.h| 37 +++
 arch/powerpc/kernel/Makefile  |  2 +
 arch/powerpc/kernel/ima_arch.c| 71 ++
 arch/powerpc/kernel/secure_boot.c | 96 +++
 include/linux/ima.h   |  3 +-
 security/integrity/ima/ima.h  | 15 +++
 security/integrity/ima/ima_appraise.c | 35 +++
 security/integrity/ima/ima_main.c | 37 +++
 security/integrity/ima/ima_policy.c   | 12 ++-
 security/integrity/integrity.h|  1 +
 14 files changed, 468 insertions(+), 21 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/powerpc/ibm,secureboot.rst
 create mode 100644 Documentation/devicetree/bindings/powerpc/secvar.rst
 create mode 100644 

[RFC PATCH] powernv/eeh: Fix oops when probing cxl devices

2019-09-27 Thread Frederic Barrat
Recent cleanup in the way EEH support is added to a device causes a
kernel oops when the cxl driver probes a device and creates virtual
devices discovered on the FPGA:

BUG: Kernel NULL pointer dereference at 0x00a0
Faulting instruction address: 0xc0048070
Oops: Kernel access of bad area, sig: 7 [#1]
...
NIP [c0048070] eeh_add_device_late.part.9+0x50/0x1e0
LR [c004805c] eeh_add_device_late.part.9+0x3c/0x1e0
Call Trace:
[c000200e43983900] [c079e250] _dev_info+0x5c/0x6c (unreliable)
[c000200e43983980] [c00d1ad0] pnv_pcibios_bus_add_device+0x60/0xb0
[c000200e439839f0] [c00606d0] pcibios_bus_add_device+0x40/0x60
[c000200e43983a10] [c06aa3a0] pci_bus_add_device+0x30/0x100
[c000200e43983a80] [c06aa4d4] pci_bus_add_devices+0x64/0xd0
[c000200e43983ac0] [c0081c429118] cxl_pci_vphb_add+0xe0/0x130 [cxl]
[c000200e43983b00] [c0081c4242ac] cxl_probe+0x504/0x5b0 [cxl]
[c000200e43983bb0] [c06bba1c] local_pci_probe+0x6c/0x110
[c000200e43983c30] [c0159278] work_for_cpu_fn+0x38/0x60

The root cause is that those cxl virtual devices don't have a
representation in the device tree and therefore no associated pci_dn
structure. In eeh_add_device_late(), pdn is NULL, so edev is NULL and
we oops.

We never had explicit support for EEH for those virtual
devices. Instead, EEH events are reported to the (real) pci device and
handled by the cxl driver, which can then forward them to the virtual
devices and handle dependencies. The fact that we try adding EEH
support for the virtual devices is new and a side-effect of the recent
cleanup.

This patch fixes it by skipping adding EEH support on powernv for
devices which don't have a pci_dn structure.

Fixes: b905f8cdca77 ("powerpc/eeh: EEH for pSeries hot plug")
Signed-off-by: Frederic Barrat 
---

Sending as an RFC, as I'm afraid of hiding potential issues and would
be interested in comments. The powernv eeh code expects a struct
pci_dn, so the fix seems safe. I'm wondering if there could be cases
(other than capi virtual devices) where we'd want to blow up and fix
instead of going undetected with this patch.


 arch/powerpc/platforms/powernv/eeh-powernv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6bc24a47e9ef..6f300ab7f0e9 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -42,7 +42,7 @@ void pnv_pcibios_bus_add_device(struct pci_dev *pdev)
 {
struct pci_dn *pdn = pci_get_pdn(pdev);
 
-   if (eeh_has_flag(EEH_FORCE_DISABLED))
+   if (!pdn || eeh_has_flag(EEH_FORCE_DISABLED))
return;
 
	dev_dbg(&pdev->dev, "EEH: Setting up device\n");
-- 
2.21.0



Re: [PATCH v1 1/2] PCI/AER: Use for_each_set_bit()

2019-09-27 Thread Bjorn Helgaas
On Tue, Aug 27, 2019 at 06:18:22PM +0300, Andy Shevchenko wrote:
> This simplifies and standardizes slot manipulation code
> by using for_each_set_bit() library function.
> 
> Signed-off-by: Andy Shevchenko 
> ---
>  drivers/pci/pcie/aer.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index b45bc47d04fe..f883f81d759a 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -15,6 +15,7 @@
>  #define pr_fmt(fmt) "AER: " fmt
>  #define dev_fmt pr_fmt
>  
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -657,7 +658,8 @@ const struct attribute_group aer_stats_attr_group = {
>  static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
>  struct aer_err_info *info)
>  {
> - int status, i, max = -1;
> + unsigned long status = info->status & ~info->mask;
> + int i, max = -1;
>   u64 *counter = NULL;
>   struct aer_stats *aer_stats = pdev->aer_stats;
>  
> @@ -682,10 +684,8 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
>   break;
>   }
>  
> - status = (info->status & ~info->mask);
> - for (i = 0; i < max; i++)
> - if (status & (1 << i))
> - counter[i]++;
> + for_each_set_bit(i, &status, max)

I applied this, but I confess to being a little ambivalent.  It's
arguably a little easier to read, but it's not nearly as efficient
(not a great concern here) and more importantly much harder to verify
that it's correct because you have to chase through
for_each_set_bit(), find_first_bit(), __ffs(), etc, etc.  No doubt it's
great for bitmaps of arbitrary size, but for a simple 32-bit register
I'm a little hesitant.  But I applied it anyway.

> + counter[i]++;
>  }
>  
>  static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
> @@ -717,14 +717,11 @@ static void __print_tlp_header(struct pci_dev *dev,
>  static void __aer_print_error(struct pci_dev *dev,
> struct aer_err_info *info)
>  {
> - int i, status;
> + unsigned long status = info->status & ~info->mask;
>   const char *errmsg = NULL;
> - status = (info->status & ~info->mask);
> -
> - for (i = 0; i < 32; i++) {
> - if (!(status & (1 << i)))
> - continue;
> + int i;
>  
> + for_each_set_bit(i, &status, 32) {
>   if (info->severity == AER_CORRECTABLE)
>   errmsg = i < ARRAY_SIZE(aer_correctable_error_string) ?
>   aer_correctable_error_string[i] : NULL;
> -- 
> 2.23.0.rc1
> 


[PATCH v2 6/6] KVM: PPC: Book3S HV: XIVE: Allow userspace to set the # of VPs

2019-09-27 Thread Greg Kurz
Add a new attribute to both XIVE and XICS-on-XIVE KVM devices so that
userspace can tell how many interrupt servers it needs. If a VM needs
fewer than the current default of KVM_MAX_VCPUS (2048), we can allocate
fewer VPs in OPAL. Combined with a core stride (VSMT) that matches the
number of guest threads per core, this may substantially increase the
number of VMs that can run concurrently with an in-kernel XIVE device.

Since the legacy XIVE KVM device is exposed to userspace through the
XICS KVM API, a new attribute group is added to it for this purpose.
While here, fix the syntax of the existing KVM_DEV_XICS_GRP_SOURCES
in the XICS documentation.
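
For illustration, userspace would set the new attribute roughly like
this (a sketch, not actual QEMU code; xics_fd and max_vcpu_id are
placeholders):

	__u32 nr_servers = max_vcpu_id;	/* highest vcpu id plus one */
	struct kvm_device_attr attr = {
		.group = KVM_DEV_XICS_GRP_CTRL,
		.attr = KVM_DEV_XICS_NR_SERVERS,
		.addr = (__u64)(uintptr_t)&nr_servers,
	};

	if (ioctl(xics_fd, KVM_SET_DEVICE_ATTR, &attr) < 0)
		err(1, "KVM_DEV_XICS_NR_SERVERS");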

Signed-off-by: Greg Kurz 
Reviewed-by: Cédric Le Goater 
---
v2: - changelog update
---
 Documentation/virt/kvm/devices/xics.txt |   14 --
 Documentation/virt/kvm/devices/xive.txt |8 
 arch/powerpc/include/uapi/asm/kvm.h |3 +++
 arch/powerpc/kvm/book3s_xive.c  |   10 ++
 arch/powerpc/kvm/book3s_xive_native.c   |3 +++
 5 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/Documentation/virt/kvm/devices/xics.txt 
b/Documentation/virt/kvm/devices/xics.txt
index 42864935ac5d..423332dda7bc 100644
--- a/Documentation/virt/kvm/devices/xics.txt
+++ b/Documentation/virt/kvm/devices/xics.txt
@@ -3,9 +3,19 @@ XICS interrupt controller
 Device type supported: KVM_DEV_TYPE_XICS
 
 Groups:
-  KVM_DEV_XICS_SOURCES
+  1. KVM_DEV_XICS_GRP_SOURCES
   Attributes: One per interrupt source, indexed by the source number.
 
+  2. KVM_DEV_XICS_GRP_CTRL
+  Attributes:
+2.1 KVM_DEV_XICS_NR_SERVERS (write only)
+  The kvm_device_attr.addr points to a __u32 value which is the number of
+  interrupt server numbers (ie, highest possible vcpu id plus one).
+  Errors:
+-EINVAL: Value greater than KVM_MAX_VCPU_ID.
+-EFAULT: Invalid user pointer for attr->addr.
+-EBUSY:  A vcpu is already connected to the device.
+
 This device emulates the XICS (eXternal Interrupt Controller
 Specification) defined in PAPR.  The XICS has a set of interrupt
 sources, each identified by a 20-bit source number, and a set of
@@ -38,7 +48,7 @@ least-significant end of the word:
 
 Each source has 64 bits of state that can be read and written using
 the KVM_GET_DEVICE_ATTR and KVM_SET_DEVICE_ATTR ioctls, specifying the
-KVM_DEV_XICS_SOURCES attribute group, with the attribute number being
+KVM_DEV_XICS_GRP_SOURCES attribute group, with the attribute number being
 the interrupt source number.  The 64 bit state word has the following
 bitfields, starting from the least-significant end of the word:
 
diff --git a/Documentation/virt/kvm/devices/xive.txt 
b/Documentation/virt/kvm/devices/xive.txt
index 9a24a4525253..f5d1d6b5af61 100644
--- a/Documentation/virt/kvm/devices/xive.txt
+++ b/Documentation/virt/kvm/devices/xive.txt
@@ -78,6 +78,14 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
 migrating the VM.
 Errors: none
 
+1.3 KVM_DEV_XIVE_NR_SERVERS (write only)
+The kvm_device_attr.addr points to a __u32 value which is the number of
+interrupt server numbers (ie, highest possible vcpu id plus one).
+Errors:
+  -EINVAL: Value greater than KVM_MAX_VCPU_ID.
+  -EFAULT: Invalid user pointer for attr->addr.
+  -EBUSY:  A vCPU is already connected to the device.
+
   2. KVM_DEV_XIVE_GRP_SOURCE (write only)
   Initializes a new source in the XIVE device and mask it.
   Attributes:
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index b0f72dea8b11..264e266a85bf 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -667,6 +667,8 @@ struct kvm_ppc_cpu_char {
 
 /* PPC64 eXternal Interrupt Controller Specification */
 #define KVM_DEV_XICS_GRP_SOURCES   1   /* 64-bit source attributes */
+#define KVM_DEV_XICS_GRP_CTRL  2
+#define   KVM_DEV_XICS_NR_SERVERS  1
 
 /* Layout of 64-bit source attribute values */
 #define  KVM_XICS_DESTINATION_SHIFT0
@@ -683,6 +685,7 @@ struct kvm_ppc_cpu_char {
 #define KVM_DEV_XIVE_GRP_CTRL  1
 #define   KVM_DEV_XIVE_RESET   1
 #define   KVM_DEV_XIVE_EQ_SYNC 2
+#define   KVM_DEV_XIVE_NR_SERVERS  3
 #define KVM_DEV_XIVE_GRP_SOURCE2   /* 64-bit source 
identifier */
 #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3   /* 64-bit source identifier */
 #define KVM_DEV_XIVE_GRP_EQ_CONFIG 4   /* 64-bit EQ identifier */
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 6c35b3d95986..66858b7d3c6b 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1911,6 +1911,11 @@ static int xive_set_attr(struct kvm_device *dev, struct 
kvm_device_attr *attr)
switch (attr->group) {
case KVM_DEV_XICS_GRP_SOURCES:
return xive_set_source(xive, attr->attr, attr->addr);
+   case KVM_DEV_XICS_GRP_CTRL:
+   switch (attr->attr) {
+  

[PATCH v2 5/6] KVM: PPC: Book3S HV: XIVE: Make VP block size configurable

2019-09-27 Thread Greg Kurz
The XIVE VP is an internal structure which allows the XIVE interrupt
controller to maintain the interrupt context state of vCPUs not
dispatched on HW threads.

When a guest is started, the XIVE KVM device allocates a block of
XIVE VPs in OPAL, enough to accommodate the highest possible vCPU
id KVM_MAX_VCPU_ID (16384) packed down to KVM_MAX_VCPUS (2048).
With a guest's core stride of 8 and a threading mode of 1 (QEMU's
default), a VM must run at least 256 vCPUs to actually need such a
range of VPs.

A POWER9 system has a limited XIVE VP space: 512k, and KVM is
currently wasting this HW resource with large VP allocations,
especially since a typical VM likely runs with a lot fewer vCPUs.

Make the size of the VP block configurable. Add an nr_servers
field to the XIVE structure and a function to set it for this
purpose.

Split VP allocation out of the device create function. Since the
VP block isn't used before the first vCPU connects to the XIVE KVM
device, allocation is now performed by kvmppc_xive_connect_vcpu().
This gives the opportunity to set nr_servers in between:

  kvmppc_xive_create() / kvmppc_xive_native_create()
   .
   .
 kvmppc_xive_set_nr_servers()
   .
   .
kvmppc_xive_connect_vcpu() / kvmppc_xive_native_connect_vcpu()

The connect_vcpu() functions check that the vCPU id is below nr_servers
and if it is the first vCPU they allocate the VP block. This is protected
against a concurrent update of nr_servers by kvmppc_xive_set_nr_servers()
with the xive->lock mutex.

Also, the block is allocated once for the device lifetime: nr_servers
should stay constant, otherwise connect_vcpu() could generate a bogus
VP id and likely crash OPAL. It is thus forbidden to update nr_servers
once the block is allocated.

If the VP allocation fails, return ENOSPC, which seems more appropriate
for reporting the depletion of a system-wide HW resource than ENOMEM or
ENXIO.

A VM using a stride of 8 and 1 thread per core with 32 vCPUs would hence
only need 256 VPs instead of 2048. If the stride is set to match the number
of threads per core, this goes further down to 32.
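
Spelling out that arithmetic: with a stride of 8 and 1 thread per core,
the 32 vCPUs get raw ids 0, 8, 16, ..., 248, so userspace needs
nr_servers = 249 (highest vCPU id plus one), which the power-of-two VP
block allocation rounds up to 256. With the stride matching the
threading mode, the raw ids are 0..31, nr_servers = 32, and a 32-entry
VP block suffices.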

This will be exposed to userspace by a subsequent patch.

Signed-off-by: Greg Kurz 
---
v2: - update nr_server check and clip down to KVM_MAX_VCPUS
  in kvmppc_xive_set_nr_servers()
---
 arch/powerpc/kvm/book3s_xive.c|   65 +++--
 arch/powerpc/kvm/book3s_xive.h|4 ++
 arch/powerpc/kvm/book3s_xive_native.c |   18 +++--
 3 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index d84da9f6ee88..6c35b3d95986 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1213,13 +1213,13 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
 
 static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
 {
-   /* We have a block of KVM_MAX_VCPUS VPs. We just need to check
+   /* We have a block of xive->nr_servers VPs. We just need to check
 * raw vCPU ids are below the expected limit for this guest's
 * core stride ; kvmppc_pack_vcpu_id() will pack them down to an
 * index that can be safely used to compute a VP id that belongs
 * to the VP block.
 */
-   return cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode;
+   return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode;
 }
 
 int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
@@ -1231,6 +1231,14 @@ int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, 
u32 cpu, u32 *vp)
return -EINVAL;
}
 
+   if (xive->vp_base == XIVE_INVALID_VP) {
+   xive->vp_base = xive_native_alloc_vp_block(xive->nr_servers);
+   pr_devel("VP_Base=%x nr_servers=%d\n", xive->vp_base, 
xive->nr_servers);
+
+   if (xive->vp_base == XIVE_INVALID_VP)
+   return -ENOSPC;
+   }
+
vp_id = kvmppc_xive_vp(xive, cpu);
if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
pr_devel("Duplicate !\n");
@@ -1858,6 +1866,43 @@ int kvmppc_xive_set_irq(struct kvm *kvm, int 
irq_source_id, u32 irq, int level,
return 0;
 }
 
+int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr)
+{
+   u32 __user *ubufp = (u32 __user *) addr;
+   u32 nr_servers;
+   int rc = 0;
+
+   if (get_user(nr_servers, ubufp))
+   return -EFAULT;
+
+   pr_devel("%s nr_servers=%u\n", __func__, nr_servers);
+
+   if (!nr_servers || nr_servers > KVM_MAX_VCPU_ID)
+   return -EINVAL;
+
+   mutex_lock(&xive->lock);
+   if (xive->vp_base != XIVE_INVALID_VP)
+   /* The VP block is allocated once and freed when the device
+* is released. Better not allow to change its size since its
+

[PATCH v2 4/6] KVM: PPC: Book3S HV: XIVE: Compute the VP id in a common helper

2019-09-27 Thread Greg Kurz
Reduce code duplication by consolidating the checking of vCPU ids and VP
ids into a common helper used by both legacy and native XIVE KVM devices,
and explain the magic with a comment.

Signed-off-by: Greg Kurz 
---
 arch/powerpc/kvm/book3s_xive.c|   42 ++---
 arch/powerpc/kvm/book3s_xive.h|1 +
 arch/powerpc/kvm/book3s_xive_native.c |   11 ++---
 3 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 0b7859e40f66..d84da9f6ee88 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1211,6 +1211,37 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
vcpu->arch.xive_vcpu = NULL;
 }
 
+static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu)
+{
+   /* We have a block of KVM_MAX_VCPUS VPs. We just need to check
+* raw vCPU ids are below the expected limit for this guest's
+* core stride ; kvmppc_pack_vcpu_id() will pack them down to an
+* index that can be safely used to compute a VP id that belongs
+* to the VP block.
+*/
+   return cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode;
+}
+
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp)
+{
+   u32 vp_id;
+
+   if (!kvmppc_xive_vcpu_id_valid(xive, cpu)) {
+   pr_devel("Out of bounds !\n");
+   return -EINVAL;
+   }
+
+   vp_id = kvmppc_xive_vp(xive, cpu);
+   if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
+   pr_devel("Duplicate !\n");
+   return -EEXIST;
+   }
+
+   *vp = vp_id;
+
+   return 0;
+}
+
 int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 struct kvm_vcpu *vcpu, u32 cpu)
 {
@@ -1229,20 +1260,13 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
return -EPERM;
if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
-   if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
-   pr_devel("Out of bounds !\n");
-   return -EINVAL;
-   }
 
/* We need to synchronize with queue provisioning */
	mutex_lock(&xive->lock);
 
-   vp_id = kvmppc_xive_vp(xive, cpu);
-   if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
-   pr_devel("Duplicate !\n");
-   r = -EEXIST;
+   r = kvmppc_xive_compute_vp_id(xive, cpu, &vp_id);
+   if (r)
goto bail;
-   }
 
xc = kzalloc(sizeof(*xc), GFP_KERNEL);
if (!xc) {
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index fe3ed50e0818..90cf6ec35a68 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -296,6 +296,7 @@ int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 
prio,
 struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
 void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
struct kvmppc_xive_vcpu *xc, int irq);
+int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp);
 
 #endif /* CONFIG_KVM_XICS */
 #endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c 
b/arch/powerpc/kvm/book3s_xive_native.c
index 43a86858390a..5bb480b2aafd 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -118,19 +118,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device 
*dev,
return -EPERM;
if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
-   if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
-   pr_devel("Out of bounds !\n");
-   return -EINVAL;
-   }
 
	mutex_lock(&xive->lock);
 
-   vp_id = kvmppc_xive_vp(xive, server_num);
-   if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
-   pr_devel("Duplicate !\n");
-   rc = -EEXIST;
+   rc = kvmppc_xive_compute_vp_id(xive, server_num, &vp_id);
+   if (rc)
goto bail;
-   }
 
xc = kzalloc(sizeof(*xc), GFP_KERNEL);
if (!xc) {



[PATCH v2 3/6] KVM: PPC: Book3S HV: XIVE: Show VP id in debugfs

2019-09-27 Thread Greg Kurz
Print out the VP id of each connected vCPU; this allows one to see:
- the VP block base in which OPAL encodes information that may be
  useful when debugging
- the packed vCPU id which may differ from the raw vCPU id if the
  latter is >= KVM_MAX_VCPUS (2048)

Signed-off-by: Greg Kurz 
---
 arch/powerpc/kvm/book3s_xive.c|4 ++--
 arch/powerpc/kvm/book3s_xive_native.c |4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index baa740815b3c..0b7859e40f66 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -2107,9 +2107,9 @@ static int xive_debug_show(struct seq_file *m, void 
*private)
if (!xc)
continue;
 
-   seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x"
+   seq_printf(m, "cpu server %#x VP:%#x CPPR:%#x HWCPPR:%#x"
   " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
-  xc->server_num, xc->cppr, xc->hw_cppr,
+  xc->server_num, xc->vp_id, xc->cppr, xc->hw_cppr,
   xc->mfrr, xc->pending,
   xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
 
diff --git a/arch/powerpc/kvm/book3s_xive_native.c 
b/arch/powerpc/kvm/book3s_xive_native.c
index ebb4215baf45..43a86858390a 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -1204,8 +1204,8 @@ static int xive_native_debug_show(struct seq_file *m, 
void *private)
if (!xc)
continue;
 
-   seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IBP=%02x 
PIPR=%02x w01=%016llx w2=%08x\n",
-  xc->server_num,
+   seq_printf(m, "cpu server %#x VP=%#x NSR=%02x CPPR=%02x 
IBP=%02x PIPR=%02x w01=%016llx w2=%08x\n",
+  xc->server_num, xc->vp_id,
   vcpu->arch.xive_saved_state.nsr,
   vcpu->arch.xive_saved_state.cppr,
   vcpu->arch.xive_saved_state.ipb,



[PATCH v2 2/6] KVM: PPC: Book3S HV: XIVE: Ensure VP isn't already in use

2019-09-27 Thread Greg Kurz
Connecting a vCPU to a XIVE KVM device means establishing a 1:1
association between a vCPU id and the offset (VP id) of a VP
structure within a fixed size block of VPs. We currently try to
enforce the 1:1 relationship by checking that a vCPU with the
same id isn't already connected. This is good but unfortunately
not enough because we don't map VP ids to raw vCPU ids but to
packed vCPU ids, and the packing function kvmppc_pack_vcpu_id()
isn't bijective by design. We got away with it because QEMU passes
vCPU ids that fit well in the packing pattern. But nothing prevents
userspace from coming up with a forged vCPU id resulting in a packed id
collision, which causes the KVM device to associate two vCPUs to the
same VP. This greatly confuses the irq layer and ultimately crashes
the kernel, as shown below.

Example: a guest with 1 guest thread per core, a core stride of
8 and 300 vCPUs has vCPU ids 0,8,16...2392. If QEMU is patched to
inject at some point an invalid vCPU id 348, which is the packed
version of itself and 2392, we get:

genirq: Flags mismatch irq 199. 0001 (kvm-2-2392) vs. 0001 (kvm-2-348)
CPU: 24 PID: 88176 Comm: qemu-system-ppc Not tainted 
5.3.0-xive-nr-servers-5.3-gku+ #38
Call Trace:
[c03f7f9937e0] [c0c0110c] dump_stack+0xb0/0xf4 (unreliable)
[c03f7f993820] [c01cb480] __setup_irq+0xa70/0xad0
[c03f7f9938d0] [c01cb75c] request_threaded_irq+0x13c/0x260
[c03f7f993940] [c0080d44e7ac] kvmppc_xive_attach_escalation+0x104/0x270 
[kvm]
[c03f7f9939d0] [c0080d45013c] kvmppc_xive_connect_vcpu+0x424/0x620 [kvm]
[c03f7f993ac0] [c0080d28] kvm_arch_vcpu_ioctl+0x260/0x448 [kvm]
[c03f7f993b90] [c0080d43593c] kvm_vcpu_ioctl+0x154/0x7c8 [kvm]
[c03f7f993d00] [c04840f0] do_vfs_ioctl+0xe0/0xc30
[c03f7f993db0] [c0484d44] ksys_ioctl+0x104/0x120
[c03f7f993e00] [c0484d88] sys_ioctl+0x28/0x80
[c03f7f993e20] [c000b278] system_call+0x5c/0x68
xive-kvm: Failed to request escalation interrupt for queue 0 of VCPU 2392
[ cut here ]
remove_proc_entry: removing non-empty directory 'irq/199', leaking at least 'kvm-2-348'
WARNING: CPU: 24 PID: 88176 at /home/greg/Work/linux/kernel-kvm-ppc/fs/proc/generic.c:684 remove_proc_entry+0x1ec/0x200
Modules linked in: kvm_hv kvm dm_mod vhost_net vhost tap xt_CHECKSUM 
iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack 
nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 tun bridge stp llc 
ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter squashfs loop 
fuse i2c_dev sg ofpart ocxl powernv_flash at24 xts mtd uio_pdrv_genirq 
vmx_crypto opal_prd ipmi_powernv uio ipmi_devintf ipmi_msghandler ibmpowernv 
ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi 
scsi_transport_iscsi ip_tables ext4 mbcache jbd2 raid10 raid456 
async_raid6_recov async_memcpy async_pq async_xor xor async_tx raid6_pq 
libcrc32c raid1 raid0 linear sd_mod ast i2c_algo_bit drm_vram_helper ttm 
drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops drm ahci libahci 
libata tg3 drm_panel_orientation_quirks [last unloaded: kvm]
CPU: 24 PID: 88176 Comm: qemu-system-ppc Not tainted 5.3.0-xive-nr-servers-5.3-gku+ #38
NIP:  c053b0cc LR: c053b0c8 CTR: c00ba3b0
REGS: c03f7f9934b0 TRAP: 0700   Not tainted  (5.3.0-xive-nr-servers-5.3-gku+)
MSR:  90029033   CR: 48228222  XER: 2004
CFAR: c0131a50 IRQMASK: 0
GPR00: c053b0c8 c03f7f993740 c15ec500 0057
GPR04: 0001  49fb98484262 1bcf
GPR08: 0007 0007 0001 90001033
GPR12: 8000 c03eb800  00012f4ce5a1
GPR16: 00012ef5a0c8  00012f113bb0 
GPR20: 00012f45d918 c03f863758b0 c03f86375870 0006
GPR24: c03f86375a30 0007 c0002039373d9020 c14c4a48
GPR28: 0001 c03fe62a4f6b c00020394b2e9fab c03fe62a4ec0
NIP [c053b0cc] remove_proc_entry+0x1ec/0x200
LR [c053b0c8] remove_proc_entry+0x1e8/0x200
Call Trace:
[c03f7f993740] [c053b0c8] remove_proc_entry+0x1e8/0x200 (unreliable)
[c03f7f9937e0] [c01d3654] unregister_irq_proc+0x114/0x150
[c03f7f993880] [c01c6284] free_desc+0x54/0xb0
[c03f7f9938c0] [c01c65ec] irq_free_descs+0xac/0x100
[c03f7f993910] [c01d1ff8] irq_dispose_mapping+0x68/0x80
[c03f7f993940] [c0080d44e8a4] kvmppc_xive_attach_escalation+0x1fc/0x270 [kvm]
[c03f7f9939d0] [c0080d45013c] kvmppc_xive_connect_vcpu+0x424/0x620 [kvm]
[c03f7f993ac0] [c0080d28] kvm_arch_vcpu_ioctl+0x260/0x448 [kvm]
[c03f7f993b90] [c0080d43593c] kvm_vcpu_ioctl+0x154/0x7c8 [kvm]
[c03f7f993d00] [c04840f0] do_vfs_ioctl+0xe0/0xc30
[c03f7f993db0] [c0484d44] ksys_ioctl+0x104/0x120
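
In sketch form, the guard this patch title implies scans the vCPUs already
connected to the device and rejects a duplicate VP id. The helper below is
an assumed shape for illustration, not the patch itself:

/* Sketch: return true if some connected vCPU already owns vp_id. */
static bool xive_vp_in_use_sketch(struct kvm *kvm, u32 vp_id)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (vcpu->arch.xive_vcpu &&
		    vp_id == vcpu->arch.xive_vcpu->vp_id)
			return true;
	}
	return false;
}

Connecting a vCPU would then fail with -EEXIST when this returns true.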

[PATCH v2 1/6] KVM: PPC: Book3S HV: XIVE: Set kvm->arch.xive when VPs are allocated

2019-09-27 Thread Greg Kurz
If we cannot allocate the XIVE VPs in OPAL, the creation of a XIVE or
XICS-on-XIVE device is aborted as expected, but we leave kvm->arch.xive
set forever since the release method isn't called in this case. Any
subsequent attempt to create a XIVE or XICS-on-XIVE device for this VM will
thus always fail (DoS). This is a problem for QEMU since it destroys
and re-creates these devices when the VM is reset: the VM would be
restricted to using the much slower emulated XIVE or XICS forever.

As an alternative to adding rollback, do not assign kvm->arch.xive before
making sure the XIVE VPs are allocated in OPAL.

Cc: sta...@vger.kernel.org # v5.2
Fixes: 5422e95103cf ("KVM: PPC: Book3S HV: XIVE: Replace the 'destroy' method by a 'release' method")
Signed-off-by: Greg Kurz 
Reviewed-by: Cédric Le Goater 
---
 arch/powerpc/kvm/book3s_xive.c|   11 +--
 arch/powerpc/kvm/book3s_xive_native.c |2 +-
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 591bfb4bfd0f..99884662facb 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1997,6 +1997,10 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
 
pr_devel("Creating xive for partition\n");
 
+   /* Already there ? */
+   if (kvm->arch.xive)
+   return -EEXIST;
+
xive = kvmppc_xive_get_device(kvm, type);
if (!xive)
return -ENOMEM;
@@ -2006,12 +2010,6 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
xive->kvm = kvm;
mutex_init(&xive->lock);
 
-   /* Already there ? */
-   if (kvm->arch.xive)
-   ret = -EEXIST;
-   else
-   kvm->arch.xive = xive;
-
/* We use the default queue size set by the host */
xive->q_order = xive_native_default_eq_shift();
if (xive->q_order < PAGE_SHIFT)
@@ -2031,6 +2029,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
if (ret)
return ret;
 
+   kvm->arch.xive = xive;
return 0;
 }
 
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
index 248c1ea9e788..6b91c74839d5 100644
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -1079,7 +1079,6 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
dev->private = xive;
xive->dev = dev;
xive->kvm = kvm;
-   kvm->arch.xive = xive;
mutex_init(&xive->mapping_lock);
mutex_init(&xive->lock);
 
@@ -1100,6 +1099,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
if (ret)
return ret;
 
+   kvm->arch.xive = xive;
return 0;
 }
 



[PATCH v2 0/6] KVM: PPC: Book3S: HV: XIVE: Allocate less VPs in OPAL

2019-09-27 Thread Greg Kurz
This brings some fixes and makes it possible to start more VMs with an
in-kernel XIVE or XICS-on-XIVE device.

Changes since v1 (https://patchwork.ozlabs.org/cover/1166099/):
- drop a useless patch
- add a patch to show VP ids in debugfs
- update some changelogs
- fix buggy check in patch 5
- Cc: stable 

--
Greg

---

Greg Kurz (6):
  KVM: PPC: Book3S HV: XIVE: Set kvm->arch.xive when VPs are allocated
  KVM: PPC: Book3S HV: XIVE: Ensure VP isn't already in use
  KVM: PPC: Book3S HV: XIVE: Show VP id in debugfs
  KVM: PPC: Book3S HV: XIVE: Compute the VP id in a common helper
  KVM: PPC: Book3S HV: XIVE: Make VP block size configurable
  KVM: PPC: Book3S HV: XIVE: Allow userspace to set the # of VPs


 Documentation/virt/kvm/devices/xics.txt |   14 +++
 Documentation/virt/kvm/devices/xive.txt |8 ++
 arch/powerpc/include/uapi/asm/kvm.h |3 +
 arch/powerpc/kvm/book3s_xive.c  |  142 ---
 arch/powerpc/kvm/book3s_xive.h  |   17 
 arch/powerpc/kvm/book3s_xive_native.c   |   40 +++--
 6 files changed, 167 insertions(+), 57 deletions(-)



Re: [PATCH v1] powerpc/pseries: CMM: Drop page array

2019-09-27 Thread David Hildenbrand
On 25.09.19 09:37, David Hildenbrand wrote:
> On 10.09.19 18:39, David Hildenbrand wrote:
>> We can simply store the pages in a list (page->lru), no need for a
>> separate data structure (+ complicated handling). This is how most
>> other balloon drivers store allocated pages without additional tracking
>> data.
>>
>> For the notifiers, use page_to_pfn() to check if a page is in the
>> applicable range. plpar_page_set_loaned()/plpar_page_set_active() were
>> called with __pa(page_address()) for now, I assume we can simply switch
>> to page_to_phys() here. The pfn_to_kaddr() handling is now mostly gone.
>>
>> Cc: Benjamin Herrenschmidt 
>> Cc: Paul Mackerras 
>> Cc: Michael Ellerman 
>> Cc: Arun KS 
>> Cc: Pavel Tatashin 
>> Cc: Thomas Gleixner 
>> Cc: Andrew Morton 
>> Cc: Vlastimil Babka 
>> Signed-off-by: David Hildenbrand 
>> ---
>>
>> Only compile-tested. I hope the page_to_phys() thingy is correct and I
>> didn't mess up something else / ignoring something important why the array
>> is needed.
>>
>> I stumbled over this while looking at how the memory isolation notifier is
>> used - and wondered why the additional array is necessary. Also, I think
>> by switching to the generic balloon compaction mechanism, we could get
>> rid of the memory hotplug notifier and the memory isolation notifier in
>> this code, as the migration capability of the inflated pages is the real
>> requirement:
>>  commit 14b8a76b9d53346f2871bf419da2aaf219940c50
>>  Author: Robert Jennings 
>>  Date:   Thu Dec 17 14:44:52 2009 +
>>  
>>  powerpc: Make the CMM memory hotplug aware
>>  
>>  The Collaborative Memory Manager (CMM) module allocates individual pages
>>  over time that are not migratable.  On a long running system this can
>>  severely impact the ability to find enough pages to support a hotplug
>>  memory remove operation.
>>  [...]
>>
>> Thoughts?
> 
> Ping, is this feature still used at all?
> 
> If nobody can test, any advice on which HW I need and how to trigger it?
> 

So ... if CMM is no longer alive I propose ripping it out completely.
Does anybody know if this feature is still getting used? Getting rid of
the memory isolation notifier sounds desirable - either by scrapping CMM
or by properly wiring up balloon compaction.

-- 

Thanks,

David / dhildenb
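
For readers unfamiliar with the pattern David describes: while a balloon
driver owns an allocated page, the page's lru field is otherwise unused, so
it can serve as the list linkage. A minimal sketch (all names hypothetical):

#include <linux/list.h>
#include <linux/mm.h>

static LIST_HEAD(cmm_page_list);	/* hypothetical list head */

static void cmm_track(struct page *page)
{
	/* page->lru is free while the driver holds the page */
	list_add(&page->lru, &cmm_page_list);
}

static struct page *cmm_untrack(void)
{
	struct page *page;

	if (list_empty(&cmm_page_list))
		return NULL;
	page = list_first_entry(&cmm_page_list, struct page, lru);
	list_del(&page->lru);
	return page;
}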


Re: [PATCH v1 1/2] PCI/AER: Use for_each_set_bit()

2019-09-27 Thread Bjorn Helgaas
On Tue, Aug 27, 2019 at 06:18:22PM +0300, Andy Shevchenko wrote:
> This simplifies and standardizes the status-bit iteration code
> by using the for_each_set_bit() library function.
> 
> Signed-off-by: Andy Shevchenko 

Applied both with Kuppuswamy's reviewed-by to pci/aer for v5.5,
thanks!

> ---
>  drivers/pci/pcie/aer.c | 19 ---
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index b45bc47d04fe..f883f81d759a 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -15,6 +15,7 @@
>  #define pr_fmt(fmt) "AER: " fmt
>  #define dev_fmt pr_fmt
>  
> +#include <linux/bitops.h>
>  #include 
>  #include 
>  #include 
> @@ -657,7 +658,8 @@ const struct attribute_group aer_stats_attr_group = {
>  static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
>  struct aer_err_info *info)
>  {
> - int status, i, max = -1;
> + unsigned long status = info->status & ~info->mask;
> + int i, max = -1;
>   u64 *counter = NULL;
>   struct aer_stats *aer_stats = pdev->aer_stats;
>  
> @@ -682,10 +684,8 @@ static void pci_dev_aer_stats_incr(struct pci_dev *pdev,
>   break;
>   }
>  
> - status = (info->status & ~info->mask);
> - for (i = 0; i < max; i++)
> - if (status & (1 << i))
> - counter[i]++;
> + for_each_set_bit(i, &status, max)
> + counter[i]++;
>  }
>  
>  static void pci_rootport_aer_stats_incr(struct pci_dev *pdev,
> @@ -717,14 +717,11 @@ static void __print_tlp_header(struct pci_dev *dev,
>  static void __aer_print_error(struct pci_dev *dev,
> struct aer_err_info *info)
>  {
> - int i, status;
> + unsigned long status = info->status & ~info->mask;
>   const char *errmsg = NULL;
> - status = (info->status & ~info->mask);
> -
> - for (i = 0; i < 32; i++) {
> - if (!(status & (1 << i)))
> - continue;
> + int i;
>  
> + for_each_set_bit(i, &status, 32) {
>   if (info->severity == AER_CORRECTABLE)
>   errmsg = i < ARRAY_SIZE(aer_correctable_error_string) ?
>   aer_correctable_error_string[i] : NULL;
> -- 
> 2.23.0.rc1
> 
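
As background for the change above: for_each_set_bit() iterates over an
unsigned long bitmap, which is why the patch also changes the type of
status. A minimal usage sketch (hypothetical function, not from the patch):

#include <linux/bitops.h>
#include <linux/printk.h>

static void print_set_bits(u32 raw)
{
	unsigned long status = raw;	/* for_each_set_bit() wants unsigned long */
	unsigned int i;

	for_each_set_bit(i, &status, 32)
		pr_info("bit %u is set\n", i);
}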


Re: [PATCH v4 5/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-09-27 Thread Mark Marshall
Comment below...

On Thu, 26 Sep 2019 at 12:18, Alastair D'Silva  wrote:
>
> From: Alastair D'Silva 
>
> When presented with large amounts of memory being hotplugged
> (in my test case, ~890GB), the call to flush_dcache_range takes
> a while (~50 seconds), triggering RCU stalls.
>
> This patch breaks up the call into 1GB chunks, calling
> cond_resched() in between to allow the scheduler to run.
>
> Signed-off-by: Alastair D'Silva 
> ---
>  arch/powerpc/mm/mem.c | 27 +--
>  1 file changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index cff947cb2a84..a2758e473d58 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -104,6 +104,27 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
> return -ENODEV;
>  }
>
> +#define FLUSH_CHUNK_SIZE SZ_1G
> +/**
> + * flush_dcache_range_chunked(): Write any modified data cache blocks out to
> + * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE
> + * Does not invalidate the corresponding instruction cache blocks.
> + *
> + * @start: the start address
> + * @stop: the stop address (exclusive)
> + * @chunk: the max size of the chunks
> + */
> +static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
> +  unsigned long chunk)
> +{
> +   unsigned long i;
> +
> +   for (i = start; i < stop; i += FLUSH_CHUNK_SIZE) {
Here you ignore the function parameter "chunk" and use the define
FLUSH_CHUNK_SIZE.
You should do one or the other; use the parameter or remove it.
> +   flush_dcache_range(i, min(stop, start + FLUSH_CHUNK_SIZE));
> +   cond_resched();
> +   }
> +}
> +
>  int __ref arch_add_memory(int nid, u64 start, u64 size,
> struct mhp_restrictions *restrictions)
>  {
> @@ -120,7 +141,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
> start, start + size, rc);
> return -EFAULT;
> }
> -   flush_dcache_range(start, start + size);
> +
> +   flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
>
> return __add_pages(nid, start_pfn, nr_pages, restrictions);
>  }
> @@ -137,7 +159,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
>
> /* Remove htab bolted mappings for this section of memory */
> start = (unsigned long)__va(start);
> -   flush_dcache_range(start, start + size);
> +   flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
> +
> ret = remove_section_mapping(start, start + size);
> WARN_ON_ONCE(ret);
>
> --
> 2.21.0
>
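
Folding in Mark's comment, a corrected sketch would use the chunk argument,
and would also advance the upper bound with i rather than start (as posted,
every iteration flushes the same first chunk):

static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
				       unsigned long chunk)
{
	unsigned long i;

	for (i = start; i < stop; i += chunk) {
		flush_dcache_range(i, min(stop, i + chunk));
		cond_resched();
	}
}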


Re: [PATCH 01/11] powerpc/powernv/ioda: Fix ref count for devices with their own PE

2019-09-27 Thread Alexey Kardashevskiy



On 27/09/2019 03:15, Frederic Barrat wrote:
> 
> 
> Le 26/09/2019 à 18:44, Andrew Donnellan a écrit :
>> On 9/9/19 5:45 pm, Frederic Barrat wrote:
>>> Taking a reference on the pci_dev structure was required with initial
>>> commit 184cd4a3b962 ("powerpc/powernv: PCI support for p7IOC under
>>> OPAL v2"), where we were storing the pci_dev in the pci_dn structure.
>>>
>>> However, the pci_dev was later removed from the pci_dn structure, but
>>> the reference was kept. See commit 902bdc57451c ("powerpc/powernv/idoa:
>>> Remove unnecessary pcidev from pci_dn").
>>>
>>> The pnv_ioda_pe structure life cycle is the same as the pci_dev
>>> structure, the PE is freed when the device is released. So we don't
>>> need a reference for the pci_dev stored in the PE, otherwise the
>>> pci_dev will never be released. Which is not really a surprise as the
>>> comment (removed here as no longer needed) was stating as much.
>>>
>>> Fixes: 902bdc57451c ("powerpc/powernv/idoa: Remove unnecessary pcidev from pci_dn")
>>> Signed-off-by: Frederic Barrat 
>>> ---
>>>   arch/powerpc/platforms/powernv/pci-ioda.c | 11 +--
>>>   1 file changed, 1 insertion(+), 10 deletions(-)
>>>
>>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> index d8080558d020..92767f006f20 100644
>>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>> @@ -1062,14 +1062,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>>>   return NULL;
>>>   }
>>> -    /* NOTE: We get only one ref to the pci_dev for the pdn, not for the
>>> - * pointer in the PE data structure, both should be destroyed at the
>>> - * same time. However, this needs to be looked at more closely again
>>> - * once we actually start removing things (Hotplug, SR-IOV, ...)
>>> - *
>>> - * At some point we want to remove the PDN completely anyways
>>> - */
>>> -    pci_dev_get(dev);
>>>   pdn->pe_number = pe->pe_number;
>>>   pe->flags = PNV_IODA_PE_DEV;
>>>   pe->pdev = dev;
>>> @@ -1084,7 +1076,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
>>>   pnv_ioda_free_pe(pe);
>>>   pdn->pe_number = IODA_INVALID_PE;
>>>   pe->pdev = NULL;
>>> -    pci_dev_put(dev);
>>>   return NULL;
>>>   }
>>> @@ -1228,7 +1219,7 @@ static struct pnv_ioda_pe *pnv_ioda_setup_npu_PE(struct pci_dev *npu_pdev)
>>>    */
>>>   dev_info(&npu_pdev->dev,
>>>   "Associating to existing PE %x\n", pe_num);
>>> -    pci_dev_get(npu_pdev);
>>> +    pci_dev_get(npu_pdev); // still needed after 902bdc57451c ?
>>
>> Did you mean to leave that comment in?
> 
> Yes, I assumed the series wouldn't get in on the first try and an
> nvlink-minded developer would weigh in :)

I am looking at it and I am still not so sure what exactly guarantees that
lifetime(@dev) == lifetime(@pe). For example, sriov_disable() removes VFs
first, and only then releases PEs, so there is a window where we may have
a stale pdev. Not sure.


> 
>   Fred
> 
> 
>>>   npu_pdn = pci_get_pdn(npu_pdev);
>>>   rid = npu_pdev->bus->number << 8 | npu_pdn->devfn;
>>>   npu_pdn->pe_number = pe_num;
>>>
>>
> 

-- 
Alexey


RE: [PATCH v4 5/6] powerpc: Chunk calls to flush_dcache_range in arch_*_memory

2019-09-27 Thread Alastair D'Silva
On Fri, 2019-09-27 at 08:37 +0200, Mark Marshall wrote:
> Comment below...
> 
> On Thu, 26 Sep 2019 at 12:18, Alastair D'Silva wrote:
> > From: Alastair D'Silva 
> > 
> > When presented with large amounts of memory being hotplugged
> > (in my test case, ~890GB), the call to flush_dcache_range takes
> > a while (~50 seconds), triggering RCU stalls.
> > 
> > This patch breaks up the call into 1GB chunks, calling
> > cond_resched() in between to allow the scheduler to run.
> > 
> > Signed-off-by: Alastair D'Silva 
> > ---
> >  arch/powerpc/mm/mem.c | 27 +--
> >  1 file changed, 25 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> > index cff947cb2a84..a2758e473d58 100644
> > --- a/arch/powerpc/mm/mem.c
> > +++ b/arch/powerpc/mm/mem.c
> > @@ -104,6 +104,27 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end)
> > return -ENODEV;
> >  }
> > 
> > +#define FLUSH_CHUNK_SIZE SZ_1G
> > +/**
> > + * flush_dcache_range_chunked(): Write any modified data cache blocks out to
> > + * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE
> > + * Does not invalidate the corresponding instruction cache blocks.
> > + *
> > + * @start: the start address
> > + * @stop: the stop address (exclusive)
> > + * @chunk: the max size of the chunks
> > + */
> > +static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
> > +  unsigned long chunk)
> > +{
> > +   unsigned long i;
> > +
> > +   for (i = start; i < stop; i += FLUSH_CHUNK_SIZE) {
> Here you ignore the function parameter "chunk" and use the define
> FLUSH_CHUNK_SIZE.
> You should do one or the other; use the parameter or remove it.

Good catch, thank you :)

> > +   flush_dcache_range(i, min(stop, start + FLUSH_CHUNK_SIZE));
> > +   cond_resched();
> > +   }
> > +}
> > +
> >  int __ref arch_add_memory(int nid, u64 start, u64 size,
> > struct mhp_restrictions *restrictions)
> >  {
> > @@ -120,7 +141,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
> > start, start + size, rc);
> > return -EFAULT;
> > }
> > -   flush_dcache_range(start, start + size);
> > +
> > +   flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
> > 
> > return __add_pages(nid, start_pfn, nr_pages, restrictions);
> >  }
> > @@ -137,7 +159,8 @@ void __ref arch_remove_memory(int nid, u64 start, u64 size,
> > 
> > /* Remove htab bolted mappings for this section of memory */
> > start = (unsigned long)__va(start);
> > -   flush_dcache_range(start, start + size);
> > +   flush_dcache_range_chunked(start, start + size, FLUSH_CHUNK_SIZE);
> > +
> > ret = remove_section_mapping(start, start + size);
> > WARN_ON_ONCE(ret);
> > 
> > --
> > 2.21.0
> > 
-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819



[PATCH] powerpc/papr_scm: Fix an off-by-one check in papr_scm_meta_{get, set}

2019-09-27 Thread Vaibhav Jain
A validation check to prevent out-of-bounds reads/writes inside the
functions papr_scm_meta_{get,set}() is off by one, preventing reads
and writes to the last byte of the label area.
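
A worked example of the boundary, using hypothetical values:

/* Hypothetical sizes for illustration only. */
u64 metadata_size = 128;		/* valid byte offsets: 0..127  */
u64 in_offset = 127, in_length = 1;	/* reads exactly the last byte */

/* Old check rejects a legal access: 127 + 1 >= 128 is true  -> -EINVAL  */
/* New check permits it:             127 + 1 >  128 is false -> proceeds */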

This bug manifests as a failure to probe a dimm when libnvdimm is
unable to read the entire config-area as advertised by
ND_CMD_GET_CONFIG_SIZE. This usually happens when a large number of
namespaces have been created in the region backed by the dimm and the
label-index spans the max possible config-area. An error of the form
below is usually reported in the kernel logs:

[  255.293912] nvdimm: probe of nmem0 failed with error -22

The patch fixes these validation checks, thereby letting libnvdimm
access the entire config-area.

Fixes: 53e80bd042773 ('powerpc/nvdimm: Add support for multibyte read/write for metadata')
Signed-off-by: Vaibhav Jain 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c
index a5ac371a3f06..0d0f4974a301 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -124,7 +124,7 @@ static int papr_scm_meta_get(struct papr_scm_priv *p,
int len, read;
int64_t ret;
 
-   if ((hdr->in_offset + hdr->in_length) >= p->metadata_size)
+   if ((hdr->in_offset + hdr->in_length) > p->metadata_size)
return -EINVAL;
 
for (len = hdr->in_length; len; len -= read) {
@@ -178,7 +178,7 @@ static int papr_scm_meta_set(struct papr_scm_priv *p,
__be64 data_be;
int64_t ret;
 
-   if ((hdr->in_offset + hdr->in_length) >= p->metadata_size)
+   if ((hdr->in_offset + hdr->in_length) > p->metadata_size)
return -EINVAL;
 
for (len = hdr->in_length; len; len -= wrote) {
-- 
2.21.0