[RFC PATCH 12/17] powerpc/kvm/hash: Implement HASH_PROTECT hcall
This is equivalent to the H_PROTECT hcall, but it takes the hash value as the argument instead of the hash PTE slot number. We will use this later to speed up the invalidate operation in the guest. Instead of finding the slot number using the H_READ4 hcall, we can pass the hash value directly to this hcall. The H_AVPN flag must be set; otherwise the hcall returns an error. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/include/asm/hvcall.h | 3 +- arch/powerpc/include/asm/plpar_wrappers.h | 7 +++ arch/powerpc/kvm/book3s_hv.c | 1 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 74 ++- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 1 + 5 files changed, 63 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 6a09e91889cf..c234be675774 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -281,7 +281,8 @@ #define H_REGISTER_PROC_TBL0x37C #define H_SIGNAL_SYS_RESET 0x380 #define H_HASH_REMOVE 0x384 -#define MAX_HCALL_OPCODE H_HASH_REMOVE +#define H_HASH_PROTECT 0x388 +#define MAX_HCALL_OPCODE H_HASH_PROTECT /* H_VIOCTL functions */ #define H_GET_VIOA_DUMP_SIZE 0x01 diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 8160fea9b5bc..27e30ca6105d 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -226,6 +226,13 @@ static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex, return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn); } +static inline long plpar_pte_hash_protect(unsigned long flags, + unsigned long hash, + unsigned long avpn) +{ + return plpar_hcall_norets(H_HASH_PROTECT, flags, hash, avpn); +} + static inline long plpar_resize_hpt_prepare(unsigned long flags, unsigned long shift) { diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 86c66af38637..d7be56339d53 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4144,6 +4144,7 @@ static 
unsigned int default_hcall_list[] = { H_XIRR, H_XIRR_X, #endif + H_HASH_PROTECT, H_HASH_REMOVE, 0 }; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 85fedb72469b..2aa507614819 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -752,33 +752,14 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) return ret; } -long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long va) +long __kvmppc_do_hash_protect(struct kvm *kvm, __be64 *hpte, + unsigned long flags, unsigned long pte_index) { - struct kvm *kvm = vcpu->kvm; - __be64 *hpte; + u64 pte_v, pte_r; struct revmap_entry *rev; unsigned long v, r, rb, mask, bits; - u64 pte_v, pte_r; - - if (kvm_is_radix(kvm)) - return H_FUNCTION; - if (pte_index >= kvmppc_hpt_npte(>arch.hpt)) - return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); - while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) - cpu_relax(); v = pte_v = be64_to_cpu(hpte[0]); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1])); - if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || - ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) { - __unlock_hpte(hpte, pte_v); - return H_NOT_FOUND; - } - pte_r = be64_to_cpu(hpte[1]); bits = (flags << 55) & HPTE_R_PP0; bits |= (flags << 48) & HPTE_R_KEY_HI; @@ -823,6 +804,55 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long va) +{ + __be64 *hpte; + u64 v, pte_v; + struct kvm *kvm = vcpu->kvm; + + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(>arch.hpt)) + return H_PARAMETER; + + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + v = pte_v = be64_to_cpu(hpte[0]); 
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1])); + if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || + ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) { +
[RFC PATCH 11/17] powerpc/kvm/hash: Implement HASH_REMOVE hcall
This is equivalent to the H_REMOVE hcall, but it takes the hash value as the argument instead of the hash PTE slot number. We will use this later to speed up the invalidate operation in the guest. Instead of finding the slot number using the H_READ4 hcall, we can pass the hash value directly to this hcall. The only supported flag value for the operation is H_AVPN. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/include/asm/hvcall.h | 3 +- arch/powerpc/include/asm/plpar_wrappers.h | 16 arch/powerpc/kvm/book3s_hv.c | 1 + arch/powerpc/kvm/book3s_hv_rm_mmu.c | 134 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 + 5 files changed, 138 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 57d38b504ff7..6a09e91889cf 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -280,7 +280,8 @@ #define H_RESIZE_HPT_COMMIT0x370 #define H_REGISTER_PROC_TBL0x37C #define H_SIGNAL_SYS_RESET 0x380 -#define MAX_HCALL_OPCODE H_SIGNAL_SYS_RESET +#define H_HASH_REMOVE 0x384 +#define MAX_HCALL_OPCODE H_HASH_REMOVE /* H_VIOCTL functions */ #define H_GET_VIOA_DUMP_SIZE 0x01 diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index c7b164836bc3..8160fea9b5bc 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -124,6 +124,22 @@ static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex, return rc; } +static inline long plpar_pte_hash_remove(unsigned long flags, unsigned long hash, + unsigned long avpn, unsigned long *old_pteh_ret, + unsigned long *old_ptel_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_HASH_REMOVE, retbuf, flags, hash, avpn); + + *old_pteh_ret = retbuf[0]; + *old_ptel_ret = retbuf[1]; + + return rc; +} + + /* plpar_pte_remove_raw can be called in real mode. 
It calls plpar_hcall_raw */ static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex, unsigned long avpn, unsigned long *old_pteh_ret, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0b436df746fc..86c66af38637 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4144,6 +4144,7 @@ static unsigned int default_hcall_list[] = { H_XIRR, H_XIRR_X, #endif + H_HASH_REMOVE, 0 }; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 5abaed27708b..85fedb72469b 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -465,34 +465,21 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, } } -long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, - unsigned long pte_index, unsigned long avpn, - unsigned long *hpret) +static long __kvmppc_do_hash_remove(struct kvm *kvm, __be64 *hpte, + unsigned long pte_index, + unsigned long *hpret) { - __be64 *hpte; + unsigned long v, r, rb; struct revmap_entry *rev; u64 pte, orig_pte, pte_r; - if (kvm_is_radix(kvm)) - return H_FUNCTION; - if (pte_index >= kvmppc_hpt_npte(>arch.hpt)) - return H_PARAMETER; - hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); - while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) - cpu_relax(); pte = orig_pte = be64_to_cpu(hpte[0]); pte_r = be64_to_cpu(hpte[1]); if (cpu_has_feature(CPU_FTR_ARCH_300)) { pte = hpte_new_to_old_v(pte, pte_r); pte_r = hpte_new_to_old_r(pte_r); } - if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || - ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || - ((flags & H_ANDCOND) && (pte & avpn) != 0)) { - __unlock_hpte(hpte, orig_pte); - return H_NOT_FOUND; - } rev = real_vmalloc_addr(>arch.hpt.rev[pte_index]); v = pte & ~HPTE_V_HVLOCK; @@ -525,6 +512,35 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, hpret[1] = r; return H_SUCCESS; } + +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + 
unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) +{ + __be64 *hpte; + u64 pte, orig_pte, pte_r; + + if (kvm_is_radix(kvm)) + return H_FUNCTION; + if (pte_index >= kvmppc_hpt_npte(>arch.hpt)) + return H_PARAMETER; + hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4)); +
[RFC PATCH 10/17] powerpc/mm: Add new firmware feature HASH API
We will use this feature to check whether hypervisor implements hash based remove and protect hcalls Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/include/asm/firmware.h | 3 ++- arch/powerpc/kvm/powerpc.c| 4 arch/powerpc/platforms/pseries/firmware.c | 1 + include/uapi/linux/kvm.h | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 8645897472b1..152d704ac3c3 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -51,6 +51,7 @@ #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000) #define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001) #define FW_FEATURE_PRRNASM_CONST(0x0002) +#define FW_FEATURE_HASH_APIASM_CONST(0x0004) #ifndef __ASSEMBLY__ @@ -67,7 +68,7 @@ enum { FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | - FW_FEATURE_HPT_RESIZE, + FW_FEATURE_HPT_RESIZE | FW_FEATURE_HASH_API, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1a75c0b5f4ca..bd551edfa155 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -632,6 +632,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) /* Disable this on POWER9 until code handles new HPTE format */ r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300); break; + case KVM_CAP_SPAPR_HASH_API: + /* Only enable for HV kvm */ + r = is_kvmppc_hv_enabled(kvm); + break; #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_FWNMI: diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 63cc82ad58ac..32081d4406e8 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -65,6 +65,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_SET_MODE, 
"hcall-set-mode"}, {FW_FEATURE_BEST_ENERGY,"hcall-best-energy-1*"}, {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, + {FW_FEATURE_HASH_API, "hcall-hash-api"}, }; /* Build up the firmware features bitmask using the contents of diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 6cd63c18708a..698b202b4c53 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -929,6 +929,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SMT_POSSIBLE 147 #define KVM_CAP_HYPERV_SYNIC2 148 #define KVM_CAP_HYPERV_VP_INDEX 149 +#define KVM_CAP_SPAPR_HASH_API 150 #ifdef KVM_CAP_IRQ_ROUTING -- 2.13.3
[RFC PATCH 09/17] powerpc/mm: Remove unused flag arg in global_invalidates
Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index fedb0139524c..5abaed27708b 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -42,7 +42,7 @@ static void *real_vmalloc_addr(void *x) } /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ -static int global_invalidates(struct kvm *kvm, unsigned long flags) +static int global_invalidates(struct kvm *kvm) { int global; int cpu; @@ -499,7 +499,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); rb = compute_tlbie_rb(v, pte_r, pte_index); - do_tlbies(kvm, , 1, global_invalidates(kvm, flags), true); + do_tlbies(kvm, , 1, global_invalidates(kvm), true); /* * The reference (R) and change (C) bits in a HPT * entry can be set by hardware at any time up until @@ -549,7 +549,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) if (kvm_is_radix(kvm)) return H_FUNCTION; - global = global_invalidates(kvm, 0); + global = global_invalidates(kvm); for (i = 0; i < 4 && ret == H_SUCCESS; ) { n = 0; for (; i < 4; ++i) { @@ -709,8 +709,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, rb = compute_tlbie_rb(v, r, pte_index); hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) | HPTE_V_ABSENT); - do_tlbies(kvm, , 1, global_invalidates(kvm, flags), - true); + do_tlbies(kvm, , 1, global_invalidates(kvm), true); /* Don't lose R/C bit updates done by hardware */ r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C); hpte[1] = cpu_to_be64(r); -- 2.13.3
[RFC PATCH 08/17] powerpc/mm/hash: Don't track hash pte slot number in linux page table.
Now that all MMU hash operations have been updated to work with the hash value instead of the slot, remove slot tracking completely. We also remove real_pte, because without slot tracking the 4k, 64k and 64k-subpage cases all have a similar PTE format. One side effect of this is that we no longer track whether we have taken a fault on the 4k subpages in a 64k page config. That means an invalidate will try to invalidate all the 4k subpages. To minimize the impact of the above, THP still tracks the slot details: with THP we have 4096 subpages and we want to avoid calling invalidate on all of them. For THP the slot details are not tracked in the Linux page table; they are tracked in the deposited page table. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/include/asm/book3s/64/hash-4k.h | 16 +++- arch/powerpc/include/asm/book3s/64/hash-64k.h | 44 +- arch/powerpc/include/asm/book3s/64/hash.h | 5 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 26 -- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 3 +- arch/powerpc/include/asm/pgtable-be-types.h| 10 --- arch/powerpc/include/asm/pgtable-types.h | 9 -- arch/powerpc/mm/dump_linuxpagetables.c | 10 --- arch/powerpc/mm/hash64_4k.c| 2 - arch/powerpc/mm/hash64_64k.c | 95 +- arch/powerpc/mm/hash_native_64.c | 12 +-- arch/powerpc/mm/hash_utils_64.c| 22 + arch/powerpc/mm/hugetlbpage-hash64.c | 4 - arch/powerpc/mm/tlb_hash64.c | 9 +- arch/powerpc/platforms/pseries/lpar.c | 4 +- 15 files changed, 50 insertions(+), 221 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 0c4e470571ca..d65dcb5826ff 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -17,8 +17,7 @@ #define H_PGD_TABLE_SIZE (sizeof(pgd_t) << H_PGD_INDEX_SIZE) /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \ -H_PAGE_F_SECOND | H_PAGE_F_GIX) +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE) /* * Not supported 
by 4k linux page size */ @@ -27,6 +26,19 @@ #define H_PAGE_COMBO 0x0 #define H_PTE_FRAG_NR 0 #define H_PTE_FRAG_SIZE_SHIFT 0 + +#define pte_iterate_hashed_subpages(vpn, psize, index, shift) \ + do {\ + index = 0; \ + shift = mmu_psize_defs[psize].shift;\ + +#define pte_iterate_hashed_end() } while(0) +/* + * We expect this to be called only for user addresses or kernel virtual + * addresses other than the linear mapping. + */ +#define pte_pagesize_index(mm, addr, pte) MMU_PAGE_4K + /* * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 9732837aaae8..ab36323b8a3e 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -25,8 +25,7 @@ #define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND) /* PTE flags to conserve for HPTE identification */ -#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \ -H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO) +#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO) /* * we support 16 fragments per PTE page of 64K size. */ @@ -40,55 +39,16 @@ #ifndef __ASSEMBLY__ #include - -/* - * With 64K pages on hash table, we have a special PTE format that - * uses a second "half" of the page table to encode sub-page information - * in order to deal with 64K made of 4K HW pages. Thus we override the - * generic accessors and iterators here - */ -#define __real_pte __real_pte -static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep) -{ - real_pte_t rpte; - unsigned long *hidxp; - - rpte.pte = pte; - rpte.hidx = 0; - if (pte_val(pte) & H_PAGE_COMBO) { - /* -* Make sure we order the hidx load against the H_PAGE_COMBO -* check. 
The store side ordering is done in __hash_page_4K -*/ - smp_rmb(); - hidxp = (unsigned long *)(ptep + PTRS_PER_PTE); - rpte.hidx = *hidxp; - } - return rpte; -} - -static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index) -{ - if ((pte_val(rpte.pte) & H_PAGE_COMBO)) - return (rpte.hidx >> (index<<2)) & 0xf; - return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf; -} - -#define __rpte_to_pte(r) ((r).pte) -extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index); /* * Trick: we
[RFC PATCH 07/17] powerpc/mm: Add hash updatepp callback
Add hash based updatepp callback and use that during hash pte fault. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 6 + arch/powerpc/mm/hash64_4k.c | 7 + arch/powerpc/mm/hash64_64k.c | 17 +++- arch/powerpc/mm/hash_native_64.c | 37 +++ arch/powerpc/mm/hugetlbpage-hash64.c | 9 ++- arch/powerpc/platforms/ps3/htab.c | 29 + arch/powerpc/platforms/pseries/lpar.c | 31 ++ 7 files changed, 109 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 7e1fcae472f0..a784d4ac4fb1 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -144,6 +144,12 @@ struct mmu_hash_ops { unsigned long vpn, int bpsize, int apsize, int ssize, unsigned long flags); + long(*hash_updatepp)(unsigned long hash, +unsigned long newpp, +unsigned long vpn, +int bpsize, int apsize, +int ssize, unsigned long flags); + void(*hpte_updateboltedpp)(unsigned long newpp, unsigned long ea, int psize, int ssize); diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 6fa450c12d6d..d262d814ca55 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -65,12 +65,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & H_PAGE_F_SECOND) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; - - if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K, + if (mmu_hash_ops.hash_updatepp(hash, rflags, vpn, MMU_PAGE_4K, MMU_PAGE_4K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; } diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 1a68cb19b0e3..2b72f2c5ed10 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -127,17 +127,11 @@ int 
__hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, int ret; hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(rpte, subpg_index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - - ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, + ret = mmu_hash_ops.hash_updatepp(hash, rflags, vpn, MMU_PAGE_4K, MMU_PAGE_4K, ssize, flags); /* -*if we failed because typically the HPTE wasn't really here +* if we failed because typically the HPTE wasn't really here * we try an insertion. */ if (ret == -1) @@ -268,12 +262,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, * There MIGHT be an HPTE for this pte */ hash = hpt_hash(vpn, shift, ssize); - if (old_pte & H_PAGE_F_SECOND) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT; - - if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K, + if (mmu_hash_ops.hash_updatepp(hash, rflags, vpn, MMU_PAGE_64K, MMU_PAGE_64K, ssize, flags) == -1) old_pte &= ~_PAGE_HPTEFLAGS; diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index c3fdd684a287..2eaded4680ae 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -391,6 +391,42 @@ struct hash_pte *native_hpte_find(unsigned long hash, unsigned long vpn, return NULL; } +static long native_hash_updatepp(unsigned long hash, unsigned long newpp, +unsigned long
[RFC PATCH 06/17] powerpc/mm: Switch flush_hash_range to not use slot
Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/mm/hash_native_64.c | 28 arch/powerpc/platforms/pseries/lpar.c | 13 - 2 files changed, 12 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index ce25e125dd06..c3fdd684a287 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -702,10 +702,8 @@ static void native_hpte_clear(void) static void native_flush_hash_range(unsigned long number, int local) { unsigned long vpn; - unsigned long hash, index, hidx, shift, slot; + unsigned long hash, index, shift; struct hash_pte *hptep; - unsigned long hpte_v; - unsigned long want_v; unsigned long flags; real_pte_t pte; struct ppc64_tlb_batch *batch = this_cpu_ptr(_tlb_batch); @@ -725,23 +723,13 @@ static void native_flush_hash_range(unsigned long number, int local) pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - hptep = htab_address + slot; - want_v = hpte_encode_avpn(vpn, psize, ssize); - native_lock_hpte(hptep); - hpte_v = be64_to_cpu(hptep->v); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - hpte_v = hpte_new_to_old_v(hpte_v, - be64_to_cpu(hptep->r)); - if (!HPTE_V_COMPARE(hpte_v, want_v) || - !(hpte_v & HPTE_V_VALID)) - native_unlock_hpte(hptep); - else - hptep->v = 0; + hptep = native_hpte_find(hash, vpn, psize, ssize); + if (!hptep) + continue; + /* +* Invalidate the hpte. 
NOTE: this also unlocks it +*/ + hptep->v = 0; } pte_iterate_hashed_end(); } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index e366252e0e93..ad7838171bb0 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -580,14 +580,14 @@ static int pSeries_lpar_hpte_removebolted(unsigned long ea, static void pSeries_lpar_flush_hash_range(unsigned long number, int local) { unsigned long vpn; - unsigned long i, pix, rc; + unsigned long i, rc; unsigned long flags = 0; struct ppc64_tlb_batch *batch = this_cpu_ptr(_tlb_batch); int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); unsigned long param[PLPAR_HCALL9_BUFSIZE]; - unsigned long hash, index, shift, hidx, slot; + unsigned long index, shift, slot; real_pte_t pte; - int psize, ssize; + int psize, ssize, pix; if (lock_tlbie) spin_lock_irqsave(_lpar_tlbie_lock, flags); @@ -599,12 +599,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long number, int local) vpn = batch->vpn[i]; pte = batch->pte[i]; pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { - hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; + slot = pSeries_lpar_hpte_find(vpn, psize, ssize); if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { /* * lpar doesn't use the passed actual page size -- 2.13.3
[RFC PATCH 05/17] powerpc/mm: use hash_invalidate for __kernel_map_pages()
Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/mm/hash_utils_64.c | 32 +--- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index a02570b4cfed..66f12b48f838 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -118,11 +118,6 @@ EXPORT_SYMBOL_GPL(mmu_slb_size); #ifdef CONFIG_PPC_64K_PAGES int mmu_ci_restrictions; #endif -#ifdef CONFIG_DEBUG_PAGEALLOC -static u8 *linear_map_hash_slots; -static unsigned long linear_map_hash_count; -static DEFINE_SPINLOCK(linear_map_hash_lock); -#endif /* CONFIG_DEBUG_PAGEALLOC */ struct mmu_hash_ops mmu_hash_ops; EXPORT_SYMBOL(mmu_hash_ops); @@ -1746,7 +1741,7 @@ long hpte_insert_repeating(unsigned long hash, unsigned long vpn, } #ifdef CONFIG_DEBUG_PAGEALLOC -static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) +static void kernel_map_linear_page(unsigned long vaddr) { unsigned long hash; unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize); @@ -1763,12 +1758,7 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode, HPTE_V_BOLTED, mmu_linear_psize, mmu_kernel_ssize); - BUG_ON (ret < 0); - spin_lock(_map_hash_lock); - BUG_ON(linear_map_hash_slots[lmi] & 0x80); - linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -1778,35 +1768,23 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(_map_hash_lock); - BUG_ON(!(linear_map_hash_slots[lmi] & 0x80)); - hidx = linear_map_hash_slots[lmi] & 0x7f; - linear_map_hash_slots[lmi] = 0; - spin_unlock(_map_hash_lock); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx 
& _PTEIDX_GROUP_IX; - mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize, + mmu_hash_ops.hash_invalidate(hash, vpn, mmu_linear_psize, mmu_linear_psize, mmu_kernel_ssize, 0); } void __kernel_map_pages(struct page *page, int numpages, int enable) { - unsigned long flags, vaddr, lmi; + unsigned long flags, vaddr; int i; local_irq_save(flags); for (i = 0; i < numpages; i++, page++) { vaddr = (unsigned long)page_address(page); - lmi = __pa(vaddr) >> PAGE_SHIFT; - if (lmi >= linear_map_hash_count) - continue; if (enable) - kernel_map_linear_page(vaddr, lmi); + kernel_map_linear_page(vaddr); else - kernel_unmap_linear_page(vaddr, lmi); + kernel_unmap_linear_page(vaddr); } local_irq_restore(flags); } -- 2.13.3
[RFC PATCH 04/17] powerpc/mm: Add hash invalidate callback
Add a hash-based invalidate callback and use it in flush_hash_page(). Note: in a later patch we will drop slot tracking completely. At that point we will also lose the __rpte_sub_valid() check in pte_iterate_hashed_subpages(). That means we will call invalidate for every subpage, irrespective of whether we took a hash fault on it or not. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 4 arch/powerpc/mm/hash_native_64.c | 27 +++ arch/powerpc/mm/hash_utils_64.c | 11 +++ arch/powerpc/platforms/ps3/htab.c | 22 ++ arch/powerpc/platforms/pseries/lpar.c | 26 ++ 5 files changed, 82 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index f9cce40a4035..7e1fcae472f0 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -135,6 +135,10 @@ struct mmu_hash_ops { unsigned long vpn, int bpsize, int apsize, int ssize, int local); + void(*hash_invalidate)(unsigned long hash, + unsigned long vpn, + int bpsize, int apsize, + int ssize, int local); long(*hpte_updatepp)(unsigned long slot, unsigned long newpp, unsigned long vpn, diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 4b3f6d66e7f0..ce25e125dd06 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -492,6 +492,32 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, local_irq_restore(flags); } +static void native_hash_invalidate(unsigned long hash, unsigned long vpn, + int bpsize, int apsize, int ssize, int local) +{ + unsigned long flags; + struct hash_pte *hptep; + + DBG_LOW("invalidate(vpn=%016lx, hash: %lx)\n", vpn, hash); + local_irq_save(flags); + hptep = native_hpte_find(hash, vpn, bpsize, ssize); + if (hptep) { + /* +* Invalidate the hpte. 
NOTE: this also unlocks it +*/ + hptep->v = 0; + } + /* +* We need to invalidate the TLB always because hpte_remove doesn't do +* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less +* random entry from it. When we do that we don't invalidate the TLB +* (hpte_remove) because we assume the old translation is still +* technically "valid". +*/ + tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(flags); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void native_hugepage_invalidate(unsigned long vsid, unsigned long addr, @@ -771,6 +797,7 @@ static int native_register_proc_table(unsigned long base, unsigned long page_siz void __init hpte_init_native(void) { mmu_hash_ops.hpte_invalidate= native_hpte_invalidate; + mmu_hash_ops.hash_invalidate= native_hash_invalidate; mmu_hash_ops.hpte_updatepp = native_hpte_updatepp; mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp; mmu_hash_ops.hpte_removebolted = native_hpte_removebolted; diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index ff3c9522a2b3..a02570b4cfed 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1598,23 +1598,18 @@ static inline void tm_flush_hash_page(int local) void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize, unsigned long flags) { - unsigned long hash, index, shift, hidx, slot; + unsigned long hash, index, shift; int local = flags & HPTE_LOCAL_UPDATE; DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn); pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) { hash = hpt_hash(vpn, shift, ssize); - hidx = __rpte_to_hidx(pte, index); - if (hidx & _PTEIDX_SECONDARY) - hash = ~hash; - slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot += hidx & _PTEIDX_GROUP_IX; - DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx); + DBG_LOW(" sub %ld: hash=%lx\n", index, hash); /* * We use same base page size and actual psize, because we don't * use these functions for hugepage */ -
[RFC PATCH 03/17] powerpc/ps3/mm: Add helper for finding hash pte slot using hash value
We will use this in a later patch. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/platforms/ps3/htab.c | 37 + 1 file changed, 37 insertions(+) diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index cc2b281a3766..255b7a33fefe 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -193,6 +193,43 @@ static void ps3_hpte_clear(void) ps3_mm_vas_destroy(); } +static long ps3_hpte_find(unsigned long hash, unsigned long want_v) +{ + unsigned long i, j, result; + unsigned long hpte_group; + bool secondary_search = false; + u64 hpte_v_array[4], hpte_rs; + + + /* first check primary */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + +search_again: + for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { + + result = lv1_read_htab_entries(PS3_LPAR_VAS_ID_CURRENT, + hpte_group & ~0x3UL, _v_array[0], + _v_array[1], _v_array[2], + _v_array[3], _rs); + /* ignore failures ? */ + if (result) + continue; + + for (j = 0; j < 4; j++) { + if (HPTE_V_COMPARE(hpte_v_array[j], want_v) && + (hpte_v_array[j] & HPTE_V_VALID)) { + return hpte_group + j; + } + } + } + if (!secondary_search) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + secondary_search = true; + goto search_again; + } + return -1; +} + void __init ps3_hpte_init(unsigned long htab_size) { mmu_hash_ops.hpte_invalidate = ps3_hpte_invalidate; -- 2.13.3
[RFC PATCH 02/17] powerpc/pseries: Update hpte find helper to take hash value
The helper now also does secondary hash search so that we can use this in other functions. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/platforms/pseries/lpar.c | 28 +--- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 495ba4e7336d..edab68d9f9f3 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -328,15 +328,21 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, return 0; } -static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_group) +static long __pSeries_lpar_hpte_find(unsigned long hash, unsigned long want_v) { long lpar_rc; unsigned long i, j; + unsigned long hpte_group; + bool secondary_search = false; struct { unsigned long pteh; unsigned long ptel; } ptes[4]; + /* first check primary */ + hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; + +search_again: for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) { lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes); @@ -346,31 +352,31 @@ static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long hpte_gr for (j = 0; j < 4; j++) { if (HPTE_V_COMPARE(ptes[j].pteh, want_v) && (ptes[j].pteh & HPTE_V_VALID)) - return i + j; + return hpte_group + j; } } - + if (!secondary_search) { + hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + secondary_search = true; + goto search_again; + } return -1; } static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize) { long slot; - unsigned long hash; - unsigned long want_v; - unsigned long hpte_group; + unsigned long hash, want_v; hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); want_v = hpte_encode_avpn(vpn, psize, ssize); - - /* Bolted entries are always in the primary group */ - hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP; - slot = __pSeries_lpar_hpte_find(want_v, hpte_group); + slot = __pSeries_lpar_hpte_find(hash, want_v); if 
(slot < 0) return -1; - return hpte_group + slot; + return slot; } + static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) -- 2.13.3
[RFC PATCH 01/17] powerpc/mm: Update native_hpte_find to return hash pte
The helper now also does a secondary hash search so that we can use this in other functions. Signed-off-by: Aneesh Kumar K.V--- arch/powerpc/mm/hash_native_64.c | 70 +++- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 3848af167df9..4b3f6d66e7f0 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -351,32 +351,44 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, return ret; } -static long native_hpte_find(unsigned long vpn, int psize, int ssize) +/* returns a locked hash pte */ +struct hash_pte *native_hpte_find(unsigned long hash, unsigned long vpn, + unsigned long bpsize, unsigned long ssize) { + int i; + unsigned long hpte_v; struct hash_pte *hptep; - unsigned long hash; - unsigned long i; - long slot; - unsigned long want_v, hpte_v; - - hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - want_v = hpte_encode_avpn(vpn, psize, ssize); + unsigned long want_v, slot; + bool secondary_search = false; - /* Bolted mappings are only ever in the primary group */ + want_v = hpte_encode_avpn(vpn, bpsize, ssize); slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; - for (i = 0; i < HPTES_PER_GROUP; i++) { - hptep = htab_address + slot; + + /* +* search for hpte in the primary group +*/ +search_again: + hptep = htab_address + slot; + for (i = 0; i < HPTES_PER_GROUP; i++, hptep++) { + /* +* FIXME!! Should we check locklessly check first ? 
+*/ + native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); if (cpu_has_feature(CPU_FTR_ARCH_300)) hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); - - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) - /* HPTE matches */ - return slot; - ++slot; + if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) + native_unlock_hpte(hptep); + else + return hptep; } - - return -1; + if (!secondary_search) { + /* Search for hpte in the secondary group */ + slot = (~hash & htab_hash_mask) * HPTES_PER_GROUP; + secondary_search = true; + goto search_again; + } + return NULL; } /* @@ -389,23 +401,22 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, int psize, int ssize) { - unsigned long vpn; - unsigned long vsid; - long slot; + unsigned long hash; + unsigned long vpn, vsid; struct hash_pte *hptep; vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); - - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); + hptep = native_hpte_find(hash, vpn, psize, ssize); + if (!hptep) panic("could not find page to bolt\n"); - hptep = htab_address + slot; /* Update the HPTE */ hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) & ~(HPTE_R_PPP | HPTE_R_N)) | (newpp & (HPTE_R_PPP | HPTE_R_N))); + native_unlock_hpte(hptep); /* * Ensure it is out of the tlb too. Bolted entries base and * actual page size will be same. 
@@ -422,18 +433,17 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) { unsigned long vpn; unsigned long vsid; - long slot; + unsigned long hash; struct hash_pte *hptep; vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); + hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize); - slot = native_hpte_find(vpn, psize, ssize); - if (slot == -1) + hptep = native_hpte_find(hash, vpn, psize, ssize); + if (!hptep) return -ENOENT; - hptep = htab_address + slot; - VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED)); /* Invalidate the hpte */ -- 2.13.3
[RFC PATCH 00/17] Remove slot tracking from linux page table
Hi, This patch series removes hash pte slot tracking in the linux page table. This frees up 4 bits from the linux page table and brings the hash and radix linux page tables closer. The series also attempts to remove __real_pte_t because, without slot tracking, the 4k subpage and 64k page pte formats are similar. However, not tracking the slot implies we search the hash group during invalidate and updatepp operations. That involves searching max 16 slots to find the matching hash page table entry. W.r.t subpages, since we don't track the validity of slots, when invalidating a 64K page, we end up calling invalidate for all subpages irrespective of whether we have taken a subpage fault or not. W.r.t THP, we skip the above and still track slots in level deposited page table. The patch series does have an impact, hence I am sending this as an RFC series before doing further measurements with kvm. On baremetal a kernel build gives: Without patch: /usr/bin/time -p make vmlinux modules > /dev/null real 270.70 user 280.23 sys 57.99 With patch: /usr/bin/time -p make vmlinux modules > /dev/null real 272.97 user 281.32 sys 61.46 That is a 6% impact on system time. The real time impact is within the runtime variance. Let me know if you think we should continue with this approach. -aneesh Aneesh Kumar K.V (17): powerpc/mm: Update native_hpte_find to return hash pte powerpc/pseries: Update hpte find helper to take hash value powerpc/ps3/mm: Add helper for finding hash pte slot using hash value powerpc/mm: Add hash invalidate callback powerpc/mm: use hash_invalidate for __kernel_map_pages() powerpc/mm: Switch flush_hash_range to not use slot powerpc/mm: Add hash updatepp callback powerpc/mm/hash: Don't track hash pte slot number in linux page table. 
powerpc/mm: Remove unused flag arg in global_invalidates powerpc/mm: Add new firmware feature HASH API powerpc/kvm/hash: Implement HASH_REMOVE hcall powerpc/kvm/hash: Implement HASH_PROTECT hcall powerpc/kvm/hash: Implement HASH_BULK_REMOVE hcall powerpc/mm/pseries: Use HASH_PROTECT hcall in guest powerpc/mm/pseries: Use HASH_REMOVE hcall in guest powerpc/mm/pseries: Move slot based bulk remove to helper powerpc/mm/pseries: Use HASH_BULK_REMOVE hcall in guest arch/powerpc/include/asm/book3s/64/hash-4k.h | 16 +- arch/powerpc/include/asm/book3s/64/hash-64k.h | 44 +-- arch/powerpc/include/asm/book3s/64/hash.h | 5 +- arch/powerpc/include/asm/book3s/64/mmu-hash.h | 12 + arch/powerpc/include/asm/book3s/64/pgtable.h | 26 -- arch/powerpc/include/asm/book3s/64/tlbflush-hash.h | 3 +- arch/powerpc/include/asm/firmware.h| 3 +- arch/powerpc/include/asm/hvcall.h | 5 +- arch/powerpc/include/asm/pgtable-be-types.h| 10 - arch/powerpc/include/asm/pgtable-types.h | 9 - arch/powerpc/include/asm/plpar_wrappers.h | 23 ++ arch/powerpc/kvm/book3s_hv.c | 3 + arch/powerpc/kvm/book3s_hv_rm_mmu.c| 306 ++--- arch/powerpc/kvm/book3s_hv_rmhandlers.S| 4 + arch/powerpc/kvm/powerpc.c | 4 + arch/powerpc/mm/dump_linuxpagetables.c | 10 - arch/powerpc/mm/hash64_4k.c| 9 +- arch/powerpc/mm/hash64_64k.c | 108 ++-- arch/powerpc/mm/hash_native_64.c | 172 arch/powerpc/mm/hash_utils_64.c| 65 + arch/powerpc/mm/hugetlbpage-hash64.c | 13 +- arch/powerpc/mm/tlb_hash64.c | 9 +- arch/powerpc/platforms/ps3/htab.c | 88 ++ arch/powerpc/platforms/pseries/firmware.c | 1 + arch/powerpc/platforms/pseries/lpar.c | 193 ++--- include/uapi/linux/kvm.h | 1 + 26 files changed, 736 insertions(+), 406 deletions(-) -- 2.13.3
Re: [PATCH] POWER9 PMU stops after idle workaround
Hi Nick, > POWER9 DD2 PMU can stop after a state-loss idle in some conditions. > > A solution is to set then clear MMCRA[60] after wake from state-loss > idle. Looks good. Acked-by: Anton Blanchard Anton > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/kernel/idle_book3s.S | 8 +++- > 1 file changed, 7 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/kernel/idle_book3s.S > b/arch/powerpc/kernel/idle_book3s.S index 516ebef905c0..e6252c5a57a4 > 100644 --- a/arch/powerpc/kernel/idle_book3s.S > +++ b/arch/powerpc/kernel/idle_book3s.S > @@ -460,11 +460,17 @@ pnv_restore_hyp_resource_arch300: > /* >* Workaround for POWER9, if we lost resources, the ERAT >* might have been mixed up and needs flushing. We also need > - * to reload MMCR0 (see comment above). > + * to reload MMCR0 (see comment above). We also need to set > + * then clear bit 60 in MMCRA to ensure the PMU starts > running. */ > blt cr3,1f > PPC_INVALIDATE_ERAT > ld r1,PACAR1(r13) > + mfspr r4,SPRN_MMCRA > + ori r4,r4,(1 << (63-60)) > + mtspr SPRN_MMCRA,r4 > + xori r4,r4,(1 << (63-60)) > + mtspr SPRN_MMCRA,r4 > ld r4,_MMCR0(r1) > mtspr SPRN_MMCR0,r4 > 1:
Re: [PATCH] mpc832x_rdb: fix of_irq_to_resource() error check
On Mon, 2017-07-31 at 20:04 +1000, Michael Ellerman wrote: > Scott Wood writes: > > > On Sat, 2017-07-29 at 22:52 +0300, Sergei Shtylyov wrote: > > > of_irq_to_resource() has recently been fixed to return negative error > > > #'s > > > along with 0 in case of failure, however the Freescale MPC832x RDB > > > board > > > code still only regards 0 as a failure indication -- fix it up. > > > > > > Fixes: 7a4228bbff76 ("of: irq: use of_irq_get() in > > > of_irq_to_resource()") > > > Signed-off-by: Sergei Shtylyov > > > > > > --- > > > The patch is against the 'master' branch of Scott Wood's 'linux.git' > > > repo > > > (the 'fixes' branch is too much behind). > > > > The master branch is also old. Those branches are only used when needed > > to > > apply patches; I don't update them just to sync up. If they're older than > > what's in Michael's or Linus's tree (as they almost always are), then use > > those instead. > > > > Not that I expect it to make a difference to this patch... > > Do you want me to grab this as a fix for 4.13 ? Sure: Acked-by: Scott Wood -Scott
Re: [RFC Part1 PATCH v3 11/17] x86/mm, resource: Use PAGE_KERNEL protection for ioremap of memory pages
On Mon, Jul 24, 2017 at 02:07:51PM -0500, Brijesh Singh wrote: > From: Tom Lendacky> > In order for memory pages to be properly mapped when SEV is active, we > need to use the PAGE_KERNEL protection attribute as the base protection. > This will insure that memory mapping of, e.g. ACPI tables, receives the > proper mapping attributes. > > Signed-off-by: Tom Lendacky > Signed-off-by: Brijesh Singh > --- > arch/x86/mm/ioremap.c | 28 > include/linux/ioport.h | 3 +++ > kernel/resource.c | 17 + > 3 files changed, 48 insertions(+) > > diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c > index c0be7cf..7b27332 100644 > --- a/arch/x86/mm/ioremap.c > +++ b/arch/x86/mm/ioremap.c > @@ -69,6 +69,26 @@ static int __ioremap_check_ram(unsigned long start_pfn, > unsigned long nr_pages, > return 0; > } > > +static int __ioremap_res_desc_other(struct resource *res, void *arg) > +{ > + return (res->desc != IORES_DESC_NONE); > +} > + > +/* > + * This function returns true if the target memory is marked as > + * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than > + * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES). > + */ > +static bool __ioremap_check_if_mem(resource_size_t addr, unsigned long size) > +{ > + u64 start, end; > + > + start = (u64)addr; > + end = start + size - 1; > + > + return (walk_mem_res(start, end, NULL, __ioremap_res_desc_other) == 1); > +} > + > /* > * Remap an arbitrary physical address space into the kernel virtual > * address space. It transparently creates kernel huge I/O mapping when > @@ -146,7 +166,15 @@ static void __iomem *__ioremap_caller(resource_size_t > phys_addr, > pcm = new_pcm; > } > > + /* > + * If the page being mapped is in memory and SEV is active then > + * make sure the memory encryption attribute is enabled in the > + * resulting mapping. 
> + */ > prot = PAGE_KERNEL_IO; > + if (sev_active() && __ioremap_check_if_mem(phys_addr, size)) > + prot = pgprot_encrypted(prot); Hmm, so this function already does walk_system_ram_range() a bit earlier and now on SEV systems we're going to do it again. Can we make walk_system_ram_range() return a distinct value for SEV systems and act accordingly in __ioremap_caller() instead of repeating the operation? It looks to me like we could... -- Regards/Gruss, Boris. SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg) --
Re: [PATCH v1 1/3] arch/powerpc/set_memory: Implement set_memory_xx routines
On Tue, 1 Aug 2017 21:08:49 +0200 christophe leroywrote: > Le 01/08/2017 à 13:25, Balbir Singh a écrit : > > Add support for set_memory_xx routines. With the STRICT_KERNEL_RWX > > feature support we got support for changing the page permissions > > for pte ranges. This patch adds support for both radix and hash > > so that we can change their permissions via set/clear masks. > > > > A new helper is required for hash (hash__change_memory_range() > > is changed to hash__change_boot_memory_range() as it deals with > > bolted PTE's). > > > > hash__change_memory_range() works with vmalloc'ed PAGE_SIZE requests > > for permission changes. hash__change_memory_range() does not invoke > > updatepp, instead it changes the software PTE and invalidates the PTE. > > > > For radix, radix__change_memory_range() is setup to do the right > > thing for vmalloc'd addresses. It takes a new parameter to decide > > what attributes to set. > > > > Signed-off-by: Balbir Singh > > --- > > arch/powerpc/include/asm/book3s/64/hash.h | 6 +++ > > arch/powerpc/include/asm/book3s/64/radix.h | 6 +++ > > arch/powerpc/include/asm/set_memory.h | 34 +++ > > arch/powerpc/mm/pgtable-hash64.c | 51 -- > > arch/powerpc/mm/pgtable-radix.c| 26 ++-- > > arch/powerpc/mm/pgtable_64.c | 68 > > ++ > > 6 files changed, 175 insertions(+), 16 deletions(-) > > create mode 100644 arch/powerpc/include/asm/set_memory.h > > > > [...] 
> > > diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c > > index 0736e94..3ee4c7d 100644 > > --- a/arch/powerpc/mm/pgtable_64.c > > +++ b/arch/powerpc/mm/pgtable_64.c > > @@ -514,3 +514,71 @@ void mark_initmem_nx(void) > > hash__mark_initmem_nx(); > > } > > #endif > > + > > +#ifdef CONFIG_ARCH_HAS_SET_MEMORY > > +/* > > + * Some of these bits are taken from arm64/mm/page_attr.c > > + */ > > +static int change_memory_common(unsigned long addr, int numpages, > > + unsigned long set, unsigned long clear) > > +{ > > + unsigned long start = addr; > > + unsigned long size = PAGE_SIZE*numpages; > > + unsigned long end = start + size; > > + struct vm_struct *area; > > + > > + if (!PAGE_ALIGNED(addr)) { > > + start &= PAGE_MASK; > > + end = start + size; > > + WARN_ON_ONCE(1); > > + } > > Why not just set start = addr & PAGE_MASK, then just do > WARN_ON_ONCE(start != addr), instead of that if () The code has been taken from arch/arm64/mm/page_attr.c. I did not change any bits, but we could make changes. > > > + > > + /* > > +* So check whether the [addr, addr + size) interval is entirely > > +* covered by precisely one VM area that has the VM_ALLOC flag set. > > +*/ > > + area = find_vm_area((void *)addr); > > + if (!area || > > + end > (unsigned long)area->addr + area->size || > > + !(area->flags & VM_ALLOC)) > > + return -EINVAL; > > + > > + if (!numpages) > > + return 0; > > Shouldn't that be tested earlier ? > Same as above > > + > > + if (radix_enabled()) > > + return radix__change_memory_range(start, start + size, > > + set, clear); > > + else > > + return hash__change_memory_range(start, start + size, > > + set, clear); > > +} > > The following functions should go in a place common to PPC32 and PPC64, > otherwise they will have to be duplicated when implementing for PPC32. > Maybe the above function should also go in a common place, only the last > part should remain in a PPC64 dedicated part. 
It could be called > change_memory_range(), something like > > int change_memory_range(unsigned long start, unsigned long end, > unsigned long set, unsigned long clear) > { > if (radix_enabled()) > return radix__change_memory_range(start, end, > set, clear); > return hash__change_memory_range(start, end, set, clear); > } > > Then change_memory_range() could also be implemented for PPC32 later. I was hoping that when we implement support for PPC32, we could refactor the code then and move it to arch/powerpc/mm/page_attr.c if required. What do you think? > > > + > > +int set_memory_ro(unsigned long addr, int numpages) > > +{ > > + return change_memory_common(addr, numpages, > > + 0, _PAGE_WRITE); > > +} > > +EXPORT_SYMBOL(set_memory_ro); > > Take care that _PAGE_WRITE has value 0 when _PAGE_RO instead of _PAGE_RW > is defined (eg for the 8xx). > > It would be better to use accessors like pte_wrprotect() and pte_mkwrite() > Sure we can definitely refactor this for PPC32, pte_wrprotect() and pte_mkwrite() would require us to make the
Re: [v5 2/2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome
Daniel Axtens writes: > Hi Matt, > >> The raid6 Q syndrome check has been optimised using the vpermxor >> instruction. > > Very much a nit, but normally we'd write the change that the patch makes > as a command: "Optimise the raid6 Q syndrome generation using the > vpermxor instruction" - see > https://www.kernel.org/doc/html/v4.11/process/submitting-patches.html#describe-your-changes There's a good list here: https://chris.beams.io/posts/git-commit/ Which includes "Use the imperative mood in the subject line". And has a good rule of thumb: A properly formed Git commit subject line should always be able to complete the following sentence: If applied, this commit will [your subject line here] In this case Matt's subject is fine, but IMHO you should also use the imperative mood for the body of the change log - which is basically what you said :) cheers
Re: [v5 1/2] lib/raid6: Build proper files on corresponding arch
Daniel Axtens writes: > Hi Matt, > >> --- a/lib/raid6/test/Makefile >> +++ b/lib/raid6/test/Makefile >> @@ -44,10 +44,12 @@ else ifeq ($(HAS_NEON),yes) >> CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 >> else >> HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int >> a;\n' |\ >> - gcc -c -x c - >&/dev/null && \ >> - rm ./-.o && echo yes) >> + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) > > From memory the change here (s/>&/>/) was necessary to get the build to > succeed - did we ever figure out why that was? I'm not enough of a shell > guru to grok the difference. Using >& redirects stdout and stderr, whereas > only redirects stdout. So possibly it doesn't fix anything, but rather lets you see any error emitted by the compiler rather than swallowing it? cheers
Re: [PATCH v2 3/4] powerpc: add irq accounting for system reset interrupts
On Tue, 1 Aug 2017 22:00:53 +1000 Nicholas Piggin wrote: > diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c > index b67f8b03a32d..4b9a567c9975 100644 > --- a/arch/powerpc/kernel/watchdog.c > +++ b/arch/powerpc/kernel/watchdog.c > @@ -204,6 +204,9 @@ void soft_nmi_interrupt(struct pt_regs *regs) > return; > > nmi_enter(); > + > + __this_cpu_inc(irq_stat.soft_nmi_irqs); > + > tb = get_tb(); > if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { > per_cpu(wd_timer_tb, cpu) = tb; Sorry, this hunk leaked into patch 3. Should be in patch 4.
RE: [PATCH v2] qe: fix compile issue for arm64
Michael Ellermanwrote: > -Original Message- > From: Michael Ellerman [mailto:m...@ellerman.id.au] > Sent: Monday, July 31, 2017 6:37 PM > To: Qiang Zhao ; o...@buserror.net > Cc: valentin.longch...@keymile.com; linuxppc-dev@lists.ozlabs.org; linux- > ker...@vger.kernel.org > Subject: RE: [PATCH v2] qe: fix compile issue for arm64 > > Qiang Zhao writes: > > > Fri 7/28/2017 2:14 PM, Michael Ellerman wrote: > > > >> -Original Message- > >> From: Michael Ellerman [mailto:m...@ellerman.id.au] > >> Sent: Friday, July 28, 2017 2:14 PM > >> To: Qiang Zhao ; o...@buserror.net > >> Cc: valentin.longch...@keymile.com; linuxppc-dev@lists.ozlabs.org; > >> linux- ker...@vger.kernel.org; Qiang Zhao > >> Subject: Re: [PATCH v2] qe: fix compile issue for arm64 > >> > >> Zhao Qiang writes: > >> > >> > Signed-off-by: Zhao Qiang > >> > --- > >> > Changes for v2: > >> > - include all Errata QE_General4 in #ifdef > >> > > >> > drivers/soc/fsl/qe/qe.c | 2 ++ > >> > 1 file changed, 2 insertions(+) > >> > >> AFAICS this driver can only be built on PPC, what am I missing? > >> > >> config QUICC_ENGINE > >> bool "Freescale QUICC Engine (QE) Support" > >> depends on FSL_SOC && PPC32 > >> > >> cheers > > > > I sent another patchset to support it on arm64. > > Where? I don't see it. > > Shouldn't this patch be part of that series? Otherwise when that series is > merged > the build will break on arm64. > You are correct, thanks for your recommend. I will add this patch to the patchset. Thank you! BR Qiang Zhao
Re: [PATCH v4 1/5] powerpc/lib/sstep: Add cmpb instruction emulation
On Tue, Aug 1, 2017 at 10:44 PM, Segher Boessenkool wrote: > Hi! > > On Mon, Jul 31, 2017 at 10:58:22AM +1000, Matt Brown wrote: >> @@ -1049,6 +1065,10 @@ int analyse_instr(struct instruction_op *op, struct >> pt_regs *regs, >> do_cmp_unsigned(regs, val, val2, rd >> 2); >> goto instr_done; >> >> + case 508: /* cmpb */ >> + do_cmpb(regs, regs->gpr[rd], regs->gpr[rb], ra); >> + goto instr_done; > > Should this then be under an ifdef for 64-bit? I don't think so; the cmpb instruction should be available on both 32-bit and 64-bit. It isn't listed under the '64-bit Fixed-point Logical Instructions' section in the ISA either. Thanks, Matt > > > Segher
Re: [v5 2/2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome
Oh, one final thing - I just realised there's a .gitignore file in lib/raid6/.gitignore that needs to be updated to include the vpermxor generated files. That should be part of this patch. Regards, Daniel
Re: [v5 2/2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome
Hi Matt, > The raid6 Q syndrome check has been optimised using the vpermxor > instruction. Very much a nit, but normally we'd write the change that the patch makes as a command: "Optimise the raid6 Q syndrome generation using the vpermxor instruction" - see https://www.kernel.org/doc/html/v4.11/process/submitting-patches.html#describe-your-changes > +static void noinline raid6_vpermxor$#_gen_syndrome_real(int disks, size_t > bytes, > + void **ptrs) > +{ > + u8 **dptr = (u8 **)ptrs; > + u8 *p, *q; > + int d, z, z0; > + unative_t wp$$, wq$$, wd$$; > + > + z0 = disks - 3; /* Highest data disk */ > + p = dptr[z0+1]; /* XOR parity */ > + q = dptr[z0+2]; /* RS syndrome */ > + > + for (d = 0; d < bytes; d += NSIZE*$#) { > + wp$$ = wq$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; > + > + for (z = z0-1; z>=0; z--) { > + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; > + /* P syndrome */ > + wp$$ = vec_xor(wp$$, wd$$); > + > + /*Q syndrome */ > + asm("vpermxor %0,%1,%2,%3":"=v"(wq$$):"v"(gf_high), > "v"(gf_low), "v"(wq$$)); Initially I thought "why can't we break this over 2 lines?" and then I remembered that the awk script can't handle that. A space between /* and Q would be good though. > + wq$$ = vec_xor(wq$$, wd$$); I generated some of the unrolled code and inspected it. It's non-trivial to follow but that's justifiable, it's due to: - the complex maths - the unrolling process - consistency with the altivec code, which I think is worth keeping I am not sure how you could make it any easier to read, so I don't think that should block its acceptance into the kernel. I am confident that this code works correctly and as described. Reviewed-by: Daniel Axtens Regards, Daniel > -- > 2.9.3
Re: [v5 1/2] lib/raid6: Build proper files on corresponding arch
Hi Matt, > --- a/lib/raid6/test/Makefile > +++ b/lib/raid6/test/Makefile > @@ -44,10 +44,12 @@ else ifeq ($(HAS_NEON),yes) > CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1 > else > HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int > a;\n' |\ > - gcc -c -x c - >&/dev/null && \ > - rm ./-.o && echo yes) > + gcc -c -x c - >/dev/null && rm ./-.o && echo yes) From memory the change here (s/>&/>/) was necessary to get the build to succeed - did we ever figure out why that was? I'm not enough of a shell guru to grok the difference. If it's easy to explain it would be good to put it in the commit message, rather than just saying you fixed an unspecified bug. > ifeq ($(HAS_ALTIVEC),yes) > -OBJS += altivec1.o altivec2.o altivec4.o altivec8.o > +CFLAGS += -I../../../arch/powerpc/include > +CFLAGS += -DCONFIG_ALTIVEC > +OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \ > +vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o You've added vpermxor here, but you don't define them until the next patch, so the tests will fail. Please move the change to OBJS to the next patch. With that change, I'd be happy to formally Review this patch. Regards, Daniel > endif > endif > ifeq ($(ARCH),tilegx) > -- > 2.9.3
Re: [PATCH v1 1/3] arch/powerpc/set_memory: Implement set_memory_xx routines
Le 01/08/2017 à 13:25, Balbir Singh a écrit : Add support for set_memory_xx routines. With the STRICT_KERNEL_RWX feature support we got support for changing the page permissions for pte ranges. This patch adds support for both radix and hash so that we can change their permissions via set/clear masks. A new helper is required for hash (hash__change_memory_range() is changed to hash__change_boot_memory_range() as it deals with bolted PTE's). hash__change_memory_range() works with vmalloc'ed PAGE_SIZE requests for permission changes. hash__change_memory_range() does not invoke updatepp, instead it changes the software PTE and invalidates the PTE. For radix, radix__change_memory_range() is setup to do the right thing for vmalloc'd addresses. It takes a new parameter to decide what attributes to set. Signed-off-by: Balbir Singh--- arch/powerpc/include/asm/book3s/64/hash.h | 6 +++ arch/powerpc/include/asm/book3s/64/radix.h | 6 +++ arch/powerpc/include/asm/set_memory.h | 34 +++ arch/powerpc/mm/pgtable-hash64.c | 51 -- arch/powerpc/mm/pgtable-radix.c| 26 ++-- arch/powerpc/mm/pgtable_64.c | 68 ++ 6 files changed, 175 insertions(+), 16 deletions(-) create mode 100644 arch/powerpc/include/asm/set_memory.h [...] 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 0736e94..3ee4c7d 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -514,3 +514,71 @@ void mark_initmem_nx(void) hash__mark_initmem_nx(); } #endif + +#ifdef CONFIG_ARCH_HAS_SET_MEMORY +/* + * Some of these bits are taken from arm64/mm/page_attr.c + */ +static int change_memory_common(unsigned long addr, int numpages, + unsigned long set, unsigned long clear) +{ + unsigned long start = addr; + unsigned long size = PAGE_SIZE*numpages; + unsigned long end = start + size; + struct vm_struct *area; + + if (!PAGE_ALIGNED(addr)) { + start &= PAGE_MASK; + end = start + size; + WARN_ON_ONCE(1); + } Why not just set start = addr & PAGE_MASK, then just do WARN_ON_ONCE(start != addr), instead of that if () + + /* +* So check whether the [addr, addr + size) interval is entirely +* covered by precisely one VM area that has the VM_ALLOC flag set. +*/ + area = find_vm_area((void *)addr); + if (!area || + end > (unsigned long)area->addr + area->size || + !(area->flags & VM_ALLOC)) + return -EINVAL; + + if (!numpages) + return 0; Shouldn't that be tested earlier ? + + if (radix_enabled()) + return radix__change_memory_range(start, start + size, + set, clear); + else + return hash__change_memory_range(start, start + size, + set, clear); +} The following functions should go in a place common to PPC32 and PPC64, otherwise they will have to be duplicated when implementing for PPC32. Maybe the above function should also go in a common place, only the last part should remain in a PPC64 dedicated part. It could be called change_memory_range(), something like int change_memory_range(unsigned long start, unsigned long end, unsigned long set, unsigned long clear) { if (radix_enabled()) return radix__change_memory_range(start, end, set, clear); return hash__change_memory_range(start, end, set, clear); } Then change_memory_range() could also be implemented for PPC32 later. 
+ +int set_memory_ro(unsigned long addr, int numpages) +{ + return change_memory_common(addr, numpages, + 0, _PAGE_WRITE); +} +EXPORT_SYMBOL(set_memory_ro); Take care that _PAGE_WRITE has value 0 when _PAGE_RO instead of _PAGE_RW is defined (eg for the 8xx). It would be better to use accessors like pte_wrprotect() and pte_mkwrite() + +int set_memory_rw(unsigned long addr, int numpages) +{ + return change_memory_common(addr, numpages, + _PAGE_WRITE, 0); +} +EXPORT_SYMBOL(set_memory_rw); + +int set_memory_nx(unsigned long addr, int numpages) +{ + return change_memory_common(addr, numpages, + 0, _PAGE_EXEC); +} +EXPORT_SYMBOL(set_memory_nx); + +int set_memory_x(unsigned long addr, int numpages) +{ + return change_memory_common(addr, numpages, + _PAGE_EXEC, 0); +} +EXPORT_SYMBOL(set_memory_x); +#endif Christophe --- L'absence de virus dans ce courrier électronique a été vérifiée par le logiciel antivirus Avast.
Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?
On Mon, Jul 31, 2017 at 04:27:57PM +0100, Jonathan Cameron wrote: > On Mon, 31 Jul 2017 08:04:11 -0700 > "Paul E. McKenney"wrote: > > > On Mon, Jul 31, 2017 at 12:08:47PM +0100, Jonathan Cameron wrote: > > > On Fri, 28 Jul 2017 12:03:50 -0700 > > > "Paul E. McKenney" wrote: > > > > > > > On Fri, Jul 28, 2017 at 06:27:05PM +0100, Jonathan Cameron wrote: > > > > > On Fri, 28 Jul 2017 09:55:29 -0700 > > > > > "Paul E. McKenney" wrote: > > > > > > > > > > > On Fri, Jul 28, 2017 at 02:24:03PM +0100, Jonathan Cameron wrote: > > > > > > > > > > > > > On Fri, 28 Jul 2017 08:44:11 +0100 > > > > > > > Jonathan Cameron wrote: > > > > > > > > > > > > [ . . . ] > > > > > > > > > > > > > Ok. Some info. I disabled a few driver (usb and SAS) in the > > > > > > > interest of having > > > > > > > fewer timer events. Issue became much easier to trigger (on some > > > > > > > runs before > > > > > > > I could get tracing up and running) > > > > > > >e > > > > > > > So logs are large enough that pastebin doesn't like them - please > > > > > > > shoet if > > > > > > >>e another timer period is of interest. > > > > > > > > > > > > > > https://pastebin.com/iUZDfQGM for the timer trace. > > > > > > > https://pastebin.com/3w1F7amH for dmesg. > > > > > > > > > > > > > > The relevant timeout on the RCU stall detector was 8 seconds. > > > > > > > Event is > > > > > > > detected around 835. > > > > > > > > > > > > > > It's a lot of logs, so I haven't identified a smoking gun yet but > > > > > > > there > > > > > > > may well be one in there. > > > > > > > > > > > > The dmesg says: > > > > > > > > > > > > rcu_preempt kthread starved for 2508 jiffies! 
g112 c111 f0x0 > > > > > > RCU_GP_WAIT_FQS(3) ->state=0x1 > > > > > > > > > > > > So I look for "rcu_preempt" timer events and find these: > > > > > > > > > > > > rcu_preempt-9 [019] 827.579114: timer_init: > > > > > > timer=8017d5fc7da0 > > > > > > rcu_preempt-9 [019] d..1 827.579115: timer_start: > > > > > > timer=8017d5fc7da0 function=process_timeout > > > > > > > > > > > > Next look for "8017d5fc7da0" and I don't find anything else. > > > > > It does show up off the bottom of what would fit in pastebin... > > > > > > > > > > rcu_preempt-9 [001] d..1 837.681077: timer_cancel: > > > > > timer=8017d5fc7da0 > > > > > rcu_preempt-9 [001] 837.681086: timer_init: > > > > > timer=8017d5fc7da0 > > > > > rcu_preempt-9 [001] d..1 837.681087: timer_start: > > > > > timer=8017d5fc7da0 function=process_timeout expires=4295101298 > > > > > [timeout=1] cpu=1 idx=0 flags= > > > > > > > > Odd. I would expect an expiration... And ten seconds is way longer > > > > than the requested one jiffy! > > > > > > > > > > The timeout was one jiffy, and more than a second later, no > > > > > > expiration. > > > > > > Is it possible that this event was lost? I am not seeing any sign > > > > > > of > > > > > > this is the trace. > > > > > > > > > > > > I don't see any sign of CPU hotplug (and I test with lots of that in > > > > > > any case). > > > > > > > > > > > > The last time we saw something like this it was a timer HW/driver > > > > > > problem, > > > > > > but it is a bit hard to imagine such a problem affecting both ARM64 > > > > > > and SPARC. ;-) > > > > > Could be different issues, both of which were hidden by that lockup > > > > > detector. > > > > > > > > > > There is an errata work around for the timers on this particular > > > > > board. > > > > > I'm only vaguely aware of it, so may be unconnected. 
> > > > > > > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/clocksource/arm_arch_timer.c?h=v4.13-rc2=bb42ca47401010fc02901b5e8f79e40a26f208cb > > > > > > > > > > Seems unlikely though! + we've not yet seen it on the other chips that > > > > > errata effects (not that that means much). > > > > > > > > If you can reproduce quickly, might be worth trying anyway... > > > > > > > > Thanx, Paul > > > Errata fix is running already and was for all those tests. > > > > I was afraid of that... ;-) > It's a pretty rare errata it seems. Not actually managed to catch > one yet. > > > > > I'll have a dig into the timers today and see where I get to. > > > > Look forward to seeing what you find! > Nothing obvious turning up other than we don't seem to have issue > when we aren't running hrtimers. > > On a plus side I just got a report that it is effecting our d03 > boards which is good on the basis I couldn't tell what the difference > could be wrt to this issue! > > It indeed looks like we are consistently missing a timer before > the rcu splat occurs. And for my part, my tests with CONFIG_HZ_PERIODIC=y and
Re: [RFC v6 21/62] powerpc: introduce execute-only pkey
Michael Ellermanwrites: > Thiago Jung Bauermann writes: >> Ram Pai writes: > ... >>> + >>> + /* We got one, store it and use it from here on out */ >>> + if (need_to_set_mm_pkey) >>> + mm->context.execute_only_pkey = execute_only_pkey; >>> + return execute_only_pkey; >>> +} >> >> If you follow the code flow in __execute_only_pkey, the AMR and UAMOR >> are read 3 times in total, and AMR is written twice. IAMR is read and >> written twice. Since they are SPRs and access to them is slow (or isn't >> it?), > > SPRs read/writes are slow, but they're not *that* slow in comparison to > a system call (which I think is where this code is being called?). Yes, this code runs on mprotect and mmap syscalls if the memory is requested to have execute but not read nor write permissions. > So we should try to avoid too many SPR read/writes, but at the same time > we can accept more than the minimum if it makes the code much easier to > follow. Ok. Ram had asked me to suggest a way to optimize the SPR reads and writes and I came up with the patch below. Do you think it's worth it? The patch applies on top of this series, but if Ram includes it I think he would break it up and merge it into the other patches. -- Thiago Jung Bauermann IBM Linux Technology Center >From f6e73e67d325c4a1952c375072ca35156a9f2042 Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Mon, 31 Jul 2017 20:22:59 -0300 Subject: [PATCH] powerpc: Cache protection key registers in __execute_only_pkey Pass around a struct with the contents of AMR, IAMR and AMOR, as well as flags indicating whether those fields hold valid values and whether they should be committed back to the registers. 
Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/pkeys.h | 18 -- arch/powerpc/mm/pkeys.c | 120 +-- 2 files changed, 104 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index e61ed6c332db..66f15dbc5855 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -129,12 +129,15 @@ static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) mm_set_pkey_is_allocated(mm, pkey)); } -extern void __arch_activate_pkey(int pkey); +struct pkey_regs_cache; + +extern void __arch_activate_pkey(int pkey, struct pkey_regs_cache *regs); extern void __arch_deactivate_pkey(int pkey); /* * Returns a positive, 5-bit key on success, or -1 on failure. */ -static inline int mm_pkey_alloc(struct mm_struct *mm) +static inline int __mm_pkey_alloc(struct mm_struct *mm, + struct pkey_regs_cache *regs) { /* * Note: this is the one and only place we make sure @@ -162,10 +165,15 @@ static inline int mm_pkey_alloc(struct mm_struct *mm) * enable the key in the hardware */ if (ret > 0) - __arch_activate_pkey(ret); + __arch_activate_pkey(ret, regs); return ret; } +static inline int mm_pkey_alloc(struct mm_struct *mm) +{ + return __mm_pkey_alloc(mm, NULL); +} + static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!pkey_inited) @@ -206,13 +214,13 @@ static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, } extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val); + unsigned long init_val, struct pkey_regs_cache *regs); static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { if (!pkey_inited) return -EINVAL; - return __arch_set_user_pkey_access(tsk, pkey, init_val); + return __arch_set_user_pkey_access(tsk, pkey, init_val, NULL); } static inline bool arch_pkeys_enabled(void) diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c index 
1424c79f45f6..718ea23f8184 100644 --- a/arch/powerpc/mm/pkeys.c +++ b/arch/powerpc/mm/pkeys.c @@ -22,52 +22,92 @@ u32 initial_allocation_mask; /* bits set for reserved keys */ #define PKEY_REG_BITS (sizeof(u64)*8) #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY)) -static bool is_pkey_enabled(int pkey) +/* + * The registers controlling memory protection keys are expensive to access, so + * we want to cache their values in code paths that might need to use them more + * than once. + */ +struct pkey_regs_cache { + u64 amr; + u64 iamr; + u64 uamor; + + bool amr_valid; + bool iamr_valid; + bool uamor_valid; + + bool write_amr; + bool write_iamr; + bool write_uamor; +}; + +static bool is_pkey_enabled(int pkey, struct pkey_regs_cache *regs) { - return !!(read_uamor() & (0x3ul <<
Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage
On 08/01/2017 12:55 AM, Michael Ellerman wrote: > Jens Axboe writes: > ... >> >> Can you try the below fix? Should be more palatable than the previous >> one. Brian, maybe you can take a look at the IRQ issue mentioned above? > > Given the patch from Brian fixed the lockdep warning, do you still want > me to try and test this one? Nope, we don't have to do that. I'd much rather just add a WARN_ON() or similar to make sure we catch buggy users earlier. scsi_run_queue() needs a WARN_ON(in_interrupt()); but it might be better to put that in __blk_mq_run_hw_queue(). -- Jens Axboe
[PATCH] ipr: Fix scsi-mq lockdep issue
Fixes the following lockdep warning that can occur when scsi-mq is enabled with ipr due to ipr calling scsi_unblock_requests from irq context. The fix is to move the call to scsi_unblock_requests to ipr's existing workqueue. stack backtrace: CPU: 28 PID: 0 Comm: swapper/28 Not tainted 4.13.0-rc2-gcc6x-gf74c89b #1 Call Trace: [c01fffe97550] [c0b50818] dump_stack+0xe8/0x160 (unreliable) [c01fffe97590] [c01586d0] print_usage_bug+0x2d0/0x390 [c01fffe97640] [c0158f34] mark_lock+0x7a4/0x8e0 [c01fffe976f0] [c015a000] __lock_acquire+0x6a0/0x1a70 [c01fffe97860] [c015befc] lock_acquire+0xec/0x2e0 [c01fffe97930] [c0b71514] _raw_spin_lock+0x44/0x70 [c01fffe97960] [c05b60f4] blk_mq_sched_dispatch_requests+0xa4/0x2a0 [c01fffe979c0] [c05acac0] __blk_mq_run_hw_queue+0x100/0x2c0 [c01fffe97a00] [c05ad478] __blk_mq_delay_run_hw_queue+0x118/0x130 [c01fffe97a40] [c05ad61c] blk_mq_start_hw_queues+0x6c/0xa0 [c01fffe97a80] [c0797aac] scsi_kick_queue+0x2c/0x60 [c01fffe97aa0] [c0797cf0] scsi_run_queue+0x210/0x360 [c01fffe97b10] [c079b888] scsi_run_host_queues+0x48/0x80 [c01fffe97b40] [c07b6090] ipr_ioa_bringdown_done+0x70/0x1e0 [c01fffe97bc0] [c07bc860] ipr_reset_ioa_job+0x80/0xf0 [c01fffe97bf0] [c07b4d50] ipr_reset_timer_done+0xd0/0x100 [c01fffe97c30] [c01937bc] call_timer_fn+0xdc/0x4b0 [c01fffe97cf0] [c0193d08] expire_timers+0x178/0x330 [c01fffe97d60] [c01940c8] run_timer_softirq+0xb8/0x120 [c01fffe97de0] [c0b726a8] __do_softirq+0x168/0x6d8 [c01fffe97ef0] [c00df2c8] irq_exit+0x108/0x150 [c01fffe97f10] [c0017bf4] __do_irq+0x2a4/0x4a0 [c01fffe97f90] [c002da50] call_do_irq+0x14/0x24 [c007fad93aa0] [c0017e8c] do_IRQ+0x9c/0x140 [c007fad93af0] [c0008b98] hardware_interrupt_common+0x138/0x140 Reported-by: Michael EllermanSigned-off-by: Brian King --- Index: linux-2.6.git/drivers/scsi/ipr.c === --- linux-2.6.git.orig/drivers/scsi/ipr.c +++ linux-2.6.git/drivers/scsi/ipr.c @@ -3351,6 +3351,16 @@ static void ipr_worker_thread(struct wor return; } + if (ioa_cfg->scsi_unblock) { + 
ioa_cfg->scsi_unblock = 0; + ioa_cfg->scsi_blocked = 0; + spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); + scsi_unblock_requests(ioa_cfg->host); + spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags); + if (ioa_cfg->scsi_blocked) + scsi_block_requests(ioa_cfg->host); + } + if (!ioa_cfg->scan_enabled) { spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags); return; @@ -7211,9 +7221,8 @@ static int ipr_ioa_bringdown_done(struct ENTER; if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) { ipr_trace; - spin_unlock_irq(ioa_cfg->host->host_lock); - scsi_unblock_requests(ioa_cfg->host); - spin_lock_irq(ioa_cfg->host->host_lock); + ioa_cfg->scsi_unblock = 1; + schedule_work(_cfg->work_q); } ioa_cfg->in_reset_reload = 0; @@ -7287,13 +7296,7 @@ static int ipr_ioa_reset_done(struct ipr list_add_tail(_cmd->queue, _cmd->hrrq->hrrq_free_q); wake_up_all(_cfg->reset_wait_q); - spin_unlock(ioa_cfg->host->host_lock); - scsi_unblock_requests(ioa_cfg->host); - spin_lock(ioa_cfg->host->host_lock); - - if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].allow_cmds) - scsi_block_requests(ioa_cfg->host); - + ioa_cfg->scsi_unblock = 1; schedule_work(_cfg->work_q); LEAVE; return IPR_RC_JOB_RETURN; @@ -9249,8 +9252,11 @@ static void _ipr_initiate_ioa_reset(stru spin_unlock(_cfg->hrrq[i]._lock); } wmb(); - if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) + if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) { + ioa_cfg->scsi_unblock = 0; + ioa_cfg->scsi_blocked = 1; scsi_block_requests(ioa_cfg->host); + } ipr_cmd = ipr_get_free_ipr_cmnd(ioa_cfg); ioa_cfg->reset_cmd = ipr_cmd; @@ -9306,9 +9312,8 @@ static void ipr_initiate_ioa_reset(struc wake_up_all(_cfg->reset_wait_q); if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) { - spin_unlock_irq(ioa_cfg->host->host_lock); - scsi_unblock_requests(ioa_cfg->host); - spin_lock_irq(ioa_cfg->host->host_lock); + ioa_cfg->scsi_unblock = 1; +
Re: [RFC PATCH] powerpc: Disabling MEMORY_HOTPLUG_DEFAULT_ONLINE option for PPC64 arch
On 08/01/2017 11:05 AM, Nathan Fontenot wrote: On 08/01/2017 04:59 AM, Michael Ellerman wrote: Daniel Henrique Barbozawrites: Commit 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") reverted the auto-online feature for pseries due to problems with LMB removals not updating the device struct properly. Among other things, this commit made the following change in arch/powerpc/configs/pseries_defconfig: @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y CONFIG_IRQ_ALL_CPUS=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y CONFIG_KSM=y The intent was to disable the option in the defconfig of pseries, since after that the code doesn't have this support anymore. It's always polite to Cc the author of a commit you're referring to, so I added Nathan. Noted. Thanks for adding Nathan in the CC. The intention when we merged that fix was that the auto-online code would be "fixed" to mark the device online. I say "fixed" because it wasn't entirely clear if that was the correct behaviour, though it definitely seemed like it should be. I've lost track of where/if the discussion got to on whether the auto-online code should do that or not. Did anything get resolved? I think, though I should go back and test to be sure, that everything works in the latest mainline code. The issue causing this to be a problem was in the original implementation of auto_online support. If you wanted to auto online memory, the code was calling memory_block_change_state(). This worked but did not update the device struct for each of the memory block that was online'ed such that dev->offline == true even after the memory was online. I sent a patch earlier this year (commit dc18d706a436) that corrected this to call device_online() instead of memory_block_change_state(). With this fix (appears to have gone into the 4.11 kernel) it should be possible to use auto_online on power systems. 
Commit dc18d706a436 was present in the 4.11 kernels that experiences this issue (Fedora 26 and Ubuntu 17.10 in my tests). So I am not entirely sure that we can use auto_online on power systems, at least in those kernels. At this point I don't think we need this patch to disable auto online for ppc64. I would be curious if this is still broken with the latest mainline code though. If the auto_online feature is already working in upstream 4.13 kernel then I don't see a reason to apply this patch either. We can leave it as a FYI/reminder of a problem that was happening in 4.11 and got solved later on. Thanks, Daniel -Nathan However, this change alone isn't enough to prevent situations such as [1], where distros can enable the option unaware of the consequences of doing it (e.g. breaking LMB hotplug altogether). Instead of relying on all distros knowing that pseries can't handle CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y after 943db62c316c, this patch changes mm/Kconfig to make the MEMORY_HOTPLUG_DEFAULT_ONLINE config unavailable for the PPC64 arch. [1] https://bugzilla.redhat.com/show_bug.cgi?id=1476380 Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'") Signed-off-by: Daniel Henrique Barboza --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) I don't own that file, so we at least need an Ack from the mm folks. cheers diff --git a/mm/Kconfig b/mm/Kconfig index 48b1af4..a342c77 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -169,7 +169,7 @@ config MEMORY_HOTPLUG_SPARSE config MEMORY_HOTPLUG_DEFAULT_ONLINE bool "Online the newly added memory blocks by default" default n -depends on MEMORY_HOTPLUG +depends on MEMORY_HOTPLUG && !PPC64 help This option sets the default policy setting for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks) which -- 2.9.4
Re: [PATCH 2/3] powerpc/xmon: Disable and enable tracing command
Hi Naveen, On Tue, Aug 01, 2017 at 12:10:24PM +0530, Naveen N. Rao wrote: > On 2017/07/31 02:22PM, Breno Leitao wrote: > > If tracing is enabled and you get into xmon, the tracing buffer > > continues to be updated, causing possible loss of data due to buffer > > overflow and unnecessary tracing information coming from xmon functions. > > > > This patch adds a new option that allows the tracing to be disabled and > > re-enabled from inside xmon. > > How is this new option useful? In the next patch, you disable tracing by > default -- in what scenario do you expect to have to re-enable tracing > from within xmon? I see it being useful in two different scenarios: 1) You can reenable tracing if you want to call a function from xmon (with 'p'), or even for code stepping (with 's'). 2) You may also want to reenable tracing once you resume from xmon with 'zr'. > > + case 'v': > > + if (tracing_is_on()) { > > + printk("Disabling tracing\n"); > > + tracing_enabled = 0; > > + tracing_off(); > > This only disables trace buffer updates - ftrace (and all its callbacks, > et al) remains active, which isn't desirable. Why isn't it desirable? In fact, I thought it would be *the* desirable function to call, since, unlike fully disabling tracing, it does not do a lot of work in xmon mode; it just stops the tracing buffer from being updated. Since we are in xmon, we are in a very bad state, and something went very wrong. Disabling the whole tracing might not be what we want to do in this scenario, since it can hit the broken subsystem causing xmon to fail. For a bad-state scenario, I understand that it is desirable to be as unintrusive as possible, and tracing_off() does exactly that. > Can you see if this works for you: > https://patchwork.ozlabs.org/patch/769611/ Well, I understand that this patch solves a different issue; it does not reduce the tracing caused by the function tracer after you get into xmon.
For example, with your patch applied, I can see a lot of xmon functions polluting the tracing buffer, such as: 1:mon> dt [ 359.196593] Dumping ftrace buffer: [ 359.196689] - [ 359.196904] 1) | xmon_printf() { <110+ lines snipped> [ 359.197727] 1) + 22.930 us | } [ 359.199405] 1) | skipbl() { <50+ lines snipped> [ 359.225069] 1) + 23.750 us | } Since tracing continues to be enabled during xmon, these messages continue to show up. That is exactly what I am trying to avoid with this current patchset. Avoiding all xmon-related tracing is my main goal. Thanks for your review, Breno
Re: [RFC PATCH] powerpc: Disabling MEMORY_HOTPLUG_DEFAULT_ONLINE option for PPC64 arch
On 08/01/2017 04:59 AM, Michael Ellerman wrote: > Daniel Henrique Barbozawrites: > >> Commit 943db62c316c ("powerpc/pseries: Revert 'Auto-online >> hotplugged memory'") reverted the auto-online feature for pseries due >> to problems with LMB removals not updating the device struct properly. >> Among other things, this commit made the following change in >> arch/powerpc/configs/pseries_defconfig: >> >> @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y >> CONFIG_IRQ_ALL_CPUS=y >> CONFIG_MEMORY_HOTPLUG=y >> CONFIG_MEMORY_HOTREMOVE=y >> -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y >> CONFIG_KSM=y >> >> The intent was to disable the option in the defconfig of pseries, since >> after that the code doesn't have this support anymore. > > It's always polite to Cc the author of a commit you're referring to, so > I added Nathan. > > The intention when we merged that fix was that the auto-online code > would be "fixed" to mark the device online. I say "fixed" because it > wasn't entirely clear if that was the correct behaviour, though it > definitely seemed like it should be. > > I've lost track of where/if the discussion got to on whether the > auto-online code should do that or not. Did anything get resolved? I think, though I should go back and test to be sure, that everything works in the latest mainline code. The issue causing this to be a problem was in the original implementation of auto_online support. If you wanted to auto online memory, the code was calling memory_block_change_state(). This worked but did not update the device struct for each of the memory block that was online'ed such that dev->offline == true even after the memory was online. I sent a patch earlier this year (commit dc18d706a436) that corrected this to call device_online() instead of memory_block_change_state(). With this fix (appears to have gone into the 4.11 kernel) it should be possible to use auto_online on power systems. At this point I don't think we need this patch to disable auto online for ppc64. 
I would be curious if this is still broken with the latest mainline code though. -Nathan > >> However, this change >> alone isn't enough to prevent situations such as [1], where >> distros can enable the option unaware of the consequences of >> doing it (e.g. breaking LMB hotplug altogether). >> >> Instead of relying on all distros knowing that pseries can't handle >> CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y after 943db62c316c, this patch >> changes mm/Kconfig to make the MEMORY_HOTPLUG_DEFAULT_ONLINE config >> unavailable for the PPC64 arch. >> >> [1] https://bugzilla.redhat.com/show_bug.cgi?id=1476380 >> >> Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged >> memory'") >> Signed-off-by: Daniel Henrique Barboza >> --- >> mm/Kconfig | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) > > I don't own that file, so we at least need an Ack from the mm folks. > > cheers > >> diff --git a/mm/Kconfig b/mm/Kconfig >> index 48b1af4..a342c77 100644 >> --- a/mm/Kconfig >> +++ b/mm/Kconfig >> @@ -169,7 +169,7 @@ config MEMORY_HOTPLUG_SPARSE >> config MEMORY_HOTPLUG_DEFAULT_ONLINE >> bool "Online the newly added memory blocks by default" >> default n >> -depends on MEMORY_HOTPLUG >> +depends on MEMORY_HOTPLUG && !PPC64 >> help >>This option sets the default policy setting for memory hotplug >>onlining policy (/sys/devices/system/memory/auto_online_blocks) which >> -- >> 2.9.4 >
[PATCH] powerpc/64: Fix __check_irq_replay missing decrementer interrupt
If the decrementer wraps and de-asserts the decrementer exception while hard-disabled, __check_irq_replay has a test to notice the wrap when interrupts are re-enabled. The decrementer check must be done when clearing the PACA_IRQ_HARD_DIS flag, not when the PACA_IRQ_DEC flag is tested. Previously this worked because the decrementer interrupt was always the first one checked after clearing the hard disable flag, but HMI check was moved ahead of that, which introduced this bug. This can cause a missed decrementer interrupt if we soft-disable interrupts then take an HMI which is recorded in irq_happened, then hard-disable interrupts for > 4s to wrap the decrementer. Fixes: e0e0d6b739 ("powerpc/64: Replay hypervisor maintenance interrupt first") Signed-off-by: Nicholas Piggin--- arch/powerpc/kernel/irq.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0bcec745a672..f291f7826abc 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -145,6 +145,19 @@ notrace unsigned int __check_irq_replay(void) /* Clear bit 0 which we wouldn't clear otherwise */ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + if (happened & PACA_IRQ_HARD_DIS) { + /* +* We may have missed a decrementer interrupt if hard disabled. +* Check the decrementer register in case we had a rollover +* while hard disabled. +*/ + if (!(happened & PACA_IRQ_DEC)) { + if (decrementer_check_overflow()) { + local_paca->irq_happened |= PACA_IRQ_DEC; + happened |= PACA_IRQ_DEC; + } + } + } /* * Force the delivery of pending soft-disabled interrupts on PS3. @@ -170,7 +183,7 @@ notrace unsigned int __check_irq_replay(void) * in case we also had a rollover while hard disabled */ local_paca->irq_happened &= ~PACA_IRQ_DEC; - if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow()) + if (happened & PACA_IRQ_DEC) return 0x900; /* Finally check if an external interrupt happened */ -- 2.11.0
Re: [PATCH v2] powerpc/powernv: Use darn instr for random_seed on p9
On Mon, Jul 31, 2017 at 07:10:15PM +1000, Michael Ellerman wrote: > And ___PPC_RA() is not quite right. The L field is only 2 bits wide, not > the 5 that ___PPC_RA() allows. > > We don't have a __PPC_L() macro, because L fields vary in size and > location. So I think you're best off open-coding it, e.g.: > > +#define PPC_DARN(t, l) stringify_in_c(.long PPC_INST_DARN | \ > + __PPC_RT(t)| \ > + (((l) & 0x3) << 16)) It would be better if you could do a compile-time error if the L value is out of range. Hrm, nothing else does such checking either? Segher
Re: [PATCH v4 1/5] powerpc/lib/sstep: Add cmpb instruction emulation
Hi! On Mon, Jul 31, 2017 at 10:58:22AM +1000, Matt Brown wrote: > @@ -1049,6 +1065,10 @@ int analyse_instr(struct instruction_op *op, struct > pt_regs *regs, > do_cmp_unsigned(regs, val, val2, rd >> 2); > goto instr_done; > > + case 508: /* cmpb */ > + do_cmpb(regs, regs->gpr[rd], regs->gpr[rb], ra); > + goto instr_done; Should this then be under an ifdef for 64-bit? Segher
Re: [PATCH] drivers: cpuidle: Disable preemption before get_lppaca function call in pseries_idle_probe function
Em 2017-07-20 18:21, Benjamin Herrenschmidt escreveu: On Thu, 2017-07-20 at 14:57 -0300, Victor Aoqui wrote: When CONFIG_PREEMPT=y, the following warning shows up: BUG: using smp_processor_id() in preemptible [] code: swapper/0/1 caller is pseries_processor_idle_init+0x58/0x21c This warning shows up because preemption cannot occur when using get_paca(), otherwise the paca_struct it points to may be the wrong one just after. For this reason, preemption needs to be disabled before lppaca_shared_proc(get_lppaca()). Also check the generated assembly. We had all sorts of interesting issues where gcc would copy the paca pointer or the lppaca pointer to a GPR *outside* of the preempt disabled section... In that specific case it's not a big deal but overall, I am not comfortable with PREEMPT on powerpc until we do something a bit more drastic... I would like to remove all such direct accesses to paca, instead have a "new" get_paca() written in asm that does the preempt disable then returns the PACA in a GPR (not directly use r13, hide that from gcc), and which is paired with a put_paca(). The few places where we want to directly access r13 should be hand written in asm too to hide r13 from gcc, for accessing the irq_happened in the fast path of local_irq_enable/disable/... we should do the same with lock tokens. Ben. Hi Benjamin, Sorry for the delay. I was a little bit busy these last few days. I took note of your comments and I will work on those changes. I will let you know soon when it's done. Thanks -- Victor Aoqui
[PATCH v2 4/4] powerpc: add irq accounting for watchdog interrupts
This adds an irq counter for the watchdog soft-NMI. This interrupt only fires when interrupts are soft-disabled, so it will not increment much even when the watchdog is running. However it's useful for debugging and sanity checking. Signed-off-by: Nicholas Piggin--- arch/powerpc/include/asm/hardirq.h | 3 +++ arch/powerpc/kernel/irq.c | 10 ++ 2 files changed, 13 insertions(+) diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h index 64b73b03d473..c97603d617e3 100644 --- a/arch/powerpc/include/asm/hardirq.h +++ b/arch/powerpc/include/asm/hardirq.h @@ -13,6 +13,9 @@ typedef struct { unsigned int spurious_irqs; unsigned int hmi_exceptions; unsigned int sreset_irqs; +#ifdef CONFIG_PPC_WATCHDOG + unsigned int soft_nmi_irqs; +#endif #ifdef CONFIG_PPC_DOORBELL unsigned int doorbell_irqs; #endif diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c18335580b6..77a7f7514327 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -475,6 +475,13 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs); seq_printf(p, " System Reset interrupts\n"); +#ifdef CONFIG_PPC_WATCHDOG + seq_printf(p, "%*s: ", prec, "WDG"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs); + seq_printf(p, " Watchdog soft-NMI interrupts\n"); +#endif + #ifdef CONFIG_PPC_DOORBELL if (cpu_has_feature(CPU_FTR_DBELL)) { seq_printf(p, "%*s: ", prec, "DBL"); @@ -500,6 +507,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += per_cpu(irq_stat, cpu).timer_irqs_others; sum += per_cpu(irq_stat, cpu).hmi_exceptions; sum += per_cpu(irq_stat, cpu).sreset_irqs; +#ifdef CONFIG_PPC_WATCHDOG + sum += per_cpu(irq_stat, cpu).soft_nmi_irqs; +#endif #ifdef CONFIG_PPC_DOORBELL sum += per_cpu(irq_stat, cpu).doorbell_irqs; #endif -- 2.11.0
[PATCH v2 3/4] powerpc: add irq accounting for system reset interrupts
Signed-off-by: Nicholas Piggin--- arch/powerpc/include/asm/hardirq.h | 1 + arch/powerpc/kernel/irq.c | 6 ++ arch/powerpc/kernel/traps.c| 2 ++ arch/powerpc/kernel/watchdog.c | 3 +++ 4 files changed, 12 insertions(+) diff --git a/arch/powerpc/include/asm/hardirq.h b/arch/powerpc/include/asm/hardirq.h index 8add8b861e8d..64b73b03d473 100644 --- a/arch/powerpc/include/asm/hardirq.h +++ b/arch/powerpc/include/asm/hardirq.h @@ -12,6 +12,7 @@ typedef struct { unsigned int mce_exceptions; unsigned int spurious_irqs; unsigned int hmi_exceptions; + unsigned int sreset_irqs; #ifdef CONFIG_PPC_DOORBELL unsigned int doorbell_irqs; #endif diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0bcec745a672..5c18335580b6 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -470,6 +470,11 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Hypervisor Maintenance Interrupts\n"); } + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs); + seq_printf(p, " System Reset interrupts\n"); + #ifdef CONFIG_PPC_DOORBELL if (cpu_has_feature(CPU_FTR_DBELL)) { seq_printf(p, "%*s: ", prec, "DBL"); @@ -494,6 +499,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += per_cpu(irq_stat, cpu).spurious_irqs; sum += per_cpu(irq_stat, cpu).timer_irqs_others; sum += per_cpu(irq_stat, cpu).hmi_exceptions; + sum += per_cpu(irq_stat, cpu).sreset_irqs; #ifdef CONFIG_PPC_DOORBELL sum += per_cpu(irq_stat, cpu).doorbell_irqs; #endif diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5adfea2dc822..6a892ca7bf18 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -288,6 +288,8 @@ void system_reset_exception(struct pt_regs *regs) if (!nested) nmi_enter(); + __this_cpu_inc(irq_stat.sreset_irqs); + /* See if any machine dependent calls */ if (ppc_md.system_reset_exception) { if (ppc_md.system_reset_exception(regs)) diff --git 
a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index b67f8b03a32d..4b9a567c9975 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -204,6 +204,9 @@ void soft_nmi_interrupt(struct pt_regs *regs) return; nmi_enter(); + + __this_cpu_inc(irq_stat.soft_nmi_irqs); + tb = get_tb(); if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { per_cpu(wd_timer_tb, cpu) = tb; -- 2.11.0
[PATCH v2 2/4] powerpc: Fix powerpc-specific watchdog build configuration
The powerpc kernel/watchdog.o should be built when HARDLOCKUP_DETECTOR and HAVE_HARDLOCKUP_DETECTOR_ARCH are both selected. If only the former is selected, then the generic perf watchdog has been selected. To simplify this check, introduce a new Kconfig symbol PPC_WATCHDOG that depends on both. This Kconfig option means the powerpc specific watchdog is enabled. Without this patch, Book3E will attempt to build the powerpc watchdog. Fixes: 2104180a53 ("powerpc/64s: implement arch-specific hardlockup watchdog") Signed-off-by: Nicholas Piggin--- arch/powerpc/Kconfig | 11 +++ arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/exceptions-64s.S | 6 +++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 36f858c37ca7..2a5060aa1674 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -85,6 +85,17 @@ config NMI_IPI depends on SMP && (DEBUGGER || KEXEC_CORE || HARDLOCKUP_DETECTOR) default y +config PPC_WATCHDOG + bool + depends on HARDLOCKUP_DETECTOR + depends on HAVE_HARDLOCKUP_DETECTOR_ARCH + default y + help + This is a placeholder when the powerpc hardlockup detector + watchdog is selected (arch/powerpc/kernel/watchdog.c). It is + seleted via the generic lockup detector menu which is why we + have no standalone config option for it here. 
+ config STACKTRACE_SUPPORT bool default y diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 4aa7c147e447..5622bd0248e5 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -38,7 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ -obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_power.o diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 9029afd1fa2a..48aaca3e0b20 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1314,7 +1314,7 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif -#if defined(CONFIG_HARDLOCKUP_DETECTOR) && defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH) +#ifdef CONFIG_PPC_WATCHDOG #define MASKED_DEC_HANDLER_LABEL 3f @@ -1335,10 +1335,10 @@ EXC_COMMON_BEGIN(soft_nmi_common) ADD_NVGPRS;ADD_RECONCILE) b ret_from_except -#else +#else /* CONFIG_PPC_WATCHDOG */ #define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ #define MASKED_DEC_HANDLER(_H) -#endif +#endif /* CONFIG_PPC_WATCHDOG */ /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: -- 2.11.0
[PATCH v2 1/4] powerpc/64s: fix mce accounting for powernv
--- arch/powerpc/kernel/traps.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index bfcfd9ef09f2..5adfea2dc822 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -755,7 +755,14 @@ void machine_check_exception(struct pt_regs *regs) enum ctx_state prev_state = exception_enter(); int recover = 0; +#ifdef CONFIG_PPC_BOOK3S_64 + /* 64s accounts the mce in machine_check_early when in HVMODE */ + if (!cpu_has_feature(CPU_FTR_HVMODE)) + __this_cpu_inc(irq_stat.mce_exceptions); +#else __this_cpu_inc(irq_stat.mce_exceptions); +#endif + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); -- 2.11.0
[PATCH v2 0/4] powerpc: improve accounting of non maskable interrupts
This is the same as the last patch, but broken out and in the process of making ifdefs nicer, also found and fixed a watchdog build bug in patch 2. Patches 1-2 are fixes that should go to 4.13. Patches 3-4 are probably simple enough they could also go to 4.13. Nicholas Piggin (4): powerpc/64s: fix mce accounting for powernv powerpc: fix powerpc-specific watchdog build configuration powerpc: add irq accounting for system reset interrupts powerpc: add irq accounting for watchdog interrupts arch/powerpc/Kconfig | 11 +++ arch/powerpc/include/asm/hardirq.h | 4 arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/exceptions-64s.S | 6 +++--- arch/powerpc/kernel/irq.c| 16 arch/powerpc/kernel/traps.c | 9 + arch/powerpc/kernel/watchdog.c | 3 +++ 7 files changed, 47 insertions(+), 4 deletions(-) -- 2.11.0
[PATCH v1 3/3] arch/powerpc/net/bpf: Basic EBPF support
Signed-off-by: Balbir Singh--- arch/powerpc/net/bpf_jit_comp64.c | 13 + 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 861c5af..d81110e 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1054,6 +1054,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) fp->jited = 1; fp->jited_len = alloclen; + bpf_jit_binary_lock_ro(bpf_hdr); bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); out: @@ -1064,15 +1065,3 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) return fp; } - -/* Overriding bpf_jit_free() as we don't set images read-only. */ -void bpf_jit_free(struct bpf_prog *fp) -{ - unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; - struct bpf_binary_header *bpf_hdr = (void *)addr; - - if (fp->jited) - bpf_jit_binary_free(bpf_hdr); - - bpf_prog_unlock_free(fp); -} -- 2.9.4
[PATCH v1 2/3] Enable ARCH_HAS_SET_MEMORY
Signed-off-by: Balbir Singh--- arch/powerpc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b5b8ba8..7be710d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -165,6 +165,7 @@ config PPC select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK + select ARCH_HAS_SET_MEMORY if (PPC_BOOK3S_64) select ARCH_HAS_STRICT_KERNEL_RWX if (PPC_BOOK3S_64 && !HIBERNATION) select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select HAVE_CBPF_JITif !PPC64 -- 2.9.4
[PATCH v1 1/3] arch/powerpc/set_memory: Implement set_memory_xx routines
Add support for set_memory_xx routines. With the STRICT_KERNEL_RWX feature support we got support for changing the page permissions for pte ranges. This patch adds support for both radix and hash so that we can change their permissions via set/clear masks. A new helper is required for hash (hash__change_memory_range() is changed to hash__change_boot_memory_range() as it deals with bolted PTE's). hash__change_memory_range() works with vmalloc'ed PAGE_SIZE requests for permission changes. hash__change_memory_range() does not invoke updatepp, instead it changes the software PTE and invalidates the PTE. For radix, radix__change_memory_range() is setup to do the right thing for vmalloc'd addresses. It takes a new parameter to decide what attributes to set. Signed-off-by: Balbir Singh--- arch/powerpc/include/asm/book3s/64/hash.h | 6 +++ arch/powerpc/include/asm/book3s/64/radix.h | 6 +++ arch/powerpc/include/asm/set_memory.h | 34 +++ arch/powerpc/mm/pgtable-hash64.c | 51 -- arch/powerpc/mm/pgtable-radix.c| 26 ++-- arch/powerpc/mm/pgtable_64.c | 68 ++ 6 files changed, 175 insertions(+), 16 deletions(-) create mode 100644 arch/powerpc/include/asm/set_memory.h diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 36fc7bf..65003c9 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -94,6 +94,12 @@ extern void hash__mark_rodata_ro(void); extern void hash__mark_initmem_nx(void); #endif +/* + * For set_memory_* + */ +extern int hash__change_memory_range(unsigned long start, unsigned long end, +unsigned long set, unsigned long clear); + extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long pte, int huge); extern unsigned long htab_convert_pte_flags(unsigned long pteflags); diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 50b..5ca0636 100644 --- 
a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -121,6 +121,12 @@ extern void radix__mark_rodata_ro(void); extern void radix__mark_initmem_nx(void); #endif +/* + * For set_memory_* + */ +extern int radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long set, unsigned long clear); + static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, unsigned long set) { diff --git a/arch/powerpc/include/asm/set_memory.h b/arch/powerpc/include/asm/set_memory.h new file mode 100644 index 000..b19c67c --- /dev/null +++ b/arch/powerpc/include/asm/set_memory.h @@ -0,0 +1,34 @@ +/* + * set_memory.h + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Balbir Singh + */ + +#ifndef __ASM_SET_MEMORY_H +#define __ASM_SET_MEMORY_H + +/* + * Functions to change memory attributes. 
+ */ +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); + +#endif diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 656f7f3..db5b477 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -424,9 +424,52 @@ int hash__has_transparent_hugepage(void) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +/* + * This routine will change pte protection only for vmalloc'd + * PAGE_SIZE pages, do not invoke for bolted pages + */ +int hash__change_memory_range(unsigned long start, unsigned long end, + unsigned long set, unsigned long clear) +{ + unsigned long idx; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; + + start = ALIGN_DOWN(start, PAGE_SIZE); + end = PAGE_ALIGN(end); // aligns up + + /* +* Update the software PTE and flush the entry. +* This should cause a new fault with the
[PATCH v1 0/3] Implement set_memory_xx for ppc64 book3s
After implementing STRICT_KERNEL_RWX, it turns out that implementing set_memory_ro/rw/x/nx is quite easy. The first patch is applied on top (http://patchwork.ozlabs.org/patch/795745/). The first patch implements the various routines, the second patch enables ARCH_HAS_SET_MEMORY for PPC_BOOK3S_64 and the third patch enables the BPF infrastructure to use the set_memory_ro and set_memory_rw routines. Balbir Singh (3): arch/powerpc/set_memory: Implement set_memory_xx routines Enable ARCH_HAS_SET_MEMORY arch/powerpc/net/bpf: Basic EBPF support arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/hash.h | 6 +++ arch/powerpc/include/asm/book3s/64/radix.h | 6 +++ arch/powerpc/include/asm/set_memory.h | 34 +++ arch/powerpc/mm/pgtable-hash64.c | 51 -- arch/powerpc/mm/pgtable-radix.c| 26 ++-- arch/powerpc/mm/pgtable_64.c | 68 ++ arch/powerpc/net/bpf_jit_comp64.c | 13 +- 8 files changed, 177 insertions(+), 28 deletions(-) create mode 100644 arch/powerpc/include/asm/set_memory.h -- 2.9.4
Re: [v3 PATCH 1/2] powernv/powerpc:Save/Restore additional SPRs for stop4 cpuidle
"Gautham R. Shenoy"writes: > > Subject: [v3 PATCH 1/2] powernv/powerpc:Save/Restore additional SPRs for > stop4 cpuidle I know it's not a big deal, but can we agree on the subject format? powerpc/powernv: Save/Restore additional SPRs for stop4 cpuidle cheers
Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?
Sorry - accidental send. No content! Jonathan On Mon, 31 Jul 2017 12:55:48 +0100 Jonathan Cameronwrote: > On Mon, 31 Jul 2017 12:09:08 +0100 > Jonathan Cameron wrote: > > > On Wed, 26 Jul 2017 16:15:05 -0700 > > "Paul E. McKenney" wrote: > > > > > On Wed, Jul 26, 2017 at 03:45:40PM -0700, David Miller wrote: > > > > From: "Paul E. McKenney" > > > > Date: Wed, 26 Jul 2017 15:36:58 -0700 > > > > > > > > > And without CONFIG_SOFTLOCKUP_DETECTOR, I see five runs of 24 with RCU > > > > > CPU stall warnings. So it seems likely that > > > > > CONFIG_SOFTLOCKUP_DETECTOR > > > > > really is having an effect. > > > > > > > > Thanks for all of the info Paul, I'll digest this and scan over the > > > > code myself. > > > > > > > > Just out of curiousity, what x86 idle method is your machine using? > > > > The mwait one or the one which simply uses 'halt'? The mwait variant > > > > might mask this bug, and halt would be a lot closer to how sparc64 and > > > > Jonathan's system operates. > > > > > > My kernel builds with CONFIG_INTEL_IDLE=n, which I believe means that > > > I am not using the mwait one. Here is a grep for IDLE in my .config: > > > > > > CONFIG_NO_HZ_IDLE=y > > > CONFIG_GENERIC_SMP_IDLE_THREAD=y > > > # CONFIG_IDLE_PAGE_TRACKING is not set > > > CONFIG_ACPI_PROCESSOR_IDLE=y > > > CONFIG_CPU_IDLE=y > > > # CONFIG_CPU_IDLE_GOV_LADDER is not set > > > CONFIG_CPU_IDLE_GOV_MENU=y > > > # CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set > > > # CONFIG_INTEL_IDLE is not set > > > > > > > On sparc64 the cpu yield we do in the idle loop sleeps the cpu. It's > > > > local TICK register keeps advancing, and the local timer therefore > > > > will still trigger. Also, any externally generated interrupts > > > > (including cross calls) will wake up the cpu as well. > > > > > > > > The tick-sched code is really tricky wrt. NO_HZ even in the NO_HZ_IDLE > > > > case. One of my running theories is that we miss scheduling a tick > > > > due to a race. 
That would be consistent with the behavior we see > > > > in the RCU dumps, I think. > > > > > > But wouldn't you have to miss a -lot- of ticks to get an RCU CPU stall > > > warning? By default, your grace period needs to extend for more than > > > 21 seconds (more than one-third of a -minute-) to get one. Or do > > > you mean that the ticks get shut off now and forever, as opposed to > > > just losing one of them? > > > > > > > Anyways, just a theory, and that's why I keep mentioning that commit > > > > about the revert of the revert (specifically > > > > 411fe24e6b7c283c3a1911450cdba6dd3aaea56e). > > > > > > > > :-) > > > > > > I am running an overnight test in preparation for attempting to push > > > some fixes for regressions into 4.12, but will try reverting this > > > and enabling CONFIG_HZ_PERIODIC tomorrow. > > > > > > Jonathan, might the commit that Dave points out above be what reduces > > > the probability of occurrence as you test older releases? > > I just got around to trying this out of curiosity. Superficially it did > > appear to possibly make the issue harder to hit took over 30 minutes > > but the issue otherwise looks much the same with or without that patch. > > > > Just out of curiosity, next thing on my list is to disable hrtimers entirely > > and see what happens. > > > > Jonathan > > > > > > Thanx, Paul > > > > > > > ___ > > linuxarm mailing list > > linux...@huawei.com > > http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm > > ___ > linuxarm mailing list > linux...@huawei.com > http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm
[PATCH 3/3] powerpc/mm/hash64: Make vmalloc 56T on hash
On 64-bit book3s, with the hash MMU, we currently define the kernel virtual space (vmalloc, ioremap etc.), to be 16T in size. This is a leftover from pre v3.7 when our user VM was also 16T. Of that 16T we split it 50/50, with half used for PCI IO and ioremap and the other 8T for vmalloc. We never bothered to make it any bigger because 8T of vmalloc ought to be enough for anybody. But it turns out that's not true, the per cpu allocator wants large amounts of vmalloc space, not to make large allocations, but to allow a large stride between allocations, because we use pcpu_embed_first_chunk(). With a bit of juggling we can keep 8T for the IO etc. and make the vmalloc space 56T. The only complication is the check of the address in the SLB miss handler, see the comment in the code. Signed-off-by: Michael Ellerman--- arch/powerpc/include/asm/book3s/64/hash.h | 4 ++-- arch/powerpc/mm/slb_low.S | 18 +++--- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d613653ed5b9..f88452019114 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -40,7 +40,7 @@ * Define the address range of the kernel non-linear virtual area */ #define H_KERN_VIRT_START ASM_CONST(0xD000) -#define H_KERN_VIRT_SIZE ASM_CONST(0x1000) +#define H_KERN_VIRT_SIZE ASM_CONST(0x4000) /* 64T */ /* * The vmalloc space starts at the beginning of that region, and @@ -48,7 +48,7 @@ * (we keep a quarter for the virtual memmap) */ #define H_VMALLOC_STARTH_KERN_VIRT_START -#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) +#define H_VMALLOC_SIZE ASM_CONST(0x3800) /* 56T */ #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) #define H_KERN_IO_STARTH_VMALLOC_END diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 2eb1b92a68ff..906a86fe457b 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -121,9 +121,21 @@ 
slb_miss_kernel_load_vmemmap: 1: #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - clrldi r11,r10,48 - cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 - bgt 5f + /* +* r10 contains the ESID, which is the original faulting EA shifted +* right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) +* which is 0xd00038000. That can't be used as an immediate, even if we +* ignored the 0xd, so we have to load it into a register, and we only +* have one register free. So we must load all of (H_VMALLOC_END >> 28) +* into a register and compare ESID against that. +*/ + lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xd000 + ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xd0003800 + // Rotate left 4, then mask with 0x0 + rldic r11,r11,4,28// r11 = 0xd00038000 + cmpld r10,r11 // if r10 >= r11 + bge 5f // goto io_mapping + /* * vmalloc mapping gets the encoding from the PACA as the mapping * can be demoted from 64K -> 4K dynamically on some machines. -- 2.7.4
[PATCH 2/3] powerpc/mm/slb: Move comment next to the code it's referring to
There is a comment in slb_allocate() referring to the load of paca->vmalloc_sllp, but it's several lines prior in the assembly. We're about to change this code, and we want to add another comment, so move the comment immediately prior to the instruction it's talking about. Signed-off-by: Michael Ellerman--- arch/powerpc/mm/slb_low.S | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index bde378559d01..2eb1b92a68ff 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -121,12 +121,13 @@ slb_miss_kernel_load_vmemmap: 1: #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - /* vmalloc mapping gets the encoding from the PACA as the mapping -* can be demoted from 64K -> 4K dynamically on some machines -*/ clrldi r11,r10,48 cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 bgt 5f + /* +* vmalloc mapping gets the encoding from the PACA as the mapping +* can be demoted from 64K -> 4K dynamically on some machines. +*/ lhz r11,PACAVMALLOCSLLP(r13) b 6f 5: -- 2.7.4
[PATCH 1/3] powerpc/mm/book3s64: Make KERN_IO_START a variable
Currently KERN_IO_START is defined as: #define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) Although it looks like a constant, both the components are actually variables, to allow us to have a different value between Radix and Hash with a single kernel. However that still requires both Radix and Hash to place the kernel IO region at the same location relative to the start and end of the kernel virtual region (namely 1/2 way through it), and we'd like to change that. So split KERN_IO_START out into its own variable, and initialise it for Radix and Hash. In the medium term we should be able to reconsolidate this, by doing a more involved rearrangement of the location of the regions. Signed-off-by: Michael Ellerman--- arch/powerpc/include/asm/book3s/64/hash.h| 2 ++ arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++- arch/powerpc/include/asm/book3s/64/radix.h | 2 ++ arch/powerpc/mm/hash_utils_64.c | 1 + arch/powerpc/mm/pgtable-radix.c | 1 + arch/powerpc/mm/pgtable_64.c | 2 ++ 6 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 36fc7bfe9e11..d613653ed5b9 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -51,6 +51,8 @@ #define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_IO_STARTH_VMALLOC_END + /* * Region IDs */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index d1da415e283c..18a8580d3ddc 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -272,8 +272,10 @@ extern unsigned long __vmalloc_end; extern unsigned long __kernel_virt_start; extern unsigned long __kernel_virt_size; +extern unsigned long __kernel_io_start; #define KERN_VIRT_START __kernel_virt_start #define KERN_VIRT_SIZE __kernel_virt_size +#define 
KERN_IO_START __kernel_io_start extern struct page *vmemmap; extern unsigned long ioremap_bot; extern unsigned long pci_io_base; @@ -298,7 +300,6 @@ extern unsigned long pci_io_base; * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE */ -#define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) #define FULL_IO_SIZE 0x8000ul #define ISA_IO_BASE (KERN_IO_START) #define ISA_IO_END(KERN_IO_START + 0x1ul) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 50b5aff3..1e5ba94e62ef 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -110,6 +110,8 @@ */ #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) +#define RADIX_KERN_IO_START(RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) + #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 7419fc1854ad..a93137c358ea 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1019,6 +1019,7 @@ void __init hash__early_init_mmu(void) __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 5cc50d47ce3f..d37e68495acc 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -494,6 +494,7 @@ void __init radix__early_init_mmu(void) __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 0736e94c7615..ac0717a90ca6 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -104,6 +104,8 @@ unsigned long __vmalloc_start; EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); +unsigned long __kernel_io_start; +EXPORT_SYMBOL(__kernel_io_start); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; -- 2.7.4
Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?
On Mon, 31 Jul 2017 12:09:08 +0100 Jonathan Cameronwrote: > On Wed, 26 Jul 2017 16:15:05 -0700 > "Paul E. McKenney" wrote: > > > On Wed, Jul 26, 2017 at 03:45:40PM -0700, David Miller wrote: > > > From: "Paul E. McKenney" > > > Date: Wed, 26 Jul 2017 15:36:58 -0700 > > > > > > > And without CONFIG_SOFTLOCKUP_DETECTOR, I see five runs of 24 with RCU > > > > CPU stall warnings. So it seems likely that CONFIG_SOFTLOCKUP_DETECTOR > > > > really is having an effect. > > > > > > Thanks for all of the info Paul, I'll digest this and scan over the > > > code myself. > > > > > > Just out of curiousity, what x86 idle method is your machine using? > > > The mwait one or the one which simply uses 'halt'? The mwait variant > > > might mask this bug, and halt would be a lot closer to how sparc64 and > > > Jonathan's system operates. > > > > My kernel builds with CONFIG_INTEL_IDLE=n, which I believe means that > > I am not using the mwait one. Here is a grep for IDLE in my .config: > > > > CONFIG_NO_HZ_IDLE=y > > CONFIG_GENERIC_SMP_IDLE_THREAD=y > > # CONFIG_IDLE_PAGE_TRACKING is not set > > CONFIG_ACPI_PROCESSOR_IDLE=y > > CONFIG_CPU_IDLE=y > > # CONFIG_CPU_IDLE_GOV_LADDER is not set > > CONFIG_CPU_IDLE_GOV_MENU=y > > # CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set > > # CONFIG_INTEL_IDLE is not set > > > > > On sparc64 the cpu yield we do in the idle loop sleeps the cpu. It's > > > local TICK register keeps advancing, and the local timer therefore > > > will still trigger. Also, any externally generated interrupts > > > (including cross calls) will wake up the cpu as well. > > > > > > The tick-sched code is really tricky wrt. NO_HZ even in the NO_HZ_IDLE > > > case. One of my running theories is that we miss scheduling a tick > > > due to a race. That would be consistent with the behavior we see > > > in the RCU dumps, I think. > > > > But wouldn't you have to miss a -lot- of ticks to get an RCU CPU stall > > warning? 
By default, your grace period needs to extend for more than > > 21 seconds (more than one-third of a -minute-) to get one. Or do > > you mean that the ticks get shut off now and forever, as opposed to > > just losing one of them? > > > > > Anyways, just a theory, and that's why I keep mentioning that commit > > > about the revert of the revert (specifically > > > 411fe24e6b7c283c3a1911450cdba6dd3aaea56e). > > > > > > :-) > > > > I am running an overnight test in preparation for attempting to push > > some fixes for regressions into 4.12, but will try reverting this > > and enabling CONFIG_HZ_PERIODIC tomorrow. > > > > Jonathan, might the commit that Dave points out above be what reduces > > the probability of occurrence as you test older releases? > I just got around to trying this out of curiosity. Superficially it did > appear to possibly make the issue harder to hit took over 30 minutes > but the issue otherwise looks much the same with or without that patch. > > Just out of curiosity, next thing on my list is to disable hrtimers entirely > and see what happens. > > Jonathan > > > > Thanx, Paul > > > > ___ > linuxarm mailing list > linux...@huawei.com > http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm
Re: [RFC PATCH] powerpc: Disabling MEMORY_HOTPLUG_DEFAULT_ONLINE option for PPC64 arch
Daniel Henrique Barbozawrites: > Commit 943db62c316c ("powerpc/pseries: Revert 'Auto-online > hotplugged memory'") reverted the auto-online feature for pseries due > to problems with LMB removals not updating the device struct properly. > Among other things, this commit made the following change in > arch/powerpc/configs/pseries_defconfig: > > @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y > CONFIG_IRQ_ALL_CPUS=y > CONFIG_MEMORY_HOTPLUG=y > CONFIG_MEMORY_HOTREMOVE=y > -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y > CONFIG_KSM=y > > The intent was to disable the option in the defconfig of pseries, since > after that the code doesn't have this support anymore. It's always polite to Cc the author of a commit you're referring to, so I added Nathan. The intention when we merged that fix was that the auto-online code would be "fixed" to mark the device online. I say "fixed" because it wasn't entirely clear if that was the correct behaviour, though it definitely seemed like it should be. I've lost track of where/if the discussion got to on whether the auto-online code should do that or not. Did anything get resolved? > However, this change > alone isn't enough to prevent situations such as [1], where > distros can enable the option unaware of the consequences of > doing it (e.g. breaking LMB hotplug altogether). > > Instead of relying on all distros knowing that pseries can't handle > CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y after 943db62c316c, this patch > changes mm/Kconfig to make the MEMORY_HOTPLUG_DEFAULT_ONLINE config > unavailable for the PPC64 arch. > > [1] https://bugzilla.redhat.com/show_bug.cgi?id=1476380 > > Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged > memory'") > Signed-off-by: Daniel Henrique Barboza > --- > mm/Kconfig | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) I don't own that file, so we at least need an Ack from the mm folks. 
cheers > diff --git a/mm/Kconfig b/mm/Kconfig > index 48b1af4..a342c77 100644 > --- a/mm/Kconfig > +++ b/mm/Kconfig > @@ -169,7 +169,7 @@ config MEMORY_HOTPLUG_SPARSE > config MEMORY_HOTPLUG_DEFAULT_ONLINE > bool "Online the newly added memory blocks by default" > default n > -depends on MEMORY_HOTPLUG > +depends on MEMORY_HOTPLUG && !PPC64 > help > This option sets the default policy setting for memory hotplug > onlining policy (/sys/devices/system/memory/auto_online_blocks) which > -- > 2.9.4
RE: [RESEND][PATCH V10 0/3] powernv : Add support for OPAL-OCC command/response interface
From: Shilpasri G Bhat > Sent: 31 July 2017 08:43 > In P9, OCC (On-Chip-Controller) supports shared memory based > commad-response interface. Within the shared memory there is an OPAL ^ typo > command buffer and OCC response buffer that can be used to send > inband commands to OCC. The following commands are supported: ... David
Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage
Jens Axboewrites: ... > > Can you try the below fix? Should be more palatable than the previous > one. Brian, maybe you can take a look at the IRQ issue mentioned above? Given the patch from Brian fixed the lockdep warning, do you still want me to try and test this one? cheers > diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c > index f6097b89d5d3..dfb89596af81 100644 > --- a/drivers/scsi/scsi_lib.c > +++ b/drivers/scsi/scsi_lib.c > @@ -481,13 +481,14 @@ static void scsi_starved_list_run(struct Scsi_Host > *shost) > * Purpose:Select a proper request queue to serve next > * > * Arguments: q - last request's queue > + * async - run queues async, if we need to > * > * Returns: Nothing > * > * Notes: The previous command was completely finished, start > * a new one if possible. > */ > -static void scsi_run_queue(struct request_queue *q) > +static void scsi_run_queue(struct request_queue *q, bool async) > { > struct scsi_device *sdev = q->queuedata; > > @@ -497,7 +498,7 @@ static void scsi_run_queue(struct request_queue *q) > scsi_starved_list_run(sdev->host); > > if (q->mq_ops) > - blk_mq_run_hw_queues(q, false); > + blk_mq_run_hw_queues(q, async); > else > blk_run_queue(q); > } > @@ -509,7 +510,7 @@ void scsi_requeue_run_queue(struct work_struct *work) > > sdev = container_of(work, struct scsi_device, requeue_work); > q = sdev->request_queue; > - scsi_run_queue(q); > + scsi_run_queue(q, false); > } > > /* > @@ -543,17 +544,22 @@ static void scsi_requeue_command(struct request_queue > *q, struct scsi_cmnd *cmd) > blk_requeue_request(q, req); > spin_unlock_irqrestore(q->queue_lock, flags); > > - scsi_run_queue(q); > + scsi_run_queue(q, true); > > put_device(>sdev_gendev); > } > > -void scsi_run_host_queues(struct Scsi_Host *shost) > +static void __scsi_run_host_queues(struct Scsi_Host *shost, bool async) > { > struct scsi_device *sdev; > > shost_for_each_device(sdev, shost) > - scsi_run_queue(sdev->request_queue); > + scsi_run_queue(sdev->request_queue, 
async); > +} > + > +void scsi_run_host_queues(struct Scsi_Host *shost) > +{ > + __scsi_run_host_queues(shost, false); > } > > static void scsi_uninit_cmd(struct scsi_cmnd *cmd) > @@ -671,7 +677,7 @@ static bool scsi_end_request(struct request *req, > blk_status_t error, > blk_finish_request(req, error); > spin_unlock_irqrestore(q->queue_lock, flags); > > - scsi_run_queue(q); > + scsi_run_queue(q, false); > } > > put_device(>sdev_gendev); > @@ -2293,7 +2299,7 @@ EXPORT_SYMBOL(scsi_block_requests); > void scsi_unblock_requests(struct Scsi_Host *shost) > { > shost->host_self_blocked = 0; > - scsi_run_host_queues(shost); > + __scsi_run_host_queues(shost, true); > } > EXPORT_SYMBOL(scsi_unblock_requests); > > @@ -2897,10 +2903,10 @@ scsi_device_quiesce(struct scsi_device *sdev) > if (err) > return err; > > - scsi_run_queue(sdev->request_queue); > + scsi_run_queue(sdev->request_queue, false); > while (atomic_read(>device_busy)) { > msleep_interruptible(200); > - scsi_run_queue(sdev->request_queue); > + scsi_run_queue(sdev->request_queue, false); > } > return 0; > } > @@ -2924,7 +2930,7 @@ void scsi_device_resume(struct scsi_device *sdev) > mutex_lock(>state_mutex); > if (sdev->sdev_state == SDEV_QUIESCE && > scsi_device_set_state(sdev, SDEV_RUNNING) == 0) > - scsi_run_queue(sdev->request_queue); > + scsi_run_queue(sdev->request_queue, false); > mutex_unlock(>state_mutex); > } > EXPORT_SYMBOL(scsi_device_resume); > > -- > Jens Axboe
Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage
Michael Ellerman writes: > Brian King writes: > >> On 07/28/2017 10:17 AM, Brian J King wrote: >>> Jens Axboe wrote on 07/28/2017 09:25:48 AM: >>> Can you try the below fix? Should be more palatable than the previous one. Brian, maybe you can take a look at the IRQ issue mentioned above? >> >> Michael, >> >> Does this address the issue you are seeing? > > Yes it seems to, thanks. > > I only see the trace on reboot, and not 100% of the time. But I've > survived a couple of reboots now without seeing anything, so I think > this is helping. > > I'll put the patch in my Jenkins over night and let you know how it > survives that, which should be ~= 25 boots. No lockdep warnings or other oddness over night, so that patch looks good to me. cheers
Re: [RFC v6 21/62] powerpc: introduce execute-only pkey
Thiago Jung Bauermann writes: > Ram Pai writes: ... >> + >> +/* We got one, store it and use it from here on out */ >> +if (need_to_set_mm_pkey) >> +mm->context.execute_only_pkey = execute_only_pkey; >> +return execute_only_pkey; >> +} > > If you follow the code flow in __execute_only_pkey, the AMR and UAMOR > are read 3 times in total, and AMR is written twice. IAMR is read and > written twice. Since they are SPRs and access to them is slow (or isn't > it?), SPRs read/writes are slow, but they're not *that* slow in comparison to a system call (which I think is where this code is being called?). So we should try to avoid too many SPR read/writes, but at the same time we can accept more than the minimum if it makes the code much easier to follow. cheers
Re: [PATCH v3] powerpc/powernv: Enable PCI peer-to-peer
Brian King writes: > Michael, > > What do we need on this one before we can pull into your -next branch? This skiboot side to be merged. cheers
Re: [PATCH 2/3] powerpc/xmon: Disable and enable tracing command
On 2017/07/31 02:22PM, Breno Leitao wrote: > If tracing is enabled and you get into xmon, the tracing buffer > continues to be updated, causing possible loss of data due to buffer > overflow and unnecessary tracing information coming from xmon functions. > > This patch adds a new option that allows the tracing to be disabled and > re-enabled from inside xmon. How is this new option useful? In the next patch, you disable tracing by default -- in what scenario do you expect to have to re-enable tracing from within xmon? > > Signed-off-by: Breno Leitao > --- > arch/powerpc/xmon/xmon.c | 16 +++- > 1 file changed, 15 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c > index 0cbd910193fa..19276d2f2f25 100644 > --- a/arch/powerpc/xmon/xmon.c > +++ b/arch/powerpc/xmon/xmon.c > @@ -89,6 +89,7 @@ static unsigned long nidump = 16; > static unsigned long ncsum = 4096; > static int termch; > static char tmpstr[128]; > +static char tracing_enabled = 1; > > static long bus_error_jmp[JMP_BUF_LEN]; > static int catch_memory_errors; > @@ -268,6 +269,7 @@ Commands:\n\ >Sr # read SPR #\n\ >Sw #v write v to SPR #\n\ >t print backtrace\n\ > + v trace enable/disable\n\ >x exit monitor and recover\n\ >X exit monitor and don't recover\n" > #if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E) > @@ -983,6 +985,17 @@ cmds(struct pt_regs *excp) > case 'x': > case 'X': > return cmd; > + case 'v': > + if (tracing_is_on()) { > + printk("Disabling tracing\n"); > + tracing_enabled = 0; > + tracing_off(); This only disables trace buffer updates - ftrace (and all its callbacks, et al) remains active, which isn't desirable. Can you see if this works for you: https://patchwork.ozlabs.org/patch/769611/ - Naveen