[PATCH -V2 12/14] kvm: Add struct kvm arg to memslot APIs
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We will use that in the later patch to find the kvm ops handler Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/arm/kvm/arm.c | 5 +++-- arch/ia64/kvm/kvm-ia64.c | 5 +++-- arch/mips/kvm/kvm_mips.c | 5 +++-- arch/powerpc/include/asm/kvm_ppc.h | 6 -- arch/powerpc/kvm/book3s.c | 4 ++-- arch/powerpc/kvm/booke.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 9 + arch/s390/kvm/kvm-s390.c | 5 +++-- arch/x86/kvm/x86.c | 5 +++-- include/linux/kvm_host.h | 5 +++-- virt/kvm/kvm_main.c| 12 ++-- 11 files changed, 37 insertions(+), 28 deletions(-) diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 9c697db..e96c48f 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -152,12 +152,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index bdfd878..985bf80 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1550,12 +1550,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c index a7b0445..73b3482 100644 --- a/arch/mips/kvm/kvm_mips.c +++ 
b/arch/mips/kvm/kvm_mips.c @@ -198,12 +198,13 @@ kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) return -ENOIOCTLCMD; } -void kvm_arch_free_memslot(struct kvm_memory_slot *free, +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { } -int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) { return 0; } diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c13f15d..20f4616 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -134,9 +134,11 @@ extern struct page *kvm_alloc_hpt(unsigned long nr_pages); extern void kvm_release_hpt(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); -extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +extern void kvmppc_core_free_memslot(struct kvm *kvm, +struct kvm_memory_slot *free, struct kvm_memory_slot *dont); -extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, +extern int kvmppc_core_create_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, unsigned long npages); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 39d2994..130fe1d 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -761,13 +761,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) return kvmppc_ops-get_dirty_log(kvm, log); } -void kvmppc_core_free_memslot(struct kvm_memory_slot *free, +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { kvmppc_ops-free_memslot(free, dont); } -int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, 
+int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, unsigned long npages) { return kvmppc_ops-create_memslot(slot, npages); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 1769354..cb2d986 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1662,12 +1662,12 @@ int
[PATCH -V2 13/14] kvm: powerpc: book3s: Allow the HV and PR selection per virtual machine
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This moves the kvmppc_ops callbacks to be a per VM entity. This enables us to select HV and PR mode when creating a VM. We also allow both kvm-hv and kvm-pr kernel module to be loaded. To achieve this we move /dev/kvm ownership to kvm.ko module. Depending on which KVM mode we select during VM creation we take a reference count on respective module Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/kvm_ppc.h | 7 +-- arch/powerpc/kvm/44x.c | 7 ++- arch/powerpc/kvm/book3s.c | 89 + arch/powerpc/kvm/book3s.h | 2 + arch/powerpc/kvm/book3s_hv.c| 18 arch/powerpc/kvm/book3s_pr.c| 25 +++ arch/powerpc/kvm/book3s_xics.c | 2 +- arch/powerpc/kvm/booke.c| 22 - arch/powerpc/kvm/e500.c | 8 +++- arch/powerpc/kvm/e500mc.c | 6 ++- arch/powerpc/kvm/emulate.c | 11 ++--- arch/powerpc/kvm/powerpc.c | 76 ++- include/uapi/linux/kvm.h| 4 ++ 14 files changed, 187 insertions(+), 91 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e86db97..c7a041d 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -275,6 +275,7 @@ struct kvm_arch { #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; #endif + struct kvmppc_ops *kvm_ops; }; /* diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 20f4616..3069cf4 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -182,6 +182,7 @@ union kvmppc_one_reg { }; struct kvmppc_ops { + struct module *owner; bool is_hv_enabled; int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); @@ -217,7 +218,6 @@ struct kvmppc_ops { unsigned long npages); int (*init_vm)(struct kvm *kvm); void (*destroy_vm)(struct kvm *kvm); - int (*check_processor_compat)(void); int (*get_smmu_info)(struct kvm *kvm, 
struct kvm_ppc_smmu_info *info); int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); @@ -229,7 +229,8 @@ struct kvmppc_ops { }; -extern struct kvmppc_ops *kvmppc_ops; +extern struct kvmppc_ops *kvmppc_hv_ops; +extern struct kvmppc_ops *kvmppc_pr_ops; /* * Cuts out inst bits with ordering according to spec. @@ -326,7 +327,7 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi) static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu) { - kvmppc_ops-fast_vcpu_kick(vcpu); + vcpu-kvm-arch.kvm_ops-fast_vcpu_kick(vcpu); } #else diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index a765bcd..93221e8 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -213,16 +213,19 @@ static int __init kvmppc_44x_init(void) if (r) goto err_out; - r = kvm_init(kvm_ops_44x, sizeof(struct kvmppc_vcpu_44x), -0, THIS_MODULE); + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE); if (r) goto err_out; + kvm_ops_44x.owner = THIS_MODULE; + kvmppc_pr_ops = kvm_ops_44x; + err_out: return r; } static void __exit kvmppc_44x_exit(void) { + kvmppc_pr_ops = NULL; kvmppc_booke_exit(); } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 130fe1d..ad8f6ed 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -34,6 +34,7 @@ #include linux/vmalloc.h #include linux/highmem.h +#include book3s.h #include trace.h #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -71,7 +72,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) { - if (!kvmppc_ops-is_hv_enabled) + if (!vcpu-kvm-arch.kvm_ops-is_hv_enabled) return to_book3s(vcpu)-hior; return 0; } @@ -79,7 +80,7 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, unsigned long pending_now, unsigned long old_pending) { - if 
(kvmppc_ops-is_hv_enabled) + if (vcpu-kvm-arch.kvm_ops-is_hv_enabled) return; if (pending_now) vcpu-arch.shared-int_pending = 1; @@ -93,7 +94,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) ulong crit_r1; bool crit; - if (kvmppc_ops-is_hv_enabled) + if (vcpu-kvm-arch.kvm_ops
[PATCH -V2 14/14] kvm: powerpc: book3s: drop is_hv_enabled
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com drop is_hv_enabled, because that should not be a callback property Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_ppc.h | 6 +- arch/powerpc/kvm/book3s.c | 6 +++--- arch/powerpc/kvm/book3s_hv.c | 1 - arch/powerpc/kvm/book3s_pr.c | 1 - arch/powerpc/kvm/book3s_xics.c | 2 +- arch/powerpc/kvm/powerpc.c | 2 +- 6 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 3069cf4..c8317fb 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -183,7 +183,6 @@ union kvmppc_one_reg { struct kvmppc_ops { struct module *owner; - bool is_hv_enabled; int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id, @@ -232,6 +231,11 @@ struct kvmppc_ops { extern struct kvmppc_ops *kvmppc_hv_ops; extern struct kvmppc_ops *kvmppc_pr_ops; +static inline bool is_kvmppc_hv_enabled(struct kvm *kvm) +{ + return kvm-arch.kvm_ops == kvmppc_hv_ops; +} + /* * Cuts out inst bits with ordering according to spec. * That means the leftmost bit is zero. All given bits are included. 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ad8f6ed..8912608 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -72,7 +72,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) { - if (!vcpu-kvm-arch.kvm_ops-is_hv_enabled) + if (!is_kvmppc_hv_enabled(vcpu-kvm)) return to_book3s(vcpu)-hior; return 0; } @@ -80,7 +80,7 @@ static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, unsigned long pending_now, unsigned long old_pending) { - if (vcpu-kvm-arch.kvm_ops-is_hv_enabled) + if (is_kvmppc_hv_enabled(vcpu-kvm)) return; if (pending_now) vcpu-arch.shared-int_pending = 1; @@ -94,7 +94,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) ulong crit_r1; bool crit; - if (vcpu-kvm-arch.kvm_ops-is_hv_enabled) + if (is_kvmppc_hv_enabled(vcpu-kvm)) return false; crit_raw = vcpu-arch.shared-critical; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 31922d5..b5229eb 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2160,7 +2160,6 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp, } static struct kvmppc_ops kvm_ops_hv = { - .is_hv_enabled = true, .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, .get_one_reg = kvmppc_get_one_reg_hv, diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index fbd985f..df36cf2 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1526,7 +1526,6 @@ static long kvm_arch_vm_ioctl_pr(struct file *filp, } static struct kvmppc_ops kvm_ops_pr = { - .is_hv_enabled = false, .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, .get_one_reg = kvmppc_get_one_reg_pr, diff --git a/arch/powerpc/kvm/book3s_xics.c 
b/arch/powerpc/kvm/book3s_xics.c index 76ef525..20d56ec 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -818,7 +818,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) } /* Check for real mode returning too hard */ - if (xics-real_mode vcpu-kvm-arch.kvm_ops-is_hv_enabled) + if (xics-real_mode is_kvmppc_hv_enabled(vcpu-kvm)) return kvmppc_xics_rm_complete(vcpu, req); switch (req) { diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 00a995a..058f9d6 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -200,7 +200,7 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu) goto out; /* HV KVM can only do PAPR mode for now */ - if (!vcpu-arch.papr_enabled vcpu-kvm-arch.kvm_ops-is_hv_enabled) + if (!vcpu-arch.papr_enabled is_kvmppc_hv_enabled(vcpu-kvm)) goto out; #ifdef CONFIG_KVM_BOOKE_HV -- 1.8.1.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2 06/14] kvm: powerpc: booke: Convert BOOKE to use kvmppc_ops callbacks
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Make required changes to get BOOKE configs to build with the introduction of kvmppc_ops callback Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_ppc.h | 4 +-- arch/powerpc/kvm/44x.c | 55 +++--- arch/powerpc/kvm/44x_emulate.c | 8 +++--- arch/powerpc/kvm/44x_tlb.c | 2 +- arch/powerpc/kvm/booke.c | 47 +++- arch/powerpc/kvm/booke.h | 24 + arch/powerpc/kvm/e500.c| 53 +--- arch/powerpc/kvm/e500_emulate.c| 8 +++--- arch/powerpc/kvm/e500_mmu.c| 2 +- arch/powerpc/kvm/e500mc.c | 54 ++--- 10 files changed, 194 insertions(+), 63 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 1d22b53..326033c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -285,10 +285,10 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int lsb, int value) __v;\ }) -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); -void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c index 2f5c6b6..a765bcd 100644 --- a/arch/powerpc/kvm/44x.c +++ b/arch/powerpc/kvm/44x.c @@ -31,13 +31,13 @@ #include 44x_tlb.h #include booke.h -void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu) { kvmppc_booke_vcpu_load(vcpu, cpu); kvmppc_44x_tlb_load(vcpu); } -void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu) { 
kvmppc_44x_tlb_put(vcpu); kvmppc_booke_vcpu_put(vcpu); @@ -114,29 +114,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, return 0; } -void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) { - kvmppc_get_sregs_ivor(vcpu, sregs); + return kvmppc_get_sregs_ivor(vcpu, sregs); } -int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu, +struct kvm_sregs *sregs) { return kvmppc_set_sregs_ivor(vcpu, sregs); } -int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { return -EINVAL; } -int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, - union kvmppc_one_reg *val) +static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) { return -EINVAL; } -struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm, + unsigned int id) { struct kvmppc_vcpu_44x *vcpu_44x; struct kvm_vcpu *vcpu; @@ -167,7 +170,7 @@ out: return ERR_PTR(err); } -void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu); @@ -176,24 +179,46 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) kmem_cache_free(kvm_vcpu_cache, vcpu_44x); } -int kvmppc_core_init_vm(struct kvm *kvm) +static int kvmppc_core_init_vm_44x(struct kvm *kvm) { return 0; } -void kvmppc_core_destroy_vm(struct kvm *kvm) +static void kvmppc_core_destroy_vm_44x(struct kvm *kvm) { } +static struct kvmppc_ops kvm_ops_44x = { + .get_sregs = kvmppc_core_get_sregs_44x, + .set_sregs = kvmppc_core_set_sregs_44x, + .get_one_reg = kvmppc_get_one_reg_44x, + .set_one_reg = kvmppc_set_one_reg_44x, + .vcpu_load = 
kvmppc_core_vcpu_load_44x, + .vcpu_put= kvmppc_core_vcpu_put_44x, + .vcpu_create = kvmppc_core_vcpu_create_44x, + .vcpu_free = kvmppc_core_vcpu_free_44x, + .mmu_destroy = kvmppc_mmu_destroy_44x, + .init_vm = kvmppc_core_init_vm_44x, + .destroy_vm
[PATCH -V2 09/14] kvm: powerpc: book3s: pr: move PR related tracepoints to a separate header
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch moves PR related tracepoints to a separate header. This enables in converting PR to a kernel module which will be done in later patches Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_host.c | 2 +- arch/powerpc/kvm/book3s_mmu_hpte.c| 2 +- arch/powerpc/kvm/book3s_pr.c | 4 +- arch/powerpc/kvm/trace.h | 234 +-- arch/powerpc/kvm/trace_pr.h | 297 ++ 5 files changed, 309 insertions(+), 230 deletions(-) create mode 100644 arch/powerpc/kvm/trace_pr.h diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 819672c..0d513af 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -27,7 +27,7 @@ #include asm/machdep.h #include asm/mmu_context.h #include asm/hw_irq.h -#include trace.h +#include trace_pr.h #define PTE_SIZE 12 diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c index 6b79bfc..5a1ab12 100644 --- a/arch/powerpc/kvm/book3s_mmu_hpte.c +++ b/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -28,7 +28,7 @@ #include asm/mmu_context.h #include asm/hw_irq.h -#include trace.h +#include trace_pr.h #define PTE_SIZE 12 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b6a525d..ca6c73d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -42,7 +42,9 @@ #include linux/highmem.h #include book3s.h -#include trace.h + +#define CREATE_TRACE_POINTS +#include trace_pr.h /* #define EXIT_DEBUG */ /* #define DEBUG_EXT */ diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h index 9e8368e..80f252a 100644 --- a/arch/powerpc/kvm/trace.h +++ b/arch/powerpc/kvm/trace.h @@ -85,6 +85,12 @@ TRACE_EVENT(kvm_ppc_instr, {41, HV_PRIV} #endif +#ifndef CONFIG_KVM_BOOK3S_PR_POSSIBLE +/* + * For pr we define this in trace_pr.h since it pr can be built as + * a module + */ + TRACE_EVENT(kvm_exit, TP_PROTO(unsigned int 
exit_nr, struct kvm_vcpu *vcpu), TP_ARGS(exit_nr, vcpu), @@ -94,9 +100,6 @@ TRACE_EVENT(kvm_exit, __field(unsigned long, pc ) __field(unsigned long, msr ) __field(unsigned long, dar ) -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE - __field(unsigned long, srr1) -#endif __field(unsigned long, last_inst ) ), @@ -105,9 +108,6 @@ TRACE_EVENT(kvm_exit, __entry-pc = kvmppc_get_pc(vcpu); __entry-dar= kvmppc_get_fault_dar(vcpu); __entry-msr= vcpu-arch.shared-msr; -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE - __entry-srr1 = vcpu-arch.shadow_srr1; -#endif __entry-last_inst = vcpu-arch.last_inst; ), @@ -115,18 +115,12 @@ TRACE_EVENT(kvm_exit, | pc=0x%lx | msr=0x%lx | dar=0x%lx -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE -| srr1=0x%lx -#endif | last_inst=0x%lx , __print_symbolic(__entry-exit_nr, kvm_trace_symbol_exit), __entry-pc, __entry-msr, __entry-dar, -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE - __entry-srr1, -#endif __entry-last_inst ) ); @@ -145,6 +139,7 @@ TRACE_EVENT(kvm_unmap_hva, TP_printk(unmap hva 0x%lx\n, __entry-hva) ); +#endif TRACE_EVENT(kvm_stlb_inval, TP_PROTO(unsigned int stlb_index), @@ -231,221 +226,6 @@ TRACE_EVENT(kvm_check_requests, __entry-cpu_nr, __entry-requests) ); - -/* - * Book3S trace points * - */ - -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE - -TRACE_EVENT(kvm_book3s_reenter, - TP_PROTO(int r, struct kvm_vcpu *vcpu), - TP_ARGS(r, vcpu), - - TP_STRUCT__entry( - __field(unsigned int, r ) - __field(unsigned long, pc ) - ), - - TP_fast_assign( - __entry-r = r; - __entry-pc = kvmppc_get_pc(vcpu); - ), - - TP_printk(reentry r=%d | pc=0x%lx, __entry-r, __entry-pc) -); - -#ifdef CONFIG_PPC_BOOK3S_64 - -TRACE_EVENT(kvm_book3s_64_mmu_map, - TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, -struct kvmppc_pte *orig_pte), - TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), - - TP_STRUCT__entry( - __field(unsigned char
[PATCH -V2 08/14] kvm: powerpc: book3s: Add is_hv_enabled to kvmppc_ops
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This help us to identify whether we are running with hypervisor mode KVM enabled. The change is needed so that we can have both HV and PR kvm enabled in the same kernel. If both HV and PR KVM are included, interrupts come in to the HV version of the kvmppc_interrupt code, which then jumps to the PR handler, renamed to kvmppc_interrupt_pr, if the guest is a PR guest. Allowing both PR and HV in the same kernel required some changes to kvm_dev_ioctl_check_extension(), since the values returned now can't be selected with #ifdefs as much as previously. We look at is_hv_enabled to return the right value when checking for capabilities.For capabilities that are only provided by HV KVM, we return the HV value only if is_hv_enabled is true. For capabilities provided by PR KVM but not HV, we return the PR value only if is_hv_enabled is false. NOTE: in later patch we replace is_hv_enabled with a static inline function comparing kvm_ppc_ops Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s.h | 53 -- arch/powerpc/include/asm/kvm_ppc.h| 5 ++-- arch/powerpc/kvm/book3s.c | 44 arch/powerpc/kvm/book3s_hv.c | 1 + arch/powerpc/kvm/book3s_pr.c | 1 + arch/powerpc/kvm/book3s_xics.c| 2 +- arch/powerpc/kvm/powerpc.c| 54 +++ 7 files changed, 79 insertions(+), 81 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 315a5d6..4a594b7 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -301,59 +301,6 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu *vcpu) return vcpu-arch.fault_dar; } -#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - return to_book3s(vcpu)-hior; -} - -static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, - unsigned long pending_now, unsigned long old_pending) -{ - if 
(pending_now) - vcpu-arch.shared-int_pending = 1; - else if (old_pending) - vcpu-arch.shared-int_pending = 0; -} - -static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) -{ - ulong crit_raw = vcpu-arch.shared-critical; - ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); - bool crit; - - /* Truncate crit indicators in 32 bit mode */ - if (!(vcpu-arch.shared-msr MSR_SF)) { - crit_raw = 0x; - crit_r1 = 0x; - } - - /* Critical section when crit == r1 */ - crit = (crit_raw == crit_r1); - /* ... and we're in supervisor mode */ - crit = crit !(vcpu-arch.shared-msr MSR_PR); - - return crit; -} -#else /* CONFIG_KVM_BOOK3S_PR_POSSIBLE */ - -static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, - unsigned long pending_now, unsigned long old_pending) -{ -} - -static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) -{ - return false; -} -#endif - /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R30x113724FA diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 326033c..c13f15d 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -180,6 +180,7 @@ union kvmppc_one_reg { }; struct kvmppc_ops { + bool is_hv_enabled; int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id, @@ -309,10 +310,10 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr) static inline u32 kvmppc_get_xics_latch(void) { - u32 xirr = get_paca()-kvm_hstate.saved_xirr; + u32 xirr; + xirr = get_paca()-kvm_hstate.saved_xirr; get_paca()-kvm_hstate.saved_xirr = 0; - return xirr; } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 784a1d5..493aff7 100644 --- 
a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -69,6 +69,50 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu) { } +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) +{ + if (!kvmppc_ops-is_hv_enabled) + return to_book3s(vcpu)-hior; + return 0; +} + +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, + unsigned long pending_now, unsigned long old_pending) +{ + if (kvmppc_ops-is_hv_enabled
Re: [PATCH -V2 1/2] powerpc: Use HPTE constants when updating hpte bits
Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Even though we have same value for linux PTE bits and hash PTE bits use the hash pte bits when updating hash pte ... diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 02d6e21..78f2c59 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -146,8 +146,9 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, flags = 0; /* Make pHyp happy */ - if ((rflags _PAGE_NO_CACHE) !(rflags _PAGE_WRITETHRU)) - hpte_r = ~_PAGE_COHERENT; + if ((rflags _PAGE_NO_CACHE) !(rflags _PAGE_WRITETHRU)) + hpte_r = ~HPTE_R_M; + - if ((rflags _PAGE_NO_CACHE) !(rflags _PAGE_WRITETHRU)) + if ((rflags _PAGE_NO_CACHE) !(rflags _PAGE_WRITETHRU)) the if loop part of the change is already done in upstream. Since V2 was generated after moving V1 patch to a different tree (kvm tree), ended up with that additional change. Let me know if you want me to respin the patch if (firmware_has_feature(FW_FEATURE_XCMO) !(hpte_r HPTE_R_N)) flags |= H_COALESCE_CAND; -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH -V2 00/14] Allow PR and HV KVM to coexist in one kernel
Hi Alex, Any update on this ? -aneesh Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes: Hi All, This patch series support enabling HV and PR KVM together in the same kernel. We extend machine property with new property kvm_type. A value of HV will force HV KVM and PR PR KVM. If we don't specify kvm_type we will select the fastest KVM mode. ie, HV if that is supported otherwise PR. With Qemu command line having -machine pseries,accel=kvm,kvm_type=HV [root@llmp24l02 qemu]# bash ../qemu failed to initialize KVM: Invalid argument [root@llmp24l02 qemu]# modprobe kvm-pr [root@llmp24l02 qemu]# bash ../qemu failed to initialize KVM: Invalid argument [root@llmp24l02 qemu]# modprobe kvm-hv [root@llmp24l02 qemu]# bash ../qemu now with -machine pseries,accel=kvm,kvm_type=PR [root@llmp24l02 qemu]# rmmod kvm-pr [root@llmp24l02 qemu]# bash ../qemu failed to initialize KVM: Invalid argument [root@llmp24l02 qemu]# [root@llmp24l02 qemu]# modprobe kvm-pr [root@llmp24l02 qemu]# bash ../qemu Changes from V1: * Build fixes for BOOKE (only compile tested) * Address review feedback -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH -V2 11/14] kvm: powerpc: book3s: Support building HV and PR KVM as module
The below patch fixes a compile issue with KVM_XICS. Please fold diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index cef3de9..c3c832b 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -840,6 +840,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) return rc; } +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); /* -- Initialisation code etc. -- */ ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: Fix 64K page size support for PPC44x
Alistair Popple alist...@popple.id.au writes: PPC44x supports page sizes other than 4K however when 64K page sizes are selected compilation fails. This is due to a change in the definition of pgtable_t introduced by the following patch: commit 5c1f6ee9a31cbdac90bbb8ae1ba4475031ac74b4 Author: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com powerpc: Reduce PTE table memory wastage The above patch only implements the new layout for PPC64 so it doesn't compile for PPC32 with a 64K page size. Ideally we should implement the same layout for PPC32 however for the meantime this patch reverts the definition of pgtable_t for PPC32. Signed-off-by: Alistair Popple alist...@popple.id.au --- arch/powerpc/include/asm/page.h |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index b9f4262..b142d58 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -403,7 +403,7 @@ void arch_free_page(struct page *page, int order); struct vm_area_struct; -#ifdef CONFIG_PPC_64K_PAGES +#if defined(CONFIG_PPC_64K_PAGES) defined(PPC64) ^^^ CONFIG_PPC64 ? typedef pte_t *pgtable_t; #else typedef struct page *pgtable_t; -- 1.7.10.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH -V2 06/14] kvm: powerpc: booke: Convert BOOKE to use kvmppc_ops callbacks
Alexander Graf ag...@suse.de writes: On 07.10.2013, at 18:47, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Make required changes to get BOOKE configs to build with the introduction of kvmppc_ops callback Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This can not be a separate commit, as you're breaking bisectability for booke this way. The only reason I split that into two was to make review easy. But yes when merging to your tree we should fold. I've squashed this in with the previous commit. Ok. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 0/9] powerpc: mm: Numa faults support for ppc64
Hi, This patch series add support for numa faults on ppc64 architecture. We steal the _PAGE_COHERENCE bit and use that for indicating _PAGE_NUMA. We clear the _PAGE_PRESENT bit and also invalidate the hpte entry on setting _PAGE_NUMA. The next fault on that page will be considered a numa fault. NOTE: __ Issue: I am finding large lock contention on page_table_lock with this series on a 95 cpu 4 node box with autonuma benchmark I will out on vacation till NOV 6 without email access. Hence i will not be able to respond to review feedbacks till then. lock_stat version 0.3 --- class namecon-bouncescontentions waittime-min waittime-max waittime-totalacq-bounces acquisitions holdtime-mi hold time hold total - (mm-page_table_lock)-rlock: 713531791 719610919 0.09 3038193.19 357867523236.3 729709189 7500401620.0 236991.36 1159646899.68 -- (mm-page_table_lock)-rlock 1 [c0218880] .anon_vma_prepare+0xb0/0x1e0 (mm-page_table_lock)-rlock 93 [c0207ebc] .do_numa_page+0x4c/0x190 (mm-page_table_lock)-rlock 301678 [c02139d4] .change_protection+0x1d4/0x560 (mm-page_table_lock)-rlock 244524 [c0213be8] .change_protection+0x3e8/0x560 -- (mm-page_table_lock)-rlock 1 [c0206a38] .__do_fault+0x198/0x6b0 (mm-page_table_lock)-rlock 704163 [c02139d4] .change_protection+0x1d4/0x560 (mm-page_table_lock)-rlock 207227 [c0213be8] .change_protection+0x3e8/0x560 (mm-page_table_lock)-rlock 95 [c0207ebc] .do_numa_page+0x4c/0x190 -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 4/9] powerpc: mm: Only check for _PAGE_PRESENT in set_pte/pmd functions
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We want to make sure we don't use these functions when updating a pte or pmd entry that has a valid hpte entry, because these functions don't invalidate them. So limit the check to _PAGE_PRESENT bit. Numafault core changes use these functions for updating _PAGE_NUMA bits. That should be ok because when _PAGE_NUMA is set we can be sure that hpte entries are not present. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/mm/pgtable.c| 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index edda589..10c09b6 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -187,7 +187,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(*ptep)); + WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); #endif /* Note: mm->context.id might not yet have been assigned as * this context might not have been activated yet when this diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 536eec72..56b7586 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -686,7 +686,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_none(*pmdp)); + WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT); assert_spin_locked(mm->page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 1/9] powerpc: Use HPTE constants when updating hpte bits
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Even though we have same value for linux PTE bits and hash PTE bits use the hash pte bits when updating hash pte Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/platforms/cell/beat_htab.c | 4 ++-- arch/powerpc/platforms/pseries/lpar.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index c34ee4e..d4d245c 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -111,7 +111,7 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group, DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); if (rflags & _PAGE_NO_CACHE) - hpte_r &= ~_PAGE_COHERENT; + hpte_r &= ~HPTE_R_M; raw_spin_lock(&beat_htab_lock); lpar_rc = beat_read_mask(hpte_group); @@ -337,7 +337,7 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); if (rflags & _PAGE_NO_CACHE) - hpte_r &= ~_PAGE_COHERENT; + hpte_r &= ~HPTE_R_M; /* insert into not-volted entry */ lpar_rc = beat_insert_htab_entry3(0, hpte_group, hpte_v, hpte_r, diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 356bc75..c8fbef23 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -153,7 +153,8 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Make pHyp happy */ if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU)) - hpte_r &= ~_PAGE_COHERENT; + hpte_r &= ~HPTE_R_M; + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) flags |= H_COALESCE_CAND; -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 3/9] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE. On archs like ppc64 that don't use _PAGE_PROTNONE and also have a separate page table outside linux pagetable, we just need to make sure that when calling change_prot_numa we flush the hardware page table entry so that next page access result in a numa fault. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- include/linux/mm.h | 3 --- mm/mempolicy.c | 9 - 2 files changed, 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8b6e55e..5ab0e22 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1668,11 +1668,8 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); -#endif - struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0472964..efb4300 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -612,7 +612,6 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. 
Depending on these @@ -626,7 +625,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma-vm_page_prot, 0, 1); if (nr_updated) @@ -634,13 +632,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ /* * Walk through page tables and collect pages to be migrated. -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 8/9] powerpc: mm: Support setting _PAGE_NUMA bit on pmd entry which are pointer to PTE page
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 18 -- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 46db094..f828944 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -150,8 +150,22 @@ #define pmd_set(pmdp, pmdval) (pmd_val(*(pmdp)) = (pmdval)) #define pmd_none(pmd) (!pmd_val(pmd)) -#definepmd_bad(pmd)(!is_kernel_addr(pmd_val(pmd)) \ -|| (pmd_val(pmd) PMD_BAD_BITS)) + +static inline int pmd_bad(pmd_t pmd) +{ +#ifdef CONFIG_NUMA_BALANCING + /* +* For numa balancing we can have this set +*/ + if (pmd_val(pmd) _PAGE_NUMA) + return 0; +#endif + if (!is_kernel_addr(pmd_val(pmd)) || + (pmd_val(pmd) PMD_BAD_BITS)) + return 1; + return 0; +} + #definepmd_present(pmd)(pmd_val(pmd) != 0) #definepmd_clear(pmdp) (pmd_val(*(pmdp)) = 0) #define pmd_page_vaddr(pmd)(pmd_val(pmd) ~PMD_MASKED_BITS) -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 9/9] powerpc: mm: Enable numa faulting for hugepages
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Provide numa related functions for updating pmd entries. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 67ea8fb..aa3add7 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -95,19 +95,19 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, #define pmd_numa pmd_numa static inline int pmd_numa(pmd_t pmd) { - return 0; + return pte_numa(pmd_pte(pmd)); } #define pmd_mknonnuma pmd_mknonnuma static inline pmd_t pmd_mknonnuma(pmd_t pmd) { - return pmd; + return pte_pmd(pte_mknonnuma(pmd_pte(pmd))); } #define pmd_mknuma pmd_mknuma static inline pmd_t pmd_mknuma(pmd_t pmd) { - return pmd; + return pte_pmd(pte_mknuma(pmd_pte(pmd))); } # else -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 7/9] mm: numafaults: Use change_pmd_protnuma for updating _PAGE_NUMA for regular pmds
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Archs like ppc64 have different layout for pmd entries pointing to PTE page. Hence add a separate function for modifying them Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 17 + include/asm-generic/pgtable.h | 20 mm/memory.c| 2 +- mm/mprotect.c | 24 ++-- 4 files changed, 44 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9d87125..67ea8fb 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -75,6 +75,23 @@ static inline pte_t pte_mknuma(pte_t pte) return pte; } +#define change_pmd_protnuma change_pmd_protnuma +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, int prot_numa) +{ + /* +* We don't track the _PAGE_PRESENT bit here +*/ + unsigned long pmd_val; + pmd_val = pmd_val(*pmdp); + if (prot_numa) + pmd_val |= _PAGE_NUMA; + else + pmd_val = ~_PAGE_NUMA; + pmd_set(pmdp, pmd_val | _PAGE_NUMA); +} + + #define pmd_numa pmd_numa static inline int pmd_numa(pmd_t pmd) { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f330d28..568a8c4 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -697,6 +697,18 @@ static inline pmd_t pmd_mknuma(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } #endif + +#ifndef change_pmd_protnuma +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd, int prot_numa) +{ + if (prot_numa) + set_pmd_at(mm, addr PMD_MASK, pmd, pmd_mknuma(*pmd)); + else + set_pmd_at(mm, addr PMD_MASK, pmd, pmd_mknonnuma(*pmd)); +} + +#endif #else extern int pte_numa(pte_t pte); extern int pmd_numa(pmd_t pmd); @@ -704,6 +716,8 @@ extern pte_t pte_mknonnuma(pte_t pte); extern pmd_t pmd_mknonnuma(pmd_t pmd); extern pte_t pte_mknuma(pte_t pte); extern pmd_t pmd_mknuma(pmd_t pmd); +extern void 
change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd, int prot_numa); #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ #else static inline int pmd_numa(pmd_t pmd) @@ -735,6 +749,12 @@ static inline pmd_t pmd_mknuma(pmd_t pmd) { return pmd; } + +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd, int prot_numa) +{ + BUG(); +} #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ diff --git a/mm/memory.c b/mm/memory.c index ca00039..e930e50 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3605,7 +3605,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(mm-page_table_lock); pmd = *pmdp; if (pmd_numa(pmd)) { - set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); + change_pmd_protnuma(mm, _addr, pmdp, 0); numa = true; } spin_unlock(mm-page_table_lock); diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4..88de575 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -112,22 +112,6 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return pages; } -#ifdef CONFIG_NUMA_BALANCING -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - spin_lock(mm-page_table_lock); - set_pmd_at(mm, addr PMD_MASK, pmd, pmd_mknuma(*pmd)); - spin_unlock(mm-page_table_lock); -} -#else -static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, - pmd_t *pmd) -{ - BUG(); -} -#endif /* CONFIG_NUMA_BALANCING */ - static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -161,8 +145,12 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, * node. 
This allows a regular PMD to be handled as one fault * and effectively batches the taking of the PTL */ - if (prot_numa all_same_node) - change_pmd_protnuma(vma-vm_mm, addr, pmd); + if (prot_numa all_same_node) { + spin_lock(vma-vm_mm-page_table_lock); + change_pmd_protnuma(vma-vm_mm, addr, pmd, 1); + spin_unlock(vma-vm_mm-page_table_lock); + + } } while (pmd
[RFC PATCH 6/9] powerpc: mm: book3s: Disable hugepaged pmd format for book3s
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com After commit e2b3d202d1dba8f3546ed28224ce485bc50010be we have the below possible formats for pmd entry (1) invalid (all zeroes) (2) pointer to next table, as normal; bottom 6 bits == 0 (3) leaf pte for huge page, bottom two bits != 00 (4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table On book3s we don't really use the (4). For Numa balancing we need to tag pmd entries that are pointer to next table with _PAGE_NUMA for performance reason (9532fec118d485ea37ab6e3ea372d68cd8b4cd0d). This patch enables that by disabling hugepd support for book3s if NUMA_BALANCING is enabled. We ideally want to get rid of hugepd pointer completely. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/page.h | 11 +++ arch/powerpc/mm/hugetlbpage.c | 8 +++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h index b9f4262..791ab56 100644 --- a/arch/powerpc/include/asm/page.h +++ b/arch/powerpc/include/asm/page.h @@ -369,11 +369,22 @@ typedef struct { signed long pd; } hugepd_t; #ifdef CONFIG_PPC_BOOK3S_64 static inline int hugepd_ok(hugepd_t hpd) { +#ifdef CONFIG_NUMA_BALANCING + /* +* In order to enable batch handling of pte numa faults, Numa balancing +* code use the _PAGE_NUMA bit even on pmd that is pointing to PTE PAGE. +* 9532fec118d485ea37ab6e3ea372d68cd8b4cd0d. After commit +* e2b3d202d1dba8f3546ed28224ce485bc50010be we really don't need to +* support hugepd for ppc64. 
+*/ + return 0; +#else /* * hugepd pointer, bottom two bits == 00 and next 4 bits * indicate size of table */ return (((hpd.pd 0x3) == 0x0) ((hpd.pd HUGEPD_SHIFT_MASK) != 0)); +#endif } #else static inline int hugepd_ok(hugepd_t hpd) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d67db4b..71bd214 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -235,8 +235,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz if (!hpdp) return NULL; +#ifdef CONFIG_NUMA_BALANCING + /* +* We cannot support hugepd format with numa balancing support +* enabled. +*/ + return NULL; +#endif BUG_ON(!hugepd_none(*hpdp) !hugepd_ok(*hpdp)); - if (hugepd_none(*hpdp) __hugepte_alloc(mm, hpdp, addr, pdshift, pshift)) return NULL; -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 5/9] powerpc: mm: book3s: Enable _PAGE_NUMA for book3s
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We steal the _PAGE_COHERENCE bit and use that for indicating NUMA ptes. This patch still disables the numa hinting using pmd entries. That require further changes to pmd entry format which is done in later patches. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 66 +- arch/powerpc/include/asm/pte-hash64.h | 6 arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 7d6eacf..9d87125 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -3,6 +3,7 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ +#include linux/mmdebug.h #include asm/processor.h /* For TASK_SIZE */ #include asm/mmu.h #include asm/page.h @@ -33,10 +34,73 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) _PAGE_FILE; } static inline int pte_special(pte_t pte) { return pte_val(pte) _PAGE_SPECIAL; } -static inline int pte_present(pte_t pte) { return pte_val(pte) _PAGE_PRESENT; } static inline int pte_none(pte_t pte) { return (pte_val(pte) ~_PTE_NONE_MASK) == 0; } static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) PAGE_PROT_BITS); } +#ifdef CONFIG_NUMA_BALANCING + +static inline int pte_present(pte_t pte) +{ + return pte_val(pte) (_PAGE_PRESENT | _PAGE_NUMA); +} + +#define pte_numa pte_numa +static inline int pte_numa(pte_t pte) +{ + return (pte_val(pte) + (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} + +#define pte_mknonnuma pte_mknonnuma +static inline pte_t pte_mknonnuma(pte_t pte) +{ + pte_val(pte) = ~_PAGE_NUMA; + pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED; + return pte; +} + +#define pte_mknuma pte_mknuma +static inline pte_t pte_mknuma(pte_t 
pte) +{ + /* +* We should not set _PAGE_NUMA on non present ptes. Also clear the +* present bit so that hash_page will return 1 and we collect this +* as numa fault. +*/ + if (pte_present(pte)) { + pte_val(pte) |= _PAGE_NUMA; + pte_val(pte) = ~_PAGE_PRESENT; + } else + VM_BUG_ON(1); + return pte; +} + +#define pmd_numa pmd_numa +static inline int pmd_numa(pmd_t pmd) +{ + return 0; +} + +#define pmd_mknonnuma pmd_mknonnuma +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ + return pmd; +} + +#define pmd_mknuma pmd_mknuma +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ + return pmd; +} + +# else + +static inline int pte_present(pte_t pte) +{ + return pte_val(pte) _PAGE_PRESENT; +} +#endif /* CONFIG_NUMA_BALANCING */ + /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. * diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 55aea0c..2505d8e 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -27,6 +27,12 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE hash are busy */ +/* + * Used for tracking numa faults + */ +#define _PAGE_NUMA 0x0010 /* Gather numa placement stats */ + + /* No separate kernel read-only */ #define _PAGE_KERNEL_RW(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6704e2e..c9d6223 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -72,6 +72,7 @@ config PPC_BOOK3S_64 select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES + select ARCH_SUPPORTS_NUMA_BALANCING config PPC_BOOK3E_64 bool Embedded processors -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org 
https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 2/9] powerpc: Free up _PAGE_COHERENCE for numa fault use later
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Set memory coherence always on hash64 config. If a platform cannot have memory coherence always set they can infer that from _PAGE_NO_CACHE and _PAGE_WRITETHRU like in lpar. So we dont' really need a separate bit for tracking _PAGE_COHERENCE. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pte-hash64.h | 2 +- arch/powerpc/mm/hash_low_64.S | 15 --- arch/powerpc/mm/hash_utils_64.c | 7 --- arch/powerpc/mm/hugepage-hash64.c | 6 +- arch/powerpc/mm/hugetlbpage-hash64.c | 4 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 0419eeb..55aea0c 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -19,7 +19,7 @@ #define _PAGE_FILE 0x0002 /* (!present only) software: pte holds file offset */ #define _PAGE_EXEC 0x0004 /* No execute on POWER4 and newer (we invert) */ #define _PAGE_GUARDED 0x0008 -#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */ +/* We can derive Memory coherence from _PAGE_NO_CACHE */ #define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */ #define _PAGE_WRITETHRU0x0040 /* W: cache write-through */ #define _PAGE_DIRTY0x0080 /* C: page changed */ diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index d3cbda6..1136d26 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) and r0,r0,r4/* _PAGE_RW _PAGE_DIRTY -r0 bit 30*/ andcr0,r30,r0 /* r0 = pte ~r0 */ rlwimi r3,r0,32-1,31,31/* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add C bit for perf. */ + /* +* Always add C bit for perf. Memory coherence is always enabled +*/ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) 
@@ -457,7 +460,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) and r0,r0,r4/* _PAGE_RW _PAGE_DIRTY -r0 bit 30*/ andcr0,r3,r0/* r0 = pte ~r0 */ rlwimi r3,r0,32-1,31,31/* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add C bit for perf. */ + /* +* Always add C bit for perf. Memory coherence is always enabled +*/ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) @@ -795,7 +801,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) and r0,r0,r4/* _PAGE_RW _PAGE_DIRTY -r0 bit 30*/ andcr0,r30,r0 /* r0 = pte ~r0 */ rlwimi r3,r0,32-1,31,31/* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add C bit for perf. */ + /* +* Always add C bit for perf. Memory coherence is always enabled +*/ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index bde8b55..fb176e9 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags) if ((pteflags _PAGE_USER) !((pteflags _PAGE_RW) (pteflags _PAGE_DIRTY))) rflags |= 1; - - /* Always add C */ - return rflags | HPTE_R_C; + /* +* Always add C bit for perf. 
Memory coherence is always enabled +*/ + return rflags | HPTE_R_C | HPTE_R_M; } int htab_bolt_mapping(unsigned long vstart, unsigned long vend, diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 34de9e0..826893f 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -127,7 +127,11 @@ repeat: /* Add in WIMG bits */ rflags |= (new_pmd (_PAGE_WRITETHRU | _PAGE_NO_CACHE | - _PAGE_COHERENT | _PAGE_GUARDED)); + _PAGE_GUARDED)); + /* +* enable the memory coherence always +*/ + rflags |= HPTE_R_M; /* Insert into the hash table, primary slot */ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage
[PATCH] powerpc: book3s: kvm: Don't abuse host r2 in exit path
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We don't use PACATOC for PR. Avoid updating HOST_R2 with PR KVM mode when both HV and PR are enabled in the kernel. Without this we get the below crash (qemu) Unable to handle kernel paging request for data at address 0x8310 Faulting instruction address: 0xc001d5a4 cpu 0x2: Vector: 300 (Data Access) at [c001dc53aef0] pc: c001d5a4: .vtime_delta.isra.1+0x34/0x1d0 lr: c001d760: .vtime_account_system+0x20/0x60 sp: c001dc53b170 msr: 80009032 dar: 8310 dsisr: 4000 current = 0xc001d76c62d0 paca= 0xcfef1100 softe: 0irq_happened: 0x01 pid = 4472, comm = qemu-system-ppc enter ? for help [c001dc53b200] c001d760 .vtime_account_system+0x20/0x60 [c001dc53b290] c008d050 .kvmppc_handle_exit_pr+0x60/0xa50 [c001dc53b340] c008f51c kvm_start_lightweight+0xb4/0xc4 [c001dc53b510] c008cdf0 .kvmppc_vcpu_run_pr+0x150/0x2e0 [c001dc53b9e0] c008341c .kvmppc_vcpu_run+0x2c/0x40 [c001dc53ba50] c0080af4 .kvm_arch_vcpu_ioctl_run+0x54/0x1b0 [c001dc53bae0] c007b4c8 .kvm_vcpu_ioctl+0x478/0x730 [c001dc53bca0] c02140cc .do_vfs_ioctl+0x4ac/0x770 [c001dc53bd80] c02143e8 .SyS_ioctl+0x58/0xb0 [c001dc53be30] c0009e58 syscall_exit+0x0/0x98 --- Exception: c00 (System Call) at 1f960160 SP (1ecbe3c0) is in userspace These changes were originally part of http://mid.gmane.org/20130806042205.gr19...@iris.ozlabs.ibm.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s_asm.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 0bd9348..69fe837 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -79,6 +79,7 @@ struct kvmppc_host_state { ulong vmhandler; ulong scratch0; ulong scratch1; + ulong scratch2; u8 in_guest; u8 restore_hid5; u8 napping; diff --git 
a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8e6ede6..841a4c8 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -583,6 +583,7 @@ int main(void) HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler); HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); + HSTATE_FIELD(HSTATE_SCRATCH2, scratch2); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); HSTATE_FIELD(HSTATE_NAPPING, napping); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 339aa5e..16f7654 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -750,15 +750,14 @@ kvmppc_interrupt_hv: * guest CR, R12 saved in shadow VCPU SCRATCH1/0 * guest R13 saved in SPRN_SCRATCH0 */ - /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ - std r9, HSTATE_HOST_R2(r13) + std r9, HSTATE_SCRATCH2(r13) lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9, KVM_GUEST_MODE_GUEST - ld r9, HSTATE_HOST_R2(r13) + ld r9, HSTATE_SCRATCH2(r13) beq kvmppc_interrupt_pr #endif /* We're now back in the host but in guest MMU context */ @@ -778,7 +777,7 @@ kvmppc_interrupt_hv: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_HOST_R2(r13) + ld r0, HSTATE_SCRATCH2(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Don't try to compute these values. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- NOTE: I am not sure why we were originally computing dsisr and dar. So may be we need a variant of this patch. But with this and the additional patch powerpc: book3s: PR: Enable Little Endian PR guest I am able to get a Little Endian PR guest to boot. arch/powerpc/kvm/book3s_emulate.c | 64 ++- 1 file changed, 2 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 99d40f8..62768f9 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -569,70 +569,10 @@ unprivileged: u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) { - u32 dsisr = 0; - - /* -* This is what the spec says about DSISR bits (not mentioned = 0): -* -* 12:13[DS]Set to bits 30:31 -* 15:16[X] Set to bits 29:30 -* 17 [X] Set to bit 25 -* [D/DS] Set to bit 5 -* 18:21[X] Set to bits 21:24 -* [D/DS] Set to bits 1:4 -* 22:26Set to bits 6:10 (RT/RS/FRT/FRS) -* 27:31Set to bits 11:15 (RA) -*/ - - switch (get_op(inst)) { - /* D-form */ - case OP_LFS: - case OP_LFD: - case OP_STFD: - case OP_STFS: - dsisr |= (inst 12) 0x4000; /* bit 17 */ - dsisr |= (inst 17) 0x3c00; /* bits 18:21 */ - break; - /* X-form */ - case 31: - dsisr |= (inst 14) 0x18000; /* bits 15:16 */ - dsisr |= (inst 8) 0x04000; /* bit 17 */ - dsisr |= (inst 3) 0x03c00; /* bits 18:21 */ - break; - default: - printk(KERN_INFO KVM: Unaligned instruction 0x%x\n, inst); - break; - } - - dsisr |= (inst 16) 0x03ff; /* bits 22:31 */ - - return dsisr; + return vcpu-arch.fault_dsisr; } ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) { - ulong dar = 0; - ulong ra = get_ra(inst); - ulong rb = get_rb(inst); - - switch (get_op(inst)) { - case OP_LFS: - case OP_LFD: - case OP_STFD: - case OP_STFS: - if (ra) - dar = kvmppc_get_gpr(vcpu, ra); - dar += (s32)((s16)inst); - 
break; - case 31: - if (ra) - dar = kvmppc_get_gpr(vcpu, ra); - dar += kvmppc_get_gpr(vcpu, rb); - break; - default: - printk(KERN_INFO KVM: Unaligned instruction 0x%x\n, inst); - break; - } - - return dar; + return vcpu-arch.fault_dar; } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: book3s: PR: Enable Little Endian PR guest
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch make sure we inherit the LE bit correctly in different case so that we can run Little Endian distro in PR mode Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- This patch depends on the below two changes 1) [PATCH v5 0/6] KVM: PPC: Book3S: MMIO support for Little Endian guests (kvm-ppc) http://mid.gmane.org/1383672128-26795-1-git-send-email-...@fr.ibm.com 2) [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values http://mid.gmane.org/1384178577-23721-1-git-send-email-aneesh.ku...@linux.vnet.ibm.com arch/powerpc/kvm/book3s_64_mmu.c | 2 +- arch/powerpc/kvm/book3s_pr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 83da1f8..d339096 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -38,7 +38,7 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, MSR_SF); + kvmppc_set_msr(vcpu, MSR_SF | (vcpu-arch.shared-msr MSR_LE)); } static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index a7fe87a..cf9362c 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -226,7 +226,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) ulong smsr = vcpu-arch.shared-msr; /* Guest MSR values */ - smsr = MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; + smsr = MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; /* Process MSR values */ smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: booke: Fix build failures
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com arch/powerpc/platforms/wsp/wsp.c: In function ‘wsp_probe_devices’: arch/powerpc/platforms/wsp/wsp.c:76:3: error: implicit declaration of function ‘of_address_to_resource’ [-Werror=implicit-function-declaration] Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/platforms/wsp/chroma.c | 1 + arch/powerpc/platforms/wsp/h8.c | 1 + arch/powerpc/platforms/wsp/ics.c | 2 ++ arch/powerpc/platforms/wsp/opb_pic.c | 2 ++ arch/powerpc/platforms/wsp/psr2.c | 1 + arch/powerpc/platforms/wsp/scom_wsp.c | 1 + arch/powerpc/platforms/wsp/wsp.c | 1 + 7 files changed, 9 insertions(+) diff --git a/arch/powerpc/platforms/wsp/chroma.c b/arch/powerpc/platforms/wsp/chroma.c index 8ef53bc2e70e..aaa46b353715 100644 --- a/arch/powerpc/platforms/wsp/chroma.c +++ b/arch/powerpc/platforms/wsp/chroma.c @@ -15,6 +15,7 @@ #include linux/of.h #include linux/smp.h #include linux/time.h +#include linux/of_fdt.h #include asm/machdep.h #include asm/udbg.h diff --git a/arch/powerpc/platforms/wsp/h8.c b/arch/powerpc/platforms/wsp/h8.c index d18e6cc19df3..a3c87f395750 100644 --- a/arch/powerpc/platforms/wsp/h8.c +++ b/arch/powerpc/platforms/wsp/h8.c @@ -10,6 +10,7 @@ #include linux/kernel.h #include linux/of.h #include linux/io.h +#include linux/of_address.h #include wsp.h diff --git a/arch/powerpc/platforms/wsp/ics.c b/arch/powerpc/platforms/wsp/ics.c index 2d3b1dd9571d..9cd92e645028 100644 --- a/arch/powerpc/platforms/wsp/ics.c +++ b/arch/powerpc/platforms/wsp/ics.c @@ -18,6 +18,8 @@ #include linux/smp.h #include linux/spinlock.h #include linux/types.h +#include linux/of_address.h +#include linux/of_irq.h #include asm/io.h #include asm/irq.h diff --git a/arch/powerpc/platforms/wsp/opb_pic.c b/arch/powerpc/platforms/wsp/opb_pic.c index cb565bf93650..3f6729807938 100644 --- a/arch/powerpc/platforms/wsp/opb_pic.c +++ b/arch/powerpc/platforms/wsp/opb_pic.c @@ -15,6 +15,8 @@ #include linux/of.h #include linux/slab.h #include 
linux/time.h +#include linux/of_address.h +#include linux/of_irq.h #include asm/reg_a2.h #include asm/irq.h diff --git a/arch/powerpc/platforms/wsp/psr2.c b/arch/powerpc/platforms/wsp/psr2.c index 508ec8282b96..a87b414c766a 100644 --- a/arch/powerpc/platforms/wsp/psr2.c +++ b/arch/powerpc/platforms/wsp/psr2.c @@ -15,6 +15,7 @@ #include linux/of.h #include linux/smp.h #include linux/time.h +#include linux/of_fdt.h #include asm/machdep.h #include asm/udbg.h diff --git a/arch/powerpc/platforms/wsp/scom_wsp.c b/arch/powerpc/platforms/wsp/scom_wsp.c index 8928507affea..6538b4de34fc 100644 --- a/arch/powerpc/platforms/wsp/scom_wsp.c +++ b/arch/powerpc/platforms/wsp/scom_wsp.c @@ -14,6 +14,7 @@ #include linux/of.h #include linux/spinlock.h #include linux/types.h +#include linux/of_address.h #include asm/cputhreads.h #include asm/reg_a2.h diff --git a/arch/powerpc/platforms/wsp/wsp.c b/arch/powerpc/platforms/wsp/wsp.c index ddb6efe88914..58cd1f00e1ef 100644 --- a/arch/powerpc/platforms/wsp/wsp.c +++ b/arch/powerpc/platforms/wsp/wsp.c @@ -13,6 +13,7 @@ #include linux/smp.h #include linux/delay.h #include linux/time.h +#include linux/of_address.h #include asm/scom.h -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2 1/5] powerpc: Use HPTE constants when updating hpte bits
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Even though we have the same value for linux PTE bits and hash PTE bits, use the hash pte bits when updating the hash pte Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/platforms/cell/beat_htab.c | 4 ++-- arch/powerpc/platforms/pseries/lpar.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/platforms/cell/beat_htab.c b/arch/powerpc/platforms/cell/beat_htab.c index c34ee4e60873..d4d245c0d787 100644 --- a/arch/powerpc/platforms/cell/beat_htab.c +++ b/arch/powerpc/platforms/cell/beat_htab.c @@ -111,7 +111,7 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group, DBG_LOW( hpte_v=%016lx, hpte_r=%016lx\n, hpte_v, hpte_r); if (rflags _PAGE_NO_CACHE) - hpte_r = ~_PAGE_COHERENT; + hpte_r = ~HPTE_R_M; raw_spin_lock(beat_htab_lock); lpar_rc = beat_read_mask(hpte_group); @@ -337,7 +337,7 @@ static long beat_lpar_hpte_insert_v3(unsigned long hpte_group, DBG_LOW( hpte_v=%016lx, hpte_r=%016lx\n, hpte_v, hpte_r); if (rflags _PAGE_NO_CACHE) - hpte_r = ~_PAGE_COHERENT; + hpte_r = ~HPTE_R_M; /* insert into not-volted entry */ lpar_rc = beat_insert_htab_entry3(0, hpte_group, hpte_v, hpte_r, diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 356bc75ca74f..c8fbef238d4b 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -153,7 +153,8 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, /* Make pHyp happy */ if ((rflags _PAGE_NO_CACHE) !(rflags _PAGE_WRITETHRU)) - hpte_r = ~_PAGE_COHERENT; + hpte_r = ~HPTE_R_M; + if (firmware_has_feature(FW_FEATURE_XCMO) !(hpte_r HPTE_R_N)) flags |= H_COALESCE_CAND; -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2 4/5] powerpc: mm: Only check for _PAGE_PRESENT in set_pte/pmd functions
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We want to make sure we don't use these function when updating a pte or pmd entry that have a valid hpte entry, because these functions don't invalidate them. So limit the check to _PAGE_PRESENT bit. Numafault core changes use these functions for updating _PAGE_NUMA bits. That should be ok because when _PAGE_NUMA is set we can be sure that hpte entries are not present. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/mm/pgtable.c| 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 841e0d00863c..ad90429bbd8b 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -174,7 +174,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { #ifdef CONFIG_DEBUG_VM - WARN_ON(pte_present(*ptep)); + WARN_ON(pte_val(*ptep) _PAGE_PRESENT); #endif /* Note: mm-context.id might not yet have been assigned as * this context might not have been activated yet when this diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 9d95786aa80f..02e8681fb865 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -687,7 +687,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { #ifdef CONFIG_DEBUG_VM - WARN_ON(!pmd_none(*pmdp)); + WARN_ON(pmd_val(*pmdp) _PAGE_PRESENT); assert_spin_locked(mm-page_table_lock); WARN_ON(!pmd_trans_huge(pmd)); #endif -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2 3/5] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE. On archs like ppc64 that don't use _PAGE_PROTNONE and also have a separate page table outside linux pagetable, we just need to make sure that when calling change_prot_numa we flush the hardware page table entry so that next page access result in a numa fault. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- include/linux/mm.h | 3 --- mm/mempolicy.c | 9 - 2 files changed, 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0548eb201e05..51794c1a1d7e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1851,11 +1851,8 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); -#endif - struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c4403cdf3433..cae10af4fdc4 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -613,7 +613,6 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. 
Depending on these @@ -627,7 +626,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma-vm_page_prot, 0, 1); if (nr_updated) @@ -635,13 +633,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, return nr_updated; } -#else -static unsigned long change_prot_numa(struct vm_area_struct *vma, - unsigned long addr, unsigned long end) -{ - return 0; -} -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ /* * Walk through page tables and collect pages to be migrated. -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2 2/5] powerpc: Free up _PAGE_COHERENCE for numa fault use later
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Set memory coherence always on hash64 config. If a platform cannot have memory coherence always set they can infer that from _PAGE_NO_CACHE and _PAGE_WRITETHRU like in lpar. So we dont' really need a separate bit for tracking _PAGE_COHERENCE. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pte-hash64.h | 2 +- arch/powerpc/mm/hash_low_64.S | 15 --- arch/powerpc/mm/hash_utils_64.c | 7 --- arch/powerpc/mm/hugepage-hash64.c | 6 +- arch/powerpc/mm/hugetlbpage-hash64.c | 4 5 files changed, 26 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 0419eeb53274..55aea0caf95e 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -19,7 +19,7 @@ #define _PAGE_FILE 0x0002 /* (!present only) software: pte holds file offset */ #define _PAGE_EXEC 0x0004 /* No execute on POWER4 and newer (we invert) */ #define _PAGE_GUARDED 0x0008 -#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */ +/* We can derive Memory coherence from _PAGE_NO_CACHE */ #define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */ #define _PAGE_WRITETHRU0x0040 /* W: cache write-through */ #define _PAGE_DIRTY0x0080 /* C: page changed */ diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S index d3cbda62857b..1136d26a95ae 100644 --- a/arch/powerpc/mm/hash_low_64.S +++ b/arch/powerpc/mm/hash_low_64.S @@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) and r0,r0,r4/* _PAGE_RW _PAGE_DIRTY -r0 bit 30*/ andcr0,r30,r0 /* r0 = pte ~r0 */ rlwimi r3,r0,32-1,31,31/* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add C bit for perf. */ + /* +* Always add C bit for perf. Memory coherence is always enabled +*/ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) 
@@ -457,7 +460,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) and r0,r0,r4/* _PAGE_RW _PAGE_DIRTY -r0 bit 30*/ andcr0,r3,r0/* r0 = pte ~r0 */ rlwimi r3,r0,32-1,31,31/* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add C bit for perf. */ + /* +* Always add C bit for perf. Memory coherence is always enabled +*/ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) @@ -795,7 +801,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) and r0,r0,r4/* _PAGE_RW _PAGE_DIRTY -r0 bit 30*/ andcr0,r30,r0 /* r0 = pte ~r0 */ rlwimi r3,r0,32-1,31,31/* Insert result into PP lsb */ - ori r3,r3,HPTE_R_C /* Always add C bit for perf. */ + /* +* Always add C bit for perf. Memory coherence is always enabled +*/ + ori r3,r3,HPTE_R_C | HPTE_R_M /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 6176b3cdf579..de6881259aef 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long pteflags) if ((pteflags _PAGE_USER) !((pteflags _PAGE_RW) (pteflags _PAGE_DIRTY))) rflags |= 1; - - /* Always add C */ - return rflags | HPTE_R_C; + /* +* Always add C bit for perf. 
Memory coherence is always enabled +*/ + return rflags | HPTE_R_C | HPTE_R_M; } int htab_bolt_mapping(unsigned long vstart, unsigned long vend, diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index 34de9e0cdc34..826893fcb3a7 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c @@ -127,7 +127,11 @@ repeat: /* Add in WIMG bits */ rflags |= (new_pmd (_PAGE_WRITETHRU | _PAGE_NO_CACHE | - _PAGE_COHERENT | _PAGE_GUARDED)); + _PAGE_GUARDED)); + /* +* enable the memory coherence always +*/ + rflags |= HPTE_R_M; /* Insert into the hash table, primary slot */ slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0, diff --git a/arch/powerpc/mm/hugetlbpage
[PATCH -V2 0/5] powerpc: mm: Numa faults support for ppc64
Hi, This patch series adds support for numa faults on ppc64 architecture. We steal the _PAGE_COHERENCE bit and use that for indicating _PAGE_NUMA. We clear the _PAGE_PRESENT bit and also invalidate the hpte entry on setting _PAGE_NUMA. The next fault on that page will be considered a numa fault. Changes from V1: * Dropped a few patches related to pmd update because batch handling of pmd pages got dropped from core code 0f19c17929c952c6f0966d93ab05558e7bf814cc mm: numa: Do not batch handle PMD pages This also avoided the large lock contention on page_table_lock that we observed with the previous series. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2 5/5] powerpc: mm: book3s: Enable _PAGE_NUMA for book3s
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We steal the _PAGE_COHERENCE bit and use that for indicating NUMA ptes. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 66 +- arch/powerpc/include/asm/pte-hash64.h | 6 arch/powerpc/platforms/Kconfig.cputype | 1 + 3 files changed, 72 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 7d6eacf249cf..b999ca318985 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -3,6 +3,7 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ +#include linux/mmdebug.h #include asm/processor.h /* For TASK_SIZE */ #include asm/mmu.h #include asm/page.h @@ -33,10 +34,73 @@ static inline int pte_dirty(pte_t pte) { return pte_val(pte) _PAGE_DIRTY; } static inline int pte_young(pte_t pte) { return pte_val(pte) _PAGE_ACCESSED; } static inline int pte_file(pte_t pte) { return pte_val(pte) _PAGE_FILE; } static inline int pte_special(pte_t pte) { return pte_val(pte) _PAGE_SPECIAL; } -static inline int pte_present(pte_t pte) { return pte_val(pte) _PAGE_PRESENT; } static inline int pte_none(pte_t pte) { return (pte_val(pte) ~_PTE_NONE_MASK) == 0; } static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) PAGE_PROT_BITS); } +#ifdef CONFIG_NUMA_BALANCING + +static inline int pte_present(pte_t pte) +{ + return pte_val(pte) (_PAGE_PRESENT | _PAGE_NUMA); +} + +#define pte_numa pte_numa +static inline int pte_numa(pte_t pte) +{ + return (pte_val(pte) + (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} + +#define pte_mknonnuma pte_mknonnuma +static inline pte_t pte_mknonnuma(pte_t pte) +{ + pte_val(pte) = ~_PAGE_NUMA; + pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED; + return pte; +} + +#define pte_mknuma pte_mknuma +static inline pte_t pte_mknuma(pte_t pte) +{ + /* +* We should not set _PAGE_NUMA on non present ptes. 
Also clear the +* present bit so that hash_page will return 1 and we collect this +* as numa fault. +*/ + if (pte_present(pte)) { + pte_val(pte) |= _PAGE_NUMA; + pte_val(pte) = ~_PAGE_PRESENT; + } else + VM_BUG_ON(1); + return pte; +} + +#define pmd_numa pmd_numa +static inline int pmd_numa(pmd_t pmd) +{ + return pte_numa(pmd_pte(pmd)); +} + +#define pmd_mknonnuma pmd_mknonnuma +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ + return pte_pmd(pte_mknonnuma(pmd_pte(pmd))); +} + +#define pmd_mknuma pmd_mknuma +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ + return pte_pmd(pte_mknuma(pmd_pte(pmd))); +} + +# else + +static inline int pte_present(pte_t pte) +{ + return pte_val(pte) _PAGE_PRESENT; +} +#endif /* CONFIG_NUMA_BALANCING */ + /* Conversion functions: convert a page and protection to a page entry, * and a page entry and page directory to the page they refer to. * diff --git a/arch/powerpc/include/asm/pte-hash64.h b/arch/powerpc/include/asm/pte-hash64.h index 55aea0caf95e..2505d8eab15c 100644 --- a/arch/powerpc/include/asm/pte-hash64.h +++ b/arch/powerpc/include/asm/pte-hash64.h @@ -27,6 +27,12 @@ #define _PAGE_RW 0x0200 /* software: user write access allowed */ #define _PAGE_BUSY 0x0800 /* software: PTE hash are busy */ +/* + * Used for tracking numa faults + */ +#define _PAGE_NUMA 0x0010 /* Gather numa placement stats */ + + /* No separate kernel read-only */ #define _PAGE_KERNEL_RW(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index c2a566fb8bb8..2048655d8ec4 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -72,6 +72,7 @@ config PPC_BOOK3S_64 select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES + select ARCH_SUPPORTS_NUMA_BALANCING config PPC_BOOK3E_64 bool Embedded processors -- 1.8.3.2 ___ Linuxppc-dev 
mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2] powerpc: book3s: PR: Enable Little Endian PR guest
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch makes sure we inherit the LE bit correctly in different cases so that we can run Little Endian distro in PR mode Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Changes from V1: * Use LPCR bit to find whether to enable LE on interrupt. We do it more or less the same as HV now. We keep it separate at this point because HV H_SETMODE does a lot more than what we do here. This patch depends on the below three changes 1) [PATCH v5 0/6] KVM: PPC: Book3S: MMIO support for Little Endian guests (kvm-ppc) http://mid.gmane.org/1383672128-26795-1-git-send-email-...@fr.ibm.com 2) [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values http://mid.gmane.org/1384178577-23721-1-git-send-email-aneesh.ku...@linux.vnet.ibm.com 3) [PATCH 11/15] KVM: PPC: Book3S HV: Add little-endian guest support http://mid.gmane.org/1383995103-24732-12-git-send-email-pau...@samba.org With further changes to make it apply to latest upstream. 
arch/powerpc/include/asm/kvm_host.h | 4 +-- arch/powerpc/kernel/asm-offsets.c | 4 +-- arch/powerpc/kvm/book3s_64_mmu.c| 2 +- arch/powerpc/kvm/book3s_pr.c| 3 +- arch/powerpc/kvm/book3s_pr_papr.c | 57 + 5 files changed, 64 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index cecd88338f28..1e67adc725d2 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -240,7 +240,6 @@ struct kvm_arch { unsigned long sdr1; unsigned long host_sdr1; int tlbie_lock; - unsigned long lpcr; unsigned long rmor; struct kvm_rma_info *rma; unsigned long vrma_slb_v; @@ -261,6 +260,7 @@ struct kvm_arch { struct mutex hpt_mutex; #endif #ifdef CONFIG_PPC_BOOK3S_64 + unsigned long lpcr; struct list_head spapr_tce_tables; struct list_head rtas_tokens; #endif @@ -524,6 +524,7 @@ struct kvm_vcpu_arch { #ifdef CONFIG_PPC_BOOK3S ulong fault_dar; u32 fault_dsisr; + unsigned long intr_msr; #endif #ifdef CONFIG_BOOKE @@ -616,7 +617,6 @@ struct kvm_vcpu_arch { spinlock_t tbacct_lock; u64 busy_stolen; u64 busy_preempt; - unsigned long intr_msr; #endif }; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 39dbcb3d3d7d..136c4bec52ab 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -473,7 +473,6 @@ int main(void) #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); - DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); @@ -484,9 +483,9 @@ int main(void) DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, 
arch.vpa.dirty)); - DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); #endif #ifdef CONFIG_PPC_BOOK3S + DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id)); DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr)); DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr)); @@ -510,6 +509,7 @@ int main(void) DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 83da1f868fd5..8231b83c493b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -38,7 +38,7 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, MSR_SF); + kvmppc_set_msr(vcpu, vcpu-arch.intr_msr); } static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index f84778547c6b..dc22643a45d2 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -226,7 +226,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) ulong smsr = vcpu-arch.shared-msr
Re: [PATCH -V2 3/5] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE
Adding Mel and Rik to cc: Benjamin Herrenschmidt b...@au1.ibm.com writes: On Mon, 2013-11-18 at 14:58 +0530, Aneesh Kumar K.V wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE. On archs like ppc64 that don't use _PAGE_PROTNONE and also have a separate page table outside linux pagetable, we just need to make sure that when calling change_prot_numa we flush the hardware page table entry so that next page access result in a numa fault. That patch doesn't look right... You are essentially making change_prot_numa() do whatever it does (which I don't completely understand) *for all architectures* now, whether they have CONFIG_ARCH_USES_NUMA_PROT_NONE or not ... So because you want that behaviour on powerpc book3s64, you change everybody. Is that correct ? Yes. Also what exactly is that doing, can you explain ? From what I can see, it calls back into the core of mprotect to change the protection to vma-vm_page_prot, which I would have expected is already the protection there, with the added prot_numa flag passed down. it set the _PAGE_NUMA bit. Now we also want to make sure that when we set _PAGE_NUMA, we would get a pagefault on that so that we can track that fault as a numa fault. To ensure that, we had the below BUILD_BUG BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); But other than that the function doesn't really have any dependency on _PAGE_PROTNONE. The only requirement is when we set _PAGE_NUMA, the architecture should do enough to ensure that we get a page fault. Now on ppc64 we does that by clearlying hpte entry and also clearing _PAGE_PRESENT. Since we have _PAGE_PRESENT cleared hash_page will return 1 and we get to page fault handler. Your changeset comment says On archs like ppc64 [...] we just need to make sure that when calling change_prot_numa we flush the hardware page table entry so that next page access result in a numa fault. But change_prot_numa() does a lot more than that ... 
it does pte_mknuma(), do we need it ? I assume we do or we wouldn't have added that PTE bit to begin with... Now it *might* be allright and it might be that no other architecture cares anyway etc... but I need at least some mm folks to ack on that patch before I can take it because it *will* change behaviour of other architectures. Ok, I can move the changes below #ifdef CONFIG_NUMA_BALANCING ? We call change_prot_numa from task_numa_work and queue_pages_range(). The later may be an issue. So doing the below will help ? -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +#ifdef CONFIG_NUMA_BALANCING -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 1/3] powerpc: mm: make _PAGE_NUMA take effect
Liu Ping Fan kernelf...@gmail.com writes: To enable the do_numa_page(), we should not fix _PAGE_NUMA in hash_page(), so bail out for the case of pte_numa(). Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com --- arch/powerpc/mm/hash_utils_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index fb176e9..9bf1195 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -1033,7 +1033,7 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) /* Get PTE and page size from page tables */ ptep = find_linux_pte_or_hugepte(pgdir, ea, hugeshift); - if (ptep == NULL || !pte_present(*ptep)) { + if (ptep == NULL || !pte_present(*ptep) || pte_numa(*ptep)) { DBG_LOW( no PTE !\n); rc = 1; goto bail; why ? , All the hash routines do check for _PAGE_PRESENT via access variable. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] powerpc: mm: optimize for the correctly placed page
Liu Ping Fan kernelf...@gmail.com writes: The period check of _PAGE_NUMA can probably trigger the check on the correctly placed page. For this case, we can just insert hpte and do fast exception return. I still don't understand why we need to handle numa faults in hash page ? Are you trying to optimize the code path ? If so can you explain the benefits ? Some numbers showing it is helping ? Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com --- arch/powerpc/mm/hash_utils_64.c | 34 +- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 9bf1195..735678c 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -965,6 +965,10 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) const struct cpumask *tmp; int rc, user_region = 0, local = 0; int psize, ssize; + pte_t old, new; + struct vm_area_struct *vma; + int page_nid, target_nid; + struct page *test_page; DBG_LOW(hash_page(ea=%016lx, access=%lx, trap=%lx\n, ea, access, trap); @@ -1033,12 +1037,40 @@ int hash_page(unsigned long ea, unsigned long access, unsigned long trap) /* Get PTE and page size from page tables */ ptep = find_linux_pte_or_hugepte(pgdir, ea, hugeshift); - if (ptep == NULL || !pte_present(*ptep) || pte_numa(*ptep)) { + if (ptep == NULL || !pte_present(*ptep)) { DBG_LOW( no PTE !\n); rc = 1; goto bail; } + old = pte_val(*ptep); + if (pte_numa(old)) { + /* If fail to lock, let do_page_fault() to handle it */ + if (down_read_trylock(mm-mmap_sem)) { hmm is that something we want to do in hash_page ? 
+ vma = find_vma(mm, ea); + up_read(mm-mmap_sem); + test_page = pte_page(old); + page_nid = page_to_nid(test_page); + target_nid = numa_migrate_prep(test_page, vma, ea, + page_nid); + if (target_nid 0) { + new = pte_mknonnuma(old); + /* If ptep is modified under us, + * just retry the access + */ + if (unlikely(cmpxchg(ptep, old, new) != old)) { + put_page(test_page); + return 0; + } + put_page(test_page); + } + } else { + put_page(test_page); + rc = 1; + goto bail; + } + } + /* Add _PAGE_PRESENT to the required access perm */ access |= _PAGE_PRESENT; -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V3] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE. On archs like ppc64 that don't use _PAGE_PROTNONE and also have a separate page table outside linux pagetable, we just need to make sure that when calling change_prot_numa we flush the hardware page table entry so that next page access result in a numa fault. We still need to make sure we use the numa faulting logic only when CONFIG_NUMA_BALANCING is set. This implies the migrate-on-fault (Lazy migration) via mbind will only work if CONFIG_NUMA_BALANCING is set. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Previous discussion around the patch can be found at http://article.gmane.org/gmane.linux.kernel.mm/109305 changes from V2: * Move the numa faulting definition within CONFIG_NUMA_BALANCING include/linux/mm.h | 2 +- mm/mempolicy.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 1cedd000cf29..a7b4e310bf42 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1842,7 +1842,7 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +#ifdef CONFIG_NUMA_BALANCING unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long start, unsigned long end); #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eca4a3129129..9f73b29d304d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct vm_area_struct *vma, return 0; } -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +#ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. 
Depending on these @@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { int nr_updated; - BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE); nr_updated = change_protection(vma, addr, end, vma-vm_page_prot, 0, 1); if (nr_updated) @@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, { return 0; } -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +#endif /* CONFIG_NUMA_BALANCING */ /* * Walk through page tables and collect pages to be migrated. -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: Fix PTE page address mismatch in pgtable ctor/dtor
Hong H. Pham hong.p...@windriver.com writes: In pte_alloc_one(), pgtable_page_ctor() is passed an address that has not been converted by page_address() to the newly allocated PTE page. When the PTE is freed, __pte_free_tlb() calls pgtable_page_dtor() with an address to the PTE page that has been converted by page_address(). The mismatch in the PTE's page address causes pgtable_page_dtor() to access invalid memory, so resources for that PTE (such as the page lock) is not properly cleaned up. This bug was introduced by commit d614bb041209fd7cb5e4b35e11a7b2f6ee8f62b8 powerpc: Move the pte free routines from common header. On a preempt-rt kernel, a spinlock is dynamically allocated for each PTE in pgtable_page_ctor(). When the PTE is freed, calling pgtable_page_dtor() with a mismatched page address causes a memory leak, as the pointer to the PTE's spinlock is bogus. On mainline, there isn't any immediately obvious symptoms, but the problem still exists here. can you also specifiy the config details here. ie, 4K page size functions are broken ? Fixes: d614bb041209fd7c powerpc: Move the pte free routes from common header Cc: Paul Mackerras pau...@samba.org Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: linux-stable sta...@vger.kernel.org # v3.10+ Signed-off-by: Hong H. 
Pham hong.p...@windriver.com --- arch/powerpc/include/asm/pgalloc-32.h | 2 +- arch/powerpc/include/asm/pgalloc-64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h index 27b2386..7ff24f0 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/pgalloc-32.h @@ -87,7 +87,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, struct page *page = page_address(table); tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(page); + pgtable_page_dtor(table); pgtable_free_tlb(tlb, page, 0); } #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..b187dc5 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -147,7 +147,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, struct page *page = page_address(table); That one is also wrong right ? why not tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(page); + pgtable_page_dtor(table); pgtable_free_tlb(tlb, page, 0); } make it closer to what it was before, pgtable_page_dtor(table); pgtable_free_tlb(tlb, page_address(table), 0); This is what we had before -static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage, - unsigned long address) -{ - tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(ptepage); - pgtable_free_tlb(tlb, page_address(ptepage), 0); -} -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] powerpc: Fix PTE page address mismatch in pgtable ctor/dtor
Benjamin Herrenschmidt b...@kernel.crashing.org writes: On Sat, 2013-12-07 at 09:06 -0500, Hong H. Pham wrote: diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h index 27b2386..842846c 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/pgalloc-32.h @@ -84,10 +84,8 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { -struct page *page = page_address(table); - tlb_flush_pgtable(tlb, address); -pgtable_page_dtor(page); -pgtable_free_tlb(tlb, page, 0); +pgtable_page_dtor(table); +pgtable_free_tlb(tlb, page_address(table), 0); } Ok so your description of the problem confused me a bit, but I see that in the !64K page, pgtable_t is already a struct page so yes, the page_address() call here is bogus. However, I also noticed that in the 64k page case, we don't call the dto at all. Is that a problem ? Also, Aneesh, shouldn't we just fix the disconnect here and have pgtable_t always be the same type ? The way this is now is confusing and error prone... With pte page fragments that may not be possible right ?. 
With PTE fragments, we share the page allocated with multiple pmd entries 5c1f6ee9a31cbdac90bbb8ae1ba4475031ac74b4 should have more details #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..256d6f8 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -144,11 +144,9 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { -struct page *page = page_address(table); - tlb_flush_pgtable(tlb, address); -pgtable_page_dtor(page); -pgtable_free_tlb(tlb, page, 0); +pgtable_page_dtor(table); +pgtable_free_tlb(tlb, page_address(table), 0); } #else /* if CONFIG_PPC_64K_PAGES */ Ben. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v3] powerpc: Fix PTE page address mismatch in pgtable ctor/dtor
Hong H. Pham hong.p...@windriver.com writes: From: Hong H. Pham hong.p...@windriver.com In pte_alloc_one(), pgtable_page_ctor() is passed an address that has not been converted by page_address() to the newly allocated PTE page. When the PTE is freed, __pte_free_tlb() calls pgtable_page_dtor() with an address to the PTE page that has been converted by page_address(). The mismatch in the PTE's page address causes pgtable_page_dtor() to access invalid memory, so resources for that PTE (such as the page lock) are not properly cleaned up. On PPC32, only SMP kernels are affected. On PPC64, only SMP kernels with 4K page size are affected. This bug was introduced by commit d614bb041209fd7cb5e4b35e11a7b2f6ee8f62b8 powerpc: Move the pte free routines from common header. On a preempt-rt kernel, a spinlock is dynamically allocated for each PTE in pgtable_page_ctor(). When the PTE is freed, calling pgtable_page_dtor() with a mismatched page address causes a memory leak, as the pointer to the PTE's spinlock is bogus. On mainline, there aren't any immediately obvious symptoms, but the problem still exists here. Fixes: d614bb041209fd7c powerpc: Move the pte free routines from common header Cc: Paul Mackerras pau...@samba.org Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Cc: linux-stable sta...@vger.kernel.org # v3.10+ Signed-off-by: Hong H. 
Pham hong.p...@windriver.com Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgalloc-32.h | 6 ++ arch/powerpc/include/asm/pgalloc-64.h | 6 ++ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h index 27b2386..842846c 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/pgalloc-32.h @@ -84,10 +84,8 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { - struct page *page = page_address(table); - tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(page); - pgtable_free_tlb(tlb, page, 0); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); } #endif /* _ASM_POWERPC_PGALLOC_32_H */ diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index f65e27b..256d6f8 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -144,11 +144,9 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb, static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table, unsigned long address) { - struct page *page = page_address(table); - tlb_flush_pgtable(tlb, address); - pgtable_page_dtor(page); - pgtable_free_tlb(tlb, page, 0); + pgtable_page_dtor(table); + pgtable_free_tlb(tlb, page_address(table), 0); } #else /* if CONFIG_PPC_64K_PAGES */ -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] KVM: PPC: Use schedule instead of cond_resched
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We already checked need_resched. So we can call schedule directly Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- NOTE: This patch also work around a regression upstream w.r.t PR KVM BUG: soft lockup - CPU#0 stuck for 23s! [qemu-system-ppc:4394] Modules linked in: CPU: 0 PID: 4394 Comm: qemu-system-ppc Not tainted 3.13.0-rc3+ #98 task: c001d0788400 ti: c001dca0 task.ti: c001dca0 NIP: c082dd80 LR: c0081ae0 CTR: c0062ba0 REGS: c001dca02f70 TRAP: 0901 Not tainted (3.13.0-rc3+) MSR: 80009032 SF,EE,ME,IR,DR,RI CR: 24822024 XER: CFAR: c0081adc SOFTE: 1 GPR00: c0081ae0 c001dca031f0 c0d67ab0 0001 GPR04: 7102 0001 0189a0d786b7 018c GPR08: 0001 c0da GPR12: 0c00 cfef NIP [c082dd80] ._cond_resched+0x0/0x40 LR [c0081ae0] .kvmppc_prepare_to_enter+0x2a0/0x2e0 Call Trace: [c001dca031f0] [c0081ae0] .kvmppc_prepare_to_enter+0x2a0/0x2e0 (unreliable) [c001dca03290] [c008f2cc] .kvmppc_handle_exit_pr+0xec/0xa40 [c001dca03340] [c00918c4] kvm_start_lightweight+0xac/0xbc [c001dca03510] [c008efe0] .kvmppc_vcpu_run_pr+0x130/0x2a0 [c001dca039e0] [c00855bc] .kvmppc_vcpu_run+0x2c/0x40 [c001dca03a50] [c0082c94] .kvm_arch_vcpu_ioctl_run+0x54/0x1b0 [c001dca03ae0] [c007d5f8] .kvm_vcpu_ioctl+0x478/0x740 [c001dca03ca0] [c0218864] .do_vfs_ioctl+0x4a4/0x760 [c001dca03d80] [c0218b78] .SyS_ioctl+0x58/0xb0 [c001dca03e30] [c0009e58] syscall_exit+0x0/0x98 Instruction dump: e92d0260 e94911c0 812a0004 5529f07e 5529103e 912a0004 38210080 e8010010 ebc1fff0 ebe1fff8 7c0803a6 4e800020 7c0802a6 3860 f8010010 f821ff91 arch/powerpc/kvm/powerpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index e4d511c8b38b..6a49b23a3276 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -74,7 +74,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) while (true) { if (need_resched()) { local_irq_enable(); - cond_resched(); + schedule(); 
local_irq_disable(); continue; } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] KVM: PPC: Use schedule instead of cond_resched
Alexander Graf ag...@suse.de writes: On 10.12.2013, at 15:21, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We already checked need_resched. So we can call schedule directly Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com The real fix for the issue you're seeing is https://lkml.org/lkml/2013/11/28/241 True, I mentioned that in the thread https://lkml.org/lkml/2013/12/9/64 But do we need to do cond_resched after we checked for need_resched() ? -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: book3s: kvm: Don't abuse host r2 in exit path
Hi Alex, Any update on this ? We need this to got into 3.13. -aneesh Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We don't use PACATOC for PR. Avoid updating HOST_R2 with PR KVM mode when both HV and PR are enabled in the kernel. Without this we get the below crash (qemu) Unable to handle kernel paging request for data at address 0x8310 Faulting instruction address: 0xc001d5a4 cpu 0x2: Vector: 300 (Data Access) at [c001dc53aef0] pc: c001d5a4: .vtime_delta.isra.1+0x34/0x1d0 lr: c001d760: .vtime_account_system+0x20/0x60 sp: c001dc53b170 msr: 80009032 dar: 8310 dsisr: 4000 current = 0xc001d76c62d0 paca= 0xcfef1100 softe: 0irq_happened: 0x01 pid = 4472, comm = qemu-system-ppc enter ? for help [c001dc53b200] c001d760 .vtime_account_system+0x20/0x60 [c001dc53b290] c008d050 .kvmppc_handle_exit_pr+0x60/0xa50 [c001dc53b340] c008f51c kvm_start_lightweight+0xb4/0xc4 [c001dc53b510] c008cdf0 .kvmppc_vcpu_run_pr+0x150/0x2e0 [c001dc53b9e0] c008341c .kvmppc_vcpu_run+0x2c/0x40 [c001dc53ba50] c0080af4 .kvm_arch_vcpu_ioctl_run+0x54/0x1b0 [c001dc53bae0] c007b4c8 .kvm_vcpu_ioctl+0x478/0x730 [c001dc53bca0] c02140cc .do_vfs_ioctl+0x4ac/0x770 [c001dc53bd80] c02143e8 .SyS_ioctl+0x58/0xb0 [c001dc53be30] c0009e58 syscall_exit+0x0/0x98 --- Exception: c00 (System Call) at 1f960160 SP (1ecbe3c0) is in userspace These changes were originally part of http://mid.gmane.org/20130806042205.gr19...@iris.ozlabs.ibm.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s_asm.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++ 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 0bd9348..69fe837 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -79,6 +79,7 @@ struct kvmppc_host_state { 
ulong vmhandler; ulong scratch0; ulong scratch1; + ulong scratch2; u8 in_guest; u8 restore_hid5; u8 napping; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 8e6ede6..841a4c8 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -583,6 +583,7 @@ int main(void) HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler); HSTATE_FIELD(HSTATE_SCRATCH0, scratch0); HSTATE_FIELD(HSTATE_SCRATCH1, scratch1); + HSTATE_FIELD(HSTATE_SCRATCH2, scratch2); HSTATE_FIELD(HSTATE_IN_GUEST, in_guest); HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5); HSTATE_FIELD(HSTATE_NAPPING, napping); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 339aa5e..16f7654 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -750,15 +750,14 @@ kvmppc_interrupt_hv: * guest CR, R12 saved in shadow VCPU SCRATCH1/0 * guest R13 saved in SPRN_SCRATCH0 */ - /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */ - std r9, HSTATE_HOST_R2(r13) + std r9, HSTATE_SCRATCH2(r13) lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9, KVM_GUEST_MODE_GUEST - ld r9, HSTATE_HOST_R2(r13) + ld r9, HSTATE_SCRATCH2(r13) beq kvmppc_interrupt_pr #endif /* We're now back in the host but in guest MMU context */ @@ -778,7 +777,7 @@ kvmppc_interrupt_hv: std r6, VCPU_GPR(R6)(r9) std r7, VCPU_GPR(R7)(r9) std r8, VCPU_GPR(R8)(r9) - ld r0, HSTATE_HOST_R2(r13) + ld r0, HSTATE_SCRATCH2(r13) std r0, VCPU_GPR(R9)(r9) std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) -- 1.8.3.2 -- To unsubscribe from this list: send the line unsubscribe kvm-ppc in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values
Alexander Graf ag...@suse.de writes: On 11.11.2013, at 15:02, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Don't try to compute these values. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- NOTE: I am not sure why we were originally computing dsisr and dar. So may be we need a variant of this patch. But with this and the additional patch powerpc: book3s: PR: Enable Little Endian PR guest I am able to get a Little Endian PR guest to boot. It's quite easy to find out - git blame tells you all the history and points you to commit ca7f4203b. commit ca7f4203b9b66e12d0d9968ff7dfe781f3a9695a Author: Alexander Graf ag...@suse.de Date: Wed Mar 24 21:48:28 2010 +0100 KVM: PPC: Implement alignment interrupt Mac OS X has some applications - namely the Finder - that require alignment interrupts to work properly. So we need to implement them. But the spec for 970 and 750 also looks different. While 750 requires the DSISR and DAR fields to reflect some instruction bits (DSISR) and the fault address (DAR), the 970 declares this as an optional feature. So we need to reconstruct DSISR and DAR manually. Signed-off-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com Read this as on 970, alignment interrupts don't give us DSISR and DAR of the faulting instruction as otherwise I wouldn't have implemented it. So this is clearly a nack on this patch :). I can possibly do a if (cpu_has_feature(CPU_FTR_ARCH_201)). But do we need to do that ? According to Paul we should always find DAR. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values
Alexander Graf ag...@suse.de writes: Am 19.12.2013 um 08:02 schrieb Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com: Alexander Graf ag...@suse.de writes: On 11.11.2013, at 15:02, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Don't try to compute these values. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- NOTE: I am not sure why we were originally computing dsisr and dar. So may be we need a variant of this patch. But with this and the additional patch powerpc: book3s: PR: Enable Little Endian PR guest I am able to get a Little Endian PR guest to boot. It's quite easy to find out - git blame tells you all the history and points you to commit ca7f4203b. commit ca7f4203b9b66e12d0d9968ff7dfe781f3a9695a Author: Alexander Graf ag...@suse.de Date: Wed Mar 24 21:48:28 2010 +0100 KVM: PPC: Implement alignment interrupt Mac OS X has some applications - namely the Finder - that require alignment interrupts to work properly. So we need to implement them. But the spec for 970 and 750 also looks different. While 750 requires the DSISR and DAR fields to reflect some instruction bits (DSISR) and the fault address (DAR), the 970 declares this as an optional feature. So we need to reconstruct DSISR and DAR manually. Signed-off-by: Alexander Graf ag...@suse.de Signed-off-by: Avi Kivity a...@redhat.com Read this as on 970, alignment interrupts don't give us DSISR and DAR of the faulting instruction as otherwise I wouldn't have implemented it. So this is clearly a nack on this patch :). I can possibly do a if (cpu_has_feature(CPU_FTR_ARCH_201)). But do we need to do that ? According to Paul we should always find DAR. Paul only mentioned DAR, not DSISR. Please verify whether 970 gives us a proper DAR value - we can then remove that part. But for DSISR I'm not convinced CPUs above 970 handle this correctly. 
So we would at least need a guest cpu check to find out whether the vcpu expects a working dsisr and emulate it then. I don't really fully understand the problem though. Why does the calculation break at all for you? IIRC this was to get little endian PR setup to work. This is to avoid handling new instructions, because in little endian mode we get alignment interrupts for a larger instruction set -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2] POWERPC: BOOK3S: KVM: Use the saved dsisr and dar values on book3s 64
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Although it's optional IBM POWER cpus always had DAR value set on alignment interrupt. So don't try to compute these values. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 28 1 file changed, 28 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 502a47ac4453..d8e2d079483d 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -599,6 +599,19 @@ unprivileged: u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) { +#ifdef CONFIG_PPC_BOOK3S_64 + return vcpu-arch.fault_dsisr; +#else + /* +* Mac OS X has some applications - namely the Finder - that require +* alignment interrupts to work properly. So we need to implement them. + +* But the spec for 970 and 750 also looks different. While 750 requires +* the DSISR and DAR fields to reflect some instruction bits (DSISR) and +* the fault address (DAR), the 970 declares this as an optional feature. +* So we need to reconstruct DSISR and DAR manually. +*/ + u32 dsisr = 0; /* @@ -637,10 +650,24 @@ u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) dsisr |= (inst 16) 0x03ff; /* bits 22:31 */ return dsisr; +#endif } ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) { +#ifdef CONFIG_PPC_BOOK3S_64 + return vcpu-arch.fault_dar; +#else + /* +* Mac OS X has some applications - namely the Finder - that require +* alignment interrupts to work properly. So we need to implement them. + +* But the spec for 970 and 750 also looks different. While 750 requires +* the DSISR and DAR fields to reflect some instruction bits (DSISR) and +* the fault address (DAR), the 970 declares this as an optional feature. +* So we need to reconstruct DSISR and DAR manually. 
+*/ + ulong dar = 0; ulong ra = get_ra(inst); ulong rb = get_rb(inst); @@ -665,4 +692,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) } return dar; +#endif } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Fix crash on mremap
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- NOTE: For other archs we would just be removing the pgtable from the list and adding it back. I didn't find an easy way to make it not do that without lots of #ifdef around. Any suggestion around that is welcome. mm/huge_memory.c | 21 ++--- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7de1bf85f683..eb2e60d9ba45 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1500,24 +1500,23 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, */ ret = __pmd_trans_huge_lock(old_pmd, vma, old_ptl); if (ret == 1) { + pgtable_t pgtable; + new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. 
So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { - pgtable_t pgtable; - - /* -* Move preallocated PTE page table if new_pmd is on -* different PMD page table. -*/ - pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); - pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - + if (new_ptl != old_ptl) spin_unlock(new_ptl); - } spin_unlock(old_ptl); } out: -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V2] powerpc: thp: Fix crash on mremap
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Changes from V1: * limit the withraw/deposit to only ppc64 arch/Kconfig | 3 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + include/linux/huge_mm.h| 6 ++ mm/huge_memory.c | 21 - 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index f1cf895c040f..3759e70a649d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -371,6 +371,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + bool + config HAVE_ARCH_SOFT_DIRTY bool diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index bca2465a9c34..5f83b4334e5f 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -71,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS + select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES config PPC_BOOK3E_64 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 91672e2deec3..836242a738a5 100644 --- 
a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -230,4 +230,10 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW +#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 1 +#else +#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 0 +#endif + #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7de1bf85f683..32006b51d102 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1505,19 +1505,22 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. So when we switch the pmd, +* we should also withdraw and deposit the pgtable +* +* With split pmd lock we also need to move preallocated +* PTE page table if new_pmd is on different PMD page table. +*/ + if (new_ptl != old_ptl || ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW) { pgtable_t pgtable; - - /* -* Move preallocated PTE page table if new_pmd is on -* different PMD page table. -*/ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - - spin_unlock(new_ptl); } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
RE: [PATCH -V2] powerpc: thp: Fix crash on mremap
Kirill A. Shutemov kirill.shute...@linux.intel.com writes: Aneesh Kumar K.V wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Changes from V1: * limit the withraw/deposit to only ppc64 arch/Kconfig | 3 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + include/linux/huge_mm.h| 6 ++ mm/huge_memory.c | 21 - 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index f1cf895c040f..3759e70a649d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -371,6 +371,9 @@ config HAVE_IRQ_TIME_ACCOUNTING config HAVE_ARCH_TRANSPARENT_HUGEPAGE bool +config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW I don't like name of the option, but can't find any better... 
:( +bool + config HAVE_ARCH_SOFT_DIRTY bool diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index bca2465a9c34..5f83b4334e5f 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -71,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS +select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES config PPC_BOOK3E_64 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 91672e2deec3..836242a738a5 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -230,4 +230,10 @@ static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_str #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW +#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 1 +#else +#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 0 +#endif + Just use config option directly: if (new_ptl != old_ptl || IS_ENABLED(CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW)) I didn't like that. I found the earlier one easier for reading. If you and others strongly feel about this, I can redo the patch. Please let me know ... Otherwise, looks good: Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7de1bf85f683..32006b51d102 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1505,19 +1505,22 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); -set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); -if (new_ptl != old_ptl) { +/* + * Archs like ppc64 use pgtable to store per pmd + * specific information. 
So when we switch the pmd, + * we should also withdraw and deposit the pgtable + * + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + */ +if (new_ptl != old_ptl || ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW) { pgtable_t pgtable; - -/* - * Move preallocated PTE page table if new_pmd is on - * different PMD page table. - */ pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - -spin_unlock(new_ptl); } +set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); +if (new_ptl != old_ptl) +spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- 1.8.3.2 -- Kirill A. Shutemov ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https
Re: [PATCH -V2] powerpc: thp: Fix crash on mremap
Benjamin Herrenschmidt b...@kernel.crashing.org writes: On Thu, 2014-01-02 at 16:22 +0530, Aneesh Kumar K.V wrote: Just use config option directly: if (new_ptl != old_ptl || IS_ENABLED(CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW)) I didn't like that. I found the earlier one easier for reading. If you and others strongly feel about this, I can redo the patch. Please let me know Yes, use IS_ENABLED, no need to have two indirections of #define's Another option is to have if (pmd_move_must_withdraw(new,old)) { } With in a generic header: #ifndef pmd_move_must_withdraw static inline bool pmd_move_must_withdraw(spinlock_t *new_ptl, ...) { return new_ptl != old_ptl; } #endif And in powerpc: static inline bool pmd_move_must_withdraw(spinlock_t *new_ptl, ...) { return true; } #define pmd_move_must_withdraw pmd_move_must_withdraw This is better, I guess. It is also in line with the rest of the transparent hugepage functions. I will do this. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V3 2/2] powerpc: thp: Fix crash on mremap
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++ include/asm-generic/pgtable.h| 12 mm/huge_memory.c | 14 +- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 9935e9b79524..ff3afce40f3b 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -12,6 +12,7 @@ #ifndef __ASSEMBLY__ +#include linux/spinlock.h /* * This is the default implementation of various PTE accessors, it's * used in all cases except Book3S with 64K pages where we have a @@ -459,5 +460,18 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); + +#define pmd_move_must_withdraw pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. 
So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + return true; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index db0923458940..8e4f41d9af4d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* With split pmd lock we also need to move preallocated +* PTE page table if new_pmd is on different PMD page table. +*/ + return new_pmd_ptl != old_pmd_ptl; +} +#endif + /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9c0b17295ba0..b77bb5df4db9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { - pgtable_t pgtable; - /* -* Move preallocated PTE page table if new_pmd is on -* different PMD page table. -*/ + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - - spin_unlock(new_ptl); } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH -V3 1/2] powerpc: mm: Move ppc64 page table range definitions to separate header
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This avoid mmu-hash64.h including pagetable-ppc64.h. That inclusion cause issues like CC arch/powerpc/kernel/asm-offsets.s In file included from /home/aneesh/linus/arch/powerpc/include/asm/mmu-hash64.h:23:0, from /home/aneesh/linus/arch/powerpc/include/asm/mmu.h:196, from /home/aneesh/linus/arch/powerpc/include/asm/lppaca.h:36, from /home/aneesh/linus/arch/powerpc/include/asm/paca.h:21, from /home/aneesh/linus/arch/powerpc/include/asm/hw_irq.h:41, from /home/aneesh/linus/arch/powerpc/include/asm/irqflags.h:11, from include/linux/irqflags.h:15, from include/linux/spinlock.h:53, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:17, from arch/powerpc/kernel/asm-offsets.c:17: /home/aneesh/linus/arch/powerpc/include/asm/pgtable-ppc64.h:563:42: error: unknown type name ‘spinlock_t’ static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- NOTE: We can either do this or stuck a typdef struct spinlock spinlock_t; in pgtable-ppc64.h arch/powerpc/include/asm/mmu-hash64.h | 2 +- arch/powerpc/include/asm/pgtable-ppc64-range.h | 101 + arch/powerpc/include/asm/pgtable-ppc64.h | 101 + 3 files changed, 103 insertions(+), 101 deletions(-) create mode 100644 arch/powerpc/include/asm/pgtable-ppc64-range.h diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h index 807014dde821..895b4df31fec 100644 --- a/arch/powerpc/include/asm/mmu-hash64.h +++ b/arch/powerpc/include/asm/mmu-hash64.h @@ -20,7 +20,7 @@ * need for various slices related matters. Note that this isn't the * complete pgtable.h but only a portion of it. 
*/ -#include asm/pgtable-ppc64.h +#include asm/pgtable-ppc64-range.h #include asm/bug.h /* diff --git a/arch/powerpc/include/asm/pgtable-ppc64-range.h b/arch/powerpc/include/asm/pgtable-ppc64-range.h new file mode 100644 index ..b48b089fb209 --- /dev/null +++ b/arch/powerpc/include/asm/pgtable-ppc64-range.h @@ -0,0 +1,101 @@ +#ifndef _ASM_POWERPC_PGTABLE_PPC64_RANGE_H_ +#define _ASM_POWERPC_PGTABLE_PPC64_RANGE_H_ +/* + * This file contains the functions and defines necessary to modify and use + * the ppc64 hashed page table. + */ + +#ifdef CONFIG_PPC_64K_PAGES +#include asm/pgtable-ppc64-64k.h +#else +#include asm/pgtable-ppc64-4k.h +#endif +#include asm/barrier.h + +#define FIRST_USER_ADDRESS 0 + +/* + * Size of EA range mapped by our pagetables. + */ +#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \ + PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT) +#define PGTABLE_RANGE (ASM_CONST(1) PGTABLE_EADDR_SIZE) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define PMD_CACHE_INDEX(PMD_INDEX_SIZE + 1) +#else +#define PMD_CACHE_INDEXPMD_INDEX_SIZE +#endif +/* + * Define the address range of the kernel non-linear virtual area + */ + +#ifdef CONFIG_PPC_BOOK3E +#define KERN_VIRT_START ASM_CONST(0x8000) +#else +#define KERN_VIRT_START ASM_CONST(0xD000) +#endif +#define KERN_VIRT_SIZE ASM_CONST(0x1000) + +/* + * The vmalloc space starts at the beginning of that region, and + * occupies half of it on hash CPUs and a quarter of it on Book3E + * (we keep a quarter for the virtual memmap) + */ +#define VMALLOC_START KERN_VIRT_START +#ifdef CONFIG_PPC_BOOK3E +#define VMALLOC_SIZE (KERN_VIRT_SIZE 2) +#else +#define VMALLOC_SIZE (KERN_VIRT_SIZE 1) +#endif +#define VMALLOC_END(VMALLOC_START + VMALLOC_SIZE) + +/* + * The second half of the kernel virtual space is used for IO mappings, + * it's itself carved into the PIO region (ISA and PHB IO space) and + * the ioremap space + * + * ISA_IO_BASE = KERN_IO_START, 64K reserved area + * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 
2G, PHB IO spaces + * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE + */ +#define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE 1)) +#define FULL_IO_SIZE 0x8000ul +#define ISA_IO_BASE (KERN_IO_START) +#define ISA_IO_END(KERN_IO_START + 0x1ul) +#define PHB_IO_BASE (ISA_IO_END) +#define PHB_IO_END(KERN_IO_START + FULL_IO_SIZE) +#define IOREMAP_BASE (PHB_IO_END) +#define IOREMAP_END(KERN_VIRT_START + KERN_VIRT_SIZE) + + +/* + * Region IDs + */ +#define REGION_SHIFT 60UL +#define REGION_MASK(0xfUL REGION_SHIFT) +#define REGION_ID(ea) (((unsigned long)(ea)) REGION_SHIFT) + +#define VMALLOC_REGION_ID (REGION_ID(VMALLOC_START)) +#define KERNEL_REGION_ID (REGION_ID
Re: [PATCH -V3 1/2] powerpc: mm: Move ppc64 page table range definitions to separate header
Benjamin Herrenschmidt b...@kernel.crashing.org writes: On Mon, 2014-01-06 at 14:33 +0530, Aneesh Kumar K.V wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This avoids mmu-hash64.h including pagetable-ppc64.h. That inclusion causes issues like I don't like this. We have that stuff split into too many includes already; it's a mess. I understand. Let me know if you have any suggestion on cleaning that up. I can do that. Why do we need to include it from mmu*.h ? in mmu-hash64.h added by me via 78f1dbde9fd020419313c2a0c3b602ea2427118f /* * This is necessary to get the definition of PGTABLE_RANGE which we * need for various slices related matters. Note that this isn't the * complete pgtable.h but only a portion of it. */ #include <asm/pgtable-ppc64.h> -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH V4] powerpc: thp: Fix crash on mremap
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Changes from V3: * Drop powerpc: mm: Move ppc64 page table range definitions to separate header patch arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++ include/asm-generic/pgtable.h| 12 mm/huge_memory.c | 14 +- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 4a191c472867..d27960c89a71 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); + +#define pmd_move_must_withdraw pmd_move_must_withdraw +typedef struct spinlock spinlock_t; +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. 
So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + return true; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index db0923458940..8e4f41d9af4d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* With split pmd lock we also need to move preallocated +* PTE page table if new_pmd is on different PMD page table. +*/ + return new_pmd_ptl != old_pmd_ptl; +} +#endif + /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 95d1acb0f3d2..5d80c53b87cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { - pgtable_t pgtable; - /* -* Move preallocated PTE page table if new_pmd is on -* different PMD page table. -*/ + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - - spin_unlock(new_ptl); } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V4] powerpc: thp: Fix crash on mremap
Benjamin Herrenschmidt b...@kernel.crashing.org writes: On Mon, 2014-01-13 at 11:34 +0530, Aneesh Kumar K.V wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash Andrea, can you ack the generic bit please ? Thanks ! Kirill A. Shutemov did ack an earlier version http://article.gmane.org/gmane.linux.kernel.mm/111368 -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH powerpc] Fix compile error of pgtable-ppc64.h
Li Zhong zh...@linux.vnet.ibm.com writes: It seems that forward declaration couldn't work well with typedef, use struct spinlock directly to avoiding following build errors: In file included from include/linux/spinlock.h:81, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:17, from arch/powerpc/kernel/asm-offsets.c:17: include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t' /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous declaration of 'spinlock_t' was here what compiler version ? I have seen that error in gcc 4.3 and it was concluded that it is too old a compiler version to worry about. That specific compiler version also gave error for forward declaring struct; Signed-off-by: Li Zhong zh...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index d27960c..bc141c9 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define pmd_move_must_withdraw pmd_move_must_withdraw -typedef struct spinlock spinlock_t; -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, - spinlock_t *old_pmd_ptl) +struct spinlock; +static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, + struct spinlock *old_pmd_ptl) { /* * Archs like ppc64 use pgtable to store per pmd ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/4] powernv: kvm: make _PAGE_NUMA take effect
Liu Ping Fan kernelf...@gmail.com writes: To make _PAGE_NUMA take effect, we should force the checking when guest uses hypercall to setup hpte. Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9c51544..af8602d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -232,7 +232,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); - if (pte_present(pte)) { + if (pte_present(pte) && !pte_numa(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); How did we end up doing h_enter on a pte entry with the pte_numa bit set ? -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/4] powernv: kvm: numa fault improvement
Liu ping fan kernelf...@gmail.com writes: On Thu, Jan 9, 2014 at 8:08 PM, Alexander Graf ag...@suse.de wrote: On 11.12.2013, at 09:47, Liu Ping Fan kernelf...@gmail.com wrote: This series is based on Aneesh's series [PATCH -V2 0/5] powerpc: mm: Numa faults support for ppc64 For this series, I apply the same idea from the previous thread [PATCH 0/3] optimize for powerpc _PAGE_NUMA (for which, I still try to get a machine to show nums) But for this series, I think that I have a good justification -- the fact of heavy cost when switching context between guest and host, which is well known. This cover letter isn't really telling me anything. Please put a proper description of what you're trying to achieve, why you're trying to achieve what you're trying and convince your readers that it's a good idea to do it the way you do it. Sorry for the unclear message. After introducing the _PAGE_NUMA, kvmppc_do_h_enter() can not fill up the hpte for guest. Instead, it should rely on host's kvmppc_book3s_hv_page_fault() to call do_numa_page() to do the numa fault check. This incurs the overhead when exiting from rmode to vmode. My idea is that in kvmppc_do_h_enter(), we do a quick check, if the page is right placed, there is no need to exit to vmode (i.e saving htab, slab switching) Can you explain more. Are we looking at hcall from guest and hypervisor handling them in real mode ? If so why would guest issue a hcall on a pte entry that have PAGE_NUMA set. Or is this about hypervisor handling a missing hpte, because of host swapping this page out ? In that case how we end up in h_enter ? IIUC for that case we should get to kvmppc_hpte_hv_fault. If my suppose is correct, will CCing k...@vger.kernel.org from next version. This translates to me as This is an RFC? Yes, I am not quite sure about it. I have no bare-metal to verify it. So I hope at least, from the theory, it is correct. 
-aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/4] powernv: kvm: numa fault improvement
Liu ping fan kernelf...@gmail.com writes: On Mon, Jan 20, 2014 at 11:45 PM, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: Liu ping fan kernelf...@gmail.com writes: On Thu, Jan 9, 2014 at 8:08 PM, Alexander Graf ag...@suse.de wrote: On 11.12.2013, at 09:47, Liu Ping Fan kernelf...@gmail.com wrote: This series is based on Aneesh's series [PATCH -V2 0/5] powerpc: mm: Numa faults support for ppc64 For this series, I apply the same idea from the previous thread [PATCH 0/3] optimize for powerpc _PAGE_NUMA (for which, I still try to get a machine to show nums) But for this series, I think that I have a good justification -- the fact of heavy cost when switching context between guest and host, which is well known. This cover letter isn't really telling me anything. Please put a proper description of what you're trying to achieve, why you're trying to achieve what you're trying and convince your readers that it's a good idea to do it the way you do it. Sorry for the unclear message. After introducing the _PAGE_NUMA, kvmppc_do_h_enter() can not fill up the hpte for guest. Instead, it should rely on host's kvmppc_book3s_hv_page_fault() to call do_numa_page() to do the numa fault check. This incurs the overhead when exiting from rmode to vmode. My idea is that in kvmppc_do_h_enter(), we do a quick check, if the page is right placed, there is no need to exit to vmode (i.e saving htab, slab switching) Can you explain more. Are we looking at hcall from guest and hypervisor handling them in real mode ? If so why would guest issue a hcall on a pte entry that have PAGE_NUMA set. Or is this about hypervisor handling a missing hpte, because of host swapping this page out ? In that case how we end up in h_enter ? IIUC for that case we should get to kvmppc_hpte_hv_fault. After setting _PAGE_NUMA, we should flush out all hptes both in host's htab and guest's. So when guest tries to access memory, host finds that there is not hpte ready for guest in guest's htab. 
And the host should raise a dsi to the guest. Now the guest receives that fault, removes the PAGE_NUMA bit, and does an hpte_insert. So before we do an hpte_insert (or H_ENTER) we should have cleared the PAGE_NUMA bit. This means the guest ends up in h_enter. And you can see in the current code, we also try this quick path first. Only if it fails do we resort to the slow path -- kvmppc_hpte_hv_fault. hmm ? hpte_hv_fault is the hypervisor handling the fault. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2] powernv: kvm: make _PAGE_NUMA take effect
Liu Ping Fan kernelf...@gmail.com writes: To make sure that on host, the pages marked with _PAGE_NUMA result in a fault when guest access them, we should force the checking when guest uses hypercall to setup hpte. Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com When we mark pte with _PAGE_NUMA we already call mmu_notifier_invalidate_range_start and mmu_notifier_invalidate_range_end, which will mark existing guest hpte entry as HPTE_V_ABSENT. Now we need to do that when we are inserting new guest hpte entries. This patch does that. --- v2: It should be the reply to [PATCH 2/4] powernv: kvm: make _PAGE_NUMA take effect And I improve the changelog according to Aneesh's suggestion. --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9c51544..af8602d 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -232,7 +232,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* Look up the Linux PTE for the backing page */ pte_size = psize; pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); - if (pte_present(pte)) { + if (pte_present(pte) && !pte_numa(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); -- 1.8.1.4 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/4] powernv: kvm: numa fault improvement
Paul Mackerras pau...@samba.org writes: On Mon, Jan 20, 2014 at 03:48:36PM +0100, Alexander Graf wrote: On 15.01.2014, at 07:36, Liu ping fan kernelf...@gmail.com wrote: On Thu, Jan 9, 2014 at 8:08 PM, Alexander Graf ag...@suse.de wrote: On 11.12.2013, at 09:47, Liu Ping Fan kernelf...@gmail.com wrote: This series is based on Aneesh's series [PATCH -V2 0/5] powerpc: mm: Numa faults support for ppc64 For this series, I apply the same idea from the previous thread [PATCH 0/3] optimize for powerpc _PAGE_NUMA (for which, I still try to get a machine to show nums) But for this series, I think that I have a good justification -- the fact of heavy cost when switching context between guest and host, which is well known. This cover letter isn't really telling me anything. Please put a proper description of what you're trying to achieve, why you're trying to achieve what you're trying and convince your readers that it's a good idea to do it the way you do it. Sorry for the unclear message. After introducing the _PAGE_NUMA, kvmppc_do_h_enter() can not fill up the hpte for guest. Instead, it should rely on host's kvmppc_book3s_hv_page_fault() to call do_numa_page() to do the numa fault check. This incurs the overhead when exiting from rmode to vmode. My idea is that in kvmppc_do_h_enter(), we do a quick check, if the page is right placed, there is no need to exit to vmode (i.e saving htab, slab switching) If my suppose is correct, will CCing k...@vger.kernel.org from next version. This translates to me as This is an RFC? Yes, I am not quite sure about it. I have no bare-metal to verify it. So I hope at least, from the theory, it is correct. Paul, could you please give this some thought and maybe benchmark it? OK, once I get Aneesh to tell me how I get to have ptes with _PAGE_NUMA set in the first place. :) I guess we want patch 2, Which Liu has sent separately and I have reviewed. 
http://article.gmane.org/gmane.comp.emulators.kvm.powerpc.devel/8619 I am not sure about the rest of the patches in the series. We definitely don't want to numa migrate on henter. We may want to do that on fault. But even there, IMHO, we should let the host take the fault and do the numa migration instead of doing this in guest context. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2] powernv: kvm: make _PAGE_NUMA take effect
Alexander Graf ag...@suse.de writes: On 21.01.2014, at 10:42, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: Liu Ping Fan kernelf...@gmail.com writes: To make sure that on host, the pages marked with _PAGE_NUMA result in a fault when guest access them, we should force the checking when guest uses hypercall to setup hpte. Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com When we mark pte with _PAGE_NUMA we already call mmu_notifier_invalidate_range_start and mmu_notifier_invalidate_range_end, which will mark existing guest hpte entry as HPTE_V_ABSENT. Now we need to do that when we are inserting new guest hpte entries. This patch does that. So what happens next? We insert a page into the HTAB without HPTE_V_VALID set, so the guest will fail to use it. If the guest does an H_READ on it it will suddenly turn to V_VALID though? As per the guest the entry is valid, so yes an hread should return a valid entry. But in real hpte we would mark it not valid. I might need a crash course in the use of HPTE_V_ABSENT. When guest tries to access the address, the host will handle the fault. kvmppc_hpte_hv_fault should give more info -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v2] powernv: kvm: make _PAGE_NUMA take effect
Alexander Graf ag...@suse.de writes: On 27.01.2014, at 11:28, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: Alexander Graf ag...@suse.de writes: On 21.01.2014, at 10:42, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com wrote: Liu Ping Fan kernelf...@gmail.com writes: To make sure that on host, the pages marked with _PAGE_NUMA result in a fault when guest access them, we should force the checking when guest uses hypercall to setup hpte. Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com When we mark pte with _PAGE_NUMA we already call mmu_notifier_invalidate_range_start and mmu_notifier_invalidate_range_end, which will mark existing guest hpte entry as HPTE_V_ABSENT. Now we need to do that when we are inserting new guest hpte entries. This patch does that. So what happens next? We insert a page into the HTAB without HPTE_V_VALID set, so the guest will fail to use it. If the guest does an H_READ on it it will suddenly turn to V_VALID though? As per the guest the entry is valid, so yes an hread should return a valid entry. But in real hpte we would mark it not valid. Ah, yes. I might need a crash course in the use of HPTE_V_ABSENT. When guest tries to access the address, the host will handle the fault. kvmppc_hpte_hv_fault should give more info Thanks for the pointer. So we fault it in lazily. Is there any particular reason we can't do that on h_enter already? After all this just means an additional roundtrip because the guest is pretty likely to use the page it just entered, no? We could get wrong numa fault information if we didn't do h_enter from the right node from which we faulted. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH V3] POWERPC: BOOK3S: KVM: Use the saved dsisr and dar values on book3s 64
Although it's optional IBM POWER cpus always had DAR value set on alignment interrupt. So don't try to compute these values. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Changes from V2: * Depend on cpu feature flag to decide whether to use fault_dsir or not arch/powerpc/include/asm/cputable.h| 1 + arch/powerpc/include/asm/disassemble.h | 34 + arch/powerpc/kernel/align.c| 34 + arch/powerpc/kernel/cputable.c | 15 +++- arch/powerpc/kvm/book3s_emulate.c | 69 -- 5 files changed, 82 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h index 0d4939ba48e7..1922dce6124d 100644 --- a/arch/powerpc/include/asm/cputable.h +++ b/arch/powerpc/include/asm/cputable.h @@ -555,6 +555,7 @@ static inline int cpu_has_feature(unsigned long feature) } #define HBP_NUM 1 +extern struct cpu_spec *find_cpuspec(unsigned int pvr); #endif /* !__ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 856f8deb557a..6330a61b875a 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -81,4 +81,38 @@ static inline unsigned int get_oc(u32 inst) { return (inst 11) 0x7fff; } + +#define IS_XFORM(inst) (get_op(inst) == 31) +#define IS_DSFORM(inst)(get_op(inst) = 56) + +/* + * Create a DSISR value from the instruction + */ +static inline unsigned make_dsisr(unsigned instr) +{ + unsigned dsisr; + + + /* bits 6:15 -- 22:31 */ + dsisr = (instr 0x03ff) 16; + + if (IS_XFORM(instr)) { + /* bits 29:30 -- 15:16 */ + dsisr |= (instr 0x0006) 14; + /* bit 25 --17 */ + dsisr |= (instr 0x0040) 8; + /* bits 21:24 -- 18:21 */ + dsisr |= (instr 0x0780) 3; + } else { + /* bit 5 --17 */ + dsisr |= (instr 0x0400) 12; + /* bits 1: 4 -- 18:21 */ + dsisr |= (instr 0x7800) 17; + /* bits 30:31 -- 12:13 */ + if (IS_DSFORM(instr)) + dsisr |= (instr 0x0003) 18; + } + + return dsisr; +} #endif /* __ASM_PPC_DISASSEMBLE_H__ */ diff --git 
a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index de91f3ae631e..111d93ec7f34 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -25,14 +25,13 @@ #include asm/cputable.h #include asm/emulated_ops.h #include asm/switch_to.h +#include asm/disassemble.h struct aligninfo { unsigned char len; unsigned char flags; }; -#define IS_XFORM(inst) (((inst) 26) == 31) -#define IS_DSFORM(inst)(((inst) 26) = 56) #define INVALID{ 0, 0 } @@ -192,37 +191,6 @@ static struct aligninfo aligninfo[128] = { }; /* - * Create a DSISR value from the instruction - */ -static inline unsigned make_dsisr(unsigned instr) -{ - unsigned dsisr; - - - /* bits 6:15 -- 22:31 */ - dsisr = (instr 0x03ff) 16; - - if (IS_XFORM(instr)) { - /* bits 29:30 -- 15:16 */ - dsisr |= (instr 0x0006) 14; - /* bit 25 --17 */ - dsisr |= (instr 0x0040) 8; - /* bits 21:24 -- 18:21 */ - dsisr |= (instr 0x0780) 3; - } else { - /* bit 5 --17 */ - dsisr |= (instr 0x0400) 12; - /* bits 1: 4 -- 18:21 */ - dsisr |= (instr 0x7800) 17; - /* bits 30:31 -- 12:13 */ - if (IS_DSFORM(instr)) - dsisr |= (instr 0x0003) 18; - } - - return dsisr; -} - -/* * The dcbz (data cache block zero) instruction * gives an alignment fault if used on non-cacheable * memory. 
We handle the fault mainly for the diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 597d954e5860..b367f5b772f6 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -116,7 +116,7 @@ extern void __restore_cpu_e6500(void); PPC_FEATURE_BOOKE) #endif -static struct cpu_spec __initdata cpu_specs[] = { +static struct cpu_spec cpu_specs[] = { #ifdef CONFIG_PPC_BOOK3S_64 { /* Power3 */ .pvr_mask = 0x, @@ -2258,3 +2258,16 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr) return NULL; } + +struct cpu_spec *find_cpuspec(unsigned int pvr) +{ + int i; + struct cpu_spec *s = cpu_specs; + + for (i = 0; i ARRAY_SIZE(cpu_specs); i++, s++) { + if ((pvr s-pvr_mask) == s-pvr_value) + return s; + } + return NULL
[PATCH V3] KVM: PPC: BOOK3S: PR: Enable Little Endian PR guest
This patch make sure we inherit the LE bit correctly in different case so that we can run Little Endian distro in PR mode Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- Changes from V2: * Move H_SET_MODE to qemu arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kvm/book3s_64_mmu.c| 2 +- arch/powerpc/kvm/book3s_pr.c| 32 +++- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 207b7826c9b1..f4be7be14330 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -550,6 +550,7 @@ struct kvm_vcpu_arch { #ifdef CONFIG_PPC_BOOK3S ulong fault_dar; u32 fault_dsisr; + unsigned long intr_msr; #endif #ifdef CONFIG_BOOKE diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index b754f629a177..7484676b8f25 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -518,6 +518,7 @@ int main(void) DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid)); diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 83da1f868fd5..8231b83c493b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -38,7 +38,7 @@ static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) { - kvmppc_set_msr(vcpu, MSR_SF); + kvmppc_set_msr(vcpu, vcpu-arch.intr_msr); } static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 
eb070eb4da40..828056ec208f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -263,7 +263,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) ulong smsr = vcpu-arch.shared-msr; /* Guest MSR values */ - smsr = MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; + smsr = MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; /* Process MSR values */ smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; /* External providers the guest reserved */ @@ -1178,6 +1178,15 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, break; } #endif /* CONFIG_VSX */ + case KVM_REG_PPC_LPCR: + /* +* We are only interested in the LPCR_ILE bit +*/ + if (vcpu-arch.intr_msr MSR_LE) + *val = get_reg_val(id, LPCR_ILE); + else + *val = get_reg_val(id, 0); + break; default: r = -EINVAL; break; @@ -1186,6 +1195,23 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, return r; } +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + struct kvm *kvm = vcpu-kvm; + /* +* If ILE (interrupt little-endian) has changed, update the +* MSR_LE bit in the intr_msr for each vcpu in this vcore. 
+*/ + if ((new_lpcr LPCR_ILE) != (vcpu-arch.intr_msr MSR_LE)) { + mutex_lock(kvm-lock); + if (new_lpcr LPCR_ILE) + vcpu-arch.intr_msr |= MSR_LE; + else + vcpu-arch.intr_msr = ~MSR_LE; + mutex_unlock(kvm-lock); + } +} + static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) { @@ -1209,6 +1235,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, break; } #endif /* CONFIG_VSX */ + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); + break; default: r = -EINVAL; break; @@ -1261,6 +1290,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, vcpu-arch.pvr = 0x3C0301; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) vcpu-arch.pvr = mfspr(SPRN_PVR); + vcpu-arch.intr_msr = MSR_SF; #else /* default to book3s_32 (750) */ vcpu-arch.pvr = 0x84202; -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Fix crash on mremap
This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.11 stable series Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/Kconfig | 3 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + mm/huge_memory.c | 12 3 files changed, 16 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index 1feb169274fe..c5863b35d054 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -368,6 +368,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE config HAVE_ARCH_SOFT_DIRTY bool +config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + bool + config HAVE_MOD_ARCH_SPECIFIC bool help diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 47d9a03dd415..d11a34be018d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -71,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS + select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES config PPC_BOOK3E_64 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 12acb0ba7991..beaa7cc9de75 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1461,8 +1461,20 @@ int move_huge_pmd(struct 
vm_area_struct *vma, struct vm_area_struct *new_vma, ret = __pmd_trans_huge_lock(old_pmd, vma); if (ret == 1) { +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + pgtable_t pgtable; +#endif pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); +#endif set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(mm-page_table_lock); } -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH] powerpc: thp: Fix crash on mremap
This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.12 stable series Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/Kconfig | 3 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + mm/huge_memory.c | 12 3 files changed, 16 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index af2cc6eabcc7..bca9e7a18bd2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -365,6 +365,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE config HAVE_ARCH_SOFT_DIRTY bool +config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + bool + config HAVE_MOD_ARCH_SPECIFIC bool help diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6704e2e20e6b..0225011231ea 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -71,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS + select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES config PPC_BOOK3E_64 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 292a266e0d42..89b7a647f1cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1474,8 +1474,20 @@ int move_huge_pmd(struct 
vm_area_struct *vma, struct vm_area_struct *new_vma, ret = __pmd_trans_huge_lock(old_pmd, vma); if (ret == 1) { +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + pgtable_t pgtable; +#endif pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); +#endif set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(mm-page_table_lock); } -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] Fix compile error of pgtable-ppc64.h
From: Li Zhong zh...@linux.vnet.ibm.com It seems that forward declaration couldn't work well with typedef, use struct spinlock directly to avoiding following build errors: In file included from include/linux/spinlock.h:81, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:17, from arch/powerpc/kernel/asm-offsets.c:17: include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t' /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous declaration of 'spinlock_t' was here build fix for upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.13 stable series Signed-off-by: Li Zhong zh...@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable-ppc64.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index d27960c89a71..bc141c950b1e 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define pmd_move_must_withdraw pmd_move_must_withdraw -typedef struct spinlock spinlock_t; -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, -spinlock_t *old_pmd_ptl) +struct spinlock; +static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, +struct spinlock *old_pmd_ptl) { /* * Archs like ppc64 use pgtable to store per pmd -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] powerpc/thp: Fix crash on mremap
This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.13 stable series Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org --- arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++ include/asm-generic/pgtable.h| 12 mm/huge_memory.c | 14 +- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 4a191c472867..d27960c89a71 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); + +#define pmd_move_must_withdraw pmd_move_must_withdraw +typedef struct spinlock spinlock_t; +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. 
So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + return true; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index db0923458940..8e4f41d9af4d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* With split pmd lock we also need to move preallocated +* PTE page table if new_pmd is on different PMD page table. +*/ + return new_pmd_ptl != old_pmd_ptl; +} +#endif + /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 95d1acb0f3d2..5d80c53b87cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { - pgtable_t pgtable; - /* -* Move preallocated PTE page table if new_pmd is on -* different PMD page table. -*/ + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - - spin_unlock(new_ptl); } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 00/10] KVM: PPC: BOOK3S: PR: Add POWER8 support
Hello, This patch series implements PR KVM support for POWER8 platform -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 01/10] KVM: PPC: BOOK3S: PR: Fix PURR and SPURR emulation
We definitely don't need to emulate mtspr, because both the registers are hypervisor resource. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s.h | 2 -- arch/powerpc/include/asm/kvm_host.h | 4 ++-- arch/powerpc/kvm/book3s_emulate.c | 16 arch/powerpc/kvm/book3s_pr.c | 10 ++ 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index bc23b1ba7980..396448afa38b 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -83,8 +83,6 @@ struct kvmppc_vcpu_book3s { u64 sdr1; u64 hior; u64 msr_mask; - u64 purr_offset; - u64 spurr_offset; #ifdef CONFIG_PPC_BOOK3S_32 u32 vsid_pool[VSID_POOL_SIZE]; u32 vsid_next; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9a0cdb2c9d58..0a3785271f34 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -506,8 +506,8 @@ struct kvm_vcpu_arch { #ifdef CONFIG_BOOKE u32 decar; #endif - u32 tbl; - u32 tbu; + /* Time base value when we entered the guest */ + u64 entry_tb; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index a7d54aa203d0..e1f1e5e16449 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -422,12 +422,6 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) (mfmsr() MSR_HV)) vcpu-arch.hflags |= BOOK3S_HFLAG_DCBZ32; break; - case SPRN_PURR: - to_book3s(vcpu)-purr_offset = spr_val - get_tb(); - break; - case SPRN_SPURR: - to_book3s(vcpu)-spurr_offset = spr_val - get_tb(); - break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: @@ -523,10 +517,16 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val *spr_val = 0; break; case SPRN_PURR: - *spr_val = 
get_tb() + to_book3s(vcpu)-purr_offset; + /* +* On exit we would have updated purr +*/ + *spr_val = vcpu-arch.purr; break; case SPRN_SPURR: - *spr_val = get_tb() + to_book3s(vcpu)-purr_offset; + /* +* On exit we would have updated spurr +*/ + *spr_val = vcpu-arch.spurr; break; case SPRN_GQR0: case SPRN_GQR1: diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index fdcbabdfb709..02231f5193c2 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -115,6 +115,11 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, svcpu-lr = vcpu-arch.lr; svcpu-pc = vcpu-arch.pc; svcpu-in_use = true; + /* +* Now also save the current time base value. We use this +* to find the guest purr and spurr value. +*/ + vcpu-arch.entry_tb = get_tb(); } /* Copy data touched by real-mode code from shadow vcpu back to vcpu */ @@ -161,6 +166,11 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, out: preempt_enable(); + /* +* Update purr and spurr using time base +*/ + vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb; + vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb; } static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 02/10] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
virtual time base register is a per vm register and need to saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/reg.h | 7 +++ arch/powerpc/include/asm/time.h | 12 arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_pr.c| 3 +++ 5 files changed, 26 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 0a3785271f34..9ebdd12e50a9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -508,6 +508,7 @@ struct kvm_vcpu_arch { #endif /* Time base value when we entered the guest */ u64 entry_tb; + u64 entry_vtb; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e789f76c9bc2..6c649355b1e9 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1161,6 +1161,13 @@ #define mtspr(rn, v) asm volatile(mtspr __stringify(rn) ,%0 : \ : r ((unsigned long)(v)) \ : memory) +#ifdef CONFIG_PPC_BOOK3S_64 +#define mfvtb()({unsigned long rval; \ + asm volatile(mfspr %0, %1 : \ +=r (rval) : i (SPRN_VTB)); rval;}) +#else +#define mfvtb() BUG() +#endif #ifdef __powerpc64__ #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index c1f267694acb..1e89dbc665d9 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -101,6 +101,18 @@ static inline u64 get_rtc(void) return (u64)hi * 10 + lo; } +#ifdef CONFIG_PPC_BOOK3S_64 +static inline u64 get_vtb(void) +{ + return mfvtb(); +} +#else +static inline u64 get_vtb(void) +{ + return 0; +} +#endif + #ifdef CONFIG_PPC64 static inline u64 get_tb(void) { diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c index e1f1e5e16449..4b58d8a90cb5 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -528,6 +528,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; + case SPRN_VTB: + *spr_val = vcpu-arch.vtb; + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 02231f5193c2..b5598e9cdd09 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -120,6 +120,8 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, * to find the guest purr and spurr value. */ vcpu-arch.entry_tb = get_tb(); + vcpu-arch.entry_vtb = get_vtb(); + } /* Copy data touched by real-mode code from shadow vcpu back to vcpu */ @@ -171,6 +173,7 @@ out: */ vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb; + vcpu-arch.vtb += get_vtb() - vcpu-arch.entry_vtb; } static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 04/10] KVM: PPC: BOOK3S: PR: Emulate Thread identification register
Since PR KVM doesn't support SMT yet, we always return 0. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index abe6f3057e5b..e74dda36ebea 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -561,6 +561,12 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_DABR: *spr_val = 0; break; + case SPRN_TIR: + /* +* We don't have SMT support for PR yet, hence always return 0 +*/ + *spr_val = 0; + break; default: unprivileged: printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn); -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 05/10] KVM: PPC: BOOK3S: PR: Doorbell support
We don't have SMT support yet, hence we should not find a doorbell message generated Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index e74dda36ebea..9cf0392e3dcf 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -28,7 +28,9 @@ #define OP_19_XOP_RFI 50 #define OP_31_XOP_MFMSR83 +#define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MTMSR146 +#define OP_31_XOP_MSGCLRP 174 #define OP_31_XOP_MTMSRD 178 #define OP_31_XOP_MTSR 210 #define OP_31_XOP_MTSRIN 242 @@ -286,6 +288,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } + case OP_31_XOP_MSGSNDP: + { + /* +* PR KVM still don't support SMT mode. So we should +* not see a MSGSNDP/MSGCLRP used with PR KVM +*/ + pr_info(KVM: MSGSNDP used in non SMT case\n); + emulated = EMULATE_FAIL; + break; + } + case OP_31_XOP_MSGCLRP: + { + pr_info(KVM: MSGCLRP used in non SMT case\n); + emulated = EMULATE_FAIL; + break; + } default: emulated = EMULATE_FAIL; } -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 06/10] KVM: PPC: BOOK3S: PR: Emulate DPDES register
Since we don't support SMT yet, we should always find zero in Directed privileged doorbell exception state register. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 9cf0392e3dcf..7f25adbd2590 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -585,6 +585,12 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = 0; break; + case SPRN_DPDES: + /* +* We don't have SMT support for PR yet, hence always return 0 +*/ + *spr_val = 0; + break; default: unprivileged: printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn); -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 08/10] KVM: PPC: BOOK3S: PR: Add support for facility unavailable interrupt
At this point we allow all the supported facilities except EBB. So forward the interrupt to guest as illegal instruction. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_asm.h | 4 +++- arch/powerpc/kvm/book3s.c | 4 arch/powerpc/kvm/book3s_emulate.c | 18 ++ arch/powerpc/kvm/book3s_pr.c | 17 + 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 1bd92fd43cfb..799244face51 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -99,6 +99,7 @@ #define BOOK3S_INTERRUPT_PERFMON 0xf00 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 #define BOOK3S_INTERRUPT_VSX 0xf40 +#define BOOK3S_INTERRUPT_FAC_UNAVAIL0xf60 #define BOOK3S_IRQPRIO_SYSTEM_RESET0 #define BOOK3S_IRQPRIO_DATA_SEGMENT1 @@ -117,7 +118,8 @@ #define BOOK3S_IRQPRIO_DECREMENTER 14 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15 #define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 16 -#define BOOK3S_IRQPRIO_MAX 17 +#define BOOK3S_IRQPRIO_FAC_UNAVAIL 17 +#define BOOK3S_IRQPRIO_MAX 18 #define BOOK3S_HFLAG_DCBZ320x1 #define BOOK3S_HFLAG_SLB 0x2 diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 8912608b7e1b..a9aea28c2677 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -143,6 +143,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;break; case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; + case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL; break; default:prio = BOOK3S_IRQPRIO_MAX; break; } @@ -273,6 +274,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: vec = BOOK3S_INTERRUPT_PERFMON; break; + case BOOK3S_IRQPRIO_FAC_UNAVAIL: + vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; + break; default: deliver = 0; printk(KERN_ERR KVM: Unknown interrupt: 0x%x\n, priority); diff --git 
a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 60d0b6b745e7..bf6b11021250 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -481,6 +481,15 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) vcpu-arch.shadow_fscr = vcpu-arch.fscr host_fscr; break; } + case SPRN_EBBHR: + vcpu-arch.ebbhr = spr_val; + break; + case SPRN_EBBRR: + vcpu-arch.ebbrr = spr_val; + break; + case SPRN_BESCR: + vcpu-arch.bescr = spr_val; + break; unprivileged: default: printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn); @@ -607,6 +616,15 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_FSCR: *spr_val = vcpu-arch.fscr; break; + case SPRN_EBBHR: + *spr_val = vcpu-arch.ebbhr; + break; + case SPRN_EBBRR: + *spr_val = vcpu-arch.ebbrr; + break; + case SPRN_BESCR: + *spr_val = vcpu-arch.bescr; + break; default: unprivileged: printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 51d469f8c9fd..828056ec208f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -900,6 +900,23 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PERFMON: r = RESUME_GUEST; break; + case BOOK3S_INTERRUPT_FAC_UNAVAIL: + { + /* +* Check for the facility that need to be emulated +*/ + ulong fscr_ic = vcpu-arch.shadow_fscr 56; + if (fscr_ic != FSCR_EBB_LG) { + /* +* We only disable EBB facility. +* So only emulate that. +*/ + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + } + /* Fall through */ + } case BOOK3S_INTERRUPT_PROGRAM: case BOOK3S_INTERRUPT_H_EMUL_ASSIST: { -- 1.8.5.3 ___ Linuxppc-dev mailing
[RFC PATCH 07/10] KVM: PPC: BOOK3S: PR: Emulate facility status and control register
We allow priv-mode update of this. The guest value is saved in fscr, and the value actually used is saved in shadow_fscr. shadow_fscr only contains values that are allowed by the host. On facility unavailable interrupt, if the facility is allowed by fscr but disabled in shadow_fscr we need to emulate the support. Currently all but EBB is disabled. We still don't support performance monitoring in PR guest. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_book3s_asm.h | 1 + arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kernel/asm-offsets.c | 2 ++ arch/powerpc/kvm/book3s_emulate.c | 16 arch/powerpc/kvm/book3s_interrupts.S | 25 ++--- 5 files changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h index 192917d2239c..abd42523ad93 100644 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h @@ -103,6 +103,7 @@ struct kvmppc_host_state { #ifdef CONFIG_PPC_BOOK3S_64 u64 cfar; u64 ppr; + u64 host_fscr; #endif }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e0b13aca98e6..f4be7be14330 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -478,6 +478,7 @@ struct kvm_vcpu_arch { ulong ppr; ulong pspb; ulong fscr; + ulong shadow_fscr; ulong tfhar; ulong tfiar; ulong texasr; diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 2c2227da6917..7484676b8f25 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -525,6 +525,7 @@ int main(void) DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar)); DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr)); DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr)); + DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr)); DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb)); 
DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar)); DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar)); @@ -626,6 +627,7 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 HSTATE_FIELD(HSTATE_CFAR, cfar); HSTATE_FIELD(HSTATE_PPR, ppr); + HSTATE_FIELD(HSTATE_FSCR, host_fscr); #endif /* CONFIG_PPC_BOOK3S_64 */ #else /* CONFIG_PPC_BOOK3S */ diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 7f25adbd2590..60d0b6b745e7 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -468,6 +468,19 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_MSSSR0: case SPRN_DABR: break; + case SPRN_FSCR: + { + ulong host_fscr = mfspr(SPRN_FSCR); + /* +* We disable FSCR_EBB for pr guest. TAR and DSCR are always +* enabled. +*/ + if (spr_val ~(FSCR_TAR|FSCR_DSCR|FSCR_EBB)) + pr_info(KVM: invalid FSCR value 0x%lx, spr_val); + vcpu-arch.fscr = spr_val (FSCR_TAR|FSCR_DSCR); + vcpu-arch.shadow_fscr = vcpu-arch.fscr host_fscr; + break; + } unprivileged: default: printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn); @@ -591,6 +604,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = 0; break; + case SPRN_FSCR: + *spr_val = vcpu-arch.fscr; + break; default: unprivileged: printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn); diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S index f779450cb07c..fcbdf4817301 100644 --- a/arch/powerpc/kvm/book3s_interrupts.S +++ b/arch/powerpc/kvm/book3s_interrupts.S @@ -107,6 +107,14 @@ kvm_start_lightweight: ld r3, VCPU_SHARED(r4) ld r3, VCPU_SHARED_SPRG3(r3) mtspr SPRN_SPRG3, r3 + +BEGIN_FTR_SECTION + mfspr r3,SPRN_FSCR + PPC_STL r3, HSTATE_FSCR(r13) + + PPC_LL r3, VCPU_SHADOW_FSCR(r4) + mtspr SPRN_FSCR, r3 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) #endif /* CONFIG_PPC_BOOK3S_64 */ PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ @@ -148,6 +156,9 @@
kvm_start_lightweight: bl FUNC(kvmppc_copy_from_svcpu) nop + /* R7 = vcpu */ + PPC_LL r7, GPR4(r1) + #ifdef CONFIG_PPC_BOOK3S_64 /* * Reload kernel SPRG3 value. @@ -155,10 +166,18 @@ kvm_start_lightweight: */ ld r3, PACA_SPRG3(r13) mtspr SPRN_SPRG3, r3 -#endif
[RFC PATCH 10/10] PPC: BOOK3S: Disable/Enable TM looking at the ibm,pa-features device tree entry
Runtime disable transactional memory feature looking at pa-features device tree entry. We need to do this so that we can run a kernel built with TM config in PR mode. For PR guest we provide a device tree entry with TM feature disabled in pa-features Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kernel/prom.c | 5 + 1 file changed, 5 insertions(+) diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index fa0ad8aafbcc..de8c2caf1024 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -160,6 +160,11 @@ static struct ibm_pa_feature { {CPU_FTR_NODSISRALIGN, 0, 0,1, 1, 1}, {0, MMU_FTR_CI_LARGE_PAGE, 0, 1, 2, 0}, {CPU_FTR_REAL_LE, PPC_FEATURE_TRUE_LE, 5, 0, 0}, + /* +* We should use CPU_FTR_TM_COMP so that if we disable TM, it won't get +* enabled via device tree +*/ + {CPU_FTR_TM_COMP, 0, 0, 22, 0, 0}, }; static void __init scan_features(unsigned long node, unsigned char *ftrs, -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 09/10] KVM: PPC: BOOK3S: PR: Ignore writes to monitor mode control registers
We ignore write to these registers now Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index bf6b11021250..c0aee34ef04f 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -490,6 +490,16 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) case SPRN_BESCR: vcpu-arch.bescr = spr_val; break; + case SPRN_MMCRS: + break; + case SPRN_MMCRA: + break; + case SPRN_MMCR0: + break; + case SPRN_MMCR1: + break; + case SPRN_MMCR2: + break; unprivileged: default: printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn); -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[RFC PATCH 03/10] KVM: PPC: BOOK3S: PR: Emulate instruction counter
Writing to IC is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_pr.c| 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9ebdd12e50a9..e0b13aca98e6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -509,6 +509,7 @@ struct kvm_vcpu_arch { /* Time base value when we entered the guest */ u64 entry_tb; u64 entry_vtb; + u64 entry_ic; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 4b58d8a90cb5..abe6f3057e5b 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -531,6 +531,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_VTB: *spr_val = vcpu-arch.vtb; break; + case SPRN_IC: + *spr_val = vcpu-arch.ic; + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b5598e9cdd09..51d469f8c9fd 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -121,6 +121,7 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, */ vcpu-arch.entry_tb = get_tb(); vcpu-arch.entry_vtb = get_vtb(); + vcpu-arch.entry_ic = mfspr(SPRN_IC); } @@ -174,6 +175,7 @@ out: vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.vtb += get_vtb() - vcpu-arch.entry_vtb; + vcpu-arch.ic += mfspr(SPRN_IC) - vcpu-arch.entry_ic; } static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) -- 1.8.5.3 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: don't re-issue spinlock typedef that breaks older gcc
Paul Gortmaker paul.gortma...@windriver.com writes: Commit b3084f4db3aeb991c507ca774337c7e7893ed04f (powerpc/thp: Fix crash on mremap) added a typedef struct spinlock spinlock_t; which on gcc 4.5.2 (and possibly other versions) causes many of: include/linux/spinlock_types.h:76:3: error: redefinition of typedef 'spinlock_t' arch/powerpc/include/asm/pgtable-ppc64.h:563:25: note: previous declaration of 'spinlock_t' was here In file included from include/linux/mutex.h:15:0, from include/linux/notifier.h:13, from include/linux/pm_qos.h:8, from include/linux/netdevice.h:28, from drivers/net/wireless/ath/wil6210/wil6210.h:20, from drivers/net/wireless/ath/wil6210/debug.c:17: It appears that somewhere between gcc 4.5.2 and 4.6.3 this redefinition restriction was lifted. Using the proper header from within !ASSEMBLY seems to fix it up in an acceptable way. Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Cc: Kirill A. Shutemov kirill.shute...@linux.intel.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com --- http://mid.gmane.org/1389939036.3000.7.ca...@thinkpad-t5421.cn.ibm.com This was posted earlier. 
[ Note that b3084f4db3 isn't mainline yet, it is currently in benh/powerpc.git #merge -- but is headed there soon via: https://lkml.org/lkml/2014/1/27/599 ] arch/powerpc/include/asm/pgtable-ppc64.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index d27960c89a71..3b638411646a 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -111,6 +111,8 @@ #ifndef __ASSEMBLY__ +#include linux/spinlock_types.h + /* * This is the default implementation of various PTE accessors, it's * used in all cases except Book3S with 64K pages where we have a @@ -560,7 +562,6 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define pmd_move_must_withdraw pmd_move_must_withdraw -typedef struct spinlock spinlock_t; static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, spinlock_t *old_pmd_ptl) { -- 1.8.5.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH] powerpc: don't re-issue spinlock typedef that breaks older gcc
Paul Gortmaker paul.gortma...@windriver.com writes: On 14-01-28 12:28 PM, Aneesh Kumar K.V wrote: Paul Gortmaker paul.gortma...@windriver.com writes: Commit b3084f4db3aeb991c507ca774337c7e7893ed04f (powerpc/thp: Fix crash on mremap) added a typedef struct spinlock spinlock_t; which on gcc 4.5.2 (and possibly other versions) causes many of: include/linux/spinlock_types.h:76:3: error: redefinition of typedef 'spinlock_t' arch/powerpc/include/asm/pgtable-ppc64.h:563:25: note: previous declaration of 'spinlock_t' was here In file included from include/linux/mutex.h:15:0, from include/linux/notifier.h:13, from include/linux/pm_qos.h:8, from include/linux/netdevice.h:28, from drivers/net/wireless/ath/wil6210/wil6210.h:20, from drivers/net/wireless/ath/wil6210/debug.c:17: It appears that somewhere between gcc 4.5.2 and 4.6.3 this redefinition restriction was lifted. Using the proper header from within !ASSEMBLY seems to fix it up in an acceptable way. Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Cc: Kirill A. Shutemov kirill.shute...@linux.intel.com Cc: Benjamin Herrenschmidt b...@kernel.crashing.org Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com --- http://mid.gmane.org/1389939036.3000.7.ca...@thinkpad-t5421.cn.ibm.com This was posted earlier. I see. Well I guess Ben didn't use it since it is the same as the temporary not-signed-off-by hack patch I posted earlier as well. https://lkml.org/lkml/2014/1/27/584 I believe what I've posted here below to be the proper fix. I had another variant which needed this http://mid.gmane.org/1388999012-14424-1-git-send-email-aneesh.ku...@linux.vnet.ibm.com BTW I had added the above struct spinlock; patch as the backport to stable 3.13 series. So if we are picking another one, we may need to update stable also -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/2] Fix compile error of pgtable-ppc64.h
Greg KH g...@kroah.com writes: On Thu, Jan 30, 2014 at 09:57:36AM +1100, Benjamin Herrenschmidt wrote: On Wed, 2014-01-29 at 10:45 -0800, Greg KH wrote: On Tue, Jan 28, 2014 at 05:52:42PM +0530, Aneesh Kumar K.V wrote: From: Li Zhong zh...@linux.vnet.ibm.com It seems that forward declaration couldn't work well with typedef, use struct spinlock directly to avoiding following build errors: In file included from include/linux/spinlock.h:81, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:17, from arch/powerpc/kernel/asm-offsets.c:17: include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t' /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous declaration of 'spinlock_t' was here build fix for upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.13 stable series I don't understand, why is this needed? Is there a corrisponding patch upstream that already does this? What went wrong with a normal backport of the patch to 3.13? There's a corresponding patch in powerpc-next that I'm about to send to Linus today, but for the backport, the fix could be folded into the original offending patch. Oh come on, you know better than to try to send me a patch that isn't in Linus's tree already. Crap, I can't take that at all. Send me the git commit id when it is in Linus's tree, otherwise I'm not taking it. And no, don't fold in anything, that's not ok either. I'll just go drop this patch entirely from all of my -stable trees for now. Feel free to resend them when all of the needed stuff is upstream. The fix for mremap crash is already in Linus tree. It is the build failure for older gcc compiler version that is not in linus tree. We missed that in the first pull request. Do we really need to drop the patch from 3.11 and 3.12 trees ? The patch their is a variant, and don't require this build fix. 
-aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/2] Fix compile error of pgtable-ppc64.h
Greg KH g...@kroah.com writes: On Thu, Jan 30, 2014 at 11:08:52PM +0530, Aneesh Kumar K.V wrote: Greg KH g...@kroah.com writes: On Thu, Jan 30, 2014 at 09:57:36AM +1100, Benjamin Herrenschmidt wrote: On Wed, 2014-01-29 at 10:45 -0800, Greg KH wrote: On Tue, Jan 28, 2014 at 05:52:42PM +0530, Aneesh Kumar K.V wrote: From: Li Zhong zh...@linux.vnet.ibm.com It seems that forward declaration couldn't work well with typedef, use struct spinlock directly to avoiding following build errors: In file included from include/linux/spinlock.h:81, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:17, from arch/powerpc/kernel/asm-offsets.c:17: include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t' /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous declaration of 'spinlock_t' was here build fix for upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.13 stable series I don't understand, why is this needed? Is there a corrisponding patch upstream that already does this? What went wrong with a normal backport of the patch to 3.13? There's a corresponding patch in powerpc-next that I'm about to send to Linus today, but for the backport, the fix could be folded into the original offending patch. Oh come on, you know better than to try to send me a patch that isn't in Linus's tree already. Crap, I can't take that at all. Send me the git commit id when it is in Linus's tree, otherwise I'm not taking it. And no, don't fold in anything, that's not ok either. I'll just go drop this patch entirely from all of my -stable trees for now. Feel free to resend them when all of the needed stuff is upstream. The fix for mremap crash is already in Linus tree. What is the git commit id? upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f That is patch 1 in this series. 
It is the build failure for older gcc compiler version that is not in linus tree. That is what I can not take. We missed that in the first pull request. Do we really need to drop the patch from 3.11 and 3.12 trees ? I already did. The patch their is a variant, and don't require this build fix. Don't give me a variant, give me the exact same patch, only changed to handle the fuzz/differences of older kernels, don't make different changes to the original patch to make up for things you found out later on, otherwise everyone is confused as to why the fix for the fix is not in the tree. In this specific case it may be difficult. 3.13 have other changes around the code path. It has split pmd locks etc which result in us doing a withdraw and deposit even on x86. For 3.11 and 3.12, we need to do that extra withdraw and deposit only for ppc64. Hence the variant which used #ifdef around that code. So, when both patches get in Linus's tree, please send me the properly backported patches and I'll be glad to apply them. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH V2 1/2] powerpc/thp: Fix crash on mremap
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.13 stable series Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org --- arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++ include/asm-generic/pgtable.h| 12 mm/huge_memory.c | 14 +- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index 4a191c472867..d27960c89a71 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #define __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); + +#define pmd_move_must_withdraw pmd_move_must_withdraw +typedef struct spinlock spinlock_t; +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. 
So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + return true; +} + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index db0923458940..8e4f41d9af4d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, +spinlock_t *old_pmd_ptl) +{ + /* +* With split pmd lock we also need to move preallocated +* PTE page table if new_pmd is on different PMD page table. +*/ + return new_pmd_ptl != old_pmd_ptl; +} +#endif + /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 95d1acb0f3d2..5d80c53b87cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) { - pgtable_t pgtable; - /* -* Move preallocated PTE page table if new_pmd is on -* different PMD page table. -*/ + if (pmd_move_must_withdraw(new_ptl, old_ptl)) { + pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); - - spin_unlock(new_ptl); } + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); spin_unlock(old_ptl); } out: -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH V2 2/2] powerpc/mm: Fix compile error of pgtable-ppc64.h
From: Li Zhong zh...@linux.vnet.ibm.com It seems that forward declaration couldn't work well with typedef, use struct spinlock directly to avoiding following build errors: In file included from include/linux/spinlock.h:81, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:17, from arch/powerpc/kernel/asm-offsets.c:17: include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t' /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous declaration of 'spinlock_t' was here upstream sha1:fd120dc2e205d2318a8b47d6d8098b789e3af67d for 3.13 stable series Signed-off-by: Li Zhong zh...@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org --- arch/powerpc/include/asm/pgtable-ppc64.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index d27960c89a71..bc141c950b1e 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define pmd_move_must_withdraw pmd_move_must_withdraw -typedef struct spinlock spinlock_t; -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, -spinlock_t *old_pmd_ptl) +struct spinlock; +static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, +struct spinlock *old_pmd_ptl) { /* * Archs like ppc64 use pgtable to store per pmd -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 01/10] KVM: PPC: BOOK3S: PR: Fix PURR and SPURR emulation
Alexander Graf ag...@suse.de writes: On 01/28/2014 05:44 PM, Aneesh Kumar K.V wrote: We definitely don't need to emulate mtspr, because both the registers are hypervisor resource. This patch description doesn't cover what the patch actually does. It changes the implementation from always tell the guest it uses 100% to give the guest an accurate amount of cpu time spent inside guest context. Will fix that Also, I think we either go with full hyp semantics which means we also emulate the offset or we go with no hyp awareness in the guest at all which means we also don't emulate SPURR which is a hyp privileged register. Can you clarify this ? Otherwise I like the patch :). -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 02/10] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
Paul Mackerras pau...@samba.org writes: On Tue, Jan 28, 2014 at 10:14:07PM +0530, Aneesh Kumar K.V wrote: virtual time base register is a per vm register and need to saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. ... +#ifdef CONFIG_PPC_BOOK3S_64 +#define mfvtb() ({unsigned long rval; \ +asm volatile(mfspr %0, %1 : \ + =r (rval) : i (SPRN_VTB)); rval;}) The mfspr will be a no-op on anything before POWER8, meaning the result will be whatever value was in the destination GPR before the mfspr. I suppose that may not matter if the result is only ever used when we're running on a POWER8 host, but I would feel more comfortable if we had explicit feature tests to make sure of that, rather than possibly doing computations with unpredictable values. With your patch, a guest on a POWER7 or a PPC970 could do a read from VTB and get garbage -- first, there is nothing to stop userspace from requesting POWER8 emulation on an older machine, and secondly, even if the virtual machine is a PPC970 (say) you don't implement unimplemented SPR semantics for VTB (no-op if PR=0, illegal instruction interrupt if PR=1). Ok that means we need to do something like ? struct cpu_spec *s = find_cpuspec(vcpu-arch.pvr); if (s-cpu_features CPU_FTR_ARCH_207S) { } On the whole I think it is reasonable to reject an attempt to set the virtual PVR to a POWER8 PVR value if we are not running on a POWER8 host, because emulating all the new POWER8 features in software (particularly transactional memory) would not be feasible. Alex may disagree. :) That would make it much simpler. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 03/10] KVM: PPC: BOOK3S: PR: Emulate instruction counter
Alexander Graf ag...@suse.de writes: On 01/28/2014 05:44 PM, Aneesh Kumar K.V wrote: Writing to IC is not allowed in the privileged mode. This is not a patch description. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_pr.c| 2 ++ 3 files changed, 6 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9ebdd12e50a9..e0b13aca98e6 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -509,6 +509,7 @@ struct kvm_vcpu_arch { /* Time base value when we entered the guest */ u64 entry_tb; u64 entry_vtb; +u64 entry_ic; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 4b58d8a90cb5..abe6f3057e5b 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -531,6 +531,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_VTB: *spr_val = vcpu-arch.vtb; break; +case SPRN_IC: +*spr_val = vcpu-arch.ic; +break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index b5598e9cdd09..51d469f8c9fd 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -121,6 +121,7 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, */ vcpu-arch.entry_tb = get_tb(); vcpu-arch.entry_vtb = get_vtb(); +vcpu-arch.entry_ic = mfspr(SPRN_IC); Is this implemented on all systems? } @@ -174,6 +175,7 @@ out: vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.vtb += get_vtb() - vcpu-arch.entry_vtb; +vcpu-arch.ic += mfspr(SPRN_IC) - vcpu-arch.entry_ic; This is getting quite convoluted. 
How about we act slightly more fuzzy and put all of this into vcpu_load/put? I am not sure whether vcpu_load/put is too early/late to save these context ? -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 07/10] KVM: PPC: BOOK3S: PR: Emulate facility status and control register
Paul Mackerras pau...@samba.org writes: On Tue, Jan 28, 2014 at 10:14:12PM +0530, Aneesh Kumar K.V wrote: We allow priv-mode update of this. The guest value is saved in fscr, and the value actually used is saved in shadow_fscr. shadow_fscr only contains values that are allowed by the host. On facility unavailable interrupt, if the facility is allowed by fscr but disabled in shadow_fscr we need to emulate the support. Currently all but EBB is disabled. We still don't support performance monitoring in PR guest. ... +/* + * Save the current fscr in shadow fscr + */ +mfspr r3,SPRN_FSCR +PPC_STL r3, VCPU_SHADOW_FSCR(r7) I don't think you need to do this. What could possibly have changed FSCR since we loaded it on the way into the guest? The reason for facility unavailable interrupt is encoded in FSCR right ? -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 08/10] KVM: PPC: BOOK3S: PR: Add support for facility unavailable interrupt
Alexander Graf ag...@suse.de writes: On 01/28/2014 05:44 PM, Aneesh Kumar K.V wrote: At this point we allow all the supported facilities except EBB. So forward the interrupt to guest as illegal instruction. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_asm.h | 4 +++- arch/powerpc/kvm/book3s.c | 4 arch/powerpc/kvm/book3s_emulate.c | 18 ++ arch/powerpc/kvm/book3s_pr.c | 17 + 4 files changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 1bd92fd43cfb..799244face51 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -99,6 +99,7 @@ #define BOOK3S_INTERRUPT_PERFMON 0xf00 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 #define BOOK3S_INTERRUPT_VSX 0xf40 +#define BOOK3S_INTERRUPT_FAC_UNAVAIL0xf60 #define BOOK3S_IRQPRIO_SYSTEM_RESET0 #define BOOK3S_IRQPRIO_DATA_SEGMENT1 @@ -117,7 +118,8 @@ #define BOOK3S_IRQPRIO_DECREMENTER 14 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15 #define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 16 -#define BOOK3S_IRQPRIO_MAX 17 +#define BOOK3S_IRQPRIO_FAC_UNAVAIL 17 +#define BOOK3S_IRQPRIO_MAX 18 #define BOOK3S_HFLAG_DCBZ320x1 #define BOOK3S_HFLAG_SLB 0x2 diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 8912608b7e1b..a9aea28c2677 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -143,6 +143,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec) case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;break; case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; +case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL; break; default:prio = BOOK3S_IRQPRIO_MAX; break; } @@ -273,6 +274,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: vec = BOOK3S_INTERRUPT_PERFMON; break; +case BOOK3S_IRQPRIO_FAC_UNAVAIL: +vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; +break; default: 
deliver = 0; printk(KERN_ERR KVM: Unknown interrupt: 0x%x\n, priority); diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 60d0b6b745e7..bf6b11021250 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -481,6 +481,15 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) vcpu-arch.shadow_fscr = vcpu-arch.fscr host_fscr; break; } +case SPRN_EBBHR: +vcpu-arch.ebbhr = spr_val; +break; +case SPRN_EBBRR: +vcpu-arch.ebbrr = spr_val; +break; +case SPRN_BESCR: +vcpu-arch.bescr = spr_val; +break; unprivileged: default: printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn); @@ -607,6 +616,15 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_FSCR: *spr_val = vcpu-arch.fscr; break; +case SPRN_EBBHR: +*spr_val = vcpu-arch.ebbhr; +break; +case SPRN_EBBRR: +*spr_val = vcpu-arch.ebbrr; +break; +case SPRN_BESCR: +*spr_val = vcpu-arch.bescr; +break; default: unprivileged: printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 51d469f8c9fd..828056ec208f 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -900,6 +900,23 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, case BOOK3S_INTERRUPT_PERFMON: r = RESUME_GUEST; break; +case BOOK3S_INTERRUPT_FAC_UNAVAIL: +{ +/* + * Check for the facility that need to be emulated + */ +ulong fscr_ic = vcpu-arch.shadow_fscr 56; +if (fscr_ic != FSCR_EBB_LG) { +/* + * We only disable EBB facility. + * So only emulate that. I don't understand the comment. We emulate nothing at all here. We either - hit an EBB unavailable in which case we send the guest an illegal instruction interrupt or we - hit another facility interrupt in which case we forward the interrupt to the guest, but not the interrupt cause (fscr_ic). What i wanted to achive was, enable both TAR
[PATCH V2] powerpc: thp: Fix crash on mremap
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.12 stable series Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/Kconfig | 3 +++ arch/powerpc/platforms/Kconfig.cputype | 1 + mm/huge_memory.c | 12 3 files changed, 16 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index af2cc6eabcc7..bca9e7a18bd2 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -365,6 +365,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE config HAVE_ARCH_SOFT_DIRTY bool +config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + bool + config HAVE_MOD_ARCH_SPECIFIC bool help diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 6704e2e20e6b..0225011231ea 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -71,6 +71,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select SYS_SUPPORTS_HUGETLBFS + select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES config PPC_BOOK3E_64 diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 292a266e0d42..89b7a647f1cb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ 
-1474,8 +1474,20 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, ret = __pmd_trans_huge_lock(old_pmd, vma); if (ret == 1) { +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + pgtable_t pgtable; +#endif pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW + /* +* Archs like ppc64 use pgtable to store per pmd +* specific information. So when we switch the pmd, +* we should also withdraw and deposit the pgtable +*/ + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); +#endif set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(mm-page_table_lock); } -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We will use this later to set the _PAGE_NUMA bit. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/hugetlb.h | 2 +- arch/powerpc/include/asm/pgtable-ppc64.h | 26 +++--- arch/powerpc/mm/pgtable_64.c | 12 +++- arch/powerpc/mm/subpage-prot.c | 2 +- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index d750336b171d..623f2971ce0e 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_PPC64 - return __pte(pte_update(mm, addr, ptep, ~0UL, 1)); + return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); #else return __pte(pte_update(ptep, ~0UL, 0)); #endif diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index bc141c950b1e..eb9261024f51 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long clr, + unsigned long set, int huge) { #ifdef PTE_ATOMIC_UPDATES @@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct *mm, andi. %1,%0,%6\n\ bne-1b \n\ andc%1,%0,%4 \n\ + or %1,%1,%7\n\ stdcx. 
%1,0,%3 \n\ bne-1b : =r (old), =r (tmp), =m (*ptep) - : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY) + : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY), r (set) : cc ); #else unsigned long old = pte_val(*ptep); - *ptep = __pte(old ~clr); + *ptep = __pte((old ~clr) | set); #endif /* huge pages use the old page table lock */ if (!huge) @@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if ((pte_val(*ptep) (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) + if ((pte_val(*ptep) (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) return 0; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0); + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old _PAGE_ACCESSED) != 0; } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG @@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, if ((pte_val(*ptep) _PAGE_RW) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 0); + pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, @@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, if ((pte_val(*ptep) _PAGE_RW) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 1); + pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); } /* @@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0); + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); return __pte(old); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t * ptep) { - pte_update(mm, addr, ptep, ~0UL, 0); + pte_update(mm, addr, ptep, ~0UL, 0, 0); } @@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma, extern unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, -pmd_t *pmdp, unsigned long clr); +pmd_t 
*pmdp, +unsigned long clr, +unsigned long set); static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, if ((pmd_val(*pmdp) (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED); + old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0); return ((old
[PATCH 2/3] mm: dirty accountable change only apply to non prot numa case
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com So move it within the if loop Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- mm/mprotect.c | 21 +++-- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 7332c1785744..33eab902f10e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_numa(ptent)) ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); + /* +* Avoid taking write faults for pages we +* know to be dirty. +*/ + if (dirty_accountable pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; } else { struct page *page; @@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } } } - - /* -* Avoid taking write faults for pages we know to be -* dirty. -*/ - if (dirty_accountable pte_dirty(ptent)) { - ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) pages++; - - /* Only !prot_numa always clears the pte */ - if (!prot_numa) - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com Archs like ppc64 doesn't do tlb flush in set_pte/pmd functions. ppc64 also doesn't implement flush_tlb_range. ppc64 require the tlb flushing to be batched within ptl locks. The reason to do that is to ensure that the hash page table is in sync with linux page table. We track the hpte index in linux pte and if we clear them without flushing hash and drop the ptl lock, we can have another cpu update the pte and can end up with double hash. We also want to keep set_pte_at simpler by not requiring them to do hash flush for performance reason. Hence cannot use them while updating _PAGE_NUMA bit. Add new functions for marking pte/pmd numa Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/pgtable.h | 22 ++ include/asm-generic/pgtable.h | 24 mm/huge_memory.c | 9 ++--- mm/mprotect.c | 4 +--- 4 files changed, 49 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index f83b6f3e1b39..3ebb188c3ff5 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -75,12 +75,34 @@ static inline pte_t pte_mknuma(pte_t pte) return pte; } +#define ptep_set_numa ptep_set_numa +static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, +pte_t *ptep) +{ + if ((pte_val(*ptep) _PAGE_PRESENT) == 0) + VM_BUG_ON(1); + + pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); + return; +} + #define pmd_numa pmd_numa static inline int pmd_numa(pmd_t pmd) { return pte_numa(pmd_pte(pmd)); } +#define pmdp_set_numa pmdp_set_numa +static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, +pmd_t *pmdp) +{ + if ((pmd_val(*pmdp) _PAGE_PRESENT) == 0) + VM_BUG_ON(1); + + pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); + return; +} + #define pmd_mknonnuma pmd_mknonnuma static inline pmd_t pmd_mknonnuma(pmd_t pmd) { diff --git a/include/asm-generic/pgtable.h 
b/include/asm-generic/pgtable.h index 8e4f41d9af4d..93fdb5315a0d 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -669,6 +669,18 @@ static inline int pmd_numa(pmd_t pmd) } #endif +#ifndef pmdp_set_numa +static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, +pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + pmd = pmd_mknuma(entry); + set_pmd_at(mm, addr, pmdp, pmd); + return; +} +#endif + /* * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically * because they're called by the NUMA hinting minor page fault. If we @@ -701,6 +713,18 @@ static inline pte_t pte_mknuma(pte_t pte) } #endif +#ifndef ptep_set_numa +static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, +pte_t *ptep) +{ + pte_t ptent = *ptep; + + ptent = pte_mknuma(ptent); + set_pte_at(mm, addr, ptep, ptent); + return; +} +#endif + #ifndef pmd_mknuma static inline pmd_t pmd_mknuma(pmd_t pmd) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82166bf974e1..da23eb96779f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1545,6 +1545,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_mknonnuma(entry); entry = pmd_modify(entry, newprot); ret = HPAGE_PMD_NR; + set_pmd_at(mm, addr, pmd, entry); BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); @@ -1557,16 +1558,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, */ if (!is_huge_zero_page(page) !pmd_numa(*pmd)) { - entry = *pmd; - entry = pmd_mknuma(entry); + pmdp_set_numa(mm, addr, pmd); ret = HPAGE_PMD_NR; } } - - /* Set PMD if cleared earlier */ - if (ret == HPAGE_PMD_NR) - set_pmd_at(mm, addr, pmd, entry); - spin_unlock(ptl); } diff --git a/mm/mprotect.c b/mm/mprotect.c index 33eab902f10e..769a67a15803 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,12 +69,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } else { struct page *page; - ptent = *pte; page = vm_normal_page(vma, addr
[PATCH 0/3] powerpc: Fix random application crashes with NUMA_BALANCING enabled
Hello, This patch series fixes random application crashes observed on ppc64 with numa balancing enabled. Without the patch we see crashes like anacron[14551]: unhandled signal 11 at 0041 nip 3cfd54b4 lr 3cfd5464 code 30001 anacron[14599]: unhandled signal 11 at 0041 nip 3efc54b4 lr 3efc5464 code 30001 -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V2] powerpc: thp: Fix crash on mremap
Greg KH gre...@linuxfoundation.org writes: On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.12 stable series This doesn't look like a variant, it looks totally different. Why can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch (and follow-on fix) for 3.12? Because the code in that function changed in 3.13. Kirill added split ptl locks for huge pte, and we decide whether to withdraw and deposit again based on the ptl locks in 3.13. In 3.12 we do that only for ppc64 using #ifdef I _REALLY_ dislike patches that are totally different from Linus's tree in stable trees, it has caused nothing but problems in the past. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH V2] powerpc: thp: Fix crash on mremap
Benjamin Herrenschmidt b...@kernel.crashing.org writes: On Tue, 2014-02-11 at 09:31 -0800, Greg KH wrote: On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote: From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com This patch fix the below crash NIP [c004cee4] .__hash_page_thp+0x2a4/0x440 LR [c00439ac] .hash_page+0x18c/0x5e0 ... Call Trace: [c00736103c40] [1b00] 0x1b00(unreliable) [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58 On ppc64 we use the pgtable for storing the hpte slot information and store address to the pgtable at a constant offset (PTRS_PER_PMD) from pmd. On mremap, when we switch the pmd, we need to withdraw and deposit the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset from new pmd. We also want to move the withdraw and deposit before the set_pmd so that, when page fault find the pmd as trans huge we can be sure that pgtable can be located at the offset. variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f for 3.12 stable series This doesn't look like a variant, it looks totally different. Why can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch (and follow-on fix) for 3.12? I _REALLY_ dislike patches that are totally different from Linus's tree in stable trees, it has caused nothing but problems in the past. I don't think it applies... (I tried on an internal tree) but the affected function changed in 3.13 in various ways. Aneesh, please provide a more details explanation and whether we should backport those other changes too or whether this is not necessary Yes the affected function added support for split ptl locks for huge pte. I don't think that is a stable material. . BTW. Aneesh, we need a 3.11.x one too 3.11.x it is already applied. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH V2 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com We will use this later to set the _PAGE_NUMA bit. Acked-by: Mel Gorman mgor...@suse.de Acked-by: Rik van Riel r...@redhat.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/hugetlb.h | 2 +- arch/powerpc/include/asm/pgtable-ppc64.h | 26 +++--- arch/powerpc/mm/pgtable_64.c | 12 +++- arch/powerpc/mm/subpage-prot.c | 2 +- 4 files changed, 24 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index d750336b171d..623f2971ce0e 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_PPC64 - return __pte(pte_update(mm, addr, ptep, ~0UL, 1)); + return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); #else return __pte(pte_update(ptep, ~0UL, 0)); #endif diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h index bc141c950b1e..eb9261024f51 100644 --- a/arch/powerpc/include/asm/pgtable-ppc64.h +++ b/arch/powerpc/include/asm/pgtable-ppc64.h @@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, static inline unsigned long pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long clr, + unsigned long set, int huge) { #ifdef PTE_ATOMIC_UPDATES @@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct *mm, andi. %1,%0,%6\n\ bne-1b \n\ andc%1,%0,%4 \n\ + or %1,%1,%7\n\ stdcx. 
%1,0,%3 \n\ bne-1b : =r (old), =r (tmp), =m (*ptep) - : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY) + : r (ptep), r (clr), m (*ptep), i (_PAGE_BUSY), r (set) : cc ); #else unsigned long old = pte_val(*ptep); - *ptep = __pte(old ~clr); + *ptep = __pte((old ~clr) | set); #endif /* huge pages use the old page table lock */ if (!huge) @@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if ((pte_val(*ptep) (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) + if ((pte_val(*ptep) (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) return 0; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0); + old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old _PAGE_ACCESSED) != 0; } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG @@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, if ((pte_val(*ptep) _PAGE_RW) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 0); + pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, @@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, if ((pte_val(*ptep) _PAGE_RW) == 0) return; - pte_update(mm, addr, ptep, _PAGE_RW, 1); + pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); } /* @@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0); + unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); return __pte(old); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t * ptep) { - pte_update(mm, addr, ptep, ~0UL, 0); + pte_update(mm, addr, ptep, ~0UL, 0, 0); } @@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma, extern unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, -pmd_t *pmdp, unsigned long clr); +pmd_t 
*pmdp, +unsigned long clr, +unsigned long set); static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, if ((pmd_val(*pmdp) (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0) return 0; - old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED); + old
[PATCH V2 0/3] powerpc: Fix random application crashes with NUMA_BALANCING enabled
Hello, This patch series fixes random application crashes observed on ppc64 with numa balancing enabled. Without the patch we see crashes like anacron[14551]: unhandled signal 11 at 0041 nip 3cfd54b4 lr 3cfd5464 code 30001 anacron[14599]: unhandled signal 11 at 0041 nip 3efc54b4 lr 3efc5464 code 30001 Changes from V1: * Build fix for CONFIG_NUMA_BALANCING disabled -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH V2 2/3] mm: dirty accountable change only apply to non prot numa case
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com So move it within the if loop Acked-by: Mel Gorman mgor...@suse.de Reviewed-by: Rik van Riel r...@redhat.com Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- mm/mprotect.c | 21 +++-- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 7332c1785744..33eab902f10e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_numa(ptent)) ptent = pte_mknonnuma(ptent); ptent = pte_modify(ptent, newprot); + /* +* Avoid taking write faults for pages we +* know to be dirty. +*/ + if (dirty_accountable pte_dirty(ptent)) + ptent = pte_mkwrite(ptent); + ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; } else { struct page *page; @@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } } } - - /* -* Avoid taking write faults for pages we know to be -* dirty. -*/ - if (dirty_accountable pte_dirty(ptent)) { - ptent = pte_mkwrite(ptent); - updated = true; - } - if (updated) pages++; - - /* Only !prot_numa always clears the pte */ - if (!prot_numa) - ptep_modify_prot_commit(mm, addr, pte, ptent); } else if (IS_ENABLED(CONFIG_MIGRATION) !pte_file(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev