[PATCH -V2 12/14] kvm: Add struct kvm arg to memslot APIs

2013-10-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We will use that in a later patch to find the kvm ops handler.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
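
As a rough illustration of where this is headed (not part of this patch; the
per-VM ops field shown below is only introduced in patch 13/14), the extra
struct kvm argument lets the powerpc memslot hooks reach a per-VM ops table:

/* Illustrative sketch only -- assumes the kvm->arch.kvm_ops field added later */
int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
			       unsigned long npages)
{
	/* with struct kvm available we can pick the right ops handler */
	return kvm->arch.kvm_ops->create_memslot(slot, npages);
}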
---
 arch/arm/kvm/arm.c |  5 +++--
 arch/ia64/kvm/kvm-ia64.c   |  5 +++--
 arch/mips/kvm/kvm_mips.c   |  5 +++--
 arch/powerpc/include/asm/kvm_ppc.h |  6 --
 arch/powerpc/kvm/book3s.c  |  4 ++--
 arch/powerpc/kvm/booke.c   |  4 ++--
 arch/powerpc/kvm/powerpc.c |  9 +
 arch/s390/kvm/kvm-s390.c   |  5 +++--
 arch/x86/kvm/x86.c |  5 +++--
 include/linux/kvm_host.h   |  5 +++--
 virt/kvm/kvm_main.c| 12 ++--
 11 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9c697db..e96c48f 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -152,12 +152,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct 
vm_fault *vmf)
return VM_FAULT_SIGBUS;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+   unsigned long npages)
 {
return 0;
 }
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index bdfd878..985bf80 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1550,12 +1550,13 @@ int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct 
vm_fault *vmf)
return VM_FAULT_SIGBUS;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+   unsigned long npages)
 {
return 0;
 }
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index a7b0445..73b3482 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -198,12 +198,13 @@ kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, 
unsigned long arg)
return -ENOIOCTLCMD;
 }
 
-void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
   struct kvm_memory_slot *dont)
 {
 }
 
-int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
+   unsigned long npages)
 {
return 0;
 }
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index c13f15d..20f4616 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -134,9 +134,11 @@ extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
 extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
 extern int kvmppc_core_init_vm(struct kvm *kvm);
 extern void kvmppc_core_destroy_vm(struct kvm *kvm);
-extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+extern void kvmppc_core_free_memslot(struct kvm *kvm,
+struct kvm_memory_slot *free,
 struct kvm_memory_slot *dont);
-extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+extern int kvmppc_core_create_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
  unsigned long npages);
 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 39d2994..130fe1d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -761,13 +761,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct 
kvm_dirty_log *log)
	return kvmppc_ops->get_dirty_log(kvm, log);
 }
 
-void kvmppc_core_free_memslot(struct kvm_memory_slot *free,
+void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
  struct kvm_memory_slot *dont)
 {
	kvmppc_ops->free_memslot(free, dont);
 }
 
-int kvmppc_core_create_memslot(struct kvm_memory_slot *slot,
+int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
   unsigned long npages)
 {
	return kvmppc_ops->create_memslot(slot, npages);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 1769354..cb2d986 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1662,12 +1662,12 @@ int

[PATCH -V2 13/14] kvm: powerpc: book3s: Allow the HV and PR selection per virtual machine

2013-10-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This moves the kvmppc_ops callbacks to be a per-VM entity. This
enables us to select HV or PR mode when creating a VM. We also
allow both the kvm-hv and kvm-pr kernel modules to be loaded. To
achieve this we move /dev/kvm ownership to the kvm.ko module. Depending on
which KVM mode we select during VM creation, we take a reference
count on the respective module.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
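
A rough sketch of the per-VM selection and module pinning described above
(helper name and error handling are illustrative; the real wiring is in the
powerpc.c and book3s.c hunks of this patch):

/* Illustrative sketch, not the actual patch code */
static int kvmppc_pick_ops(struct kvm *kvm, unsigned long type)
{
	struct kvmppc_ops *ops;

	if (type == KVM_VM_PPC_PR)
		ops = kvmppc_pr_ops;
	else if (type == KVM_VM_PPC_HV)
		ops = kvmppc_hv_ops;
	else	/* no type given: prefer the faster HV mode if it is loaded */
		ops = kvmppc_hv_ops ? kvmppc_hv_ops : kvmppc_pr_ops;

	if (!ops)
		return -EINVAL;
	/* pin kvm-hv.ko or kvm-pr.ko for the lifetime of this VM */
	if (!try_module_get(ops->owner))
		return -ENOENT;
	kvm->arch.kvm_ops = ops;
	return 0;
}

The matching module_put(kvm->arch.kvm_ops->owner) would then be done when the
VM is destroyed.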
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/include/asm/kvm_ppc.h  |  7 +--
 arch/powerpc/kvm/44x.c  |  7 ++-
 arch/powerpc/kvm/book3s.c   | 89 +
 arch/powerpc/kvm/book3s.h   |  2 +
 arch/powerpc/kvm/book3s_hv.c| 18 
 arch/powerpc/kvm/book3s_pr.c| 25 +++
 arch/powerpc/kvm/book3s_xics.c  |  2 +-
 arch/powerpc/kvm/booke.c| 22 -
 arch/powerpc/kvm/e500.c |  8 +++-
 arch/powerpc/kvm/e500mc.c   |  6 ++-
 arch/powerpc/kvm/emulate.c  | 11 ++---
 arch/powerpc/kvm/powerpc.c  | 76 ++-
 include/uapi/linux/kvm.h|  4 ++
 14 files changed, 187 insertions(+), 91 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index e86db97..c7a041d 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -275,6 +275,7 @@ struct kvm_arch {
 #ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
 #endif
+   struct kvmppc_ops *kvm_ops;
 };
 
 /*
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 20f4616..3069cf4 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -182,6 +182,7 @@ union kvmppc_one_reg {
 };
 
 struct kvmppc_ops {
+   struct module *owner;
bool is_hv_enabled;
int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
@@ -217,7 +218,6 @@ struct kvmppc_ops {
  unsigned long npages);
int (*init_vm)(struct kvm *kvm);
void (*destroy_vm)(struct kvm *kvm);
-   int (*check_processor_compat)(void);
int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info);
int (*emulate_op)(struct kvm_run *run, struct kvm_vcpu *vcpu,
  unsigned int inst, int *advance);
@@ -229,7 +229,8 @@ struct kvmppc_ops {
 
 };
 
-extern struct kvmppc_ops *kvmppc_ops;
+extern struct kvmppc_ops *kvmppc_hv_ops;
+extern struct kvmppc_ops *kvmppc_pr_ops;
 
 /*
  * Cuts out inst bits with ordering according to spec.
@@ -326,7 +327,7 @@ static inline void kvmppc_set_host_ipi(int cpu, u8 host_ipi)
 
 static inline void kvmppc_fast_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-   kvmppc_ops->fast_vcpu_kick(vcpu);
+   vcpu->kvm->arch.kvm_ops->fast_vcpu_kick(vcpu);
 }
 
 #else
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index a765bcd..93221e8 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -213,16 +213,19 @@ static int __init kvmppc_44x_init(void)
if (r)
goto err_out;
 
-   r = kvm_init(&kvm_ops_44x, sizeof(struct kvmppc_vcpu_44x),
-0, THIS_MODULE);
+   r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), 0, THIS_MODULE);
if (r)
goto err_out;
+   kvm_ops_44x.owner = THIS_MODULE;
+   kvmppc_pr_ops = &kvm_ops_44x;
+
 err_out:
return r;
 }
 
 static void __exit kvmppc_44x_exit(void)
 {
+   kvmppc_pr_ops = NULL;
kvmppc_booke_exit();
 }
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 130fe1d..ad8f6ed 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -34,6 +34,7 @@
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
 
+#include "book3s.h"
 #include "trace.h"
 
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -71,7 +72,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
-   if (!kvmppc_ops->is_hv_enabled)
+   if (!vcpu->kvm->arch.kvm_ops->is_hv_enabled)
	return to_book3s(vcpu)->hior;
return 0;
 }
@@ -79,7 +80,7 @@ static inline unsigned long kvmppc_interrupt_offset(struct 
kvm_vcpu *vcpu)
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
unsigned long pending_now, unsigned long old_pending)
 {
-   if (kvmppc_ops->is_hv_enabled)
+   if (vcpu->kvm->arch.kvm_ops->is_hv_enabled)
	return;
	if (pending_now)
	vcpu->arch.shared->int_pending = 1;
@@ -93,7 +94,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu 
*vcpu)
ulong crit_r1;
bool crit;
 
-   if (kvmppc_ops->is_hv_enabled)
+   if (vcpu->kvm->arch.kvm_ops

[PATCH -V2 14/14] kvm: powerpc: book3s: drop is_hv_enabled

2013-10-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

drop is_hv_enabled, because that should not be a callback property

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_ppc.h | 6 +-
 arch/powerpc/kvm/book3s.c  | 6 +++---
 arch/powerpc/kvm/book3s_hv.c   | 1 -
 arch/powerpc/kvm/book3s_pr.c   | 1 -
 arch/powerpc/kvm/book3s_xics.c | 2 +-
 arch/powerpc/kvm/powerpc.c | 2 +-
 6 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 3069cf4..c8317fb 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -183,7 +183,6 @@ union kvmppc_one_reg {
 
 struct kvmppc_ops {
struct module *owner;
-   bool is_hv_enabled;
int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id,
@@ -232,6 +231,11 @@ struct kvmppc_ops {
 extern struct kvmppc_ops *kvmppc_hv_ops;
 extern struct kvmppc_ops *kvmppc_pr_ops;
 
+static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
+{
+   return kvm->arch.kvm_ops == kvmppc_hv_ops;
+}
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index ad8f6ed..8912608 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -72,7 +72,7 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
 {
-   if (!vcpu->kvm->arch.kvm_ops->is_hv_enabled)
+   if (!is_kvmppc_hv_enabled(vcpu->kvm))
	return to_book3s(vcpu)->hior;
return 0;
 }
@@ -80,7 +80,7 @@ static inline unsigned long kvmppc_interrupt_offset(struct 
kvm_vcpu *vcpu)
 static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
unsigned long pending_now, unsigned long old_pending)
 {
-   if (vcpu->kvm->arch.kvm_ops->is_hv_enabled)
+   if (is_kvmppc_hv_enabled(vcpu->kvm))
	return;
	if (pending_now)
	vcpu->arch.shared->int_pending = 1;
@@ -94,7 +94,7 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu 
*vcpu)
ulong crit_r1;
bool crit;
 
-   if (vcpu->kvm->arch.kvm_ops->is_hv_enabled)
+   if (is_kvmppc_hv_enabled(vcpu->kvm))
	return false;
 
	crit_raw = vcpu->arch.shared->critical;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 31922d5..b5229eb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2160,7 +2160,6 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
 }
 
 static struct kvmppc_ops kvm_ops_hv = {
-   .is_hv_enabled = true,
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
.get_one_reg = kvmppc_get_one_reg_hv,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index fbd985f..df36cf2 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1526,7 +1526,6 @@ static long kvm_arch_vm_ioctl_pr(struct file *filp,
 }
 
 static struct kvmppc_ops kvm_ops_pr = {
-   .is_hv_enabled = false,
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr,
.get_one_reg = kvmppc_get_one_reg_pr,
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 76ef525..20d56ec 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -818,7 +818,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
}
 
/* Check for real mode returning too hard */
-   if (xics->real_mode && vcpu->kvm->arch.kvm_ops->is_hv_enabled)
+   if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm))
return kvmppc_xics_rm_complete(vcpu, req);
 
switch (req) {
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00a995a..058f9d6 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -200,7 +200,7 @@ int kvmppc_sanity_check(struct kvm_vcpu *vcpu)
goto out;
 
/* HV KVM can only do PAPR mode for now */
-   if (!vcpu->arch.papr_enabled && vcpu->kvm->arch.kvm_ops->is_hv_enabled)
+   if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm))
goto out;
 
 #ifdef CONFIG_KVM_BOOKE_HV
-- 
1.8.1.2



[PATCH -V2 06/14] kvm: powerpc: booke: Convert BOOKE to use kvmppc_ops callbacks

2013-10-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Make required changes to get BOOKE configs to build with
the introduction of kvmppc_ops callback

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 +--
 arch/powerpc/kvm/44x.c | 55 +++---
 arch/powerpc/kvm/44x_emulate.c |  8 +++---
 arch/powerpc/kvm/44x_tlb.c |  2 +-
 arch/powerpc/kvm/booke.c   | 47 +++-
 arch/powerpc/kvm/booke.h   | 24 +
 arch/powerpc/kvm/e500.c| 53 +---
 arch/powerpc/kvm/e500_emulate.c|  8 +++---
 arch/powerpc/kvm/e500_mmu.c|  2 +-
 arch/powerpc/kvm/e500mc.c  | 54 ++---
 10 files changed, 194 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 1d22b53..326033c 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -285,10 +285,10 @@ static inline u32 kvmppc_set_field(u64 inst, int msb, int 
lsb, int value)
__v;\
 })
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
-void kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
+int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
 
 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg);
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 2f5c6b6..a765bcd 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -31,13 +31,13 @@
 #include "44x_tlb.h"
 #include "booke.h"
 
-void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void kvmppc_core_vcpu_load_44x(struct kvm_vcpu *vcpu, int cpu)
 {
kvmppc_booke_vcpu_load(vcpu, cpu);
kvmppc_44x_tlb_load(vcpu);
 }
 
-void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_put_44x(struct kvm_vcpu *vcpu)
 {
kvmppc_44x_tlb_put(vcpu);
kvmppc_booke_vcpu_put(vcpu);
@@ -114,29 +114,32 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
return 0;
 }
 
-void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_get_sregs_44x(struct kvm_vcpu *vcpu,
+ struct kvm_sregs *sregs)
 {
-   kvmppc_get_sregs_ivor(vcpu, sregs);
+   return kvmppc_get_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
+static int kvmppc_core_set_sregs_44x(struct kvm_vcpu *vcpu,
+struct kvm_sregs *sregs)
 {
return kvmppc_set_sregs_ivor(vcpu, sregs);
 }
 
-int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
-   union kvmppc_one_reg *val)
+static int kvmppc_get_one_reg_44x(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
 {
return -EINVAL;
 }
 
-int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
-  union kvmppc_one_reg *val)
+static int kvmppc_set_one_reg_44x(struct kvm_vcpu *vcpu, u64 id,
+ union kvmppc_one_reg *val)
 {
return -EINVAL;
 }
 
-struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+static struct kvm_vcpu *kvmppc_core_vcpu_create_44x(struct kvm *kvm,
+   unsigned int id)
 {
struct kvmppc_vcpu_44x *vcpu_44x;
struct kvm_vcpu *vcpu;
@@ -167,7 +170,7 @@ out:
return ERR_PTR(err);
 }
 
-void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+static void kvmppc_core_vcpu_free_44x(struct kvm_vcpu *vcpu)
 {
struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 
@@ -176,24 +179,46 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
 }
 
-int kvmppc_core_init_vm(struct kvm *kvm)
+static int kvmppc_core_init_vm_44x(struct kvm *kvm)
 {
return 0;
 }
 
-void kvmppc_core_destroy_vm(struct kvm *kvm)
+static void kvmppc_core_destroy_vm_44x(struct kvm *kvm)
 {
 }
 
+static struct kvmppc_ops kvm_ops_44x = {
+   .get_sregs = kvmppc_core_get_sregs_44x,
+   .set_sregs = kvmppc_core_set_sregs_44x,
+   .get_one_reg = kvmppc_get_one_reg_44x,
+   .set_one_reg = kvmppc_set_one_reg_44x,
+   .vcpu_load   = kvmppc_core_vcpu_load_44x,
+   .vcpu_put= kvmppc_core_vcpu_put_44x,
+   .vcpu_create = kvmppc_core_vcpu_create_44x,
+   .vcpu_free   = kvmppc_core_vcpu_free_44x,
+   .mmu_destroy  = kvmppc_mmu_destroy_44x,
+   .init_vm = kvmppc_core_init_vm_44x,
+   .destroy_vm

[PATCH -V2 09/14] kvm: powerpc: book3s: pr: move PR related tracepoints to a separate header

2013-10-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch moves the PR-related tracepoints to a separate header. This
enables converting PR to a kernel module, which will be done in
later patches.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
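
The new trace_pr.h follows the usual pattern for a tracepoint header that a
module can instantiate itself; a minimal sketch of its skeleton is below
(guard and system names are assumptions, see the full file in the patch):

#if !defined(_TRACE_KVM_BOOK3S_PR_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_KVM_BOOK3S_PR_H

#include <linux/tracepoint.h>

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm_pr

/* ... the PR-only TRACE_EVENT() definitions moved out of trace.h ... */

#endif /* _TRACE_KVM_BOOK3S_PR_H */

/* This part must be outside protection */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE trace_pr
#include <trace/define_trace.h>

With that in place, defining CREATE_TRACE_POINTS before including trace_pr.h
in book3s_pr.c (as the hunk below does) generates the tracepoint bodies inside
the kvm-pr module rather than in the core kernel.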
---
 arch/powerpc/kvm/book3s_64_mmu_host.c |   2 +-
 arch/powerpc/kvm/book3s_mmu_hpte.c|   2 +-
 arch/powerpc/kvm/book3s_pr.c  |   4 +-
 arch/powerpc/kvm/trace.h  | 234 +--
 arch/powerpc/kvm/trace_pr.h   | 297 ++
 5 files changed, 309 insertions(+), 230 deletions(-)
 create mode 100644 arch/powerpc/kvm/trace_pr.h

diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c 
b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 819672c..0d513af 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -27,7 +27,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
-#include "trace.h"
+#include "trace_pr.h"
 
 #define PTE_SIZE 12
 
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c 
b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 6b79bfc..5a1ab12 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -28,7 +28,7 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 
-#include "trace.h"
+#include "trace_pr.h"
 
 #define PTE_SIZE   12
 
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index b6a525d..ca6c73d 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -42,7 +42,9 @@
 #include <linux/highmem.h>
 
 #include "book3s.h"
-#include "trace.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_pr.h"
 
 /* #define EXIT_DEBUG */
 /* #define DEBUG_EXT */
diff --git a/arch/powerpc/kvm/trace.h b/arch/powerpc/kvm/trace.h
index 9e8368e..80f252a 100644
--- a/arch/powerpc/kvm/trace.h
+++ b/arch/powerpc/kvm/trace.h
@@ -85,6 +85,12 @@ TRACE_EVENT(kvm_ppc_instr,
	{41, "HV_PRIV"}
 #endif
 
+#ifndef CONFIG_KVM_BOOK3S_PR_POSSIBLE
+/*
+ * For pr we define this in trace_pr.h since it pr can be built as
+ * a module
+ */
+
 TRACE_EVENT(kvm_exit,
TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu),
TP_ARGS(exit_nr, vcpu),
@@ -94,9 +100,6 @@ TRACE_EVENT(kvm_exit,
__field(unsigned long,  pc  )
__field(unsigned long,  msr )
__field(unsigned long,  dar )
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-   __field(unsigned long,  srr1)
-#endif
__field(unsigned long,  last_inst   )
),
 
@@ -105,9 +108,6 @@ TRACE_EVENT(kvm_exit,
	__entry->pc = kvmppc_get_pc(vcpu);
	__entry->dar	= kvmppc_get_fault_dar(vcpu);
	__entry->msr	= vcpu->arch.shared->msr;
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-	__entry->srr1	= vcpu->arch.shadow_srr1;
-#endif
	__entry->last_inst	= vcpu->arch.last_inst;
),
 
@@ -115,18 +115,12 @@ TRACE_EVENT(kvm_exit,
		" | pc=0x%lx"
		" | msr=0x%lx"
		" | dar=0x%lx"
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-		" | srr1=0x%lx"
-#endif
		" | last_inst=0x%lx"
		,
		__print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit),
		__entry->pc,
		__entry->msr,
		__entry->dar,
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-		__entry->srr1,
-#endif
		__entry->last_inst
)
 );
@@ -145,6 +139,7 @@ TRACE_EVENT(kvm_unmap_hva,
 
	TP_printk("unmap hva 0x%lx\n", __entry->hva)
 );
+#endif
 
 TRACE_EVENT(kvm_stlb_inval,
TP_PROTO(unsigned int stlb_index),
@@ -231,221 +226,6 @@ TRACE_EVENT(kvm_check_requests,
		__entry->cpu_nr, __entry->requests)
 );
 
-
-/*
- * Book3S trace points   *
- */
-
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-
-TRACE_EVENT(kvm_book3s_reenter,
-   TP_PROTO(int r, struct kvm_vcpu *vcpu),
-   TP_ARGS(r, vcpu),
-
-   TP_STRUCT__entry(
-   __field(unsigned int,   r   )
-   __field(unsigned long,  pc  )
-   ),
-
-   TP_fast_assign(
-		__entry->r	= r;
-		__entry->pc	= kvmppc_get_pc(vcpu);
-	),
-
-	TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc)
-);
-
-#ifdef CONFIG_PPC_BOOK3S_64
-
-TRACE_EVENT(kvm_book3s_64_mmu_map,
-   TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr,
-struct kvmppc_pte *orig_pte),
-   TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte),
-
-   TP_STRUCT__entry(
-   __field(unsigned char

[PATCH -V2 08/14] kvm: powerpc: book3s: Add is_hv_enabled to kvmppc_ops

2013-10-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This helps us identify whether we are running with hypervisor mode KVM
enabled. The change is needed so that we can have both HV and PR kvm
enabled in the same kernel.

If both HV and PR KVM are included, interrupts come in to the HV version
of the kvmppc_interrupt code, which then jumps to the PR handler,
renamed to kvmppc_interrupt_pr, if the guest is a PR guest.

Allowing both PR and HV in the same kernel required some changes to
kvm_dev_ioctl_check_extension(), since the values returned now can't
be selected with #ifdefs as much as previously. We look at is_hv_enabled
to return the right value when checking for capabilities. For capabilities that
are only provided by HV KVM, we return the HV value only if
is_hv_enabled is true. For capabilities provided by PR KVM but not HV,
we return the PR value only if is_hv_enabled is false.

NOTE: in a later patch we replace is_hv_enabled with a static inline
function comparing kvmppc_ops.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
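
A condensed sketch of the capability-selection pattern described above (the
capability numbers are placeholders; the real change is in the
kvm_dev_ioctl_check_extension() hunk of arch/powerpc/kvm/powerpc.c):

/* Illustrative only: hv_enabled mirrors kvmppc_ops->is_hv_enabled */
static int check_extension_sketch(long ext, bool hv_enabled)
{
	int r;

	switch (ext) {
	case 1001:		/* placeholder: capability backed only by HV KVM */
		r = hv_enabled;
		break;
	case 1002:		/* placeholder: capability backed only by PR KVM */
		r = !hv_enabled;
		break;
	default:		/* capabilities common to both modes */
		r = 1;
		break;
	}
	return r;
}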
---
 arch/powerpc/include/asm/kvm_book3s.h | 53 --
 arch/powerpc/include/asm/kvm_ppc.h|  5 ++--
 arch/powerpc/kvm/book3s.c | 44 
 arch/powerpc/kvm/book3s_hv.c  |  1 +
 arch/powerpc/kvm/book3s_pr.c  |  1 +
 arch/powerpc/kvm/book3s_xics.c|  2 +-
 arch/powerpc/kvm/powerpc.c| 54 +++
 7 files changed, 79 insertions(+), 81 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 315a5d6..4a594b7 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -301,59 +301,6 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu 
*vcpu)
	return vcpu->arch.fault_dar;
 }
 
-#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
-
-static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
-{
-   return to_book3s(vcpu)->hior;
-}
-
-static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
-   unsigned long pending_now, unsigned long old_pending)
-{
-   if (pending_now)
-   vcpu->arch.shared->int_pending = 1;
-   else if (old_pending)
-   vcpu->arch.shared->int_pending = 0;
-}
-
-static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
-{
-   ulong crit_raw = vcpu->arch.shared->critical;
-   ulong crit_r1 = kvmppc_get_gpr(vcpu, 1);
-   bool crit;
-
-   /* Truncate crit indicators in 32 bit mode */
-   if (!(vcpu->arch.shared->msr & MSR_SF)) {
-   crit_raw &= 0xffffffff;
-   crit_r1 &= 0xffffffff;
-   }
-
-   /* Critical section when crit == r1 */
-   crit = (crit_raw == crit_r1);
-   /* ... and we're in supervisor mode */
-   crit = crit && !(vcpu->arch.shared->msr & MSR_PR);
-
-   return crit;
-}
-#else /* CONFIG_KVM_BOOK3S_PR_POSSIBLE */
-
-static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
-{
-   return 0;
-}
-
-static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
-   unsigned long pending_now, unsigned long old_pending)
-{
-}
-
-static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu)
-{
-   return false;
-}
-#endif
-
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R30x113724FA
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 326033c..c13f15d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -180,6 +180,7 @@ union kvmppc_one_reg {
 };
 
 struct kvmppc_ops {
+   bool is_hv_enabled;
int (*get_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int (*set_sregs)(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs);
int (*get_one_reg)(struct kvm_vcpu *vcpu, u64 id,
@@ -309,10 +310,10 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned 
long addr)
 
 static inline u32 kvmppc_get_xics_latch(void)
 {
-   u32 xirr = get_paca()->kvm_hstate.saved_xirr;
+   u32 xirr;
+
+   xirr = get_paca()->kvm_hstate.saved_xirr;
	get_paca()->kvm_hstate.saved_xirr = 0;
-
return xirr;
 }
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 784a1d5..493aff7 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -69,6 +69,50 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 {
 }
 
+static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu)
+{
+   if (!kvmppc_ops->is_hv_enabled)
+   return to_book3s(vcpu)->hior;
+   return 0;
+}
+
+static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu,
+   unsigned long pending_now, unsigned long old_pending)
+{
+   if (kvmppc_ops->is_hv_enabled

Re: [PATCH -V2 1/2] powerpc: Use HPTE constants when updating hpte bits

2013-10-07 Thread Aneesh Kumar K.V
Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes:

 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

 Even though we have the same value for Linux PTE bits and hash PTE bits,
 use the hash PTE bits when updating the hash PTE.

...
 diff --git a/arch/powerpc/platforms/pseries/lpar.c 
 b/arch/powerpc/platforms/pseries/lpar.c
 index 02d6e21..78f2c59 100644
 --- a/arch/powerpc/platforms/pseries/lpar.c
 +++ b/arch/powerpc/platforms/pseries/lpar.c
 @@ -146,8 +146,9 @@ static long pSeries_lpar_hpte_insert(unsigned long 
 hpte_group,
   flags = 0;

   /* Make pHyp happy */
  - if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
  - hpte_r &= ~_PAGE_COHERENT;
  + if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
  + hpte_r &= ~HPTE_R_M;
 +

-   if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
+   if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))

The if-condition part of the change is already done upstream. Since V2
was generated after moving the V1 patch to a different tree (the kvm tree),
it ended up with that additional change. Let me know if you want me to
respin the patch.

   if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
   flags |= H_COALESCE_CAND;


-aneesh



Re: [PATCH -V2 00/14] Allow PR and HV KVM to coexist in one kernel

2013-10-15 Thread Aneesh Kumar K.V

Hi Alex,

Any update on this ?

-aneesh

Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes:

 Hi All,

 This patch series supports enabling HV and PR KVM together in the same kernel.
 We extend the machine property with a new property kvm_type. A value of HV will
 force HV KVM and a value of PR will force PR KVM. If we don't specify kvm_type
 we will select the fastest KVM mode, i.e. HV if that is supported, otherwise PR.
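
For reference, a sketch of how kvm_type reaches the kernel: QEMU passes the
chosen type as the argument of the KVM_CREATE_VM ioctl, and the series adds
PPC type values to the uapi header (the numeric values below are assumptions;
see the include/uapi/linux/kvm.h hunk of patch 13/14):

/* User-space sketch (error handling omitted); values assumed, not verified */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#ifndef KVM_VM_PPC_HV
#define KVM_VM_PPC_HV	1	/* assumed value */
#define KVM_VM_PPC_PR	2	/* assumed value */
#endif

int create_ppc_vm(int want_hv)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);

	/* kvm.ko owns /dev/kvm; the type argument selects kvm-hv or kvm-pr */
	return ioctl(kvm_fd, KVM_CREATE_VM,
		     want_hv ? KVM_VM_PPC_HV : KVM_VM_PPC_PR);
}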

 With Qemu command line having

  -machine pseries,accel=kvm,kvm_type=HV

 [root@llmp24l02 qemu]# bash ../qemu
 failed to initialize KVM: Invalid argument
 [root@llmp24l02 qemu]# modprobe kvm-pr
 [root@llmp24l02 qemu]# bash ../qemu
 failed to initialize KVM: Invalid argument
 [root@llmp24l02 qemu]# modprobe  kvm-hv
 [root@llmp24l02 qemu]# bash ../qemu

 now with

  -machine pseries,accel=kvm,kvm_type=PR

 [root@llmp24l02 qemu]# rmmod kvm-pr
 [root@llmp24l02 qemu]# bash ../qemu
 failed to initialize KVM: Invalid argument
 [root@llmp24l02 qemu]#
 [root@llmp24l02 qemu]# modprobe kvm-pr
 [root@llmp24l02 qemu]# bash ../qemu

 Changes from V1:
 * Build fixes for BOOKE (only compile tested)
 * Address review feedback

 -aneesh




Re: [PATCH -V2 11/14] kvm: powerpc: book3s: Support building HV and PR KVM as module

2013-10-16 Thread Aneesh Kumar K.V

The below patch fixes a compile issue with KVM_XICS. Please fold it in.

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index cef3de9..c3c832b 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -840,6 +840,7 @@ int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req)
 
return rc;
 }
+EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 
 /* -- Initialisation code etc. -- */




Re: [PATCH] powerpc: Fix 64K page size support for PPC44x

2013-10-16 Thread Aneesh Kumar K.V
Alistair Popple alist...@popple.id.au writes:

 PPC44x supports page sizes other than 4K however when 64K page sizes
 are selected compilation fails. This is due to a change in the
 definition of pgtable_t introduced by the following patch:

 commit 5c1f6ee9a31cbdac90bbb8ae1ba4475031ac74b4
 Author: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 powerpc: Reduce PTE table memory wastage

 The above patch only implements the new layout for PPC64 so it doesn't
 compile for PPC32 with a 64K page size. Ideally we should implement
 the same layout for PPC32 however for the meantime this patch reverts
 the definition of pgtable_t for PPC32.

 Signed-off-by: Alistair Popple alist...@popple.id.au
 ---
  arch/powerpc/include/asm/page.h |2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
 index b9f4262..b142d58 100644
 --- a/arch/powerpc/include/asm/page.h
 +++ b/arch/powerpc/include/asm/page.h
 @@ -403,7 +403,7 @@ void arch_free_page(struct page *page, int order);

  struct vm_area_struct;

 -#ifdef CONFIG_PPC_64K_PAGES
 +#if defined(CONFIG_PPC_64K_PAGES) && defined(PPC64)
   ^^^ CONFIG_PPC64 ?   
  

  typedef pte_t *pgtable_t;
  #else
  typedef struct page *pgtable_t;
 -- 
 1.7.10.4



Re: [PATCH -V2 06/14] kvm: powerpc: booke: Convert BOOKE to use kvmppc_ops callbacks

2013-10-17 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 07.10.2013, at 18:47, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com 
 wrote:

 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 Make required changes to get BOOKE configs to build with
 the introduction of kvmppc_ops callback
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

 This can not be a separate commit, as you're breaking bisectability for booke 
 this way.


The only reason I split that into two was to make review easy. But yes
when merging to your tree we should fold.

 I've squashed this in with the previous commit.


Ok. 

-aneesh



[RFC PATCH 0/9] powerpc: mm: Numa faults support for ppc64

2013-10-22 Thread Aneesh Kumar K.V
Hi,

This patch series adds support for NUMA faults on the ppc64 architecture. We
steal the _PAGE_COHERENCE bit and use that for indicating _PAGE_NUMA. We clear
the _PAGE_PRESENT bit and also invalidate the hpte entry on setting _PAGE_NUMA.
The next fault on that page will be considered a numa fault.
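
A condensed sketch of that life cycle (pte_mknuma/pte_mknonnuma come from the
later patches in this series; the flush step stands in for whatever hpte
invalidation the arch hook performs):

/* Illustrative sketch, not actual series code */
static void mark_pte_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	pte_t entry = *ptep;

	/* set _PAGE_NUMA (the old _PAGE_COHERENCE bit), clear _PAGE_PRESENT */
	entry = pte_mknuma(entry);
	set_pte_at(mm, addr, ptep, entry);
	/*
	 * The hash PTE for this address must also be invalidated here, so the
	 * next access misses in the hash table and is reported as a NUMA
	 * hinting fault; do_numa_page() then uses pte_mknonnuma() to make the
	 * page accessible again after recording the fault.
	 */
}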


NOTE:

Issue:
I am finding large lock contention on page_table_lock with this series on a
95-cpu, 4-node box with the autonuma benchmark.

I will be out on vacation till Nov 6 without email access, hence I will not be
able to respond to review feedback till then.


lock_stat version 0.3
---
  class name                  con-bounces   contentions   waittime-min   waittime-max   waittime-total   acq-bounces   acquisitions   holdtime-min   holdtime-max   holdtime-total
  -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

  (&mm->page_table_lock)->rlock:   713531791   719610919   0.09   3038193.19   357867523236.3   729709189   7500401620.0   236991.36   1159646899.68
  -----------------------------
  (&mm->page_table_lock)->rlock          1   [c0218880] .anon_vma_prepare+0xb0/0x1e0
  (&mm->page_table_lock)->rlock         93   [c0207ebc] .do_numa_page+0x4c/0x190
  (&mm->page_table_lock)->rlock     301678   [c02139d4] .change_protection+0x1d4/0x560
  (&mm->page_table_lock)->rlock     244524   [c0213be8] .change_protection+0x3e8/0x560
  -----------------------------
  (&mm->page_table_lock)->rlock          1   [c0206a38] .__do_fault+0x198/0x6b0
  (&mm->page_table_lock)->rlock     704163   [c02139d4] .change_protection+0x1d4/0x560
  (&mm->page_table_lock)->rlock     207227   [c0213be8] .change_protection+0x3e8/0x560
  (&mm->page_table_lock)->rlock         95   [c0207ebc] .do_numa_page+0x4c/0x190
 
-aneesh



[RFC PATCH 4/9] powerpc: mm: Only check for _PAGE_PRESENT in set_pte/pmd functions

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We want to make sure we don't use these functions when updating a pte
or pmd entry that has a valid hpte entry, because these functions
don't invalidate them. So limit the check to the _PAGE_PRESENT bit.
The numa fault core changes use these functions for updating _PAGE_NUMA bits.
That should be OK because when _PAGE_NUMA is set we can be sure that
hpte entries are not present.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/mm/pgtable.c| 2 +-
 arch/powerpc/mm/pgtable_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index edda589..10c09b6 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -187,7 +187,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
pte_t pte)
 {
 #ifdef CONFIG_DEBUG_VM
-   WARN_ON(pte_present(*ptep));
+   WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
 #endif
	/* Note: mm->context.id might not yet have been assigned as
 * this context might not have been activated yet when this
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 536eec72..56b7586 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -686,7 +686,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
 {
 #ifdef CONFIG_DEBUG_VM
-   WARN_ON(!pmd_none(*pmdp));
+   WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
	assert_spin_locked(&mm->page_table_lock);
WARN_ON(!pmd_trans_huge(pmd));
 #endif
-- 
1.8.3.2



[RFC PATCH 1/9] powerpc: Use HPTE constants when updating hpte bits

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Even though we have the same value for Linux PTE bits and hash PTE bits,
use the hash PTE bits when updating the hash PTE.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/cell/beat_htab.c | 4 ++--
 arch/powerpc/platforms/pseries/lpar.c   | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/cell/beat_htab.c 
b/arch/powerpc/platforms/cell/beat_htab.c
index c34ee4e..d4d245c 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -111,7 +111,7 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group,
DBG_LOW( hpte_v=%016lx, hpte_r=%016lx\n, hpte_v, hpte_r);
 
	if (rflags & _PAGE_NO_CACHE)
-   hpte_r &= ~_PAGE_COHERENT;
+   hpte_r &= ~HPTE_R_M;
 
	raw_spin_lock(&beat_htab_lock);
lpar_rc = beat_read_mask(hpte_group);
@@ -337,7 +337,7 @@ static long beat_lpar_hpte_insert_v3(unsigned long 
hpte_group,
DBG_LOW( hpte_v=%016lx, hpte_r=%016lx\n, hpte_v, hpte_r);
 
	if (rflags & _PAGE_NO_CACHE)
-   hpte_r &= ~_PAGE_COHERENT;
+   hpte_r &= ~HPTE_R_M;
 
/* insert into not-volted entry */
lpar_rc = beat_insert_htab_entry3(0, hpte_group, hpte_v, hpte_r,
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 356bc75..c8fbef23 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -153,7 +153,8 @@ static long pSeries_lpar_hpte_insert(unsigned long 
hpte_group,
 
/* Make pHyp happy */
	if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
-   hpte_r &= ~_PAGE_COHERENT;
+   hpte_r &= ~HPTE_R_M;
+
	if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
flags |= H_COALESCE_CAND;
 
-- 
1.8.3.2



[RFC PATCH 3/9] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE.
On archs like ppc64 that don't use _PAGE_PROTNONE and also have
a separate page table outside the Linux page table, we just need to
make sure that when calling change_prot_numa we flush the
hardware page table entry so that the next page access results in a numa
fault.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 include/linux/mm.h | 3 ---
 mm/mempolicy.c | 9 -
 2 files changed, 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8b6e55e..5ab0e22 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1668,11 +1668,8 @@ static inline pgprot_t vm_get_page_prot(unsigned long 
vm_flags)
 }
 #endif
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
-#endif
-
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0472964..efb4300 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -612,7 +612,6 @@ static inline int queue_pages_pgd_range(struct 
vm_area_struct *vma,
return 0;
 }
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 /*
  * This is used to mark a range of virtual addresses to be inaccessible.
  * These are later cleared by a NUMA hinting fault. Depending on these
@@ -626,7 +625,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
int nr_updated;
-   BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 
	nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
if (nr_updated)
@@ -634,13 +632,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 
return nr_updated;
 }
-#else
-static unsigned long change_prot_numa(struct vm_area_struct *vma,
-   unsigned long addr, unsigned long end)
-{
-   return 0;
-}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 
 /*
  * Walk through page tables and collect pages to be migrated.
-- 
1.8.3.2



[RFC PATCH 8/9] powerpc: mm: Support setting _PAGE_NUMA bit on pmd entry which are pointer to PTE page

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 46db094..f828944 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -150,8 +150,22 @@
 
 #define pmd_set(pmdp, pmdval)  (pmd_val(*(pmdp)) = (pmdval))
 #define pmd_none(pmd)  (!pmd_val(pmd))
-#define pmd_bad(pmd)	(!is_kernel_addr(pmd_val(pmd)) \
-			 || (pmd_val(pmd) & PMD_BAD_BITS))
+
+static inline int pmd_bad(pmd_t pmd)
+{
+#ifdef CONFIG_NUMA_BALANCING
+   /*
+* For numa balancing we can have this set
+*/
+   if (pmd_val(pmd) & _PAGE_NUMA)
+   return 0;
+#endif
+   if (!is_kernel_addr(pmd_val(pmd)) ||
+   (pmd_val(pmd) & PMD_BAD_BITS))
+   return 1;
+   return 0;
+}
+
 #define pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define pmd_clear(pmdp)	(pmd_val(*(pmdp)) = 0)
 #define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
-- 
1.8.3.2



[RFC PATCH 9/9] powerpc: mm: Enable numa faulting for hugepages

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Provide numa related functions for updating pmd entries.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 67ea8fb..aa3add7 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -95,19 +95,19 @@ static inline void change_pmd_protnuma(struct mm_struct 
*mm, unsigned long addr,
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
-   return 0;
+   return pte_numa(pmd_pte(pmd));
 }
 
 #define pmd_mknonnuma pmd_mknonnuma
 static inline pmd_t pmd_mknonnuma(pmd_t pmd)
 {
-   return pmd;
+   return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
 }
 
 #define pmd_mknuma pmd_mknuma
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
-   return pmd;
+   return pte_pmd(pte_mknuma(pmd_pte(pmd)));
 }
 
 # else
-- 
1.8.3.2



[RFC PATCH 7/9] mm: numafaults: Use change_pmd_protnuma for updating _PAGE_NUMA for regular pmds

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Archs like ppc64 have a different layout for pmd entries that point to a PTE
page. Hence add a separate function for modifying them.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h | 17 +
 include/asm-generic/pgtable.h  | 20 
 mm/memory.c|  2 +-
 mm/mprotect.c  | 24 ++--
 4 files changed, 44 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 9d87125..67ea8fb 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -75,6 +75,23 @@ static inline pte_t pte_mknuma(pte_t pte)
return pte;
 }
 
+#define change_pmd_protnuma change_pmd_protnuma
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long 
addr,
+  pmd_t *pmdp, int prot_numa)
+{
+   /*
+* We don't track the _PAGE_PRESENT bit here
+*/
+   unsigned long pmd_val;
+   pmd_val = pmd_val(*pmdp);
+   if (prot_numa)
+   pmd_val |= _PAGE_NUMA;
+   else
+   pmd_val &= ~_PAGE_NUMA;
+   pmd_set(pmdp, pmd_val | _PAGE_NUMA);
+}
+
+
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index f330d28..568a8c4 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -697,6 +697,18 @@ static inline pmd_t pmd_mknuma(pmd_t pmd)
return pmd_clear_flags(pmd, _PAGE_PRESENT);
 }
 #endif
+
+#ifndef change_pmd_protnuma
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long 
addr,
+  pmd_t *pmd, int prot_numa)
+{
+   if (prot_numa)
+   set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+   else
+   set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknonnuma(*pmd));
+}
+
+#endif
 #else
 extern int pte_numa(pte_t pte);
 extern int pmd_numa(pmd_t pmd);
@@ -704,6 +716,8 @@ extern pte_t pte_mknonnuma(pte_t pte);
 extern pmd_t pmd_mknonnuma(pmd_t pmd);
 extern pte_t pte_mknuma(pte_t pte);
 extern pmd_t pmd_mknuma(pmd_t pmd);
+extern void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+   pmd_t *pmd, int prot_numa);
 #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 #else
 static inline int pmd_numa(pmd_t pmd)
@@ -735,6 +749,12 @@ static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
return pmd;
 }
+
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long 
addr,
+  pmd_t *pmd, int prot_numa)
+{
+   BUG();
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* CONFIG_MMU */
diff --git a/mm/memory.c b/mm/memory.c
index ca00039..e930e50 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3605,7 +3605,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
	spin_lock(&mm->page_table_lock);
pmd = *pmdp;
if (pmd_numa(pmd)) {
-   set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+   change_pmd_protnuma(mm, _addr, pmdp, 0);
numa = true;
}
	spin_unlock(&mm->page_table_lock);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4..88de575 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -112,22 +112,6 @@ static unsigned long change_pte_range(struct 
vm_area_struct *vma, pmd_t *pmd,
return pages;
 }
 
-#ifdef CONFIG_NUMA_BALANCING
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long 
addr,
-  pmd_t *pmd)
-{
-   spin_lock(&mm->page_table_lock);
-   set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
-   spin_unlock(&mm->page_table_lock);
-}
-#else
-static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long 
addr,
-  pmd_t *pmd)
-{
-   BUG();
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -161,8 +145,12 @@ static inline unsigned long change_pmd_range(struct 
vm_area_struct *vma,
 * node. This allows a regular PMD to be handled as one fault
 * and effectively batches the taking of the PTL
 */
-   if (prot_numa && all_same_node)
-   change_pmd_protnuma(vma->vm_mm, addr, pmd);
+   if (prot_numa && all_same_node) {
+   spin_lock(&vma->vm_mm->page_table_lock);
+   change_pmd_protnuma(vma->vm_mm, addr, pmd, 1);
+   spin_unlock(&vma->vm_mm->page_table_lock);
+
+   }
} while (pmd

[RFC PATCH 6/9] powerpc: mm: book3s: Disable hugepaged pmd format for book3s

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

After commit e2b3d202d1dba8f3546ed28224ce485bc50010be we have the
below possible formats for pmd entry

(1) invalid (all zeroes)
(2) pointer to next table, as normal; bottom 6 bits == 0
(3) leaf pte for huge page, bottom two bits != 00
(4) hugepd pointer, bottom two bits == 00, next 4 bits indicate size of table

On book3s we don't really use (4).  For NUMA balancing we need to
tag pmd entries that are pointers to the next table with _PAGE_NUMA for
performance reasons (9532fec118d485ea37ab6e3ea372d68cd8b4cd0d). This
patch enables that by disabling hugepd support for book3s if
NUMA_BALANCING is enabled. We ideally want to get rid of the hugepd pointer
completely.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/page.h | 11 +++
 arch/powerpc/mm/hugetlbpage.c   |  8 +++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index b9f4262..791ab56 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -369,11 +369,22 @@ typedef struct { signed long pd; } hugepd_t;
 #ifdef CONFIG_PPC_BOOK3S_64
 static inline int hugepd_ok(hugepd_t hpd)
 {
+#ifdef CONFIG_NUMA_BALANCING
+   /*
+* In order to enable batch handling of pte numa faults, Numa balancing
+* code use the _PAGE_NUMA bit even on pmd that is pointing to PTE PAGE.
+* 9532fec118d485ea37ab6e3ea372d68cd8b4cd0d. After commit
+* e2b3d202d1dba8f3546ed28224ce485bc50010be we really don't need to
+* support hugepd for ppc64.
+*/
+   return 0;
+#else
/*
 * hugepd pointer, bottom two bits == 00 and next 4 bits
 * indicate size of table
 */
	return (((hpd.pd & 0x3) == 0x0) && ((hpd.pd & HUGEPD_SHIFT_MASK) != 0));
+#endif
 }
 #else
 static inline int hugepd_ok(hugepd_t hpd)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index d67db4b..71bd214 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -235,8 +235,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long 
addr, unsigned long sz
if (!hpdp)
return NULL;
 
+#ifdef CONFIG_NUMA_BALANCING
+   /*
+* We cannot support hugepd format with numa balancing support
+* enabled.
+*/
+   return NULL;
+#endif
	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));
-
	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, 
pshift))
return NULL;
 
-- 
1.8.3.2



[RFC PATCH 5/9] powerpc: mm: book3s: Enable _PAGE_NUMA for book3s

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We steal the _PAGE_COHERENCE bit and use that for indicating NUMA ptes.
This patch still disables the numa hinting using pmd entries. That
require further changes to pmd entry format which is done in later
patches.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h | 66 +-
 arch/powerpc/include/asm/pte-hash64.h  |  6 
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 7d6eacf..9d87125 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -3,6 +3,7 @@
 #ifdef __KERNEL__
 
 #ifndef __ASSEMBLY__
+#include <linux/mmdebug.h>
 #include <asm/processor.h>		/* For TASK_SIZE */
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -33,10 +34,73 @@ static inline int pte_dirty(pte_t pte)	{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)	{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_file(pte_t pte)	{ return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
-static inline int pte_present(pte_t pte)	{ return pte_val(pte) & _PAGE_PRESENT; }
 static inline int pte_none(pte_t pte)	{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
 
+#ifdef CONFIG_NUMA_BALANCING
+
+static inline int pte_present(pte_t pte)
+{
+   return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+}
+
+#define pte_numa pte_numa
+static inline int pte_numa(pte_t pte)
+{
+   return (pte_val(pte) &
+   (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+
+#define pte_mknonnuma pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+   pte_val(pte) &= ~_PAGE_NUMA;
+   pte_val(pte) |=  _PAGE_PRESENT | _PAGE_ACCESSED;
+   return pte;
+}
+
+#define pte_mknuma pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+   /*
+* We should not set _PAGE_NUMA on non present ptes. Also clear the
+* present bit so that hash_page will return 1 and we collect this
+* as numa fault.
+*/
+   if (pte_present(pte)) {
+   pte_val(pte) |= _PAGE_NUMA;
+   pte_val(pte) &= ~_PAGE_PRESENT;
+   } else
+   VM_BUG_ON(1);
+   return pte;
+}
+
+#define pmd_numa pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+   return 0;
+}
+
+#define pmd_mknonnuma pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+   return pmd;
+}
+
+#define pmd_mknuma pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+   return pmd;
+}
+
+# else
+
+static inline int pte_present(pte_t pte)
+{
+   return pte_val(pte) & _PAGE_PRESENT;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
  *
diff --git a/arch/powerpc/include/asm/pte-hash64.h 
b/arch/powerpc/include/asm/pte-hash64.h
index 55aea0c..2505d8e 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -27,6 +27,12 @@
 #define _PAGE_RW   0x0200 /* software: user write access allowed */
 #define _PAGE_BUSY 0x0800 /* software: PTE  hash are busy */
 
+/*
+ * Used for tracking numa faults
+ */
+#define _PAGE_NUMA 0x0010 /* Gather numa placement stats */
+
+
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW	(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
 #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 6704e2e..c9d6223 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -72,6 +72,7 @@ config PPC_BOOK3S_64
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+   select ARCH_SUPPORTS_NUMA_BALANCING
 
 config PPC_BOOK3E_64
bool Embedded processors
-- 
1.8.3.2



[RFC PATCH 2/9] powerpc: Free up _PAGE_COHERENCE for numa fault use later

2013-10-22 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Set memory coherence always on hash64 config. If
a platform cannot have memory coherence always set, it
can infer that from _PAGE_NO_CACHE and _PAGE_WRITETHRU,
like in lpar. So we don't really need a separate bit
for tracking _PAGE_COHERENCE.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pte-hash64.h |  2 +-
 arch/powerpc/mm/hash_low_64.S | 15 ---
 arch/powerpc/mm/hash_utils_64.c   |  7 ---
 arch/powerpc/mm/hugepage-hash64.c |  6 +-
 arch/powerpc/mm/hugetlbpage-hash64.c  |  4 
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/pte-hash64.h 
b/arch/powerpc/include/asm/pte-hash64.h
index 0419eeb..55aea0c 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -19,7 +19,7 @@
 #define _PAGE_FILE 0x0002 /* (!present only) software: pte holds 
file offset */
 #define _PAGE_EXEC 0x0004 /* No execute on POWER4 and newer (we 
invert) */
 #define _PAGE_GUARDED  0x0008
-#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP 
systems) */
+/* We can derive Memory coherence from _PAGE_NO_CACHE */
 #define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */
 #define _PAGE_WRITETHRU0x0040 /* W: cache write-through */
 #define _PAGE_DIRTY0x0080 /* C: page changed */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index d3cbda6..1136d26 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
	andc	r0,r30,r0		/* r0 = pte & ~r0 */
	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-   ori r3,r3,HPTE_R_C  /* Always add C bit for perf. */
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   ori r3,r3,HPTE_R_C | HPTE_R_M
 
/* We eventually do the icache sync here (maybe inline that
 * code rather than call a C function...) 
@@ -457,7 +460,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
	and	r0,r0,r4		/* _PAGE_RW & _PAGE_DIRTY ->r0 bit 30*/
	andc	r0,r3,r0		/* r0 = pte & ~r0 */
	rlwimi	r3,r0,32-1,31,31	/* Insert result into PP lsb */
-   ori r3,r3,HPTE_R_C  /* Always add C bit for perf. */
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   ori r3,r3,HPTE_R_C | HPTE_R_M
 
/* We eventually do the icache sync here (maybe inline that
 * code rather than call a C function...)
@@ -795,7 +801,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4/* _PAGE_RW  _PAGE_DIRTY -r0 bit 30*/
andcr0,r30,r0   /* r0 = pte  ~r0 */
rlwimi  r3,r0,32-1,31,31/* Insert result into PP lsb */
-   ori r3,r3,HPTE_R_C  /* Always add C bit for perf. */
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   ori r3,r3,HPTE_R_C | HPTE_R_M
 
/* We eventually do the icache sync here (maybe inline that
 * code rather than call a C function...)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index bde8b55..fb176e9 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long 
pteflags)
 if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
 (pteflags & _PAGE_DIRTY)))
rflags |= 1;
-
-   /* Always add C */
-   return rflags | HPTE_R_C;
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   return rflags | HPTE_R_C | HPTE_R_M;
 }
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
diff --git a/arch/powerpc/mm/hugepage-hash64.c 
b/arch/powerpc/mm/hugepage-hash64.c
index 34de9e0..826893f 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -127,7 +127,11 @@ repeat:
 
/* Add in WIMG bits */
 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
- _PAGE_COHERENT | _PAGE_GUARDED));
+ _PAGE_GUARDED));
+   /*
+* enable the memory coherence always
+*/
+   rflags |= HPTE_R_M;
 
/* Insert into the hash table, primary slot */
slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c 
b/arch/powerpc/mm/hugetlbpage

[PATCH] powerpc: book3s: kvm: Don't abuse host r2 in exit path

2013-11-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We don't use PACATOC for PR. Avoid updating HOST_R2 in PR
KVM mode when both HV and PR are enabled in the kernel. Without this we
get the below crash:

(qemu)
Unable to handle kernel paging request for data at address 0x8310
Faulting instruction address: 0xc001d5a4
cpu 0x2: Vector: 300 (Data Access) at [c001dc53aef0]
pc: c001d5a4: .vtime_delta.isra.1+0x34/0x1d0
lr: c001d760: .vtime_account_system+0x20/0x60
sp: c001dc53b170
   msr: 80009032
   dar: 8310
 dsisr: 4000
  current = 0xc001d76c62d0
  paca= 0xcfef1100   softe: 0irq_happened: 0x01
pid   = 4472, comm = qemu-system-ppc
enter ? for help
[c001dc53b200] c001d760 .vtime_account_system+0x20/0x60
[c001dc53b290] c008d050 .kvmppc_handle_exit_pr+0x60/0xa50
[c001dc53b340] c008f51c kvm_start_lightweight+0xb4/0xc4
[c001dc53b510] c008cdf0 .kvmppc_vcpu_run_pr+0x150/0x2e0
[c001dc53b9e0] c008341c .kvmppc_vcpu_run+0x2c/0x40
[c001dc53ba50] c0080af4 .kvm_arch_vcpu_ioctl_run+0x54/0x1b0
[c001dc53bae0] c007b4c8 .kvm_vcpu_ioctl+0x478/0x730
[c001dc53bca0] c02140cc .do_vfs_ioctl+0x4ac/0x770
[c001dc53bd80] c02143e8 .SyS_ioctl+0x58/0xb0
[c001dc53be30] c0009e58 syscall_exit+0x0/0x98
--- Exception: c00 (System Call) at 1f960160
SP (1ecbe3c0) is in userspace

These changes were originally part of
http://mid.gmane.org/20130806042205.gr19...@iris.ozlabs.ibm.com

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
 arch/powerpc/kernel/asm-offsets.c | 1 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 7 +++
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 0bd9348..69fe837 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -79,6 +79,7 @@ struct kvmppc_host_state {
ulong vmhandler;
ulong scratch0;
ulong scratch1;
+   ulong scratch2;
u8 in_guest;
u8 restore_hid5;
u8 napping;
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 8e6ede6..841a4c8 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -583,6 +583,7 @@ int main(void)
HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
+   HSTATE_FIELD(HSTATE_SCRATCH2, scratch2);
HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
HSTATE_FIELD(HSTATE_NAPPING, napping);
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 339aa5e..16f7654 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -750,15 +750,14 @@ kvmppc_interrupt_hv:
 * guest CR, R12 saved in shadow VCPU SCRATCH1/0
 * guest R13 saved in SPRN_SCRATCH0
 */
-   /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
-   std r9, HSTATE_HOST_R2(r13)
+   std r9, HSTATE_SCRATCH2(r13)
 
lbz r9, HSTATE_IN_GUEST(r13)
cmpwi   r9, KVM_GUEST_MODE_HOST_HV
beq kvmppc_bad_host_intr
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
cmpwi   r9, KVM_GUEST_MODE_GUEST
-   ld  r9, HSTATE_HOST_R2(r13)
+   ld  r9, HSTATE_SCRATCH2(r13)
beq kvmppc_interrupt_pr
 #endif
/* We're now back in the host but in guest MMU context */
@@ -778,7 +777,7 @@ kvmppc_interrupt_hv:
std r6, VCPU_GPR(R6)(r9)
std r7, VCPU_GPR(R7)(r9)
std r8, VCPU_GPR(R8)(r9)
-   ld  r0, HSTATE_HOST_R2(r13)
+   ld  r0, HSTATE_SCRATCH2(r13)
std r0, VCPU_GPR(R9)(r9)
std r10, VCPU_GPR(R10)(r9)
std r11, VCPU_GPR(R11)(r9)
-- 
1.8.3.2



[PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values

2013-11-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Don't try to compute these values.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---

NOTE: I am not sure why we were originally computing dsisr and dar. So maybe
we need a variant of this patch. But with this and the additional patch
"powerpc: book3s: PR: Enable Little Endian PR guest" I am able to get a
Little Endian PR guest to boot.

 arch/powerpc/kvm/book3s_emulate.c | 64 ++-
 1 file changed, 2 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 99d40f8..62768f9 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -569,70 +569,10 @@ unprivileged:
 
 u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
 {
-   u32 dsisr = 0;
-
-   /*
-* This is what the spec says about DSISR bits (not mentioned = 0):
-*
-* 12:13[DS]Set to bits 30:31
-* 15:16[X] Set to bits 29:30
-* 17   [X] Set to bit 25
-*  [D/DS]  Set to bit 5
-* 18:21[X] Set to bits 21:24
-*  [D/DS]  Set to bits 1:4
-* 22:26Set to bits 6:10 (RT/RS/FRT/FRS)
-* 27:31Set to bits 11:15 (RA)
-*/
-
-   switch (get_op(inst)) {
-   /* D-form */
-   case OP_LFS:
-   case OP_LFD:
-   case OP_STFD:
-   case OP_STFS:
-   dsisr |= (inst  12)  0x4000; /* bit 17 */
-   dsisr |= (inst  17)  0x3c00; /* bits 18:21 */
-   break;
-   /* X-form */
-   case 31:
-   dsisr |= (inst  14)  0x18000; /* bits 15:16 */
-   dsisr |= (inst  8)   0x04000; /* bit 17 */
-   dsisr |= (inst  3)   0x03c00; /* bits 18:21 */
-   break;
-   default:
-   printk(KERN_INFO KVM: Unaligned instruction 0x%x\n, inst);
-   break;
-   }
-
-   dsisr |= (inst  16)  0x03ff; /* bits 22:31 */
-
-   return dsisr;
+   return vcpu->arch.fault_dsisr;
 }
 
 ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
 {
-   ulong dar = 0;
-   ulong ra = get_ra(inst);
-   ulong rb = get_rb(inst);
-
-   switch (get_op(inst)) {
-   case OP_LFS:
-   case OP_LFD:
-   case OP_STFD:
-   case OP_STFS:
-   if (ra)
-   dar = kvmppc_get_gpr(vcpu, ra);
-   dar += (s32)((s16)inst);
-   break;
-   case 31:
-   if (ra)
-   dar = kvmppc_get_gpr(vcpu, ra);
-   dar += kvmppc_get_gpr(vcpu, rb);
-   break;
-   default:
-   printk(KERN_INFO KVM: Unaligned instruction 0x%x\n, inst);
-   break;
-   }
-
-   return dar;
+   return vcpu->arch.fault_dar;
 }
-- 
1.8.3.2



[PATCH] powerpc: book3s: PR: Enable Little Endian PR guest

2013-11-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch makes sure we inherit the LE bit correctly in the different cases
so that we can run a Little Endian distro in PR mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---

This patch depends on the below two changes
1)  [PATCH v5 0/6] KVM: PPC: Book3S: MMIO support for Little Endian guests 
(kvm-ppc)
http://mid.gmane.org/1383672128-26795-1-git-send-email-...@fr.ibm.com
2) [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values
   
http://mid.gmane.org/1384178577-23721-1-git-send-email-aneesh.ku...@linux.vnet.ibm.com

 arch/powerpc/kvm/book3s_64_mmu.c | 2 +-
 arch/powerpc/kvm/book3s_pr.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 83da1f8..d339096 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -38,7 +38,7 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-   kvmppc_set_msr(vcpu, MSR_SF);
+   kvmppc_set_msr(vcpu, MSR_SF | (vcpu->arch.shared->msr & MSR_LE));
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index a7fe87a..cf9362c 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -226,7 +226,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 ulong smsr = vcpu->arch.shared->msr;
 
 /* Guest MSR values */
-   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
+   smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
/* Process MSR values */
smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
/* External providers the guest reserved */
-- 
1.8.3.2



[PATCH] powerpc: booke: Fix build failures

2013-11-18 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

arch/powerpc/platforms/wsp/wsp.c: In function ‘wsp_probe_devices’:
arch/powerpc/platforms/wsp/wsp.c:76:3: error: implicit declaration of function 
‘of_address_to_resource’ [-Werror=implicit-function-declaration]

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/wsp/chroma.c   | 1 +
 arch/powerpc/platforms/wsp/h8.c   | 1 +
 arch/powerpc/platforms/wsp/ics.c  | 2 ++
 arch/powerpc/platforms/wsp/opb_pic.c  | 2 ++
 arch/powerpc/platforms/wsp/psr2.c | 1 +
 arch/powerpc/platforms/wsp/scom_wsp.c | 1 +
 arch/powerpc/platforms/wsp/wsp.c  | 1 +
 7 files changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/wsp/chroma.c 
b/arch/powerpc/platforms/wsp/chroma.c
index 8ef53bc2e70e..aaa46b353715 100644
--- a/arch/powerpc/platforms/wsp/chroma.c
+++ b/arch/powerpc/platforms/wsp/chroma.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/time.h>
+#include <linux/of_fdt.h>
 
 #include <asm/machdep.h>
 #include <asm/udbg.h>
diff --git a/arch/powerpc/platforms/wsp/h8.c b/arch/powerpc/platforms/wsp/h8.c
index d18e6cc19df3..a3c87f395750 100644
--- a/arch/powerpc/platforms/wsp/h8.c
+++ b/arch/powerpc/platforms/wsp/h8.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/of.h>
 #include <linux/io.h>
+#include <linux/of_address.h>
 
 #include "wsp.h"
 
diff --git a/arch/powerpc/platforms/wsp/ics.c b/arch/powerpc/platforms/wsp/ics.c
index 2d3b1dd9571d..9cd92e645028 100644
--- a/arch/powerpc/platforms/wsp/ics.c
+++ b/arch/powerpc/platforms/wsp/ics.c
@@ -18,6 +18,8 @@
 #include <linux/smp.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
diff --git a/arch/powerpc/platforms/wsp/opb_pic.c 
b/arch/powerpc/platforms/wsp/opb_pic.c
index cb565bf93650..3f6729807938 100644
--- a/arch/powerpc/platforms/wsp/opb_pic.c
+++ b/arch/powerpc/platforms/wsp/opb_pic.c
@@ -15,6 +15,8 @@
 #include <linux/of.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
 
 #include <asm/reg_a2.h>
 #include <asm/irq.h>
diff --git a/arch/powerpc/platforms/wsp/psr2.c 
b/arch/powerpc/platforms/wsp/psr2.c
index 508ec8282b96..a87b414c766a 100644
--- a/arch/powerpc/platforms/wsp/psr2.c
+++ b/arch/powerpc/platforms/wsp/psr2.c
@@ -15,6 +15,7 @@
 #include <linux/of.h>
 #include <linux/smp.h>
 #include <linux/time.h>
+#include <linux/of_fdt.h>
 
 #include <asm/machdep.h>
 #include <asm/udbg.h>
diff --git a/arch/powerpc/platforms/wsp/scom_wsp.c 
b/arch/powerpc/platforms/wsp/scom_wsp.c
index 8928507affea..6538b4de34fc 100644
--- a/arch/powerpc/platforms/wsp/scom_wsp.c
+++ b/arch/powerpc/platforms/wsp/scom_wsp.c
@@ -14,6 +14,7 @@
 #include <linux/of.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
+#include <linux/of_address.h>
 
 #include <asm/cputhreads.h>
 #include <asm/reg_a2.h>
diff --git a/arch/powerpc/platforms/wsp/wsp.c b/arch/powerpc/platforms/wsp/wsp.c
index ddb6efe88914..58cd1f00e1ef 100644
--- a/arch/powerpc/platforms/wsp/wsp.c
+++ b/arch/powerpc/platforms/wsp/wsp.c
@@ -13,6 +13,7 @@
 #include <linux/smp.h>
 #include <linux/delay.h>
 #include <linux/time.h>
+#include <linux/of_address.h>
 
 #include <asm/scom.h>
 
-- 
1.8.3.2


[PATCH -V2 1/5] powerpc: Use HPTE constants when updating hpte bits

2013-11-18 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Even though we have the same value for the Linux PTE bits and the hash PTE bits,
use the hash PTE bits when updating the hash PTE.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/cell/beat_htab.c | 4 ++--
 arch/powerpc/platforms/pseries/lpar.c   | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/platforms/cell/beat_htab.c 
b/arch/powerpc/platforms/cell/beat_htab.c
index c34ee4e60873..d4d245c0d787 100644
--- a/arch/powerpc/platforms/cell/beat_htab.c
+++ b/arch/powerpc/platforms/cell/beat_htab.c
@@ -111,7 +111,7 @@ static long beat_lpar_hpte_insert(unsigned long hpte_group,
 DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
 
 if (rflags & _PAGE_NO_CACHE)
-   hpte_r &= ~_PAGE_COHERENT;
+   hpte_r &= ~HPTE_R_M;
 
 raw_spin_lock(&beat_htab_lock);
lpar_rc = beat_read_mask(hpte_group);
@@ -337,7 +337,7 @@ static long beat_lpar_hpte_insert_v3(unsigned long 
hpte_group,
 DBG_LOW(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r);
 
 if (rflags & _PAGE_NO_CACHE)
-   hpte_r &= ~_PAGE_COHERENT;
+   hpte_r &= ~HPTE_R_M;
 
/* insert into not-volted entry */
lpar_rc = beat_insert_htab_entry3(0, hpte_group, hpte_v, hpte_r,
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 356bc75ca74f..c8fbef238d4b 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -153,7 +153,8 @@ static long pSeries_lpar_hpte_insert(unsigned long 
hpte_group,
 
/* Make pHyp happy */
 if ((rflags & _PAGE_NO_CACHE) && !(rflags & _PAGE_WRITETHRU))
-   hpte_r &= ~_PAGE_COHERENT;
+   hpte_r &= ~HPTE_R_M;
+
 if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N))
flags |= H_COALESCE_CAND;
 
-- 
1.8.3.2



[PATCH -V2 4/5] powerpc: mm: Only check for _PAGE_PRESENT in set_pte/pmd functions

2013-11-18 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We want to make sure we don't use these functions when updating a pte
or pmd entry that has a valid hpte entry, because these functions
don't invalidate them. So limit the check to the _PAGE_PRESENT bit.
The NUMA fault core changes use these functions for updating _PAGE_NUMA bits.
That should be ok because when _PAGE_NUMA is set we can be sure that
hpte entries are not present.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/mm/pgtable.c| 2 +-
 arch/powerpc/mm/pgtable_64.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index 841e0d00863c..ad90429bbd8b 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -174,7 +174,7 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, 
pte_t *ptep,
pte_t pte)
 {
 #ifdef CONFIG_DEBUG_VM
-   WARN_ON(pte_present(*ptep));
+   WARN_ON(pte_val(*ptep) & _PAGE_PRESENT);
 #endif
 /* Note: mm->context.id might not yet have been assigned as
 * this context might not have been activated yet when this
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 9d95786aa80f..02e8681fb865 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -687,7 +687,7 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
 {
 #ifdef CONFIG_DEBUG_VM
-   WARN_ON(!pmd_none(*pmdp));
+   WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT);
 assert_spin_locked(&mm->page_table_lock);
WARN_ON(!pmd_trans_huge(pmd));
 #endif
-- 
1.8.3.2



[PATCH -V2 3/5] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE

2013-11-18 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE.
On archs like ppc64 that don't use _PAGE_PROTNONE and also have
a separate page table outside the Linux page table, we just need to
make sure that when calling change_prot_numa we flush the
hardware page table entry so that the next page access results in a numa
fault.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 include/linux/mm.h | 3 ---
 mm/mempolicy.c | 9 -
 2 files changed, 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0548eb201e05..51794c1a1d7e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1851,11 +1851,8 @@ static inline pgprot_t vm_get_page_prot(unsigned long 
vm_flags)
 }
 #endif
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
-#endif
-
 struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
 int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c4403cdf3433..cae10af4fdc4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,6 @@ static inline int queue_pages_pgd_range(struct 
vm_area_struct *vma,
return 0;
 }
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
 /*
  * This is used to mark a range of virtual addresses to be inaccessible.
  * These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +626,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
int nr_updated;
-   BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 
nr_updated = change_protection(vma, addr, end, vma-vm_page_prot, 0, 1);
if (nr_updated)
@@ -635,13 +633,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
 
return nr_updated;
 }
-#else
-static unsigned long change_prot_numa(struct vm_area_struct *vma,
-   unsigned long addr, unsigned long end)
-{
-   return 0;
-}
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
 
 /*
  * Walk through page tables and collect pages to be migrated.
-- 
1.8.3.2



[PATCH -V2 2/5] powerpc: Free up _PAGE_COHERENCE for numa fault use later

2013-11-18 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Set memory coherence always on hash64 configs. If
a platform cannot have memory coherence always set, it
can infer that from _PAGE_NO_CACHE and _PAGE_WRITETHRU,
as is done for lpar. So we don't really need a separate bit
for tracking _PAGE_COHERENCE.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pte-hash64.h |  2 +-
 arch/powerpc/mm/hash_low_64.S | 15 ---
 arch/powerpc/mm/hash_utils_64.c   |  7 ---
 arch/powerpc/mm/hugepage-hash64.c |  6 +-
 arch/powerpc/mm/hugetlbpage-hash64.c  |  4 
 5 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/pte-hash64.h 
b/arch/powerpc/include/asm/pte-hash64.h
index 0419eeb53274..55aea0caf95e 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -19,7 +19,7 @@
 #define _PAGE_FILE	0x0002 /* (!present only) software: pte holds file offset */
 #define _PAGE_EXEC	0x0004 /* No execute on POWER4 and newer (we invert) */
 #define _PAGE_GUARDED	0x0008
-#define _PAGE_COHERENT	0x0010 /* M: enforce memory coherence (SMP systems) */
+/* We can derive Memory coherence from _PAGE_NO_CACHE */
 #define _PAGE_NO_CACHE	0x0020 /* I: cache inhibit */
 #define _PAGE_WRITETHRU	0x0040 /* W: cache write-through */
 #define _PAGE_DIRTY	0x0080 /* C: page changed */
diff --git a/arch/powerpc/mm/hash_low_64.S b/arch/powerpc/mm/hash_low_64.S
index d3cbda62857b..1136d26a95ae 100644
--- a/arch/powerpc/mm/hash_low_64.S
+++ b/arch/powerpc/mm/hash_low_64.S
@@ -148,7 +148,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4/* _PAGE_RW  _PAGE_DIRTY -r0 bit 30*/
andcr0,r30,r0   /* r0 = pte  ~r0 */
rlwimi  r3,r0,32-1,31,31/* Insert result into PP lsb */
-   ori r3,r3,HPTE_R_C  /* Always add C bit for perf. */
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   ori r3,r3,HPTE_R_C | HPTE_R_M
 
/* We eventually do the icache sync here (maybe inline that
 * code rather than call a C function...) 
@@ -457,7 +460,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4/* _PAGE_RW  _PAGE_DIRTY -r0 bit 30*/
andcr0,r3,r0/* r0 = pte  ~r0 */
rlwimi  r3,r0,32-1,31,31/* Insert result into PP lsb */
-   ori r3,r3,HPTE_R_C  /* Always add C bit for perf. */
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   ori r3,r3,HPTE_R_C | HPTE_R_M
 
/* We eventually do the icache sync here (maybe inline that
 * code rather than call a C function...)
@@ -795,7 +801,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
and r0,r0,r4/* _PAGE_RW  _PAGE_DIRTY -r0 bit 30*/
andcr0,r30,r0   /* r0 = pte  ~r0 */
rlwimi  r3,r0,32-1,31,31/* Insert result into PP lsb */
-   ori r3,r3,HPTE_R_C  /* Always add C bit for perf. */
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   ori r3,r3,HPTE_R_C | HPTE_R_M
 
/* We eventually do the icache sync here (maybe inline that
 * code rather than call a C function...)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 6176b3cdf579..de6881259aef 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -169,9 +169,10 @@ static unsigned long htab_convert_pte_flags(unsigned long 
pteflags)
 if ((pteflags & _PAGE_USER) && !((pteflags & _PAGE_RW) &&
 (pteflags & _PAGE_DIRTY)))
rflags |= 1;
-
-   /* Always add C */
-   return rflags | HPTE_R_C;
+   /*
+* Always add C bit for perf. Memory coherence is always enabled
+*/
+   return rflags | HPTE_R_C | HPTE_R_M;
 }
 
 int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
diff --git a/arch/powerpc/mm/hugepage-hash64.c 
b/arch/powerpc/mm/hugepage-hash64.c
index 34de9e0cdc34..826893fcb3a7 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -127,7 +127,11 @@ repeat:
 
/* Add in WIMG bits */
 rflags |= (new_pmd & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
- _PAGE_COHERENT | _PAGE_GUARDED));
+ _PAGE_GUARDED));
+   /*
+* enable the memory coherence always
+*/
+   rflags |= HPTE_R_M;
 
/* Insert into the hash table, primary slot */
slot = ppc_md.hpte_insert(hpte_group, vpn, pa, rflags, 0,
diff --git a/arch/powerpc/mm/hugetlbpage

[PATCH -V2 0/5] powerpc: mm: Numa faults support for ppc64

2013-11-18 Thread Aneesh Kumar K.V
Hi,

This patch series adds support for numa faults on the ppc64 architecture. We
steal the _PAGE_COHERENCE bit and use it to indicate _PAGE_NUMA. We clear the
_PAGE_PRESENT bit and also invalidate the hpte entry on setting _PAGE_NUMA.
The next fault on that page will be considered a numa fault.
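
In code terms, the bit encoding the series ends up with (see patch 5/5 below
for the real definitions; this is just a condensed sketch) is roughly:

	#define _PAGE_NUMA	0x0010	/* reuses the old _PAGE_COHERENT bit */

	static inline int pte_numa(pte_t pte)
	{
		return (pte_val(pte) & (_PAGE_NUMA | _PAGE_PRESENT)) == _PAGE_NUMA;
	}

	static inline pte_t pte_mknuma(pte_t pte)
	{
		/* hide the page from the hash fault path so the next access faults */
		pte_val(pte) |= _PAGE_NUMA;
		pte_val(pte) &= ~_PAGE_PRESENT;
		return pte;
	}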

Changes from V1:
* Dropped a few patches related to pmd updates because batch handling of pmd
  pages got dropped from core code in commit
  0f19c17929c952c6f0966d93ab05558e7bf814cc "mm: numa: Do not batch handle PMD
  pages". This also avoided the large lock contention on page_table_lock that
  we observed with the previous series.

 -aneesh
 



[PATCH -V2 5/5] powerpc: mm: book3s: Enable _PAGE_NUMA for book3s

2013-11-18 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We steal the _PAGE_COHERENCE bit and use that for indicating NUMA ptes.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h | 66 +-
 arch/powerpc/include/asm/pte-hash64.h  |  6 
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 3 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 7d6eacf249cf..b999ca318985 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -3,6 +3,7 @@
 #ifdef __KERNEL__
 
 #ifndef __ASSEMBLY__
+#include linux/mmdebug.h
 #include asm/processor.h /* For TASK_SIZE */
 #include asm/mmu.h
 #include asm/page.h
@@ -33,10 +34,73 @@ static inline int pte_dirty(pte_t pte)	{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)	{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_file(pte_t pte)	{ return pte_val(pte) & _PAGE_FILE; }
 static inline int pte_special(pte_t pte)	{ return pte_val(pte) & _PAGE_SPECIAL; }
-static inline int pte_present(pte_t pte)	{ return pte_val(pte) & _PAGE_PRESENT; }
 static inline int pte_none(pte_t pte)	{ return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; }
 static inline pgprot_t pte_pgprot(pte_t pte)	{ return __pgprot(pte_val(pte) & PAGE_PROT_BITS); }
 
+#ifdef CONFIG_NUMA_BALANCING
+
+static inline int pte_present(pte_t pte)
+{
+   return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA);
+}
+
+#define pte_numa pte_numa
+static inline int pte_numa(pte_t pte)
+{
+   return (pte_val(pte) &
+   (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA;
+}
+
+#define pte_mknonnuma pte_mknonnuma
+static inline pte_t pte_mknonnuma(pte_t pte)
+{
+   pte_val(pte) &= ~_PAGE_NUMA;
+   pte_val(pte) |=  _PAGE_PRESENT | _PAGE_ACCESSED;
+   return pte;
+}
+
+#define pte_mknuma pte_mknuma
+static inline pte_t pte_mknuma(pte_t pte)
+{
+   /*
+* We should not set _PAGE_NUMA on non present ptes. Also clear the
+* present bit so that hash_page will return 1 and we collect this
+* as numa fault.
+*/
+   if (pte_present(pte)) {
+   pte_val(pte) |= _PAGE_NUMA;
+   pte_val(pte) &= ~_PAGE_PRESENT;
+   } else
+   VM_BUG_ON(1);
+   return pte;
+}
+
+#define pmd_numa pmd_numa
+static inline int pmd_numa(pmd_t pmd)
+{
+   return pte_numa(pmd_pte(pmd));
+}
+
+#define pmd_mknonnuma pmd_mknonnuma
+static inline pmd_t pmd_mknonnuma(pmd_t pmd)
+{
+   return pte_pmd(pte_mknonnuma(pmd_pte(pmd)));
+}
+
+#define pmd_mknuma pmd_mknuma
+static inline pmd_t pmd_mknuma(pmd_t pmd)
+{
+   return pte_pmd(pte_mknuma(pmd_pte(pmd)));
+}
+
+# else
+
+static inline int pte_present(pte_t pte)
+{
+   return pte_val(pte) & _PAGE_PRESENT;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 /* Conversion functions: convert a page and protection to a page entry,
  * and a page entry and page directory to the page they refer to.
  *
diff --git a/arch/powerpc/include/asm/pte-hash64.h 
b/arch/powerpc/include/asm/pte-hash64.h
index 55aea0caf95e..2505d8eab15c 100644
--- a/arch/powerpc/include/asm/pte-hash64.h
+++ b/arch/powerpc/include/asm/pte-hash64.h
@@ -27,6 +27,12 @@
 #define _PAGE_RW   0x0200 /* software: user write access allowed */
 #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */
 
+/*
+ * Used for tracking numa faults
+ */
+#define _PAGE_NUMA 0x0010 /* Gather numa placement stats */
+
+
 /* No separate kernel read-only */
 #define _PAGE_KERNEL_RW	(_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */
 #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index c2a566fb8bb8..2048655d8ec4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -72,6 +72,7 @@ config PPC_BOOK3S_64
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
+   select ARCH_SUPPORTS_NUMA_BALANCING
 
 config PPC_BOOK3E_64
	bool "Embedded processors"
-- 
1.8.3.2



[PATCH -V2] powerpc: book3s: PR: Enable Little Endian PR guest

2013-11-28 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch makes sure we inherit the LE bit correctly in the different cases
so that we can run a Little Endian distro in PR mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---

Changes from V1:
* Use the LPCR bit to find whether to enable LE on interrupt. We do it more or less
  the same as HV now. We keep it separate at this point because HV H_SETMODE does
  a lot more than what we do here.

This patch depends on the below two changes
1)  [PATCH v5 0/6] KVM: PPC: Book3S: MMIO support for Little Endian guests 
(kvm-ppc)
http://mid.gmane.org/1383672128-26795-1-git-send-email-...@fr.ibm.com
2) [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values
   
http://mid.gmane.org/1384178577-23721-1-git-send-email-aneesh.ku...@linux.vnet.ibm.com
3) [PATCH 11/15] KVM: PPC: Book3S HV: Add little-endian guest support
http://mid.gmane.org/1383995103-24732-12-git-send-email-pau...@samba.org
With further changes to make it apply to latest upstream.

 arch/powerpc/include/asm/kvm_host.h |  4 +--
 arch/powerpc/kernel/asm-offsets.c   |  4 +--
 arch/powerpc/kvm/book3s_64_mmu.c|  2 +-
 arch/powerpc/kvm/book3s_pr.c|  3 +-
 arch/powerpc/kvm/book3s_pr_papr.c   | 57 +
 5 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index cecd88338f28..1e67adc725d2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -240,7 +240,6 @@ struct kvm_arch {
unsigned long sdr1;
unsigned long host_sdr1;
int tlbie_lock;
-   unsigned long lpcr;
unsigned long rmor;
struct kvm_rma_info *rma;
unsigned long vrma_slb_v;
@@ -261,6 +260,7 @@ struct kvm_arch {
struct mutex hpt_mutex;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
+   unsigned long lpcr;
struct list_head spapr_tce_tables;
struct list_head rtas_tokens;
 #endif
@@ -524,6 +524,7 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_PPC_BOOK3S
ulong fault_dar;
u32 fault_dsisr;
+   unsigned long intr_msr;
 #endif
 
 #ifdef CONFIG_BOOKE
@@ -616,7 +617,6 @@ struct kvm_vcpu_arch {
spinlock_t tbacct_lock;
u64 busy_stolen;
u64 busy_preempt;
-   unsigned long intr_msr;
 #endif
 };
 
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 39dbcb3d3d7d..136c4bec52ab 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -473,7 +473,6 @@ int main(void)
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1));
DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid));
-   DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1));
DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock));
DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
@@ -484,9 +483,9 @@ int main(void)
DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
-   DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
+   DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr));
DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
DEFINE(VCPU_PURR, offsetof(struct kvm_vcpu, arch.purr));
DEFINE(VCPU_SPURR, offsetof(struct kvm_vcpu, arch.spurr));
@@ -510,6 +509,7 @@ int main(void)
DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+   DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 83da1f868fd5..8231b83c493b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -38,7 +38,7 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-   kvmppc_set_msr(vcpu, MSR_SF);
+   kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index f84778547c6b..dc22643a45d2 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -226,7 +226,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 ulong smsr = vcpu->arch.shared->msr;

Re: [PATCH -V2 3/5] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE

2013-12-04 Thread Aneesh Kumar K.V

Adding Mel and Rik to cc:

Benjamin Herrenschmidt b...@au1.ibm.com writes:

 On Mon, 2013-11-18 at 14:58 +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE.
 On archs like ppc64 that don't use _PAGE_PROTNONE and also have
 a separate page table outside linux pagetable, we just need to
 make sure that when calling change_prot_numa we flush the
 hardware page table entry so that next page access  result in a numa
 fault.

 That patch doesn't look right...

 You are essentially making change_prot_numa() do whatever it does (which
 I don't completely understand) *for all architectures* now, whether they
 have CONFIG_ARCH_USES_NUMA_PROT_NONE or not ... So because you want that
 behaviour on powerpc book3s64, you change everybody.

 Is that correct ?


Yes. 


 Also what exactly is that doing, can you explain ? From what I can see,
 it calls back into the core of mprotect to change the protection to
 vma-vm_page_prot, which I would have expected is already the protection
 there, with the added prot_numa flag passed down.

It sets the _PAGE_NUMA bit. Now we also want to make sure that when
we set _PAGE_NUMA, we get a page fault on that page so that we can track
the fault as a numa fault. To ensure that, we had the below BUILD_BUG

BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);

But other than that the function doesn't really have any dependency on
_PAGE_PROTNONE. The only requirement is that when we set _PAGE_NUMA, the
architecture should do enough to ensure that we get a page fault. On
ppc64 we do that by clearing the hpte entry and also clearing
_PAGE_PRESENT. Since _PAGE_PRESENT is cleared, hash_page will return 1
and we get to the page fault handler.
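
To put the whole flow in one place, here is a condensed sketch of what the
series does on ppc64 (pieced together from the patches above, not new code):

	/* pte_mknuma(): mark the pte and hide it from the hash path */
	pte_val(pte) |= _PAGE_NUMA;
	pte_val(pte) &= ~_PAGE_PRESENT;
	/* ... and the hpte for that address gets invalidated */

	/* hash_page(): with _PAGE_PRESENT cleared we refuse to hash the page */
	if (ptep == NULL || !pte_present(*ptep)) {
		rc = 1;		/* caller falls back to the normal fault path */
		goto bail;
	}

	/* do_page_fault() -> handle_mm_fault() then sees the _PAGE_NUMA pte
	 * and routes the fault to do_numa_page() */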


 Your changeset comment says On archs like ppc64 [...] we just need to
 make sure that when calling change_prot_numa we flush the
 hardware page table entry so that next page access  result in a numa
 fault.

 But change_prot_numa() does a lot more than that ... it does
 pte_mknuma(), do we need it ? I assume we do or we wouldn't have added
 that PTE bit to begin with...

 Now it *might* be allright and it might be that no other architecture
 cares anyway etc... but I need at least some mm folks to ack on that
 patch before I can take it because it *will* change behaviour of other
 architectures.


Ok, I can move the changes below #ifdef CONFIG_NUMA_BALANCING instead. We call
change_prot_numa from task_numa_work and queue_pages_range(). The latter
may be an issue. So doing the below will help?

-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING


-aneesh




Re: [PATCH 1/3] powerpc: mm: make _PAGE_NUMA take effect

2013-12-05 Thread Aneesh Kumar K.V
Liu Ping Fan kernelf...@gmail.com writes:

 To enable the do_numa_page(), we should not fix _PAGE_NUMA in
 hash_page(), so bail out for the case of pte_numa().

 Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com
 ---
  arch/powerpc/mm/hash_utils_64.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
 index fb176e9..9bf1195 100644
 --- a/arch/powerpc/mm/hash_utils_64.c
 +++ b/arch/powerpc/mm/hash_utils_64.c
 @@ -1033,7 +1033,7 @@ int hash_page(unsigned long ea, unsigned long access, 
 unsigned long trap)

   /* Get PTE and page size from page tables */
   ptep = find_linux_pte_or_hugepte(pgdir, ea, hugeshift);
 - if (ptep == NULL || !pte_present(*ptep)) {
 + if (ptep == NULL || !pte_present(*ptep) || pte_numa(*ptep)) {
   DBG_LOW( no PTE !\n);
   rc = 1;
   goto bail;

Why? All the hash routines already check for _PAGE_PRESENT via the access
variable.
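
To make that concrete, a rough sketch of the existing check (modelled on the
C hash routines of this era, e.g. the hugepage path, so treat the details as
approximate):

	/* hash_page(): _PAGE_PRESENT is part of the required access mask */
	access |= _PAGE_PRESENT;

	/* low-level hash routine: refault if the pte lacks a required bit */
	old_pte = pte_val(*ptep);
	if (unlikely(access & ~old_pte))
		return 1;	/* caller takes the normal page fault path */

A numa pte has _PAGE_PRESENT cleared, so it already bails out here without an
explicit pte_numa() check in hash_page().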

-aneesh



Re: [PATCH 3/3] powerpc: mm: optimize for the correctly placed page

2013-12-05 Thread Aneesh Kumar K.V
Liu Ping Fan kernelf...@gmail.com writes:

 The period check of _PAGE_NUMA can probably trigger the check on
 the correctly placed page. For this case, we can just insert hpte and
 do fast exception return.

I still don't understand why we need to handle numa faults in
hash_page(). Are you trying to optimize the code path? If so, can you explain
the benefits? Some numbers showing it is helping?



 Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com
 ---
  arch/powerpc/mm/hash_utils_64.c | 34 +-
  1 file changed, 33 insertions(+), 1 deletion(-)

 diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
 index 9bf1195..735678c 100644
 --- a/arch/powerpc/mm/hash_utils_64.c
 +++ b/arch/powerpc/mm/hash_utils_64.c
 @@ -965,6 +965,10 @@ int hash_page(unsigned long ea, unsigned long access, 
 unsigned long trap)
   const struct cpumask *tmp;
   int rc, user_region = 0, local = 0;
   int psize, ssize;
 + pte_t old, new;
 + struct vm_area_struct *vma;
 + int page_nid, target_nid;
 + struct page *test_page;

   DBG_LOW(hash_page(ea=%016lx, access=%lx, trap=%lx\n,
   ea, access, trap);
 @@ -1033,12 +1037,40 @@ int hash_page(unsigned long ea, unsigned long access, 
 unsigned long trap)

   /* Get PTE and page size from page tables */
   ptep = find_linux_pte_or_hugepte(pgdir, ea, hugeshift);
 - if (ptep == NULL || !pte_present(*ptep) || pte_numa(*ptep)) {
 + if (ptep == NULL || !pte_present(*ptep)) {
   DBG_LOW( no PTE !\n);
   rc = 1;
   goto bail;
   }

 + old = pte_val(*ptep);
 + if (pte_numa(old)) {
 + /* If fail to lock, let do_page_fault() to handle it */
 + if (down_read_trylock(mm-mmap_sem)) {

Hmm, is that something we want to do in hash_page()?

 + vma = find_vma(mm, ea);
 + up_read(mm-mmap_sem);
 + test_page = pte_page(old);
 + page_nid = page_to_nid(test_page);
 + target_nid = numa_migrate_prep(test_page, vma, ea,
 + page_nid);
 + if (target_nid  0) {
 + new = pte_mknonnuma(old);
 + /* If ptep is modified under us,
 +  * just retry the access
 +  */
 + if (unlikely(cmpxchg(ptep, old, new) != old)) {
 + put_page(test_page);
 + return 0;
 + }
 + put_page(test_page);
 + }
 + } else {
 + put_page(test_page);
 + rc = 1;
 + goto bail;
 + }
 + }
 +
   /* Add _PAGE_PRESENT to the required access perm */
   access |= _PAGE_PRESENT;


-aneesh



[PATCH -V3] mm: Move change_prot_numa outside CONFIG_ARCH_USES_NUMA_PROT_NONE

2013-12-05 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

change_prot_numa should work even if _PAGE_NUMA != _PAGE_PROTNONE.
On archs like ppc64 that don't use _PAGE_PROTNONE and also have
a separate page table outside the Linux page table, we just need to
make sure that when calling change_prot_numa we flush the
hardware page table entry so that the next page access results in a numa
fault.

We still need to make sure we use the numa faulting logic only
when CONFIG_NUMA_BALANCING is set. This implies the migrate-on-fault
(Lazy migration) via mbind will only work if CONFIG_NUMA_BALANCING
is set.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Previous discussion around the patch can be found at
http://article.gmane.org/gmane.linux.kernel.mm/109305

changes from V2:
* Move the numa faulting definition within CONFIG_NUMA_BALANCING

 include/linux/mm.h | 2 +-
 mm/mempolicy.c | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1cedd000cf29..a7b4e310bf42 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1842,7 +1842,7 @@ static inline pgprot_t vm_get_page_prot(unsigned long 
vm_flags)
 }
 #endif
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING
 unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
 #endif
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a3129129..9f73b29d304d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -613,7 +613,7 @@ static inline int queue_pages_pgd_range(struct 
vm_area_struct *vma,
return 0;
 }
 
-#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
+#ifdef CONFIG_NUMA_BALANCING
 /*
  * This is used to mark a range of virtual addresses to be inaccessible.
  * These are later cleared by a NUMA hinting fault. Depending on these
@@ -627,7 +627,6 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
unsigned long addr, unsigned long end)
 {
int nr_updated;
-   BUILD_BUG_ON(_PAGE_NUMA != _PAGE_PROTNONE);
 
nr_updated = change_protection(vma, addr, end, vma-vm_page_prot, 0, 1);
if (nr_updated)
@@ -641,7 +640,7 @@ static unsigned long change_prot_numa(struct vm_area_struct 
*vma,
 {
return 0;
 }
-#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */
+#endif /* CONFIG_NUMA_BALANCING */
 
 /*
  * Walk through page tables and collect pages to be migrated.
-- 
1.8.3.2



Re: [PATCH] powerpc: Fix PTE page address mismatch in pgtable ctor/dtor

2013-12-06 Thread Aneesh Kumar K.V
Hong H. Pham hong.p...@windriver.com writes:

 In pte_alloc_one(), pgtable_page_ctor() is passed an address that has
 not been converted by page_address() to the newly allocated PTE page.

 When the PTE is freed, __pte_free_tlb() calls pgtable_page_dtor()
 with an address to the PTE page that has been converted by page_address().
 The mismatch in the PTE's page address causes pgtable_page_dtor() to access
 invalid memory, so resources for that PTE (such as the page lock) is not
 properly cleaned up.

 This bug was introduced by commit d614bb041209fd7cb5e4b35e11a7b2f6ee8f62b8
 powerpc: Move the pte free routines from common header.

 On a preempt-rt kernel, a spinlock is dynamically allocated for each
 PTE in pgtable_page_ctor().  When the PTE is freed, calling
 pgtable_page_dtor() with a mismatched page address causes a memory leak,
 as the pointer to the PTE's spinlock is bogus.

 On mainline, there isn't any immediately obvious symptoms, but the
 problem still exists here.


Can you also specify the config details here, i.e. are the 4K page size functions
the ones that are broken?


 Fixes: d614bb041209fd7c powerpc: Move the pte free routes from common header
 Cc: Paul Mackerras pau...@samba.org
 Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Cc: linux-stable sta...@vger.kernel.org # v3.10+
 Signed-off-by: Hong H. Pham hong.p...@windriver.com
 ---
  arch/powerpc/include/asm/pgalloc-32.h | 2 +-
  arch/powerpc/include/asm/pgalloc-64.h | 2 +-
  2 files changed, 2 insertions(+), 2 deletions(-)

 diff --git a/arch/powerpc/include/asm/pgalloc-32.h 
 b/arch/powerpc/include/asm/pgalloc-32.h
 index 27b2386..7ff24f0 100644
 --- a/arch/powerpc/include/asm/pgalloc-32.h
 +++ b/arch/powerpc/include/asm/pgalloc-32.h
 @@ -87,7 +87,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, 
 pgtable_t table,
   struct page *page = page_address(table);

   tlb_flush_pgtable(tlb, address);
 - pgtable_page_dtor(page);
 + pgtable_page_dtor(table);
   pgtable_free_tlb(tlb, page, 0);
  }
  #endif /* _ASM_POWERPC_PGALLOC_32_H */
 diff --git a/arch/powerpc/include/asm/pgalloc-64.h 
 b/arch/powerpc/include/asm/pgalloc-64.h
 index f65e27b..b187dc5 100644
 --- a/arch/powerpc/include/asm/pgalloc-64.h
 +++ b/arch/powerpc/include/asm/pgalloc-64.h
 @@ -147,7 +147,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, 
 pgtable_t table,
   struct page *page = page_address(table);


That one is also wrong, right? Why not


   tlb_flush_pgtable(tlb, address);
 - pgtable_page_dtor(page);
 + pgtable_page_dtor(table);
   pgtable_free_tlb(tlb, page, 0);
  }


make it closer to what it was before,

pgtable_page_dtor(table);
pgtable_free_tlb(tlb, page_address(table), 0);

This is what we had before

-static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage,
- unsigned long address)
-{
-   tlb_flush_pgtable(tlb, address);
-   pgtable_page_dtor(ptepage);
-   pgtable_free_tlb(tlb, page_address(ptepage), 0);
-}


-aneesh



Re: [PATCH v3] powerpc: Fix PTE page address mismatch in pgtable ctor/dtor

2013-12-09 Thread Aneesh Kumar K.V
Benjamin Herrenschmidt b...@kernel.crashing.org writes:

 On Sat, 2013-12-07 at 09:06 -0500, Hong H. Pham wrote:

 diff --git a/arch/powerpc/include/asm/pgalloc-32.h 
 b/arch/powerpc/include/asm/pgalloc-32.h
 index 27b2386..842846c 100644
 --- a/arch/powerpc/include/asm/pgalloc-32.h
 +++ b/arch/powerpc/include/asm/pgalloc-32.h
 @@ -84,10 +84,8 @@ static inline void pgtable_free_tlb(struct mmu_gather 
 *tlb,
  static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
unsigned long address)
  {
 -struct page *page = page_address(table);
 -
  tlb_flush_pgtable(tlb, address);
 -pgtable_page_dtor(page);
 -pgtable_free_tlb(tlb, page, 0);
 +pgtable_page_dtor(table);
 +pgtable_free_tlb(tlb, page_address(table), 0);
  }

 Ok so your description of the problem confused me a bit, but I see that
 in the !64K page, pgtable_t is already a struct page so yes, the
 page_address() call here is bogus.

 However, I also noticed that in the 64k page case, we don't call the dto
 at all. Is that a problem ?

 Also, Aneesh, shouldn't we just fix the disconnect here and have
 pgtable_t always be the same type ? The way this is now is confusing
 and error prone...

With pte page fragments that may not be possible, right? With PTE fragments,
we share the page allocated for PTEs across multiple pmd entries.

5c1f6ee9a31cbdac90bbb8ae1ba4475031ac74b4 should have more details
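
To make the sharing concrete, here is an illustrative sketch of the fragment
scheme from that commit (locking and accounting omitted). The constants and
the mm->context.pte_frag field below are assumptions for illustration, not
the exact kernel code:

	#define PTE_FRAG_NR	16
	#define PTE_FRAG_SIZE	(PAGE_SIZE / PTE_FRAG_NR)

	static pte_t *pte_frag_alloc(struct mm_struct *mm)
	{
		void *frag = mm->context.pte_frag;
		struct page *page;

		if (frag) {
			/* hand out the cached fragment, remember the next one */
			void *next = frag + PTE_FRAG_SIZE;

			mm->context.pte_frag =
				((unsigned long)next & (PAGE_SIZE - 1)) ? next : NULL;
			return frag;
		}

		page = alloc_page(GFP_KERNEL | __GFP_ZERO);
		if (!page)
			return NULL;

		frag = page_address(page);
		mm->context.pte_frag = frag + PTE_FRAG_SIZE;
		return frag;
	}

Because each pmd entry points at a fragment inside a shared page, pgtable_t on
this config is a pte_t * (a fragment pointer) rather than a struct page *,
which is why the two configs cannot trivially share the same pgtable_t type.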


  #endif /* _ASM_POWERPC_PGALLOC_32_H */
 diff --git a/arch/powerpc/include/asm/pgalloc-64.h 
 b/arch/powerpc/include/asm/pgalloc-64.h
 index f65e27b..256d6f8 100644
 --- a/arch/powerpc/include/asm/pgalloc-64.h
 +++ b/arch/powerpc/include/asm/pgalloc-64.h
 @@ -144,11 +144,9 @@ static inline void pgtable_free_tlb(struct mmu_gather 
 *tlb,
  static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
unsigned long address)
  {
 -struct page *page = page_address(table);
 -
  tlb_flush_pgtable(tlb, address);
 -pgtable_page_dtor(page);
 -pgtable_free_tlb(tlb, page, 0);
 +pgtable_page_dtor(table);
 +pgtable_free_tlb(tlb, page_address(table), 0);
  }
  
  #else /* if CONFIG_PPC_64K_PAGES */

 Ben.

-aneesh



Re: [PATCH v3] powerpc: Fix PTE page address mismatch in pgtable ctor/dtor

2013-12-09 Thread Aneesh Kumar K.V
Hong H. Pham hong.p...@windriver.com writes:

 From: Hong H. Pham hong.p...@windriver.com

 In pte_alloc_one(), pgtable_page_ctor() is passed an address that has
 not been converted by page_address() to the newly allocated PTE page.

 When the PTE is freed, __pte_free_tlb() calls pgtable_page_dtor()
 with an address to the PTE page that has been converted by page_address().
 The mismatch in the PTE's page address causes pgtable_page_dtor() to access
 invalid memory, so resources for that PTE (such as the page lock) is not
 properly cleaned up.

 On PPC32, only SMP kernels are affected.

 On PPC64, only SMP kernels with 4K page size are affected.

 This bug was introduced by commit d614bb041209fd7cb5e4b35e11a7b2f6ee8f62b8
 powerpc: Move the pte free routines from common header.

 On a preempt-rt kernel, a spinlock is dynamically allocated for each
 PTE in pgtable_page_ctor().  When the PTE is freed, calling
 pgtable_page_dtor() with a mismatched page address causes a memory leak,
 as the pointer to the PTE's spinlock is bogus.

 On mainline, there isn't any immediately obvious symptoms, but the
 problem still exists here.

 Fixes: d614bb041209fd7c powerpc: Move the pte free routes from common header
 Cc: Paul Mackerras pau...@samba.org
 Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Cc: linux-stable sta...@vger.kernel.org # v3.10+
 Signed-off-by: Hong H. Pham hong.p...@windriver.com


Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com


 ---
  arch/powerpc/include/asm/pgalloc-32.h | 6 ++
  arch/powerpc/include/asm/pgalloc-64.h | 6 ++
  2 files changed, 4 insertions(+), 8 deletions(-)

 diff --git a/arch/powerpc/include/asm/pgalloc-32.h 
 b/arch/powerpc/include/asm/pgalloc-32.h
 index 27b2386..842846c 100644
 --- a/arch/powerpc/include/asm/pgalloc-32.h
 +++ b/arch/powerpc/include/asm/pgalloc-32.h
 @@ -84,10 +84,8 @@ static inline void pgtable_free_tlb(struct mmu_gather *tlb,
  static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 unsigned long address)
  {
 - struct page *page = page_address(table);
 -
   tlb_flush_pgtable(tlb, address);
 - pgtable_page_dtor(page);
 - pgtable_free_tlb(tlb, page, 0);
 + pgtable_page_dtor(table);
 + pgtable_free_tlb(tlb, page_address(table), 0);
  }
  #endif /* _ASM_POWERPC_PGALLOC_32_H */
 diff --git a/arch/powerpc/include/asm/pgalloc-64.h 
 b/arch/powerpc/include/asm/pgalloc-64.h
 index f65e27b..256d6f8 100644
 --- a/arch/powerpc/include/asm/pgalloc-64.h
 +++ b/arch/powerpc/include/asm/pgalloc-64.h
 @@ -144,11 +144,9 @@ static inline void pgtable_free_tlb(struct mmu_gather 
 *tlb,
  static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 unsigned long address)
  {
 - struct page *page = page_address(table);
 -
   tlb_flush_pgtable(tlb, address);
 - pgtable_page_dtor(page);
 - pgtable_free_tlb(tlb, page, 0);
 + pgtable_page_dtor(table);
 + pgtable_free_tlb(tlb, page_address(table), 0);
  }

  #else /* if CONFIG_PPC_64K_PAGES */
 -- 
 1.8.3.2



[PATCH] KVM: PPC: Use schedule instead of cond_resched

2013-12-10 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We already checked need_resched(), so we can call schedule() directly.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---

NOTE: This patch also works around an upstream regression w.r.t. PR KVM

 BUG: soft lockup - CPU#0 stuck for 23s! [qemu-system-ppc:4394]
 Modules linked in:
 CPU: 0 PID: 4394 Comm: qemu-system-ppc Not tainted 3.13.0-rc3+ #98
 task: c001d0788400 ti: c001dca0 task.ti: c001dca0
 NIP: c082dd80 LR: c0081ae0 CTR: c0062ba0
 REGS: c001dca02f70 TRAP: 0901   Not tainted  (3.13.0-rc3+)
 MSR: 80009032 SF,EE,ME,IR,DR,RI  CR: 24822024  XER: 
 CFAR: c0081adc SOFTE: 1
 GPR00: c0081ae0 c001dca031f0 c0d67ab0 0001
 GPR04: 7102 0001 0189a0d786b7 018c
 GPR08: 0001   c0da
 GPR12: 0c00 cfef
 NIP [c082dd80] ._cond_resched+0x0/0x40
 LR [c0081ae0] .kvmppc_prepare_to_enter+0x2a0/0x2e0
 Call Trace:
 [c001dca031f0] [c0081ae0] .kvmppc_prepare_to_enter+0x2a0/0x2e0 
(unreliable)
 [c001dca03290] [c008f2cc] .kvmppc_handle_exit_pr+0xec/0xa40
 [c001dca03340] [c00918c4] kvm_start_lightweight+0xac/0xbc
 [c001dca03510] [c008efe0] .kvmppc_vcpu_run_pr+0x130/0x2a0
 [c001dca039e0] [c00855bc] .kvmppc_vcpu_run+0x2c/0x40
 [c001dca03a50] [c0082c94] .kvm_arch_vcpu_ioctl_run+0x54/0x1b0
 [c001dca03ae0] [c007d5f8] .kvm_vcpu_ioctl+0x478/0x740
 [c001dca03ca0] [c0218864] .do_vfs_ioctl+0x4a4/0x760
 [c001dca03d80] [c0218b78] .SyS_ioctl+0x58/0xb0
 [c001dca03e30] [c0009e58] syscall_exit+0x0/0x98
 Instruction dump:
 e92d0260 e94911c0 812a0004 5529f07e 5529103e 912a0004 38210080 e8010010
 ebc1fff0 ebe1fff8 7c0803a6 4e800020 7c0802a6 3860 f8010010 f821ff91 

 arch/powerpc/kvm/powerpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index e4d511c8b38b..6a49b23a3276 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -74,7 +74,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
while (true) {
if (need_resched()) {
local_irq_enable();
-   cond_resched();
+   schedule();
local_irq_disable();
continue;
}
-- 
1.8.3.2



Re: [PATCH] KVM: PPC: Use schedule instead of cond_resched

2013-12-10 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 10.12.2013, at 15:21, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com 
 wrote:

 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 We already checked need_resched. So we can call schedule directly
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

 The real fix for the issue you're seeing is

   https://lkml.org/lkml/2013/11/28/241

True, I mentioned that in the thread

https://lkml.org/lkml/2013/12/9/64

But do we need to do cond_resched() after we already checked need_resched()?
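
For reference, the 3.13-era cond_resched() path is roughly the following
(quoted from memory, so treat the details as approximate):

	/* kernel/sched/core.c */
	int __sched _cond_resched(void)
	{
		if (should_resched()) {		/* rechecks need_resched() */
			__cond_resched();	/* PREEMPT_ACTIVE + __schedule() */
			return 1;
		}
		return 0;
	}

So after an explicit need_resched() check it mostly just repeats that check
before scheduling, which is why calling schedule() directly looks sufficient
in this loop.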

-aneesh



Re: [PATCH] powerpc: book3s: kvm: Don't abuse host r2 in exit path

2013-12-17 Thread Aneesh Kumar K.V

Hi Alex,

Any update on this? We need this to go into 3.13.

-aneesh 

Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com writes:

 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

 We don't use PACATOC for PR. Avoid updating HOST_R2 with PR
 KVM mode when both HV and PR are enabled in the kernel. Without this we
 get the below crash

 (qemu)
 Unable to handle kernel paging request for data at address 0x8310
 Faulting instruction address: 0xc001d5a4
 cpu 0x2: Vector: 300 (Data Access) at [c001dc53aef0]
 pc: c001d5a4: .vtime_delta.isra.1+0x34/0x1d0
 lr: c001d760: .vtime_account_system+0x20/0x60
 sp: c001dc53b170
msr: 80009032
dar: 8310
  dsisr: 4000
   current = 0xc001d76c62d0
   paca= 0xcfef1100   softe: 0irq_happened: 0x01
 pid   = 4472, comm = qemu-system-ppc
 enter ? for help
 [c001dc53b200] c001d760 .vtime_account_system+0x20/0x60
 [c001dc53b290] c008d050 .kvmppc_handle_exit_pr+0x60/0xa50
 [c001dc53b340] c008f51c kvm_start_lightweight+0xb4/0xc4
 [c001dc53b510] c008cdf0 .kvmppc_vcpu_run_pr+0x150/0x2e0
 [c001dc53b9e0] c008341c .kvmppc_vcpu_run+0x2c/0x40
 [c001dc53ba50] c0080af4 .kvm_arch_vcpu_ioctl_run+0x54/0x1b0
 [c001dc53bae0] c007b4c8 .kvm_vcpu_ioctl+0x478/0x730
 [c001dc53bca0] c02140cc .do_vfs_ioctl+0x4ac/0x770
 [c001dc53bd80] c02143e8 .SyS_ioctl+0x58/0xb0
 [c001dc53be30] c0009e58 syscall_exit+0x0/0x98
 --- Exception: c00 (System Call) at 1f960160
 SP (1ecbe3c0) is in userspace

 These changes were originally part of
 http://mid.gmane.org/20130806042205.gr19...@iris.ozlabs.ibm.com

 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
  arch/powerpc/include/asm/kvm_book3s_asm.h | 1 +
  arch/powerpc/kernel/asm-offsets.c | 1 +
  arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 7 +++
  3 files changed, 5 insertions(+), 4 deletions(-)

 diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
 b/arch/powerpc/include/asm/kvm_book3s_asm.h
 index 0bd9348..69fe837 100644
 --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
 +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
 @@ -79,6 +79,7 @@ struct kvmppc_host_state {
   ulong vmhandler;
   ulong scratch0;
   ulong scratch1;
 + ulong scratch2;
   u8 in_guest;
   u8 restore_hid5;
   u8 napping;
 diff --git a/arch/powerpc/kernel/asm-offsets.c 
 b/arch/powerpc/kernel/asm-offsets.c
 index 8e6ede6..841a4c8 100644
 --- a/arch/powerpc/kernel/asm-offsets.c
 +++ b/arch/powerpc/kernel/asm-offsets.c
 @@ -583,6 +583,7 @@ int main(void)
   HSTATE_FIELD(HSTATE_VMHANDLER, vmhandler);
   HSTATE_FIELD(HSTATE_SCRATCH0, scratch0);
   HSTATE_FIELD(HSTATE_SCRATCH1, scratch1);
 + HSTATE_FIELD(HSTATE_SCRATCH2, scratch2);
   HSTATE_FIELD(HSTATE_IN_GUEST, in_guest);
   HSTATE_FIELD(HSTATE_RESTORE_HID5, restore_hid5);
   HSTATE_FIELD(HSTATE_NAPPING, napping);
 diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
 b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
 index 339aa5e..16f7654 100644
 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
 +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
 @@ -750,15 +750,14 @@ kvmppc_interrupt_hv:
* guest CR, R12 saved in shadow VCPU SCRATCH1/0
* guest R13 saved in SPRN_SCRATCH0
*/
 - /* abuse host_r2 as third scratch area; we get r2 from PACATOC(r13) */
 - std r9, HSTATE_HOST_R2(r13)
 + std r9, HSTATE_SCRATCH2(r13)
  
   lbz r9, HSTATE_IN_GUEST(r13)
   cmpwi   r9, KVM_GUEST_MODE_HOST_HV
   beq kvmppc_bad_host_intr
  #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
   cmpwi   r9, KVM_GUEST_MODE_GUEST
 - ld  r9, HSTATE_HOST_R2(r13)
 + ld  r9, HSTATE_SCRATCH2(r13)
   beq kvmppc_interrupt_pr
  #endif
   /* We're now back in the host but in guest MMU context */
 @@ -778,7 +777,7 @@ kvmppc_interrupt_hv:
   std r6, VCPU_GPR(R6)(r9)
   std r7, VCPU_GPR(R7)(r9)
   std r8, VCPU_GPR(R8)(r9)
 - ld  r0, HSTATE_HOST_R2(r13)
 + ld  r0, HSTATE_SCRATCH2(r13)
   std r0, VCPU_GPR(R9)(r9)
   std r10, VCPU_GPR(R10)(r9)
   std r11, VCPU_GPR(R11)(r9)
 -- 
 1.8.3.2




Re: [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values

2013-12-18 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 11.11.2013, at 15:02, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com 
 wrote:

 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 Don't try to compute these values.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
 
 NOTE: I am not sure why we were originally computing dsisr and dar. So maybe
 we need a variant of this patch. But with this and the additional patch
 "powerpc: book3s: PR: Enable Little Endian PR guest" I am able to get a
 Little Endian PR guest to boot.

 It's quite easy to find out - git blame tells you all the history and points 
 you to commit ca7f4203b.

 commit ca7f4203b9b66e12d0d9968ff7dfe781f3a9695a
 Author: Alexander Graf ag...@suse.de
 Date:   Wed Mar 24 21:48:28 2010 +0100

 KVM: PPC: Implement alignment interrupt

 Mac OS X has some applications - namely the Finder - that require 
 alignment
 interrupts to work properly. So we need to implement them.

 But the spec for 970 and 750 also looks different. While 750 requires the
 DSISR and DAR fields to reflect some instruction bits (DSISR) and the 
 fault
 address (DAR), the 970 declares this as an optional feature. So we need
 to reconstruct DSISR and DAR manually.

 Signed-off-by: Alexander Graf ag...@suse.de
 Signed-off-by: Avi Kivity a...@redhat.com

 Read this as on 970, alignment interrupts don't give us DSISR and DAR of the 
 faulting instruction as otherwise I wouldn't have implemented it.

 So this is clearly a nack on this patch :).

I can possibly add an if (cpu_has_feature(CPU_FTR_ARCH_201)) check. But do we need
to do that? According to Paul we should always find DAR.

-aneesh
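
For illustration, a rough sketch of the feature-gated variant mentioned above.
Both helper names are hypothetical stand-ins, and which check to use (a host CPU
feature bit or a guest PVR check) is exactly what this thread is debating:

u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
{
	/*
	 * cpu_supplies_alignment_dsisr() stands in for whatever test is
	 * settled on, e.g. a cpu_has_feature() check as suggested above.
	 */
	if (cpu_supplies_alignment_dsisr(vcpu))
		return vcpu->arch.fault_dsisr;	/* hardware gave us the value */

	return kvmppc_reconstruct_dsisr(inst);	/* stand-in for the existing
						 * manual computation */
}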



Re: [PATCH] powerpc: book3s: kvm: Use the saved dsisr and dar values

2013-12-19 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 19.12.2013 at 08:02, Aneesh Kumar K.V
 aneesh.ku...@linux.vnet.ibm.com wrote:
 
 Alexander Graf ag...@suse.de writes:
 
 On 11.11.2013, at 15:02, Aneesh Kumar K.V 
 aneesh.ku...@linux.vnet.ibm.com wrote:
 
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 Don't try to compute these values.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
 
 NOTE: I am not sure why we were originally computing dsisr and dar. So may 
 be
 we need a variant of this patch. But with this and the additional patch
 powerpc: book3s: PR: Enable Little Endian PR guest I am able to get a 
 Little Endian
 PR guest to boot.
 
 It's quite easy to find out - git blame tells you all the history and 
 points you to commit ca7f4203b.
 
 commit ca7f4203b9b66e12d0d9968ff7dfe781f3a9695a
 Author: Alexander Graf ag...@suse.de
 Date:   Wed Mar 24 21:48:28 2010 +0100
 
KVM: PPC: Implement alignment interrupt
 
Mac OS X has some applications - namely the Finder - that require 
 alignment
interrupts to work properly. So we need to implement them.
 
But the spec for 970 and 750 also looks different. While 750 requires the
DSISR and DAR fields to reflect some instruction bits (DSISR) and the 
 fault
address (DAR), the 970 declares this as an optional feature. So we need
to reconstruct DSISR and DAR manually.
 
Signed-off-by: Alexander Graf ag...@suse.de
Signed-off-by: Avi Kivity a...@redhat.com
 
 Read this as on 970, alignment interrupts don't give us DSISR and DAR of 
 the faulting instruction as otherwise I wouldn't have implemented it.
 
 So this is clearly a nack on this patch :).
 
 I can possibly do a if (cpu_has_feature(CPU_FTR_ARCH_201)). But do we need
 to do that ? According to Paul we should always find DAR.

 Paul only mentioned DAR, not DSISR. Please verify whether 970 gives us a 
 proper DAR value - we can then remove that part.

 But for DSISR I'm not convinced CPUs above 970 handle this
 correctly. So we would at least need a guest cpu check to find out
 whether the vcpu expects a working dsisr and emulate it then.


 I don't really fully understand the problem though. Why does the
 calculation break at all for you?


IIRC this was to get the little endian PR setup to work. This is to avoid
handling new instructions, because in little endian mode we get an
alignment interrupt for a larger instruction set.

-aneesh



[PATCH -V2] POWERPC: BOOK3S: KVM: Use the saved dsisr and dar values on book3s 64

2013-12-29 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Although it's optional, IBM POWER CPUs have always had the DAR value set on
alignment interrupts. So don't try to compute these values.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 502a47ac4453..d8e2d079483d 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -599,6 +599,19 @@ unprivileged:
 
 u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+   return vcpu->arch.fault_dsisr;
+#else
+   /*
+    * Mac OS X has some applications - namely the Finder - that require
+    * alignment interrupts to work properly. So we need to implement them.
+    *
+    * But the spec for 970 and 750 also looks different. While 750 requires
+    * the DSISR and DAR fields to reflect some instruction bits (DSISR) and
+    * the fault address (DAR), the 970 declares this as an optional feature.
+    * So we need to reconstruct DSISR and DAR manually.
+    */
+
u32 dsisr = 0;
 
/*
@@ -637,10 +650,24 @@ u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, 
unsigned int inst)
 dsisr |= (inst >> 16) & 0x03ff; /* bits 22:31 */
 
return dsisr;
+#endif
 }
 
 ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst)
 {
+#ifdef CONFIG_PPC_BOOK3S_64
+   return vcpu->arch.fault_dar;
+#else
+   /*
+    * Mac OS X has some applications - namely the Finder - that require
+    * alignment interrupts to work properly. So we need to implement them.
+    *
+    * But the spec for 970 and 750 also looks different. While 750 requires
+    * the DSISR and DAR fields to reflect some instruction bits (DSISR) and
+    * the fault address (DAR), the 970 declares this as an optional feature.
+    * So we need to reconstruct DSISR and DAR manually.
+    */
+
ulong dar = 0;
ulong ra = get_ra(inst);
ulong rb = get_rb(inst);
@@ -665,4 +692,5 @@ ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned 
int inst)
}
 
return dar;
+#endif
 }
-- 
1.8.3.2



[PATCH] powerpc: thp: Fix crash on mremap

2014-01-01 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch fixes the below crash:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.
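
(As a rough illustration of the layout described above, not part of the patch:
the deposited PTE page carrying the hpte slot information sits at a fixed offset
from the pmd entry, so it has to follow the pmd when mremap moves it. The helper
name and pointer arithmetic below are assumptions, not the actual ppc64 helpers.)

static pgtable_t ppc64_deposited_pgtable(pmd_t *pmdp)
{
	/* assumed: the deposit lives PTRS_PER_PMD entries past the pmd */
	pgtable_t *slot = (pgtable_t *)pmdp + PTRS_PER_PMD;

	return *slot;	/* only meaningful after a deposit on this pmd */
}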

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
NOTE:
For other archs we would just be removing the pgtable from the list and adding
it back. I didn't find an easy way to avoid that without lots of #ifdefs around.
Any suggestion around that is welcome.

 mm/huge_memory.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7de1bf85f683..eb2e60d9ba45 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1500,24 +1500,23 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
 */
ret = __pmd_trans_huge_lock(old_pmd, vma, old_ptl);
if (ret == 1) {
+   pgtable_t pgtable;
+
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-   if (new_ptl != old_ptl) {
-   pgtable_t pgtable;
-
-   /*
-* Move preallocated PTE page table if new_pmd is on
-* different PMD page table.
-*/
-   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
-   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
+   if (new_ptl != old_ptl)
spin_unlock(new_ptl);
-   }
spin_unlock(old_ptl);
}
 out:
-- 
1.8.3.2



[PATCH -V2] powerpc: thp: Fix crash on mremap

2014-01-02 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch fixes the below crash:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Changes from V1:
* limit the withdraw/deposit to only ppc64

 arch/Kconfig   |  3 +++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 include/linux/huge_mm.h|  6 ++
 mm/huge_memory.c   | 21 -
 4 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index f1cf895c040f..3759e70a649d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -371,6 +371,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
bool
 
+config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   bool
+
 config HAVE_ARCH_SOFT_DIRTY
bool
 
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index bca2465a9c34..5f83b4334e5f 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
+   select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 91672e2deec3..836242a738a5 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -230,4 +230,10 @@ static inline int do_huge_pmd_numa_page(struct mm_struct 
*mm, struct vm_area_str
 
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 1
+#else
+#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 0
+#endif
+
 #endif /* _LINUX_HUGE_MM_H */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7de1bf85f683..32006b51d102 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1505,19 +1505,22 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
-   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-   if (new_ptl != old_ptl) {
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*
+* With split pmd lock we also need to move preallocated
+* PTE page table if new_pmd is on different PMD page table.
+*/
+   if (new_ptl != old_ptl || ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW) {
pgtable_t pgtable;
-
-   /*
-* Move preallocated PTE page table if new_pmd is on
-* different PMD page table.
-*/
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-   spin_unlock(new_ptl);
}
+   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+   if (new_ptl != old_ptl)
+   spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
 out:
-- 
1.8.3.2



RE: [PATCH -V2] powerpc: thp: Fix crash on mremap

2014-01-02 Thread Aneesh Kumar K.V
Kirill A. Shutemov kirill.shute...@linux.intel.com writes:

 Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 This patch fix the below crash
 
 NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
 LR [c00439ac] .hash_page+0x18c/0x5e0
 ...
 Call Trace:
 [c00736103c40] [1b00] 0x1b00(unreliable)
 [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58
 
 On ppc64 we use the pgtable for storing the hpte slot information and
 store address to the pgtable at a constant offset (PTRS_PER_PMD) from
 pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
 the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
 from new pmd.
 
 We also want to move the withdraw and deposit before the set_pmd so
 that, when page fault find the pmd as trans huge we can be sure that
 pgtable can be located at the offset.
 
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
 Changes from V1:
 * limit the withraw/deposit to only ppc64
 
  arch/Kconfig   |  3 +++
  arch/powerpc/platforms/Kconfig.cputype |  1 +
  include/linux/huge_mm.h|  6 ++
  mm/huge_memory.c   | 21 -
  4 files changed, 22 insertions(+), 9 deletions(-)
 
 diff --git a/arch/Kconfig b/arch/Kconfig
 index f1cf895c040f..3759e70a649d 100644
 --- a/arch/Kconfig
 +++ b/arch/Kconfig
 @@ -371,6 +371,9 @@ config HAVE_IRQ_TIME_ACCOUNTING
  config HAVE_ARCH_TRANSPARENT_HUGEPAGE
  bool
  
 +config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW

 I don't like name of the option, but can't find any better... :(

 +bool
 +
  config HAVE_ARCH_SOFT_DIRTY
  bool
  
 diff --git a/arch/powerpc/platforms/Kconfig.cputype 
 b/arch/powerpc/platforms/Kconfig.cputype
 index bca2465a9c34..5f83b4334e5f 100644
 --- a/arch/powerpc/platforms/Kconfig.cputype
 +++ b/arch/powerpc/platforms/Kconfig.cputype
 @@ -71,6 +71,7 @@ config PPC_BOOK3S_64
  select PPC_FPU
  select PPC_HAVE_PMU_SUPPORT
  select SYS_SUPPORTS_HUGETLBFS
 +select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
  select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
  
  config PPC_BOOK3E_64
 diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
 index 91672e2deec3..836242a738a5 100644
 --- a/include/linux/huge_mm.h
 +++ b/include/linux/huge_mm.h
 @@ -230,4 +230,10 @@ static inline int do_huge_pmd_numa_page(struct 
 mm_struct *mm, struct vm_area_str
  
  #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
  
 +#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
 +#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 1
 +#else
 +#define ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW 0
 +#endif
 +

 Just use config option directly:

   if (new_ptl != old_ptl ||
   IS_ENABLED(CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW))


I didn't like that. I found the earlier one easier to read.
If you and others feel strongly about this, I can redo the patch. Please let me
know.


 ...


 Otherwise, looks good:

 Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com

  #endif /* _LINUX_HUGE_MM_H */
 diff --git a/mm/huge_memory.c b/mm/huge_memory.c
 index 7de1bf85f683..32006b51d102 100644
 --- a/mm/huge_memory.c
 +++ b/mm/huge_memory.c
 @@ -1505,19 +1505,22 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
 vm_area_struct *new_vma,
  spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
  pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
  VM_BUG_ON(!pmd_none(*new_pmd));
 -set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 -if (new_ptl != old_ptl) {
 +/*
 + * Archs like ppc64 use pgtable to store per pmd
 + * specific information. So when we switch the pmd,
 + * we should also withdraw and deposit the pgtable
 + *
 + * With split pmd lock we also need to move preallocated
 + * PTE page table if new_pmd is on different PMD page table.
 + */
 +if (new_ptl != old_ptl || ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW) {
  pgtable_t pgtable;
 -
 -/*
 - * Move preallocated PTE page table if new_pmd is on
 - * different PMD page table.
 - */
  pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
  pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
 -
 -spin_unlock(new_ptl);
  }
 +set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
 +if (new_ptl != old_ptl)
 +spin_unlock(new_ptl);
  spin_unlock(old_ptl);
  }
  out:
 -- 
 1.8.3.2

 -- 
  Kirill A. Shutemov


Re: [PATCH -V2] powerpc: thp: Fix crash on mremap

2014-01-02 Thread Aneesh Kumar K.V
Benjamin Herrenschmidt b...@kernel.crashing.org writes:

 On Thu, 2014-01-02 at 16:22 +0530, Aneesh Kumar K.V wrote:
  Just use config option directly:
 
if (new_ptl != old_ptl ||
IS_ENABLED(CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW))
 
 
 I didn't like that. I found the earlier one easier for reading.
 If you and others strongly feel about this, I can redo the patch.
 Please let me know

 Yes, use IS_ENABLED, no need to have two indirections of #define's

 Another option is to have

   if (pmd_move_must_withdraw(new,old)) {
   }

 With in a generic header:

 #ifndef pmd_move_must_withdraw
 static inline bool pmd_move_must_withdraw(spinlock_t *new_ptl, ...)
 {
   return new_ptl != old_ptl;
 }
 #endif

 And in powerpc:

 static inline bool pmd_move_must_withdraw(spinlock_t *new_ptl, ...)
 {
   return true;
 }
 #define pmd_move_must_withdraw pmd_move_must_withdraw

This is better I guess. It is also in line with the rest of the transparent
hugepage functions. I will do this.

-aneesh
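
Filled out, the override pattern sketched above looks roughly like this (the
"..." parameter lists from the mail are completed here as an assumption; the V3
patch later in the thread implements the same idea with an int return type).
Both fragments assume <linux/spinlock.h> is already included for spinlock_t:

/* include/asm-generic/pgtable.h: default -- move the deposited page table
 * only when the new pmd lives under a different page-table lock. */
#ifndef pmd_move_must_withdraw
static inline bool pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					  spinlock_t *old_pmd_ptl)
{
	return new_pmd_ptl != old_pmd_ptl;
}
#endif

/* arch/powerpc/include/asm/pgtable-ppc64.h: always withdraw, because the
 * deposited pgtable carries the hpte slot information for the huge page. */
static inline bool pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					  spinlock_t *old_pmd_ptl)
{
	return true;
}
#define pmd_move_must_withdraw pmd_move_must_withdraw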



[PATCH -V3 2/2] powerpc: thp: Fix crash on mremap

2014-01-06 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch fixes the below crash:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++
 include/asm-generic/pgtable.h| 12 
 mm/huge_memory.c | 14 +-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 9935e9b79524..ff3afce40f3b 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -12,6 +12,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/spinlock.h>
 /*
  * This is the default implementation of various PTE accessors, it's
  * used in all cases except Book3S with 64K pages where we have a
@@ -459,5 +460,18 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct 
mm_struct *mm, pmd_t *pmdp);
 #define __HAVE_ARCH_PMDP_INVALIDATE
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
+
+#define pmd_move_must_withdraw pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   return true;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index db0923458940..8e4f41d9af4d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* With split pmd lock we also need to move preallocated
+* PTE page table if new_pmd is on different PMD page table.
+*/
+   return new_pmd_ptl != old_pmd_ptl;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c0b17295ba0..b77bb5df4db9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
-   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-   if (new_ptl != old_ptl) {
-   pgtable_t pgtable;
 
-   /*
-* Move preallocated PTE page table if new_pmd is on
-* different PMD page table.
-*/
+   if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+   pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-   spin_unlock(new_ptl);
}
+   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+   if (new_ptl != old_ptl)
+   spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
 out:
-- 
1.8.3.2



[PATCH -V3 1/2] powerpc: mm: Move ppc64 page table range definitions to separate header

2014-01-06 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This avoids mmu-hash64.h including pgtable-ppc64.h. That inclusion
causes issues like:

  CC  arch/powerpc/kernel/asm-offsets.s
In file included from 
/home/aneesh/linus/arch/powerpc/include/asm/mmu-hash64.h:23:0,
 from /home/aneesh/linus/arch/powerpc/include/asm/mmu.h:196,
 from /home/aneesh/linus/arch/powerpc/include/asm/lppaca.h:36,
 from /home/aneesh/linus/arch/powerpc/include/asm/paca.h:21,
 from /home/aneesh/linus/arch/powerpc/include/asm/hw_irq.h:41,
 from /home/aneesh/linus/arch/powerpc/include/asm/irqflags.h:11,
 from include/linux/irqflags.h:15,
 from include/linux/spinlock.h:53,
 from include/linux/seqlock.h:35,
 from include/linux/time.h:5,
 from include/uapi/linux/timex.h:56,
 from include/linux/timex.h:56,
 from include/linux/sched.h:17,
 from arch/powerpc/kernel/asm-offsets.c:17:
/home/aneesh/linus/arch/powerpc/include/asm/pgtable-ppc64.h:563:42: error: 
unknown type name ‘spinlock_t’
 static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---

NOTE: We can either do this or stick a typedef struct spinlock spinlock_t; in
pgtable-ppc64.h

 arch/powerpc/include/asm/mmu-hash64.h  |   2 +-
 arch/powerpc/include/asm/pgtable-ppc64-range.h | 101 +
 arch/powerpc/include/asm/pgtable-ppc64.h   | 101 +
 3 files changed, 103 insertions(+), 101 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pgtable-ppc64-range.h

diff --git a/arch/powerpc/include/asm/mmu-hash64.h 
b/arch/powerpc/include/asm/mmu-hash64.h
index 807014dde821..895b4df31fec 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/mmu-hash64.h
@@ -20,7 +20,7 @@
  * need for various slices related matters. Note that this isn't the
  * complete pgtable.h but only a portion of it.
  */
-#include <asm/pgtable-ppc64.h>
+#include <asm/pgtable-ppc64-range.h>
 #include asm/bug.h
 
 /*
diff --git a/arch/powerpc/include/asm/pgtable-ppc64-range.h 
b/arch/powerpc/include/asm/pgtable-ppc64-range.h
new file mode 100644
index ..b48b089fb209
--- /dev/null
+++ b/arch/powerpc/include/asm/pgtable-ppc64-range.h
@@ -0,0 +1,101 @@
+#ifndef _ASM_POWERPC_PGTABLE_PPC64_RANGE_H_
+#define _ASM_POWERPC_PGTABLE_PPC64_RANGE_H_
+/*
+ * This file contains the functions and defines necessary to modify and use
+ * the ppc64 hashed page table.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <asm/pgtable-ppc64-64k.h>
+#else
+#include <asm/pgtable-ppc64-4k.h>
+#endif
+#include <asm/barrier.h>
+
+#define FIRST_USER_ADDRESS 0
+
+/*
+ * Size of EA range mapped by our pagetables.
+ */
+#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
+   PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+#define PGTABLE_RANGE (ASM_CONST(1) << PGTABLE_EADDR_SIZE)
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define PMD_CACHE_INDEX(PMD_INDEX_SIZE + 1)
+#else
+#define PMD_CACHE_INDEXPMD_INDEX_SIZE
+#endif
+/*
+ * Define the address range of the kernel non-linear virtual area
+ */
+
+#ifdef CONFIG_PPC_BOOK3E
+#define KERN_VIRT_START ASM_CONST(0x8000)
+#else
+#define KERN_VIRT_START ASM_CONST(0xD000)
+#endif
+#define KERN_VIRT_SIZE ASM_CONST(0x1000)
+
+/*
+ * The vmalloc space starts at the beginning of that region, and
+ * occupies half of it on hash CPUs and a quarter of it on Book3E
+ * (we keep a quarter for the virtual memmap)
+ */
+#define VMALLOC_START  KERN_VIRT_START
+#ifdef CONFIG_PPC_BOOK3E
+#define VMALLOC_SIZE   (KERN_VIRT_SIZE >> 2)
+#else
+#define VMALLOC_SIZE   (KERN_VIRT_SIZE >> 1)
+#endif
+#define VMALLOC_END(VMALLOC_START + VMALLOC_SIZE)
+
+/*
+ * The second half of the kernel virtual space is used for IO mappings,
+ * it's itself carved into the PIO region (ISA and PHB IO space) and
+ * the ioremap space
+ *
+ *  ISA_IO_BASE = KERN_IO_START, 64K reserved area
+ *  PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces
+ * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE
+ */
+#define KERN_IO_START  (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1))
+#define FULL_IO_SIZE   0x8000ul
+#define  ISA_IO_BASE   (KERN_IO_START)
+#define  ISA_IO_END(KERN_IO_START + 0x1ul)
+#define  PHB_IO_BASE   (ISA_IO_END)
+#define  PHB_IO_END(KERN_IO_START + FULL_IO_SIZE)
+#define IOREMAP_BASE   (PHB_IO_END)
+#define IOREMAP_END(KERN_VIRT_START + KERN_VIRT_SIZE)
+
+
+/*
+ * Region IDs
+ */
+#define REGION_SHIFT   60UL
+#define REGION_MASK    (0xfUL << REGION_SHIFT)
+#define REGION_ID(ea)  (((unsigned long)(ea)) >> REGION_SHIFT)
+
+#define VMALLOC_REGION_ID  (REGION_ID(VMALLOC_START))
+#define KERNEL_REGION_ID   (REGION_ID

Re: [PATCH -V3 1/2] powerpc: mm: Move ppc64 page table range definitions to separate header

2014-01-06 Thread Aneesh Kumar K.V
Benjamin Herrenschmidt b...@kernel.crashing.org writes:

 On Mon, 2014-01-06 at 14:33 +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 This avoid mmu-hash64.h including pagetable-ppc64.h. That inclusion
 cause issues like

 I don't like this. We have that stuff split into too many includes
 already it's a mess.

I understand. Let me know if you have any suggestions on cleaning that
up; I can do that.


 Why do we need to include it from mmu*.h ?

It is included from mmu-hash64.h, added by me via commit 78f1dbde9fd020419313c2a0c3b602ea2427118f:

/*
 * This is necessary to get the definition of PGTABLE_RANGE which we
 * need for various slices related matters. Note that this isn't the
 * complete pgtable.h but only a portion of it.
 */
#include <asm/pgtable-ppc64.h>

-aneesh



[PATCH V4] powerpc: thp: Fix crash on mremap

2014-01-12 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch fixes the below crash:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Changes from V3:
* Drop powerpc: mm: Move ppc64 page table range definitions to separate 
header patch

 arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++
 include/asm-generic/pgtable.h| 12 
 mm/huge_memory.c | 14 +-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 4a191c472867..d27960c89a71 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct 
mm_struct *mm, pmd_t *pmdp);
 #define __HAVE_ARCH_PMDP_INVALIDATE
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
+
+#define pmd_move_must_withdraw pmd_move_must_withdraw
+typedef struct spinlock spinlock_t;
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   return true;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index db0923458940..8e4f41d9af4d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* With split pmd lock we also need to move preallocated
+* PTE page table if new_pmd is on different PMD page table.
+*/
+   return new_pmd_ptl != old_pmd_ptl;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..5d80c53b87cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
-   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-   if (new_ptl != old_ptl) {
-   pgtable_t pgtable;
 
-   /*
-* Move preallocated PTE page table if new_pmd is on
-* different PMD page table.
-*/
+   if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+   pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-   spin_unlock(new_ptl);
}
+   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+   if (new_ptl != old_ptl)
+   spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
 out:
-- 
1.8.3.2



Re: [PATCH V4] powerpc: thp: Fix crash on mremap

2014-01-13 Thread Aneesh Kumar K.V
Benjamin Herrenschmidt b...@kernel.crashing.org writes:

 On Mon, 2014-01-13 at 11:34 +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 This patch fix the below crash

 Andrea, can you ack the generic bit please ?

 Thanks !

Kirill A. Shutemov did ack an earlier version

http://article.gmane.org/gmane.linux.kernel.mm/111368

-aneesh



Re: [RFC PATCH powerpc] Fix compile error of pgtable-ppc64.h

2014-01-16 Thread Aneesh Kumar K.V
Li Zhong zh...@linux.vnet.ibm.com writes:

 It seems that forward declaration couldn't work well with typedef, use
 struct spinlock directly to avoiding following build errors:

 In file included from include/linux/spinlock.h:81,
  from include/linux/seqlock.h:35,
  from include/linux/time.h:5,
  from include/uapi/linux/timex.h:56,
  from include/linux/timex.h:56,
  from include/linux/sched.h:17,
  from arch/powerpc/kernel/asm-offsets.c:17:
 include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t'
 /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous 
 declaration of 'spinlock_t' was here


What compiler version? I have seen that error with gcc 4.3, and it was
concluded that it is too old a compiler version to worry about. That
specific compiler version also gave an error for forward-declaring the struct.
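
A standalone illustration of the two declaration styles (assuming an older
compiler such as gcc 4.3 that rejects repeated typedefs; C11 and newer gcc
accept identical redefinitions):

struct spinlock { int dummy; };

typedef struct spinlock spinlock_t;	/* e.g. from pgtable-ppc64.h */
typedef struct spinlock spinlock_t;	/* e.g. from spinlock_types.h:
					 * "redefinition of typedef 'spinlock_t'"
					 * on such compilers */

struct spinlock;			/* a plain forward declaration ... */
struct spinlock;			/* ... can be repeated without error */

int main(void)
{
	return 0;
}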

 Signed-off-by: Li Zhong zh...@linux.vnet.ibm.com
 ---
  arch/powerpc/include/asm/pgtable-ppc64.h |6 +++---
  1 files changed, 3 insertions(+), 3 deletions(-)

 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
 b/arch/powerpc/include/asm/pgtable-ppc64.h
 index d27960c..bc141c9 100644
 --- a/arch/powerpc/include/asm/pgtable-ppc64.h
 +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
 @@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, 
 unsigned long address,
   pmd_t *pmdp);

  #define pmd_move_must_withdraw pmd_move_must_withdraw
 -typedef struct spinlock spinlock_t;
 -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
 -  spinlock_t *old_pmd_ptl)
 +struct spinlock;
 +static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 +  struct spinlock *old_pmd_ptl)
  {
   /*
* Archs like ppc64 use pgtable to store per pmd



Re: [PATCH 2/4] powernv: kvm: make _PAGE_NUMA take effect

2014-01-20 Thread Aneesh Kumar K.V
Liu Ping Fan kernelf...@gmail.com writes:

 To make _PAGE_NUMA take effect, we should force the checking when
 guest uses hypercall to setup hpte.

 Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com
 ---
  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
 b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 index 9c51544..af8602d 100644
 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 @@ -232,7 +232,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
 flags,
   /* Look up the Linux PTE for the backing page */
   pte_size = psize;
   pte = lookup_linux_pte(pgdir, hva, writing, pte_size);
 - if (pte_present(pte)) {
 + if (pte_present(pte) && !pte_numa(pte)) {
   if (writing && !pte_write(pte))
   /* make the actual HPTE be read-only */
   ptel = hpte_make_readonly(ptel);

How did we end up doing an h_enter on a pte entry with the pte_numa bit set?

-aneesh



Re: [PATCH 0/4] powernv: kvm: numa fault improvement

2014-01-20 Thread Aneesh Kumar K.V
Liu ping fan kernelf...@gmail.com writes:

 On Thu, Jan 9, 2014 at 8:08 PM, Alexander Graf ag...@suse.de wrote:

 On 11.12.2013, at 09:47, Liu Ping Fan kernelf...@gmail.com wrote:

 This series is based on Aneesh's series  [PATCH -V2 0/5] powerpc: mm: Numa 
 faults support for ppc64

 For this series, I apply the same idea from the previous thread [PATCH 
 0/3] optimize for powerpc _PAGE_NUMA
 (for which, I still try to get a machine to show nums)

 But for this series, I think that I have a good justification -- the fact 
 of heavy cost when switching context between guest and host,
 which is  well known.

 This cover letter isn't really telling me anything. Please put a proper 
 description of what you're trying to achieve, why you're trying to achieve 
 what you're trying and convince your readers that it's a good idea to do it 
 the way you do it.

 Sorry for the unclear message. After introducing the _PAGE_NUMA,
 kvmppc_do_h_enter() can not fill up the hpte for guest. Instead, it
 should rely on host's kvmppc_book3s_hv_page_fault() to call
 do_numa_page() to do the numa fault check. This incurs the overhead
 when exiting from rmode to vmode.  My idea is that in
 kvmppc_do_h_enter(), we do a quick check, if the page is right placed,
 there is no need to exit to vmode (i.e saving htab, slab switching)

Can you explain more? Are we looking at an hcall from the guest, with the
hypervisor handling it in real mode? If so, why would the guest issue an
hcall on a pte entry that has PAGE_NUMA set? Or is this about the
hypervisor handling a missing hpte because the host swapped this page
out? In that case, how do we end up in h_enter? IIUC, for that case we
should get to kvmppc_hpte_hv_fault.



 If my suppose is correct, will CCing k...@vger.kernel.org from next version.

 This translates to me as This is an RFC?

 Yes, I am not quite sure about it. I have no bare-metal to verify it.
 So I hope at least, from the theory, it is correct.


-aneesh



Re: [PATCH 0/4] powernv: kvm: numa fault improvement

2014-01-20 Thread Aneesh Kumar K.V
Liu ping fan kernelf...@gmail.com writes:

 On Mon, Jan 20, 2014 at 11:45 PM, Aneesh Kumar K.V
 aneesh.ku...@linux.vnet.ibm.com wrote:
 Liu ping fan kernelf...@gmail.com writes:

 On Thu, Jan 9, 2014 at 8:08 PM, Alexander Graf ag...@suse.de wrote:

 On 11.12.2013, at 09:47, Liu Ping Fan kernelf...@gmail.com wrote:

 This series is based on Aneesh's series  [PATCH -V2 0/5] powerpc: mm: 
 Numa faults support for ppc64

 For this series, I apply the same idea from the previous thread [PATCH 
 0/3] optimize for powerpc _PAGE_NUMA
 (for which, I still try to get a machine to show nums)

 But for this series, I think that I have a good justification -- the fact 
 of heavy cost when switching context between guest and host,
 which is  well known.

 This cover letter isn't really telling me anything. Please put a proper 
 description of what you're trying to achieve, why you're trying to achieve 
 what you're trying and convince your readers that it's a good idea to do 
 it the way you do it.

 Sorry for the unclear message. After introducing the _PAGE_NUMA,
 kvmppc_do_h_enter() can not fill up the hpte for guest. Instead, it
 should rely on host's kvmppc_book3s_hv_page_fault() to call
 do_numa_page() to do the numa fault check. This incurs the overhead
 when exiting from rmode to vmode.  My idea is that in
 kvmppc_do_h_enter(), we do a quick check, if the page is right placed,
 there is no need to exit to vmode (i.e saving htab, slab switching)

 Can you explain more. Are we looking at hcall from guest  and
 hypervisor handling them in real mode ? If so why would guest issue a
 hcall on a pte entry that have PAGE_NUMA set. Or is this about
 hypervisor handling a missing hpte, because of host swapping this page
 out ? In that case how we end up in h_enter ? IIUC for that case we
 should get to kvmppc_hpte_hv_fault.

 After setting _PAGE_NUMA, we should flush out all hptes both in host's
 htab and guest's. So when guest tries to access memory, host finds
 that there is not hpte ready for guest in guest's htab. And host
 should raise dsi to guest.

Now the guest receives that fault, removes the PAGE_NUMA bit and does an
hpte_insert. So before we do an hpte_insert (or H_ENTER) we should have
cleared the PAGE_NUMA bit.

This incurs that guest ends up in h_enter.
 And you can see in current code, we also try this quick path firstly.
 Only if fail, we will resort to slow path --  kvmppc_hpte_hv_fault.

hmm ? hpte_hv_fault is the hypervisor handling the fault.

-aneesh



Re: [PATCH v2] powernv: kvm: make _PAGE_NUMA take effect

2014-01-21 Thread Aneesh Kumar K.V
Liu Ping Fan kernelf...@gmail.com writes:

 To make sure that on host, the pages marked with _PAGE_NUMA result in a fault
 when guest access them, we should force the checking when guest uses hypercall
 to setup hpte.

 Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com

Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

When we mark a pte with _PAGE_NUMA we already call
mmu_notifier_invalidate_range_start and
mmu_notifier_invalidate_range_end, which will mark the existing guest hpte
entries as HPTE_V_ABSENT. Now we need to do the same when we are inserting new
guest hpte entries. This patch does that.

 ---
 v2:
   It should be the reply to [PATCH 2/4] powernv: kvm: make _PAGE_NUMA take 
 effect
   And I imporve the changelog according to Aneesh's suggestion.
 ---
  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
 b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 index 9c51544..af8602d 100644
 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
 @@ -232,7 +232,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long 
 flags,
   /* Look up the Linux PTE for the backing page */
   pte_size = psize;
   pte = lookup_linux_pte(pgdir, hva, writing, pte_size);
 - if (pte_present(pte)) {
 + if (pte_present(pte) && !pte_numa(pte)) {
   if (writing && !pte_write(pte))
   /* make the actual HPTE be read-only */
   ptel = hpte_make_readonly(ptel);
 -- 
 1.8.1.4



Re: [PATCH 0/4] powernv: kvm: numa fault improvement

2014-01-21 Thread Aneesh Kumar K.V
Paul Mackerras pau...@samba.org writes:

 On Mon, Jan 20, 2014 at 03:48:36PM +0100, Alexander Graf wrote:
 
 On 15.01.2014, at 07:36, Liu ping fan kernelf...@gmail.com wrote:
 
  On Thu, Jan 9, 2014 at 8:08 PM, Alexander Graf ag...@suse.de wrote:
  
  On 11.12.2013, at 09:47, Liu Ping Fan kernelf...@gmail.com wrote:
  
  This series is based on Aneesh's series  [PATCH -V2 0/5] powerpc: mm: 
  Numa faults support for ppc64
  
  For this series, I apply the same idea from the previous thread [PATCH 
  0/3] optimize for powerpc _PAGE_NUMA
  (for which, I still try to get a machine to show nums)
  
  But for this series, I think that I have a good justification -- the 
  fact of heavy cost when switching context between guest and host,
  which is  well known.
  
  This cover letter isn't really telling me anything. Please put a proper 
  description of what you're trying to achieve, why you're trying to 
  achieve what you're trying and convince your readers that it's a good 
  idea to do it the way you do it.
  
  Sorry for the unclear message. After introducing the _PAGE_NUMA,
  kvmppc_do_h_enter() can not fill up the hpte for guest. Instead, it
  should rely on host's kvmppc_book3s_hv_page_fault() to call
  do_numa_page() to do the numa fault check. This incurs the overhead
  when exiting from rmode to vmode.  My idea is that in
  kvmppc_do_h_enter(), we do a quick check, if the page is right placed,
  there is no need to exit to vmode (i.e saving htab, slab switching)
  
  If my suppose is correct, will CCing k...@vger.kernel.org from next 
  version.
  
  This translates to me as This is an RFC?
  
  Yes, I am not quite sure about it. I have no bare-metal to verify it.
  So I hope at least, from the theory, it is correct.
 
 Paul, could you please give this some thought and maybe benchmark it?

 OK, once I get Aneesh to tell me how I get to have ptes with
 _PAGE_NUMA set in the first place. :)


I guess we want patch 2, which Liu has sent separately and I have
reviewed: http://article.gmane.org/gmane.comp.emulators.kvm.powerpc.devel/8619
I am not sure about the rest of the patches in the series.
We definitely don't want to NUMA-migrate on h_enter. We may want to do
that on fault. But even there, IMHO, we should let the host take the
fault and do the NUMA migration instead of doing this in guest context.

-aneesh



Re: [PATCH v2] powernv: kvm: make _PAGE_NUMA take effect

2014-01-27 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 21.01.2014, at 10:42, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com 
 wrote:

 Liu Ping Fan kernelf...@gmail.com writes:
 
 To make sure that on host, the pages marked with _PAGE_NUMA result in a 
 fault
 when guest access them, we should force the checking when guest uses 
 hypercall
 to setup hpte.
 
 Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com
 
 Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 When we mark pte with _PAGE_NUMA we already call 
 mmu_notifier_invalidate_range_start and
 mmu_notifier_invalidate_range_end, which will mark existing guest hpte
 entry as HPTE_V_ABSENT. Now we need to do that when we are inserting new
 guest hpte entries. This patch does that. 

 So what happens next? We insert a page into the HTAB without
 HPTE_V_VALID set, so the guest will fail to use it. If the guest does
 an H_READ on it it will suddenly turn to V_VALID though?

As per the guest the entry is valid, so yes, an H_READ should return a
valid entry. But in the real hpte we would mark it not valid.


 I might need a crash course in the use of HPTE_V_ABSENT.

When the guest tries to access the address, the host will handle the fault.

kvmppc_hpte_hv_fault should give more info

-aneesh
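
A very rough sketch of the state being described (flag usage is inferred from
this thread, not quoted from the KVM HV sources): the guest-visible HPTE stays
valid, while the entry the hardware sees is kept invalid and tagged
HPTE_V_ABSENT, so a guest access faults into the host, where
kvmppc_hpte_hv_fault can resolve NUMA placement or swap-in before the entry
becomes truly valid.

/* Illustrative predicate only -- see the caveat above. */
static bool hpte_absent_but_guest_valid(unsigned long hpte_v)
{
	return (hpte_v & HPTE_V_ABSENT) && !(hpte_v & HPTE_V_VALID);
}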



Re: [PATCH v2] powernv: kvm: make _PAGE_NUMA take effect

2014-01-27 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 27.01.2014, at 11:28, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com 
 wrote:

 Alexander Graf ag...@suse.de writes:
 
 On 21.01.2014, at 10:42, Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com 
 wrote:
 
 Liu Ping Fan kernelf...@gmail.com writes:
 
 To make sure that on host, the pages marked with _PAGE_NUMA result in a 
 fault
 when guest access them, we should force the checking when guest uses 
 hypercall
 to setup hpte.
 
 Signed-off-by: Liu Ping Fan pingf...@linux.vnet.ibm.com
 
 Reviewed-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 When we mark pte with _PAGE_NUMA we already call 
 mmu_notifier_invalidate_range_start and
 mmu_notifier_invalidate_range_end, which will mark existing guest hpte
 entry as HPTE_V_ABSENT. Now we need to do that when we are inserting new
 guest hpte entries. This patch does that. 
 
 So what happens next? We insert a page into the HTAB without
 HPTE_V_VALID set, so the guest will fail to use it. If the guest does
 an H_READ on it it will suddenly turn to V_VALID though?
 
 As per the guest the entry is valid, so yes an hread should return a
 valid entry. But in real hpte we would mark it not valid.

 Ah, yes.

 
 
 I might need a crash course in the use of HPTE_V_ABSENT.
 
 When guest tries to access the address, the host will handle the fault.
 
 kvmppc_hpte_hv_fault should give more info

 Thanks for the pointer. So we fault it in lazily. Is there any
 particular reason we can't do that on h_enter already? After all this
 just means an additional roundtrip because the guest is pretty likely
 to use the page it just entered, no?

We could get wrong numa fault information if we didn't do h_enter from
the right node from which we faulted.

-aneesh



[PATCH V3] POWERPC: BOOK3S: KVM: Use the saved dsisr and dar values on book3s 64

2014-01-27 Thread Aneesh Kumar K.V
Although it's optional, IBM POWER CPUs have always had the DAR value set on
alignment interrupts. So don't try to compute these values.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Changes from V2:
* Depend on a cpu feature flag to decide whether to use fault_dsisr or not

 arch/powerpc/include/asm/cputable.h|  1 +
 arch/powerpc/include/asm/disassemble.h | 34 +
 arch/powerpc/kernel/align.c| 34 +
 arch/powerpc/kernel/cputable.c | 15 +++-
 arch/powerpc/kvm/book3s_emulate.c  | 69 --
 5 files changed, 82 insertions(+), 71 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 0d4939ba48e7..1922dce6124d 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -555,6 +555,7 @@ static inline int cpu_has_feature(unsigned long feature)
 }
 
 #define HBP_NUM 1
+extern struct cpu_spec *find_cpuspec(unsigned int pvr);
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/disassemble.h 
b/arch/powerpc/include/asm/disassemble.h
index 856f8deb557a..6330a61b875a 100644
--- a/arch/powerpc/include/asm/disassemble.h
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -81,4 +81,38 @@ static inline unsigned int get_oc(u32 inst)
 {
 return (inst >> 11) & 0x7fff;
 }
+
+#define IS_XFORM(inst) (get_op(inst) == 31)
+#define IS_DSFORM(inst)        (get_op(inst) >= 56)
+
+/*
+ * Create a DSISR value from the instruction
+ */
+static inline unsigned make_dsisr(unsigned instr)
+{
+   unsigned dsisr;
+
+
+   /* bits  6:15 --> 22:31 */
+   dsisr = (instr & 0x03ff0000) >> 16;
+
+   if (IS_XFORM(instr)) {
+           /* bits 29:30 --> 15:16 */
+           dsisr |= (instr & 0x00000006) << 14;
+           /* bit     25 -->    17 */
+           dsisr |= (instr & 0x00000040) << 8;
+           /* bits 21:24 --> 18:21 */
+           dsisr |= (instr & 0x00000780) << 3;
+   } else {
+           /* bit      5 -->    17 */
+           dsisr |= (instr & 0x04000000) >> 12;
+           /* bits  1: 4 --> 18:21 */
+           dsisr |= (instr & 0x78000000) >> 17;
+           /* bits 30:31 --> 12:13 */
+           if (IS_DSFORM(instr))
+                   dsisr |= (instr & 0x00000003) << 18;
+   }
+
+   return dsisr;
+}
 #endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index de91f3ae631e..111d93ec7f34 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -25,14 +25,13 @@
 #include <asm/cputable.h>
 #include <asm/emulated_ops.h>
 #include <asm/switch_to.h>
+#include <asm/disassemble.h>
 
 struct aligninfo {
unsigned char len;
unsigned char flags;
 };
 
-#define IS_XFORM(inst) (((inst) >> 26) == 31)
-#define IS_DSFORM(inst)        (((inst) >> 26) >= 56)
 
 #define INVALID{ 0, 0 }
 
@@ -192,37 +191,6 @@ static struct aligninfo aligninfo[128] = {
 };
 
 /*
- * Create a DSISR value from the instruction
- */
-static inline unsigned make_dsisr(unsigned instr)
-{
-   unsigned dsisr;
-
-
-   /* bits  6:15 --> 22:31 */
-   dsisr = (instr & 0x03ff0000) >> 16;
-
-   if (IS_XFORM(instr)) {
-           /* bits 29:30 --> 15:16 */
-           dsisr |= (instr & 0x00000006) << 14;
-           /* bit     25 -->    17 */
-           dsisr |= (instr & 0x00000040) << 8;
-           /* bits 21:24 --> 18:21 */
-           dsisr |= (instr & 0x00000780) << 3;
-   } else {
-           /* bit      5 -->    17 */
-           dsisr |= (instr & 0x04000000) >> 12;
-           /* bits  1: 4 --> 18:21 */
-           dsisr |= (instr & 0x78000000) >> 17;
-           /* bits 30:31 --> 12:13 */
-           if (IS_DSFORM(instr))
-                   dsisr |= (instr & 0x00000003) << 18;
-   }
-
-   return dsisr;
-}
-
-/*
  * The dcbz (data cache block zero) instruction
  * gives an alignment fault if used on non-cacheable
  * memory.  We handle the fault mainly for the
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 597d954e5860..b367f5b772f6 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -116,7 +116,7 @@ extern void __restore_cpu_e6500(void);
 PPC_FEATURE_BOOKE)
 #endif
 
-static struct cpu_spec __initdata cpu_specs[] = {
+static struct cpu_spec cpu_specs[] = {
 #ifdef CONFIG_PPC_BOOK3S_64
{   /* Power3 */
.pvr_mask   = 0x,
@@ -2258,3 +2258,16 @@ struct cpu_spec * __init identify_cpu(unsigned long 
offset, unsigned int pvr)
 
return NULL;
 }
+
+struct cpu_spec *find_cpuspec(unsigned int pvr)
+{
+   int i;
+   struct cpu_spec *s = cpu_specs;
+
+   for (i = 0; i < ARRAY_SIZE(cpu_specs); i++, s++) {
+           if ((pvr & s->pvr_mask) == s->pvr_value)
+   return s;
+   }
+   return NULL

[PATCH V3] KVM: PPC: BOOK3S: PR: Enable Little Endian PR guest

2014-01-27 Thread Aneesh Kumar K.V
This patch makes sure we inherit the LE bit correctly in the different cases
so that we can run a Little Endian distro in PR mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
Changes from V2:
 * Move H_SET_MODE to qemu


 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/kernel/asm-offsets.c   |  1 +
 arch/powerpc/kvm/book3s_64_mmu.c|  2 +-
 arch/powerpc/kvm/book3s_pr.c| 32 +++-
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 207b7826c9b1..f4be7be14330 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -550,6 +550,7 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_PPC_BOOK3S
ulong fault_dar;
u32 fault_dsisr;
+   unsigned long intr_msr;
 #endif
 
 #ifdef CONFIG_BOOKE
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index b754f629a177..7484676b8f25 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -518,6 +518,7 @@ int main(void)
DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr));
DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr));
DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar));
+   DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr));
DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));
DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap));
DEFINE(VCPU_PTID, offsetof(struct kvm_vcpu, arch.ptid));
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 83da1f868fd5..8231b83c493b 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -38,7 +38,7 @@
 
 static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu)
 {
-   kvmppc_set_msr(vcpu, MSR_SF);
+	kvmppc_set_msr(vcpu, vcpu->arch.intr_msr);
 }
 
 static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe(
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index eb070eb4da40..828056ec208f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -263,7 +263,7 @@ static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu)
 	ulong smsr = vcpu->arch.shared->msr;

 	/* Guest MSR values */
-	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE;
+	smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE;
/* Process MSR values */
smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE;
/* External providers the guest reserved */
@@ -1178,6 +1178,15 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
break;
}
 #endif /* CONFIG_VSX */
+   case KVM_REG_PPC_LPCR:
+   /*
+* We are only interested in the LPCR_ILE bit
+*/
+		if (vcpu->arch.intr_msr & MSR_LE)
+   *val = get_reg_val(id, LPCR_ILE);
+   else
+   *val = get_reg_val(id, 0);
+   break;
default:
r = -EINVAL;
break;
@@ -1186,6 +1195,23 @@ static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
return r;
 }
 
+static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr)
+{
+	struct kvm *kvm = vcpu->kvm;
+   /*
+* If ILE (interrupt little-endian) has changed, update the
+* MSR_LE bit in the intr_msr for each vcpu in this vcore.
+*/
+	if ((new_lpcr & LPCR_ILE) != (vcpu->arch.intr_msr & MSR_LE)) {
+		mutex_lock(&kvm->lock);
+		if (new_lpcr & LPCR_ILE)
+			vcpu->arch.intr_msr |= MSR_LE;
+		else
+			vcpu->arch.intr_msr &= ~MSR_LE;
+		mutex_unlock(&kvm->lock);
+   }
+}
+
 static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id,
 union kvmppc_one_reg *val)
 {
@@ -1209,6 +1235,9 @@ static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, 
u64 id,
break;
}
 #endif /* CONFIG_VSX */
+   case KVM_REG_PPC_LPCR:
+   kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val));
+   break;
default:
r = -EINVAL;
break;
@@ -1261,6 +1290,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct 
kvm *kvm,
vcpu-arch.pvr = 0x3C0301;
if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
vcpu-arch.pvr = mfspr(SPRN_PVR);
+	vcpu->arch.intr_msr = MSR_SF;
 #else
/* default to book3s_32 (750) */
vcpu-arch.pvr = 0x84202;
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] powerpc: thp: Fix crash on mremap

2014-01-28 Thread Aneesh Kumar K.V
This patch fixes the crash below:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.
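
As a condensed, illustrative-only view of the fixed flow on ppc64 (i.e. with
the new CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW selected); the function name
is hypothetical, locking is omitted, and this is not the literal kernel code:

	static void move_thp_pmd_ppc64(struct mm_struct *mm,
				       unsigned long old_addr, unsigned long new_addr,
				       pmd_t *old_pmd, pmd_t *new_pmd)
	{
		pgtable_t pgtable;
		pmd_t pmd;

		pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
		/*
		 * Move the deposited pgtable, which also carries the hpte slot
		 * information on ppc64, before the new pmd becomes visible, so
		 * a parallel hash fault that sees a trans huge pmd can find it
		 * at the expected offset from the new pmd.
		 */
		pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
		pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
		set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
	}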

variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.11 stable series

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/Kconfig   |  3 +++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 mm/huge_memory.c   | 12 
 3 files changed, 16 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index 1feb169274fe..c5863b35d054 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -368,6 +368,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 config HAVE_ARCH_SOFT_DIRTY
bool
 
+config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   bool
+
 config HAVE_MOD_ARCH_SPECIFIC
bool
help
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 47d9a03dd415..d11a34be018d 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
+   select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 12acb0ba7991..beaa7cc9de75 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1461,8 +1461,20 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
 
ret = __pmd_trans_huge_lock(old_pmd, vma);
if (ret == 1) {
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   pgtable_t pgtable;
+#endif
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+#endif
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
spin_unlock(mm-page_table_lock);
}
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH] powerpc: thp: Fix crash on mremap

2014-01-28 Thread Aneesh Kumar K.V
This patch fixes the crash below:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.12 stable series

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/Kconfig   |  3 +++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 mm/huge_memory.c   | 12 
 3 files changed, 16 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index af2cc6eabcc7..bca9e7a18bd2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -365,6 +365,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 config HAVE_ARCH_SOFT_DIRTY
bool
 
+config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   bool
+
 config HAVE_MOD_ARCH_SPECIFIC
bool
help
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 6704e2e20e6b..0225011231ea 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
+   select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 292a266e0d42..89b7a647f1cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1474,8 +1474,20 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
 
ret = __pmd_trans_huge_lock(old_pmd, vma);
if (ret == 1) {
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   pgtable_t pgtable;
+#endif
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+#endif
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
spin_unlock(mm-page_table_lock);
}
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 2/2] Fix compile error of pgtable-ppc64.h

2014-01-28 Thread Aneesh Kumar K.V
From: Li Zhong zh...@linux.vnet.ibm.com

It seems that a forward declaration doesn't work well with a typedef; use
struct spinlock directly to avoid the following build errors:

In file included from include/linux/spinlock.h:81,
 from include/linux/seqlock.h:35,
 from include/linux/time.h:5,
 from include/uapi/linux/timex.h:56,
 from include/linux/timex.h:56,
 from include/linux/sched.h:17,
 from arch/powerpc/kernel/asm-offsets.c:17:
include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t'
/root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous 
declaration of 'spinlock_t' was here
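
For illustration only (this reduction is mine, not part of the patch), the
failure is the typedef being issued twice, which gcc releases before 4.6
reject even when both declarations agree, while a plain struct forward
declaration is always accepted:

	/* what pgtable-ppc64.h effectively did */
	typedef struct spinlock spinlock_t;

	/* what spinlock_types.h does later in the include chain */
	typedef struct spinlock { int rawlock; } spinlock_t;	/* gcc 4.5: redefinition of typedef 'spinlock_t' */

	/* the fix: forward-declare the struct and use it directly */
	struct spinlock;
	static inline int uses_lock(struct spinlock *ptl) { return ptl != (void *)0; }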

build fix for upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.13 stable series

Signed-off-by: Li Zhong zh...@linux.vnet.ibm.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index d27960c89a71..bc141c950b1e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, 
unsigned long address,
pmd_t *pmdp);
 
 #define pmd_move_must_withdraw pmd_move_must_withdraw
-typedef struct spinlock spinlock_t;
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-spinlock_t *old_pmd_ptl)
+struct spinlock;
+static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+struct spinlock *old_pmd_ptl)
 {
/*
 * Archs like ppc64 use pgtable to store per pmd
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH 1/2] powerpc/thp: Fix crash on mremap

2014-01-28 Thread Aneesh Kumar K.V
This patch fixes the crash below:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.13 stable series

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++
 include/asm-generic/pgtable.h| 12 
 mm/huge_memory.c | 14 +-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 4a191c472867..d27960c89a71 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct 
mm_struct *mm, pmd_t *pmdp);
 #define __HAVE_ARCH_PMDP_INVALIDATE
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
+
+#define pmd_move_must_withdraw pmd_move_must_withdraw
+typedef struct spinlock spinlock_t;
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   return true;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index db0923458940..8e4f41d9af4d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* With split pmd lock we also need to move preallocated
+* PTE page table if new_pmd is on different PMD page table.
+*/
+   return new_pmd_ptl != old_pmd_ptl;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..5d80c53b87cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
-   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-   if (new_ptl != old_ptl) {
-   pgtable_t pgtable;
 
-   /*
-* Move preallocated PTE page table if new_pmd is on
-* different PMD page table.
-*/
+   if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+   pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-   spin_unlock(new_ptl);
}
+   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+   if (new_ptl != old_ptl)
+   spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
 out:
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 01/10] KVM: PPC: BOOK3S: PR: Add POWER8 support

2014-01-28 Thread Aneesh Kumar K.V
Hello,

This patch series implements PR KVM support for POWER8 platform

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 01/10] KVM: PPC: BOOK3S: PR: Fix PURR and SPURR emulation

2014-01-28 Thread Aneesh Kumar K.V
We definitely don't need to emulate mtspr, because both of these registers
are hypervisor resources.
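
As a rough sketch (condensed from the hunks below, illustrative only; the
helper name is hypothetical and this is not the literal kernel code), the
accounting becomes:

	static void pr_account_purr_spurr(struct kvm_vcpu *vcpu, int entering_guest)
	{
		if (entering_guest) {
			/* guest entry (kvmppc_copy_to_svcpu): remember the timebase */
			vcpu->arch.entry_tb = get_tb();
		} else {
			/* guest exit (kvmppc_copy_from_svcpu): charge the elapsed ticks */
			vcpu->arch.purr  += get_tb() - vcpu->arch.entry_tb;
			vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
		}
		/* mfspr emulation then simply returns vcpu->arch.purr / vcpu->arch.spurr */
	}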

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 --
 arch/powerpc/include/asm/kvm_host.h   |  4 ++--
 arch/powerpc/kvm/book3s_emulate.c | 16 
 arch/powerpc/kvm/book3s_pr.c  | 10 ++
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index bc23b1ba7980..396448afa38b 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -83,8 +83,6 @@ struct kvmppc_vcpu_book3s {
u64 sdr1;
u64 hior;
u64 msr_mask;
-   u64 purr_offset;
-   u64 spurr_offset;
 #ifdef CONFIG_PPC_BOOK3S_32
u32 vsid_pool[VSID_POOL_SIZE];
u32 vsid_next;
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 9a0cdb2c9d58..0a3785271f34 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -506,8 +506,8 @@ struct kvm_vcpu_arch {
 #ifdef CONFIG_BOOKE
u32 decar;
 #endif
-   u32 tbl;
-   u32 tbu;
+   /* Time base value when we entered the guest */
+   u64 entry_tb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index a7d54aa203d0..e1f1e5e16449 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -422,12 +422,6 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong spr_val)
(mfmsr()  MSR_HV))
vcpu-arch.hflags |= BOOK3S_HFLAG_DCBZ32;
break;
-   case SPRN_PURR:
-   to_book3s(vcpu)-purr_offset = spr_val - get_tb();
-   break;
-   case SPRN_SPURR:
-   to_book3s(vcpu)-spurr_offset = spr_val - get_tb();
-   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
@@ -523,10 +517,16 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong *spr_val
*spr_val = 0;
break;
case SPRN_PURR:
-   *spr_val = get_tb() + to_book3s(vcpu)-purr_offset;
+   /*
+* On exit we would have updated purr
+*/
+		*spr_val = vcpu->arch.purr;
break;
case SPRN_SPURR:
-   *spr_val = get_tb() + to_book3s(vcpu)-purr_offset;
+   /*
+* On exit we would have updated spurr
+*/
+		*spr_val = vcpu->arch.spurr;
break;
case SPRN_GQR0:
case SPRN_GQR1:
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index fdcbabdfb709..02231f5193c2 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -115,6 +115,11 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu 
*svcpu,
svcpu-lr  = vcpu-arch.lr;
svcpu-pc  = vcpu-arch.pc;
svcpu-in_use = true;
+   /*
+* Now also save the current time base value. We use this
+* to find the guest purr and spurr value.
+*/
+	vcpu->arch.entry_tb = get_tb();
 }
 
 /* Copy data touched by real-mode code from shadow vcpu back to vcpu */
@@ -161,6 +166,11 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
 
 out:
preempt_enable();
+   /*
+* Update purr and spurr using time base
+*/
+	vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
+	vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
 }
 
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 02/10] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-01-28 Thread Aneesh Kumar K.V
The virtual time base register is a per-VM register and needs to be saved
and restored on VM exit and entry. Writing to VTB is not allowed
in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/include/asm/reg.h  |  7 +++
 arch/powerpc/include/asm/time.h | 12 
 arch/powerpc/kvm/book3s_emulate.c   |  3 +++
 arch/powerpc/kvm/book3s_pr.c|  3 +++
 5 files changed, 26 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 0a3785271f34..9ebdd12e50a9 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -508,6 +508,7 @@ struct kvm_vcpu_arch {
 #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e789f76c9bc2..6c649355b1e9 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1161,6 +1161,13 @@
 #define mtspr(rn, v)	asm volatile("mtspr " __stringify(rn) ",%0" : \
				     : "r" ((unsigned long)(v)) \
				     : "memory")
+#ifdef CONFIG_PPC_BOOK3S_64
+#define mfvtb()		({unsigned long rval;				\
+			asm volatile("mfspr %0, %1" :			\
+				     "=r" (rval) : "i" (SPRN_VTB)); rval;})
+#else
+#define mfvtb()		BUG()
+#endif
 
 #ifdef __powerpc64__
 #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index c1f267694acb..1e89dbc665d9 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -101,6 +101,18 @@ static inline u64 get_rtc(void)
 	return (u64)hi * 1000000000 + lo;
 }
 
+#ifdef CONFIG_PPC_BOOK3S_64
+static inline u64 get_vtb(void)
+{
+   return mfvtb();
+}
+#else
+static inline u64 get_vtb(void)
+{
+   return 0;
+}
+#endif
+
 #ifdef CONFIG_PPC64
 static inline u64 get_tb(void)
 {
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index e1f1e5e16449..4b58d8a90cb5 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -528,6 +528,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu-arch.spurr;
break;
+   case SPRN_VTB:
+		*spr_val = vcpu->arch.vtb;
+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 02231f5193c2..b5598e9cdd09 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -120,6 +120,8 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu 
*svcpu,
 * to find the guest purr and spurr value.
 */
vcpu-arch.entry_tb = get_tb();
+	vcpu->arch.entry_vtb = get_vtb();
+
 }
 
 /* Copy data touched by real-mode code from shadow vcpu back to vcpu */
@@ -171,6 +173,7 @@ out:
 */
vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb;
vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb;
+	vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
 }
 
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 04/10] KVM: PPC: BOOK3S: PR: Emulate Thread identification register

2014-01-28 Thread Aneesh Kumar K.V
Since PR KVM doesn't support SMT yet, we always return 0.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index abe6f3057e5b..e74dda36ebea 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -561,6 +561,12 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong *spr_val
case SPRN_DABR:
*spr_val = 0;
break;
+   case SPRN_TIR:
+   /*
+* We don't have SMT support for PR yet, hence always return 0
+*/
+   *spr_val = 0;
+   break;
default:
 unprivileged:
printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn);
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 05/10] KVM: PPC: BOOK3S: PR: Doorbell support

2014-01-28 Thread Aneesh Kumar K.V
We don't have SMT support yet, hence we should not see a doorbell
message being generated.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index e74dda36ebea..9cf0392e3dcf 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
 #define OP_19_XOP_RFI  50
 
 #define OP_31_XOP_MFMSR83
+#define OP_31_XOP_MSGSNDP  142
 #define OP_31_XOP_MTMSR146
+#define OP_31_XOP_MSGCLRP  174
 #define OP_31_XOP_MTMSRD   178
 #define OP_31_XOP_MTSR 210
 #define OP_31_XOP_MTSRIN   242
@@ -286,6 +288,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 
break;
}
+   case OP_31_XOP_MSGSNDP:
+   {
+   /*
+		 * PR KVM still doesn't support SMT mode, so we should
+		 * not see a MSGSNDP/MSGCLRP used with PR KVM.
+		 */
+		pr_info("KVM: MSGSNDP used in non SMT case\n");
+   emulated = EMULATE_FAIL;
+   break;
+   }
+   case OP_31_XOP_MSGCLRP:
+   {
+		pr_info("KVM: MSGCLRP used in non SMT case\n");
+   emulated = EMULATE_FAIL;
+   break;
+   }
default:
emulated = EMULATE_FAIL;
}
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 06/10] KVM: PPC: BOOK3S: PR: Emulate DPDES register

2014-01-28 Thread Aneesh Kumar K.V
Since we don't support SMT yet, we should always find zero in the
Directed Privileged Doorbell Exception State (DPDES) register.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 9cf0392e3dcf..7f25adbd2590 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -585,6 +585,12 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong *spr_val
 */
*spr_val = 0;
break;
+   case SPRN_DPDES:
+   /*
+* We don't have SMT support for PR yet, hence always return 0
+*/
+   *spr_val = 0;
+   break;
default:
 unprivileged:
printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn);
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 08/10] KVM: PPC: BOOK3S: PR: Add support for facility unavailable interrupt

2014-01-28 Thread Aneesh Kumar K.V
At this point we allow all the supported facilities except EBB. So
forward the interrupt to the guest as an illegal instruction.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_asm.h |  4 +++-
 arch/powerpc/kvm/book3s.c  |  4 
 arch/powerpc/kvm/book3s_emulate.c  | 18 ++
 arch/powerpc/kvm/book3s_pr.c   | 17 +
 4 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_asm.h 
b/arch/powerpc/include/asm/kvm_asm.h
index 1bd92fd43cfb..799244face51 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -99,6 +99,7 @@
 #define BOOK3S_INTERRUPT_PERFMON   0xf00
 #define BOOK3S_INTERRUPT_ALTIVEC   0xf20
 #define BOOK3S_INTERRUPT_VSX   0xf40
+#define BOOK3S_INTERRUPT_FAC_UNAVAIL	0xf60
 
 #define BOOK3S_IRQPRIO_SYSTEM_RESET0
 #define BOOK3S_IRQPRIO_DATA_SEGMENT1
@@ -117,7 +118,8 @@
 #define BOOK3S_IRQPRIO_DECREMENTER 14
 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15
 #define BOOK3S_IRQPRIO_EXTERNAL_LEVEL  16
-#define BOOK3S_IRQPRIO_MAX 17
+#define BOOK3S_IRQPRIO_FAC_UNAVAIL 17
+#define BOOK3S_IRQPRIO_MAX 18
 
 #define BOOK3S_HFLAG_DCBZ320x1
 #define BOOK3S_HFLAG_SLB   0x2
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 8912608b7e1b..a9aea28c2677 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -143,6 +143,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;break;
case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;  break;
case 0xf40: prio = BOOK3S_IRQPRIO_VSX;  break;
+   case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL;  break;
default:prio = BOOK3S_IRQPRIO_MAX;  break;
}
 
@@ -273,6 +274,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, 
unsigned int priority)
case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:
vec = BOOK3S_INTERRUPT_PERFMON;
break;
+   case BOOK3S_IRQPRIO_FAC_UNAVAIL:
+   vec = BOOK3S_INTERRUPT_FAC_UNAVAIL;
+   break;
default:
deliver = 0;
printk(KERN_ERR KVM: Unknown interrupt: 0x%x\n, priority);
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 60d0b6b745e7..bf6b11021250 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -481,6 +481,15 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong spr_val)
 		vcpu->arch.shadow_fscr = vcpu->arch.fscr & host_fscr;
 		break;
 	}
+	case SPRN_EBBHR:
+		vcpu->arch.ebbhr = spr_val;
+		break;
+	case SPRN_EBBRR:
+		vcpu->arch.ebbrr = spr_val;
+		break;
+	case SPRN_BESCR:
+		vcpu->arch.bescr = spr_val;
+		break;
 unprivileged:
default:
printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn);
@@ -607,6 +616,15 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong *spr_val
case SPRN_FSCR:
 		*spr_val = vcpu->arch.fscr;
 		break;
+	case SPRN_EBBHR:
+		*spr_val = vcpu->arch.ebbhr;
+		break;
+	case SPRN_EBBRR:
+		*spr_val = vcpu->arch.ebbrr;
+		break;
+	case SPRN_BESCR:
+		*spr_val = vcpu->arch.bescr;
+		break;
default:
 unprivileged:
printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn);
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 51d469f8c9fd..828056ec208f 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -900,6 +900,23 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_PERFMON:
r = RESUME_GUEST;
break;
+   case BOOK3S_INTERRUPT_FAC_UNAVAIL:
+   {
+   /*
+* Check for the facility that need to be emulated
+*/
+		ulong fscr_ic = vcpu->arch.shadow_fscr >> 56;
+   if (fscr_ic != FSCR_EBB_LG) {
+   /*
+* We only disable EBB facility.
+* So only emulate that.
+*/
+   kvmppc_book3s_queue_irqprio(vcpu, exit_nr);
+   r = RESUME_GUEST;
+   break;
+   }
+   /* Fall through */
+   }
case BOOK3S_INTERRUPT_PROGRAM:
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
{
-- 
1.8.5.3

___
Linuxppc-dev mailing

[RFC PATCH 07/10] KVM: PPC: BOOK3S: PR: Emulate facility status and control register

2014-01-28 Thread Aneesh Kumar K.V
We allow a privileged-mode update of this. The guest value is saved in fscr,
and the value actually used is saved in shadow_fscr. shadow_fscr
only contains values that are allowed by the host. On a
facility unavailable interrupt, if the facility is allowed by fscr
but disabled in shadow_fscr, we need to emulate the support. Currently
only EBB is disabled. We still don't support performance monitoring
in a PR guest.
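
A condensed, illustrative-only view of the FSCR handling below (the helper
name is hypothetical, not the literal kernel code):

	static void pr_set_guest_fscr(struct kvm_vcpu *vcpu, ulong spr_val)
	{
		/* keep the guest's view of FSCR ... */
		vcpu->arch.fscr = spr_val & (FSCR_TAR | FSCR_DSCR);
		/* ... but only ever run the guest with bits the host also allows */
		vcpu->arch.shadow_fscr = vcpu->arch.fscr & mfspr(SPRN_FSCR);
		/* shadow_fscr is what book3s_interrupts.S loads into FSCR on guest entry */
	}

On a facility unavailable exit the interruption cause sits in the top byte of
FSCR (shadow_fscr >> 56); only EBB, the one facility we hide from the guest,
needs to be emulated, everything else is reflected straight back to the guest
(see the exit handler change in the following patch).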

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_book3s_asm.h |  1 +
 arch/powerpc/include/asm/kvm_host.h   |  1 +
 arch/powerpc/kernel/asm-offsets.c |  2 ++
 arch/powerpc/kvm/book3s_emulate.c | 16 
 arch/powerpc/kvm/book3s_interrupts.S  | 25 ++---
 5 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h 
b/arch/powerpc/include/asm/kvm_book3s_asm.h
index 192917d2239c..abd42523ad93 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -103,6 +103,7 @@ struct kvmppc_host_state {
 #ifdef CONFIG_PPC_BOOK3S_64
u64 cfar;
u64 ppr;
+   u64 host_fscr;
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index e0b13aca98e6..f4be7be14330 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -478,6 +478,7 @@ struct kvm_vcpu_arch {
ulong ppr;
ulong pspb;
ulong fscr;
+   ulong shadow_fscr;
ulong tfhar;
ulong tfiar;
ulong texasr;
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 2c2227da6917..7484676b8f25 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -525,6 +525,7 @@ int main(void)
DEFINE(VCPU_CFAR, offsetof(struct kvm_vcpu, arch.cfar));
DEFINE(VCPU_PPR, offsetof(struct kvm_vcpu, arch.ppr));
DEFINE(VCPU_FSCR, offsetof(struct kvm_vcpu, arch.fscr));
+   DEFINE(VCPU_SHADOW_FSCR, offsetof(struct kvm_vcpu, arch.shadow_fscr));
DEFINE(VCPU_PSPB, offsetof(struct kvm_vcpu, arch.pspb));
DEFINE(VCPU_TFHAR, offsetof(struct kvm_vcpu, arch.tfhar));
DEFINE(VCPU_TFIAR, offsetof(struct kvm_vcpu, arch.tfiar));
@@ -626,6 +627,7 @@ int main(void)
 #ifdef CONFIG_PPC_BOOK3S_64
HSTATE_FIELD(HSTATE_CFAR, cfar);
HSTATE_FIELD(HSTATE_PPR, ppr);
+   HSTATE_FIELD(HSTATE_FSCR, host_fscr);
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 #else /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 7f25adbd2590..60d0b6b745e7 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -468,6 +468,19 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong spr_val)
case SPRN_MSSSR0:
case SPRN_DABR:
break;
+   case SPRN_FSCR:
+   {
+   ulong host_fscr = mfspr(SPRN_FSCR);
+   /*
+* We disable FSCR_EBB for pr guest. TAR and DSCR are always
+* enabled.
+*/
+		if (spr_val & ~(FSCR_TAR|FSCR_DSCR|FSCR_EBB))
+			pr_info("KVM: invalid FSCR value 0x%lx", spr_val);
+		vcpu->arch.fscr = spr_val & (FSCR_TAR|FSCR_DSCR);
+		vcpu->arch.shadow_fscr = vcpu->arch.fscr & host_fscr;
+   break;
+   }
 unprivileged:
default:
printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn);
@@ -591,6 +604,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = 0;
break;
+   case SPRN_FSCR:
+		*spr_val = vcpu->arch.fscr;
+   break;
default:
 unprivileged:
printk(KERN_INFO KVM: invalid SPR read: %d\n, sprn);
diff --git a/arch/powerpc/kvm/book3s_interrupts.S 
b/arch/powerpc/kvm/book3s_interrupts.S
index f779450cb07c..fcbdf4817301 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -107,6 +107,14 @@ kvm_start_lightweight:
ld  r3, VCPU_SHARED(r4)
ld  r3, VCPU_SHARED_SPRG3(r3)
mtspr   SPRN_SPRG3, r3
+
+BEGIN_FTR_SECTION
+   mfspr r3,SPRN_FSCR
+   PPC_STL r3, HSTATE_FSCR(r13)
+
+   PPC_LL r3, VCPU_SHADOW_FSCR(r4)
+   mtspr SPRN_FSCR, r3
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
PPC_LL  r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */
@@ -148,6 +156,9 @@ kvm_start_lightweight:
bl  FUNC(kvmppc_copy_from_svcpu)
nop
 
+   /* R7 = vcpu */
+   PPC_LL  r7, GPR4(r1)
+
 #ifdef CONFIG_PPC_BOOK3S_64
/*
 * Reload kernel SPRG3 value.
@@ -155,10 +166,18 @@ kvm_start_lightweight:
 */
ld  r3, PACA_SPRG3(r13)
mtspr   SPRN_SPRG3, r3
-#endif

[RFC PATCH 10/10] PPC: BOOK3S: Disable/Enable TM looking at the ibm, pa-features device tree entry

2014-01-28 Thread Aneesh Kumar K.V
Disable the transactional memory feature at runtime by looking at the
ibm,pa-features device tree entry. We need to do this so that we can run a
kernel built with TM config in PR mode. For a PR guest we provide a device
tree entry with the TM feature disabled in ibm,pa-features.
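
My reading of the table entry added below, with the field names of
struct ibm_pa_feature spelled out (annotation only; the patch itself uses
positional initializers):

	{ .cpu_features  = CPU_FTR_TM_COMP,	/* kernel CPU feature being gated      */
	  .mmu_features  = 0,
	  .cpu_user_ftrs = 0,
	  .pabyte        = 22,			/* byte 22 of the ibm,pa-features blob */
	  .pabit         = 0,			/* bit 0 (big-endian) of that byte     */
	  .invert        = 0 },			/* bit clear => feature is cleared     */

so a guest device tree whose ibm,pa-features property has that bit clear makes
scan_features() drop CPU_FTR_TM_COMP, and TM stays off in the PR guest.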

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kernel/prom.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index fa0ad8aafbcc..de8c2caf1024 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -160,6 +160,11 @@ static struct ibm_pa_feature {
{CPU_FTR_NODSISRALIGN, 0, 0,1, 1, 1},
{0, MMU_FTR_CI_LARGE_PAGE, 0,   1, 2, 0},
{CPU_FTR_REAL_LE, PPC_FEATURE_TRUE_LE, 5, 0, 0},
+   /*
+* We should use CPU_FTR_TM_COMP so that if we disable TM, it won't get
+* enabled via device tree
+*/
+   {CPU_FTR_TM_COMP, 0, 0, 22, 0, 0},
 };
 
 static void __init scan_features(unsigned long node, unsigned char *ftrs,
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 09/10] KVM: PPC: BOOK3S: PR: Ignore write to monitor mode control register

2014-01-28 Thread Aneesh Kumar K.V
We ignore writes to these registers for now.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index bf6b11021250..c0aee34ef04f 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -490,6 +490,16 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
int sprn, ulong spr_val)
case SPRN_BESCR:
vcpu-arch.bescr = spr_val;
break;
+   case SPRN_MMCRS:
+   break;
+   case SPRN_MMCRA:
+   break;
+   case SPRN_MMCR0:
+   break;
+   case SPRN_MMCR1:
+   break;
+   case SPRN_MMCR2:
+   break;
 unprivileged:
default:
printk(KERN_INFO KVM: invalid SPR write: %d\n, sprn);
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[RFC PATCH 03/10] KVM: PPC: BOOK3S: PR: Emulate instruction counter

2014-01-28 Thread Aneesh Kumar K.V
Writing to IC is not allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_host.h | 1 +
 arch/powerpc/kvm/book3s_emulate.c   | 3 +++
 arch/powerpc/kvm/book3s_pr.c| 2 ++
 3 files changed, 6 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 9ebdd12e50a9..e0b13aca98e6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -509,6 +509,7 @@ struct kvm_vcpu_arch {
/* Time base value when we entered the guest */
u64 entry_tb;
u64 entry_vtb;
+   u64 entry_ic;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 4b58d8a90cb5..abe6f3057e5b 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -531,6 +531,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
case SPRN_VTB:
*spr_val = vcpu-arch.vtb;
break;
+   case SPRN_IC:
+		*spr_val = vcpu->arch.ic;
+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index b5598e9cdd09..51d469f8c9fd 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -121,6 +121,7 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu 
*svcpu,
 */
vcpu-arch.entry_tb = get_tb();
vcpu-arch.entry_vtb = get_vtb();
+	vcpu->arch.entry_ic = mfspr(SPRN_IC);
 
 }
 
@@ -174,6 +175,7 @@ out:
vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb;
vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb;
vcpu-arch.vtb += get_vtb() - vcpu-arch.entry_vtb;
+	vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
 }
 
 static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu)
-- 
1.8.5.3

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] powerpc: don't re-issue spinlock typedef that breaks older gcc

2014-01-28 Thread Aneesh Kumar K.V
Paul Gortmaker paul.gortma...@windriver.com writes:

 Commit b3084f4db3aeb991c507ca774337c7e7893ed04f (powerpc/thp: Fix
 crash on mremap) added a typedef struct spinlock spinlock_t;
 which on gcc 4.5.2 (and possibly other versions) causes many of:

 include/linux/spinlock_types.h:76:3: error: redefinition of typedef 
 'spinlock_t'
 arch/powerpc/include/asm/pgtable-ppc64.h:563:25: note: previous declaration 
 of 'spinlock_t' was here
 In file included from include/linux/mutex.h:15:0,
  from include/linux/notifier.h:13,
  from include/linux/pm_qos.h:8,
  from include/linux/netdevice.h:28,
  from drivers/net/wireless/ath/wil6210/wil6210.h:20,
  from drivers/net/wireless/ath/wil6210/debug.c:17:

 It appears that somewhere between gcc 4.5.2 and 4.6.3 this
 redefinition restriction was lifted.  Using the proper header
 from within !ASSEMBLY seems to fix it up in an acceptable way.

 Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 Cc: Kirill A. Shutemov kirill.shute...@linux.intel.com
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com
 ---



http://mid.gmane.org/1389939036.3000.7.ca...@thinkpad-t5421.cn.ibm.com

This was posted earlier.



 [ Note that b3084f4db3 isn't mainline yet, it is currently in
   benh/powerpc.git #merge -- but is headed there soon via:
  https://lkml.org/lkml/2014/1/27/599   ]

  arch/powerpc/include/asm/pgtable-ppc64.h | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

 diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
 b/arch/powerpc/include/asm/pgtable-ppc64.h
 index d27960c89a71..3b638411646a 100644
 --- a/arch/powerpc/include/asm/pgtable-ppc64.h
 +++ b/arch/powerpc/include/asm/pgtable-ppc64.h
 @@ -111,6 +111,8 @@

  #ifndef __ASSEMBLY__

 +#include <linux/spinlock_types.h>
 +
  /*
   * This is the default implementation of various PTE accessors, it's
   * used in all cases except Book3S with 64K pages where we have a
 @@ -560,7 +562,6 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, 
 unsigned long address,
   pmd_t *pmdp);

  #define pmd_move_must_withdraw pmd_move_must_withdraw
 -typedef struct spinlock spinlock_t;
  static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
spinlock_t *old_pmd_ptl)
  {
 -- 
 1.8.5.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH] powerpc: don't re-issue spinlock typedef that breaks older gcc

2014-01-28 Thread Aneesh Kumar K.V
Paul Gortmaker paul.gortma...@windriver.com writes:

 On 14-01-28 12:28 PM, Aneesh Kumar K.V wrote:
 Paul Gortmaker paul.gortma...@windriver.com writes:
 
 Commit b3084f4db3aeb991c507ca774337c7e7893ed04f (powerpc/thp: Fix
 crash on mremap) added a typedef struct spinlock spinlock_t;
 which on gcc 4.5.2 (and possibly other versions) causes many of:

 include/linux/spinlock_types.h:76:3: error: redefinition of typedef 
 'spinlock_t'
 arch/powerpc/include/asm/pgtable-ppc64.h:563:25: note: previous declaration 
 of 'spinlock_t' was here
 In file included from include/linux/mutex.h:15:0,
  from include/linux/notifier.h:13,
  from include/linux/pm_qos.h:8,
  from include/linux/netdevice.h:28,
  from drivers/net/wireless/ath/wil6210/wil6210.h:20,
  from drivers/net/wireless/ath/wil6210/debug.c:17:

 It appears that somewhere between gcc 4.5.2 and 4.6.3 this
 redefinition restriction was lifted.  Using the proper header
 from within !ASSEMBLY seems to fix it up in an acceptable way.

 Cc: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 Cc: Kirill A. Shutemov kirill.shute...@linux.intel.com
 Cc: Benjamin Herrenschmidt b...@kernel.crashing.org
 Signed-off-by: Paul Gortmaker paul.gortma...@windriver.com
 ---

 
 
 http://mid.gmane.org/1389939036.3000.7.ca...@thinkpad-t5421.cn.ibm.com
 
 This was posted earlier.

 I see.  Well I guess Ben didn't use it since it is the same as the
 temporary not-signed-off-by hack patch I posted earlier as well.

 https://lkml.org/lkml/2014/1/27/584

 I believe what I've posted here below to be the proper fix.

I had another variant which needed this

http://mid.gmane.org/1388999012-14424-1-git-send-email-aneesh.ku...@linux.vnet.ibm.com

BTW I had added the above "struct spinlock;" patch as the backport to the
stable 3.13 series. So if we are picking another one, we may need to
update stable also.

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/2] Fix compile error of pgtable-ppc64.h

2014-01-30 Thread Aneesh Kumar K.V
Greg KH g...@kroah.com writes:

 On Thu, Jan 30, 2014 at 09:57:36AM +1100, Benjamin Herrenschmidt wrote:
 On Wed, 2014-01-29 at 10:45 -0800, Greg KH wrote:
  On Tue, Jan 28, 2014 at 05:52:42PM +0530, Aneesh Kumar K.V wrote:
   From: Li Zhong zh...@linux.vnet.ibm.com
   
   It seems that forward declaration couldn't work well with typedef, use
   struct spinlock directly to avoiding following build errors:
   
   In file included from include/linux/spinlock.h:81,
from include/linux/seqlock.h:35,
from include/linux/time.h:5,
from include/uapi/linux/timex.h:56,
from include/linux/timex.h:56,
from include/linux/sched.h:17,
from arch/powerpc/kernel/asm-offsets.c:17:
   include/linux/spinlock_types.h:76: error: redefinition of typedef 
   'spinlock_t'
   /root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: 
   previous declaration of 'spinlock_t' was here
   
   build fix for upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
   for 3.13 stable series
  
  I don't understand, why is this needed?  Is there a corrisponding patch
  upstream that already does this?  What went wrong with a normal
  backport of the patch to 3.13?
 
 There's a corresponding patch in powerpc-next that I'm about to send to
 Linus today, but for the backport, the fix could be folded into the
 original offending patch.

 Oh come on, you know better than to try to send me a patch that isn't in
 Linus's tree already.  Crap, I can't take that at all.

 Send me the git commit id when it is in Linus's tree, otherwise I'm not
 taking it.

 And no, don't fold in anything, that's not ok either.  I'll just go
 drop this patch entirely from all of my -stable trees for now.  Feel
 free to resend them when all of the needed stuff is upstream.

The fix for the mremap crash is already in Linus's tree. It is the build
failure fix for older gcc compiler versions that is not in Linus's tree. We
missed that in the first pull request. Do we really need to drop the
patch from the 3.11 and 3.12 trees? The patch there is a variant, and doesn't
require this build fix.

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [PATCH 2/2] Fix compile error of pgtable-ppc64.h

2014-01-30 Thread Aneesh Kumar K.V
Greg KH g...@kroah.com writes:

 On Thu, Jan 30, 2014 at 11:08:52PM +0530, Aneesh Kumar K.V wrote:
 Greg KH g...@kroah.com writes:
 
  On Thu, Jan 30, 2014 at 09:57:36AM +1100, Benjamin Herrenschmidt wrote:
  On Wed, 2014-01-29 at 10:45 -0800, Greg KH wrote:
   On Tue, Jan 28, 2014 at 05:52:42PM +0530, Aneesh Kumar K.V wrote:
From: Li Zhong zh...@linux.vnet.ibm.com

It seems that forward declaration couldn't work well with typedef, use
struct spinlock directly to avoiding following build errors:

In file included from include/linux/spinlock.h:81,
 from include/linux/seqlock.h:35,
 from include/linux/time.h:5,
 from include/uapi/linux/timex.h:56,
 from include/linux/timex.h:56,
 from include/linux/sched.h:17,
 from arch/powerpc/kernel/asm-offsets.c:17:
include/linux/spinlock_types.h:76: error: redefinition of typedef 
'spinlock_t'
/root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: 
previous declaration of 'spinlock_t' was here

build fix for upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.13 stable series
   
   I don't understand, why is this needed?  Is there a corrisponding patch
   upstream that already does this?  What went wrong with a normal
   backport of the patch to 3.13?
  
  There's a corresponding patch in powerpc-next that I'm about to send to
  Linus today, but for the backport, the fix could be folded into the
  original offending patch.
 
  Oh come on, you know better than to try to send me a patch that isn't in
  Linus's tree already.  Crap, I can't take that at all.
 
  Send me the git commit id when it is in Linus's tree, otherwise I'm not
  taking it.
 
  And no, don't fold in anything, that's not ok either.  I'll just go
  drop this patch entirely from all of my -stable trees for now.  Feel
  free to resend them when all of the needed stuff is upstream.
 
 The fix for mremap crash is already in Linus tree.

 What is the git commit id?

upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f

That is patch 1 in this series.



 It is the build failure for older gcc compiler version that is not in
 linus tree.

 That is what I can not take.

 We missed that in the first pull request. Do we really need to drop
 the patch from 3.11 and 3.12 trees ?

 I already did.

 The patch there is a variant, and doesn't require this build fix.

 Don't give me a variant, give me the exact same patch, only changed to
 handle the fuzz/differences of older kernels, don't make different
 changes to the original patch to make up for things you found out later
 on, otherwise everyone is confused as to why the fix for the fix is not
 in the tree.

In this specific case it may be difficult. 3.13 has other changes
around the code path. It has split pmd locks etc., which result in us
doing a withdraw and deposit even on x86. For 3.11 and 3.12, we need to
do that extra withdraw and deposit only for ppc64. Hence the variant
which used #ifdef around that code.


 So, when both patches get in Linus's tree, please send me the properly
 backported patches and I'll be glad to apply them.


-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH V2 1/2] powerpc/thp: Fix crash on mremap

2014-01-31 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch fixes the crash below:

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.13 stable series

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
Acked-by: Kirill A. Shutemov kirill.shute...@linux.intel.com
Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 14 ++
 include/asm-generic/pgtable.h| 12 
 mm/huge_memory.c | 14 +-
 3 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index 4a191c472867..d27960c89a71 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -558,5 +558,19 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct 
mm_struct *mm, pmd_t *pmdp);
 #define __HAVE_ARCH_PMDP_INVALIDATE
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp);
+
+#define pmd_move_must_withdraw pmd_move_must_withdraw
+typedef struct spinlock spinlock_t;
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   return true;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index db0923458940..8e4f41d9af4d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -558,6 +558,18 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl)
+{
+   /*
+* With split pmd lock we also need to move preallocated
+* PTE page table if new_pmd is on different PMD page table.
+*/
+   return new_pmd_ptl != old_pmd_ptl;
+}
+#endif
+
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 95d1acb0f3d2..5d80c53b87cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1502,19 +1502,15 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
-   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
-   if (new_ptl != old_ptl) {
-   pgtable_t pgtable;
 
-   /*
-* Move preallocated PTE page table if new_pmd is on
-* different PMD page table.
-*/
+   if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+   pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-
-   spin_unlock(new_ptl);
}
+   set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
+   if (new_ptl != old_ptl)
+   spin_unlock(new_ptl);
spin_unlock(old_ptl);
}
 out:
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


[PATCH V2 2/2] powerpc/mm: Fix compile error of pgtable-ppc64.h

2014-01-31 Thread Aneesh Kumar K.V
From: Li Zhong zh...@linux.vnet.ibm.com

It seems that a forward declaration doesn't work well with a typedef; use
struct spinlock directly to avoid the following build errors:

In file included from include/linux/spinlock.h:81,
 from include/linux/seqlock.h:35,
 from include/linux/time.h:5,
 from include/uapi/linux/timex.h:56,
 from include/linux/timex.h:56,
 from include/linux/sched.h:17,
 from arch/powerpc/kernel/asm-offsets.c:17:
include/linux/spinlock_types.h:76: error: redefinition of typedef 'spinlock_t'
/root/linux-next/arch/powerpc/include/asm/pgtable-ppc64.h:563: note: previous 
declaration of 'spinlock_t' was here

upstream sha1:fd120dc2e205d2318a8b47d6d8098b789e3af67d
for 3.13 stable series

Signed-off-by: Li Zhong zh...@linux.vnet.ibm.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
Signed-off-by: Benjamin Herrenschmidt b...@kernel.crashing.org
---
 arch/powerpc/include/asm/pgtable-ppc64.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index d27960c89a71..bc141c950b1e 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -560,9 +560,9 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, 
unsigned long address,
pmd_t *pmdp);
 
 #define pmd_move_must_withdraw pmd_move_must_withdraw
-typedef struct spinlock spinlock_t;
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-spinlock_t *old_pmd_ptl)
+struct spinlock;
+static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
+struct spinlock *old_pmd_ptl)
 {
/*
 * Archs like ppc64 use pgtable to store per pmd
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 01/10] KVM: PPC: BOOK3S: PR: Fix PURR and SPURR emulation

2014-01-31 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 01/28/2014 05:44 PM, Aneesh Kumar K.V wrote:
 We definitely don't need to emulate mtspr, because both the registers
 are hypervisor resource.

 This patch description doesn't cover what the patch actually does. It 
 changes the implementation from always tell the guest it uses 100% to 
 give the guest an accurate amount of cpu time spent inside guest
 context.

Will fix that


 Also, I think we either go with full hyp semantics which means we also 
 emulate the offset or we go with no hyp awareness in the guest at all 
 which means we also don't emulate SPURR which is a hyp privileged
 register.

Can you clarify this ?


 Otherwise I like the patch :).


-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 02/10] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-01-31 Thread Aneesh Kumar K.V
Paul Mackerras pau...@samba.org writes:

 On Tue, Jan 28, 2014 at 10:14:07PM +0530, Aneesh Kumar K.V wrote:
 virtual time base register is a per vm register and need to saved
 and restored on vm exit and entry. Writing to VTB is not allowed
 in the privileged mode.
 ...

 +#ifdef CONFIG_PPC_BOOK3S_64
 +#define mfvtb()	({unsigned long rval;				\
 +		asm volatile("mfspr %0, %1" :				\
 +			     "=r" (rval) : "i" (SPRN_VTB)); rval;})

 The mfspr will be a no-op on anything before POWER8, meaning the
 result will be whatever value was in the destination GPR before the
 mfspr.  I suppose that may not matter if the result is only ever used
 when we're running on a POWER8 host, but I would feel more comfortable
 if we had explicit feature tests to make sure of that, rather than
 possibly doing computations with unpredictable values.

 With your patch, a guest on a POWER7 or a PPC970 could do a read from
 VTB and get garbage -- first, there is nothing to stop userspace from
 requesting POWER8 emulation on an older machine, and secondly, even if
 the virtual machine is a PPC970 (say) you don't implement
 unimplemented SPR semantics for VTB (no-op if PR=0, illegal
 instruction interrupt if PR=1).

Ok, that means we need to do something like this?

struct cpu_spec *s = find_cpuspec(vcpu->arch.pvr);
if (s->cpu_features & CPU_FTR_ARCH_207S) {

}



 On the whole I think it is reasonable to reject an attempt to set the
 virtual PVR to a POWER8 PVR value if we are not running on a POWER8
 host, because emulating all the new POWER8 features in software
 (particularly transactional memory) would not be feasible.  Alex may
 disagree. :)

That would make it much simpler.

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev


Re: [RFC PATCH 03/10] KVM: PPC: BOOK3S: PR: Emulate instruction counter

2014-01-31 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 01/28/2014 05:44 PM, Aneesh Kumar K.V wrote:
 Writing to IC is not allowed in the privileged mode.

 This is not a patch description.


 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
   arch/powerpc/include/asm/kvm_host.h | 1 +
   arch/powerpc/kvm/book3s_emulate.c   | 3 +++
   arch/powerpc/kvm/book3s_pr.c| 2 ++
   3 files changed, 6 insertions(+)

 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index 9ebdd12e50a9..e0b13aca98e6 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -509,6 +509,7 @@ struct kvm_vcpu_arch {
  /* Time base value when we entered the guest */
  u64 entry_tb;
  u64 entry_vtb;
 +u64 entry_ic;
  u32 tcr;
  ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
  u32 ivor[64];
 diff --git a/arch/powerpc/kvm/book3s_emulate.c 
 b/arch/powerpc/kvm/book3s_emulate.c
 index 4b58d8a90cb5..abe6f3057e5b 100644
 --- a/arch/powerpc/kvm/book3s_emulate.c
 +++ b/arch/powerpc/kvm/book3s_emulate.c
 @@ -531,6 +531,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
 int sprn, ulong *spr_val
  case SPRN_VTB:
   *spr_val = vcpu->arch.vtb;
  break;
 +case SPRN_IC:
  +*spr_val = vcpu->arch.ic;
 +break;
  case SPRN_GQR0:
  case SPRN_GQR1:
  case SPRN_GQR2:
 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
 index b5598e9cdd09..51d469f8c9fd 100644
 --- a/arch/powerpc/kvm/book3s_pr.c
 +++ b/arch/powerpc/kvm/book3s_pr.c
 @@ -121,6 +121,7 @@ void kvmppc_copy_to_svcpu(struct 
 kvmppc_book3s_shadow_vcpu *svcpu,
   */
   vcpu->arch.entry_tb = get_tb();
   vcpu->arch.entry_vtb = get_vtb();
  +vcpu->arch.entry_ic = mfspr(SPRN_IC);

 Is this implemented on all systems?

   
   }
   
 @@ -174,6 +175,7 @@ out:
   vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
   vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
   vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
  +vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;

 This is getting quite convoluted. How about we act slightly more fuzzy 
 and put all of this into vcpu_load/put?


I am not sure whether vcpu_load/put is too early/too late to save this
context?
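
If I read the suggestion right, it would look roughly like the below
(untested sketch; the exact PR load/put hook names and whatever else already
lives in them are assumptions on my part):

	static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu)
	{
		/* ... existing load work ... */
		/* snapshot the counters when the vcpu is scheduled in */
		vcpu->arch.entry_tb = get_tb();
		vcpu->arch.entry_vtb = get_vtb();
		vcpu->arch.entry_ic = mfspr(SPRN_IC);
	}

	static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu)
	{
		/* accumulate guest PURR/SPURR/VTB/IC when scheduled out */
		vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
		vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
		vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
		vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
		/* ... existing put work ... */
	}

My worry is that load/put happen on every schedule in/out of the vcpu thread,
not just around actual guest entry/exit, so the guest might also get charged
for time spent in the host doing emulation.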

-aneesh



Re: [RFC PATCH 07/10] KVM: PPC: BOOK3S: PR: Emulate facility status and control register

2014-01-31 Thread Aneesh Kumar K.V
Paul Mackerras pau...@samba.org writes:

 On Tue, Jan 28, 2014 at 10:14:12PM +0530, Aneesh Kumar K.V wrote:
 We allow priv-mode update of this. The guest value is saved in fscr,
 and the value actually used is saved in shadow_fscr. shadow_fscr
 only contains values that are allowed by the host. On
 facility unavailable interrupt, if the facility is allowed by fscr
 but disabled in shadow_fscr we need to emulate the support. Currently
 all but EBB is disabled. We still don't support performance monitoring
 in PR guest.

 ...

 +/*
 + * Save the current fscr in shadow fscr
 + */
 +mfspr r3,SPRN_FSCR
 +PPC_STL r3, VCPU_SHADOW_FSCR(r7)

 I don't think you need to do this.  What could possibly have changed
 FSCR since we loaded it on the way into the guest?

The reason for the facility unavailable interrupt is encoded in FSCR, right?

-aneesh



Re: [RFC PATCH 08/10] KVM: PPC: BOOK3S: PR: Add support for facility unavailable interrupt

2014-01-31 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 01/28/2014 05:44 PM, Aneesh Kumar K.V wrote:
 At this point we allow all the supported facilities except EBB. So
 forward the interrupt to guest as illegal instruction.

 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
   arch/powerpc/include/asm/kvm_asm.h |  4 +++-
   arch/powerpc/kvm/book3s.c  |  4 
   arch/powerpc/kvm/book3s_emulate.c  | 18 ++
   arch/powerpc/kvm/book3s_pr.c   | 17 +
   4 files changed, 42 insertions(+), 1 deletion(-)

 diff --git a/arch/powerpc/include/asm/kvm_asm.h 
 b/arch/powerpc/include/asm/kvm_asm.h
 index 1bd92fd43cfb..799244face51 100644
 --- a/arch/powerpc/include/asm/kvm_asm.h
 +++ b/arch/powerpc/include/asm/kvm_asm.h
 @@ -99,6 +99,7 @@
   #define BOOK3S_INTERRUPT_PERFMON   0xf00
   #define BOOK3S_INTERRUPT_ALTIVEC   0xf20
   #define BOOK3S_INTERRUPT_VSX   0xf40
  +#define BOOK3S_INTERRUPT_FAC_UNAVAIL   0xf60
   
   #define BOOK3S_IRQPRIO_SYSTEM_RESET0
   #define BOOK3S_IRQPRIO_DATA_SEGMENT1
 @@ -117,7 +118,8 @@
   #define BOOK3S_IRQPRIO_DECREMENTER 14
   #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 15
   #define BOOK3S_IRQPRIO_EXTERNAL_LEVEL  16
 -#define BOOK3S_IRQPRIO_MAX  17
 +#define BOOK3S_IRQPRIO_FAC_UNAVAIL  17
 +#define BOOK3S_IRQPRIO_MAX  18
   
    #define BOOK3S_HFLAG_DCBZ32    0x1
   #define BOOK3S_HFLAG_SLB   0x2
 diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
 index 8912608b7e1b..a9aea28c2677 100644
 --- a/arch/powerpc/kvm/book3s.c
 +++ b/arch/powerpc/kvm/book3s.c
 @@ -143,6 +143,7 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
   case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG;   break;
  case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC;  break;
  case 0xf40: prio = BOOK3S_IRQPRIO_VSX;  break;
 +case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL;  break;
  default:prio = BOOK3S_IRQPRIO_MAX;  break;
  }
   
 @@ -273,6 +274,9 @@ int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, 
 unsigned int priority)
  case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR:
  vec = BOOK3S_INTERRUPT_PERFMON;
  break;
 +case BOOK3S_IRQPRIO_FAC_UNAVAIL:
 +vec = BOOK3S_INTERRUPT_FAC_UNAVAIL;
 +break;
  default:
  deliver = 0;
   printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority);
 diff --git a/arch/powerpc/kvm/book3s_emulate.c 
 b/arch/powerpc/kvm/book3s_emulate.c
 index 60d0b6b745e7..bf6b11021250 100644
 --- a/arch/powerpc/kvm/book3s_emulate.c
 +++ b/arch/powerpc/kvm/book3s_emulate.c
 @@ -481,6 +481,15 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, 
 int sprn, ulong spr_val)
   vcpu->arch.shadow_fscr = vcpu->arch.fscr & host_fscr;
  break;
  }
  +case SPRN_EBBHR:
  +vcpu->arch.ebbhr = spr_val;
  +break;
  +case SPRN_EBBRR:
  +vcpu->arch.ebbrr = spr_val;
  +break;
  +case SPRN_BESCR:
  +vcpu->arch.bescr = spr_val;
  +break;
   unprivileged:
  default:
   printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn);
 @@ -607,6 +616,15 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
 int sprn, ulong *spr_val
  case SPRN_FSCR:
   *spr_val = vcpu->arch.fscr;
  break;
 +case SPRN_EBBHR:
  +*spr_val = vcpu->arch.ebbhr;
 +break;
 +case SPRN_EBBRR:
  +*spr_val = vcpu->arch.ebbrr;
 +break;
 +case SPRN_BESCR:
  +*spr_val = vcpu->arch.bescr;
 +break;
  default:
   unprivileged:
   printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn);
 diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
 index 51d469f8c9fd..828056ec208f 100644
 --- a/arch/powerpc/kvm/book3s_pr.c
 +++ b/arch/powerpc/kvm/book3s_pr.c
 @@ -900,6 +900,23 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct 
 kvm_vcpu *vcpu,
  case BOOK3S_INTERRUPT_PERFMON:
  r = RESUME_GUEST;
  break;
 +case BOOK3S_INTERRUPT_FAC_UNAVAIL:
 +{
 +/*
 + * Check for the facility that need to be emulated
 + */
  +ulong fscr_ic = vcpu->arch.shadow_fscr >> 56;
 +if (fscr_ic != FSCR_EBB_LG) {
 +/*
 + * We only disable EBB facility.
 + * So only emulate that.

 I don't understand the comment. We emulate nothing at all here. We either
  - hit an EBB unavailable in which case we send the guest an illegal 
 instruction interrupt or we
  - hit another facility interrupt in which case we forward the 
 interrupt to the guest, but not the interrupt cause (fscr_ic).


What I wanted to achieve was to enable both TAR

[PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-07 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

This patch fixes the below crash

NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
LR [c00439ac] .hash_page+0x18c/0x5e0
...
Call Trace:
[c00736103c40] [1b00] 0x1b00(unreliable)
[437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
[437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58

On ppc64 we use the pgtable for storing the hpte slot information and
store the address of the pgtable at a constant offset (PTRS_PER_PMD) from
the pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
the pgtable again, so that we find the pgtable at the PTRS_PER_PMD offset
from the new pmd.

We also want to move the withdraw and deposit before the set_pmd so
that, when a page fault finds the pmd as trans huge, we can be sure that
the pgtable can be located at that offset.

variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
for 3.12 stable series

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/Kconfig   |  3 +++
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 mm/huge_memory.c   | 12 
 3 files changed, 16 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index af2cc6eabcc7..bca9e7a18bd2 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -365,6 +365,9 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 config HAVE_ARCH_SOFT_DIRTY
bool
 
+config ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   bool
+
 config HAVE_MOD_ARCH_SPECIFIC
bool
help
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 6704e2e20e6b..0225011231ea 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -71,6 +71,7 @@ config PPC_BOOK3S_64
select PPC_FPU
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
+   select ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if PPC_64K_PAGES
 
 config PPC_BOOK3E_64
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 292a266e0d42..89b7a647f1cb 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1474,8 +1474,20 @@ int move_huge_pmd(struct vm_area_struct *vma, struct 
vm_area_struct *new_vma,
 
ret = __pmd_trans_huge_lock(old_pmd, vma);
if (ret == 1) {
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   pgtable_t pgtable;
+#endif
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
+#ifdef CONFIG_ARCH_THP_MOVE_PMD_ALWAYS_WITHDRAW
+   /*
+* Archs like ppc64 use pgtable to store per pmd
+* specific information. So when we switch the pmd,
+* we should also withdraw and deposit the pgtable
+*/
+   pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
+   pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
+#endif
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
		spin_unlock(&mm->page_table_lock);
}
-- 
1.8.3.2



[PATCH 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We will use this later to set the _PAGE_NUMA bit.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/hugetlb.h   |  2 +-
 arch/powerpc/include/asm/pgtable-ppc64.h | 26 +++---
 arch/powerpc/mm/pgtable_64.c | 12 +++-
 arch/powerpc/mm/subpage-prot.c   |  2 +-
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index d750336b171d..623f2971ce0e 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct 
mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
 #ifdef CONFIG_PPC64
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
+   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
 #else
return __pte(pte_update(ptep, ~0UL, 0));
 #endif
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index bc141c950b1e..eb9261024f51 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned 
long addr,
 static inline unsigned long pte_update(struct mm_struct *mm,
   unsigned long addr,
   pte_t *ptep, unsigned long clr,
+  unsigned long set,
   int huge)
 {
 #ifdef PTE_ATOMIC_UPDATES
@@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct 
*mm,
andi.   %1,%0,%6\n\
	bne-	1b \n\
	andc	%1,%0,%4 \n\
+	or	%1,%1,%7\n\
	stdcx.	%1,0,%3 \n\
	bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
-	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY)
+	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
	: "cc" );
 #else
unsigned long old = pte_val(*ptep);
-   *ptep = __pte(old & ~clr);
+   *ptep = __pte((old & ~clr) | set);
 #endif
/* huge pages use the old page table lock */
if (!huge)
@@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct 
mm_struct *mm,
 {
unsigned long old;
 
-   if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+   if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0);
+   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
	return (old & _PAGE_ACCESSED) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, 
unsigned long addr,
	if ((pte_val(*ptep) & _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 0);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
 }
 
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
@@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct 
*mm,
	if ((pte_val(*ptep) & _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 1);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
 }
 
 /*
@@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
 {
-   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0);
+   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 pte_t * ptep)
 {
-   pte_update(mm, addr, ptep, ~0UL, 0);
+   pte_update(mm, addr, ptep, ~0UL, 0, 0);
 }
 
 
@@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
 unsigned long addr,
-pmd_t *pmdp, unsigned long clr);
+pmd_t *pmdp,
+unsigned long clr,
+unsigned long set);
 
 static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
@@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct 
mm_struct *mm,
 
	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+   old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED, 0);
return ((old

[PATCH 2/3] mm: dirty accountable change only apply to non prot numa case

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

So move it within the if block

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 mm/mprotect.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..33eab902f10e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
if (pte_numa(ptent))
ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
+   /*
+* Avoid taking write faults for pages we
+* know to be dirty.
+*/
+   if (dirty_accountable && pte_dirty(ptent))
+   ptent = pte_mkwrite(ptent);
+   ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
} else {
struct page *page;
@@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
}
}
}
-
-   /*
-* Avoid taking write faults for pages we know to be
-* dirty.
-*/
-   if (dirty_accountable && pte_dirty(ptent)) {
-   ptent = pte_mkwrite(ptent);
-   updated = true;
-   }
-
if (updated)
pages++;
-
-   /* Only !prot_numa always clears the pte */
-   if (!prot_numa)
-   ptep_modify_prot_commit(mm, addr, pte, ptent);
	} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
 
-- 
1.8.3.2



[PATCH 3/3] mm: Use ptep/pmdp_set_numa for updating _PAGE_NUMA bit

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Archs like ppc64 don't do a tlb flush in the set_pte/pmd functions, and ppc64
also doesn't implement flush_tlb_range. ppc64 requires the tlb flushing to be
batched within ptl locks; the reason is to ensure that the hash page table is
kept in sync with the linux page table. We track the hpte index in the linux
pte, and if we clear it without flushing the hash and then drop the ptl lock,
another cpu can update the pte and we can end up with a double hash entry. We
also want to keep set_pte_at simple by not requiring it to do a hash flush,
for performance reasons. Hence we cannot use it while updating the _PAGE_NUMA
bit. Add new functions for marking a pte/pmd numa instead.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/pgtable.h | 22 ++
 include/asm-generic/pgtable.h  | 24 
 mm/huge_memory.c   |  9 ++---
 mm/mprotect.c  |  4 +---
 4 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index f83b6f3e1b39..3ebb188c3ff5 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -75,12 +75,34 @@ static inline pte_t pte_mknuma(pte_t pte)
return pte;
 }
 
+#define ptep_set_numa ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   if ((pte_val(*ptep) & _PAGE_PRESENT) == 0)
+   VM_BUG_ON(1);
+
+   pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0);
+   return;
+}
+
 #define pmd_numa pmd_numa
 static inline int pmd_numa(pmd_t pmd)
 {
return pte_numa(pmd_pte(pmd));
 }
 
+#define pmdp_set_numa pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0)
+   VM_BUG_ON(1);
+
+   pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA);
+   return;
+}
+
 #define pmd_mknonnuma pmd_mknonnuma
 static inline pmd_t pmd_mknonnuma(pmd_t pmd)
 {
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e4f41d9af4d..93fdb5315a0d 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -669,6 +669,18 @@ static inline int pmd_numa(pmd_t pmd)
 }
 #endif
 
+#ifndef pmdp_set_numa
+static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr,
+pmd_t *pmdp)
+{
+   pmd_t pmd = *pmdp;
+
+   pmd = pmd_mknuma(pmd);
+   set_pmd_at(mm, addr, pmdp, pmd);
+   return;
+}
+#endif
+
 /*
  * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically
  * because they're called by the NUMA hinting minor page fault. If we
@@ -701,6 +713,18 @@ static inline pte_t pte_mknuma(pte_t pte)
 }
 #endif
 
+#ifndef ptep_set_numa
+static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr,
+pte_t *ptep)
+{
+   pte_t ptent = *ptep;
+
+   ptent = pte_mknuma(ptent);
+   set_pte_at(mm, addr, ptep, ptent);
+   return;
+}
+#endif
+
 #ifndef pmd_mknuma
 static inline pmd_t pmd_mknuma(pmd_t pmd)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 82166bf974e1..da23eb96779f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1545,6 +1545,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
+   set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
@@ -1557,16 +1558,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
 */
			if (!is_huge_zero_page(page) &&
!pmd_numa(*pmd)) {
-   entry = *pmd;
-   entry = pmd_mknuma(entry);
+   pmdp_set_numa(mm, addr, pmd);
ret = HPAGE_PMD_NR;
}
}
-
-   /* Set PMD if cleared earlier */
-   if (ret == HPAGE_PMD_NR)
-   set_pmd_at(mm, addr, pmd, entry);
-
spin_unlock(ptl);
}
 
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33eab902f10e..769a67a15803 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -69,12 +69,10 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
} else {
struct page *page;
 
-   ptent = *pte;
page = vm_normal_page(vma, addr

[PATCH 0/3] powerpc: Fix random application crashes with NUMA_BALANCING enabled

2014-02-11 Thread Aneesh Kumar K.V
Hello,

This patch series fixes random application crashes observed on ppc64 with numa
balancing enabled. Without the patch we see crashes like

anacron[14551]: unhandled signal 11 at 0041 nip 3cfd54b4 lr 
3cfd5464 code 30001
anacron[14599]: unhandled signal 11 at 0041 nip 3efc54b4 lr 
3efc5464 code 30001

-aneesh



Re: [PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-11 Thread Aneesh Kumar K.V
Greg KH gre...@linuxfoundation.org writes:

 On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote:
 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 This patch fix the below crash
 
 NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
 LR [c00439ac] .hash_page+0x18c/0x5e0
 ...
 Call Trace:
 [c00736103c40] [1b00] 0x1b00(unreliable)
 [437908.479693] [c00736103d50] [c00439ac] .hash_page+0x18c/0x5e0
 [437908.479699] [c00736103e30] [c000924c] .do_hash_page+0x4c/0x58
 
 On ppc64 we use the pgtable for storing the hpte slot information and
 store address to the pgtable at a constant offset (PTRS_PER_PMD) from
 pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
 the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
 from new pmd.
 
 We also want to move the withdraw and deposit before the set_pmd so
 that, when page fault find the pmd as trans huge we can be sure that
 pgtable can be located at the offset.
 
 variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
 for 3.12 stable series

 This doesn't look like a variant, it looks totally different.  Why
 can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch
 (and follow-on fix) for 3.12?

Because the code in that function changed in 3.13. Kirill added split
ptl locks for huge pte, and we decide whether to withdraw and
deposit again based on the ptl locks in 3.13. In 3.12 we do that only
for ppc64 using an #ifdef.



 I _REALLY_ dislike patches that are totally different from Linus's tree
 in stable trees, it has caused nothing but problems in the past.


-aneesh



Re: [PATCH V2] powerpc: thp: Fix crash on mremap

2014-02-11 Thread Aneesh Kumar K.V
Benjamin Herrenschmidt b...@kernel.crashing.org writes:

 On Tue, 2014-02-11 at 09:31 -0800, Greg KH wrote:
 On Fri, Feb 07, 2014 at 07:21:57PM +0530, Aneesh Kumar K.V wrote:
  From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
  
  This patch fix the below crash
  
  NIP [c004cee4] .__hash_page_thp+0x2a4/0x440
  LR [c00439ac] .hash_page+0x18c/0x5e0
  ...
  Call Trace:
  [c00736103c40] [1b00] 0x1b00(unreliable)
  [437908.479693] [c00736103d50] [c00439ac] 
  .hash_page+0x18c/0x5e0
  [437908.479699] [c00736103e30] [c000924c] 
  .do_hash_page+0x4c/0x58
  
  On ppc64 we use the pgtable for storing the hpte slot information and
  store address to the pgtable at a constant offset (PTRS_PER_PMD) from
  pmd. On mremap, when we switch the pmd, we need to withdraw and deposit
  the pgtable again, so that we find the pgtable at PTRS_PER_PMD offset
  from new pmd.
  
  We also want to move the withdraw and deposit before the set_pmd so
  that, when page fault find the pmd as trans huge we can be sure that
  pgtable can be located at the offset.
  
  variant of upstream SHA1: b3084f4db3aeb991c507ca774337c7e7893ed04f
  for 3.12 stable series
 
 This doesn't look like a variant, it looks totally different.  Why
 can't I just take the b3084f4db3aeb991c507ca774337c7e7893ed04f patch
 (and follow-on fix) for 3.12?
 
 I _REALLY_ dislike patches that are totally different from Linus's tree
 in stable trees, it has caused nothing but problems in the past.

 I don't think it applies... (I tried on an internal tree) but the
 affected function changed in 3.13 in various ways. Aneesh, please
 provide a more detailed explanation and whether we should backport those
 other changes too or whether this is not necessary.

Yes, the affected function added support for split ptl locks for huge
pte. I don't think that is stable material.


 BTW. Aneesh, we need a 3.11.x one too


For 3.11.x it is already applied.

-aneesh



[PATCH V2 1/3] powerpc: mm: Add new set flag argument to pte/pmd update function

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

We will use this later to set the _PAGE_NUMA bit.

Acked-by: Mel Gorman mgor...@suse.de
Acked-by: Rik van Riel r...@redhat.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/hugetlb.h   |  2 +-
 arch/powerpc/include/asm/pgtable-ppc64.h | 26 +++---
 arch/powerpc/mm/pgtable_64.c | 12 +++-
 arch/powerpc/mm/subpage-prot.c   |  2 +-
 4 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index d750336b171d..623f2971ce0e 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -127,7 +127,7 @@ static inline pte_t huge_ptep_get_and_clear(struct 
mm_struct *mm,
unsigned long addr, pte_t *ptep)
 {
 #ifdef CONFIG_PPC64
-   return __pte(pte_update(mm, addr, ptep, ~0UL, 1));
+   return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1));
 #else
return __pte(pte_update(ptep, ~0UL, 0));
 #endif
diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h 
b/arch/powerpc/include/asm/pgtable-ppc64.h
index bc141c950b1e..eb9261024f51 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -195,6 +195,7 @@ extern void hpte_need_flush(struct mm_struct *mm, unsigned 
long addr,
 static inline unsigned long pte_update(struct mm_struct *mm,
   unsigned long addr,
   pte_t *ptep, unsigned long clr,
+  unsigned long set,
   int huge)
 {
 #ifdef PTE_ATOMIC_UPDATES
@@ -205,14 +206,15 @@ static inline unsigned long pte_update(struct mm_struct 
*mm,
andi.   %1,%0,%6\n\
	bne-	1b \n\
	andc	%1,%0,%4 \n\
+	or	%1,%1,%7\n\
	stdcx.	%1,0,%3 \n\
	bne-	1b"
	: "=&r" (old), "=&r" (tmp), "=m" (*ptep)
-	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY)
+	: "r" (ptep), "r" (clr), "m" (*ptep), "i" (_PAGE_BUSY), "r" (set)
	: "cc" );
 #else
unsigned long old = pte_val(*ptep);
-   *ptep = __pte(old & ~clr);
+   *ptep = __pte((old & ~clr) | set);
 #endif
/* huge pages use the old page table lock */
if (!huge)
@@ -231,9 +233,9 @@ static inline int __ptep_test_and_clear_young(struct 
mm_struct *mm,
 {
unsigned long old;
 
-   if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
+   if ((pte_val(*ptep) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0);
+   old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0);
	return (old & _PAGE_ACCESSED) != 0;
 }
 #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
@@ -252,7 +254,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, 
unsigned long addr,
	if ((pte_val(*ptep) & _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 0);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 0);
 }
 
 static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
@@ -261,7 +263,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct 
*mm,
	if ((pte_val(*ptep) & _PAGE_RW) == 0)
return;
 
-   pte_update(mm, addr, ptep, _PAGE_RW, 1);
+   pte_update(mm, addr, ptep, _PAGE_RW, 0, 1);
 }
 
 /*
@@ -284,14 +286,14 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
 {
-   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0);
+   unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0);
return __pte(old);
 }
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 pte_t * ptep)
 {
-   pte_update(mm, addr, ptep, ~0UL, 0);
+   pte_update(mm, addr, ptep, ~0UL, 0, 0);
 }
 
 
@@ -506,7 +508,9 @@ extern int pmdp_set_access_flags(struct vm_area_struct *vma,
 
 extern unsigned long pmd_hugepage_update(struct mm_struct *mm,
 unsigned long addr,
-pmd_t *pmdp, unsigned long clr);
+pmd_t *pmdp,
+unsigned long clr,
+unsigned long set);
 
 static inline int __pmdp_test_and_clear_young(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
@@ -515,7 +519,7 @@ static inline int __pmdp_test_and_clear_young(struct 
mm_struct *mm,
 
	if ((pmd_val(*pmdp) & (_PAGE_ACCESSED | _PAGE_HASHPTE)) == 0)
return 0;
-   old = pmd_hugepage_update(mm, addr, pmdp, _PAGE_ACCESSED);
+   old

[PATCH V2 0/3] powerpc: Fix random application crashes with NUMA_BALANCING enabled

2014-02-11 Thread Aneesh Kumar K.V
Hello,

This patch series fixes random application crashes observed on ppc64 with numa
balancing enabled. Without the patch we see crashes like

anacron[14551]: unhandled signal 11 at 0041 nip 3cfd54b4 lr 
3cfd5464 code 30001
anacron[14599]: unhandled signal 11 at 0041 nip 3efc54b4 lr 
3efc5464 code 30001

Changes from V1:
* Build fix for CONFIG_NUMA_BALANCING disabled

-aneesh



[PATCH V2 2/3] mm: dirty accountable change only apply to non prot numa case

2014-02-11 Thread Aneesh Kumar K.V
From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

So move it within the if block

Acked-by: Mel Gorman mgor...@suse.de
Reviewed-by: Rik van Riel r...@redhat.com
Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 mm/mprotect.c | 21 +++--
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7332c1785744..33eab902f10e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -58,6 +58,13 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
if (pte_numa(ptent))
ptent = pte_mknonnuma(ptent);
ptent = pte_modify(ptent, newprot);
+   /*
+* Avoid taking write faults for pages we
+* know to be dirty.
+*/
+   if (dirty_accountable && pte_dirty(ptent))
+   ptent = pte_mkwrite(ptent);
+   ptep_modify_prot_commit(mm, addr, pte, ptent);
updated = true;
} else {
struct page *page;
@@ -72,22 +79,8 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
}
}
}
-
-   /*
-* Avoid taking write faults for pages we know to be
-* dirty.
-*/
-   if (dirty_accountable && pte_dirty(ptent)) {
-   ptent = pte_mkwrite(ptent);
-   updated = true;
-   }
-
if (updated)
pages++;
-
-   /* Only !prot_numa always clears the pte */
-   if (!prot_numa)
-   ptep_modify_prot_commit(mm, addr, pte, ptent);
	} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
 
-- 
1.8.3.2


