[RFC PATCH 12/17] powerpc/kvm/hash: Implement HASH_PROTECT hcall

2017-08-01 Thread Aneesh Kumar K.V
This is equivalent to the H_PROTECT hcall, but it takes the hash value as the
argument instead of the hash PTE slot number. We will use this later to speed
up HPTE update operations in the guest: instead of finding the slot number
with the H_READ4 hcall, we can use the hash value directly with this hcall.

The H_AVPN flag is required; the hcall returns an error otherwise.
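
For illustration only (not part of the patch), a rough sketch of how a pseries
guest might use this, assuming the plpar_pte_hash_protect() wrapper added below
and the existing hpt_hash()/hpte_encode_avpn() helpers; the function name is
made up:

/*
 * Illustrative sketch only: update the protection bits of an HPTE
 * identified by its hash value rather than its slot number.
 */
static long example_hash_protect(unsigned long vpn, unsigned long newpp,
				 int psize, int ssize)
{
	unsigned long hash, want_v, flags;

	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
	want_v = hpte_encode_avpn(vpn, psize, ssize);
	/* H_AVPN is mandatory for H_HASH_PROTECT */
	flags = (newpp & 7) | H_AVPN;

	return plpar_pte_hash_protect(flags, hash, want_v);
}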

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/hvcall.h |  3 +-
 arch/powerpc/include/asm/plpar_wrappers.h |  7 +++
 arch/powerpc/kvm/book3s_hv.c  |  1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   | 74 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  1 +
 5 files changed, 63 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 6a09e91889cf..c234be675774 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -281,7 +281,8 @@
 #define H_REGISTER_PROC_TBL	0x37C
 #define H_SIGNAL_SYS_RESET 0x380
 #define H_HASH_REMOVE  0x384
-#define MAX_HCALL_OPCODE   H_HASH_REMOVE
+#define H_HASH_PROTECT 0x388
+#define MAX_HCALL_OPCODE   H_HASH_PROTECT
 
 /* H_VIOCTL functions */
 #define H_GET_VIOA_DUMP_SIZE   0x01
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
b/arch/powerpc/include/asm/plpar_wrappers.h
index 8160fea9b5bc..27e30ca6105d 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -226,6 +226,13 @@ static inline long plpar_pte_protect(unsigned long flags, 
unsigned long ptex,
return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn);
 }
 
+static inline long plpar_pte_hash_protect(unsigned long flags,
+ unsigned long hash,
+ unsigned long avpn)
+{
+   return plpar_hcall_norets(H_HASH_PROTECT, flags, hash, avpn);
+}
+
 static inline long plpar_resize_hpt_prepare(unsigned long flags,
unsigned long shift)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 86c66af38637..d7be56339d53 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4144,6 +4144,7 @@ static unsigned int default_hcall_list[] = {
H_XIRR,
H_XIRR_X,
 #endif
+   H_HASH_PROTECT,
H_HASH_REMOVE,
0
 };
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 85fedb72469b..2aa507614819 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -752,33 +752,14 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
return ret;
 }
 
-long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
- unsigned long pte_index, unsigned long avpn,
- unsigned long va)
+long __kvmppc_do_hash_protect(struct kvm *kvm, __be64 *hpte,
+ unsigned long flags, unsigned long pte_index)
 {
-   struct kvm *kvm = vcpu->kvm;
-   __be64 *hpte;
+   u64 pte_v, pte_r;
struct revmap_entry *rev;
unsigned long v, r, rb, mask, bits;
-   u64 pte_v, pte_r;
-
-   if (kvm_is_radix(kvm))
-   return H_FUNCTION;
-   if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
-   return H_PARAMETER;
 
-   hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
-   while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
-   cpu_relax();
v = pte_v = be64_to_cpu(hpte[0]);
-   if (cpu_has_feature(CPU_FTR_ARCH_300))
-   v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1]));
-   if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
-   ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) {
-   __unlock_hpte(hpte, pte_v);
-   return H_NOT_FOUND;
-   }
-
pte_r = be64_to_cpu(hpte[1]);
bits = (flags << 55) & HPTE_R_PP0;
bits |= (flags << 48) & HPTE_R_KEY_HI;
@@ -823,6 +804,55 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long 
flags,
return H_SUCCESS;
 }
 
+long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
+ unsigned long pte_index, unsigned long avpn,
+ unsigned long va)
+{
+   __be64 *hpte;
+   u64 v, pte_v;
+   struct kvm *kvm = vcpu->kvm;
+
+   if (kvm_is_radix(kvm))
+   return H_FUNCTION;
+   if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+   return H_PARAMETER;
+
+   hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+   while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+   cpu_relax();
+   v = pte_v = be64_to_cpu(hpte[0]);
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1]));
+   if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
+   ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) {
+ 

[RFC PATCH 11/17] powerpc/kvm/hash: Implement HASH_REMOVE hcall

2017-08-01 Thread Aneesh Kumar K.V
This is equivalent to the H_REMOVE hcall, but it takes the hash value as the
argument instead of the hash PTE slot number. We will use this later to speed
up the invalidate operation in the guest: instead of finding the slot number
with the H_READ4 hcall, we can use the hash value directly with this hcall.

The only supported flag for this operation is H_AVPN.
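
For illustration only (not part of the patch), a rough sketch of a guest-side
invalidate using the plpar_pte_hash_remove() wrapper added below; the function
name is made up and mirrors the existing slot-based
pSeries_lpar_hpte_invalidate():

/*
 * Illustrative sketch only: invalidate an HPTE by hash value instead of
 * slot number.
 */
static void example_hash_invalidate(unsigned long vpn, int psize, int ssize)
{
	unsigned long hash, want_v, dummy1, dummy2;
	long rc;

	hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
	want_v = hpte_encode_avpn(vpn, psize, ssize);

	/* H_AVPN is the only supported flag */
	rc = plpar_pte_hash_remove(H_AVPN, hash, want_v, &dummy1, &dummy2);
	if (rc == H_NOT_FOUND)
		return;
	BUG_ON(rc != H_SUCCESS);
}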

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/hvcall.h |   3 +-
 arch/powerpc/include/asm/plpar_wrappers.h |  16 
 arch/powerpc/kvm/book3s_hv.c  |   1 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c   | 134 ++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |   2 +
 5 files changed, 138 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index 57d38b504ff7..6a09e91889cf 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -280,7 +280,8 @@
 #define H_RESIZE_HPT_COMMIT	0x370
 #define H_REGISTER_PROC_TBL	0x37C
 #define H_SIGNAL_SYS_RESET 0x380
-#define MAX_HCALL_OPCODE   H_SIGNAL_SYS_RESET
+#define H_HASH_REMOVE  0x384
+#define MAX_HCALL_OPCODE   H_HASH_REMOVE
 
 /* H_VIOCTL functions */
 #define H_GET_VIOA_DUMP_SIZE   0x01
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h 
b/arch/powerpc/include/asm/plpar_wrappers.h
index c7b164836bc3..8160fea9b5bc 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -124,6 +124,22 @@ static inline long plpar_pte_remove(unsigned long flags, 
unsigned long ptex,
return rc;
 }
 
+static inline long plpar_pte_hash_remove(unsigned long flags, unsigned long 
hash,
+   unsigned long avpn, unsigned long 
*old_pteh_ret,
+   unsigned long *old_ptel_ret)
+{
+   long rc;
+   unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+   rc = plpar_hcall(H_HASH_REMOVE, retbuf, flags, hash, avpn);
+
+   *old_pteh_ret = retbuf[0];
+   *old_ptel_ret = retbuf[1];
+
+   return rc;
+}
+
+
 /* plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw */
 static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long 
ptex,
unsigned long avpn, unsigned long *old_pteh_ret,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0b436df746fc..86c66af38637 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4144,6 +4144,7 @@ static unsigned int default_hcall_list[] = {
H_XIRR,
H_XIRR_X,
 #endif
+   H_HASH_REMOVE,
0
 };
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 5abaed27708b..85fedb72469b 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -465,34 +465,21 @@ static void do_tlbies(struct kvm *kvm, unsigned long 
*rbvalues,
}
 }
 
-long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
-   unsigned long pte_index, unsigned long avpn,
-   unsigned long *hpret)
+static long __kvmppc_do_hash_remove(struct kvm *kvm, __be64 *hpte,
+   unsigned long pte_index,
+   unsigned long *hpret)
 {
-   __be64 *hpte;
+
unsigned long v, r, rb;
struct revmap_entry *rev;
u64 pte, orig_pte, pte_r;
 
-   if (kvm_is_radix(kvm))
-   return H_FUNCTION;
-   if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
-   return H_PARAMETER;
-   hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
-   while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
-   cpu_relax();
pte = orig_pte = be64_to_cpu(hpte[0]);
pte_r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
pte = hpte_new_to_old_v(pte, pte_r);
pte_r = hpte_new_to_old_r(pte_r);
}
-   if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
-   ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
-   ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-   __unlock_hpte(hpte, orig_pte);
-   return H_NOT_FOUND;
-   }
 
rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
v = pte & ~HPTE_V_HVLOCK;
@@ -525,6 +512,35 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long 
flags,
hpret[1] = r;
return H_SUCCESS;
 }
+
+long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
+   unsigned long pte_index, unsigned long avpn,
+   unsigned long *hpret)
+{
+   __be64 *hpte;
+   u64 pte, orig_pte, pte_r;
+
+   if (kvm_is_radix(kvm))
+   return H_FUNCTION;
+   if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
+   return H_PARAMETER;
+   hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
+

[RFC PATCH 10/17] powerpc/mm: Add new firmware feature HASH API

2017-08-01 Thread Aneesh Kumar K.V
We will use this feature to check whether the hypervisor implements the
hash-based remove and protect hcalls.
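
For illustration only, a sketch of how guest code could gate the new hcalls on
this feature, falling back to the slot-based variants otherwise; the wrapper
names come from the H_HASH_REMOVE patch in this series and the function itself
is made up:

/*
 * Illustrative sketch only: use the hash-based remove hcall when the
 * hypervisor advertises "hcall-hash-api", otherwise fall back to the
 * slot-based H_REMOVE path.
 */
static long example_remove(unsigned long hash, unsigned long slot,
			   unsigned long want_v,
			   unsigned long *hi, unsigned long *lo)
{
	if (firmware_has_feature(FW_FEATURE_HASH_API))
		return plpar_pte_hash_remove(H_AVPN, hash, want_v, hi, lo);
	return plpar_pte_remove(H_AVPN, slot, want_v, hi, lo);
}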

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/firmware.h   | 3 ++-
 arch/powerpc/kvm/powerpc.c| 4 
 arch/powerpc/platforms/pseries/firmware.c | 1 +
 include/uapi/linux/kvm.h  | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 8645897472b1..152d704ac3c3 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -51,6 +51,7 @@
 #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000)
 #define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
 #define FW_FEATURE_PRRN		ASM_CONST(0x0002)
+#define FW_FEATURE_HASH_API	ASM_CONST(0x0004)
 
 #ifndef __ASSEMBLY__
 
@@ -67,7 +68,7 @@ enum {
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
-   FW_FEATURE_HPT_RESIZE,
+   FW_FEATURE_HPT_RESIZE | FW_FEATURE_HASH_API,
FW_FEATURE_PSERIES_ALWAYS = 0,
FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1a75c0b5f4ca..bd551edfa155 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -632,6 +632,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
/* Disable this on POWER9 until code handles new HPTE format */
r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
break;
+   case KVM_CAP_SPAPR_HASH_API:
+   /* Only enable for HV kvm */
+   r = is_kvmppc_hv_enabled(kvm);
+   break;
 #endif
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_FWNMI:
diff --git a/arch/powerpc/platforms/pseries/firmware.c 
b/arch/powerpc/platforms/pseries/firmware.c
index 63cc82ad58ac..32081d4406e8 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -65,6 +65,7 @@ hypertas_fw_features_table[] = {
{FW_FEATURE_SET_MODE,   "hcall-set-mode"},
{FW_FEATURE_BEST_ENERGY,"hcall-best-energy-1*"},
{FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"},
+   {FW_FEATURE_HASH_API,   "hcall-hash-api"},
 };
 
 /* Build up the firmware features bitmask using the contents of
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6cd63c18708a..698b202b4c53 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -929,6 +929,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_SMT_POSSIBLE 147
 #define KVM_CAP_HYPERV_SYNIC2 148
 #define KVM_CAP_HYPERV_VP_INDEX 149
+#define KVM_CAP_SPAPR_HASH_API 150
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.13.3



[RFC PATCH 09/17] powerpc/mm: Remove unused flag arg in global_invalidates

2017-08-01 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index fedb0139524c..5abaed27708b 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -42,7 +42,7 @@ static void *real_vmalloc_addr(void *x)
 }
 
 /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
-static int global_invalidates(struct kvm *kvm, unsigned long flags)
+static int global_invalidates(struct kvm *kvm)
 {
int global;
int cpu;
@@ -499,7 +499,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long 
flags,
if (v & HPTE_V_VALID) {
hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
rb = compute_tlbie_rb(v, pte_r, pte_index);
-   do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
+   do_tlbies(kvm, &rb, 1, global_invalidates(kvm), true);
/*
 * The reference (R) and change (C) bits in a HPT
 * entry can be set by hardware at any time up until
@@ -549,7 +549,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
if (kvm_is_radix(kvm))
return H_FUNCTION;
-   global = global_invalidates(kvm, 0);
+   global = global_invalidates(kvm);
for (i = 0; i < 4 && ret == H_SUCCESS; ) {
n = 0;
for (; i < 4; ++i) {
@@ -709,8 +709,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long 
flags,
rb = compute_tlbie_rb(v, r, pte_index);
hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) |
  HPTE_V_ABSENT);
-   do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags),
- true);
+   do_tlbies(kvm, &rb, 1, global_invalidates(kvm), true);
/* Don't lose R/C bit updates done by hardware */
r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C);
hpte[1] = cpu_to_be64(r);
-- 
2.13.3



[RFC PATCH 08/17] powerpc/mm/hash: Don't track hash pte slot number in linux page table.

2017-08-01 Thread Aneesh Kumar K.V
Now that we have updated all MMU hash operations to work with the hash value
instead of the slot, remove slot tracking completely. We also remove real_pte
because, without slot tracking, the 4k, 64k and 64k-subpage PTE formats are
all similar.

One side effect of this is that we no longer track whether we have taken a
fault on a 4k subpage of a 64k page. That means an invalidate will try to
invalidate all the 4k subpages.

To minimize the impact of the above, THP still tracks the slot details. With
THP we have 4096 subpages and we want to avoid calling invalidate on all of
them. For THP the slot details are not tracked in the linux page table, but
in the deposited page table.
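
For reference, a sketch of roughly how the slot used to be reconstructed from
the PTE bits this patch frees up, as seen in the hunks removed below (one bit
selects the secondary hash group, three bits index into the 8-entry group);
the helper itself is made up:

/*
 * Sketch of the old slot reconstruction removed by this patch:
 * H_PAGE_F_SECOND selects the primary vs secondary hash group,
 * H_PAGE_F_GIX gives the index within the 8-entry group.
 */
static unsigned long old_slot_from_pte(unsigned long old_pte,
				       unsigned long hash)
{
	unsigned long slot;

	if (old_pte & H_PAGE_F_SECOND)
		hash = ~hash;
	slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
	slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
	return slot;
}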

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h   | 16 +++-
 arch/powerpc/include/asm/book3s/64/hash-64k.h  | 44 +-
 arch/powerpc/include/asm/book3s/64/hash.h  |  5 +-
 arch/powerpc/include/asm/book3s/64/pgtable.h   | 26 --
 arch/powerpc/include/asm/book3s/64/tlbflush-hash.h |  3 +-
 arch/powerpc/include/asm/pgtable-be-types.h| 10 ---
 arch/powerpc/include/asm/pgtable-types.h   |  9 --
 arch/powerpc/mm/dump_linuxpagetables.c | 10 ---
 arch/powerpc/mm/hash64_4k.c|  2 -
 arch/powerpc/mm/hash64_64k.c   | 95 +-
 arch/powerpc/mm/hash_native_64.c   | 12 +--
 arch/powerpc/mm/hash_utils_64.c| 22 +
 arch/powerpc/mm/hugetlbpage-hash64.c   |  4 -
 arch/powerpc/mm/tlb_hash64.c   |  9 +-
 arch/powerpc/platforms/pseries/lpar.c  |  4 +-
 15 files changed, 50 insertions(+), 221 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..d65dcb5826ff 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -17,8 +17,7 @@
 #define H_PGD_TABLE_SIZE   (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
 
 /* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
-H_PAGE_F_SECOND | H_PAGE_F_GIX)
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE)
 /*
  * Not supported by 4k linux page size
  */
@@ -27,6 +26,19 @@
 #define H_PAGE_COMBO   0x0
 #define H_PTE_FRAG_NR  0
 #define H_PTE_FRAG_SIZE_SHIFT  0
+
+#define pte_iterate_hashed_subpages(vpn, psize, index, shift)  \
+   do {\
+   index = 0;  \
+   shift = mmu_psize_defs[psize].shift;\
+
+#define pte_iterate_hashed_end() } while(0)
+/*
+ * We expect this to be called only for user addresses or kernel virtual
+ * addresses other than the linear mapping.
+ */
+#define pte_pagesize_index(mm, addr, pte)  MMU_PAGE_4K
+
 /*
  * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
  */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9732837aaae8..ab36323b8a3e 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -25,8 +25,7 @@
 #define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND)
 
 /* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
-H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO)
 /*
  * we support 16 fragments per PTE page of 64K size.
  */
@@ -40,55 +39,16 @@
 
 #ifndef __ASSEMBLY__
 #include 
-
-/*
- * With 64K pages on hash table, we have a special PTE format that
- * uses a second "half" of the page table to encode sub-page information
- * in order to deal with 64K made of 4K HW pages. Thus we override the
- * generic accessors and iterators here
- */
-#define __real_pte __real_pte
-static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
-{
-   real_pte_t rpte;
-   unsigned long *hidxp;
-
-   rpte.pte = pte;
-   rpte.hidx = 0;
-   if (pte_val(pte) & H_PAGE_COMBO) {
-   /*
-* Make sure we order the hidx load against the H_PAGE_COMBO
-* check. The store side ordering is done in __hash_page_4K
-*/
-   smp_rmb();
-   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-   rpte.hidx = *hidxp;
-   }
-   return rpte;
-}
-
-static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long 
index)
-{
-   if ((pte_val(rpte.pte) & H_PAGE_COMBO))
-   return (rpte.hidx >> (index<<2)) & 0xf;
-   return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
-}
-
-#define __rpte_to_pte(r)   ((r).pte)
-extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 /*
  * Trick: we 

[RFC PATCH 07/17] powerpc/mm: Add hash updatepp callback

2017-08-01 Thread Aneesh Kumar K.V
Add hash based updatepp callback and use that during hash pte fault.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  6 +
 arch/powerpc/mm/hash64_4k.c   |  7 +
 arch/powerpc/mm/hash64_64k.c  | 17 +++-
 arch/powerpc/mm/hash_native_64.c  | 37 +++
 arch/powerpc/mm/hugetlbpage-hash64.c  |  9 ++-
 arch/powerpc/platforms/ps3/htab.c | 29 +
 arch/powerpc/platforms/pseries/lpar.c | 31 ++
 7 files changed, 109 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 7e1fcae472f0..a784d4ac4fb1 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -144,6 +144,12 @@ struct mmu_hash_ops {
 unsigned long vpn,
 int bpsize, int apsize,
 int ssize, unsigned long flags);
+   long(*hash_updatepp)(unsigned long hash,
+unsigned long newpp,
+unsigned long vpn,
+int bpsize, int apsize,
+int ssize, unsigned long flags);
+
void(*hpte_updateboltedpp)(unsigned long newpp,
   unsigned long ea,
   int psize, int ssize);
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index 6fa450c12d6d..d262d814ca55 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -65,12 +65,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, 
unsigned long vsid,
 * There MIGHT be an HPTE for this pte
 */
hash = hpt_hash(vpn, shift, ssize);
-   if (old_pte & H_PAGE_F_SECOND)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-   if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_4K,
+   if (mmu_hash_ops.hash_updatepp(hash, rflags, vpn, MMU_PAGE_4K,
   MMU_PAGE_4K, ssize, flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
}
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 1a68cb19b0e3..2b72f2c5ed10 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -127,17 +127,11 @@ int __hash_page_4K(unsigned long ea, unsigned long 
access, unsigned long vsid,
int ret;
 
hash = hpt_hash(vpn, shift, ssize);
-   hidx = __rpte_to_hidx(rpte, subpg_index);
-   if (hidx & _PTEIDX_SECONDARY)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
-
-   ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
+   ret = mmu_hash_ops.hash_updatepp(hash, rflags, vpn,
 MMU_PAGE_4K, MMU_PAGE_4K,
 ssize, flags);
/*
-*if we failed because typically the HPTE wasn't really here
+* if we failed because typically the HPTE wasn't really here
 * we try an insertion.
 */
if (ret == -1)
@@ -268,12 +262,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 * There MIGHT be an HPTE for this pte
 */
hash = hpt_hash(vpn, shift, ssize);
-   if (old_pte & H_PAGE_F_SECOND)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-   if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, MMU_PAGE_64K,
+   if (mmu_hash_ops.hash_updatepp(hash, rflags, vpn, MMU_PAGE_64K,
   MMU_PAGE_64K, ssize,
   flags) == -1)
old_pte &= ~_PAGE_HPTEFLAGS;
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index c3fdd684a287..2eaded4680ae 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -391,6 +391,42 @@ struct hash_pte *native_hpte_find(unsigned long hash, 
unsigned long vpn,
return NULL;
 }
 
+static long native_hash_updatepp(unsigned long hash, unsigned long newpp,
+unsigned long 

[RFC PATCH 06/17] powerpc/mm: Switch flush_hash_range to not use slot

2017-08-01 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/hash_native_64.c  | 28 
 arch/powerpc/platforms/pseries/lpar.c | 13 -
 2 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index ce25e125dd06..c3fdd684a287 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -702,10 +702,8 @@ static void native_hpte_clear(void)
 static void native_flush_hash_range(unsigned long number, int local)
 {
unsigned long vpn;
-   unsigned long hash, index, hidx, shift, slot;
+   unsigned long hash, index, shift;
struct hash_pte *hptep;
-   unsigned long hpte_v;
-   unsigned long want_v;
unsigned long flags;
real_pte_t pte;
struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
@@ -725,23 +723,13 @@ static void native_flush_hash_range(unsigned long number, 
int local)
 
pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
hash = hpt_hash(vpn, shift, ssize);
-   hidx = __rpte_to_hidx(pte, index);
-   if (hidx & _PTEIDX_SECONDARY)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
-   hptep = htab_address + slot;
-   want_v = hpte_encode_avpn(vpn, psize, ssize);
-   native_lock_hpte(hptep);
-   hpte_v = be64_to_cpu(hptep->v);
-   if (cpu_has_feature(CPU_FTR_ARCH_300))
-   hpte_v = hpte_new_to_old_v(hpte_v,
-   be64_to_cpu(hptep->r));
-   if (!HPTE_V_COMPARE(hpte_v, want_v) ||
-   !(hpte_v & HPTE_V_VALID))
-   native_unlock_hpte(hptep);
-   else
-   hptep->v = 0;
+   hptep = native_hpte_find(hash, vpn, psize, ssize);
+   if (!hptep)
+   continue;
+   /*
+* Invalidate the hpte. NOTE: this also unlocks it
+*/
+   hptep->v = 0;
} pte_iterate_hashed_end();
}
 
diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index e366252e0e93..ad7838171bb0 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -580,14 +580,14 @@ static int pSeries_lpar_hpte_removebolted(unsigned long 
ea,
 static void pSeries_lpar_flush_hash_range(unsigned long number, int local)
 {
unsigned long vpn;
-   unsigned long i, pix, rc;
+   unsigned long i, rc;
unsigned long flags = 0;
struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
unsigned long param[PLPAR_HCALL9_BUFSIZE];
-   unsigned long hash, index, shift, hidx, slot;
+   unsigned long index, shift, slot;
real_pte_t pte;
-   int psize, ssize;
+   int psize, ssize, pix;
 
if (lock_tlbie)
spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
@@ -599,12 +599,7 @@ static void pSeries_lpar_flush_hash_range(unsigned long 
number, int local)
vpn = batch->vpn[i];
pte = batch->pte[i];
pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
-   hash = hpt_hash(vpn, shift, ssize);
-   hidx = __rpte_to_hidx(pte, index);
-   if (hidx & _PTEIDX_SECONDARY)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
+   slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) {
/*
 * lpar doesn't use the passed actual page size
-- 
2.13.3



[RFC PATCH 05/17] powerpc/mm: use hash_invalidate for __kernel_map_pages()

2017-08-01 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/hash_utils_64.c | 32 +---
 1 file changed, 5 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index a02570b4cfed..66f12b48f838 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -118,11 +118,6 @@ EXPORT_SYMBOL_GPL(mmu_slb_size);
 #ifdef CONFIG_PPC_64K_PAGES
 int mmu_ci_restrictions;
 #endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-static u8 *linear_map_hash_slots;
-static unsigned long linear_map_hash_count;
-static DEFINE_SPINLOCK(linear_map_hash_lock);
-#endif /* CONFIG_DEBUG_PAGEALLOC */
 struct mmu_hash_ops mmu_hash_ops;
 EXPORT_SYMBOL(mmu_hash_ops);
 
@@ -1746,7 +1741,7 @@ long hpte_insert_repeating(unsigned long hash, unsigned 
long vpn,
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
-static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi)
+static void kernel_map_linear_page(unsigned long vaddr)
 {
unsigned long hash;
unsigned long vsid = get_kernel_vsid(vaddr, mmu_kernel_ssize);
@@ -1763,12 +1758,7 @@ static void kernel_map_linear_page(unsigned long vaddr, 
unsigned long lmi)
ret = hpte_insert_repeating(hash, vpn, __pa(vaddr), mode,
HPTE_V_BOLTED,
mmu_linear_psize, mmu_kernel_ssize);
-
BUG_ON (ret < 0);
-   spin_lock(&linear_map_hash_lock);
-   BUG_ON(linear_map_hash_slots[lmi] & 0x80);
-   linear_map_hash_slots[lmi] = ret | 0x80;
-   spin_unlock(&linear_map_hash_lock);
 }
 
 static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi)
@@ -1778,35 +1768,23 @@ static void kernel_unmap_linear_page(unsigned long 
vaddr, unsigned long lmi)
unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize);
 
hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize);
-   spin_lock(&linear_map_hash_lock);
-   BUG_ON(!(linear_map_hash_slots[lmi] & 0x80));
-   hidx = linear_map_hash_slots[lmi] & 0x7f;
-   linear_map_hash_slots[lmi] = 0;
-   spin_unlock(&linear_map_hash_lock);
-   if (hidx & _PTEIDX_SECONDARY)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
-   mmu_hash_ops.hpte_invalidate(slot, vpn, mmu_linear_psize,
+   mmu_hash_ops.hash_invalidate(hash, vpn, mmu_linear_psize,
 mmu_linear_psize,
 mmu_kernel_ssize, 0);
 }
 
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
-   unsigned long flags, vaddr, lmi;
+   unsigned long flags, vaddr;
int i;
 
local_irq_save(flags);
for (i = 0; i < numpages; i++, page++) {
vaddr = (unsigned long)page_address(page);
-   lmi = __pa(vaddr) >> PAGE_SHIFT;
-   if (lmi >= linear_map_hash_count)
-   continue;
if (enable)
-   kernel_map_linear_page(vaddr, lmi);
+   kernel_map_linear_page(vaddr);
else
-   kernel_unmap_linear_page(vaddr, lmi);
+   kernel_unmap_linear_page(vaddr);
}
local_irq_restore(flags);
 }
-- 
2.13.3



[RFC PATCH 04/17] powerpc/mm: Add hash invalidate callback

2017-08-01 Thread Aneesh Kumar K.V
Add a hash-based invalidate callback and use it in flush_hash_page.
Note: In a later patch, we will drop the slot tracking completely. At that
point we will also lose the __rpte_sub_valid() check in
pte_iterate_hashed_subpages(). That means we call invalidate for all subpages
irrespective of whether we took a hash fault on them or not.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  4 
 arch/powerpc/mm/hash_native_64.c  | 27 +++
 arch/powerpc/mm/hash_utils_64.c   | 11 +++
 arch/powerpc/platforms/ps3/htab.c | 22 ++
 arch/powerpc/platforms/pseries/lpar.c | 26 ++
 5 files changed, 82 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index f9cce40a4035..7e1fcae472f0 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -135,6 +135,10 @@ struct mmu_hash_ops {
   unsigned long vpn,
   int bpsize, int apsize,
   int ssize, int local);
+   void(*hash_invalidate)(unsigned long hash,
+  unsigned long vpn,
+  int bpsize, int apsize,
+  int ssize, int local);
long(*hpte_updatepp)(unsigned long slot,
 unsigned long newpp,
 unsigned long vpn,
diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 4b3f6d66e7f0..ce25e125dd06 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -492,6 +492,32 @@ static void native_hpte_invalidate(unsigned long slot, 
unsigned long vpn,
local_irq_restore(flags);
 }
 
+static void native_hash_invalidate(unsigned long hash, unsigned long vpn,
+  int bpsize, int apsize, int ssize, int local)
+{
+   unsigned long flags;
+   struct hash_pte *hptep;
+
+   DBG_LOW("invalidate(vpn=%016lx, hash: %lx)\n", vpn, hash);
+   local_irq_save(flags);
+   hptep = native_hpte_find(hash, vpn, bpsize, ssize);
+   if (hptep) {
+   /*
+* Invalidate the hpte. NOTE: this also unlocks it
+*/
+   hptep->v = 0;
+   }
+   /*
+* We need to invalidate the TLB always because hpte_remove doesn't do
+* a tlb invalidate. If a hash bucket gets full, we "evict" a more/less
+* random entry from it. When we do that we don't invalidate the TLB
+* (hpte_remove) because we assume the old translation is still
+* technically "valid".
+*/
+   tlbie(vpn, bpsize, apsize, ssize, local);
+   local_irq_restore(flags);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 static void native_hugepage_invalidate(unsigned long vsid,
   unsigned long addr,
@@ -771,6 +797,7 @@ static int native_register_proc_table(unsigned long base, 
unsigned long page_siz
 void __init hpte_init_native(void)
 {
mmu_hash_ops.hpte_invalidate= native_hpte_invalidate;
+   mmu_hash_ops.hash_invalidate= native_hash_invalidate;
mmu_hash_ops.hpte_updatepp  = native_hpte_updatepp;
mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index ff3c9522a2b3..a02570b4cfed 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1598,23 +1598,18 @@ static inline void tm_flush_hash_page(int local)
 void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 unsigned long flags)
 {
-   unsigned long hash, index, shift, hidx, slot;
+   unsigned long hash, index, shift;
int local = flags & HPTE_LOCAL_UPDATE;
 
DBG_LOW("flush_hash_page(vpn=%016lx)\n", vpn);
pte_iterate_hashed_subpages(pte, psize, vpn, index, shift) {
hash = hpt_hash(vpn, shift, ssize);
-   hidx = __rpte_to_hidx(pte, index);
-   if (hidx & _PTEIDX_SECONDARY)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
-   DBG_LOW(" sub %ld: hash=%lx, hidx=%lx\n", index, slot, hidx);
+   DBG_LOW(" sub %ld: hash=%lx\n", index, hash);
/*
 * We use same base page size and actual psize, because we don't
 * use these functions for hugepage
 */
-

[RFC PATCH 03/17] powerpc/ps3/mm: Add helper for finding hash pte slot using hash value

2017-08-01 Thread Aneesh Kumar K.V
We will use this in a later patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/ps3/htab.c | 37 +
 1 file changed, 37 insertions(+)

diff --git a/arch/powerpc/platforms/ps3/htab.c 
b/arch/powerpc/platforms/ps3/htab.c
index cc2b281a3766..255b7a33fefe 100644
--- a/arch/powerpc/platforms/ps3/htab.c
+++ b/arch/powerpc/platforms/ps3/htab.c
@@ -193,6 +193,43 @@ static void ps3_hpte_clear(void)
ps3_mm_vas_destroy();
 }
 
+static long ps3_hpte_find(unsigned long hash, unsigned long want_v)
+{
+   unsigned long i, j, result;
+   unsigned long hpte_group;
+   bool secondary_search = false;
+   u64 hpte_v_array[4], hpte_rs;
+
+
+   /* first check primary */
+   hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+search_again:
+   for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
+
+   result = lv1_read_htab_entries(PS3_LPAR_VAS_ID_CURRENT,
+  hpte_group & ~0x3UL, &hpte_v_array[0],
+  &hpte_v_array[1], &hpte_v_array[2],
+  &hpte_v_array[3], &hpte_rs);
+   /* ignore failures ? */
+   if (result)
+   continue;
+
+   for (j = 0; j < 4; j++) {
+   if (HPTE_V_COMPARE(hpte_v_array[j], want_v) &&
+   (hpte_v_array[j] & HPTE_V_VALID)) {
+   return hpte_group + j;
+   }
+   }
+   }
+   if (!secondary_search) {
+   hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+   secondary_search = true;
+   goto search_again;
+   }
+   return -1;
+}
+
 void __init ps3_hpte_init(unsigned long htab_size)
 {
mmu_hash_ops.hpte_invalidate = ps3_hpte_invalidate;
-- 
2.13.3



[RFC PATCH 02/17] powerpc/pseries: Update hpte find helper to take hash value

2017-08-01 Thread Aneesh Kumar K.V
The helper now also does a secondary hash search so that we can use it in
other functions.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/pseries/lpar.c | 28 +---
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c 
b/arch/powerpc/platforms/pseries/lpar.c
index 495ba4e7336d..edab68d9f9f3 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -328,15 +328,21 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot,
return 0;
 }
 
-static long __pSeries_lpar_hpte_find(unsigned long want_v, unsigned long 
hpte_group)
+static long __pSeries_lpar_hpte_find(unsigned long hash, unsigned long want_v)
 {
long lpar_rc;
unsigned long i, j;
+   unsigned long hpte_group;
+   bool secondary_search = false;
struct {
unsigned long pteh;
unsigned long ptel;
} ptes[4];
 
+   /* first check primary */
+   hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+
+search_again:
for (i = 0; i < HPTES_PER_GROUP; i += 4, hpte_group += 4) {
 
lpar_rc = plpar_pte_read_4(0, hpte_group, (void *)ptes);
@@ -346,31 +352,31 @@ static long __pSeries_lpar_hpte_find(unsigned long 
want_v, unsigned long hpte_gr
for (j = 0; j < 4; j++) {
if (HPTE_V_COMPARE(ptes[j].pteh, want_v) &&
(ptes[j].pteh & HPTE_V_VALID))
-   return i + j;
+   return hpte_group + j;
}
}
-
+   if (!secondary_search) {
+   hpte_group = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+   secondary_search = true;
+   goto search_again;
+   }
return -1;
 }
 
 static long pSeries_lpar_hpte_find(unsigned long vpn, int psize, int ssize)
 {
long slot;
-   unsigned long hash;
-   unsigned long want_v;
-   unsigned long hpte_group;
+   unsigned long hash, want_v;
 
hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
want_v = hpte_encode_avpn(vpn, psize, ssize);
-
-   /* Bolted entries are always in the primary group */
-   hpte_group = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot = __pSeries_lpar_hpte_find(want_v, hpte_group);
+   slot = __pSeries_lpar_hpte_find(hash, want_v);
if (slot < 0)
return -1;
-   return hpte_group + slot;
+   return slot;
 }
 
+
 static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp,
 unsigned long ea,
 int psize, int ssize)
-- 
2.13.3



[RFC PATCH 01/17] powerpc/mm: Update native_hpte_find to return hash pte

2017-08-01 Thread Aneesh Kumar K.V
The helper now also does a secondary hash search so that we can use it in
other functions.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/hash_native_64.c | 70 +++-
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c
index 3848af167df9..4b3f6d66e7f0 100644
--- a/arch/powerpc/mm/hash_native_64.c
+++ b/arch/powerpc/mm/hash_native_64.c
@@ -351,32 +351,44 @@ static long native_hpte_updatepp(unsigned long slot, 
unsigned long newpp,
return ret;
 }
 
-static long native_hpte_find(unsigned long vpn, int psize, int ssize)
+/* returns a locked hash pte */
+struct hash_pte *native_hpte_find(unsigned long hash, unsigned long vpn,
+ unsigned long bpsize, unsigned long ssize)
 {
+   int i;
+   unsigned long hpte_v;
struct hash_pte *hptep;
-   unsigned long hash;
-   unsigned long i;
-   long slot;
-   unsigned long want_v, hpte_v;
-
-   hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
-   want_v = hpte_encode_avpn(vpn, psize, ssize);
+   unsigned long want_v, slot;
+   bool secondary_search = false;
 
-   /* Bolted mappings are only ever in the primary group */
+   want_v = hpte_encode_avpn(vpn, bpsize, ssize);
slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   for (i = 0; i < HPTES_PER_GROUP; i++) {
-   hptep = htab_address + slot;
+
+   /*
+* search for hpte in the primary group
+*/
+search_again:
+   hptep = htab_address + slot;
+   for (i = 0; i < HPTES_PER_GROUP; i++, hptep++) {
+   /*
+* FIXME!! Should we do a lockless check first?
+*/
+   native_lock_hpte(hptep);
hpte_v = be64_to_cpu(hptep->v);
if (cpu_has_feature(CPU_FTR_ARCH_300))
hpte_v = hpte_new_to_old_v(hpte_v, 
be64_to_cpu(hptep->r));
-
-   if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID))
-   /* HPTE matches */
-   return slot;
-   ++slot;
+   if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))
+   native_unlock_hpte(hptep);
+   else
+   return hptep;
}
-
-   return -1;
+   if (!secondary_search) {
+   /* Search for hpte in the secondary group */
+   slot = (~hash & htab_hash_mask) * HPTES_PER_GROUP;
+   secondary_search = true;
+   goto search_again;
+   }
+   return NULL;
 }
 
 /*
@@ -389,23 +401,22 @@ static long native_hpte_find(unsigned long vpn, int 
psize, int ssize)
 static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea,
   int psize, int ssize)
 {
-   unsigned long vpn;
-   unsigned long vsid;
-   long slot;
+   unsigned long hash;
+   unsigned long vpn, vsid;
struct hash_pte *hptep;
 
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
-
-   slot = native_hpte_find(vpn, psize, ssize);
-   if (slot == -1)
+   hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
+   hptep = native_hpte_find(hash, vpn, psize, ssize);
+   if (!hptep)
panic("could not find page to bolt\n");
-   hptep = htab_address + slot;
 
/* Update the HPTE */
hptep->r = cpu_to_be64((be64_to_cpu(hptep->r) &
~(HPTE_R_PPP | HPTE_R_N)) |
   (newpp & (HPTE_R_PPP | HPTE_R_N)));
+   native_unlock_hpte(hptep);
/*
 * Ensure it is out of the tlb too. Bolted entries base and
 * actual page size will be same.
@@ -422,18 +433,17 @@ static int native_hpte_removebolted(unsigned long ea, int 
psize, int ssize)
 {
unsigned long vpn;
unsigned long vsid;
-   long slot;
+   unsigned long hash;
struct hash_pte *hptep;
 
vsid = get_kernel_vsid(ea, ssize);
vpn = hpt_vpn(ea, vsid, ssize);
+   hash = hpt_hash(vpn, mmu_psize_defs[psize].shift, ssize);
 
-   slot = native_hpte_find(vpn, psize, ssize);
-   if (slot == -1)
+   hptep = native_hpte_find(hash, vpn, psize, ssize);
+   if (!hptep)
return -ENOENT;
 
-   hptep = htab_address + slot;
-
VM_WARN_ON(!(be64_to_cpu(hptep->v) & HPTE_V_BOLTED));
 
/* Invalidate the hpte */
-- 
2.13.3



[RFC PATCH 00/17] Remove slot tracking from linux page table

2017-08-01 Thread Aneesh Kumar K.V
Hi,

This patch series removes hash PTE slot tracking from the linux page table.
This frees up 4 bits in the linux page table and brings the hash and radix
linux page tables closer. The series also attempts to remove __real_pte_t
because, without slot tracking, the 4k-subpage and 64k-page PTE formats are
similar.

However, not tracking the slot means we have to search the hash group during
invalidate and updatepp operations. That involves searching at most 16 slots
to find the matching hash page table entry. With respect to subpages, since
we don't track the validity of slots, when invalidating a 64K page we end up
calling invalidate for all subpages irrespective of whether we have taken a
subpage fault or not.
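
To make the cost concrete, a sketch of the search this implies (hpte_matches()
is a made-up stand-in for the AVPN + HPTE_V_VALID compare the real code does):

/*
 * Sketch only: without a slot hint we scan the 8-entry primary group
 * and, if needed, the 8-entry secondary group -- at most 16 HPTEs.
 */
static long find_slot_by_hash(unsigned long hash, unsigned long want_v)
{
	unsigned long group, i;
	int pass;

	for (pass = 0; pass < 2; pass++) {
		/* primary group on pass 0, secondary (~hash) group on pass 1 */
		group = ((pass ? ~hash : hash) & htab_hash_mask) * HPTES_PER_GROUP;
		for (i = 0; i < HPTES_PER_GROUP; i++) {
			if (hpte_matches(group + i, want_v))
				return group + i;
		}
	}
	return -1;
}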

With respect to THP, we skip the above and still track slots in the deposited
page table.

The patch series does have a performance impact, hence I am sending it as an
RFC series before doing further measurements with KVM. On bare metal a kernel
build gives:

Without patch:
/usr/bin/time -p make  vmlinux modules > /dev/null
real 270.70
user 280.23
sys 57.99

With patch
/usr/bin/time -p make  vmlinux modules > /dev/null
real 272.97
user 281.32
sys 61.46

That is a 6% impact on system time; the real-time impact is within the
runtime variance.

Let me know if you think we should continue with this approach.

-aneesh

Aneesh Kumar K.V (17):
  powerpc/mm: Update native_hpte_find to return hash pte
  powerpc/pseries: Update hpte find helper to take hash value
  powerpc/ps3/mm: Add helper for finding hash pte slot using hash value
  powerpc/mm: Add hash invalidate callback
  powerpc/mm: use hash_invalidate for __kernel_map_pages()
  powerpc/mm: Switch flush_hash_range to not use slot
  powerpc/mm: Add hash updatepp callback
  powerpc/mm/hash: Don't track hash pte slot number in linux page table.
  powerpc/mm: Remove unused flag arg in global_invalidates
  powerpc/mm: Add new firmware feature HASH API
  powerpc/kvm/hash: Implement HASH_REMOVE hcall
  powerpc/kvm/hash: Implement HASH_PROTECT hcall
  powerpc/kvm/hash: Implement HASH_BULK_REMOVE hcall
  powerpc/mm/pseries: Use HASH_PROTECT hcall in guest
  powerpc/mm/pseries: Use HASH_REMOVE hcall in guest
  powerpc/mm/pseries: Move slot based bulk remove to helper
  powerpc/mm/pseries: Use HASH_BULK_REMOVE hcall in guest

 arch/powerpc/include/asm/book3s/64/hash-4k.h   |  16 +-
 arch/powerpc/include/asm/book3s/64/hash-64k.h  |  44 +--
 arch/powerpc/include/asm/book3s/64/hash.h  |   5 +-
 arch/powerpc/include/asm/book3s/64/mmu-hash.h  |  12 +
 arch/powerpc/include/asm/book3s/64/pgtable.h   |  26 --
 arch/powerpc/include/asm/book3s/64/tlbflush-hash.h |   3 +-
 arch/powerpc/include/asm/firmware.h|   3 +-
 arch/powerpc/include/asm/hvcall.h  |   5 +-
 arch/powerpc/include/asm/pgtable-be-types.h|  10 -
 arch/powerpc/include/asm/pgtable-types.h   |   9 -
 arch/powerpc/include/asm/plpar_wrappers.h  |  23 ++
 arch/powerpc/kvm/book3s_hv.c   |   3 +
 arch/powerpc/kvm/book3s_hv_rm_mmu.c| 306 ++---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S|   4 +
 arch/powerpc/kvm/powerpc.c |   4 +
 arch/powerpc/mm/dump_linuxpagetables.c |  10 -
 arch/powerpc/mm/hash64_4k.c|   9 +-
 arch/powerpc/mm/hash64_64k.c   | 108 ++--
 arch/powerpc/mm/hash_native_64.c   | 172 
 arch/powerpc/mm/hash_utils_64.c|  65 +
 arch/powerpc/mm/hugetlbpage-hash64.c   |  13 +-
 arch/powerpc/mm/tlb_hash64.c   |   9 +-
 arch/powerpc/platforms/ps3/htab.c  |  88 ++
 arch/powerpc/platforms/pseries/firmware.c  |   1 +
 arch/powerpc/platforms/pseries/lpar.c  | 193 ++---
 include/uapi/linux/kvm.h   |   1 +
 26 files changed, 736 insertions(+), 406 deletions(-)

-- 
2.13.3



Re: [PATCH] POWER9 PMU stops after idle workaround

2017-08-01 Thread Anton Blanchard
Hi Nick,

> POWER9 DD2 PMU can stop after a state-loss idle in some conditions.
> 
> A solution is to set then clear MMCRA[60] after wake from state-loss
> idle.

Looks good.

Acked-by: Anton Blanchard 

Anton

> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/kernel/idle_book3s.S | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/idle_book3s.S
> b/arch/powerpc/kernel/idle_book3s.S index 516ebef905c0..e6252c5a57a4
> 100644 --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -460,11 +460,17 @@ pnv_restore_hyp_resource_arch300:
>   /*
>* Workaround for POWER9, if we lost resources, the ERAT
>* might have been mixed up and needs flushing. We also need
> -  * to reload MMCR0 (see comment above).
> +  * to reload MMCR0 (see comment above). We also need to set
> +  * then clear bit 60 in MMCRA to ensure the PMU starts
> running. */
>   blt cr3,1f
>   PPC_INVALIDATE_ERAT
>   ld  r1,PACAR1(r13)
> + mfspr   r4,SPRN_MMCRA
> + ori r4,r4,(1 << (63-60))
> + mtspr   SPRN_MMCRA,r4
> + xori r4,r4,(1 << (63-60))
> + mtspr   SPRN_MMCRA,r4
>   ld  r4,_MMCR0(r1)
>   mtspr   SPRN_MMCR0,r4
>  1:



Re: [PATCH] mpc832x_rdb: fix of_irq_to_resource() error check

2017-08-01 Thread Scott Wood
On Mon, 2017-07-31 at 20:04 +1000, Michael Ellerman wrote:
> Scott Wood  writes:
> 
> > On Sat, 2017-07-29 at 22:52 +0300, Sergei Shtylyov wrote:
> > > of_irq_to_resource() has recently been fixed to return negative
> > > error numbers along with 0 in case of failure, however the Freescale
> > > MPC832x RDB board code still only regards 0 as a failure indication
> > > -- fix it up.
> > > 
> > > Fixes: 7a4228bbff76 ("of: irq: use of_irq_get() in
> > > of_irq_to_resource()")
> > > Signed-off-by: Sergei Shtylyov 
> > > 
> > > ---
> > > The patch is against the 'master' branch of Scott Wood's 'linux.git'
> > > repo
> > > (the 'fixes' branch is too much behind).
> > 
> > The master branch is also old.  Those branches are only used when needed
> > to
> > apply patches; I don't update them just to sync up.  If they're older than
> > what's in Michael's or Linus's tree (as they almost always are), then use
> > those instead.
> > 
> > Not that I expect it to make a difference to this patch...
> 
> Do you want me to grab this as a fix for 4.13 ?

Sure:
Acked-by: Scott Wood 

-Scott



Re: [RFC Part1 PATCH v3 11/17] x86/mm, resource: Use PAGE_KERNEL protection for ioremap of memory pages

2017-08-01 Thread Borislav Petkov
On Mon, Jul 24, 2017 at 02:07:51PM -0500, Brijesh Singh wrote:
> From: Tom Lendacky 
> 
> In order for memory pages to be properly mapped when SEV is active, we
> need to use the PAGE_KERNEL protection attribute as the base protection.
> This will ensure that the memory mapping of, e.g., ACPI tables, receives the
> proper mapping attributes.
> 
> Signed-off-by: Tom Lendacky 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/mm/ioremap.c  | 28 
>  include/linux/ioport.h |  3 +++
>  kernel/resource.c  | 17 +
>  3 files changed, 48 insertions(+)
> 
> diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
> index c0be7cf..7b27332 100644
> --- a/arch/x86/mm/ioremap.c
> +++ b/arch/x86/mm/ioremap.c
> @@ -69,6 +69,26 @@ static int __ioremap_check_ram(unsigned long start_pfn, 
> unsigned long nr_pages,
>   return 0;
>  }
>  
> +static int __ioremap_res_desc_other(struct resource *res, void *arg)
> +{
> + return (res->desc != IORES_DESC_NONE);
> +}
> +
> +/*
> + * This function returns true if the target memory is marked as
> + * IORESOURCE_MEM and IORESOURCE_BUSY and described as other than
> + * IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
> + */
> +static bool __ioremap_check_if_mem(resource_size_t addr, unsigned long size)
> +{
> + u64 start, end;
> +
> + start = (u64)addr;
> + end = start + size - 1;
> +
> + return (walk_mem_res(start, end, NULL, __ioremap_res_desc_other) == 1);
> +}
> +
>  /*
>   * Remap an arbitrary physical address space into the kernel virtual
>   * address space. It transparently creates kernel huge I/O mapping when
> @@ -146,7 +166,15 @@ static void __iomem *__ioremap_caller(resource_size_t 
> phys_addr,
>   pcm = new_pcm;
>   }
>  
> + /*
> +  * If the page being mapped is in memory and SEV is active then
> +  * make sure the memory encryption attribute is enabled in the
> +  * resulting mapping.
> +  */
>   prot = PAGE_KERNEL_IO;
> + if (sev_active() && __ioremap_check_if_mem(phys_addr, size))
> + prot = pgprot_encrypted(prot);

Hmm, so this function already does walk_system_ram_range() a bit
earlier and now on SEV systems we're going to do it again. Can we make
walk_system_ram_range() return a distinct value for SEV systems and act
accordingly in __ioremap_caller() instead of repeating the operation?

It looks to me like we could...

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
-- 


Re: [PATCH v1 1/3] arch/powerpc/set_memory: Implement set_memory_xx routines

2017-08-01 Thread Balbir Singh
On Tue, 1 Aug 2017 21:08:49 +0200
christophe leroy  wrote:

> Le 01/08/2017 à 13:25, Balbir Singh a écrit :
> > Add support for set_memory_xx routines. With the STRICT_KERNEL_RWX
> > feature support we got support for changing the page permissions
> > for pte ranges. This patch adds support for both radix and hash
> > so that we can change their permissions via set/clear masks.
> >
> > A new helper is required for hash (hash__change_memory_range()
> > is changed to hash__change_boot_memory_range() as it deals with
> > bolted PTE's).
> >
> > hash__change_memory_range() works with vmalloc'ed PAGE_SIZE requests
> > for permission changes. hash__change_memory_range() does not invoke
> > updatepp, instead it changes the software PTE and invalidates the PTE.
> >
> > For radix, radix__change_memory_range() is setup to do the right
> > thing for vmalloc'd addresses. It takes a new parameter to decide
> > what attributes to set.
> >
> > Signed-off-by: Balbir Singh 
> > ---
> >  arch/powerpc/include/asm/book3s/64/hash.h  |  6 +++
> >  arch/powerpc/include/asm/book3s/64/radix.h |  6 +++
> >  arch/powerpc/include/asm/set_memory.h  | 34 +++
> >  arch/powerpc/mm/pgtable-hash64.c   | 51 --
> >  arch/powerpc/mm/pgtable-radix.c| 26 ++--
> >  arch/powerpc/mm/pgtable_64.c   | 68 
> > ++
> >  6 files changed, 175 insertions(+), 16 deletions(-)
> >  create mode 100644 arch/powerpc/include/asm/set_memory.h
> >  
> 
> [...]
> 
> > diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
> > index 0736e94..3ee4c7d 100644
> > --- a/arch/powerpc/mm/pgtable_64.c
> > +++ b/arch/powerpc/mm/pgtable_64.c
> > @@ -514,3 +514,71 @@ void mark_initmem_nx(void)
> > hash__mark_initmem_nx();
> >  }
> >  #endif
> > +
> > +#ifdef CONFIG_ARCH_HAS_SET_MEMORY
> > +/*
> > + * Some of these bits are taken from arm64/mm/page_attr.c
> > + */
> > +static int change_memory_common(unsigned long addr, int numpages,
> > +   unsigned long set, unsigned long clear)
> > +{
> > +   unsigned long start = addr;
> > +   unsigned long size = PAGE_SIZE*numpages;
> > +   unsigned long end = start + size;
> > +   struct vm_struct *area;
> > +
> > +   if (!PAGE_ALIGNED(addr)) {
> > +   start &= PAGE_MASK;
> > +   end = start + size;
> > +   WARN_ON_ONCE(1);
> > +   }  
> 
> Why not just set start = addr & PAGE_MASK, then just do 
> WARN_ON_ONCE(start != addr), instead of that if ()

The code has been taken from arch/arm64/mm/page_attr.c. I did
not change any bits, but we could make changes.

> 
> > +
> > +   /*
> > +* So check whether the [addr, addr + size) interval is entirely
> > +* covered by precisely one VM area that has the VM_ALLOC flag set.
> > +*/
> > +   area = find_vm_area((void *)addr);
> > +   if (!area ||
> > +   end > (unsigned long)area->addr + area->size ||
> > +   !(area->flags & VM_ALLOC))
> > +   return -EINVAL;
> > +
> > +   if (!numpages)
> > +   return 0;  
> 
> Shouldn't that be tested earlier ?
> 

Same as above

> > +
> > +   if (radix_enabled())
> > +   return radix__change_memory_range(start, start + size,
> > +   set, clear);
> > +   else
> > +   return hash__change_memory_range(start, start + size,
> > +   set, clear);
> > +}  
> 
> The following functions should go in a place common to PPC32 and PPC64, 
> otherwise they will have to be duplicated when implementing for PPC32.
> Maybe the above function should also go in a common place, only the last 
> part should remain in a PPC64 dedicated part. It could be called 
> change_memory_range(), something like
> 
> int change_memory_range(unsigned long start, unsigned long end,
>   unsigned long set, unsigned long clear)
> {
>   if (radix_enabled())
>   return radix__change_memory_range(start, end,
> set, clear);
>   return hash__change_memory_range(start, end, set, clear);
> }
> 
> Then change_memory_range() could also be implemented for PPC32 later.

I was hoping that when we implement support for PPC32, we
could refactor the code then and move it to arch/powerpc/mm/page_attr.c
if required. What do you think?

> 
> > +
> > +int set_memory_ro(unsigned long addr, int numpages)
> > +{
> > +   return change_memory_common(addr, numpages,
> > +   0, _PAGE_WRITE);
> > +}
> > +EXPORT_SYMBOL(set_memory_ro);  
> 
> Take care that _PAGE_WRITE has value 0 when _PAGE_RO instead of _PAGE_RW 
> is defined (eg for the 8xx).
> 
> It would be better to use accessors like pte_wrprotect() and pte_mkwrite()
>

Sure we can definitely refactor this for PPC32, pte_wrprotect()
and pte_mkwrite() would require us to make the 

Re: [v5 2/2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome

2017-08-01 Thread Michael Ellerman
Daniel Axtens  writes:

> Hi Matt,
>
>> The raid6 Q syndrome check has been optimised using the vpermxor
>> instruction.
>
> Very much a nit, but normally we'd write the change that the patch makes
> as a command: "Optimise the raid6 Q syndrome generation using the
> vpermxor instruction" - see
> https://www.kernel.org/doc/html/v4.11/process/submitting-patches.html#describe-your-changes

There's a good list here:
  https://chris.beams.io/posts/git-commit/

Which includes "Use the imperative mood in the subject line".

And has a good rule of thumb:

  A properly formed Git commit subject line should always be able to
  complete the following sentence:
  
  If applied, this commit will [your subject line here]


In this case Matt's subject is fine, but IMHO you should also use the
imperative mood for the body of the change log - which is basically what
you said :)

cheers


Re: [v5 1/2] lib/raid6: Build proper files on corresponding arch

2017-08-01 Thread Michael Ellerman
Daniel Axtens  writes:

> Hi Matt,
>
>> --- a/lib/raid6/test/Makefile
>> +++ b/lib/raid6/test/Makefile
>> @@ -44,10 +44,12 @@ else ifeq ($(HAS_NEON),yes)
>>  CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
>>  else
>>  HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int 
>> a;\n' |\
>> - gcc -c -x c - >&/dev/null && \
>> - rm ./-.o && echo yes)
>> + gcc -c -x c - >/dev/null && rm ./-.o && echo yes)
>
> From memory the change here (s/>&/>/) was necessary to get the build to
> succeed - did we ever figure out why that was? I'm not enough of a shell
> guru to grok the difference.

Using >& redirects stdout and stderr, whereas > only redirects stdout.

So possibly it doesn't fix anything, but rather lets you see any error
emitted by the compiler rather than swallowing it?

cheers


Re: [PATCH v2 3/4] powerpc: add irq accounting for system reset interrupts

2017-08-01 Thread Nicholas Piggin
On Tue,  1 Aug 2017 22:00:53 +1000
Nicholas Piggin  wrote:

> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index b67f8b03a32d..4b9a567c9975 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -204,6 +204,9 @@ void soft_nmi_interrupt(struct pt_regs *regs)
>   return;
>  
>   nmi_enter();
> +
> + __this_cpu_inc(irq_stat.soft_nmi_irqs);
> +
>   tb = get_tb();
>   if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
>   per_cpu(wd_timer_tb, cpu) = tb;

Sorry, this hunk leaked into patch 3. Should be in patch 4.


RE: [PATCH v2] qe: fix compile issue for arm64

2017-08-01 Thread Qiang Zhao

Michael Ellerman  wrote:

> -Original Message-
> From: Michael Ellerman [mailto:m...@ellerman.id.au]
> Sent: Monday, July 31, 2017 6:37 PM
> To: Qiang Zhao ; o...@buserror.net
> Cc: valentin.longch...@keymile.com; linuxppc-dev@lists.ozlabs.org; linux-
> ker...@vger.kernel.org
> Subject: RE: [PATCH v2] qe: fix compile issue for arm64
> 
> Qiang Zhao  writes:
> 
> > Fri 7/28/2017 2:14 PM, Michael Ellerman  wrote:
> >
> >> -Original Message-
> >> From: Michael Ellerman [mailto:m...@ellerman.id.au]
> >> Sent: Friday, July 28, 2017 2:14 PM
> >> To: Qiang Zhao ; o...@buserror.net
> >> Cc: valentin.longch...@keymile.com; linuxppc-dev@lists.ozlabs.org;
> >> linux- ker...@vger.kernel.org; Qiang Zhao 
> >> Subject: Re: [PATCH v2] qe: fix compile issue for arm64
> >>
> >> Zhao Qiang  writes:
> >>
> >> > Signed-off-by: Zhao Qiang 
> >> > ---
> >> > Changes for v2:
> >> >  - include all Errata QE_General4 in #ifdef
> >> >
> >> >  drivers/soc/fsl/qe/qe.c | 2 ++
> >> >  1 file changed, 2 insertions(+)
> >>
> >> AFAICS this driver can only be built on PPC, what am I missing?
> >>
> >> config QUICC_ENGINE
> >> bool "Freescale QUICC Engine (QE) Support"
> >> depends on FSL_SOC && PPC32
> >>
> >> cheers
> >
> > I sent another patchset to support it on arm64.
> 
> Where? I don't see it.
> 
> Shouldn't this patch be part of that series? Otherwise when that series is 
> merged
> the build will break on arm64.
> 
You are correct, thanks for your recommendation.
I will add this patch to the patchset.

Thank you!

BR
Qiang Zhao


Re: [PATCH v4 1/5] powerpc/lib/sstep: Add cmpb instruction emulation

2017-08-01 Thread Matt Brown
On Tue, Aug 1, 2017 at 10:44 PM, Segher Boessenkool
 wrote:
> Hi!
>
> On Mon, Jul 31, 2017 at 10:58:22AM +1000, Matt Brown wrote:
>> @@ -1049,6 +1065,10 @@ int analyse_instr(struct instruction_op *op, struct 
>> pt_regs *regs,
>>   do_cmp_unsigned(regs, val, val2, rd >> 2);
>>   goto instr_done;
>>
>> + case 508: /* cmpb */
>> + do_cmpb(regs, regs->gpr[rd], regs->gpr[rb], ra);
>> + goto instr_done;
>
> Should this then be under an ifdef for 64-bit?

I don't think so, the cmpb instruction should be 32 and 64-bit.
It isn't listed under the '64-bit Fixed-point Logical Instructions'
section in the ISA either.
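
(For reference, a minimal C model of the byte-wise compare that cmpb
performs - it works one byte lane at a time, so the word size only
changes how many lanes there are. This is an illustration, not the
kernel's do_cmpb() helper itself:)

	static unsigned long cmpb_model(unsigned long v1, unsigned long v2)
	{
		unsigned long out = 0;
		int i;

		/* one 0xff/0x00 result byte per matching/non-matching lane */
		for (i = 0; i < sizeof(unsigned long); i++) {
			unsigned long mask = 0xffUL << (i * 8);

			if ((v1 & mask) == (v2 & mask))
				out |= mask;
		}
		return out;
	}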

Thanks,
Matt
>
>
> Segher


Re: [v5 2/2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome

2017-08-01 Thread Daniel Axtens
Oh, one final thing - I just realised there's a .gitignore file in
lib/raid6/.gitignore that needs to be updated to include the vpermxor
generated files. That should be part of this patch.

Regards,
Daniel


Re: [v5 2/2] raid6/altivec: Add vpermxor implementation for raid6 Q syndrome

2017-08-01 Thread Daniel Axtens
Hi Matt,

> The raid6 Q syndrome check has been optimised using the vpermxor
> instruction.

Very much a nit, but normally we'd write the change that the patch makes
as a command: "Optimise the raid6 Q syndrome generation using the
vpermxor instruction" - see
https://www.kernel.org/doc/html/v4.11/process/submitting-patches.html#describe-your-changes

> +static void noinline raid6_vpermxor$#_gen_syndrome_real(int disks, size_t 
> bytes,
> + void **ptrs)
> +{
> + u8 **dptr = (u8 **)ptrs;
> + u8 *p, *q;
> + int d, z, z0;
> + unative_t wp$$, wq$$, wd$$;
> +
> + z0 = disks - 3; /* Highest data disk */
> + p = dptr[z0+1]; /* XOR parity */
> + q = dptr[z0+2]; /* RS syndrome */
> +
> + for (d = 0; d < bytes; d += NSIZE*$#) {
> + wp$$ = wq$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE];
> +
> + for (z = z0-1; z>=0; z--) {
> + wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE];
> + /* P syndrome */
> + wp$$ = vec_xor(wp$$, wd$$);
> +
> + /*Q syndrome */
> + asm("vpermxor %0,%1,%2,%3":"=v"(wq$$):"v"(gf_high), 
> "v"(gf_low), "v"(wq$$));

Initially I thought "why can't we break this over 2 lines?" and then I
remembered that the awk script can't handle that. A space between /* and
Q would be good though.

> + wq$$ = vec_xor(wq$$, wd$$);

I generated some of the unrolled code and inspected it. It's non-trivial
to follow but that's justifiable, it's due to:
 - the complex maths
 - the unrolling process
 - consistency with the altivec code, which I think is worth keeping
I am not sure how you could make it any easier to read, so I don't think
that should block its acceptance into the kernel.
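
(As an aside for anyone inspecting the unrolled output: per byte, each
vpermxor step is a table-driven GF(2^8) multiply - roughly the scalar
model below, where the two nibble tables stand in for the real
gf_high/gf_low constants and are an assumption of this sketch:)

	/* result byte = high_tbl[high nibble] ^ low_tbl[low nibble] */
	static unsigned char vpermxor_byte(unsigned char b,
					   const unsigned char high_tbl[16],
					   const unsigned char low_tbl[16])
	{
		return high_tbl[b >> 4] ^ low_tbl[b & 0x0f];
	}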

I am confident that this code works correctly and as described.

Reviewed-by: Daniel Axtens 

Regards,
Daniel

> -- 
> 2.9.3


Re: [v5 1/2] lib/raid6: Build proper files on corresponding arch

2017-08-01 Thread Daniel Axtens
Hi Matt,

> --- a/lib/raid6/test/Makefile
> +++ b/lib/raid6/test/Makefile
> @@ -44,10 +44,12 @@ else ifeq ($(HAS_NEON),yes)
>  CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
>  else
>  HAS_ALTIVEC := $(shell printf '\#include \nvector int 
> a;\n' |\
> - gcc -c -x c - >&/dev/null && \
> - rm ./-.o && echo yes)
> + gcc -c -x c - >/dev/null && rm ./-.o && echo yes)

From memory the change here (s/>&/>/) was necessary to get the build to
succeed - did we ever figure out why that was? I'm not enough of a shell
guru to grok the difference. If it's easy to explain it would be good to
put it in the commit message, rather than just saying you fixed an
unspecified bug.

>  ifeq ($(HAS_ALTIVEC),yes)
> -OBJS += altivec1.o altivec2.o altivec4.o altivec8.o
> +CFLAGS += -I../../../arch/powerpc/include
> +CFLAGS += -DCONFIG_ALTIVEC
> +OBJS += altivec1.o altivec2.o altivec4.o altivec8.o \
> +vpermxor1.o vpermxor2.o vpermxor4.o vpermxor8.o

You've added vpermxor here, but you don't define them until the next
patch, so the tests will fail. Please move the change to OBJS to the
next patch.

With that change, I'd be happy to formally Review this patch.

Regards,
Daniel

>  endif
>  endif
>  ifeq ($(ARCH),tilegx)
> -- 
> 2.9.3


Re: [PATCH v1 1/3] arch/powerpc/set_memory: Implement set_memory_xx routines

2017-08-01 Thread christophe leroy



Le 01/08/2017 à 13:25, Balbir Singh a écrit :

Add support for set_memory_xx routines. With the STRICT_KERNEL_RWX
feature support we got support for changing the page permissions
for pte ranges. This patch adds support for both radix and hash
so that we can change their permissions via set/clear masks.

A new helper is required for hash (hash__change_memory_range()
is changed to hash__change_boot_memory_range() as it deals with
bolted PTE's).

hash__change_memory_range() works with vmalloc'ed PAGE_SIZE requests
for permission changes. hash__change_memory_range() does not invoke
updatepp, instead it changes the software PTE and invalidates the PTE.

For radix, radix__change_memory_range() is setup to do the right
thing for vmalloc'd addresses. It takes a new parameter to decide
what attributes to set.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/include/asm/book3s/64/hash.h  |  6 +++
 arch/powerpc/include/asm/book3s/64/radix.h |  6 +++
 arch/powerpc/include/asm/set_memory.h  | 34 +++
 arch/powerpc/mm/pgtable-hash64.c   | 51 --
 arch/powerpc/mm/pgtable-radix.c| 26 ++--
 arch/powerpc/mm/pgtable_64.c   | 68 ++
 6 files changed, 175 insertions(+), 16 deletions(-)
 create mode 100644 arch/powerpc/include/asm/set_memory.h



[...]


diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 0736e94..3ee4c7d 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -514,3 +514,71 @@ void mark_initmem_nx(void)
hash__mark_initmem_nx();
 }
 #endif
+
+#ifdef CONFIG_ARCH_HAS_SET_MEMORY
+/*
+ * Some of these bits are taken from arm64/mm/page_attr.c
+ */
+static int change_memory_common(unsigned long addr, int numpages,
+   unsigned long set, unsigned long clear)
+{
+   unsigned long start = addr;
+   unsigned long size = PAGE_SIZE*numpages;
+   unsigned long end = start + size;
+   struct vm_struct *area;
+
+   if (!PAGE_ALIGNED(addr)) {
+   start &= PAGE_MASK;
+   end = start + size;
+   WARN_ON_ONCE(1);
+   }


Why not just set start = addr & PAGE_MASK, then just do 
WARN_ON_ONCE(start != addr), instead of that if ()
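
(I.e. something like the following sketch, keeping the same behaviour:)

	start = addr & PAGE_MASK;
	end = start + size;
	/* warn once if the caller passed an unaligned address */
	WARN_ON_ONCE(start != addr);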



+
+   /*
+* So check whether the [addr, addr + size) interval is entirely
+* covered by precisely one VM area that has the VM_ALLOC flag set.
+*/
+   area = find_vm_area((void *)addr);
+   if (!area ||
+   end > (unsigned long)area->addr + area->size ||
+   !(area->flags & VM_ALLOC))
+   return -EINVAL;
+
+   if (!numpages)
+   return 0;


Shouldn't that be tested earlier ?


+
+   if (radix_enabled())
+   return radix__change_memory_range(start, start + size,
+   set, clear);
+   else
+   return hash__change_memory_range(start, start + size,
+   set, clear);
+}


The following functions should go in a place common to PPC32 and PPC64, 
otherwise they will have to be duplicated when implementing for PPC32.
Maybe the above function should also go in a common place, only the last 
part should remain in a PPC64 dedicated part. It could be called 
change_memory_range(), something like


int change_memory_range(unsigned long start, unsigned long end,
unsigned long set, unsigned long clear)
{
if (radix_enabled())
return radix__change_memory_range(start, end,
  set, clear);
return hash__change_memory_range(start, end, set, clear);
}

Then change_memory_range() could also be implemented for PPC32 later.


+
+int set_memory_ro(unsigned long addr, int numpages)
+{
+   return change_memory_common(addr, numpages,
+   0, _PAGE_WRITE);
+}
+EXPORT_SYMBOL(set_memory_ro);


Take care that _PAGE_WRITE has value 0 when _PAGE_RO instead of _PAGE_RW 
is defined (eg for the 8xx).


It would be better to use accessors like pte_wrprotect() and pte_mkwrite()
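
(A rough sketch of that accessor-based style for a single PTE - purely
illustrative, since the series under review passes set/clear masks
instead, and 'make_ro' here is a hypothetical flag:)

	pte_t pte = READ_ONCE(*ptep);

	if (make_ro)
		pte = pte_wrprotect(pte);
	else
		pte = pte_mkwrite(pte);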


+
+int set_memory_rw(unsigned long addr, int numpages)
+{
+   return change_memory_common(addr, numpages,
+   _PAGE_WRITE, 0);
+}
+EXPORT_SYMBOL(set_memory_rw);
+
+int set_memory_nx(unsigned long addr, int numpages)
+{
+   return change_memory_common(addr, numpages,
+   0, _PAGE_EXEC);
+}
+EXPORT_SYMBOL(set_memory_nx);
+
+int set_memory_x(unsigned long addr, int numpages)
+{
+   return change_memory_common(addr, numpages,
+   _PAGE_EXEC, 0);
+}
+EXPORT_SYMBOL(set_memory_x);
+#endif



Christophe


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-08-01 Thread Paul E. McKenney
On Mon, Jul 31, 2017 at 04:27:57PM +0100, Jonathan Cameron wrote:
> On Mon, 31 Jul 2017 08:04:11 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Mon, Jul 31, 2017 at 12:08:47PM +0100, Jonathan Cameron wrote:
> > > On Fri, 28 Jul 2017 12:03:50 -0700
> > > "Paul E. McKenney"  wrote:
> > >   
> > > > On Fri, Jul 28, 2017 at 06:27:05PM +0100, Jonathan Cameron wrote:  
> > > > > On Fri, 28 Jul 2017 09:55:29 -0700
> > > > > "Paul E. McKenney"  wrote:
> > > > > 
> > > > > > On Fri, Jul 28, 2017 at 02:24:03PM +0100, Jonathan Cameron wrote:   
> > > > > >  
> > > > > > > On Fri, 28 Jul 2017 08:44:11 +0100
> > > > > > > Jonathan Cameron  wrote:  
> > > > > > 
> > > > > > [ . . . ]
> > > > > > 
> > > > > > > Ok.  Some info.  I disabled a few driver (usb and SAS) in the 
> > > > > > > interest of having
> > > > > > > fewer timer events.  Issue became much easier to trigger (on some 
> > > > > > > runs before
> > > > > > > I could get tracing up and running)
> > > > > > > So logs are large enough that pastebin doesn't like them -
> > > > > > > please shout if another timer period is of interest.
> > > > > > > 
> > > > > > > https://pastebin.com/iUZDfQGM for the timer trace.
> > > > > > > https://pastebin.com/3w1F7amH for dmesg.  
> > > > > > > 
> > > > > > > The relevant timeout on the RCU stall detector was 8 seconds.  
> > > > > > > Event is
> > > > > > > detected around 835.
> > > > > > > 
> > > > > > > It's a lot of logs, so I haven't identified a smoking gun yet but 
> > > > > > > there
> > > > > > > may well be one in there.  
> > > > > > 
> > > > > > The dmesg says:
> > > > > > 
> > > > > > rcu_preempt kthread starved for 2508 jiffies! g112 c111 f0x0 
> > > > > > RCU_GP_WAIT_FQS(3) ->state=0x1
> > > > > > 
> > > > > > So I look for "rcu_preempt" timer events and find these:
> > > > > > 
> > > > > > rcu_preempt-9 [019]    827.579114: timer_init: 
> > > > > > timer=8017d5fc7da0
> > > > > > rcu_preempt-9 [019] d..1   827.579115: timer_start: 
> > > > > > timer=8017d5fc7da0 function=process_timeout 
> > > > > > 
> > > > > > Next look for "8017d5fc7da0" and I don't find anything else.
> > > > > It does show up off the bottom of what would fit in pastebin...
> > > > > 
> > > > >  rcu_preempt-9 [001] d..1   837.681077: timer_cancel: 
> > > > > timer=8017d5fc7da0
> > > > >  rcu_preempt-9 [001]    837.681086: timer_init: 
> > > > > timer=8017d5fc7da0
> > > > >  rcu_preempt-9 [001] d..1   837.681087: timer_start: 
> > > > > timer=8017d5fc7da0 function=process_timeout expires=4295101298 
> > > > > [timeout=1] cpu=1 idx=0 flags=
> > > > 
> > > > Odd.  I would expect an expiration...  And ten seconds is way longer
> > > > than the requested one jiffy!
> > > >   
> > > > > > The timeout was one jiffy, and more than a second later, no 
> > > > > > expiration.
> > > > > > Is it possible that this event was lost?  I am not seeing any sign 
> > > > > > of
> > > > > > this is the trace.
> > > > > > 
> > > > > > I don't see any sign of CPU hotplug (and I test with lots of that in
> > > > > > any case).
> > > > > > 
> > > > > > The last time we saw something like this it was a timer HW/driver 
> > > > > > problem,
> > > > > > but it is a bit hard to imagine such a problem affecting both ARM64
> > > > > > and SPARC.  ;-)
> > > > > Could be different issues, both of which were hidden by that lockup 
> > > > > detector.
> > > > > 
> > > > > There is an errata work around for the timers on this particular 
> > > > > board.
> > > > > I'm only vaguely aware of it, so may be unconnected.
> > > > > 
> > > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/clocksource/arm_arch_timer.c?h=v4.13-rc2=bb42ca47401010fc02901b5e8f79e40a26f208cb
> > > > > 
> > > > > Seems unlikely though! + we've not yet seen it on the other chips that
> > > > > errata effects (not that that means much).
> > > > 
> > > > If you can reproduce quickly, might be worth trying anyway...
> > > > 
> > > > Thanx, Paul  
> > > Errata fix is running already and was for all those tests.  
> > 
> > I was afraid of that...  ;-)
> It's a pretty rare errata it seems.  Not actually managed to catch
> one yet. 
> > 
> > > I'll have a dig into the timers today and see where I get to.  
> > 
> > Look forward to seeing what you find!
> Nothing obvious turning up other than we don't seem to have issue
> when we aren't running hrtimers.
> 
> On a plus side I just got a report that it is affecting our d03
> boards which is good on the basis I couldn't tell what the difference
> could be wrt to this issue!
> 
> It indeed looks like we are consistently missing a timer before
> the rcu splat occurs.

And for my part, my tests with CONFIG_HZ_PERIODIC=y and

Re: [RFC v6 21/62] powerpc: introduce execute-only pkey

2017-08-01 Thread Thiago Jung Bauermann

Michael Ellerman  writes:

> Thiago Jung Bauermann  writes:
>> Ram Pai  writes:
> ...
>>> +
>>> +   /* We got one, store it and use it from here on out */
>>> +   if (need_to_set_mm_pkey)
>>> +   mm->context.execute_only_pkey = execute_only_pkey;
>>> +   return execute_only_pkey;
>>> +}
>>
>> If you follow the code flow in __execute_only_pkey, the AMR and UAMOR
>> are read 3 times in total, and AMR is written twice. IAMR is read and
>> written twice. Since they are SPRs and access to them is slow (or isn't
>> it?),
>
> SPRs read/writes are slow, but they're not *that* slow in comparison to
> a system call (which I think is where this code is being called?).

Yes, this code runs on mprotect and mmap syscalls if the memory is
requested to have execute but not read nor write permissions.

> So we should try to avoid too many SPR read/writes, but at the same time
> we can accept more than the minimum if it makes the code much easier to
> follow.

Ok. Ram had asked me to suggest a way to optimize the SPR reads and
writes and I came up with the patch below. Do you think it's worth it?

The patch applies on top of this series, but if Ram includes it I think
he would break it up and merge it into the other patches.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center


From f6e73e67d325c4a1952c375072ca35156a9f2042 Mon Sep 17 00:00:00 2001
From: Thiago Jung Bauermann 
Date: Mon, 31 Jul 2017 20:22:59 -0300
Subject: [PATCH] powerpc: Cache protection key registers in
 __execute_only_pkey

Pass around a struct with the contents of AMR, IAMR and AMOR, as well as
flags indicating whether those fields hold valid values and whether they
should be committed back to the registers.

Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/pkeys.h |  18 --
 arch/powerpc/mm/pkeys.c  | 120 +--
 2 files changed, 104 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index e61ed6c332db..66f15dbc5855 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -129,12 +129,15 @@ static inline bool mm_pkey_is_allocated(struct mm_struct 
*mm, int pkey)
mm_set_pkey_is_allocated(mm, pkey));
 }
 
-extern void __arch_activate_pkey(int pkey);
+struct pkey_regs_cache;
+
+extern void __arch_activate_pkey(int pkey, struct pkey_regs_cache *regs);
 extern void __arch_deactivate_pkey(int pkey);
 /*
  * Returns a positive, 5-bit key on success, or -1 on failure.
  */
-static inline int mm_pkey_alloc(struct mm_struct *mm)
+static inline int __mm_pkey_alloc(struct mm_struct *mm,
+ struct pkey_regs_cache *regs)
 {
/*
 * Note: this is the one and only place we make sure
@@ -162,10 +165,15 @@ static inline int mm_pkey_alloc(struct mm_struct *mm)
 * enable the key in the hardware
 */
if (ret > 0)
-   __arch_activate_pkey(ret);
+   __arch_activate_pkey(ret, regs);
return ret;
 }
 
+static inline int mm_pkey_alloc(struct mm_struct *mm)
+{
+   return __mm_pkey_alloc(mm, NULL);
+}
+
 static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
 {
if (!pkey_inited)
@@ -206,13 +214,13 @@ static inline int arch_override_mprotect_pkey(struct 
vm_area_struct *vma,
 }
 
 extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
-   unsigned long init_val);
+   unsigned long init_val, struct pkey_regs_cache *regs);
 static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
 {
if (!pkey_inited)
return -EINVAL;
-   return __arch_set_user_pkey_access(tsk, pkey, init_val);
+   return __arch_set_user_pkey_access(tsk, pkey, init_val, NULL);
 }
 
 static inline bool arch_pkeys_enabled(void)
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 1424c79f45f6..718ea23f8184 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -22,52 +22,92 @@ u32  initial_allocation_mask;   /* bits set for 
reserved keys */
 #define PKEY_REG_BITS (sizeof(u64)*8)
 #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
 
-static bool is_pkey_enabled(int pkey)
+/*
+ * The registers controlling memory protection keys are expensive to access, so
+ * we want to cache their values in code paths that might need to use them more
+ * than once.
+ */
+struct pkey_regs_cache {
+   u64 amr;
+   u64 iamr;
+   u64 uamor;
+
+   bool amr_valid;
+   bool iamr_valid;
+   bool uamor_valid;
+
+   bool write_amr;
+   bool write_iamr;
+   bool write_uamor;
+};
+
+static bool is_pkey_enabled(int pkey, struct pkey_regs_cache *regs)
 {
-   return !!(read_uamor() & (0x3ul << 

Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage

2017-08-01 Thread Jens Axboe
On 08/01/2017 12:55 AM, Michael Ellerman wrote:
> Jens Axboe  writes:
> ...
>>
>> Can you try the below fix? Should be more palatable than the previous
>> one. Brian, maybe you can take a look at the IRQ issue mentioned above?
> 
> Given the patch from Brian fixed the lockdep warning, do you still want
> me to try and test this one?

Nope, we don't have to do that. I'd much rather just add a WARN_ON()
or similar to make sure we catch buggy users earlier. scsi_run_queue()
needs a

WARN_ON(in_interrupt());

but it might be better to put that in __blk_mq_run_hw_queue().
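
(Roughly like the sketch below - the placement and exact form are only
illustrative, not a patch:)

	static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
	{
		/* catch callers that run the queue from interrupt context */
		WARN_ON(in_interrupt());

		/* ... existing dispatch logic unchanged ... */
	}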

-- 
Jens Axboe



[PATCH] ipr: Fix scsi-mq lockdep issue

2017-08-01 Thread Brian King
Fixes the following lockdep warning that can occur when scsi-mq is enabled
with ipr due to ipr calling scsi_unblock_requests from irq context. The fix
is to move the call to scsi_unblock_requests to ipr's existing workqueue.

stack backtrace:
CPU: 28 PID: 0 Comm: swapper/28 Not tainted 4.13.0-rc2-gcc6x-gf74c89b #1
Call Trace:
[c01fffe97550] [c0b50818] dump_stack+0xe8/0x160 (unreliable)
[c01fffe97590] [c01586d0] print_usage_bug+0x2d0/0x390
[c01fffe97640] [c0158f34] mark_lock+0x7a4/0x8e0
[c01fffe976f0] [c015a000] __lock_acquire+0x6a0/0x1a70
[c01fffe97860] [c015befc] lock_acquire+0xec/0x2e0
[c01fffe97930] [c0b71514] _raw_spin_lock+0x44/0x70
[c01fffe97960] [c05b60f4] blk_mq_sched_dispatch_requests+0xa4/0x2a0
[c01fffe979c0] [c05acac0] __blk_mq_run_hw_queue+0x100/0x2c0
[c01fffe97a00] [c05ad478] __blk_mq_delay_run_hw_queue+0x118/0x130
[c01fffe97a40] [c05ad61c] blk_mq_start_hw_queues+0x6c/0xa0
[c01fffe97a80] [c0797aac] scsi_kick_queue+0x2c/0x60
[c01fffe97aa0] [c0797cf0] scsi_run_queue+0x210/0x360
[c01fffe97b10] [c079b888] scsi_run_host_queues+0x48/0x80
[c01fffe97b40] [c07b6090] ipr_ioa_bringdown_done+0x70/0x1e0
[c01fffe97bc0] [c07bc860] ipr_reset_ioa_job+0x80/0xf0
[c01fffe97bf0] [c07b4d50] ipr_reset_timer_done+0xd0/0x100
[c01fffe97c30] [c01937bc] call_timer_fn+0xdc/0x4b0
[c01fffe97cf0] [c0193d08] expire_timers+0x178/0x330
[c01fffe97d60] [c01940c8] run_timer_softirq+0xb8/0x120
[c01fffe97de0] [c0b726a8] __do_softirq+0x168/0x6d8
[c01fffe97ef0] [c00df2c8] irq_exit+0x108/0x150
[c01fffe97f10] [c0017bf4] __do_irq+0x2a4/0x4a0
[c01fffe97f90] [c002da50] call_do_irq+0x14/0x24
[c007fad93aa0] [c0017e8c] do_IRQ+0x9c/0x140
[c007fad93af0] [c0008b98] hardware_interrupt_common+0x138/0x140

Reported-by: Michael Ellerman 
Signed-off-by: Brian King 
---

Index: linux-2.6.git/drivers/scsi/ipr.c
===
--- linux-2.6.git.orig/drivers/scsi/ipr.c
+++ linux-2.6.git/drivers/scsi/ipr.c
@@ -3351,6 +3351,16 @@ static void ipr_worker_thread(struct wor
return;
}
 
+   if (ioa_cfg->scsi_unblock) {
+   ioa_cfg->scsi_unblock = 0;
+   ioa_cfg->scsi_blocked = 0;
+   spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
+   scsi_unblock_requests(ioa_cfg->host);
+   spin_lock_irqsave(ioa_cfg->host->host_lock, lock_flags);
+   if (ioa_cfg->scsi_blocked)
+   scsi_block_requests(ioa_cfg->host);
+   }
+
if (!ioa_cfg->scan_enabled) {
spin_unlock_irqrestore(ioa_cfg->host->host_lock, lock_flags);
return;
@@ -7211,9 +7221,8 @@ static int ipr_ioa_bringdown_done(struct
ENTER;
if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) {
ipr_trace;
-   spin_unlock_irq(ioa_cfg->host->host_lock);
-   scsi_unblock_requests(ioa_cfg->host);
-   spin_lock_irq(ioa_cfg->host->host_lock);
+   ioa_cfg->scsi_unblock = 1;
+   schedule_work(&ioa_cfg->work_q);
}
 
ioa_cfg->in_reset_reload = 0;
@@ -7287,13 +7296,7 @@ static int ipr_ioa_reset_done(struct ipr
list_add_tail(&ipr_cmd->queue, &ipr_cmd->hrrq->hrrq_free_q);
wake_up_all(&ioa_cfg->reset_wait_q);
 
-   spin_unlock(ioa_cfg->host->host_lock);
-   scsi_unblock_requests(ioa_cfg->host);
-   spin_lock(ioa_cfg->host->host_lock);
-
-   if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].allow_cmds)
-   scsi_block_requests(ioa_cfg->host);
-
+   ioa_cfg->scsi_unblock = 1;
schedule_work(&ioa_cfg->work_q);
LEAVE;
return IPR_RC_JOB_RETURN;
@@ -9249,8 +9252,11 @@ static void _ipr_initiate_ioa_reset(stru
spin_unlock(&ioa_cfg->hrrq[i]._lock);
}
wmb();
-   if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa)
+   if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) {
+   ioa_cfg->scsi_unblock = 0;
+   ioa_cfg->scsi_blocked = 1;
scsi_block_requests(ioa_cfg->host);
+   }
 
ipr_cmd = ipr_get_free_ipr_cmnd(ioa_cfg);
ioa_cfg->reset_cmd = ipr_cmd;
@@ -9306,9 +9312,8 @@ static void ipr_initiate_ioa_reset(struc
wake_up_all(&ioa_cfg->reset_wait_q);
 
if (!ioa_cfg->hrrq[IPR_INIT_HRRQ].removing_ioa) {
-   spin_unlock_irq(ioa_cfg->host->host_lock);
-   scsi_unblock_requests(ioa_cfg->host);
-   spin_lock_irq(ioa_cfg->host->host_lock);
+   ioa_cfg->scsi_unblock = 1;
+   

Re: [RFC PATCH] powerpc: Disabling MEMORY_HOTPLUG_DEFAULT_ONLINE option for PPC64 arch

2017-08-01 Thread Daniel Henrique Barboza



On 08/01/2017 11:05 AM, Nathan Fontenot wrote:

On 08/01/2017 04:59 AM, Michael Ellerman wrote:

Daniel Henrique Barboza  writes:


Commit 943db62c316c ("powerpc/pseries: Revert 'Auto-online
hotplugged memory'") reverted the auto-online feature for pseries due
to problems with LMB removals not updating the device struct properly.
Among other things, this commit made the following change in
arch/powerpc/configs/pseries_defconfig:

@@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y
  CONFIG_IRQ_ALL_CPUS=y
  CONFIG_MEMORY_HOTPLUG=y
  CONFIG_MEMORY_HOTREMOVE=y
-CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
  CONFIG_KSM=y

The intent was to disable the option in the defconfig of pseries, since
after that the code doesn't have this support anymore.

It's always polite to Cc the author of a commit you're referring to, so
I added Nathan.

Noted. Thanks for adding Nathan in the CC.



The intention when we merged that fix was that the auto-online code
would be "fixed" to mark the device online. I say "fixed" because it
wasn't entirely clear if that was the correct behaviour, though it
definitely seemed like it should be.

I've lost track of where/if the discussion got to on whether the
auto-online code should do that or not. Did anything get resolved?

I think, though I should go back and test to be sure, that everything
works in the latest mainline code. The issue causing this to be a problem
was in the original implementation of auto_online support. If you wanted
to auto online memory, the code was calling memory_block_change_state().
This worked but did not update the device struct for each of the memory
blocks that were online'd, so dev->offline remained true even after the
memory was online.

I sent a patch earlier this year (commit dc18d706a436) that corrected
this to call device_online() instead of memory_block_change_state().
With this fix (appears to have gone into the 4.11 kernel) it should be
possible to use auto_online on power systems.


Commit dc18d706a436 was present in the 4.11 kernels that experience this
issue (Fedora 26 and Ubuntu 17.10 in my tests). So I am not entirely sure
that we can use auto_online on power systems, at least in those kernels.




At this point I don't think we need this patch to disable auto online
for ppc64. I would be curious if this is still broken with the latest
mainline code though.


If the auto_online feature is already working in the upstream 4.13 kernel
then I don't see a reason to apply this patch either. We can leave it as
an FYI/reminder of a problem that was happening in 4.11 and got solved
later on.


Thanks,


Daniel



-Nathan
   

However, this change
alone isn't enough to prevent situations such as [1], where
distros can enable the option unaware of the consequences of
doing it (e.g. breaking LMB hotplug altogether).

Instead of relying on all distros knowing that pseries can't handle
CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y after 943db62c316c, this patch
changes mm/Kconfig to make the MEMORY_HOTPLUG_DEFAULT_ONLINE config
unavailable for the PPC64 arch.

[1] https://bugzilla.redhat.com/show_bug.cgi?id=1476380

Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged memory'")
Signed-off-by: Daniel Henrique Barboza 
---
  mm/Kconfig | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

I don't own that file, so we at least need an Ack from the mm folks.

cheers


diff --git a/mm/Kconfig b/mm/Kconfig
index 48b1af4..a342c77 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -169,7 +169,7 @@ config MEMORY_HOTPLUG_SPARSE
  config MEMORY_HOTPLUG_DEFAULT_ONLINE
  bool "Online the newly added memory blocks by default"
  default n
-depends on MEMORY_HOTPLUG
+depends on MEMORY_HOTPLUG && !PPC64
  help
  This option sets the default policy setting for memory hotplug
  onlining policy (/sys/devices/system/memory/auto_online_blocks) which
--
2.9.4




Re: [PATCH 2/3] powerpc/xmon: Disable and enable tracing command

2017-08-01 Thread Breno Leitao
Hi Naveen,

On Tue, Aug 01, 2017 at 12:10:24PM +0530, Naveen N. Rao wrote:
> On 2017/07/31 02:22PM, Breno Leitao wrote:
> > If tracing is enabled and you get into xmon, the tracing buffer
> > continues to be updated, causing possible loss of data due to buffer
> > overflow and unnecessary tracing information coming from xmon functions.
> > 
> > This patch adds a new option that allows the tracing to be disabled and
> > re-enabled from inside xmon.
> 
> How is this new option useful? In the next patch, you disable tracing by 
> default -- in what scenario do you expect to have to re-enable tracing 
> from within xmon?

I see it being useful on two different scenarios:

1) You can reenable tracing if you want to call a function from xmon
(with 'p'), or even for code stepping (with 's').

2) You may also want to reenable tracing once you resume from xmon with
'zr'.

> > +   case 'v':
> > +   if (tracing_is_on()) {
> > +   printk("Disabling tracing\n");
> > +   tracing_enabled = 0;
> > +   tracing_off();
> 
> This only disables trace buffer updates - ftrace (and all its callbacks, 
> et al) remains active, which isn't desirable.

Why isn't it desirable? In fact, I thought it would be *the* desirable
function to call, since it does not do a lot of stuff such as disabling
tracing entirely while in xmon mode; it just stops the tracing buffer
from being updated.

Since we are in xmon, we are in a very bad state, and something went
very wrong. Disabling the whole tracing might not be what we want to do
in this scenario, since it can hit the broken subsystem causing xmon to
fail.

For a bad-state scenario, I understand that it is desirable to be as
unintrusive as possible, and tracing_off() does exactly that.

> Can you see if this works for you:
> https://patchwork.ozlabs.org/patch/769611/

Well, I understand that this patch solves a different issue; it does not
reduce the tracing caused by the function tracer after you get into
xmon.

For example, with your patch applied, I can see a lot of xmon
functions polluting the tracing buffer:

1:mon> dt
[  359.196593] Dumping ftrace buffer:
[  359.196689] -
[  359.196904]   1)   |  xmon_printf() {
<110+ lines snipped>
[  359.197727]   1) + 22.930 us   |  }
[  359.199405]   1)   |  skipbl() {
<50+ lines snipped>
[  359.225069]   1) + 23.750 us   |  }


Since tracing continues to be enabled during xmon, these messages
continue to show up. That is exactly what I am trying to avoid with this
current patchset. Avoiding all xmon-related tracing is my main goal.

Thanks for your review,
Breno


Re: [RFC PATCH] powerpc: Disabling MEMORY_HOTPLUG_DEFAULT_ONLINE option for PPC64 arch

2017-08-01 Thread Nathan Fontenot
On 08/01/2017 04:59 AM, Michael Ellerman wrote:
> Daniel Henrique Barboza  writes:
> 
>> Commit 943db62c316c ("powerpc/pseries: Revert 'Auto-online
>> hotplugged memory'") reverted the auto-online feature for pseries due
>> to problems with LMB removals not updating the device struct properly.
>> Among other things, this commit made the following change in
>> arch/powerpc/configs/pseries_defconfig:
>>
>> @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y
>>  CONFIG_IRQ_ALL_CPUS=y
>>  CONFIG_MEMORY_HOTPLUG=y
>>  CONFIG_MEMORY_HOTREMOVE=y
>> -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
>>  CONFIG_KSM=y
>>
>> The intent was to disable the option in the defconfig of pseries, since
>> after that the code doesn't have this support anymore.
> 
> It's always polite to Cc the author of a commit you're referring to, so
> I added Nathan.
> 
> The intention when we merged that fix was that the auto-online code
> would be "fixed" to mark the device online. I say "fixed" because it
> wasn't entirely clear if that was the correct behaviour, though it
> definitely seemed like it should be.
> 
> I've lost track of where/if the discussion got to on whether the
> auto-online code should do that or not. Did anything get resolved?

I think, though I should go back and test to be sure, that everything
works in the latest mainline code. The issue causing this to be a problem
was in the original implementation of auto_online support. If you wanted
to auto online memory, the code was calling memory_block_change_state().
This worked but did not update the device struct for each of the memory
blocks that were online'd, so dev->offline remained true even after the
memory was online.

I sent a patch earlier this year (commit dc18d706a436) that corrected
this to call device_online() instead of memory_block_change_state().
With this fix (appears to have gone into the 4.11 kernel) it should be
possible to use auto_online on power systems.

At this point I don't think we need this patch to disable auto online
for ppc64. I would be curious if this is still broken with the latest
mainline code though.

-Nathan
  
> 
>> However, this change
>> alone isn't enough to prevent situations such as [1], where
>> distros can enable the option unaware of the consequences of
>> doing it (e.g. breaking LMB hotplug altogether).
>>
>> Instead of relying on all distros knowing that pseries can't handle
>> CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y after 943db62c316c, this patch
>> changes mm/Kconfig to make the MEMORY_HOTPLUG_DEFAULT_ONLINE config
>> unavailable for the PPC64 arch.
>>
>> [1] https://bugzilla.redhat.com/show_bug.cgi?id=1476380
>>
>> Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged 
>> memory'")
>> Signed-off-by: Daniel Henrique Barboza 
>> ---
>>  mm/Kconfig | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> I don't own that file, so we at least need an Ack from the mm folks.
> 
> cheers
> 
>> diff --git a/mm/Kconfig b/mm/Kconfig
>> index 48b1af4..a342c77 100644
>> --- a/mm/Kconfig
>> +++ b/mm/Kconfig
>> @@ -169,7 +169,7 @@ config MEMORY_HOTPLUG_SPARSE
>>  config MEMORY_HOTPLUG_DEFAULT_ONLINE
>>  bool "Online the newly added memory blocks by default"
>>  default n
>> -depends on MEMORY_HOTPLUG
>> +depends on MEMORY_HOTPLUG && !PPC64
>>  help
>>This option sets the default policy setting for memory hotplug
>>onlining policy (/sys/devices/system/memory/auto_online_blocks) which
>> -- 
>> 2.9.4
> 



[PATCH] powerpc/64: Fix __check_irq_replay missing decrementer interrupt

2017-08-01 Thread Nicholas Piggin
If the decrementer wraps and de-asserts the decrementer exception while
hard-disabled, __check_irq_replay has a test to notice the wrap when
interrupts are re-enabled.

The decrementer check must be done when clearing the PACA_IRQ_HARD_DIS
flag, not when the PACA_IRQ_DEC flag is tested. Previously this worked
because the decrementer interrupt was always the first one checked after
clearing the hard disable flag, but HMI check was moved ahead of that,
which introduced this bug.

This can cause a missed decrementer interrupt if we soft-disable
interrupts then take an HMI which is recorded in irq_happened, then
hard-disable interrupts for > 4s to wrap the decrementer.

Fixes: e0e0d6b739 ("powerpc/64: Replay hypervisor maintenance interrupt first")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/irq.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0bcec745a672..f291f7826abc 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -145,6 +145,19 @@ notrace unsigned int __check_irq_replay(void)
 
/* Clear bit 0 which we wouldn't clear otherwise */
local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+   if (happened & PACA_IRQ_HARD_DIS) {
+   /*
+* We may have missed a decrementer interrupt if hard disabled.
+* Check the decrementer register in case we had a rollover
+* while hard disabled.
+*/
+   if (!(happened & PACA_IRQ_DEC)) {
+   if (decrementer_check_overflow()) {
+   local_paca->irq_happened |= PACA_IRQ_DEC;
+   happened |= PACA_IRQ_DEC;
+   }
+   }
+   }
 
/*
 * Force the delivery of pending soft-disabled interrupts on PS3.
@@ -170,7 +183,7 @@ notrace unsigned int __check_irq_replay(void)
 * in case we also had a rollover while hard disabled
 */
local_paca->irq_happened &= ~PACA_IRQ_DEC;
-   if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow())
+   if (happened & PACA_IRQ_DEC)
return 0x900;
 
/* Finally check if an external interrupt happened */
-- 
2.11.0



Re: [PATCH v2] powerpc/powernv: Use darn instr for random_seed on p9

2017-08-01 Thread Segher Boessenkool
On Mon, Jul 31, 2017 at 07:10:15PM +1000, Michael Ellerman wrote:
> And ___PPC_RA() is not quite right. The L field is only 2 bits wide, not
> the 5 that ___PPC_RA() allows.
> 
> We don't have a __PPC_L() macro, because L fields vary in size and
> location. So I think you're best of open coding it, eg:
> 
> +#define PPC_DARN(t, l)   stringify_in_c(.long PPC_INST_DARN |  \
> + __PPC_RT(t)|  \
> + (((l) & 0x3) << 16))

It would be better if you could do a compile-time error if the L value
is out of range.  Hrm, nothing else does such checking either?
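
(One possible shape for such a check, as a sketch only - a call-site
assertion rather than something inside the stringified macro, since the
macro text is also consumed by the assembler. The helper name is made
up for illustration:)

	#define PPC_DARN_L_CHECK(l)					\
		BUILD_BUG_ON_MSG(!__builtin_constant_p(l) || (l) > 0x3,	\
				 "darn L field must fit in 2 bits")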


Segher


Re: [PATCH v4 1/5] powerpc/lib/sstep: Add cmpb instruction emulation

2017-08-01 Thread Segher Boessenkool
Hi!

On Mon, Jul 31, 2017 at 10:58:22AM +1000, Matt Brown wrote:
> @@ -1049,6 +1065,10 @@ int analyse_instr(struct instruction_op *op, struct 
> pt_regs *regs,
>   do_cmp_unsigned(regs, val, val2, rd >> 2);
>   goto instr_done;
>  
> + case 508: /* cmpb */
> + do_cmpb(regs, regs->gpr[rd], regs->gpr[rb], ra);
> + goto instr_done;

Should this then be under an ifdef for 64-bit?


Segher


Re: [PATCH] drivers: cpuidle: Disable preemption before get_lppaca function call in pseries_idle_probe function

2017-08-01 Thread Victor Aoqui

Em 2017-07-20 18:21, Benjamin Herrenschmidt escreveu:

On Thu, 2017-07-20 at 14:57 -0300, Victor Aoqui wrote:

When CONFIG_PREEMPT=y, the following warning shows up:

BUG: using smp_processor_id() in preemptible [] code: 
swapper/0/1

caller is pseries_processor_idle_init+0x58/0x21c

This warning shows up because preemption cannot occur when using
get_paca(), otherwise the paca_struct it points to may be the wrong one
just after.

For this reason, preemption needs to be disabled before
lppaca_shared_proc(get_lppaca()).
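
(A minimal sketch of that fix, assuming it stays local to the idle-probe
path; the actual patch may look different:)

	/* keep the lppaca access inside a preempt-disabled region so the
	 * CPU - and therefore the paca - cannot change underneath us */
	preempt_disable();
	if (lppaca_shared_proc(get_lppaca())) {
		/* ... shared-processor idle states ... */
	}
	preempt_enable();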


Also check the generated assembly. We had all sorts of interesting
issues where gcc would copy the paca pointer or the lppaca pointer
to a GPR *outside* of the preempt disabled section...

In that specific case it's not a big deal but overall, I am not
comfortable with PREEMPT on powerpc until we do something a bit
more drastic...

I would like to remove all such direct accesses to paca, instead have a
"new" get_paca() written in asm that does the preempt disable then
returns the PACA in a GPR (not directly use r13, hide that from gcc),
and which is paired with a put_paca().

The few places where we want to directly access r13 should be hand
written in asm too to hide r13 from gcc, for accessing the irq_happened
in the fast path of local_irq_enable/disable/... we should do the same
with lock tokens.

Ben.


Hi Benjamin,

Sorry for the delay. I was a little bit busy these last few days.
I took note of your comments and I will work on those changes.
I will let you know soon when it's done.

Thanks

--
Victor Aoqui



[PATCH v2 4/4] powerpc: add irq accounting for watchdog interrupts

2017-08-01 Thread Nicholas Piggin
This adds an irq counter for the watchdog soft-NMI. This interrupt
only fires when interrupts are soft-disabled, so it will not
increment much even when the watchdog is running. However it's
useful for debugging and sanity checking.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/hardirq.h |  3 +++
 arch/powerpc/kernel/irq.c  | 10 ++
 2 files changed, 13 insertions(+)

diff --git a/arch/powerpc/include/asm/hardirq.h 
b/arch/powerpc/include/asm/hardirq.h
index 64b73b03d473..c97603d617e3 100644
--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -13,6 +13,9 @@ typedef struct {
unsigned int spurious_irqs;
unsigned int hmi_exceptions;
unsigned int sreset_irqs;
+#ifdef CONFIG_PPC_WATCHDOG
+   unsigned int soft_nmi_irqs;
+#endif
 #ifdef CONFIG_PPC_DOORBELL
unsigned int doorbell_irqs;
 #endif
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 5c18335580b6..77a7f7514327 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -475,6 +475,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs);
seq_printf(p, "  System Reset interrupts\n");
 
+#ifdef CONFIG_PPC_WATCHDOG
+   seq_printf(p, "%*s: ", prec, "WDG");
+   for_each_online_cpu(j)
+   seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs);
+   seq_printf(p, "  Watchdog soft-NMI interrupts\n");
+#endif
+
 #ifdef CONFIG_PPC_DOORBELL
if (cpu_has_feature(CPU_FTR_DBELL)) {
seq_printf(p, "%*s: ", prec, "DBL");
@@ -500,6 +507,9 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += per_cpu(irq_stat, cpu).timer_irqs_others;
sum += per_cpu(irq_stat, cpu).hmi_exceptions;
sum += per_cpu(irq_stat, cpu).sreset_irqs;
+#ifdef CONFIG_PPC_WATCHDOG
+   sum += per_cpu(irq_stat, cpu).soft_nmi_irqs;
+#endif
 #ifdef CONFIG_PPC_DOORBELL
sum += per_cpu(irq_stat, cpu).doorbell_irqs;
 #endif
-- 
2.11.0



[PATCH v2 3/4] powerpc: add irq accounting for system reset interrupts

2017-08-01 Thread Nicholas Piggin
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/hardirq.h | 1 +
 arch/powerpc/kernel/irq.c  | 6 ++
 arch/powerpc/kernel/traps.c| 2 ++
 arch/powerpc/kernel/watchdog.c | 3 +++
 4 files changed, 12 insertions(+)

diff --git a/arch/powerpc/include/asm/hardirq.h 
b/arch/powerpc/include/asm/hardirq.h
index 8add8b861e8d..64b73b03d473 100644
--- a/arch/powerpc/include/asm/hardirq.h
+++ b/arch/powerpc/include/asm/hardirq.h
@@ -12,6 +12,7 @@ typedef struct {
unsigned int mce_exceptions;
unsigned int spurious_irqs;
unsigned int hmi_exceptions;
+   unsigned int sreset_irqs;
 #ifdef CONFIG_PPC_DOORBELL
unsigned int doorbell_irqs;
 #endif
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0bcec745a672..5c18335580b6 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -470,6 +470,11 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "  Hypervisor Maintenance Interrupts\n");
}
 
+   seq_printf(p, "%*s: ", prec, "NMI");
+   for_each_online_cpu(j)
+   seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs);
+   seq_printf(p, "  System Reset interrupts\n");
+
 #ifdef CONFIG_PPC_DOORBELL
if (cpu_has_feature(CPU_FTR_DBELL)) {
seq_printf(p, "%*s: ", prec, "DBL");
@@ -494,6 +499,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += per_cpu(irq_stat, cpu).spurious_irqs;
sum += per_cpu(irq_stat, cpu).timer_irqs_others;
sum += per_cpu(irq_stat, cpu).hmi_exceptions;
+   sum += per_cpu(irq_stat, cpu).sreset_irqs;
 #ifdef CONFIG_PPC_DOORBELL
sum += per_cpu(irq_stat, cpu).doorbell_irqs;
 #endif
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 5adfea2dc822..6a892ca7bf18 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -288,6 +288,8 @@ void system_reset_exception(struct pt_regs *regs)
if (!nested)
nmi_enter();
 
+   __this_cpu_inc(irq_stat.sreset_irqs);
+
/* See if any machine dependent calls */
if (ppc_md.system_reset_exception) {
if (ppc_md.system_reset_exception(regs))
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index b67f8b03a32d..4b9a567c9975 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -204,6 +204,9 @@ void soft_nmi_interrupt(struct pt_regs *regs)
return;
 
nmi_enter();
+
+   __this_cpu_inc(irq_stat.soft_nmi_irqs);
+
tb = get_tb();
if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
per_cpu(wd_timer_tb, cpu) = tb;
-- 
2.11.0



[PATCH v2 2/4] powerpc: Fix powerpc-specific watchdog build configuration

2017-08-01 Thread Nicholas Piggin
The powerpc kernel/watchdog.o should be built when HARDLOCKUP_DETECTOR
and HAVE_HARDLOCKUP_DETECTOR_ARCH are both selected. If only the former
is selected, then the generic perf watchdog has been selected.

To simplify this check, introduce a new Kconfig symbol PPC_WATCHDOG that
depends on both. This Kconfig option means the powerpc specific
watchdog is enabled.

Without this patch, Book3E will attempt to build the powerpc watchdog.

Fixes: 2104180a53 ("powerpc/64s: implement arch-specific hardlockup watchdog")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/Kconfig | 11 +++
 arch/powerpc/kernel/Makefile |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  6 +++---
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 36f858c37ca7..2a5060aa1674 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -85,6 +85,17 @@ config NMI_IPI
depends on SMP && (DEBUGGER || KEXEC_CORE || HARDLOCKUP_DETECTOR)
default y
 
+config PPC_WATCHDOG
+   bool
+   depends on HARDLOCKUP_DETECTOR
+   depends on HAVE_HARDLOCKUP_DETECTOR_ARCH
+   default y
+   help
+ This is a placeholder when the powerpc hardlockup detector
+ watchdog is selected (arch/powerpc/kernel/watchdog.c). It is
+ selected via the generic lockup detector menu which is why we
+ have no standalone config option for it here.
+
 config STACKTRACE_SUPPORT
bool
default y
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 4aa7c147e447..5622bd0248e5 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -38,7 +38,7 @@ obj-$(CONFIG_PPC64)   += setup_64.o sys_ppc32.o \
   signal_64.o ptrace32.o \
   paca.o nvram_64.o firmware.o
 obj-$(CONFIG_VDSO32)   += vdso32/
-obj-$(CONFIG_HARDLOCKUP_DETECTOR)  += watchdog.o
+obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)   += hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_power.o
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 9029afd1fa2a..48aaca3e0b20 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1314,7 +1314,7 @@ EXC_REAL_NONE(0x1800, 0x100)
 EXC_VIRT_NONE(0x5800, 0x100)
 #endif
 
-#if defined(CONFIG_HARDLOCKUP_DETECTOR) && 
defined(CONFIG_HAVE_HARDLOCKUP_DETECTOR_ARCH)
+#ifdef CONFIG_PPC_WATCHDOG
 
 #define MASKED_DEC_HANDLER_LABEL 3f
 
@@ -1335,10 +1335,10 @@ EXC_COMMON_BEGIN(soft_nmi_common)
ADD_NVGPRS;ADD_RECONCILE)
b   ret_from_except
 
-#else
+#else /* CONFIG_PPC_WATCHDOG */
 #define MASKED_DEC_HANDLER_LABEL 2f /* normal return */
 #define MASKED_DEC_HANDLER(_H)
-#endif
+#endif /* CONFIG_PPC_WATCHDOG */
 
 /*
  * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
-- 
2.11.0



[PATCH v2 1/4] powerpc/64s: fix mce accounting for powernv

2017-08-01 Thread Nicholas Piggin
---
 arch/powerpc/kernel/traps.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index bfcfd9ef09f2..5adfea2dc822 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -755,7 +755,14 @@ void machine_check_exception(struct pt_regs *regs)
enum ctx_state prev_state = exception_enter();
int recover = 0;
 
+#ifdef CONFIG_PPC_BOOK3S_64
+   /* 64s accounts the mce in machine_check_early when in HVMODE */
+   if (!cpu_has_feature(CPU_FTR_HVMODE))
+   __this_cpu_inc(irq_stat.mce_exceptions);
+#else
__this_cpu_inc(irq_stat.mce_exceptions);
+#endif
+
 
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-- 
2.11.0



[PATCH v2 0/4] powerpc: improve accounting of non maskable interrupts

2017-08-01 Thread Nicholas Piggin
This is the same as the last patch, but broken out and in the
process of making ifdefs nicer, also found and fixed a watchdog
build bug in patch 2.

Patches 1-2 are fixes that should go to 4.13.
Patches 3-4 are probably simple enough they could also go to 4.13.

Nicholas Piggin (4):
  powerpc/64s: fix mce accounting for powernv
  powerpc: fix powerpc-specific watchdog build configuration
  powerpc: add irq accounting for system reset interrupts
  powerpc: add irq accounting for watchdog interrupts

 arch/powerpc/Kconfig | 11 +++
 arch/powerpc/include/asm/hardirq.h   |  4 
 arch/powerpc/kernel/Makefile |  2 +-
 arch/powerpc/kernel/exceptions-64s.S |  6 +++---
 arch/powerpc/kernel/irq.c| 16 
 arch/powerpc/kernel/traps.c  |  9 +
 arch/powerpc/kernel/watchdog.c   |  3 +++
 7 files changed, 47 insertions(+), 4 deletions(-)

-- 
2.11.0



[PATCH v1 3/3] arch/powerpc/net/bpf: Basic EBPF support

2017-08-01 Thread Balbir Singh
Signed-off-by: Balbir Singh 
---
 arch/powerpc/net/bpf_jit_comp64.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index 861c5af..d81110e 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1054,6 +1054,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
fp->jited = 1;
fp->jited_len = alloclen;
 
+   bpf_jit_binary_lock_ro(bpf_hdr);
bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
 
 out:
@@ -1064,15 +1065,3 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 
return fp;
 }
-
-/* Overriding bpf_jit_free() as we don't set images read-only. */
-void bpf_jit_free(struct bpf_prog *fp)
-{
-   unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
-   struct bpf_binary_header *bpf_hdr = (void *)addr;
-
-   if (fp->jited)
-   bpf_jit_binary_free(bpf_hdr);
-
-   bpf_prog_unlock_free(fp);
-}
-- 
2.9.4



[PATCH v1 2/3] Enable ARCH_HAS_SET_MEMORY

2017-08-01 Thread Balbir Singh
Signed-off-by: Balbir Singh 
---
 arch/powerpc/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index b5b8ba8..7be710d 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -165,6 +165,7 @@ config PPC
select HAVE_ARCH_MMAP_RND_COMPAT_BITS   if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
+   select ARCH_HAS_SET_MEMORY  if (PPC_BOOK3S_64)
select ARCH_HAS_STRICT_KERNEL_RWX   if (PPC_BOOK3S_64 && 
!HIBERNATION)
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select HAVE_CBPF_JITif !PPC64
-- 
2.9.4



[PATCH v1 1/3] arch/powerpc/set_memory: Implement set_memory_xx routines

2017-08-01 Thread Balbir Singh
Add support for set_memory_xx routines. With the STRICT_KERNEL_RWX
feature support we got support for changing the page permissions
for pte ranges. This patch adds support for both radix and hash
so that we can change their permissions via set/clear masks.

A new helper is required for hash (hash__change_memory_range()
is changed to hash__change_boot_memory_range() as it deals with
bolted PTE's).

hash__change_memory_range() works with vmalloc'ed PAGE_SIZE requests
for permission changes. hash__change_memory_range() does not invoke
updatepp, instead it changes the software PTE and invalidates the PTE.

For radix, radix__change_memory_range() is setup to do the right
thing for vmalloc'd addresses. It takes a new parameter to decide
what attributes to set.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/include/asm/book3s/64/hash.h  |  6 +++
 arch/powerpc/include/asm/book3s/64/radix.h |  6 +++
 arch/powerpc/include/asm/set_memory.h  | 34 +++
 arch/powerpc/mm/pgtable-hash64.c   | 51 --
 arch/powerpc/mm/pgtable-radix.c| 26 ++--
 arch/powerpc/mm/pgtable_64.c   | 68 ++
 6 files changed, 175 insertions(+), 16 deletions(-)
 create mode 100644 arch/powerpc/include/asm/set_memory.h

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 36fc7bf..65003c9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -94,6 +94,12 @@ extern void hash__mark_rodata_ro(void);
 extern void hash__mark_initmem_nx(void);
 #endif
 
+/*
+ * For set_memory_*
+ */
+extern int hash__change_memory_range(unsigned long start, unsigned long end,
+unsigned long set, unsigned long clear);
+
 extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, unsigned long pte, int huge);
 extern unsigned long htab_convert_pte_flags(unsigned long pteflags);
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 50b..5ca0636 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -121,6 +121,12 @@ extern void radix__mark_rodata_ro(void);
 extern void radix__mark_initmem_nx(void);
 #endif
 
+/*
+ * For set_memory_*
+ */
+extern int radix__change_memory_range(unsigned long start, unsigned long end,
+ unsigned long set, unsigned long clear);
+
 static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
   unsigned long set)
 {
diff --git a/arch/powerpc/include/asm/set_memory.h 
b/arch/powerpc/include/asm/set_memory.h
new file mode 100644
index 000..b19c67c
--- /dev/null
+++ b/arch/powerpc/include/asm/set_memory.h
@@ -0,0 +1,34 @@
+/*
+ * set_memory.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright IBM Corporation, 2017
+ *
+ * Authors: Balbir Singh 
+ */
+
+#ifndef __ASM_SET_MEMORY_H
+#define __ASM_SET_MEMORY_H
+
+/*
+ * Functions to change memory attributes.
+ */
+int set_memory_ro(unsigned long addr, int numpages);
+int set_memory_rw(unsigned long addr, int numpages);
+int set_memory_x(unsigned long addr, int numpages);
+int set_memory_nx(unsigned long addr, int numpages);
+
+#endif
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 656f7f3..db5b477 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -424,9 +424,52 @@ int hash__has_transparent_hugepage(void)
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+/*
+ * This routine will change pte protection only for vmalloc'd
+ * PAGE_SIZE pages, do not invoke for bolted pages
+ */
+int hash__change_memory_range(unsigned long start, unsigned long end,
+   unsigned long set, unsigned long clear)
+{
+   unsigned long idx;
+   pgd_t *pgdp;
+   pud_t *pudp;
+   pmd_t *pmdp;
+   pte_t *ptep;
+
+   start = ALIGN_DOWN(start, PAGE_SIZE);
+   end = PAGE_ALIGN(end); // aligns up
+
+   /*
+* Update the software PTE and flush the entry.
+* This should cause a new fault with the 

[PATCH v1 0/3] Implement set_memory_xx for ppc64 book3s

2017-08-01 Thread Balbir Singh
After implementing STRICT_KERNEL_RWX, it turns out that implementing
set_memory_ro/rw/x/nx is quite easy. The first patch is applied on
top (http://patchwork.ozlabs.org/patch/795745/).

The first patch implements the various routines, the second patch
enables ARCH_HAS_SET_MEMORY for PPC_BOOK3S_64 and the third patch
enables the BPF infrastructure to use the set_memory_ro and
set_memory_rw routines.
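
(For illustration, callers use the new helpers roughly like this - the
buffer, size and page count below are assumptions of the sketch, not
taken from the series:)

	/* flip a page-aligned, vmalloc'ed buffer to read-only and back */
	unsigned long addr = (unsigned long)buf;
	int npages = size >> PAGE_SHIFT;

	set_memory_ro(addr, npages);
	/* ... */
	set_memory_rw(addr, npages);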

Balbir Singh (3):
  arch/powerpc/set_memory: Implement set_memory_xx routines
  Enable ARCH_HAS_SET_MEMORY
  arch/powerpc/net/bpf: Basic EBPF support

 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/include/asm/book3s/64/hash.h  |  6 +++
 arch/powerpc/include/asm/book3s/64/radix.h |  6 +++
 arch/powerpc/include/asm/set_memory.h  | 34 +++
 arch/powerpc/mm/pgtable-hash64.c   | 51 --
 arch/powerpc/mm/pgtable-radix.c| 26 ++--
 arch/powerpc/mm/pgtable_64.c   | 68 ++
 arch/powerpc/net/bpf_jit_comp64.c  | 13 +-
 8 files changed, 177 insertions(+), 28 deletions(-)
 create mode 100644 arch/powerpc/include/asm/set_memory.h

-- 
2.9.4



Re: [v3 PATCH 1/2] powernv/powerpc:Save/Restore additional SPRs for stop4 cpuidle

2017-08-01 Thread Michael Ellerman
"Gautham R. Shenoy"  writes:
>
> Subject: [v3 PATCH 1/2] powernv/powerpc:Save/Restore additional SPRs for 
> stop4 cpuidle

I know it's not a big deal, but can we agree on the subject format?

  powerpc/powernv: Save/Restore additional SPRs for stop4 cpuidle

cheers


Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-08-01 Thread Jonathan Cameron

Sorry - accidental send.  No content!

Jonathan

On Mon, 31 Jul 2017 12:55:48 +0100
Jonathan Cameron  wrote:

> On Mon, 31 Jul 2017 12:09:08 +0100
> Jonathan Cameron  wrote:
> 
> > On Wed, 26 Jul 2017 16:15:05 -0700
> > "Paul E. McKenney"  wrote:
> >   
> > > On Wed, Jul 26, 2017 at 03:45:40PM -0700, David Miller wrote:
> > > > From: "Paul E. McKenney" 
> > > > Date: Wed, 26 Jul 2017 15:36:58 -0700
> > > >   
> > > > > And without CONFIG_SOFTLOCKUP_DETECTOR, I see five runs of 24 with RCU
> > > > > CPU stall warnings.  So it seems likely that 
> > > > > CONFIG_SOFTLOCKUP_DETECTOR
> > > > > really is having an effect.  
> > > > 
> > > > Thanks for all of the info Paul, I'll digest this and scan over the
> > > > code myself.
> > > > 
> > > > Just out of curiousity, what x86 idle method is your machine using?
> > > > The mwait one or the one which simply uses 'halt'?  The mwait variant
> > > > might mask this bug, and halt would be a lot closer to how sparc64 and
> > > > Jonathan's system operates.  
> > > 
> > > My kernel builds with CONFIG_INTEL_IDLE=n, which I believe means that
> > > I am not using the mwait one.  Here is a grep for IDLE in my .config:
> > > 
> > >   CONFIG_NO_HZ_IDLE=y
> > >   CONFIG_GENERIC_SMP_IDLE_THREAD=y
> > >   # CONFIG_IDLE_PAGE_TRACKING is not set
> > >   CONFIG_ACPI_PROCESSOR_IDLE=y
> > >   CONFIG_CPU_IDLE=y
> > >   # CONFIG_CPU_IDLE_GOV_LADDER is not set
> > >   CONFIG_CPU_IDLE_GOV_MENU=y
> > >   # CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set
> > >   # CONFIG_INTEL_IDLE is not set
> > > 
> > > > On sparc64 the cpu yield we do in the idle loop sleeps the cpu.  Its
> > > > local TICK register keeps advancing, and the local timer therefore
> > > > will still trigger.  Also, any externally generated interrupts
> > > > (including cross calls) will wake up the cpu as well.
> > > > 
> > > > The tick-sched code is really tricky wrt. NO_HZ even in the NO_HZ_IDLE
> > > > case.  One of my running theories is that we miss scheduling a tick
> > > > due to a race.  That would be consistent with the behavior we see
> > > > in the RCU dumps, I think.  
> > > 
> > > But wouldn't you have to miss a -lot- of ticks to get an RCU CPU stall
> > > warning?  By default, your grace period needs to extend for more than
> > > 21 seconds (more than one-third of a -minute-) to get one.  Or do
> > > you mean that the ticks get shut off now and forever, as opposed to
> > > just losing one of them?
> > > 
> > > > Anyways, just a theory, and that's why I keep mentioning that commit
> > > > about the revert of the revert (specifically
> > > > 411fe24e6b7c283c3a1911450cdba6dd3aaea56e).
> > > > 
> > > > :-)  
> > > 
> > > I am running an overnight test in preparation for attempting to push
> > > some fixes for regressions into 4.12, but will try reverting this
> > > and enabling CONFIG_HZ_PERIODIC tomorrow.
> > > 
> > > Jonathan, might the commit that Dave points out above be what reduces
> > > the probability of occurrence as you test older releases?
> > I just got around to trying this out of curiosity.  Superficially it did
> > appear to possibly make the issue harder to hit (it took over 30 minutes),
> > but the issue otherwise looks much the same with or without that patch.
> > 
> > Just out of curiosity, next thing on my list is to disable hrtimers entirely
> > and see what happens.
> > 
> > Jonathan  
> > > 
> > >   Thanx, Paul
> > > 
> > 
> > ___
> > linuxarm mailing list
> > linux...@huawei.com
> > http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm  
> 
> ___
> linuxarm mailing list
> linux...@huawei.com
> http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm



[PATCH 3/3] powerpc/mm/hash64: Make vmalloc 56T on hash

2017-08-01 Thread Michael Ellerman
On 64-bit book3s, with the hash MMU, we currently define the kernel
virtual space (vmalloc, ioremap etc.), to be 16T in size. This is a
leftover from pre v3.7 when our user VM was also 16T.

Of that 16T we split it 50/50, with half used for PCI IO and ioremap
and the other 8T for vmalloc.

We never bothered to make it any bigger because 8T of vmalloc ought to
be enough for anybody. But it turns out that's not true: the per-cpu
allocator wants large amounts of vmalloc space, not to make large
allocations, but to allow a large stride between allocations, because
we use pcpu_embed_first_chunk().

With a bit of juggling we can keep 8T for the IO etc. and make the
vmalloc space 56T. The only complication is the check of the address
in the SLB miss handler, see the comment in the code.
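
The arithmetic is easy to double-check with a few lines of throwaway
userspace C (using the constants from the hunk below):

#include <stdio.h>

int main(void)
{
        unsigned long long virt_start   = 0xD000000000000000ULL; /* H_KERN_VIRT_START */
        unsigned long long vmalloc_size = 0x0000380000000000ULL; /* H_VMALLOC_SIZE    */
        unsigned long long vmalloc_end  = virt_start + vmalloc_size;

        printf("vmalloc size        = %lluT\n", vmalloc_size >> 40);  /* 56 */
        printf("H_VMALLOC_END >> 28 = 0x%llx\n", vmalloc_end >> 28);  /* 0xd00038000 */
        return 0;
}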

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/book3s/64/hash.h |  4 ++--
 arch/powerpc/mm/slb_low.S | 18 +++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index d613653ed5b9..f88452019114 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -40,7 +40,7 @@
  * Define the address range of the kernel non-linear virtual area
  */
 #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000)
-#define H_KERN_VIRT_SIZE   ASM_CONST(0x0000100000000000)
+#define H_KERN_VIRT_SIZE  ASM_CONST(0x0000400000000000) /* 64T */
 
 /*
  * The vmalloc space starts at the beginning of that region, and
@@ -48,7 +48,7 @@
  * (we keep a quarter for the virtual memmap)
  */
 #define H_VMALLOC_START   H_KERN_VIRT_START
-#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1)
+#define H_VMALLOC_SIZE ASM_CONST(0x0000380000000000) /* 56T */
 #define H_VMALLOC_END  (H_VMALLOC_START + H_VMALLOC_SIZE)
 
 #define H_KERN_IO_START   H_VMALLOC_END
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index 2eb1b92a68ff..906a86fe457b 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -121,9 +121,21 @@ slb_miss_kernel_load_vmemmap:
 1:
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-   clrldi  r11,r10,48
-   cmpldi  r11,(H_VMALLOC_SIZE >> 28) - 1
-   bgt 5f
+   /*
+* r10 contains the ESID, which is the original faulting EA shifted
+* right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28)
+* which is 0xd00038000. That can't be used as an immediate, even if we
+* ignored the 0xd, so we have to load it into a register, and we only
+* have one register free. So we must load all of (H_VMALLOC_END >> 28)
+* into a register and compare ESID against that.
+*/
+   lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xd0000000
+   ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xd0003800
+   // Rotate left 4, then mask with 0xffffffff0
+   rldic   r11,r11,4,28// r11 = 0xd00038000
+   cmpld   r10,r11 // if r10 >= r11
+   bge 5f  //   goto io_mapping
+
/*
 * vmalloc mapping gets the encoding from the PACA as the mapping
 * can be demoted from 64K -> 4K dynamically on some machines.
-- 
2.7.4



[PATCH 2/3] powerpc/mm/slb: Move comment next to the code it's referring to

2017-08-01 Thread Michael Ellerman
There is a comment in slb_allocate() referring to the load of
paca->vmalloc_sllp, but it's several lines prior in the assembly.
We're about to change this code, and we want to add another comment,
so move the comment immediately prior to the instruction it's talking
about.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/mm/slb_low.S | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index bde378559d01..2eb1b92a68ff 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -121,12 +121,13 @@ slb_miss_kernel_load_vmemmap:
 1:
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-   /* vmalloc mapping gets the encoding from the PACA as the mapping
-* can be demoted from 64K -> 4K dynamically on some machines
-*/
clrldi  r11,r10,48
cmpldi  r11,(H_VMALLOC_SIZE >> 28) - 1
bgt 5f
+   /*
+* vmalloc mapping gets the encoding from the PACA as the mapping
+* can be demoted from 64K -> 4K dynamically on some machines.
+*/
lhz r11,PACAVMALLOCSLLP(r13)
b   6f
 5:
-- 
2.7.4



[PATCH 1/3] powerpc/mm/book3s64: Make KERN_IO_START a variable

2017-08-01 Thread Michael Ellerman
Currently KERN_IO_START is defined as:

 #define KERN_IO_START  (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1))

Although it looks like a constant, both the components are actually
variables, to allow us to have a different value between Radix and
Hash with a single kernel.

However that still requires both Radix and Hash to place the kernel IO
region at the same location relative to the start and end of the
kernel virtual region (namely 1/2 way through it), and we'd like to
change that.

So split KERN_IO_START out into its own variable, and initialise it
for Radix and Hash. In the medium term we should be able to
reconsolidate this, by doing a more involved rearrangement of the
location of the regions.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/book3s/64/hash.h| 2 ++
 arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++-
 arch/powerpc/include/asm/book3s/64/radix.h   | 2 ++
 arch/powerpc/mm/hash_utils_64.c  | 1 +
 arch/powerpc/mm/pgtable-radix.c  | 1 +
 arch/powerpc/mm/pgtable_64.c | 2 ++
 6 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 36fc7bfe9e11..d613653ed5b9 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -51,6 +51,8 @@
 #define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1)
 #define H_VMALLOC_END  (H_VMALLOC_START + H_VMALLOC_SIZE)
 
+#define H_KERN_IO_START   H_VMALLOC_END
+
 /*
  * Region IDs
  */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415e283c..18a8580d3ddc 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -272,8 +272,10 @@ extern unsigned long __vmalloc_end;
 
 extern unsigned long __kernel_virt_start;
 extern unsigned long __kernel_virt_size;
+extern unsigned long __kernel_io_start;
 #define KERN_VIRT_START __kernel_virt_start
 #define KERN_VIRT_SIZE  __kernel_virt_size
+#define KERN_IO_START  __kernel_io_start
 extern struct page *vmemmap;
 extern unsigned long ioremap_bot;
 extern unsigned long pci_io_base;
@@ -298,7 +300,6 @@ extern unsigned long pci_io_base;
  *  PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces
  * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE
  */
-#define KERN_IO_START  (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1))
 #define FULL_IO_SIZE   0x80000000ul
 #define  ISA_IO_BASE   (KERN_IO_START)
 #define  ISA_IO_END    (KERN_IO_START + 0x10000ul)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 50b5aff3..1e5ba94e62ef 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -110,6 +110,8 @@
  */
 #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END)
 
+#define RADIX_KERN_IO_START    (RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1))
+
 #ifndef __ASSEMBLY__
 #define RADIX_PTE_TABLE_SIZE   (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE)
 #define RADIX_PMD_TABLE_SIZE   (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7419fc1854ad..a93137c358ea 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1019,6 +1019,7 @@ void __init hash__early_init_mmu(void)
__kernel_virt_size = H_KERN_VIRT_SIZE;
__vmalloc_start = H_VMALLOC_START;
__vmalloc_end = H_VMALLOC_END;
+   __kernel_io_start = H_KERN_IO_START;
vmemmap = (struct page *)H_VMEMMAP_BASE;
ioremap_bot = IOREMAP_BASE;
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 5cc50d47ce3f..d37e68495acc 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -494,6 +494,7 @@ void __init radix__early_init_mmu(void)
__kernel_virt_size = RADIX_KERN_VIRT_SIZE;
__vmalloc_start = RADIX_VMALLOC_START;
__vmalloc_end = RADIX_VMALLOC_END;
+   __kernel_io_start = RADIX_KERN_IO_START;
vmemmap = (struct page *)RADIX_VMEMMAP_BASE;
ioremap_bot = IOREMAP_BASE;
 
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 0736e94c7615..ac0717a90ca6 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -104,6 +104,8 @@ unsigned long __vmalloc_start;
 EXPORT_SYMBOL(__vmalloc_start);
 unsigned long __vmalloc_end;
 EXPORT_SYMBOL(__vmalloc_end);
+unsigned long __kernel_io_start;
+EXPORT_SYMBOL(__kernel_io_start);
 struct page *vmemmap;
 EXPORT_SYMBOL(vmemmap);
 unsigned long __pte_frag_nr;
-- 
2.7.4



Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-08-01 Thread Jonathan Cameron
On Mon, 31 Jul 2017 12:09:08 +0100
Jonathan Cameron  wrote:

> On Wed, 26 Jul 2017 16:15:05 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Wed, Jul 26, 2017 at 03:45:40PM -0700, David Miller wrote:  
> > > From: "Paul E. McKenney" 
> > > Date: Wed, 26 Jul 2017 15:36:58 -0700
> > > 
> > > > And without CONFIG_SOFTLOCKUP_DETECTOR, I see five runs of 24 with RCU
> > > > CPU stall warnings.  So it seems likely that CONFIG_SOFTLOCKUP_DETECTOR
> > > > really is having an effect.
> > > 
> > > Thanks for all of the info Paul, I'll digest this and scan over the
> > > code myself.
> > > 
> > > Just out of curiosity, what x86 idle method is your machine using?
> > > The mwait one or the one which simply uses 'halt'?  The mwait variant
> > > might mask this bug, and halt would be a lot closer to how sparc64 and
> > > Jonathan's system operates.
> > 
> > My kernel builds with CONFIG_INTEL_IDLE=n, which I believe means that
> > I am not using the mwait one.  Here is a grep for IDLE in my .config:
> > 
> > CONFIG_NO_HZ_IDLE=y
> > CONFIG_GENERIC_SMP_IDLE_THREAD=y
> > # CONFIG_IDLE_PAGE_TRACKING is not set
> > CONFIG_ACPI_PROCESSOR_IDLE=y
> > CONFIG_CPU_IDLE=y
> > # CONFIG_CPU_IDLE_GOV_LADDER is not set
> > CONFIG_CPU_IDLE_GOV_MENU=y
> > # CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED is not set
> > # CONFIG_INTEL_IDLE is not set
> >   
> > > On sparc64 the cpu yield we do in the idle loop sleeps the cpu.  Its
> > > local TICK register keeps advancing, and the local timer therefore
> > > will still trigger.  Also, any externally generated interrupts
> > > (including cross calls) will wake up the cpu as well.
> > > 
> > > The tick-sched code is really tricky wrt. NO_HZ even in the NO_HZ_IDLE
> > > case.  One of my running theories is that we miss scheduling a tick
> > > due to a race.  That would be consistent with the behavior we see
> > > in the RCU dumps, I think.
> > 
> > But wouldn't you have to miss a -lot- of ticks to get an RCU CPU stall
> > warning?  By default, your grace period needs to extend for more than
> > 21 seconds (more than one-third of a -minute-) to get one.  Or do
> > you mean that the ticks get shut off now and forever, as opposed to
> > just losing one of them?
> >   
> > > Anyways, just a theory, and that's why I keep mentioning that commit
> > > about the revert of the revert (specifically
> > > 411fe24e6b7c283c3a1911450cdba6dd3aaea56e).
> > > 
> > > :-)
> > 
> > I am running an overnight test in preparation for attempting to push
> > some fixes for regressions into 4.12, but will try reverting this
> > and enabling CONFIG_HZ_PERIODIC tomorrow.
> > 
> > Jonathan, might the commit that Dave points out above be what reduces
> > the probability of occurrence as you test older releases?  
> I just got around to trying this out of curiosity.  Superficially it did
> appear to possibly make the issue harder to hit (it took over 30 minutes),
> but the issue otherwise looks much the same with or without that patch.
> 
> Just out of curiosity, next thing on my list is to disable hrtimers entirely
> and see what happens.
> 
> Jonathan
> > 
> > Thanx, Paul
> >   
> 
> ___
> linuxarm mailing list
> linux...@huawei.com
> http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm



Re: [RFC PATCH] powerpc: Disabling MEMORY_HOTPLUG_DEFAULT_ONLINE option for PPC64 arch

2017-08-01 Thread Michael Ellerman
Daniel Henrique Barboza  writes:

> Commit 943db62c316c ("powerpc/pseries: Revert 'Auto-online
> hotplugged memory'") reverted the auto-online feature for pseries due
> to problems with LMB removals not updating the device struct properly.
> Among other things, this commit made the following change in
> arch/powerpc/configs/pseries_defconfig:
>
> @@ -58,7 +58,6 @@ CONFIG_KEXEC_FILE=y
>  CONFIG_IRQ_ALL_CPUS=y
>  CONFIG_MEMORY_HOTPLUG=y
>  CONFIG_MEMORY_HOTREMOVE=y
> -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y
>  CONFIG_KSM=y
>
> The intent was to disable the option in the defconfig of pseries, since
> after that the code doesn't have this support anymore.

It's always polite to Cc the author of a commit you're referring to, so
I added Nathan.

The intention when we merged that fix was that the auto-online code
would be "fixed" to mark the device online. I say "fixed" because it
wasn't entirely clear if that was the correct behaviour, though it
definitely seemed like it should be.

I've lost track of where/if the discussion got to on whether the
auto-online code should do that or not. Did anything get resolved?

> However, this change
> alone isn't enough to prevent situations such as [1], where
> distros can enable the option unaware of the consequences of
> doing it (e.g. breaking LMB hotplug altogether).
>
> Instead of relying on all distros knowing that pseries can't handle
> CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y after 943db62c316c, this patch
> changes mm/Kconfig to make the MEMORY_HOTPLUG_DEFAULT_ONLINE config
> unavailable for the PPC64 arch.
>
> [1] https://bugzilla.redhat.com/show_bug.cgi?id=1476380
>
> Fixes: 943db62c316c ("powerpc/pseries: Revert 'Auto-online hotplugged 
> memory'")
> Signed-off-by: Daniel Henrique Barboza 
> ---
>  mm/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

I don't own that file, so we at least need an Ack from the mm folks.

cheers

> diff --git a/mm/Kconfig b/mm/Kconfig
> index 48b1af4..a342c77 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -169,7 +169,7 @@ config MEMORY_HOTPLUG_SPARSE
>  config MEMORY_HOTPLUG_DEFAULT_ONLINE
>  bool "Online the newly added memory blocks by default"
>  default n
> -depends on MEMORY_HOTPLUG
> +depends on MEMORY_HOTPLUG && !PPC64
>  help
> This option sets the default policy setting for memory hotplug
> onlining policy (/sys/devices/system/memory/auto_online_blocks) which
> -- 
> 2.9.4


RE: [RESEND][PATCH V10 0/3] powernv : Add support for OPAL-OCC command/response interface

2017-08-01 Thread David Laight
From: Shilpasri G Bhat
> Sent: 31 July 2017 08:43
> In P9, OCC (On-Chip-Controller) supports shared memory based
> commad-response interface. Within the shared memory there is an OPAL
  ^ typo
> command buffer and OCC response buffer that can be used to send
> inband commands to OCC. The following commands are supported:
...

David


Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage

2017-08-01 Thread Michael Ellerman
Jens Axboe  writes:
...
>
> Can you try the below fix? Should be more palatable than the previous
> one. Brian, maybe you can take a look at the IRQ issue mentioned above?

Given the patch from Brian fixed the lockdep warning, do you still want
me to try and test this one?

cheers

> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index f6097b89d5d3..dfb89596af81 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -481,13 +481,14 @@ static void scsi_starved_list_run(struct Scsi_Host 
> *shost)
>   * Purpose:Select a proper request queue to serve next
>   *
>   * Arguments:  q   - last request's queue
> + * async   - run queues async, if we need to
>   *
>   * Returns: Nothing
>   *
>   * Notes:  The previous command was completely finished, start
>   * a new one if possible.
>   */
> -static void scsi_run_queue(struct request_queue *q)
> +static void scsi_run_queue(struct request_queue *q, bool async)
>  {
>   struct scsi_device *sdev = q->queuedata;
>  
> @@ -497,7 +498,7 @@ static void scsi_run_queue(struct request_queue *q)
>   scsi_starved_list_run(sdev->host);
>  
>   if (q->mq_ops)
> - blk_mq_run_hw_queues(q, false);
> + blk_mq_run_hw_queues(q, async);
>   else
>   blk_run_queue(q);
>  }
> @@ -509,7 +510,7 @@ void scsi_requeue_run_queue(struct work_struct *work)
>  
>   sdev = container_of(work, struct scsi_device, requeue_work);
>   q = sdev->request_queue;
> - scsi_run_queue(q);
> + scsi_run_queue(q, false);
>  }
>  
>  /*
> @@ -543,17 +544,22 @@ static void scsi_requeue_command(struct request_queue 
> *q, struct scsi_cmnd *cmd)
>   blk_requeue_request(q, req);
>   spin_unlock_irqrestore(q->queue_lock, flags);
>  
> - scsi_run_queue(q);
> + scsi_run_queue(q, true);
>  
>   put_device(&sdev->sdev_gendev);
>  }
>  
> -void scsi_run_host_queues(struct Scsi_Host *shost)
> +static void __scsi_run_host_queues(struct Scsi_Host *shost, bool async)
>  {
>   struct scsi_device *sdev;
>  
>   shost_for_each_device(sdev, shost)
> - scsi_run_queue(sdev->request_queue);
> + scsi_run_queue(sdev->request_queue, async);
> +}
> +
> +void scsi_run_host_queues(struct Scsi_Host *shost)
> +{
> + __scsi_run_host_queues(shost, false);
>  }
>  
>  static void scsi_uninit_cmd(struct scsi_cmnd *cmd)
> @@ -671,7 +677,7 @@ static bool scsi_end_request(struct request *req, 
> blk_status_t error,
>   blk_finish_request(req, error);
>   spin_unlock_irqrestore(q->queue_lock, flags);
>  
> - scsi_run_queue(q);
> + scsi_run_queue(q, false);
>   }
>  
>   put_device(&sdev->sdev_gendev);
> @@ -2293,7 +2299,7 @@ EXPORT_SYMBOL(scsi_block_requests);
>  void scsi_unblock_requests(struct Scsi_Host *shost)
>  {
>   shost->host_self_blocked = 0;
> - scsi_run_host_queues(shost);
> + __scsi_run_host_queues(shost, true);
>  }
>  EXPORT_SYMBOL(scsi_unblock_requests);
>  
> @@ -2897,10 +2903,10 @@ scsi_device_quiesce(struct scsi_device *sdev)
>   if (err)
>   return err;
>  
> - scsi_run_queue(sdev->request_queue);
> + scsi_run_queue(sdev->request_queue, false);
>   while (atomic_read(&sdev->device_busy)) {
>   msleep_interruptible(200);
> - scsi_run_queue(sdev->request_queue);
> + scsi_run_queue(sdev->request_queue, false);
>   }
>   return 0;
>  }
> @@ -2924,7 +2930,7 @@ void scsi_device_resume(struct scsi_device *sdev)
>   mutex_lock(&sdev->state_mutex);
>   if (sdev->sdev_state == SDEV_QUIESCE &&
>   scsi_device_set_state(sdev, SDEV_RUNNING) == 0)
> - scsi_run_queue(sdev->request_queue);
> + scsi_run_queue(sdev->request_queue, false);
>   mutex_unlock(&sdev->state_mutex);
>  }
>  EXPORT_SYMBOL(scsi_device_resume);
>
> -- 
> Jens Axboe


Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage

2017-08-01 Thread Michael Ellerman
Michael Ellerman  writes:

> Brian King  writes:
>
>> On 07/28/2017 10:17 AM, Brian J King wrote:
>>> Jens Axboe  wrote on 07/28/2017 09:25:48 AM:
>>> 
 Can you try the below fix? Should be more palatable than the previous
 one. Brian, maybe you can take a look at the IRQ issue mentioned above?
>>
>> Michael,
>>
>> Does this address the issue you are seeing?
>
> Yes it seems to, thanks.
>
> I only see the trace on reboot, and not 100% of the time. But I've
> survived a couple of reboots now without seeing anything, so I think
> this is helping.
>
> I'll put the patch in my Jenkins overnight and let you know how it
> survives that, which should be ~= 25 boots.

No lockdep warnings or other oddness over night, so that patch looks
good to me.

cheers


Re: [RFC v6 21/62] powerpc: introduce execute-only pkey

2017-08-01 Thread Michael Ellerman
Thiago Jung Bauermann  writes:
> Ram Pai  writes:
...
>> +
>> +/* We got one, store it and use it from here on out */
>> +if (need_to_set_mm_pkey)
>> +mm->context.execute_only_pkey = execute_only_pkey;
>> +return execute_only_pkey;
>> +}
>
> If you follow the code flow in __execute_only_pkey, the AMR and UAMOR
> are read 3 times in total, and AMR is written twice. IAMR is read and
> written twice. Since they are SPRs and access to them is slow (or isn't
> it?),

SPR reads/writes are slow, but they're not *that* slow in comparison to
a system call (which I think is where this code is being called?).

So we should try to avoid too many SPR read/writes, but at the same time
we can accept more than the minimum if it makes the code much easier to
follow.
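
For illustration, the kind of batching being suggested is simply to read each
SPR once, compute the final values, and write each at most once (a sketch
using the usual mfspr/mtspr accessors; the helper and its parameters are made
up for this example):

#include <linux/types.h>
#include <asm/reg.h>

/* Illustrative only: fold all pkey-related updates into a single
 * read-modify-write per SPR instead of touching AMR/IAMR repeatedly. */
static void pkey_commit_regs(u64 amr_mask, u64 amr_bits,
                             u64 iamr_mask, u64 iamr_bits)
{
        u64 amr  = mfspr(SPRN_AMR);     /* one read  */
        u64 iamr = mfspr(SPRN_IAMR);    /* one read  */

        amr  = (amr  & ~amr_mask)  | amr_bits;
        iamr = (iamr & ~iamr_mask) | iamr_bits;

        mtspr(SPRN_AMR, amr);           /* one write */
        mtspr(SPRN_IAMR, iamr);         /* one write */
}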

cheers


Re: [PATCH v3] powerpc/powernv: Enable PCI peer-to-peer

2017-08-01 Thread Michael Ellerman
Brian King  writes:

> Michael,
>
> What do we need on this one before we can pull into your -next branch?

This skiboot side to be merged.

cheers


Re: [PATCH 2/3] powerpc/xmon: Disable and enable tracing command

2017-08-01 Thread Naveen N. Rao
On 2017/07/31 02:22PM, Breno Leitao wrote:
> If tracing is enabled and you get into xmon, the tracing buffer
> continues to be updated, causing possible loss of data due to buffer
> overflow and unnecessary tracing information coming from xmon functions.
> 
> This patch adds a new option that allows the tracing to be disabled and
> re-enabled from inside xmon.

How is this new option useful? In the next patch, you disable tracing by 
default -- in what scenario do you expect to have to re-enable tracing 
from within xmon?

> 
> Signed-off-by: Breno Leitao 
> ---
>  arch/powerpc/xmon/xmon.c | 16 +++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 0cbd910193fa..19276d2f2f25 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -89,6 +89,7 @@ static unsigned long nidump = 16;
>  static unsigned long ncsum = 4096;
>  static int termch;
>  static char tmpstr[128];
> +static char tracing_enabled = 1;
> 
>  static long bus_error_jmp[JMP_BUF_LEN];
>  static int catch_memory_errors;
> @@ -268,6 +269,7 @@ Commands:\n\
>Sr #   read SPR #\n\
>Sw #v write v to SPR #\n\
>t  print backtrace\n\
> +  v  trace enable/disable\n\
>x  exit monitor and recover\n\
>X  exit monitor and don't recover\n"
>  #if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E)
> @@ -983,6 +985,17 @@ cmds(struct pt_regs *excp)
>   case 'x':
>   case 'X':
>   return cmd;
> + case 'v':
> + if (tracing_is_on()) {
> + printk("Disabling tracing\n");
> + tracing_enabled = 0;
> + tracing_off();

This only disables trace buffer updates - ftrace (and all its callbacks, 
et al) remains active, which isn't desirable. Can you see if this works 
for you:
https://patchwork.ozlabs.org/patch/769611/
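
For reference, a minimal sketch of what the complete toggle looks like (the
enable branch is assumed to mirror the quoted one); as noted above, this only
gates ring-buffer updates, not the ftrace callbacks themselves:

        /*
         * Sketch of the full 'v' toggle.  tracing_off() only stops
         * ring-buffer writes; the ftrace callbacks stay registered
         * and keep running.
         */
        case 'v':
                if (tracing_is_on()) {
                        printk("Disabling tracing\n");
                        tracing_enabled = 0;
                        tracing_off();
                } else {
                        printk("Enabling tracing\n");
                        tracing_enabled = 1;
                        tracing_on();
                }
                break;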

- Naveen