Re: [PATCH] powerpc/pseries: Move CMO code from plapr_wrappers.h to platforms/pseries

2016-11-13 Thread kbuild test robot
Hi Michael,

[auto build test ERROR on powerpc/next]
[also build test ERROR on v4.9-rc5 next-2016]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:
https://github.com/0day-ci/linux/commits/Michael-Ellerman/powerpc-pseries-Move-CMO-code-from-plapr_wrappers-h-to-platforms-pseries/20161114-145812
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allmodconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=powerpc

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/pseries/cmm.c: In function 'plpar_page_set_loaned':
>> arch/powerpc/platforms/pseries/cmm.c:114:30: error: implicit declaration of function 'cmo_get_page_size' [-Werror=implicit-function-declaration]
     unsigned long cmo_page_sz = cmo_get_page_size();
                                 ^
   cc1: some warnings being treated as errors

vim +/cmo_get_page_size +114 arch/powerpc/platforms/pseries/cmm.c

   108  static int hotplug_occurred; /* protected by the hotplug mutex */
   109  
   110  static struct task_struct *cmm_thread_ptr;
   111  
   112  static long plpar_page_set_loaned(unsigned long vpa)
   113  {
 > 114  unsigned long cmo_page_sz = cmo_get_page_size();
   115  long rc = 0;
   116  int i;
   117  
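The report stops at the context above; the missing declaration suggests
cmm.c never picks up the accessor's new home after the move. A likely
fix, assumed here rather than stated in the report, would be to include
the platform header that now carries it:

    /* in arch/powerpc/platforms/pseries/cmm.c, with the other includes */
    #include "pseries.h"	/* assumed new home of cmo_get_page_size() */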

---
0-DAY kernel test infrastructure            Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation




[PATCH] powerpc/pseries: Move CMO code from plapr_wrappers.h to platforms/pseries

2016-11-13 Thread Michael Ellerman
Currently there's some CMO (Cooperative Memory Overcommit) code in
plpar_wrappers.h. Some of it is #ifdef CONFIG_PSERIES and some of it
isn't. The end result is that if a file includes plpar_wrappers.h it
won't build with CONFIG_PSERIES=n.

Fix it by moving the CMO code into platforms/pseries. The two hcall
wrappers can just be moved into their only caller, cmm.c, and the
accessors can go in pseries.h.

Note that we need the accessors because cmm.c can be built as a module,
so there needs to be a split between the built-in code and the module,
and that's achieved by using those accessors.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/hvcall.h | 21 
 arch/powerpc/include/asm/plpar_wrappers.h | 32 ---
 arch/powerpc/platforms/pseries/cmm.c  | 32 +++
 arch/powerpc/platforms/pseries/pseries.h  | 19 ++
 4 files changed, 51 insertions(+), 53 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 708edebcf147..1acdcad5f773 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -412,27 +412,6 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc)
}
 }
 
-#ifdef CONFIG_PPC_PSERIES
-extern int CMO_PrPSP;
-extern int CMO_SecPSP;
-extern unsigned long CMO_PageSize;
-
-static inline int cmo_get_primary_psp(void)
-{
-   return CMO_PrPSP;
-}
-
-static inline int cmo_get_secondary_psp(void)
-{
-   return CMO_SecPSP;
-}
-
-static inline unsigned long cmo_get_page_size(void)
-{
-   return CMO_PageSize;
-}
-#endif /* CONFIG_PPC_PSERIES */
-
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h
index 1b394247afc2..034a588b122c 100644
--- a/arch/powerpc/include/asm/plpar_wrappers.h
+++ b/arch/powerpc/include/asm/plpar_wrappers.h
@@ -93,38 +93,6 @@ static inline long register_dtl(unsigned long cpu, unsigned long vpa)
return vpa_call(H_VPA_REG_DTL, cpu, vpa);
 }
 
-static inline long plpar_page_set_loaned(unsigned long vpa)
-{
-   unsigned long cmo_page_sz = cmo_get_page_size();
-   long rc = 0;
-   int i;
-
-   for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
-   rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
-
-   for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
-   plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
-  vpa + i - cmo_page_sz, 0);
-
-   return rc;
-}
-
-static inline long plpar_page_set_active(unsigned long vpa)
-{
-   unsigned long cmo_page_sz = cmo_get_page_size();
-   long rc = 0;
-   int i;
-
-   for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
-   rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
-
-   for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
-   plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
-  vpa + i - cmo_page_sz, 0);
-
-   return rc;
-}
-
 extern void vpa_init(int cpu);
 
 static inline long plpar_pte_enter(unsigned long flags,
diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c
index 66e7227469b8..4412f12374d3 100644
--- a/arch/powerpc/platforms/pseries/cmm.c
+++ b/arch/powerpc/platforms/pseries/cmm.c
@@ -109,6 +109,38 @@ static int hotplug_occurred; /* protected by the hotplug mutex */
 
 static struct task_struct *cmm_thread_ptr;
 
+static long plpar_page_set_loaned(unsigned long vpa)
+{
+   unsigned long cmo_page_sz = cmo_get_page_size();
+   long rc = 0;
+   int i;
+
+   for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+   rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0);
+
+   for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+   plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE,
+  vpa + i - cmo_page_sz, 0);
+
+   return rc;
+}
+
+static long plpar_page_set_active(unsigned long vpa)
+{
+   unsigned long cmo_page_sz = cmo_get_page_size();
+   long rc = 0;
+   int i;
+
+   for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz)
+   rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0);
+
+   for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz)
+   plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED,
+  vpa + i - cmo_page_sz, 0);
+
+   return rc;
+}
+
 /**
  * cmm_alloc_pages - Allocate pages and mark them as loaned
 * @nr: number of pages to allocate
diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h
index b1be7b713fe6..1361a9db534b 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/p
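The pseries.h hunk is cut off above. Going by the commit message and the
definitions removed from hvcall.h, the accessors presumably land in
pseries.h much like this sketch (a reconstruction from the removed code,
not the verbatim hunk):

#ifdef CONFIG_PPC_PSERIES
extern int CMO_PrPSP;
extern int CMO_SecPSP;
extern unsigned long CMO_PageSize;

static inline int cmo_get_primary_psp(void)
{
	return CMO_PrPSP;
}

static inline int cmo_get_secondary_psp(void)
{
	return CMO_SecPSP;
}

static inline unsigned long cmo_get_page_size(void)
{
	return CMO_PageSize;
}
#endif /* CONFIG_PPC_PSERIES */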

Re: [PATCH V3 1/2] mm: move vma_is_anonymous check within pmd_move_must_withdraw

2016-11-13 Thread Balbir Singh


On 14/11/16 02:00, Aneesh Kumar K.V wrote:
> Architectures like ppc64 want to use page table deposit/withdraw
> even with huge pmd dax entries. Allow the arch to override the
> vma_is_anonymous() check by moving it into the
> pmd_move_must_withdraw() function.
> 

I think the changelog can be reworded a bit

Independent of whether the vma is for anonymous memory, some arches
like ppc64 would like to override pmd_move_must_withdraw(). One option
is to encapsulate the vma_is_anonymous() check for general architectures
inside pmd_move_must_withdraw() so that it is always called, and
architectures that need unconditional overriding can override the
function. ppc64 needs to override it when the MMU is configured to use
hash PTEs.
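Concretely, the encapsulation being described is the usual
generic-default-with-arch-override pattern; a sketch of how it ends up,
mirroring the patch under review (quoted in full later in this digest):

#ifndef pmd_move_must_withdraw
static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
					 spinlock_t *old_pmd_ptl,
					 struct vm_area_struct *vma)
{
	/*
	 * Generic default: with split pmd locks, move the preallocated
	 * PTE page table if new_pmd is on a different PMD page table,
	 * and only for anonymous mappings. An arch that defines its own
	 * pmd_move_must_withdraw (e.g. ppc64) replaces all of this.
	 */
	return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
}
#endif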

What do you think?

Balbir Singh.


Re: [PATCH 1/3] crash: move crashkernel parsing and vmcore related code under CONFIG_CRASH_CORE

2016-11-13 Thread Baoquan He
On 11/10/16 at 05:27pm, Hari Bathini wrote:
> Traditionally, kdump is used to save vmcore in case of a crash. Some
> architectures like powerpc can save vmcore using architecture specific
> support instead of kexec/kdump mechanism. Such architecture specific
> support also needs to reserve memory, to be used by dump capture kernel.
> The crashkernel parameter can be reused, for memory reservation, by such
> architecture-specific infrastructure.
> 
> But currently, code related to vmcoreinfo and parsing of crashkernel
> parameter is built under CONFIG_KEXEC_CORE. This patch introduces
> CONFIG_CRASH_CORE and moves the above mentioned code under this config,
> allowing code reuse without dependency on CONFIG_KEXEC. While here,
> replace the multiple definitions of append_elf_note() and final_note()
> with one defined under CONFIG_CRASH_CORE. There is no functional change
> with this patch.

Can't think of a reason to object.

Could you do only the move from kexec_core.c to crash_core.c here,
and then do the arch-specific cleanup in a separate patch?

Besides, there's already a crash_dump.h; can we reuse that?

> 
> Signed-off-by: Hari Bathini 
> ---
>  arch/Kconfig   |4 
>  arch/ia64/kernel/crash.c   |   22 --
>  arch/powerpc/Kconfig   |   10 -
>  arch/powerpc/include/asm/fadump.h  |2 
>  arch/powerpc/kernel/crash.c|2 
>  arch/powerpc/kernel/fadump.c   |   34 ---
>  arch/powerpc/kernel/setup-common.c |5 
>  include/linux/crash_core.h |   75 ++
>  include/linux/kexec.h  |   63 -
>  kernel/Makefile|1 
>  kernel/crash_core.c|  450 
> 
>  kernel/kexec_core.c|  435 ---
>  12 files changed, 550 insertions(+), 553 deletions(-)
>  create mode 100644 include/linux/crash_core.h
>  create mode 100644 kernel/crash_core.c
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 659bdd0..4ad34b9 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -2,7 +2,11 @@
>  # General architecture dependent options
>  #
>  
> +config CRASH_CORE
> + bool
> +
>  config KEXEC_CORE
> + select CRASH_CORE
>   bool
>  
>  config OPROFILE
> diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c
> index 2955f35..75859a0 100644
> --- a/arch/ia64/kernel/crash.c
> +++ b/arch/ia64/kernel/crash.c
> @@ -27,28 +27,6 @@ static int kdump_freeze_monarch;
>  static int kdump_on_init = 1;
>  static int kdump_on_fatal_mca = 1;
>  
> -static inline Elf64_Word
> -*append_elf_note(Elf64_Word *buf, char *name, unsigned type, void *data,
> - size_t data_len)
> -{
> - struct elf_note *note = (struct elf_note *)buf;
> - note->n_namesz = strlen(name) + 1;
> - note->n_descsz = data_len;
> - note->n_type   = type;
> - buf += (sizeof(*note) + 3)/4;
> - memcpy(buf, name, note->n_namesz);
> - buf += (note->n_namesz + 3)/4;
> - memcpy(buf, data, data_len);
> - buf += (data_len + 3)/4;
> - return buf;
> -}
> -
> -static void
> -final_note(void *buf)
> -{
> - memset(buf, 0, sizeof(struct elf_note));
> -}
> -
>  extern void ia64_dump_cpu_regs(void *);
>  
>  static DEFINE_PER_CPU(struct elf_prstatus, elf_prstatus);
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 65fba4c..644703f 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -479,21 +479,23 @@ config RELOCATABLE
> load address of the kernel (eg. u-boot/mkimage).
>  
>  config CRASH_DUMP
> - bool "Build a kdump crash kernel"
> + bool "Build a dump capture kernel"
>   depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
>   select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE
>   help
> -   Build a kernel suitable for use as a kdump capture kernel.
> +   Build a kernel suitable for use as a dump capture kernel.
> The same kernel binary can be used as production kernel and dump
> capture kernel.
>  
>  config FA_DUMP
>   bool "Firmware-assisted dump"
> - depends on PPC64 && PPC_RTAS && CRASH_DUMP && KEXEC
> + depends on PPC64 && PPC_RTAS
> + select CRASH_CORE
> + select CRASH_DUMP
>   help
> A robust mechanism to get reliable kernel crash dump with
> assistance from firmware. This approach does not use kexec,
> -   instead firmware assists in booting the kdump kernel
> +   instead firmware assists in booting the capture kernel
> while preserving memory contents. Firmware-assisted dump
> is meant to be a kdump replacement offering robustness and
> speed not possible without system firmware assistance.
> diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
> index 0031806..60b9108 100644
> --- a/arch/powerpc/include/asm/fadump.h
> +++ b/arch/powerpc/include/asm/fadump.h
> @@ -73,6 +73,8 @@
>   reg_entry++;   

[powerpc v4 3/3] Enable storage keys for radix - user mode execution

2016-11-13 Thread Balbir Singh
ISA 3.0 defines a new encoded access authority that allows instruction
access prevention in privileged mode while allowing normal access from
problem state. This patch just enables the IAMR (Instruction Authority
Mask Register); enabling the AMR would require more work.

I've tested this with a buggy driver and a simple payload. The payload
is specific to the build I've tested.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/mm/pgtable-radix.c | 20 
 1 file changed, 20 insertions(+)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 7343573..5c90bcd 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -339,6 +339,24 @@ static void __init radix_init_amor(void)
mtspr(SPRN_AMOR, amor);
 }
 
+/*
+ * For radix page tables, we set up the IAMR value as follows:
+ * IAMR = 0100...00 (key 0 is set to 1)
+ * AMR, UAMR and UAMOR are not affected
+ */
+static void __init radix_init_iamr(void)
+{
+   unsigned long iamr_mask = 0x4000000000000000;
+   unsigned long iamr = mfspr(SPRN_IAMR);
+
+   if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+   return;
+
+   iamr = iamr_mask;
+
+   mtspr(SPRN_IAMR, iamr);
+}
+
 void __init radix__early_init_mmu(void)
 {
unsigned long lpcr;
@@ -398,6 +416,7 @@ void __init radix__early_init_mmu(void)
radix_init_amor();
}
 
+   radix_init_iamr();
radix_init_pgtable();
 }
 
@@ -415,6 +434,7 @@ void radix__early_init_mmu_secondary(void)
  __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
radix_init_amor();
}
+   radix_init_iamr();
 }
 
 void radix__mmu_cleanup_all(void)
-- 
2.5.5



[powerpc v4 2/3] Detect instruction fetch denied and report

2016-11-13 Thread Balbir Singh
ISA 3.0 allows instruction fetch and execution of user mode pages to
be prevented. If such an error occurs, SRR1 bit 35 reports it. We
catch and report the error in do_page_fault().

Signed-off-by: Balbir Singh 
---
 arch/powerpc/mm/fault.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d0b137d..1e7ff7b 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -404,6 +404,10 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
(cpu_has_feature(CPU_FTR_NOEXECUTE) ||
 !(vma->vm_flags & (VM_READ | VM_WRITE))))
goto bad_area;
+
+   if (regs->msr & SRR1_ISI_N_OR_G)
+   goto bad_area;
+
 #ifdef CONFIG_PPC_STD_MMU
/*
 * protfault should only happen due to us
-- 
2.5.5



[powerpc v4 1/3] Setup AMOR in HV mode

2016-11-13 Thread Balbir Singh
AMOR should be set up in HV mode; we set it up once and let the
generic kernel handle the IAMR. This patch is a prerequisite for
enabling storage keys, as defined in ISA 3.0, in a following patch.

Reported-by: Aneesh Kumar K.V 
Signed-off-by: Balbir Singh 
---
 arch/powerpc/mm/pgtable-radix.c | 21 +
 1 file changed, 21 insertions(+)

diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index ed7bddc..7343573 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -320,6 +320,25 @@ static void update_hid_for_radix(void)
cpu_relax();
 }
 
+/*
+ * In HV mode, we init AMOR so that the hypervisor
+ * and guest can set up the IAMR, enable key 0 and
+ * set it to 1.
+ * AMOR = 1100...00 (mask for key 0 is 11)
+ */
+static void __init radix_init_amor(void)
+{
+   unsigned long amor_mask = 0xc000000000000000;
+   unsigned long amor = mfspr(SPRN_AMOR);
+
+   if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+   return;
+
+   amor = amor_mask;
+
+   mtspr(SPRN_AMOR, amor);
+}
+
 void __init radix__early_init_mmu(void)
 {
unsigned long lpcr;
@@ -376,6 +395,7 @@ void __init radix__early_init_mmu(void)
lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
radix_init_partition_table();
+   radix_init_amor();
}
 
radix_init_pgtable();
@@ -393,6 +413,7 @@ void radix__early_init_mmu_secondary(void)
 
mtspr(SPRN_PTCR,
  __pa(partition_tb) | (PATB_SIZE_SHIFT - 12));
+   radix_init_amor();
}
 }
 
-- 
2.5.5



[powerpc v4 0/3] Enable IAMR storage keys for radix

2016-11-13 Thread Balbir Singh
The first patch sets up AMOR in hypervisor mode. AMOR
needs to be set up before the IAMR (details of AMOR/IAMR in
each patch). The second patch enables detection of exceptions
generated due to instruction fetch violations and OOPSes the
task. The third patch enables the IAMR for both hypervisor
and guest kernels.

I've tested this patch series with a sample hack and
payload.

Chris Smart helped with the series, reviewing and
providing valuable feedback.

Cc: Chris Smart 
Cc: Benjamin Herrenschmidt 
Cc: Michael Neuling 
Cc: Aneesh Kumar K.V 
Cc: Paul Mackerras 

Changelog
  Enable both primary and secondary MMUs (BUG FIX)
  Make the check for instruction violations common (SRR1_ISI_N_OR_G)

Balbir Singh (3):
  Setup AMOR in HV mode
  Detect instruction fetch denied and report
  Enable storage keys for radix - user mode execution

 arch/powerpc/mm/fault.c |  4 
 arch/powerpc/mm/pgtable-radix.c | 41 +
 2 files changed, 45 insertions(+)

-- 
2.5.5



Re: [PATCH V3 2/2] mm: THP page cache support for ppc64

2016-11-13 Thread Kirill A. Shutemov
On Sun, Nov 13, 2016 at 08:30:25PM +0530, Aneesh Kumar K.V wrote:
> Add an arch-specific callback in the generic THP page cache code that
> will deposit and withdraw the preallocated page table. Archs like ppc64
> use this preallocated table to store the hash pte slot information.
> 
> Testing:
> kernel build of the patch series on tmpfs mounted with option huge=always
> 
> The related thp stat:
> thp_fault_alloc 72939
> thp_fault_fallback 60547
> thp_collapse_alloc 603
> thp_collapse_alloc_failed 0
> thp_file_alloc 253763
> thp_file_mapped 4251
> thp_split_page 51518
> thp_split_page_failed 1
> thp_deferred_split_page 73566
> thp_split_pmd 665
> thp_zero_page_alloc 3
> thp_zero_page_alloc_failed 0
> 
> Signed-off-by: Aneesh Kumar K.V 

One nit-pick below, but otherwise

Acked-by: Kirill A. Shutemov 

> @@ -2975,6 +3004,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
>   ret = 0;
>   count_vm_event(THP_FILE_MAPPED);
>  out:
> + /*
> +  * If we are going to fallback to pte mapping, do a
> +  * withdraw with pmd lock held.
> +  */
> + if (arch_needs_pgtable_deposit() && (ret == VM_FAULT_FALLBACK))

Parentheses are redundant around the ret check.

> + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
> +fe->pmd);
>   spin_unlock(fe->ptl);
>   return ret;
>  }

-- 
 Kirill A. Shutemov


[PATCH] Rename early_init_mmu to early_init_mmu_primary

2016-11-13 Thread Balbir Singh

It helps clarify that the action taken is just for the primary
CPU and that more action might be required for the secondaries
in early_init_mmu_secondary(). This patch does not introduce
a functional change.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 10 +-
 arch/powerpc/include/asm/mmu.h   |  2 +-
 arch/powerpc/kernel/setup_32.c   |  2 +-
 arch/powerpc/kernel/setup_64.c   |  2 +-
 arch/powerpc/mm/hash_utils_64.c  |  2 +-
 arch/powerpc/mm/pgtable-radix.c  |  2 +-
 arch/powerpc/mm/tlb_hash32.c |  2 +-
 arch/powerpc/mm/tlb_nohash.c |  4 ++--
 8 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 8afb0e0..c60a629 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -104,13 +104,13 @@ void mmu_early_init_devtree(void);
 void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
 extern void radix_init_native(void);
-extern void hash__early_init_mmu(void);
-extern void radix__early_init_mmu(void);
-static inline void early_init_mmu(void)
+extern void hash__early_init_mmu_primary(void);
+extern void radix__early_init_mmu_primary(void);
+static inline void early_init_mmu_primary(void)
 {
if (radix_enabled())
-   return radix__early_init_mmu();
-   return hash__early_init_mmu();
+   return radix__early_init_mmu_primary();
+   return hash__early_init_mmu_primary();
 }
 extern void hash__early_init_mmu_secondary(void);
 extern void radix__early_init_mmu_secondary(void);
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index e883683..3f8226c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -284,7 +284,7 @@ static inline bool early_radix_enabled(void)
 
 #ifndef __ASSEMBLY__
 /* MMU initialization */
-extern void early_init_mmu(void);
+extern void early_init_mmu_primary(void);
 extern void early_init_mmu_secondary(void);
 extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
   phys_addr_t first_memblock_size);
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 5fe7918..e1ee6d6 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -118,7 +118,7 @@ notrace void __init machine_init(u64 dt_ptr)
/* Do some early initialization based on the flat device tree */
early_init_devtree(__va(dt_ptr));
 
-   early_init_mmu();
+   early_init_mmu_primary();
 
setup_kdump_trampoline();
 }
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index c3e1290..5d1ba51a 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -303,7 +303,7 @@ void __init early_setup(unsigned long dt_ptr)
setup_feature_keys();
 
/* Initialize the hash table or TLB handling */
-   early_init_mmu();
+   early_init_mmu_primary();
 
/*
 * At this point, we can let interrupts switch to virtual mode
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 44d3c3a..e0acd6d 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -964,7 +964,7 @@ void __init hash__early_init_devtree(void)
htab_scan_page_sizes();
 }
 
-void __init hash__early_init_mmu(void)
+void __init hash__early_init_mmu_primary(void)
 {
htab_init_page_sizes();
 
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index ed7bddc..968e29c 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -320,7 +320,7 @@ static void update_hid_for_radix(void)
cpu_relax();
 }
 
-void __init radix__early_init_mmu(void)
+void __init radix__early_init_mmu_primary(void)
 {
unsigned long lpcr;
 
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 702d768..c006f04 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -168,6 +168,6 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 }
 EXPORT_SYMBOL(flush_tlb_range);
 
-void __init early_init_mmu(void)
+void __init early_init_mmu_primary(void)
 {
 }
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 050badc..e704b33 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -718,7 +718,7 @@ static void __init early_mmu_set_memory_limit(void)
 }
 
 /* boot cpu only */
-void __init early_init_mmu(void)
+void __init early_init_mmu_primary(void)
 {
early_init_mmu_global();
early_init_this_mmu();
@@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
memblock_set_current_limit(first_memblock_base + ppc64_rma_size);
 }
 #else /* ! CONFIG_PPC64 

[PATCH V4 2/2] powerpc/kvm: Update kvmppc_set_arch_compat() for ISA v3.00

2016-11-13 Thread Suraj Jitindar Singh
The function kvmppc_set_arch_compat() is used to determine the value of the
processor compatibility register (PCR) for a guest running in a given
compatibility mode. There is currently no support for v3.00 of the ISA.

Add support for v3.00 of the ISA, which adds an ISA v2.07 compatibility mode
to the PCR.

We also add a check to ensure the processor we are running on is capable of
emulating the chosen processor (for example a POWER7 cannot emulate a
POWER8, similarly with a POWER8 and a POWER9).
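
Because the PCR_ARCH_* values defined in the previous patch are
consecutive powers of two, the guest_pcr_bit > host_pcr_bit check and
the vc->pcr = host_pcr_bit - guest_pcr_bit assignment in the patch
below do the right thing: the subtraction yields exactly the
compat-mode bits between the guest's level and the host's. A worked
example, for illustration:

    host_pcr_bit  = PCR_ARCH_207 = 0x8   (POWER8 host)
    guest_pcr_bit = PCR_ARCH_205 = 0x2   (v2.05 compat requested)
    vc->pcr = 0x8 - 0x2 = 0x6 = PCR_ARCH_206 | PCR_ARCH_205

which matches the PCR_ARCH_206 | PCR_ARCH_205 value the old code
hard-coded for this case.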

Based on work by: Paul Mackerras 

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/kvm/book3s_hv.c | 38 +++---
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3686471..5d83ecb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -301,39 +301,47 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 
 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
-   unsigned long pcr = 0;
+   unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+   /* We can (emulate) our own architecture version and anything older */
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   host_pcr_bit = PCR_ARCH_300;
+   else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   host_pcr_bit = PCR_ARCH_207;
+   else if (cpu_has_feature(CPU_FTR_ARCH_206))
+   host_pcr_bit = PCR_ARCH_206;
+   else
+   host_pcr_bit = PCR_ARCH_205;
+
+   /* Determine lowest PCR bit needed to run guest in given PVR level */
if (arch_compat) {
switch (arch_compat) {
case PVR_ARCH_205:
-   /*
-* If an arch bit is set in PCR, all the defined
-* higher-order arch bits also have to be set.
-*/
-   pcr = PCR_ARCH_206 | PCR_ARCH_205;
+   guest_pcr_bit = PCR_ARCH_205;
break;
case PVR_ARCH_206:
case PVR_ARCH_206p:
-   pcr = PCR_ARCH_206;
+   guest_pcr_bit = PCR_ARCH_206;
break;
case PVR_ARCH_207:
+   guest_pcr_bit = PCR_ARCH_207;
+   break;
+   case PVR_ARCH_300:
+   guest_pcr_bit = PCR_ARCH_300;
break;
default:
return -EINVAL;
}
-
-   if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
-   /* POWER7 can't emulate POWER8 */
-   if (!(pcr & PCR_ARCH_206))
-   return -EINVAL;
-   pcr &= ~PCR_ARCH_206;
-   }
}
 
+   /* Check requested PCR bits don't exceed our capabilities */
+   if (guest_pcr_bit > host_pcr_bit)
+   return -EINVAL;
+
spin_lock(&vc->lock);
vc->arch_compat = arch_compat;
-   vc->pcr = pcr;
+   vc->pcr = host_pcr_bit - guest_pcr_bit;
spin_unlock(&vc->lock);
 
return 0;
-- 
2.5.5



[PATCH V4 1/2] powerpc: Define new ISA v3.00 logical PVR value and PCR register value

2016-11-13 Thread Suraj Jitindar Singh
ISA 3.00 adds the logical PVR value 0x0f000005, so add a definition for
this.

Define PCR_ARCH_207 to reflect ISA 2.07 compatibility mode in the processor
compatibility register (PCR).

The next patch changes the algorithm used to determine the required PCR
value in the function kvmppc_set_arch_compat(). We use the PCR_ARCH_XXX
bits to specify and determine the compatibility level which we want to
emulate as well as the compatibility levels which the host is capable
of emulating. To show that we can emulate a v3.00 guest (which is actually
a v3.00 host with no compatibility bits set, at the moment) we need a
PCR_ARCH_300 bit to represent this; however, currently there is no such bit
defined by the ISA. Thus we define a 'dummy' v3.00 compat bit to be used.

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/reg.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 9cd4e8c..30d897a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -377,6 +377,16 @@
 #define   PCR_VEC_DIS  (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */
 #define   PCR_VSX_DIS  (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */
 #define   PCR_TM_DIS   (1ul << (63-2)) /* Trans. memory disable (POWER8) */
+/*
+ * These bits are used in the function kvmppc_set_arch_compat() to specify and
+ * determine both the compatibility level which we want to emulate and the
+ * compatibility level which the host is capable of emulating. Thus we need a
+ * bit to show that we are capable of emulating an ISA v3.00 guest however as
+ * yet no such bit has been defined in the PCR register. Thus we have to define
+ * a 'dummy' value to be used.
+ */
+#define   PCR_ARCH_300 0x10    /* Dummy Architecture 3.00 */
+#define   PCR_ARCH_207 0x8 /* Architecture 2.07 */
 #define   PCR_ARCH_206 0x4 /* Architecture 2.06 */
 #define   PCR_ARCH_205 0x2 /* Architecture 2.05 */
#define SPRN_HEIR   0x153   /* Hypervisor Emulated Instruction Register */
@@ -1218,6 +1228,7 @@
#define PVR_ARCH_206   0x0f000003
#define PVR_ARCH_206p  0x0f000013
#define PVR_ARCH_207   0x0f000004
+#define PVR_ARCH_300   0x0f000005
 
 /* Macros for setting and retrieving special purpose registers */
 #ifndef __ASSEMBLY__
-- 
2.5.5



[PATCH V4 0/2] powerpc: add support for ISA v2.07 compat level

2016-11-13 Thread Suraj Jitindar Singh
Version v3.00 of the ISA added a new compat level to the processor
compatibility register (PCR), an ISA v2.07 compatibility mode.

Upstream QEMU already supports this so it may as well go into the kernel
now.

Change Log:

V1 -> V2: 
- Reworked logic to set and mask the PCR, no functional change

V2 -> V3:
- Reworked logic again, no functional change

V3 -> V4:
- Added a comment in the first patch to clarify why a 'dummy' PCR v3.00
  value is needed

Suraj Jitindar Singh (2):
  powerpc: Define new ISA v3.00 logical PVR value and PCR register value
  powerpc/kvm: Update kvmppc_set_arch_compat() for ISA v3.00

 arch/powerpc/include/asm/reg.h | 11 +++
 arch/powerpc/kvm/book3s_hv.c   | 38 +++---
 2 files changed, 34 insertions(+), 15 deletions(-)

-- 
2.5.5



Re: [PATCH] powerpc/64: Simplify adaptation to new ISA v3.00 HPTE format

2016-11-13 Thread Balbir Singh


On 11/11/16 16:55, Paul Mackerras wrote:
> This changes the way that we support the new ISA v3.00 HPTE format.
> Instead of adapting everything that uses HPTE values to handle either
> the old format or the new format, depending on which CPU we are on,
> we now convert explicitly between old and new formats if necessary
> in the low-level routines that actually access HPTEs in memory.
> This limits the amount of code that needs to know about the new
> format and makes the conversions explicit.  This is OK because the
> old format contains all the information that is in the new format.
> 
> This also fixes operation under a hypervisor, because the H_ENTER
> hypercall (and other hypercalls that deal with HPTEs) will continue
> to require the HPTE value to be supplied in the old format.  At
> present the kernel will not boot in HPT mode on POWER9 under a
> hypervisor.
> 
> This fixes and partially reverts commit 50de596de8be
> ("powerpc/mm/hash: Add support for Power9 Hash", 2016-04-29).
> 
> Fixes: 50de596de8be
> Signed-off-by: Paul Mackerras 
> ---
>  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 47 ++-
>  arch/powerpc/mm/hash_native_64.c  | 30 +
>  arch/powerpc/platforms/ps3/htab.c |  2 +-
>  arch/powerpc/platforms/pseries/lpar.c |  2 +-
>  4 files changed, 65 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> index e407af2..2e6a823 100644
> --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> @@ -70,7 +70,9 @@
>  
>  #define HPTE_V_SSIZE_SHIFT   62
>  #define HPTE_V_AVPN_SHIFT    7
> +#define HPTE_V_COMMON_BITS   ASM_CONST(0x000fffffffffffff)
>  #define HPTE_V_AVPN          ASM_CONST(0x3fffffffffffff80)
> +#define HPTE_V_AVPN_3_0      ASM_CONST(0x000fffffffffff80)
>  #define HPTE_V_AVPN_VAL(x)   (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT)
>  #define HPTE_V_COMPARE(x,y)  (!(((x) ^ (y)) & 0xffffffffffffff80UL))
>  #define HPTE_V_BOLTED        ASM_CONST(0x0000000000000010)
> @@ -80,14 +82,16 @@
>  #define HPTE_V_VALID ASM_CONST(0x0000000000000001)
>  
>  /*
> - * ISA 3.0 have a different HPTE format.
> + * ISA 3.0 has a different HPTE format.
>   */
>  #define HPTE_R_3_0_SSIZE_SHIFT  58
> +#define HPTE_R_3_0_SSIZE_MASK   (3ull << HPTE_R_3_0_SSIZE_SHIFT)
>  #define HPTE_R_PP0      ASM_CONST(0x8000000000000000)
>  #define HPTE_R_TS       ASM_CONST(0x4000000000000000)
>  #define HPTE_R_KEY_HI   ASM_CONST(0x3000000000000000)
>  #define HPTE_R_RPN_SHIFT 12
>  #define HPTE_R_RPN      ASM_CONST(0x0ffffffffffff000)
> +#define HPTE_R_RPN_3_0  ASM_CONST(0x01fffffffffff000)
>  #define HPTE_R_PP       ASM_CONST(0x0000000000000003)
>  #define HPTE_R_PPP      ASM_CONST(0x8000000000000003)
>  #define HPTE_R_N        ASM_CONST(0x0000000000000004)
> @@ -316,12 +320,43 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize,
>*/
>   v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm);
>   v <<= HPTE_V_AVPN_SHIFT;
> - if (!cpu_has_feature(CPU_FTR_ARCH_300))
> - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
> + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT;
>   return v;
>  }
>  
>  /*
> + * ISA v3.0 defines a new HPTE format, which differs from the old
> + * format in having smaller AVPN and ARPN fields, and the B field
> + * in the second dword instead of the first.
> + */
> +static inline unsigned long hpte_old_to_new_v(unsigned long v)
> +{
> + /* trim AVPN, drop B */
> + return v & HPTE_V_COMMON_BITS;
> +}
> +
> +static inline unsigned long hpte_old_to_new_r(unsigned long v, unsigned long r)
> +{
> + /* move B field from 1st to 2nd dword, trim ARPN */
> + return (r & ~HPTE_R_3_0_SSIZE_MASK) |
> + (((v) >> HPTE_V_SSIZE_SHIFT) << HPTE_R_3_0_SSIZE_SHIFT);
> +}
> +
> +static inline unsigned long hpte_new_to_old_v(unsigned long v, unsigned long r)
> +{
> + /* insert B field */
> + return (v & HPTE_V_COMMON_BITS) |
> + ((r & HPTE_R_3_0_SSIZE_MASK) <<
> +  (HPTE_V_SSIZE_SHIFT - HPTE_R_3_0_SSIZE_SHIFT));
> +}
> +
> +static inline unsigned long hpte_new_to_old_r(unsigned long r)
> +{
> + /* clear out B field */
> + return r & ~HPTE_R_3_0_SSIZE_MASK;
> +}
> +

I wonder if we can encapsulate the ISA version check inside the
helpers and, as Aneesh suggested, name them newv3_* as opposed to
new_*?

> +/*
>   * This function sets the AVPN and L fields of the HPTE  appropriately
>   * using the base page size and actual page size.
>   */
> @@ -341,12 +376,8 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize,
>   * aligned for the requested page size
>   */
>  static inline unsigned long hpte_encode_r(unsigned long pa, int 
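
The rest of the quoted patch is truncated; for context, the low-level
accessors in hash_native_64.c use these helpers in a pattern along the
following lines (a sketch of the call pattern the commit message
describes, not the verbatim diff):

	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
		/* hardware wants the new format: convert r first,
		 * since hpte_old_to_new_r() needs the old-format v */
		hpte_r = hpte_old_to_new_r(hpte_v, hpte_r);
		hpte_v = hpte_old_to_new_v(hpte_v);
	}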

[PATCH 0/3] soc: avoid module usage in non-modular code

2016-11-13 Thread Paul Gortmaker
This series of commits is a part of a larger project to ensure
people don't reference modular support functions in non-modular
code.  Overall there were roughly 5k lines of dead code in the
kernel due to this.  So far we've fixed several areas, like tty,
x86, net, gpio ... and we continue to work on other areas.

There are several reasons to not use module support for code that
can never be built as a module, but the big ones are:

 (1) it is easy to accidentally code up unused module_exit and remove code
 (2) it can be misleading when reading the source, thinking it can be
  modular when the Makefile and/or Kconfig prohibit it
 (3) it requires the include of the module.h header file which in turn
 includes nearly everything else.

Two of the changes are essentially source only -- the resulting module
will be binary equivalent.  Only the FSL driver has unused code in
addition to the use of modular macros that get converted.

Note the FSL SOC driver just appeared in linux-next and so this series
won't apply on Linus' master branch.  These commits were applied to
linux-next and build tested there.

Paul.
---

Cc: Alexandre Courbot 
Cc: Arnd Bergmann 
Cc: Maxime Ripard 
Cc: Scott Wood 
Cc: Stephen Warren 
Cc: Thierry Reding 
Cc: Ulf Hansson 
Cc: Yangbo Lu 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-te...@vger.kernel.org

Paul Gortmaker (3):
  soc: sunxi: make sunxi_sram explicitly non-modular
  soc: tegra: make fuse-tegra explicitly non-modular
  soc: fsl: make guts driver explicitly non-modular

 drivers/soc/fsl/guts.c  | 17 ++---
 drivers/soc/sunxi/sunxi_sram.c  |  9 ++---
 drivers/soc/tegra/fuse/fuse-tegra.c |  4 ++--
 3 files changed, 6 insertions(+), 24 deletions(-)

-- 
2.10.1



[PATCH 3/3] soc: fsl: make guts driver explicitly non-modular

2016-11-13 Thread Paul Gortmaker
The Kconfig currently controlling compilation of this code is:

drivers/soc/fsl/Kconfig:config FSL_GUTS
drivers/soc/fsl/Kconfig:bool

...meaning that it currently is not being built as a module by anyone.

Let's remove the modular code that is essentially orphaned, so that
when reading the driver there is no doubt it is builtin-only.

We explicitly disallow a driver unbind, since that doesn't have a
sensible use case anyway, and it allows us to drop the ".remove"
code for non-modular drivers.

Since the code was already not using module_init, the init ordering
remains unchanged with this commit.

Also note that MODULE_DEVICE_TABLE is a no-op for non-modular code.

Cc: Scott Wood 
Cc: Yangbo Lu 
Cc: Arnd Bergmann 
Cc: Ulf Hansson 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-arm-ker...@lists.infradead.org
Signed-off-by: Paul Gortmaker 
---
 drivers/soc/fsl/guts.c | 17 ++---
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/drivers/soc/fsl/guts.c b/drivers/soc/fsl/guts.c
index 0ac88263c2d7..b4d2fd9263b2 100644
--- a/drivers/soc/fsl/guts.c
+++ b/drivers/soc/fsl/guts.c
@@ -11,7 +11,7 @@
 
 #include 
 #include 
-#include <linux/module.h>
+#include <linux/init.h>
 #include 
 #include 
 #include 
@@ -180,12 +180,6 @@ static int fsl_guts_probe(struct platform_device *pdev)
return 0;
 }
 
-static int fsl_guts_remove(struct platform_device *dev)
-{
-   soc_device_unregister(soc_dev);
-   return 0;
-}
-
 /*
  * Table for matching compatible strings, for device tree
  * guts node, for Freescale QorIQ SOCs.
@@ -212,15 +206,14 @@ static const struct of_device_id fsl_guts_of_match[] = {
{ .compatible = "fsl,ls2080a-dcfg", },
{}
 };
-MODULE_DEVICE_TABLE(of, fsl_guts_of_match);
 
 static struct platform_driver fsl_guts_driver = {
.driver = {
.name = "fsl-guts",
+   .suppress_bind_attrs = true,
.of_match_table = fsl_guts_of_match,
},
.probe = fsl_guts_probe,
-   .remove = fsl_guts_remove,
 };
 
 static int __init fsl_guts_init(void)
@@ -228,9 +221,3 @@ static int __init fsl_guts_init(void)
return platform_driver_register(&fsl_guts_driver);
 }
 core_initcall(fsl_guts_init);
-
-static void __exit fsl_guts_exit(void)
-{
-   platform_driver_unregister(&fsl_guts_driver);
-}
-module_exit(fsl_guts_exit);
-- 
2.10.1



Re: [PATCH net-next v7 03/10] dpaa_eth: add option to use one buffer pool set

2016-11-13 Thread David Miller
From: Madalin Bucur 
Date: Fri, 11 Nov 2016 10:20:00 +0200

> @@ -8,3 +8,12 @@ menuconfig FSL_DPAA_ETH
> supporting the Freescale QorIQ chips.
> Depends on Freescale Buffer Manager and Queue Manager
> driver and Frame Manager Driver.
> +
> +if FSL_DPAA_ETH
> +config FSL_DPAA_ETH_COMMON_BPOOL
> + bool "Use a common buffer pool set for all the interfaces"
> + ---help---
> +   The DPAA Ethernet netdevices require buffer pools for storing the buffers
> +   used by the FMan hardware for reception. One can use a single buffer pool
> +   set for all interfaces or a dedicated buffer pool set for each interface.
> +endif # FSL_DPAA_ETH

This in no way belongs in Kconfig.  If you want to support this,
support it with a run-time configuration choice via ethtool flags
or similar.  Do not use debugfs, do not use sysfs, do not use
module options.

If you put it in Kconfig, distributions will have to pick one way or
another which means that users who want the other choice lose.  This
never works.
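
For reference, the run-time knob being suggested would look roughly like
ethtool private flags. A minimal sketch, assuming a hypothetical
common_bpool field in the driver's dpaa_priv and a made-up flag name
(not code from the driver):

#include <linux/ethtool.h>
#include <linux/netdevice.h>

static const char dpaa_priv_flags_strings[][ETH_GSTRING_LEN] = {
	"common-bpool",		/* hypothetical: share one buffer pool set */
};

static u32 dpaa_get_priv_flags(struct net_device *net_dev)
{
	struct dpaa_priv *priv = netdev_priv(net_dev);	/* driver private data */

	return priv->common_bpool ? BIT(0) : 0;
}

static int dpaa_set_priv_flags(struct net_device *net_dev, u32 flags)
{
	struct dpaa_priv *priv = netdev_priv(net_dev);

	priv->common_bpool = !!(flags & BIT(0));
	return 0;
}

/* hooked up via the .get_priv_flags/.set_priv_flags members of the
 * driver's struct ethtool_ops, plus .get_strings/.get_sset_count for
 * the ETH_SS_PRIV_FLAGS string set */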


[PATCH V3 2/2] mm: THP page cache support for ppc64

2016-11-13 Thread Aneesh Kumar K.V
Add an arch-specific callback in the generic THP page cache code that
will deposit and withdraw the preallocated page table. Archs like ppc64
use this preallocated table to store the hash pte slot information.

Testing:
kernel build of the patch series on tmpfs mounted with option huge=always

The related thp stat:
thp_fault_alloc 72939
thp_fault_fallback 60547
thp_collapse_alloc 603
thp_collapse_alloc_failed 0
thp_file_alloc 253763
thp_file_mapped 4251
thp_split_page 51518
thp_split_page_failed 1
thp_deferred_split_page 73566
thp_split_pmd 665
thp_zero_page_alloc 3
thp_zero_page_alloc_failed 0

Signed-off-by: Aneesh Kumar K.V 
---
Changes from V2:
* Handle page table allocation failures.

 arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +
 include/asm-generic/pgtable.h|  3 ++
 mm/Kconfig   |  6 +--
 mm/huge_memory.c | 17 
 mm/khugepaged.c  | 21 +-
 mm/memory.c  | 60 +++-
 6 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 700301bc5190..0ebfbc8f0449 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 */
return true;
 }
+
+
+#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
+static inline bool arch_needs_pgtable_deposit(void)
+{
+   if (radix_enabled())
+   return false;
+   return true;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 324990273ad2..e00e3b7cf6a8 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -653,6 +653,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef arch_needs_pgtable_deposit
+#define arch_needs_pgtable_deposit() (false)
+#endif
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/Kconfig b/mm/Kconfig
index be0ee11fa0d9..0a279d399722 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -447,13 +447,9 @@ choice
  benefit.
 endchoice
 
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
 config TRANSPARENT_HUGE_PAGECACHE
def_bool y
-   depends on TRANSPARENT_HUGEPAGE && !PPC
+   depends on TRANSPARENT_HUGEPAGE
 
 #
 # UP and nommu archs use km based percpu allocator
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 54f265ec902e..a6f1e4443adc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1377,6 +1377,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
 }
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+   pgtable_t pgtable;
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pte_free(mm, pgtable);
+   atomic_long_dec(&mm->nr_ptes);
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr)
 {
@@ -1416,6 +1425,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
atomic_long_dec(&tlb->mm->nr_ptes);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
+   if (arch_needs_pgtable_deposit())
+   zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -1595,6 +1606,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+   /*
+* We are going to unmap this huge page. So
+* just go ahead and zap it
+*/
+   if (arch_needs_pgtable_deposit())
+   zap_deposited_table(mm, pmd);
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 728d7790dc2d..9fb7b275cb63 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,6 +1240,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
+   bool deposited = false;
 
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1264,10 +1265,26 @@ static 

[PATCH V3 1/2] mm: move vma_is_anonymous check within pmd_move_must_withdraw

2016-11-13 Thread Aneesh Kumar K.V
Architectures like ppc64 want to use page table deposit/withdraw
even with huge pmd dax entries. Allow the arch to override the
vma_is_anonymous() check by moving it into the
pmd_move_must_withdraw() function.

Acked-by: Kirill A. Shutemov 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  3 ++-
 include/asm-generic/pgtable.h| 12 
 mm/huge_memory.c | 18 --
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 9fd77f8794a0..700301bc5190 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
-struct spinlock *old_pmd_ptl)
+struct spinlock *old_pmd_ptl,
+struct vm_area_struct *vma)
 {
if (radix_enabled())
return false;
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index c4f8fd2fd384..324990273ad2 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -653,18 +653,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
-#ifndef pmd_move_must_withdraw
-static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
-spinlock_t *old_pmd_ptl)
-{
-   /*
-* With split pmd lock we also need to move preallocated
-* PTE page table if new_pmd is on different PMD page table.
-*/
-   return new_pmd_ptl != old_pmd_ptl;
-}
-#endif
-
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cdcd25cb30fe..54f265ec902e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1424,6 +1424,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return 1;
 }
 
+#ifndef pmd_move_must_withdraw
+static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
+spinlock_t *old_pmd_ptl,
+struct vm_area_struct *vma)
+{
+   /*
+* With split pmd lock we also need to move preallocated
+* PTE page table if new_pmd is on different PMD page table.
+*
+* We also don't deposit and withdraw tables for file pages.
+*/
+   return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
+}
+#endif
+
 bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  unsigned long new_addr, unsigned long old_end,
  pmd_t *old_pmd, pmd_t *new_pmd)
@@ -1458,8 +1473,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
 
-   if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
-   vma_is_anonymous(vma)) {
+   if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
-- 
2.10.2