date:20170727

[PATCH v4 3/3] powerpc/mm/hugetlb: Allow runtime allocation of 16G.

2017-07-27 Thread Aneesh Kumar K.V

Now that we have GIGANTIC_PAGE enabled on powerpc, use this for 16G hugepages
with hash translation mode. Depending on the total system memory we have, we may
be able to allocate 16G hugepages runtime. This also remove the hugetlb setup
difference between hash/radix translation mode.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 5c28bd6f2ae1..2d1ca488ca44 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -54,9 +54,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct 
vm_area_struct *vma,
 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
 static inline bool gigantic_page_supported(void)
 {
-   if (radix_enabled())
-   return true;
-   return false;
+   return true;
 }
 #endif
 
-- 
2.13.3

[PATCH v4 2/3] powerpc/mm/hugetlb: Add support for reserving gigantic huge pages via kernel command line

2017-07-27 Thread Aneesh Kumar K.V

With commit aa888a74977a8 ("hugetlb: support larger than MAX_ORDER") we added
support for allocating gigantic hugepages via kernel command line. Switch
ppc64 arch specific code to use that.

W.r.t FSL support, we now limit our allocation range using 
BOOTMEM_ALLOC_ACCESSIBLE.

We use the kernel command line to do reservation of hugetlb pages on powernv
platforms. On pseries hash mmu mode the supported gigantic huge page size is
16GB and that can only be allocated with hypervisor assist. For pseries the
command line option doesn't do the allocation. Instead pseries does gigantic
hugepage allocation based on hypervisor hint that is specified via
"ibm,expected#pages" property of the memory node.

Cc: Scott Wood 
Cc: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   2 +-
 arch/powerpc/include/asm/hugetlb.h|  14 --
 arch/powerpc/kernel/setup-common.c|   7 -
 arch/powerpc/mm/hash_utils_64.c   |   2 +-
 arch/powerpc/mm/hugetlbpage.c | 177 +++---
 arch/powerpc/mm/init_32.c |   2 -
 6 files changed, 22 insertions(+), 182 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 6981a52b3887..f28d21c69f79 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -468,7 +468,7 @@ extern int htab_bolt_mapping(unsigned long vstart, unsigned 
long vend,
 int psize, int ssize);
 int htab_remove_mapping(unsigned long vstart, unsigned long vend,
int psize, int ssize);
-extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages);
+extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long 
number_of_pages);
 extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);
 
 #ifdef CONFIG_PPC_PSERIES
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index 7f4025a6c69e..b8a0fb442c64 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -218,18 +218,4 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned 
long addr,
 }
 #endif /* CONFIG_HUGETLB_PAGE */
 
-/*
- * FSL Book3E platforms require special gpage handling - the gpages
- * are reserved early in the boot process by memblock instead of via
- * the .dts as on IBM platforms.
- */
-#if defined(CONFIG_HUGETLB_PAGE) && (defined(CONFIG_PPC_FSL_BOOK3E) || \
-defined(CONFIG_PPC_8xx))
-extern void __init reserve_hugetlb_gpages(void);
-#else
-static inline void reserve_hugetlb_gpages(void)
-{
-}
-#endif
-
 #endif /* _ASM_POWERPC_HUGETLB_H */
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 94a948207cd2..0f896f17d5ab 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -916,13 +916,6 @@ void __init setup_arch(char **cmdline_p)
/* Reserve large chunks of memory for use by CMA for KVM. */
kvm_cma_reserve();
 
-   /*
-* Reserve any gigantic pages requested on the command line.
-* memblock needs to have been initialized by the time this is
-* called since this will reserve memory.
-*/
-   reserve_hugetlb_gpages();
-
klp_init_thread_info(_thread_info);
 
init_mm.start_code = (unsigned long)_stext;
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a20669c19e7..2f1f6bc04012 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -509,7 +509,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned 
long node,
phys_addr, block_size, expected_pages);
if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
memblock_reserve(phys_addr, block_size * expected_pages);
-   add_gpage(phys_addr, block_size, expected_pages);
+   pseries_add_gpage(phys_addr, block_size, expected_pages);
}
return 0;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e1bf5ca397fe..a0271d738a30 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -36,26 +36,6 @@
 unsigned int HPAGE_SHIFT;
 EXPORT_SYMBOL(HPAGE_SHIFT);
 
-/*
- * Tracks gpages after the device tree is scanned and before the
- * huge_boot_pages list is ready.  On non-Freescale implementations, this is
- * just used to track 16G pages and so is a single array.  FSL-based
- * implementations may have more than one gpage size, so we need multiple
- * arrays
- */
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-#define MAX_NUMBER_GPAGES  128
-struct psize_gpages {
-   u64 gpage_list[MAX_NUMBER_GPAGES];
-   unsigned int nr_gpages;
-};
-static struct

[PATCH v4 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Aneesh Kumar K.V

When running in guest mode ppc64 supports a different mechanism for hugetlb
allocation/reservation. The LPAR management application called HMC can
be used to reserve a set of hugepages and we pass the details of
reserved pages via device tree to the guest. (more details in
htab_dt_scan_hugepage_blocks()) . We do the memblock_reserve of the range
and later in the boot sequence, we add the reserved range to huge_boot_pages.

But to enable 16G hugetlb on baremetal config (when we are not running as guest)
we want to do memblock reservation during boot. Generic code already does this

Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/hugetlb.h | 1 +
 mm/hugetlb.c| 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0ed8e41aaf11..8bbbd37ab105 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -358,6 +358,7 @@ int huge_add_to_page_cache(struct page *page, struct 
address_space *mapping,
pgoff_t idx);
 
 /* arch callback */
+int __init __alloc_bootmem_huge_page(struct hstate *h);
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
 void __init hugetlb_bad_size(void);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc48ee783dd9..b97e6494d74d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct vm_area_struct 
*vma,
return page;
 }
 
-int __weak alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h)
+   __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
+int __alloc_bootmem_huge_page(struct hstate *h)
 {
struct huge_bootmem_page *m;
int nr_nodes, node;
-- 
2.13.3

Re: [PATCH V8 3/3] powernv: Add support to clear sensor groups data

2017-07-27 Thread Cyril Bur

On Wed, 2017-07-26 at 10:35 +0530, Shilpasri G Bhat wrote:
> Adds support for clearing different sensor groups. OCC inband sensor
> groups like CSM, Profiler, Job Scheduler can be cleared using this
> driver. The min/max of all sensors belonging to these sensor groups
> will be cleared.
> 

Hi Shilpasri,

I think also some comments from v1 also apply here.

Other comments inline

Thanks,

Cyril

> Signed-off-by: Shilpasri G Bhat 
> ---
> Changes from V7:
> - s/send_occ_command/opal_sensor_groups_clear_history
> 
>  arch/powerpc/include/asm/opal-api.h|   3 +-
>  arch/powerpc/include/asm/opal.h|   2 +
>  arch/powerpc/include/uapi/asm/opal-occ.h   |  23 ++
>  arch/powerpc/platforms/powernv/Makefile|   2 +-
>  arch/powerpc/platforms/powernv/opal-occ.c  | 109 
> +
>  arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
>  arch/powerpc/platforms/powernv/opal.c  |   3 +
>  7 files changed, 141 insertions(+), 2 deletions(-)
>  create mode 100644 arch/powerpc/include/uapi/asm/opal-occ.h
>  create mode 100644 arch/powerpc/platforms/powernv/opal-occ.c
> 
> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index 0d37315..342738a 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -195,7 +195,8 @@
>  #define OPAL_SET_POWERCAP153
>  #define OPAL_GET_PSR 154
>  #define OPAL_SET_PSR 155
> -#define OPAL_LAST155
> +#define OPAL_SENSOR_GROUPS_CLEAR 156
> +#define OPAL_LAST156
>  
>  /* Device tree flags */
>  
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 58b30a4..92db6af 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -271,6 +271,7 @@ int64_t opal_xive_set_vp_info(uint64_t vp,
>  int opal_set_powercap(u32 handle, int token, u32 pcap);
>  int opal_get_power_shifting_ratio(u32 handle, int token, u32 *psr);
>  int opal_set_power_shifting_ratio(u32 handle, int token, u32 psr);
> +int opal_sensor_groups_clear(u32 group_hndl, int token);
>  
>  /* Internal functions */
>  extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
> @@ -351,6 +352,7 @@ static inline int opal_get_async_rc(struct opal_msg msg)
>  
>  void opal_powercap_init(void);
>  void opal_psr_init(void);
> +int opal_sensor_groups_clear_history(u32 handle);
>  
>  #endif /* __ASSEMBLY__ */
>  
> diff --git a/arch/powerpc/include/uapi/asm/opal-occ.h 
> b/arch/powerpc/include/uapi/asm/opal-occ.h
> new file mode 100644
> index 000..97c45e2
> --- /dev/null
> +++ b/arch/powerpc/include/uapi/asm/opal-occ.h
> @@ -0,0 +1,23 @@
> +/*
> + * OPAL OCC command interface
> + * Supported on POWERNV platform
> + *
> + * (C) Copyright IBM 2017
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2, or (at your option)
> + * any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#ifndef _UAPI_ASM_POWERPC_OPAL_OCC_H_
> +#define _UAPI_ASM_POWERPC_OPAL_OCC_H_
> +
> +#define OPAL_OCC_IOCTL_CLEAR_SENSOR_GROUPS   _IOR('o', 1, u32)
> +
> +#endif /* _UAPI_ASM_POWERPC_OPAL_OCC_H */
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index 9ed7d33..f193b33 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -2,7 +2,7 @@ obj-y += setup.o opal-wrappers.o opal.o 
> opal-async.o idle.o
>  obj-y+= opal-rtc.o opal-nvram.o opal-lpc.o 
> opal-flash.o
>  obj-y+= rng.o opal-elog.o opal-dump.o 
> opal-sysparam.o opal-sensor.o
>  obj-y+= opal-msglog.o opal-hmi.o opal-power.o 
> opal-irqchip.o
> -obj-y+= opal-kmsg.o opal-powercap.o opal-psr.o
> +obj-y+= opal-kmsg.o opal-powercap.o opal-psr.o 
> opal-occ.o
>  
>  obj-$(CONFIG_SMP)+= smp.o subcore.o subcore-asm.o
>  obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o
> diff --git a/arch/powerpc/platforms/powernv/opal-occ.c 
> b/arch/powerpc/platforms/powernv/opal-occ.c
> new file mode 100644
> index 000..d1d4b28
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-occ.c
> @@ -0,0 +1,109 @@
> +/*
> + * Copyright IBM Corporation 2017
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version

Re: [PATCH V8 2/3] powernv: Add support to set power-shifting-ratio

2017-07-27 Thread Cyril Bur

On Wed, 2017-07-26 at 10:35 +0530, Shilpasri G Bhat wrote:
> This patch adds support to set power-shifting-ratio for CPU-GPU which
> is used by OCC power capping algorithm.
> 
> Signed-off-by: Shilpasri G Bhat 


Hi Shilpasri,

I started looking though this - a lot the comments to patch 1/3 apply
here so I'll stop repeating myself :).


Thanks,

Cyril
> ---
> Changes from V7:
> - Replaced sscanf with kstrtoint
> 
>  arch/powerpc/include/asm/opal-api.h|   4 +-
>  arch/powerpc/include/asm/opal.h|   3 +
>  arch/powerpc/platforms/powernv/Makefile|   2 +-
>  arch/powerpc/platforms/powernv/opal-psr.c  | 169 
> +
>  arch/powerpc/platforms/powernv/opal-wrappers.S |   2 +
>  arch/powerpc/platforms/powernv/opal.c  |   3 +
>  6 files changed, 181 insertions(+), 2 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/opal-psr.c
> 
> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index c3e0c4a..0d37315 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -193,7 +193,9 @@
>  #define OPAL_NPU_MAP_LPAR148
>  #define OPAL_GET_POWERCAP152
>  #define OPAL_SET_POWERCAP153
> -#define OPAL_LAST153
> +#define OPAL_GET_PSR 154
> +#define OPAL_SET_PSR 155
> +#define OPAL_LAST155
>  
>  /* Device tree flags */
>  
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index ec2087c..58b30a4 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -269,6 +269,8 @@ int64_t opal_xive_set_vp_info(uint64_t vp,
>  int64_t opal_xive_dump(uint32_t type, uint32_t id);
>  int opal_get_powercap(u32 handle, int token, u32 *pcap);
>  int opal_set_powercap(u32 handle, int token, u32 pcap);
> +int opal_get_power_shifting_ratio(u32 handle, int token, u32 *psr);
> +int opal_set_power_shifting_ratio(u32 handle, int token, u32 psr);
>  
>  /* Internal functions */
>  extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
> @@ -348,6 +350,7 @@ static inline int opal_get_async_rc(struct opal_msg msg)
>  void opal_wake_poller(void);
>  
>  void opal_powercap_init(void);
> +void opal_psr_init(void);
>  
>  #endif /* __ASSEMBLY__ */
>  
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index e79f806..9ed7d33 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -2,7 +2,7 @@ obj-y += setup.o opal-wrappers.o opal.o 
> opal-async.o idle.o
>  obj-y+= opal-rtc.o opal-nvram.o opal-lpc.o 
> opal-flash.o
>  obj-y+= rng.o opal-elog.o opal-dump.o 
> opal-sysparam.o opal-sensor.o
>  obj-y+= opal-msglog.o opal-hmi.o opal-power.o 
> opal-irqchip.o
> -obj-y+= opal-kmsg.o opal-powercap.o
> +obj-y+= opal-kmsg.o opal-powercap.o opal-psr.o
>  
>  obj-$(CONFIG_SMP)+= smp.o subcore.o subcore-asm.o
>  obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o
> diff --git a/arch/powerpc/platforms/powernv/opal-psr.c 
> b/arch/powerpc/platforms/powernv/opal-psr.c
> new file mode 100644
> index 000..07e3f78
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-psr.c
> @@ -0,0 +1,169 @@
> +/*
> + * PowerNV OPAL Power-Shifting-Ratio interface
> + *
> + * Copyright 2017 IBM Corp.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#define pr_fmt(fmt) "opal-psr: " fmt
> +
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +DEFINE_MUTEX(psr_mutex);
> +
> +static struct kobject *psr_kobj;
> +
> +struct psr_attr {
> + u32 handle;
> + struct kobj_attribute attr;
> +};
> +
> +static struct psr_attr *psr_attrs;
> +static struct kobject *psr_kobj;
> +
> +static ssize_t psr_show(struct kobject *kobj, struct kobj_attribute *attr,
> + char *buf)
> +{
> + struct psr_attr *psr_attr = container_of(attr, struct psr_attr, attr);
> + struct opal_msg msg;
> + int psr, ret, token;
> +
> + token = opal_async_get_token_interruptible();
> + if (token < 0) {
> + pr_devel("Failed to get token\n");
> + return token;
> + }
> +
> + mutex_lock(_mutex);
> + ret = opal_get_power_shifting_ratio(psr_attr->handle, token, );

__pa()

> + switch (ret) {
> + case OPAL_ASYNC_COMPLETION:
> + ret = opal_async_wait_response(token, );
> + if (ret) {
> +

Re: [PATCH V8 1/3] powernv: powercap: Add support for powercap framework

2017-07-27 Thread Shilpasri G Bhat

Hi Cyril,

On 07/28/2017 07:09 AM, Cyril Bur wrote:
> Is there any reason that pcap_attrs needs to be contiguous? If not, I
> feel like you could eliminate the entire loop below (and the last one
> as well maybe) and just do the assignment of pattr_groups[].attrs[] up
> there.
> 
> In fact do you even need to store pcap_attrs? If you kcalloc them as
> you need them (in the loop above), you can always free them again on
> error by freeing pattr_groups[].attrs[] right?
> 
> I'll admit I've become quite confused as to the layout of the sysfs dir
>  that you're creating here - would you mind showing what the expected
> layout will be?
> 
> I'll take more of a look once thats more clear in my head
> 
> Thanks,
> 
> Cyril

The sysfs layout looks as below:
# ls /sys/firmware/opal/powercap/
system-powercap

# ls /sys/firmware/opal/powercap/system-powercap/
powercap-current  powercap-max  powercap-min

# grep .  /sys/firmware/opal/powercap/system-powercap/*
/sys/firmware/opal/powercap/system-powercap/powercap-current:2375
/sys/firmware/opal/powercap/system-powercap/powercap-max:2375
/sys/firmware/opal/powercap/system-powercap/powercap-min:1945

Thanks and Regards,
Shilpa

Re: [PATCH v2] powerpc/mm: Fix pmd/pte_devmap() on non-leaf entries

2017-07-27 Thread Aneesh Kumar K.V

Michael Ellerman  writes:

> From: Oliver O'Halloran 
>
> The Radix MMU translation tree as defined in ISA v3.0 contains two
> different types of entry, directories and leaves. Leaves are
> identified by _PAGE_PTE being set.
>
> The formats of the two entries are different, with the directory
> entries containing no spare bits for use by software. In particular
> the bit we use for _PAGE_DEVMAP is not reserved for software, and is
> part of the NLB (Next Level Base) field, essentially the address of
> the next level in the tree.
>
> Note that the Linux pte_t is not == _PAGE_PTE. A huge page pmd
> entry (or devmap!) is also a leaf and so has _PAGE_PTE set, even
> though we use a pmd_t for it in Linux.
>
> The fix is to ensure that the pmd/pte_devmap() confirm they are
> looking at a leaf entry (_PAGE_PTE) as well as checking _PAGE_DEVMAP.
>

Reviewed-by: Aneesh Kumar K.V 

> Signed-off-by: Oliver O'Halloran 
> Tested-by: Laurent Vivier 
> Tested-by: Jose Ricardo Ziviani 
> Reviewed-by: Suraj Jitindar Singh 
> [mpe: Add a comment in the code and flesh out change log]
> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
>
> v2: Add a comment in the code and flesh out change log
>
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index d1da415e283c..818a58fc3f4f 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -608,9 +608,17 @@ static inline pte_t pte_mkdevmap(pte_t pte)
>   return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP);
>  }
>  
> +/*
> + * This is potentially called with a pmd as the argument, in which case it's 
> not
> + * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
> + * That's because the bit we use for _PAGE_DEVMAP is not reserved for 
> software
> + * use in page directory entries (ie. non-ptes).
> + */
>  static inline int pte_devmap(pte_t pte)
>  {
> - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
> + u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
> +
> + return (pte_raw(pte) & mask) == mask;
>  }
>  
>  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
> -- 
> 2.7.4

[PATCH v2] powerpc/mm: Fix pmd/pte_devmap() on non-leaf entries

2017-07-27 Thread Michael Ellerman

From: Oliver O'Halloran 

The Radix MMU translation tree as defined in ISA v3.0 contains two
different types of entry, directories and leaves. Leaves are
identified by _PAGE_PTE being set.

The formats of the two entries are different, with the directory
entries containing no spare bits for use by software. In particular
the bit we use for _PAGE_DEVMAP is not reserved for software, and is
part of the NLB (Next Level Base) field, essentially the address of
the next level in the tree.

Note that the Linux pte_t is not == _PAGE_PTE. A huge page pmd
entry (or devmap!) is also a leaf and so has _PAGE_PTE set, even
though we use a pmd_t for it in Linux.

The fix is to ensure that the pmd/pte_devmap() confirm they are
looking at a leaf entry (_PAGE_PTE) as well as checking _PAGE_DEVMAP.

Signed-off-by: Oliver O'Halloran 
Tested-by: Laurent Vivier 
Tested-by: Jose Ricardo Ziviani 
Reviewed-by: Suraj Jitindar Singh 
[mpe: Add a comment in the code and flesh out change log]
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

v2: Add a comment in the code and flesh out change log

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415e283c..818a58fc3f4f 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -608,9 +608,17 @@ static inline pte_t pte_mkdevmap(pte_t pte)
return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP);
 }
 
+/*
+ * This is potentially called with a pmd as the argument, in which case it's 
not
+ * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
+ * That's because the bit we use for _PAGE_DEVMAP is not reserved for software
+ * use in page directory entries (ie. non-ptes).
+ */
 static inline int pte_devmap(pte_t pte)
 {
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+   u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
+
+   return (pte_raw(pte) & mask) == mask;
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-- 
2.7.4

Re: [PATCH V8 1/3] powernv: powercap: Add support for powercap framework

2017-07-27 Thread Cyril Bur

On Wed, 2017-07-26 at 10:35 +0530, Shilpasri G Bhat wrote:
> Adds a generic powercap framework to change the system powercap
> inband through OPAL-OCC command/response interface.
> 
> Signed-off-by: Shilpasri G Bhat 
> ---
> Changes from V7:
> - Replaced sscanf with kstrtoint
> 
>  arch/powerpc/include/asm/opal-api.h|   5 +-
>  arch/powerpc/include/asm/opal.h|   4 +
>  arch/powerpc/platforms/powernv/Makefile|   2 +-
>  arch/powerpc/platforms/powernv/opal-powercap.c | 237 
> +
>  arch/powerpc/platforms/powernv/opal-wrappers.S |   2 +
>  arch/powerpc/platforms/powernv/opal.c  |   4 +
>  6 files changed, 252 insertions(+), 2 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/opal-powercap.c
> 
> diff --git a/arch/powerpc/include/asm/opal-api.h 
> b/arch/powerpc/include/asm/opal-api.h
> index 3130a73..c3e0c4a 100644
> --- a/arch/powerpc/include/asm/opal-api.h
> +++ b/arch/powerpc/include/asm/opal-api.h
> @@ -42,6 +42,7 @@
>  #define OPAL_I2C_STOP_ERR-24
>  #define OPAL_XIVE_PROVISIONING   -31
>  #define OPAL_XIVE_FREE_ACTIVE-32
> +#define OPAL_TIMEOUT -33
>  
>  /* API Tokens (in r0) */
>  #define OPAL_INVALID_CALL   -1
> @@ -190,7 +191,9 @@
>  #define OPAL_NPU_INIT_CONTEXT146
>  #define OPAL_NPU_DESTROY_CONTEXT 147
>  #define OPAL_NPU_MAP_LPAR148
> -#define OPAL_LAST148
> +#define OPAL_GET_POWERCAP152
> +#define OPAL_SET_POWERCAP153
> +#define OPAL_LAST153
>  
>  /* Device tree flags */
>  
> diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
> index 588fb1c..ec2087c 100644
> --- a/arch/powerpc/include/asm/opal.h
> +++ b/arch/powerpc/include/asm/opal.h
> @@ -267,6 +267,8 @@ int64_t opal_xive_set_vp_info(uint64_t vp,
>  int64_t opal_xive_free_irq(uint32_t girq);
>  int64_t opal_xive_sync(uint32_t type, uint32_t id);
>  int64_t opal_xive_dump(uint32_t type, uint32_t id);
> +int opal_get_powercap(u32 handle, int token, u32 *pcap);
> +int opal_set_powercap(u32 handle, int token, u32 pcap);
>  
>  /* Internal functions */
>  extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
> @@ -345,6 +347,8 @@ static inline int opal_get_async_rc(struct opal_msg msg)
>  
>  void opal_wake_poller(void);
>  
> +void opal_powercap_init(void);
> +
>  #endif /* __ASSEMBLY__ */
>  
>  #endif /* _ASM_POWERPC_OPAL_H */
> diff --git a/arch/powerpc/platforms/powernv/Makefile 
> b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..e79f806 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -2,7 +2,7 @@ obj-y += setup.o opal-wrappers.o opal.o 
> opal-async.o idle.o
>  obj-y+= opal-rtc.o opal-nvram.o opal-lpc.o 
> opal-flash.o
>  obj-y+= rng.o opal-elog.o opal-dump.o 
> opal-sysparam.o opal-sensor.o
>  obj-y+= opal-msglog.o opal-hmi.o opal-power.o 
> opal-irqchip.o
> -obj-y+= opal-kmsg.o
> +obj-y+= opal-kmsg.o opal-powercap.o
>  
>  obj-$(CONFIG_SMP)+= smp.o subcore.o subcore-asm.o
>  obj-$(CONFIG_PCI)+= pci.o pci-ioda.o npu-dma.o
> diff --git a/arch/powerpc/platforms/powernv/opal-powercap.c 
> b/arch/powerpc/platforms/powernv/opal-powercap.c
> new file mode 100644
> index 000..7c57f4b
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/opal-powercap.c
> @@ -0,0 +1,237 @@
> +/*
> + * PowerNV OPAL Powercap interface
> + *
> + * Copyright 2017 IBM Corp.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +
> +#define pr_fmt(fmt) "opal-powercap: " fmt
> +
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +
> +DEFINE_MUTEX(powercap_mutex);
> +
> +static struct kobject *powercap_kobj;
> +
> +struct powercap_attr {
> + u32 handle;
> + struct kobj_attribute attr;
> +};
> +
> +static struct attribute_group *pattr_groups;
> +static struct powercap_attr *pcap_attrs;
> +
> +static ssize_t powercap_show(struct kobject *kobj, struct kobj_attribute 
> *attr,
> +  char *buf)
> +{
> + struct powercap_attr *pcap_attr = container_of(attr,
> + struct powercap_attr, attr);
> + struct opal_msg msg;
> + u32 pcap;
> + int ret, token;
> +
> + token = opal_async_get_token_interruptible();
> + if (token < 0) {
> + pr_devel("Failed to get token\n");
> + return token;
> + }
> +
> + mutex_lock(_mutex);

If this is purely a userspace interface,

Re: [PATCH v3 4/5] powerpc/lib/sstep: Add prty instruction emulation

2017-07-27 Thread Michael Ellerman

Matt Brown  writes:

> On Thu, Jul 27, 2017 at 11:26 AM, Michael Ellerman  
> wrote:
>> Segher Boessenkool  writes:
>>
>>> On Wed, Jul 26, 2017 at 08:03:30PM +1000, Michael Ellerman wrote:
 Segher Boessenkool  writes:
 > A general question about these patches: some things are inside #ifdef
 > __powerpc64__, some are not.  It seems it is the wrong macro, and it
 > should be used (or not used) consistently?

 Why is it the wrong macro? Because we tend to use CONFIG_PPC64 you mean?
>>>
>>> Yeah.  But I see sstep.c already mixes those two at will (or if there
>>> is a distinction, I'm not seeing it :-) )
>>
>> Yeah OK. In practice they're equivalent, if CONFIG_PPC64=y then the
>> kernel is built 64-bit and therefore __powerpc64__ is defined.
>>
>> But I agree it's a mess, we should use CONFIG_PPC64 exclusively unless
>> there's some reason not to (which I don't think there ever is).
>>
 I thought the reason some are #ifdef'ed is that some are 64-bit only.
 ie. bpermd is 64-bit only ?
>>>
>>> 64-bit only, in what way?  It's not clear what the rules are.
>>
>> Instructions that have "d" in the name? :P
>>
>>> It's not instructions that can only run in 64-bit mode.
>>> It's not instructions that only give a usable result with 64-bit regs
>>> implemented.
>>> It's not instructions only implemented on 64-bit CPUs.
>>
>> I think it's trying to be that ^
>>
>> If you build a 32-bit kernel then instructions that are only defined on
>> 64-bit CPUs should be treated as illegal, so the easiest way to achieve
>> that is to #ifdef off the code for those instructions.
>
> I'll fixup this up to use the xor implementation, and change the
> series to use CONFIG_PPC64 for the ifdef.

Thanks.

In terms of the actual algorithms, we want to use something reasonably
efficient, but don't sweat too much on the performance. The overhead of
the emulation is in the exception we took to get there. So readable and
obviously correct source is the main goal.

cheers

Re: [PATCH v2] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread Suraj Jitindar Singh

On Fri, 2017-07-28 at 01:35 +1000, Oliver O'Halloran wrote:
> The ISA radix translation tree contains two different types of entry,
s/entry/entries
> > directories and leaves. The formats of the two entries are
> different
> with the directory entries containing no spare bits for use by
> software.

Rather than saying the directory entries contain no spare bits, would
it be better to say something like: the devmap property only relates to
pte (leaf) entries and so we shouldn't perform the check on/should
always return false for page directory entries?

> As a result we need to ensure that the *_devmap() family of functions
> check fail for everything except leaf (PTE) entries.
> 
> Signed-off-by: Oliver O'Halloran 
> ---
> "i'll just tweak the mbox before i sent it, what's the worst that can
> happen"
> *completely breaks KVM*
> "..."
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index d1da415..6bc6248 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -610,7 +610,9 @@ static inline pte_t pte_mkdevmap(pte_t pte)
>  
>  static inline int pte_devmap(pte_t pte)
>  {
> - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
> + uint64_t mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
> +
> + return (pte_raw(pte) & mask) == mask;
>  }
>  
>  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)

Reviewed-by: Suraj Jitindar Singh

Re: [PATCH] powerpc/powernv/pci: Return failure for some uses of dma_set_mask()

2017-07-27 Thread Russell Currey

On Wed, 2017-07-26 at 15:26 +1000, Alistair Popple wrote:
> Commit 8e3f1b1d8255 ("powerpc/powernv/pci: Enable 64-bit devices to access
> > 4GB DMA space") introduced the ability for PCI device drivers to request a
> 
> DMA mask between 64 and 32 bits and actually get a mask greater than
> 32-bits. However currently if certain machine configuration dependent
> conditions are not meet the code silently falls back to a 32-bit mask.
> 
> This makes it hard for device drivers to detect which mask they actually
> got. Instead we should return an error when the request could not be
> fulfilled which allows drivers to either fallback or implement other
> workarounds as documented in DMA-API-HOWTO.txt.
> 
> Signed-off-by: Alistair Popple 

Acked-by: Russell Currey

Re: [PATCH v3 4/5] powerpc/lib/sstep: Add prty instruction emulation

2017-07-27 Thread Matt Brown

On Thu, Jul 27, 2017 at 11:26 AM, Michael Ellerman  wrote:
> Segher Boessenkool  writes:
>
>> On Wed, Jul 26, 2017 at 08:03:30PM +1000, Michael Ellerman wrote:
>>> Segher Boessenkool  writes:
>>> > A general question about these patches: some things are inside #ifdef
>>> > __powerpc64__, some are not.  It seems it is the wrong macro, and it
>>> > should be used (or not used) consistently?
>>>
>>> Why is it the wrong macro? Because we tend to use CONFIG_PPC64 you mean?
>>
>> Yeah.  But I see sstep.c already mixes those two at will (or if there
>> is a distinction, I'm not seeing it :-) )
>
> Yeah OK. In practice they're equivalent, if CONFIG_PPC64=y then the
> kernel is built 64-bit and therefore __powerpc64__ is defined.
>
> But I agree it's a mess, we should use CONFIG_PPC64 exclusively unless
> there's some reason not to (which I don't think there ever is).
>
>>> I thought the reason some are #ifdef'ed is that some are 64-bit only.
>>> ie. bpermd is 64-bit only ?
>>
>> 64-bit only, in what way?  It's not clear what the rules are.
>
> Instructions that have "d" in the name? :P
>
>> It's not instructions that can only run in 64-bit mode.
>> It's not instructions that only give a usable result with 64-bit regs
>> implemented.
>> It's not instructions only implemented on 64-bit CPUs.
>
> I think it's trying to be that ^
>
> If you build a 32-bit kernel then instructions that are only defined on
> 64-bit CPUs should be treated as illegal, so the easiest way to achieve
> that is to #ifdef off the code for those instructions.
>

I'll fixup this up to use the xor implementation, and change the
series to use CONFIG_PPC64 for the ifdef.

Thanks,
Matt
> cheers

Re: [PATCH v2] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread joserz

On Fri, Jul 28, 2017 at 01:35:53AM +1000, Oliver O'Halloran wrote:
> The ISA radix translation tree contains two different types of entry,
> directories and leaves. The formats of the two entries are different
> with the directory entries containing no spare bits for use by software.
> As a result we need to ensure that the *_devmap() family of functions
> check fail for everything except leaf (PTE) entries.
> 
> Signed-off-by: Oliver O'Halloran 
> ---
> "i'll just tweak the mbox before i sent it, what's the worst that can happen"
> *completely breaks KVM*
> "..."
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index d1da415..6bc6248 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -610,7 +610,9 @@ static inline pte_t pte_mkdevmap(pte_t pte)
> 
>  static inline int pte_devmap(pte_t pte)
>  {
> - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
> + uint64_t mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
> +
> + return (pte_raw(pte) & mask) == mask;
>  }
> 
>  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
> -- 
> 2.7.4
> 

Michael should thank you because I was about to send another insane
suggestion and ask him if that makes sense. :-D

Tested-by: Jose Ricardo Ziviani

Re: [PATCH v3 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Liam R. Howlett

* Aneesh Kumar K.V  [170727 12:12]:
> 
> 
> On 07/27/2017 08:55 PM, Liam R. Howlett wrote:
> > * Aneesh Kumar K.V  [170727 02:18]:
> > > For ppc64, we want to call this function when we are not running as guest.
> > > Also, if we failed to allocate hugepages, let the user know.
> > > 
> > [...]
> > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > > index bc48ee783dd9..a3a7a7e6339e 100644
> > > --- a/mm/hugetlb.c
> > > +++ b/mm/hugetlb.c
> > > @@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct 
> > > vm_area_struct *vma,
> > >   return page;
> > >   }
> > > -int __weak alloc_bootmem_huge_page(struct hstate *h)
> > > +int alloc_bootmem_huge_page(struct hstate *h)
> > > + __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
> > > +int __alloc_bootmem_huge_page(struct hstate *h)
> > >   {
> > >   struct huge_bootmem_page *m;
> > >   int nr_nodes, node;
> > > @@ -2104,6 +2106,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
> > >   goto found;
> > >   }
> > >   }
> > > + pr_info("Failed to allocate hugepage of size %ld\n", huge_page_size(h));
> > >   return 0;
> > >   found:
> > 
> > There is already a call to warn the user in the
> > hugetlb_hstate_alloc_pages function.  If you look there, you will see
> > that the huge_page_size was translated into a more user friendly format
> > and the count prior to the failure is included.  What call path are you
> > trying to cover?  Also, you may want your print to be a pr_warn since it
> > is a failure?
> > 
> 
> Sorry I missed that in the recent kernel. I wrote the above before the
> mentioned changes was done. I will drop the pr_info from the patch.

Okay, thanks.  I didn't think there was a code path that was missed on
boot.

Cheers,
Liam

Re: [PATCH v3 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Liam R. Howlett

* Aneesh Kumar K.V  [170727 02:18]:
> For ppc64, we want to call this function when we are not running as guest.
> Also, if we failed to allocate hugepages, let the user know.
> 
[...]
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index bc48ee783dd9..a3a7a7e6339e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct 
> vm_area_struct *vma,
>   return page;
>  }
>  
> -int __weak alloc_bootmem_huge_page(struct hstate *h)
> +int alloc_bootmem_huge_page(struct hstate *h)
> + __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
> +int __alloc_bootmem_huge_page(struct hstate *h)
>  {
>   struct huge_bootmem_page *m;
>   int nr_nodes, node;
> @@ -2104,6 +2106,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
>   goto found;
>   }
>   }
> + pr_info("Failed to allocate hugepage of size %ld\n", huge_page_size(h));
>   return 0;
>  
>  found:

There is already a call to warn the user in the
hugetlb_hstate_alloc_pages function.  If you look there, you will see
that the huge_page_size was translated into a more user friendly format
and the count prior to the failure is included.  What call path are you
trying to cover?  Also, you may want your print to be a pr_warn since it
is a failure?

Thanks,
Liam

Re: [RFC v6 15/62] powerpc: helper functions to initialize AMR, IAMR and UMOR registers

2017-07-27 Thread Thiago Jung Bauermann


Ram Pai  writes:

> Introduce helper functions that can initialize the bits in the AMR,
> IAMR and UMOR register; the bits that correspond to the given pkey.
>
> Signed-off-by: Ram Pai 

s/UMOR/UAMOR/ here and in the subject as well.

> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -16,3 +16,47 @@
>  #include /* PKEY_*   */
>
>  bool pkey_inited;
> +#define pkeyshift(pkey) ((arch_max_pkey()-pkey-1) * AMR_BITS_PER_PKEY)
> +
> +static inline void init_amr(int pkey, u8 init_bits)
> +{
> + u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
> + u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
> +
> + write_amr(old_amr | new_amr_bits);
> +}
> +
> +static inline void init_iamr(int pkey, u8 init_bits)
> +{
> + u64 new_iamr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
> + u64 old_iamr = read_iamr() & ~((u64)(0x3ul) << pkeyshift(pkey));
> +
> + write_amr(old_iamr | new_iamr_bits);
> +}

init_iamr should call write_iamr, not write_amr.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

Re: [PATCH v2] include/linux/vfio.h: Guard powerpc-specific functions with CONFIG_VFIO_SPAPR_EEH

2017-07-27 Thread Murilo Opsfelder Araújo

On 07/26/2017 04:37 PM, Alex Williamson wrote:
[...]
> Applied to my for-linus branch with David and Alexey's R-b for v4.13.
> Thanks,
> 
> Alex

Thank you all!

-- 
Murilo

Re: [PATCH v2] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread Laurent Vivier

On 27/07/2017 17:35, ooh...@gmail.com (Oliver O'Halloran) wrote:
> The ISA radix translation tree contains two different types of entry,
> directories and leaves. The formats of the two entries are different
> with the directory entries containing no spare bits for use by software.
> As a result we need to ensure that the *_devmap() family of functions
> check fail for everything except leaf (PTE) entries.
> 
> Signed-off-by: Oliver O'Halloran 
> ---
> "i'll just tweak the mbox before i sent it, what's the worst that can happen"
> *completely breaks KVM*
> "..."
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index d1da415..6bc6248 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -610,7 +610,9 @@ static inline pte_t pte_mkdevmap(pte_t pte)
>  
>  static inline int pte_devmap(pte_t pte)
>  {
> - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
> + uint64_t mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
> +
> + return (pte_raw(pte) & mask) == mask;
>  }
>  
>  static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
> 

Tested-by: Laurent Vivier

Re: [PATCH 2/4] pseries/drc-info: Search DRC properties for CPU indexes

2017-07-27 Thread Nathan Fontenot

On 07/27/2017 11:10 AM, Michael Bringmann wrote:
> 
> pseries/drc-info: Provide parallel routines to convert between
> drc_index and CPU numbers at runtime, using the older device-tree
> properties ("ibm,drc-indexes", "ibm,drc-names", "ibm,drc-types"
> and "ibm,drc-power-domains"), or the new property "ibm,drc-info".
> 
> Signed-off-by: Michael Bringmann 
> ---
>  arch/powerpc/include/asm/prom.h |7 +
>  arch/powerpc/platforms/pseries/of_helpers.c |   79 
>  arch/powerpc/platforms/pseries/pseries_energy.c |  150 
> +++
>  3 files changed, 210 insertions(+), 26 deletions(-)
> ---
> Changes in V2:
>   -- Minor changes to integrate to latest 4.13 code
> 
> diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
> index 4fb02cc..d469d7c 100644
> --- a/arch/powerpc/include/asm/prom.h
> +++ b/arch/powerpc/include/asm/prom.h
> @@ -96,6 +96,13 @@ struct of_drconf_cell {
>  #define DRCONF_MEM_AI_INVALID0x0040
>  #define DRCONF_MEM_RESERVED  0x0080
> 
> +extern int of_one_drc_info(struct property **prop, void **curval,
> + char **dtype, char **dname,
> + u32 *drc_index_start_p,
> + u32 *num_sequential_elems_p,
> + u32 *sequential_inc_p,
> + u32 *last_drc_index_p);
> +
>  /*
>   * There are two methods for telling firmware what our capabilities are.
>   * Newer machines have an "ibm,client-architecture-support" method on the
> diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
> b/arch/powerpc/platforms/pseries/of_helpers.c
> index 2798933..1d59939 100644
> --- a/arch/powerpc/platforms/pseries/of_helpers.c
> +++ b/arch/powerpc/platforms/pseries/of_helpers.c
> @@ -36,3 +36,82 @@ struct device_node *pseries_of_derive_parent(const char 
> *path)
>   kfree(parent_path);
>   return parent ? parent : ERR_PTR(-EINVAL);
>  }
> +
> +
> +/* Helper Routines to convert between drc_index to cpu numbers */
> +
> +int of_one_drc_info(struct property **prop, void **curval,
> + char **dtype, char **dname,
> + u32 *drc_index_start_p,
> + u32 *num_sequential_elems_p,
> + u32 *sequential_inc_p,
> + u32 *last_drc_index_p)
> +{
> + char *drc_type, *drc_name_prefix;
> + u32 drc_index_start, num_sequential_elems, dummy;
> + u32 sequential_inc, last_drc_index;
> + const char *p;
> + const __be32 *p2;
> +
> + drc_index_start = num_sequential_elems = 0;
> + sequential_inc = last_drc_index = 0;
> +
> + /* Get drc-type:encode-string */
> + p = drc_type = (*curval);
> + p = of_prop_next_string(*prop, p);
> + if (!p)
> + return -EINVAL;
> +
> + /* Get drc-name-prefix:encode-string */
> + drc_name_prefix = (char *)p;
> + p = of_prop_next_string(*prop, p);
> + if (!p)
> + return -EINVAL;
> +
> + /* Get drc-index-start:encode-int */
> + p2 = (const __be32 *)p;
> + p2 = of_prop_next_u32(*prop, p2, _index_start);
> + if (!p2)
> + return -EINVAL;
> +
> + /* Get/skip drc-name-suffix-start:encode-int */

Why are we skipping the drc name suffix start value? It seems this
would make the routine unusable for anyone wanting to get drc-name
values.

> + p2 = of_prop_next_u32(*prop, p2, );
> + if (!p)

shouldn't this be checking p2?

> + return -EINVAL;
> +
> + /* Get number-sequential-elements:encode-int */
> + p2 = of_prop_next_u32(*prop, p2, _sequential_elems);
> + if (!p2)
> + return -EINVAL;
> +
> + /* Get sequential-increment:encode-int */
> + p2 = of_prop_next_u32(*prop, p2, _inc);
> + if (!p2)
> + return -EINVAL;
> +
> + /* Get/skip drc-power-domain:encode-int */

Same for power-domain, why skip it?

I don't think any parts of the kernel are currently
looking at this piece of the DRC information, but if we are
going to have a routine to return all the data for a drc-info
block shouldn't it return all of it?

Would it be easier if the routine were to return a drc_info struct with
pointers to the string values and int values read into it?

> + p2 = of_prop_next_u32(*prop, p2, );
> + if (!p2)
> + return -EINVAL;
> +
> + /* Should now know end of current entry */
> + (*curval) = (void *)p2;
> + last_drc_index = drc_index_start +
> + ((num_sequential_elems-1)*sequential_inc);
> +
> + if (dtype)
> + *dtype = drc_type;
> + if (dname)
> + *dname = drc_name_prefix;
> + if (drc_index_start_p)
> + *drc_index_start_p = drc_index_start;
> + if (num_sequential_elems_p)
> + *num_sequential_elems_p = num_sequential_elems;
> + if (sequential_inc_p)
> + *sequential_inc_p = sequential_inc;
> + if (last_drc_index_p)
>

Re: [RFC v6 20/62] powerpc: store and restore the pkey state across context switches

2017-07-27 Thread Thiago Jung Bauermann


Ram Pai  writes:

> Store and restore the AMR, IAMR and UMOR register state of the task
> before scheduling out and after scheduling in, respectively.
>
> Signed-off-by: Ram Pai 

s/UMOR/UAMOR/

> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 2ad725e..9429361 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1096,6 +1096,11 @@ static inline void save_sprs(struct thread_struct *t)
>   t->tar = mfspr(SPRN_TAR);
>   }
>  #endif
> +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> + t->amr = mfspr(SPRN_AMR);
> + t->iamr = mfspr(SPRN_IAMR);
> + t->uamor = mfspr(SPRN_UAMOR);
> +#endif
>  }
>
>  static inline void restore_sprs(struct thread_struct *old_thread,
> @@ -1131,6 +1136,14 @@ static inline void restore_sprs(struct thread_struct 
> *old_thread,
>   mtspr(SPRN_TAR, new_thread->tar);
>   }
>  #endif
> +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> + if (old_thread->amr != new_thread->amr)
> + mtspr(SPRN_AMR, new_thread->amr);
> + if (old_thread->iamr != new_thread->iamr)
> + mtspr(SPRN_IAMR, new_thread->iamr);
> + if (old_thread->uamor != new_thread->uamor)
> + mtspr(SPRN_UAMOR, new_thread->uamor);
> +#endif
>  }

Shouldn't the saving and restoring of the SPRs be guarded by a check for
whether memory protection keys are enabled? What happens when trying to
access these registers on a CPU which doesn't have them?

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-27 Thread Paul E. McKenney

On Thu, Jul 27, 2017 at 05:39:23PM +0100, Jonathan Cameron wrote:
> On Thu, 27 Jul 2017 14:49:03 +0100
> Jonathan Cameron  wrote:
> 
> > On Thu, 27 Jul 2017 05:49:13 -0700
> > "Paul E. McKenney"  wrote:
> > 
> > > On Thu, Jul 27, 2017 at 02:34:00PM +1000, Nicholas Piggin wrote:  
> > > > On Wed, 26 Jul 2017 18:42:14 -0700
> > > > "Paul E. McKenney"  wrote:
> > > > 
> > > > > On Wed, Jul 26, 2017 at 04:22:00PM -0700, David Miller wrote:
> > > > 
> > > > > > Indeed, that really wouldn't explain how we end up with a RCU stall
> > > > > > dump listing almost all of the cpus as having missed a grace 
> > > > > > period.  
> > > > > 
> > > > > I have seen stranger things, but admittedly not often.
> > > > 
> > > > So the backtraces show the RCU gp thread in schedule_timeout.
> > > > 
> > > > Are you sure that it's timeout has expired and it's not being scheduled,
> > > > or could it be a bad (large) timeout (looks unlikely) or that it's being
> > > > scheduled but not correctly noting gps on other CPUs?
> > > > 
> > > > It's not in R state, so if it's not being scheduled at all, then it's
> > > > because the timer has not fired:
> > > 
> > > Good point, Nick!
> > > 
> > > Jonathan, could you please reproduce collecting timer event tracing?  
> > I'm a little new to tracing (only started playing with it last week)
> > so fingers crossed I've set it up right.  No splats yet.  Was getting
> > splats on reading out the trace when running with the RCU stall timer
> > set to 4 so have increased that back to the default and am rerunning.
> > 
> > This may take a while.  Correct me if I've gotten this wrong to save time
> > 
> > echo "timer:*" > /sys/kernel/debug/tracing/set_event
> > 
> > when it dumps, just send you the relevant part of what is in
> > /sys/kernel/debug/tracing/trace?
> 
> Interestingly the only thing that can make trip for me with tracing on
> is peaking in the tracing buffers.  Not sure this is a valid case or
> not.
> 
> Anyhow all timer activity seems to stop around the area of interest.
> 
> 
> [ 9442.413624] INFO: rcu_sched detected stalls on CPUs/tasks:
> [ 9442.419107]1-...: (1 GPs behind) idle=844/0/0 softirq=27747/27755 
> fqs=0 last_accelerate: dd6a/de80, nonlazy_posted: 0, L.
> [ 9442.430224]3-...: (2 GPs behind) idle=8f8/0/0 softirq=32197/32198 
> fqs=0 last_accelerate: 29b1/de80, nonlazy_posted: 0, L.
> [ 9442.441340]4-...: (7 GPs behind) idle=740/0/0 softirq=22351/22352 
> fqs=0 last_accelerate: ca88/de80, nonlazy_posted: 0, L.
> [ 9442.452456]5-...: (2 GPs behind) idle=9b0/0/0 softirq=21315/21319 
> fqs=0 last_accelerate: b280/de88, nonlazy_posted: 0, L.
> [ 9442.463572]6-...: (2 GPs behind) idle=794/0/0 softirq=19699/19707 
> fqs=0 last_accelerate: ba62/de88, nonlazy_posted: 0, L.
> [ 9442.474688]7-...: (2 GPs behind) idle=ac4/0/0 softirq=22547/22554 
> fqs=0 last_accelerate: b280/de88, nonlazy_posted: 0, L.
> [ 9442.485803]8-...: (9 GPs behind) idle=118/0/0 softirq=281/291 
> fqs=0 last_accelerate: c3fe/de88, nonlazy_posted: 0, L.
> [ 9442.496571]9-...: (9 GPs behind) idle=8fc/0/0 softirq=284/292 
> fqs=0 last_accelerate: 6030/de88, nonlazy_posted: 0, L.
> [ 9442.507339]10-...: (14 GPs behind) idle=f78/0/0 softirq=254/254 
> fqs=0 last_accelerate: 5487/de88, nonlazy_posted: 0, L.
> [ 9442.518281]11-...: (9 GPs behind) idle=c9c/0/0 softirq=301/308 
> fqs=0 last_accelerate: 3d3e/de99, nonlazy_posted: 0, L.
> [ 9442.529136]12-...: (9 GPs behind) idle=4a4/0/0 softirq=735/737 
> fqs=0 last_accelerate: 6010/de99, nonlazy_posted: 0, L.
> [ 9442.539992]13-...: (9 GPs behind) idle=34c/0/0 softirq=1121/1131 
> fqs=0 last_accelerate: b280/de99, nonlazy_posted: 0, L.
> [ 9442.551020]14-...: (9 GPs behind) idle=2f4/0/0 softirq=707/713 
> fqs=0 last_accelerate: 6030/de99, nonlazy_posted: 0, L.
> [ 9442.561875]15-...: (2 GPs behind) idle=b30/0/0 softirq=821/976 
> fqs=0 last_accelerate: c208/de99, nonlazy_posted: 0, L.
> [ 9442.572730]17-...: (2 GPs behind) idle=5a8/0/0 softirq=1456/1565 
> fqs=0 last_accelerate: ca88/de99, nonlazy_posted: 0, L.
> [ 9442.583759]18-...: (2 GPs behind) idle=2e4/0/0 softirq=1923/1936 
> fqs=0 last_accelerate: ca88/dea7, nonlazy_posted: 0, L.
> [ 9442.594787]19-...: (2 GPs behind) idle=138/0/0 softirq=1421/1432 
> fqs=0 last_accelerate: b280/dea7, nonlazy_posted: 0, L.
> [ 9442.605816]20-...: (50 GPs behind) idle=634/0/0 softirq=217/219 
> fqs=0 last_accelerate: c96f/dea7, nonlazy_posted: 0, L.
> [ 9442.616758]21-...: (2 GPs behind) idle=eb8/0/0 softirq=1368/1369 
> fqs=0 last_accelerate: b599/deb2, nonlazy_posted: 0, L.
> [ 9442.627786]22-...: (1 GPs behind) idle=aa8/0/0 softirq=229/232 
> fqs=0 last_accelerate: c604/deb2, nonlazy_posted: 0, L.
> [ 9442.638641]23-...: (1 GPs behind) idle=488/0/0

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-27 Thread Jonathan Cameron

On Thu, 27 Jul 2017 14:49:03 +0100
Jonathan Cameron  wrote:

> On Thu, 27 Jul 2017 05:49:13 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Thu, Jul 27, 2017 at 02:34:00PM +1000, Nicholas Piggin wrote:  
> > > On Wed, 26 Jul 2017 18:42:14 -0700
> > > "Paul E. McKenney"  wrote:
> > > 
> > > > On Wed, Jul 26, 2017 at 04:22:00PM -0700, David Miller wrote:
> > > 
> > > > > Indeed, that really wouldn't explain how we end up with a RCU stall
> > > > > dump listing almost all of the cpus as having missed a grace period.  
> > > > > 
> > > > 
> > > > I have seen stranger things, but admittedly not often.
> > > 
> > > So the backtraces show the RCU gp thread in schedule_timeout.
> > > 
> > > Are you sure that it's timeout has expired and it's not being scheduled,
> > > or could it be a bad (large) timeout (looks unlikely) or that it's being
> > > scheduled but not correctly noting gps on other CPUs?
> > > 
> > > It's not in R state, so if it's not being scheduled at all, then it's
> > > because the timer has not fired:
> > 
> > Good point, Nick!
> > 
> > Jonathan, could you please reproduce collecting timer event tracing?  
> I'm a little new to tracing (only started playing with it last week)
> so fingers crossed I've set it up right.  No splats yet.  Was getting
> splats on reading out the trace when running with the RCU stall timer
> set to 4 so have increased that back to the default and am rerunning.
> 
> This may take a while.  Correct me if I've gotten this wrong to save time
> 
> echo "timer:*" > /sys/kernel/debug/tracing/set_event
> 
> when it dumps, just send you the relevant part of what is in
> /sys/kernel/debug/tracing/trace?

Interestingly the only thing that can make trip for me with tracing on
is peaking in the tracing buffers.  Not sure this is a valid case or
not.

Anyhow all timer activity seems to stop around the area of interest.


[ 9442.413624] INFO: rcu_sched detected stalls on CPUs/tasks:
[ 9442.419107]  1-...: (1 GPs behind) idle=844/0/0 softirq=27747/27755 fqs=0 
last_accelerate: dd6a/de80, nonlazy_posted: 0, L.
[ 9442.430224]  3-...: (2 GPs behind) idle=8f8/0/0 softirq=32197/32198 fqs=0 
last_accelerate: 29b1/de80, nonlazy_posted: 0, L.
[ 9442.441340]  4-...: (7 GPs behind) idle=740/0/0 softirq=22351/22352 fqs=0 
last_accelerate: ca88/de80, nonlazy_posted: 0, L.
[ 9442.452456]  5-...: (2 GPs behind) idle=9b0/0/0 softirq=21315/21319 fqs=0 
last_accelerate: b280/de88, nonlazy_posted: 0, L.
[ 9442.463572]  6-...: (2 GPs behind) idle=794/0/0 softirq=19699/19707 fqs=0 
last_accelerate: ba62/de88, nonlazy_posted: 0, L.
[ 9442.474688]  7-...: (2 GPs behind) idle=ac4/0/0 softirq=22547/22554 fqs=0 
last_accelerate: b280/de88, nonlazy_posted: 0, L.
[ 9442.485803]  8-...: (9 GPs behind) idle=118/0/0 softirq=281/291 fqs=0 
last_accelerate: c3fe/de88, nonlazy_posted: 0, L.
[ 9442.496571]  9-...: (9 GPs behind) idle=8fc/0/0 softirq=284/292 fqs=0 
last_accelerate: 6030/de88, nonlazy_posted: 0, L.
[ 9442.507339]  10-...: (14 GPs behind) idle=f78/0/0 softirq=254/254 fqs=0 
last_accelerate: 5487/de88, nonlazy_posted: 0, L.
[ 9442.518281]  11-...: (9 GPs behind) idle=c9c/0/0 softirq=301/308 fqs=0 
last_accelerate: 3d3e/de99, nonlazy_posted: 0, L.
[ 9442.529136]  12-...: (9 GPs behind) idle=4a4/0/0 softirq=735/737 fqs=0 
last_accelerate: 6010/de99, nonlazy_posted: 0, L.
[ 9442.539992]  13-...: (9 GPs behind) idle=34c/0/0 softirq=1121/1131 fqs=0 
last_accelerate: b280/de99, nonlazy_posted: 0, L.
[ 9442.551020]  14-...: (9 GPs behind) idle=2f4/0/0 softirq=707/713 fqs=0 
last_accelerate: 6030/de99, nonlazy_posted: 0, L.
[ 9442.561875]  15-...: (2 GPs behind) idle=b30/0/0 softirq=821/976 fqs=0 
last_accelerate: c208/de99, nonlazy_posted: 0, L.
[ 9442.572730]  17-...: (2 GPs behind) idle=5a8/0/0 softirq=1456/1565 fqs=0 
last_accelerate: ca88/de99, nonlazy_posted: 0, L.
[ 9442.583759]  18-...: (2 GPs behind) idle=2e4/0/0 softirq=1923/1936 fqs=0 
last_accelerate: ca88/dea7, nonlazy_posted: 0, L.
[ 9442.594787]  19-...: (2 GPs behind) idle=138/0/0 softirq=1421/1432 fqs=0 
last_accelerate: b280/dea7, nonlazy_posted: 0, L.
[ 9442.605816]  20-...: (50 GPs behind) idle=634/0/0 softirq=217/219 fqs=0 
last_accelerate: c96f/dea7, nonlazy_posted: 0, L.
[ 9442.616758]  21-...: (2 GPs behind) idle=eb8/0/0 softirq=1368/1369 fqs=0 
last_accelerate: b599/deb2, nonlazy_posted: 0, L.
[ 9442.627786]  22-...: (1 GPs behind) idle=aa8/0/0 softirq=229/232 fqs=0 
last_accelerate: c604/deb2, nonlazy_posted: 0, L.
[ 9442.638641]  23-...: (1 GPs behind) idle=488/0/0 softirq=247/248 fqs=0 
last_accelerate: c600/deb2, nonlazy_posted: 0, L.
[ 9442.649496]  24-...: (33 GPs behind) idle=f7c/0/0 softirq=319/319 fqs=0 
last_accelerate: 5290/deb2, nonlazy_posted: 0, L.
[ 9442.660437]  25-...: (33 GPs behind) idle=944/0/0 softirq=308/308 fqs=0 
last_accelerate: 52c0/deb2, nonlazy_posted: 0, L.
[ 9442.671379]  26-...: (9 GPs behind)

Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage

2017-07-27 Thread Jens Axboe

On 07/27/2017 08:47 AM, Bart Van Assche wrote:
> On Thu, 2017-07-27 at 08:02 -0600, Jens Axboe wrote:
>> The bug looks like SCSI running the queue inline from IRQ
>> context, that's not a good idea. Can you confirm the below works for
>> you?
>>
>>
>> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
>> index f6097b89d5d3..78740ebf966c 100644
>> --- a/drivers/scsi/scsi_lib.c
>> +++ b/drivers/scsi/scsi_lib.c
>> @@ -497,7 +497,7 @@ static void scsi_run_queue(struct request_queue *q)
>>  scsi_starved_list_run(sdev->host);
>>  
>>  if (q->mq_ops)
>> -blk_mq_run_hw_queues(q, false);
>> +blk_mq_run_hw_queues(q, true);
>>  else
>>  blk_run_queue(q);
>>  }
> 
> Hello Jens,
> 
> scsi_run_queue() works fine if no scheduler is configured. Additionally, that
> code predates the introduction of blk-mq I/O schedulers. I think it is
> nontrivial for block driver authors to figure out that a queue has to be run
> from process context if a scheduler has been configured that does not support
> to be run from interrupt context.

No it doesn't, you could never run the queue from interrupt context with
async == false. So I don't think that's confusing at all, you should
always be aware of the context.

> How about adding WARN_ON_ONCE(in_interrupt()) to
> blk_mq_start_hw_queue() or replacing the above patch by the following:

No, I hate having dependencies like that, because they always just catch
one of them. Looks like the IPR path that hits this should just offload
to a workqueue or similar, you don't have to make any scsi_run_queue()
async.

-- 
Jens Axboe

Re: [PATCH v3 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Aneesh Kumar K.V




On 07/27/2017 06:31 PM, Michal Hocko wrote:

On Thu 27-07-17 11:48:26, Aneesh Kumar K.V wrote:

For ppc64, we want to call this function when we are not running as guest.


What does this mean?



ppc64 guest (aka LPAR) support a different mechanism for hugetlb 
allocation/reservation. The LPAR management application called HMC can 
be used to reserve a set of hugepages and we pass the details of 
reserved pages via device tree to the guest. You can find the details in
htab_dt_scan_hugepage_blocks() . We do the memblock_reserve of the range 
and later in the boot sequence, we just add the reserved range to

huge_boot_pages.

For baremetal config (when we are not running as guest) we want to 
follow what other architecture does, that is look at the command line 
and do memblock allocation. Hence the need to call generic function 
__alloc_bootmem_huge_page() in that case.


I can add all these details in to the commit message if that makes it easy ?

-aneesh

Re: [PATCH v3 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Aneesh Kumar K.V




On 07/27/2017 08:55 PM, Liam R. Howlett wrote:

* Aneesh Kumar K.V  [170727 02:18]:

For ppc64, we want to call this function when we are not running as guest.
Also, if we failed to allocate hugepages, let the user know.


[...]

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc48ee783dd9..a3a7a7e6339e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct vm_area_struct 
*vma,
return page;
  }
  
-int __weak alloc_bootmem_huge_page(struct hstate *h)

+int alloc_bootmem_huge_page(struct hstate *h)
+   __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
+int __alloc_bootmem_huge_page(struct hstate *h)
  {
struct huge_bootmem_page *m;
int nr_nodes, node;
@@ -2104,6 +2106,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
goto found;
}
}
+   pr_info("Failed to allocate hugepage of size %ld\n", huge_page_size(h));
return 0;
  
  found:


There is already a call to warn the user in the
hugetlb_hstate_alloc_pages function.  If you look there, you will see
that the huge_page_size was translated into a more user friendly format
and the count prior to the failure is included.  What call path are you
trying to cover?  Also, you may want your print to be a pr_warn since it
is a failure?



Sorry I missed that in the recent kernel. I wrote the above before the 
mentioned changes was done. I will drop the pr_info from the patch.


Thanks
-aneesh

[PATCH V2 3/4] hotplug/drc-info: Add code to search ibm,drc-info property

2017-07-27 Thread Michael Bringmann


rpadlpar_core.c: Provide parallel routines to search the older device-
tree properties ("ibm,drc-indexes", "ibm,drc-names", "ibm,drc-types"
and "ibm,drc-power-domains"), or the new property "ibm,drc-info".

The interface to examine the DRC information is changed from a "get"
function that returns values for local verification elsewhere, to a
"check" function that validates the 'name' and/or 'type' of a device
node.  This update hides the format of the underlying device-tree
properties, and concentrates the value checks into a single function
without requiring the user to verify whether a search was successful.

Signed-off-by: Michael Bringmann 
---
 drivers/pci/hotplug/rpadlpar_core.c |   13 ++--
 drivers/pci/hotplug/rpaphp.h|4 +
 drivers/pci/hotplug/rpaphp_core.c   |  109 +++
 3 files changed, 91 insertions(+), 35 deletions(-)

diff --git a/drivers/pci/hotplug/rpadlpar_core.c 
b/drivers/pci/hotplug/rpadlpar_core.c
index 3f93a4e..e6924de 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "../pci.h"
 #include "rpaphp.h"
@@ -44,15 +45,14 @@ static struct device_node *find_vio_slot_node(char 
*drc_name)
 {
struct device_node *parent = of_find_node_by_name(NULL, "vdevice");
struct device_node *dn = NULL;
-   char *name;
int rc;
 
if (!parent)
return NULL;
 
while ((dn = of_get_next_child(parent, dn))) {
-   rc = rpaphp_get_drc_props(dn, NULL, , NULL, NULL);
-   if ((rc == 0) && (!strcmp(drc_name, name)))
+   rc = rpaphp_check_drc_props(dn, drc_name, NULL);
+   if (rc == 0)
break;
}
 
@@ -64,15 +64,12 @@ static struct device_node *find_php_slot_pci_node(char 
*drc_name,
  char *drc_type)
 {
struct device_node *np = NULL;
-   char *name;
-   char *type;
int rc;
 
while ((np = of_find_node_by_name(np, "pci"))) {
-   rc = rpaphp_get_drc_props(np, NULL, , , NULL);
+   rc = rpaphp_check_drc_props(np, drc_name, drc_type);
if (rc == 0)
-   if (!strcmp(drc_name, name) && !strcmp(drc_type, type))
-   break;
+   break;
}
 
return np;
diff --git a/drivers/pci/hotplug/rpaphp.h b/drivers/pci/hotplug/rpaphp.h
index 7db024e..8db5f2e 100644
--- a/drivers/pci/hotplug/rpaphp.h
+++ b/drivers/pci/hotplug/rpaphp.h
@@ -91,8 +91,8 @@ struct slot {
 
 /* rpaphp_core.c */
 int rpaphp_add_slot(struct device_node *dn);
-int rpaphp_get_drc_props(struct device_node *dn, int *drc_index,
-   char **drc_name, char **drc_type, int *drc_power_domain);
+int rpaphp_check_drc_props(struct device_node *dn, char *drc_name,
+   char *drc_type);
 
 /* rpaphp_slot.c */
 void dealloc_slot_struct(struct slot *slot);
diff --git a/drivers/pci/hotplug/rpaphp_core.c 
b/drivers/pci/hotplug/rpaphp_core.c
index 8d13202..a3c8a1c 100644
--- a/drivers/pci/hotplug/rpaphp_core.c
+++ b/drivers/pci/hotplug/rpaphp_core.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include/* for eeh_add_device() */
 #include   /* rtas_call */
 #include /* for pci_controller */
@@ -196,25 +197,21 @@ static int get_children_props(struct device_node *dn, 
const int **drc_indexes,
return 0;
 }
 
-/* To get the DRC props describing the current node, first obtain it's
- * my-drc-index property.  Next obtain the DRC list from it's parent.  Use
- * the my-drc-index for correlation, and obtain the requested properties.
+
+/* Verify the existence of 'drc_name' and/or 'drc_type' within the
+ * current node.  First obtain it's my-drc-index property.  Next,
+ * obtain the DRC info from it's parent.  Use the my-drc-index for
+ * correlation, and obtain/validate the requested properties.
  */
-int rpaphp_get_drc_props(struct device_node *dn, int *drc_index,
-   char **drc_name, char **drc_type, int *drc_power_domain)
+
+static int rpaphp_check_drc_props_v1(struct device_node *dn, char *drc_name,
+   char *drc_type, unsigned int my_index)
 {
+   char *name_tmp, *type_tmp;
const int *indexes, *names;
const int *types, *domains;
-   const unsigned int *my_index;
-   char *name_tmp, *type_tmp;
int i, rc;
 
-   my_index = of_get_property(dn, "ibm,my-drc-index", NULL);
-   if (!my_index) {
-   /* Node isn't DLPAR/hotplug capable */
-   return -EINVAL;
-   }
-
rc = get_children_props(dn->parent, , , , );
if (rc < 0) {
return -EINVAL;
@@ -225,24 +222,86 @@ int rpaphp_get_drc_props(struct device_node *dn, int 
*drc_index,
 
/* Iterate through parent properties, looking

[PATCH V2 4/4] powerpc: Enable support for ibm,drc-info devtree property

2017-07-27 Thread Michael Bringmann


prom_init.c: Enable support for new DRC device tree property
"ibm,drc-info" in initial handshake between the Linux kernel and
the front end processor.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/kernel/prom_init.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 613f79f..1fd07aa 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -872,6 +872,7 @@ struct ibm_arch_vec __cacheline_aligned 
ibm_architecture_vec = {
.mmu = 0,
.hash_ext = 0,
.radix_ext = 0,
+   .byte22 = OV5_FEAT(OV5_DRC_INFO),
},
 
/* option vector 6: IBM PAPR hints */

[PATCH 2/4] pseries/drc-info: Search DRC properties for CPU indexes

2017-07-27 Thread Michael Bringmann


pseries/drc-info: Provide parallel routines to convert between
drc_index and CPU numbers at runtime, using the older device-tree
properties ("ibm,drc-indexes", "ibm,drc-names", "ibm,drc-types"
and "ibm,drc-power-domains"), or the new property "ibm,drc-info".

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/prom.h |7 +
 arch/powerpc/platforms/pseries/of_helpers.c |   79 
 arch/powerpc/platforms/pseries/pseries_energy.c |  150 +++
 3 files changed, 210 insertions(+), 26 deletions(-)
---
Changes in V2:
  -- Minor changes to integrate to latest 4.13 code

diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 4fb02cc..d469d7c 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -96,6 +96,13 @@ struct of_drconf_cell {
 #define DRCONF_MEM_AI_INVALID  0x0040
 #define DRCONF_MEM_RESERVED0x0080
 
+extern int of_one_drc_info(struct property **prop, void **curval,
+   char **dtype, char **dname,
+   u32 *drc_index_start_p,
+   u32 *num_sequential_elems_p,
+   u32 *sequential_inc_p,
+   u32 *last_drc_index_p);
+
 /*
  * There are two methods for telling firmware what our capabilities are.
  * Newer machines have an "ibm,client-architecture-support" method on the
diff --git a/arch/powerpc/platforms/pseries/of_helpers.c 
b/arch/powerpc/platforms/pseries/of_helpers.c
index 2798933..1d59939 100644
--- a/arch/powerpc/platforms/pseries/of_helpers.c
+++ b/arch/powerpc/platforms/pseries/of_helpers.c
@@ -36,3 +36,82 @@ struct device_node *pseries_of_derive_parent(const char 
*path)
kfree(parent_path);
return parent ? parent : ERR_PTR(-EINVAL);
 }
+
+
+/* Helper Routines to convert between drc_index to cpu numbers */
+
+int of_one_drc_info(struct property **prop, void **curval,
+   char **dtype, char **dname,
+   u32 *drc_index_start_p,
+   u32 *num_sequential_elems_p,
+   u32 *sequential_inc_p,
+   u32 *last_drc_index_p)
+{
+   char *drc_type, *drc_name_prefix;
+   u32 drc_index_start, num_sequential_elems, dummy;
+   u32 sequential_inc, last_drc_index;
+   const char *p;
+   const __be32 *p2;
+
+   drc_index_start = num_sequential_elems = 0;
+   sequential_inc = last_drc_index = 0;
+
+   /* Get drc-type:encode-string */
+   p = drc_type = (*curval);
+   p = of_prop_next_string(*prop, p);
+   if (!p)
+   return -EINVAL;
+
+   /* Get drc-name-prefix:encode-string */
+   drc_name_prefix = (char *)p;
+   p = of_prop_next_string(*prop, p);
+   if (!p)
+   return -EINVAL;
+
+   /* Get drc-index-start:encode-int */
+   p2 = (const __be32 *)p;
+   p2 = of_prop_next_u32(*prop, p2, _index_start);
+   if (!p2)
+   return -EINVAL;
+
+   /* Get/skip drc-name-suffix-start:encode-int */
+   p2 = of_prop_next_u32(*prop, p2, );
+   if (!p)
+   return -EINVAL;
+
+   /* Get number-sequential-elements:encode-int */
+   p2 = of_prop_next_u32(*prop, p2, _sequential_elems);
+   if (!p2)
+   return -EINVAL;
+
+   /* Get sequential-increment:encode-int */
+   p2 = of_prop_next_u32(*prop, p2, _inc);
+   if (!p2)
+   return -EINVAL;
+
+   /* Get/skip drc-power-domain:encode-int */
+   p2 = of_prop_next_u32(*prop, p2, );
+   if (!p2)
+   return -EINVAL;
+
+   /* Should now know end of current entry */
+   (*curval) = (void *)p2;
+   last_drc_index = drc_index_start +
+   ((num_sequential_elems-1)*sequential_inc);
+
+   if (dtype)
+   *dtype = drc_type;
+   if (dname)
+   *dname = drc_name_prefix;
+   if (drc_index_start_p)
+   *drc_index_start_p = drc_index_start;
+   if (num_sequential_elems_p)
+   *num_sequential_elems_p = num_sequential_elems;
+   if (sequential_inc_p)
+   *sequential_inc_p = sequential_inc;
+   if (last_drc_index_p)
+   *last_drc_index_p = last_drc_index;
+
+   return 0;
+}
+EXPORT_SYMBOL(of_one_drc_info);
diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c 
b/arch/powerpc/platforms/pseries/pseries_energy.c
index 164a13d..865c2af 100644
--- a/arch/powerpc/platforms/pseries/pseries_energy.c
+++ b/arch/powerpc/platforms/pseries/pseries_energy.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 #define MODULE_VERS "1.0"
@@ -38,7 +39,6 @@
 static u32 cpu_to_drc_index(int cpu)
 {
struct device_node *dn = NULL;
-   const int *indexes;
int i;
int rc = 1;
u32 ret = 0;
@@ -46,18 +46,65 @@ static u32 cpu_to_drc_index(int cpu)
dn =

[PATCH V2 1/4] powerpc/firmware: Add definitions for new drc-info firmware feature

2017-07-27 Thread Michael Bringmann


Firmware Features: Define new bit flag representing the presence of
new device tree property "ibm,drc-info".  The flag is used to tell
the front end processor when the Linux kernel supports the new properties,
and by the front end processor to tell the Linux kernel that the new
property is present in the device tree.

Signed-off-by: Michael Bringmann 
---
 arch/powerpc/include/asm/firmware.h   |3 ++-
 arch/powerpc/include/asm/prom.h   |1 +
 arch/powerpc/platforms/pseries/firmware.c |1 +
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/firmware.h 
b/arch/powerpc/include/asm/firmware.h
index 8645897..329d537 100644
--- a/arch/powerpc/include/asm/firmware.h
+++ b/arch/powerpc/include/asm/firmware.h
@@ -51,6 +51,7 @@
 #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x8000)
 #define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0001)
 #define FW_FEATURE_PRRNASM_CONST(0x0002)
+#define FW_FEATURE_DRC_INFOASM_CONST(0x0004)
 
 #ifndef __ASSEMBLY__
 
@@ -67,7 +68,7 @@ enum {
FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO |
FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY |
FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN |
-   FW_FEATURE_HPT_RESIZE,
+   FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRC_INFO,
FW_FEATURE_PSERIES_ALWAYS = 0,
FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL,
FW_FEATURE_POWERNV_ALWAYS = 0,
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 35c00d7..4fb02cc 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -172,6 +172,7 @@ struct of_drconf_cell {
 #define OV5_HASH_GTSE  0x1940  /* Guest Translation Shoot Down Avail */
 /* Radix Table Extensions */
 #define OV5_RADIX_GTSE 0x1A40  /* Guest Translation Shoot Down Avail */
+#define OV5_DRC_INFO   0x1640  /* Redef Prop Structures: drc-info   */
 
 /* Option Vector 6: IBM PAPR hints */
 #define OV6_LINUX  0x02/* Linux is our OS */
diff --git a/arch/powerpc/platforms/pseries/firmware.c 
b/arch/powerpc/platforms/pseries/firmware.c
index 63cc82a..757d757 100644
--- a/arch/powerpc/platforms/pseries/firmware.c
+++ b/arch/powerpc/platforms/pseries/firmware.c
@@ -114,6 +114,7 @@ struct vec5_fw_feature {
 vec5_fw_features_table[] = {
{FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY},
{FW_FEATURE_PRRN,   OV5_PRRN},
+   {FW_FEATURE_DRC_INFO,   OV5_DRC_INFO},
 };
 
 static void __init fw_vec5_feature_init(const char *vec5, unsigned long len)

[PATCH V2 0/4] powerpc/devtree: Add support for 'ibm,drc-info' property

2017-07-27 Thread Michael Bringmann


Several properties in the DRC device tree format are replaced by
more compact representations to allow, for example, for the encoding
of vast amounts of memory, and or reduced duplication of information
in related data structures.

"ibm,drc-info": This property, when present, replaces the following
four properties: "ibm,drc-indexes", "ibm,drc-names", "ibm,drc-types"
and "ibm,drc-power-domains".  This property is defined for all
dynamically reconfigurable platform nodes.  The "ibm,drc-info" elements
are intended to provide a more compact representation, and reduce some
search overhead.

"ibm,architecture.vec": Bidirectional communication mechanism between
the host system and the front end processor indicating what features
the host system supports and what features the front end processor will
actually provide.  In this case, we are indicating that the host system
can support the new device tree structure "ibm,drc-info".

Signed-off-by: Michael Bringmann 

Michael Bringmann (4):
  powerpc/firmware: Add definitions for new drc-info firmware feature.
  pseries/drc-info: Search new DRC properties for CPU indexes
  hotplug/drc-info: Add code to search new devtree property
  powerpc: Enable support for new DRC devtree property
---
Changes in V2:
  -- Minor changes to integrate to latest 4.13 code

Re: [RFC PATCH 3/3] mm/hugetlb: Remove pmd_huge_split_prepare

2017-07-27 Thread Aneesh Kumar K.V




On 07/27/2017 06:27 PM, Michal Hocko wrote:

On Thu 27-07-17 14:07:56, Aneesh Kumar K.V wrote:

Instead of marking the pmd ready for split, invalidate the pmd. This should
take care of powerpc requirement.


which is?


I can add the commit which explain details here. Or add more details 
from the older commit here.


c777e2a8b65420b31dac28a453e35be984f5808b

powerpc/mm: Fix Multi hit ERAT cause by recent THP update





Only side effect is that we mark the pmd
invalid early. This can result in us blocking access to the page a bit longer
if we race against a thp split.


Again, this doesn't tell me what is the problem and why do we care.


Primary motivation is code reduction.

  7 files changed, 35 insertions(+), 87 deletions(-)


-aneesh

Re: [RFC PATCH 2/3] powerpc/mm: Implement pmdp_establish for ppc64

2017-07-27 Thread Aneesh Kumar K.V




On 07/27/2017 06:26 PM, Michal Hocko wrote:

On Thu 27-07-17 14:07:55, Aneesh Kumar K.V wrote:

We can now use this to set pmd page table entries to absolute values. THP
need to ensure that we always update pmd PTE entries such that we never mark
the pmd none. pmdp_establish helps in implementing that.

This doesn't flush the tlb. Based on the old_pmd value returned caller can
decide to call flush_pmd_tlb_range()


_Why_ do we need this. It doesn't really help that the newly added
function is not used so we could check that...



We were looking at having pmdp_establish used by the core code. But i 
guess Kirill ended up using pmdp_invalidate. If we don't have 
pmdp_establish usage in core code, we can drop this. This is to help 
Kiril make progress with series at



https://lkml.kernel.org/r/20170615145224.66200-1-kirill.shute...@linux.intel.com


Also thinking about the interface further, I guess pmdp_establish 
interface is some what confusing. So we may want to rethink this 
further. I know that i asked for pmdp_establish in earlier review of 
Kirill's patchset. But now looking back i am not sure we can clearly 
explain only semantic requirement of pmdp_establish. One thing we may 
want to clarify is whether we should retain the Reference and change bit 
from the old entry when we are doing a pmdp_establish ?


Kirill,

Considering core code is still only using pmdp_invalidate(), we may want 
to drop this interface completely ?


-aneesh

Re: [RFC PATCH 1/3] powerpc/mm: update pmdp_invalidate to return old pmd value

2017-07-27 Thread Aneesh Kumar K.V




On 07/27/2017 06:24 PM, Michal Hocko wrote:

EMISSING_CHANGELOG

besides that no user actually uses the return value. Please fold this
into the patch which uses the new functionality.



The patch series was suppose to help Kirill to make progress with the 
his series at



https://lkml.kernel.org/r/20170615145224.66200-1-kirill.shute...@linux.intel.com

It is essentially implementing the pmdp_invalidate update for ppc64. His 
series does it for x86-64.


-aneesh

[PATCH v2] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread Oliver O'Halloran

The ISA radix translation tree contains two different types of entry,
directories and leaves. The formats of the two entries are different
with the directory entries containing no spare bits for use by software.
As a result we need to ensure that the *_devmap() family of functions
check fail for everything except leaf (PTE) entries.

Signed-off-by: Oliver O'Halloran 
---
"i'll just tweak the mbox before i sent it, what's the worst that can happen"
*completely breaks KVM*
"..."
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415..6bc6248 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -610,7 +610,9 @@ static inline pte_t pte_mkdevmap(pte_t pte)
 
 static inline int pte_devmap(pte_t pte)
 {
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+   uint64_t mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE);
+
+   return (pte_raw(pte) & mask) == mask;
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-- 
2.7.4

Re: [RFC v6 19/62] powerpc: ability to create execute-disabled pkeys

2017-07-27 Thread Thiago Jung Bauermann


Thiago Jung Bauermann  writes:
> diff --git a/arch/powerpc/include/asm/pkeys.h 
> b/arch/powerpc/include/asm/pkeys.h
> index e31f5ee8e81f..67e6a3a343ae 100644
> --- a/arch/powerpc/include/asm/pkeys.h
> +++ b/arch/powerpc/include/asm/pkeys.h
> @@ -4,17 +4,6 @@
>  #include 
>  
>  extern bool pkey_inited;
> -/* override any generic PKEY Permission defines */
> -#undef  PKEY_DISABLE_ACCESS
> -#define PKEY_DISABLE_ACCESS0x1
> -#undef  PKEY_DISABLE_WRITE
> -#define PKEY_DISABLE_WRITE 0x2
> -#undef  PKEY_DISABLE_EXECUTE
> -#define PKEY_DISABLE_EXECUTE   0x4
> -#undef  PKEY_ACCESS_MASK
> -#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |\
> - PKEY_DISABLE_WRITE  |\
> - PKEY_DISABLE_EXECUTE)
>  
>  #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | \
>   VM_PKEY_BIT3 | VM_PKEY_BIT4)
> diff --git a/arch/powerpc/include/uapi/asm/mman.h 
> b/arch/powerpc/include/uapi/asm/mman.h
> index ab45cc2f3101..dee43feb7c53 100644
> --- a/arch/powerpc/include/uapi/asm/mman.h
> +++ b/arch/powerpc/include/uapi/asm/mman.h
> @@ -45,4 +45,6 @@
>  #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)  /* 1GB   HugeTLB Page */
>  #define MAP_HUGE_16GB(34 << MAP_HUGE_SHIFT)  /* 16GB  HugeTLB Page */
>  
> +#define PKEY_DISABLE_EXECUTE   0x4
> +
>  #endif /* _UAPI_ASM_POWERPC_MMAN_H */
> diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> index 72eb9a1bde79..777f8f8dff47 100644
> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -12,7 +12,7 @@
>   * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
>   * more details.
>   */
> -#include 
> +#include 
>  #include /* PKEY_*   */
>  
>  bool pkey_inited;
> diff --git a/include/uapi/asm-generic/mman-common.h 
> b/include/uapi/asm-generic/mman-common.h
> index 8c27db0c5c08..93e3841d9ada 100644
> --- a/include/uapi/asm-generic/mman-common.h
> +++ b/include/uapi/asm-generic/mman-common.h
> @@ -74,7 +74,15 @@
>  
>  #define PKEY_DISABLE_ACCESS  0x1
>  #define PKEY_DISABLE_WRITE   0x2
> +
> +/* The arch-specific code may define PKEY_DISABLE_EXECUTE */
> +#ifdef PKEY_DISABLE_EXECUTE
> +#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |\
> + PKEY_DISABLE_WRITE  |   \
> + PKEY_DISABLE_EXECUTE)
> +#else
>  #define PKEY_ACCESS_MASK (PKEY_DISABLE_ACCESS |\
>PKEY_DISABLE_WRITE)
> +#endif
>  
>  #endif /* __ASM_GENERIC_MMAN_COMMON_H */

Actually, I just noticed that arch/powerpc/include/uapi/asm/mman.h
includes , so for the #ifdef above to work
the former has to #define PKEY_DISABLE_EXECUTE before including the
latter.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

Re: [PATCH] soc: Convert to using %pOF instead of full_name

2017-07-27 Thread Simon Horman

On Tue, Jul 18, 2017 at 04:43:29PM -0500, Rob Herring wrote:
> Now that we have a custom printf format specifier, convert users of
> full_name to use %pOF instead. This is preparation to remove storing
> of the full path string for each node.
> 
> Signed-off-by: Rob Herring 
> Cc: Scott Wood 
> Cc: Qiang Zhao 
> Cc: Matthias Brugger 
> Cc: Simon Horman 
> Cc: Magnus Damm 
> Cc: Kukjin Kim 
> Cc: Krzysztof Kozlowski 
> Cc: Javier Martinez Canillas 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: linux-media...@lists.infradead.org
> Cc: linux-renesas-...@vger.kernel.org
> Cc: linux-samsung-...@vger.kernel.org
> ---
>  drivers/soc/fsl/qbman/bman_ccsr.c| 10 +-
>  drivers/soc/fsl/qbman/bman_portal.c  |  8 +++-
>  drivers/soc/fsl/qbman/qman_ccsr.c| 12 ++--
>  drivers/soc/fsl/qbman/qman_portal.c  | 11 ---
>  drivers/soc/fsl/qe/gpio.c|  4 ++--
>  drivers/soc/mediatek/mtk-pmic-wrap.c |  4 ++--
>  drivers/soc/renesas/rcar-rst.c   |  4 ++--
>  drivers/soc/renesas/rcar-sysc.c  |  6 +++---
>  drivers/soc/samsung/pm_domains.c |  8 
>  9 files changed, 31 insertions(+), 36 deletions(-)

Renesas portions:

Acked-by: Simon Horman

Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage

2017-07-27 Thread Bart Van Assche

On Thu, 2017-07-27 at 08:02 -0600, Jens Axboe wrote:
> The bug looks like SCSI running the queue inline from IRQ
> context, that's not a good idea. Can you confirm the below works for
> you?
> 
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index f6097b89d5d3..78740ebf966c 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -497,7 +497,7 @@ static void scsi_run_queue(struct request_queue *q)
>   scsi_starved_list_run(sdev->host);
>  
>   if (q->mq_ops)
> - blk_mq_run_hw_queues(q, false);
> + blk_mq_run_hw_queues(q, true);
>   else
>   blk_run_queue(q);
>  }

Hello Jens,

scsi_run_queue() works fine if no scheduler is configured. Additionally, that
code predates the introduction of blk-mq I/O schedulers. I think it is
nontrivial for block driver authors to figure out that a queue has to be run
from process context if a scheduler has been configured that does not support
to be run from interrupt context. How about adding WARN_ON_ONCE(in_interrupt())
to blk_mq_start_hw_queue() or replacing the above patch by the following:

Subject: [PATCH] blk-mq: Make it safe to call blk_mq_start_hw_queues() from 
interrupt context

blk_mq_start_hw_queues() triggers a queue run. Some functions that
get called to run a queue, e.g. dd_dispatch_request(), are not IRQ-safe.
Hence run the queue asynchronously if blk_mq_start_hw_queues() is called
from interrupt context.

Signed-off-by: Bart Van Assche 
---
 block/blk-mq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 041f7b7fa0d6..c5cb3b2aabcf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1251,7 +1251,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
clear_bit(BLK_MQ_S_STOPPED, >state);

-   blk_mq_run_hw_queue(hctx, false);
+   blk_mq_run_hw_queue(hctx, in_interrupt());
 }
 EXPORT_SYMBOL(blk_mq_start_hw_queue);

Thanks,

Bart.

Re: [RFC Part1 PATCH v3 07/17] x86/mm: Include SEV for encryption memory attribute changes

2017-07-27 Thread Borislav Petkov

On Mon, Jul 24, 2017 at 02:07:47PM -0500, Brijesh Singh wrote:
> From: Tom Lendacky 
> 
> The current code checks only for sme_active() when determining whether
> to perform the encryption attribute change.  Include sev_active() in this
> check so that memory attribute changes can occur under SME and SEV.
> 
> Signed-off-by: Tom Lendacky 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/mm/pageattr.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
> index dfb7d65..b726b23 100644
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int 
> numpages, bool enc)
>   unsigned long start;
>   int ret;
>  
> - /* Nothing to do if the SME is not active */
> - if (!sme_active())
> + /* Nothing to do if SME and SEV are not active */
> + if (!sme_active() && !sev_active())

This is the second place which does

if (!SME && !SEV)

I wonder if, instead of sprinking those, we should have a

if (mem_enc_active())

or so which unifies all those memory encryption logic tests and makes
the code more straightforward for readers who don't have to pay
attention to SME vs SEV ...

Just a thought.

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
--

Re: [RFC v6 19/62] powerpc: ability to create execute-disabled pkeys

2017-07-27 Thread Thiago Jung Bauermann


Ram Pai  writes:

> --- a/arch/powerpc/include/asm/pkeys.h
> +++ b/arch/powerpc/include/asm/pkeys.h
> @@ -2,6 +2,18 @@
>  #define _ASM_PPC64_PKEYS_H
>
>  extern bool pkey_inited;
> +/* override any generic PKEY Permission defines */
> +#undef  PKEY_DISABLE_ACCESS
> +#define PKEY_DISABLE_ACCESS0x1
> +#undef  PKEY_DISABLE_WRITE
> +#define PKEY_DISABLE_WRITE 0x2
> +#undef  PKEY_DISABLE_EXECUTE
> +#define PKEY_DISABLE_EXECUTE   0x4
> +#undef  PKEY_ACCESS_MASK
> +#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |\
> + PKEY_DISABLE_WRITE  |\
> + PKEY_DISABLE_EXECUTE)
> +

Is it ok to #undef macros from another header? Especially since said
header is in uapi (include/uapi/asm-generic/mman-common.h).

Also, it's unnecessary to undef the _ACCESS and _WRITE macros since they
are identical to the original definition. And since these macros are
originally defined in an uapi header, the powerpc-specific ones should
be in an uapi header as well, if I understand it correctly.

An alternative solution is to define only PKEY_DISABLE_EXECUTE in
arch/powerpc/include/uapi/asm/mman.h and then test for its existence to
properly define PKEY_ACCESS_MASK in
include/uapi/asm-generic/mman-common.h. What do you think of the code
below?

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index e31f5ee8e81f..67e6a3a343ae 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -4,17 +4,6 @@
 #include 
 
 extern bool pkey_inited;
-/* override any generic PKEY Permission defines */
-#undef  PKEY_DISABLE_ACCESS
-#define PKEY_DISABLE_ACCESS0x1
-#undef  PKEY_DISABLE_WRITE
-#define PKEY_DISABLE_WRITE 0x2
-#undef  PKEY_DISABLE_EXECUTE
-#define PKEY_DISABLE_EXECUTE   0x4
-#undef  PKEY_ACCESS_MASK
-#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |\
-   PKEY_DISABLE_WRITE  |\
-   PKEY_DISABLE_EXECUTE)
 
 #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | \
VM_PKEY_BIT3 | VM_PKEY_BIT4)
diff --git a/arch/powerpc/include/uapi/asm/mman.h 
b/arch/powerpc/include/uapi/asm/mman.h
index ab45cc2f3101..dee43feb7c53 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -45,4 +45,6 @@
 #define MAP_HUGE_1GB   (30 << MAP_HUGE_SHIFT)  /* 1GB   HugeTLB Page */
 #define MAP_HUGE_16GB  (34 << MAP_HUGE_SHIFT)  /* 16GB  HugeTLB Page */
 
+#define PKEY_DISABLE_EXECUTE   0x4
+
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 72eb9a1bde79..777f8f8dff47 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -12,7 +12,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  * more details.
  */
-#include 
+#include 
 #include /* PKEY_*   */
 
 bool pkey_inited;
diff --git a/include/uapi/asm-generic/mman-common.h 
b/include/uapi/asm-generic/mman-common.h
index 8c27db0c5c08..93e3841d9ada 100644
--- a/include/uapi/asm-generic/mman-common.h
+++ b/include/uapi/asm-generic/mman-common.h
@@ -74,7 +74,15 @@
 
 #define PKEY_DISABLE_ACCESS0x1
 #define PKEY_DISABLE_WRITE 0x2
+
+/* The arch-specific code may define PKEY_DISABLE_EXECUTE */
+#ifdef PKEY_DISABLE_EXECUTE
+#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |  \
+   PKEY_DISABLE_WRITE  |   \
+   PKEY_DISABLE_EXECUTE)
+#else
 #define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |\
 PKEY_DISABLE_WRITE)
+#endif
 
 #endif /* __ASM_GENERIC_MMAN_COMMON_H */


> diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> index 98d0391..b9ad98d 100644
> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -73,6 +73,7 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, 
> int pkey,
>   unsigned long init_val)
>  {
>   u64 new_amr_bits = 0x0ul;
> + u64 new_iamr_bits = 0x0ul;
>
>   if (!is_pkey_enabled(pkey))
>   return -1;
> @@ -85,5 +86,14 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, 
> int pkey,
>
>   init_amr(pkey, new_amr_bits);
>
> + /*
> +  * By default execute is disabled.
> +  * To enable execute, PKEY_ENABLE_EXECUTE
> +  * needs to be specified.
> +  */
> + if ((init_val & PKEY_DISABLE_EXECUTE))
> + new_iamr_bits |= IAMR_EX_BIT;
> +
> + init_iamr(pkey, new_iamr_bits);
>   return 0;
>  }

The comment seems to be from an earlier version which has the logic
inverted, and there is no PKEY_ENABLE_EXECUTE. Should the comment be
updated to the following?

By default execute is enabled.
To disable execute, PKEY_DISABLE_EXECUTE needs to be specified.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

[PATCH] powerpc/mm: Check for _PAGE_PTE in *_devmap()

2017-07-27 Thread Oliver O'Halloran

The ISA radix translation tree contains two different types of entry,
directories and leaves. The formats of the two entries are different
with the directory entries containing no spare bits for use by software.
As a result we need to ensure that the *_devmap() family of functions
check fail for everything except leaf (PTE) entries.

Signed-off-by: Oliver O'Halloran 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c0737c8..e1989dd 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -610,7 +610,7 @@ static inline pte_t pte_mkdevmap(pte_t pte)
 
 static inline int pte_devmap(pte_t pte)
 {
-   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP));
+   return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE));
 }
 
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-- 
2.9.3

Re: [RFC v6 17/62] powerpc: implementation for arch_set_user_pkey_access()

2017-07-27 Thread Thiago Jung Bauermann

Ram Pai  writes:
> @@ -113,10 +117,14 @@ static inline int arch_override_mprotect_pkey(struct 
> vm_area_struct *vma,
>   return 0;
>  }
>
> +extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
> + unsigned long init_val);
>  static inline int arch_set_user_pkey_access(struct task_struct *tsk, int 
> pkey,
>   unsigned long init_val)
>  {
> - return 0;
> + if (!pkey_inited)
> + return -1;
> + return __arch_set_user_pkey_access(tsk, pkey, init_val);
>  }

If non-zero, the return value of this function will be passed to
userspace by the pkey_alloc syscall. Shouldn't it be returning an errno
macro such as -EPERM?

Also, why are there both arch_set_user_pkey_access and
__arch_set_user_pkey_access? Is it a speed optimization so that the
early return is inlined into the caller? Ditto for execute_only_pkey
and __arch_override_mprotect_pkey.

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

Re: blk_mq_sched_insert_request: inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage

2017-07-27 Thread Jens Axboe

On 07/26/2017 11:10 PM, Michael Ellerman wrote:
> Hi Jens,
> 
> I'm seeing the lockdep warning below on shutdown on a Power8 machine
> using IPR.
> 
> If I'm reading it right it looks like the spin_lock() (non-irq) in
> blk_mq_sched_insert_request() is the immediate cause.

All the users of ctx->lock should be from process context.

> Looking at blk_mq_requeue_work() (the caller), it is doing
> spin_lock_irqsave(). So is switching blk_mq_sched_insert_request() to
> spin_lock_irqsave() the right fix?

That's because the requeue lock needs to be IRQ safe. However, the
context allows for just spin_lock_irq() for that lock there, so that
should be fixed up. Not your issue, of course, but we don't need to
save flags there.

> ipr 0001:08:00.0: shutdown
> 
> 
> WARNING: inconsistent lock state
> 4.13.0-rc2-gcc6x-gf74c89b #1 Not tainted
> 
> inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
> swapper/28/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
>  (&(>lock)->rlock){+.?...}, at: [] 
> blk_mq_sched_dispatch_requests+0xa4/0x2a0
> {SOFTIRQ-ON-W} state was registered at:
>   lock_acquire+0xec/0x2e0
>   _raw_spin_lock+0x44/0x70
>   blk_mq_sched_insert_request+0x88/0x1f0
>   blk_mq_requeue_work+0x108/0x180
>   process_one_work+0x310/0x800
>   worker_thread+0x88/0x520
>   kthread+0x164/0x1b0
>   ret_from_kernel_thread+0x5c/0x74
> irq event stamp: 3572314
> hardirqs last  enabled at (3572314): [] 
> _raw_spin_unlock_irqrestore+0x58/0xb0
> hardirqs last disabled at (3572313): [] 
> _raw_spin_lock_irqsave+0x3c/0x90
> softirqs last  enabled at (3572302): [] irq_enter+0x9c/0xe0
> softirqs last disabled at (3572303): [] irq_exit+0x108/0x150
> 
> other info that might help us debug this:
>  Possible unsafe locking scenario:
> 
>CPU0
>
>   lock(&(>lock)->rlock);
>   
> lock(&(>lock)->rlock);
> 
>  *** DEADLOCK ***
> 
> 2 locks held by swapper/28/0:
>  #0:  ((_cmd->timer)){+.-...}, at: [] 
> call_timer_fn+0x10/0x4b0
>  #1:  (rcu_read_lock){..}, at: [] 
> __blk_mq_run_hw_queue+0xa0/0x2c0
> 
> stack backtrace:
> CPU: 28 PID: 0 Comm: swapper/28 Not tainted 4.13.0-rc2-gcc6x-gf74c89b #1
> Call Trace:
> [c01fffe97550] [c0b50818] dump_stack+0xe8/0x160 (unreliable)
> [c01fffe97590] [c01586d0] print_usage_bug+0x2d0/0x390
> [c01fffe97640] [c0158f34] mark_lock+0x7a4/0x8e0
> [c01fffe976f0] [c015a000] __lock_acquire+0x6a0/0x1a70
> [c01fffe97860] [c015befc] lock_acquire+0xec/0x2e0
> [c01fffe97930] [c0b71514] _raw_spin_lock+0x44/0x70
> [c01fffe97960] [c05b60f4] 
> blk_mq_sched_dispatch_requests+0xa4/0x2a0
> [c01fffe979c0] [c05acac0] __blk_mq_run_hw_queue+0x100/0x2c0
> [c01fffe97a00] [c05ad478] __blk_mq_delay_run_hw_queue+0x118/0x130
> [c01fffe97a40] [c05ad61c] blk_mq_start_hw_queues+0x6c/0xa0
> [c01fffe97a80] [c0797aac] scsi_kick_queue+0x2c/0x60
> [c01fffe97aa0] [c0797cf0] scsi_run_queue+0x210/0x360
> [c01fffe97b10] [c079b888] scsi_run_host_queues+0x48/0x80
> [c01fffe97b40] [c07b6090] ipr_ioa_bringdown_done+0x70/0x1e0
> [c01fffe97bc0] [c07bc860] ipr_reset_ioa_job+0x80/0xf0
> [c01fffe97bf0] [c07b4d50] ipr_reset_timer_done+0xd0/0x100
> [c01fffe97c30] [c01937bc] call_timer_fn+0xdc/0x4b0
> [c01fffe97cf0] [c0193d08] expire_timers+0x178/0x330
> [c01fffe97d60] [c01940c8] run_timer_softirq+0xb8/0x120
> [c01fffe97de0] [c0b726a8] __do_softirq+0x168/0x6d8
> [c01fffe97ef0] [c00df2c8] irq_exit+0x108/0x150
> [c01fffe97f10] [c0017bf4] __do_irq+0x2a4/0x4a0
> [c01fffe97f90] [c002da50] call_do_irq+0x14/0x24
> [c007fad93aa0] [c0017e8c] do_IRQ+0x9c/0x140
> [c007fad93af0] [c0008b98] hardware_interrupt_common+0x138/0x140
> --- interrupt: 501 at .L1.42+0x0/0x4
> LR = arch_local_irq_restore.part.4+0x84/0xb0
> [c007fad93de0] [c007ffc1f7d8] 0xc007ffc1f7d8 (unreliable)
> [c007fad93e00] [c0988d3c] cpuidle_enter_state+0x1bc/0x530
> [c007fad93e60] [c01457cc] call_cpuidle+0x4c/0x90
> [c007fad93e80] [c0145b28] do_idle+0x208/0x2f0
> [c007fad93ef0] [c0145f8c] cpu_startup_entry+0x3c/0x50
> [c007fad93f20] [c0042bc0] start_secondary+0x3b0/0x4b0
> [c007fad93f90] [c000ac6c] start_secondary_prolog+0x10/0x14

The bug looks like SCSI running the queue inline from IRQ
context, that's not a good idea. Can you confirm the below works for
you?


diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index f6097b89d5d3..78740ebf966c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -497,7 +497,7 @@ static void scsi_run_queue(struct request_queue *q)
scsi_starved_list_run(sdev->host);
 
if (q->mq_ops)
-   blk_mq_run_hw_queues(q, false);
+

Re: [RFC v6 13/62] powerpc: track allocation status of all pkeys

2017-07-27 Thread Thiago Jung Bauermann


Hello Ram,

I'm still going through the patches and haven't formed a full picture of
the feature in my mind yet, so my comments today won't be particularly
insightful...

But hopefully the comments that I currently have will be helpful anyway.

Ram Pai  writes:
> diff --git a/arch/powerpc/include/asm/pkeys.h 
> b/arch/powerpc/include/asm/pkeys.h
> index 203d7de..09b268e 100644
> --- a/arch/powerpc/include/asm/pkeys.h
> +++ b/arch/powerpc/include/asm/pkeys.h
> @@ -2,21 +2,87 @@
>  #define _ASM_PPC64_PKEYS_H
>
>  extern bool pkey_inited;
> -#define ARCH_VM_PKEY_FLAGS 0
> +#define arch_max_pkey()  32
> +#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | \
> + VM_PKEY_BIT3 | VM_PKEY_BIT4)
> +/*
> + * Bits are in BE format.
> + * NOTE: key 31, 1, 0 are not used.
> + * key 0 is used by default. It give read/write/execute permission.
> + * key 31 is reserved by the hypervisor.
> + * key 1 is recommended to be not used.
> + * PowerISA(3.0) page 1015, programming note.
> + */
> +#define PKEY_INITIAL_ALLOCAION  0xc001

There's a typo in the macro name, should be "ALLOCATION".

-- 
Thiago Jung Bauermann
IBM Linux Technology Center

Re: [PATCH] powerpc/smp: Call smp_ops->setup_cpu() directly on the boot CPU

2017-07-27 Thread Thomas Gleixner

On Thu, 27 Jul 2017, Michael Ellerman wrote:
> In smp_cpus_done() we need to call smp_ops->setup_cpu() for the boot
> CPU, which means it has to run *on* the boot CPU.
> 
> In the past we ensured it ran on the boot CPU by changing the CPU
> affinity mask of current directly. That was removed in commit
> 6d11b87d55eb ("powerpc/smp: Replace open coded task affinity logic"),
> and replaced with a work queue call.
> 
> Unfortunately using a work queue leads to a lockdep warning, now that
> the CPU hotplug lock is a regular semaphore:
> 
>   ==
>   WARNING: possible circular locking dependency detected
>   ...
>   kworker/0:1/971 is trying to acquire lock:
>(cpu_hotplug_lock.rw_sem){++}, at: [] 
> apply_workqueue_attrs+0x34/0xa0
> 
>   but task is already holding lock:
>(()){+.+.+.}, at: [] 
> process_one_work+0x25c/0x800
>   ...
>CPU0CPU1
>
>   lock(());
>lock(cpu_hotplug_lock.rw_sem);
>lock(());
>   lock(cpu_hotplug_lock.rw_sem);
> 
> Although the deadlock can't happen in practice, because
> smp_cpus_done() only runs in early boot before CPU hotplug is allowed,
> lockdep can't tell that.
> 
> Luckily in commit 8fb12156b8db ("init: Pin init task to the boot CPU,
> initially") tglx changed the generic code to pin init to the boot CPU
> to begin with. The unpinning of init from the boot CPU happens in
> sched_init_smp(), which is called after smp_cpus_done().
> 
> So smp_cpus_done() is always called on the boot CPU, which means we
> don't need the work queue call at all - and the lockdep warning goes
> away.
> 
> Signed-off-by: Michael Ellerman 

Reviewed-by: Thomas Gleixner

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-27 Thread Jonathan Cameron

On Thu, 27 Jul 2017 05:49:13 -0700
"Paul E. McKenney"  wrote:

> On Thu, Jul 27, 2017 at 02:34:00PM +1000, Nicholas Piggin wrote:
> > On Wed, 26 Jul 2017 18:42:14 -0700
> > "Paul E. McKenney"  wrote:
> >   
> > > On Wed, Jul 26, 2017 at 04:22:00PM -0700, David Miller wrote:  
> >   
> > > > Indeed, that really wouldn't explain how we end up with a RCU stall
> > > > dump listing almost all of the cpus as having missed a grace period.
> > > 
> > > I have seen stranger things, but admittedly not often.  
> > 
> > So the backtraces show the RCU gp thread in schedule_timeout.
> > 
> > Are you sure that it's timeout has expired and it's not being scheduled,
> > or could it be a bad (large) timeout (looks unlikely) or that it's being
> > scheduled but not correctly noting gps on other CPUs?
> > 
> > It's not in R state, so if it's not being scheduled at all, then it's
> > because the timer has not fired:  
> 
> Good point, Nick!
> 
> Jonathan, could you please reproduce collecting timer event tracing?
I'm a little new to tracing (only started playing with it last week)
so fingers crossed I've set it up right.  No splats yet.  Was getting
splats on reading out the trace when running with the RCU stall timer
set to 4 so have increased that back to the default and am rerunning.

This may take a while.  Correct me if I've gotten this wrong to save time

echo "timer:*" > /sys/kernel/debug/tracing/set_event

when it dumps, just send you the relevant part of what is in
/sys/kernel/debug/tracing/trace?

Thanks,

Jonathan
> 
>   Thanx, Paul
> 
> > [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 c1565 
> > f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1
> > [ 1984.638153] rcu_preempt S0 9  2 0x
> > [ 1984.643626] Call trace:
> > [ 1984.646059] [] __switch_to+0x90/0xa8
> > [ 1984.651189] [] __schedule+0x19c/0x5d8
> > [ 1984.656400] [] schedule+0x38/0xa0
> > [ 1984.661266] [] schedule_timeout+0x124/0x218
> > [ 1984.667002] [] rcu_gp_kthread+0x4fc/0x748
> > [ 1984.672564] [] kthread+0xfc/0x128
> > [ 1984.677429] [] ret_from_fork+0x10/0x50
> >   
>

Re: [RFC Part1 PATCH v3 03/17] x86/mm: Secure Encrypted Virtualization (SEV) support

2017-07-27 Thread Borislav Petkov

On Wed, Jul 26, 2017 at 11:47:32AM -0500, Tom Lendacky wrote:
> If it's made static then the sme_active()/sev_active() inline functions
> would need to be turned into functions within the mem_encrypt.c file. So
> there's a trade-off to do that, which is the better one?

Simple: why do we have functions if the variables are exported?

The reasoning for sme_me_mask is more or less obvious but for sev_enabled...

IOW, either make the bool static and unlinine the function - this way
you're free to change how you determine whether SEV is enabled later as
callers will be using the function.

Or, if it doesn't really matter because you can always change callers
later, simply drop sev_active() the function and use a bool sev_active
everywhere.

> The kernel needs to distinguish between running under SME and running
> under SEV. SME and SEV are similar but not the same. The trampoline code
> is a good example.  Before paging is activated, SME will access all
> memory as decrypted, but SEV will access all memory as encrypted.  So
> when APs are being brought up under SME the trampoline area cannot be
> encrypted, whereas under SEV the trampoline area must be encrypted.

I guess you're sensing by now that we need this clarification in a
comment above it...

:-)

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
--

Re: [RFC Part1 PATCH v3 06/17] x86/mm: Use encrypted access of boot related data with SEV

2017-07-27 Thread Borislav Petkov

On Mon, Jul 24, 2017 at 02:07:46PM -0500, Brijesh Singh wrote:
> From: Tom Lendacky 
> 
> When Secure Encrypted Virtualization (SEV) is active, boot data (such as
> EFI related data, setup data) is encrypted and needs to be accessed as
> such when mapped. Update the architecture override in early_memremap to
> keep the encryption attribute when mapping this data.
> 
> Signed-off-by: Tom Lendacky 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/mm/ioremap.c | 44 
>  1 file changed, 32 insertions(+), 12 deletions(-)

...

> @@ -590,10 +598,15 @@ bool arch_memremap_can_ram_remap(resource_size_t 
> phys_addr, unsigned long size,
>   if (flags & MEMREMAP_DEC)
>   return false;
>  
> - if (memremap_is_setup_data(phys_addr, size) ||
> - memremap_is_efi_data(phys_addr, size) ||
> - memremap_should_map_decrypted(phys_addr, size))
> - return false;
> + if (sme_active()) {
> + if (memremap_is_setup_data(phys_addr, size) ||
> + memremap_is_efi_data(phys_addr, size) ||
> + memremap_should_map_decrypted(phys_addr, size))
> + return false;
> + } else if (sev_active()) {
> + if (memremap_should_map_decrypted(phys_addr, size))
> + return false;
> + }
>  
>   return true;
>  }

I guess this function's hind part can be simplified to:

if (sme_active()) {
if (memremap_is_setup_data(phys_addr, size) ||
memremap_is_efi_data(phys_addr, size))
return false;
}

return ! memremap_should_map_decrypted(phys_addr, size);
}

> @@ -608,15 +621,22 @@ pgprot_t __init 
> early_memremap_pgprot_adjust(resource_size_t phys_addr,
>unsigned long size,
>pgprot_t prot)

And this one in a similar manner...

>  {
> - if (!sme_active())
> + if (!sme_active() && !sev_active())
>   return prot;

... and you don't need that check...

> - if (early_memremap_is_setup_data(phys_addr, size) ||
> - memremap_is_efi_data(phys_addr, size) ||
> - memremap_should_map_decrypted(phys_addr, size))
> - prot = pgprot_decrypted(prot);
> - else
> - prot = pgprot_encrypted(prot);
> + if (sme_active()) {

... if you're going to do it here too.

> + if (early_memremap_is_setup_data(phys_addr, size) ||
> + memremap_is_efi_data(phys_addr, size) ||
> + memremap_should_map_decrypted(phys_addr, size))
> + prot = pgprot_decrypted(prot);
> + else
> + prot = pgprot_encrypted(prot);
> + } else if (sev_active()) {

And here.

> + if (memremap_should_map_decrypted(phys_addr, size))
> + prot = pgprot_decrypted(prot);
> + else
> + prot = pgprot_encrypted(prot);
> + }

-- 
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
--

Re: Possible circular locking dependency detected between cpu_hotplug_lock.rw_sem and wfc.work

2017-07-27 Thread Thomas Gleixner

On Thu, 27 Jul 2017, Thomas Gleixner wrote:
> On Thu, 27 Jul 2017, Michael Ellerman wrote:
> > Thomas Gleixner  writes:
> > 
> > > On Wed, 26 Jul 2017, Michael Ellerman wrote:
> > >
> > >> Hi Thomas,
> > >> 
> > >> I'm seeing the lockdep barf below on some bare metal Power8 machines.
> > >> 
> > >> This seems to be caused by our smp_cpus_done(), which does:
> > >> 
> > >>   void __init smp_cpus_done(unsigned int max_cpus)
> > >>   {
> > >>  /*
> > >>   * We want the setup_cpu() here to be called on the boot CPU, 
> > >> but
> > >>   * init might run on any CPU, so make sure it's invoked on the 
> > >> boot
> > >>   * CPU.
> > >>   */
> > >>  if (smp_ops && smp_ops->setup_cpu)
> > >>  work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, 
> > >> NULL);
> > >> 
> > >> 
> > >> I don't think CPU hotplug can happen at this point, so I don't think
> > >> there's really a bug.
> > >> 
> > >> But it looks like the work_on_cpu_safe() call could just go away, since
> > >> you pinned init to the boot CPU in 8fb12156b8db ("init: Pin init task to
> > >> the boot CPU, initially"). Though I can't see where init is unpinned, so
> > >> maybe we do still need to do it?
> > >
> > > It's undone in sched_init_smp(). So it looks safe. The call order is:
> > >
> > >  smp_init()
> > >   ...
> > >   smp_cpus_done()
> > >
> > >  sched_init_smp()
> > 
> > Great thanks.
> > 
> > Patch on the way.
> 
> Hmm. Second thoughts. The issue is the stability of the CPUs. Surely the
> boot CPU can't go away at that point, but the debug stuff does not know
> about it. maybe I'm missing something 

Ok. Now seeing your patch I know what I was missing :)

Re: Possible circular locking dependency detected between cpu_hotplug_lock.rw_sem and wfc.work

2017-07-27 Thread Thomas Gleixner

On Thu, 27 Jul 2017, Michael Ellerman wrote:
> Thomas Gleixner  writes:
> 
> > On Wed, 26 Jul 2017, Michael Ellerman wrote:
> >
> >> Hi Thomas,
> >> 
> >> I'm seeing the lockdep barf below on some bare metal Power8 machines.
> >> 
> >> This seems to be caused by our smp_cpus_done(), which does:
> >> 
> >>   void __init smp_cpus_done(unsigned int max_cpus)
> >>   {
> >>/*
> >> * We want the setup_cpu() here to be called on the boot CPU, but
> >> * init might run on any CPU, so make sure it's invoked on the boot
> >> * CPU.
> >> */
> >>if (smp_ops && smp_ops->setup_cpu)
> >>work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
> >> 
> >> 
> >> I don't think CPU hotplug can happen at this point, so I don't think
> >> there's really a bug.
> >> 
> >> But it looks like the work_on_cpu_safe() call could just go away, since
> >> you pinned init to the boot CPU in 8fb12156b8db ("init: Pin init task to
> >> the boot CPU, initially"). Though I can't see where init is unpinned, so
> >> maybe we do still need to do it?
> >
> > It's undone in sched_init_smp(). So it looks safe. The call order is:
> >
> >  smp_init()
> > ...
> > smp_cpus_done()
> >
> >  sched_init_smp()
> 
> Great thanks.
> 
> Patch on the way.

Hmm. Second thoughts. The issue is the stability of the CPUs. Surely the
boot CPU can't go away at that point, but the debug stuff does not know
about it. maybe I'm missing something 

Thanks,

tglx

[PATCH] powerpc/smp: Call smp_ops->setup_cpu() directly on the boot CPU

2017-07-27 Thread Michael Ellerman

In smp_cpus_done() we need to call smp_ops->setup_cpu() for the boot
CPU, which means it has to run *on* the boot CPU.

In the past we ensured it ran on the boot CPU by changing the CPU
affinity mask of current directly. That was removed in commit
6d11b87d55eb ("powerpc/smp: Replace open coded task affinity logic"),
and replaced with a work queue call.

Unfortunately using a work queue leads to a lockdep warning, now that
the CPU hotplug lock is a regular semaphore:

  ==
  WARNING: possible circular locking dependency detected
  ...
  kworker/0:1/971 is trying to acquire lock:
   (cpu_hotplug_lock.rw_sem){++}, at: [] 
apply_workqueue_attrs+0x34/0xa0

  but task is already holding lock:
   (()){+.+.+.}, at: [] process_one_work+0x25c/0x800
  ...
   CPU0CPU1
   
  lock(());
   lock(cpu_hotplug_lock.rw_sem);
   lock(());
  lock(cpu_hotplug_lock.rw_sem);

Although the deadlock can't happen in practice, because
smp_cpus_done() only runs in early boot before CPU hotplug is allowed,
lockdep can't tell that.

Luckily in commit 8fb12156b8db ("init: Pin init task to the boot CPU,
initially") tglx changed the generic code to pin init to the boot CPU
to begin with. The unpinning of init from the boot CPU happens in
sched_init_smp(), which is called after smp_cpus_done().

So smp_cpus_done() is always called on the boot CPU, which means we
don't need the work queue call at all - and the lockdep warning goes
away.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/smp.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 997c88d54acf..cf0e1245b8cc 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -1003,21 +1003,13 @@ static struct sched_domain_topology_level 
powerpc_topology[] = {
{ NULL, },
 };
 
-static __init long smp_setup_cpu_workfn(void *data __always_unused)
-{
-   smp_ops->setup_cpu(boot_cpuid);
-   return 0;
-}
-
 void __init smp_cpus_done(unsigned int max_cpus)
 {
/*
-* We want the setup_cpu() here to be called on the boot CPU, but
-* init might run on any CPU, so make sure it's invoked on the boot
-* CPU.
+* We are running pinned to the boot CPU, see rest_init().
 */
if (smp_ops && smp_ops->setup_cpu)
-   work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
+   smp_ops->setup_cpu(boot_cpuid);
 
if (smp_ops && smp_ops->bringup_done)
smp_ops->bringup_done();
-- 
2.7.4

Re: Possible circular locking dependency detected between cpu_hotplug_lock.rw_sem and wfc.work

2017-07-27 Thread Michael Ellerman

Thomas Gleixner  writes:

> On Wed, 26 Jul 2017, Michael Ellerman wrote:
>
>> Hi Thomas,
>> 
>> I'm seeing the lockdep barf below on some bare metal Power8 machines.
>> 
>> This seems to be caused by our smp_cpus_done(), which does:
>> 
>>   void __init smp_cpus_done(unsigned int max_cpus)
>>   {
>>  /*
>>   * We want the setup_cpu() here to be called on the boot CPU, but
>>   * init might run on any CPU, so make sure it's invoked on the boot
>>   * CPU.
>>   */
>>  if (smp_ops && smp_ops->setup_cpu)
>>  work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
>> 
>> 
>> I don't think CPU hotplug can happen at this point, so I don't think
>> there's really a bug.
>> 
>> But it looks like the work_on_cpu_safe() call could just go away, since
>> you pinned init to the boot CPU in 8fb12156b8db ("init: Pin init task to
>> the boot CPU, initially"). Though I can't see where init is unpinned, so
>> maybe we do still need to do it?
>
> It's undone in sched_init_smp(). So it looks safe. The call order is:
>
>  smp_init()
>   ...
>   smp_cpus_done()
>
>  sched_init_smp()

Great thanks.

Patch on the way.

cheers

Re: [PATCH v3 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Michal Hocko

On Thu 27-07-17 11:48:26, Aneesh Kumar K.V wrote:
> For ppc64, we want to call this function when we are not running as guest.

What does this mean?

> Also, if we failed to allocate hugepages, let the user know.
> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  include/linux/hugetlb.h | 1 +
>  mm/hugetlb.c| 5 -
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 0ed8e41aaf11..8bbbd37ab105 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -358,6 +358,7 @@ int huge_add_to_page_cache(struct page *page, struct 
> address_space *mapping,
>   pgoff_t idx);
>  
>  /* arch callback */
> +int __init __alloc_bootmem_huge_page(struct hstate *h);
>  int __init alloc_bootmem_huge_page(struct hstate *h);
>  
>  void __init hugetlb_bad_size(void);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index bc48ee783dd9..a3a7a7e6339e 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct 
> vm_area_struct *vma,
>   return page;
>  }
>  
> -int __weak alloc_bootmem_huge_page(struct hstate *h)
> +int alloc_bootmem_huge_page(struct hstate *h)
> + __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
> +int __alloc_bootmem_huge_page(struct hstate *h)
>  {
>   struct huge_bootmem_page *m;
>   int nr_nodes, node;
> @@ -2104,6 +2106,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
>   goto found;
>   }
>   }
> + pr_info("Failed to allocate hugepage of size %ld\n", huge_page_size(h));
>   return 0;
>  
>  found:
> -- 
> 2.13.3
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 

-- 
Michal Hocko
SUSE Labs

Re: [RFC PATCH 1/3] powerpc/mm: update pmdp_invalidate to return old pmd value

2017-07-27 Thread Kirill A. Shutemov

On Thu, Jul 27, 2017 at 02:54:49PM +0200, Michal Hocko wrote:
> EMISSING_CHANGELOG
> 
> besides that no user actually uses the return value. Please fold this
> into the patch which uses the new functionality.

That's for patchset I'm working on[1].

[1] 
http://lkml.kernel.org/r/20170615145224.66200-1-kirill.shute...@linux.intel.com

-- 
 Kirill A. Shutemov

Re: [RFC PATCH 3/3] mm/hugetlb: Remove pmd_huge_split_prepare

2017-07-27 Thread Michal Hocko

On Thu 27-07-17 14:07:56, Aneesh Kumar K.V wrote:
> Instead of marking the pmd ready for split, invalidate the pmd. This should
> take care of powerpc requirement.

which is?

> Only side effect is that we mark the pmd
> invalid early. This can result in us blocking access to the page a bit longer
> if we race against a thp split.

Again, this doesn't tell me what is the problem and why do we care.

> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/hash-4k.h  |  2 -
>  arch/powerpc/include/asm/book3s/64/hash-64k.h |  2 -
>  arch/powerpc/include/asm/book3s/64/pgtable.h  |  9 
>  arch/powerpc/include/asm/book3s/64/radix.h|  6 ---
>  arch/powerpc/mm/pgtable-hash64.c  | 22 
>  include/asm-generic/pgtable.h |  8 ---
>  mm/huge_memory.c  | 73 
> +--
>  7 files changed, 35 insertions(+), 87 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
> b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> index 0c4e470571ca..7d914f4fc534 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> @@ -100,8 +100,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct 
> vm_area_struct *vma,
>  extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t 
> *pmdp,
>pgtable_t pgtable);
>  extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, 
> pmd_t *pmdp);
> -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -   unsigned long address, pmd_t *pmdp);
>  extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  unsigned long addr, pmd_t *pmdp);
>  extern int hash__has_transparent_hugepage(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
> b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> index 8c8fb6fbdabe..b856e130c678 100644
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -164,8 +164,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct 
> vm_area_struct *vma,
>  extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t 
> *pmdp,
>pgtable_t pgtable);
>  extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, 
> pmd_t *pmdp);
> -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -   unsigned long address, pmd_t *pmdp);
>  extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
>  unsigned long addr, pmd_t *pmdp);
>  extern int hash__has_transparent_hugepage(void);
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index ece6912fae8e..557915792214 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1122,15 +1122,6 @@ static inline pgtable_t 
> pgtable_trans_huge_withdraw(struct mm_struct *mm,
>  extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long 
> address,
>pmd_t *pmdp);
>  
> -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
> -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
> -unsigned long address, pmd_t *pmdp)
> -{
> - if (radix_enabled())
> - return radix__pmdp_huge_split_prepare(vma, address, pmdp);
> - return hash__pmdp_huge_split_prepare(vma, address, pmdp);
> -}
> -
>  #define pmd_move_must_withdraw pmd_move_must_withdraw
>  struct spinlock;
>  static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index 558fea3b2d22..a779a43b643b 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -270,12 +270,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
>   return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
>   return __pmd(pmd_val(pmd) | _PAGE_PTE);
>  }
> -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
> - unsigned long address, pmd_t *pmdp)
> -{
> - /* Nothing to do for radix. */
> - return;
> -}
>  
>  extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, 
> unsigned long addr,
> pmd_t *pmdp, unsigned long clr,
> diff --git a/arch/powerpc/mm/pgtable-hash64.c 
> b/arch/powerpc/mm/pgtable-hash64.c
> index c0a7372bdaa6..00aee1485714 100644
> --- a/arch/powerpc/mm/pgtable-hash64.c
> +++ b/arch/powerpc/mm/pgtable-hash64.c
> @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct 
>

Re: [RFC PATCH 2/3] powerpc/mm: Implement pmdp_establish for ppc64

2017-07-27 Thread Michal Hocko

On Thu 27-07-17 14:07:55, Aneesh Kumar K.V wrote:
> We can now use this to set pmd page table entries to absolute values. THP
> need to ensure that we always update pmd PTE entries such that we never mark
> the pmd none. pmdp_establish helps in implementing that.
> 
> This doesn't flush the tlb. Based on the old_pmd value returned caller can
> decide to call flush_pmd_tlb_range()

_Why_ do we need this. It doesn't really help that the newly added
function is not used so we could check that...

> 
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |  9 ++---
>  arch/powerpc/mm/pgtable-book3s64.c | 10 ++
>  2 files changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
> b/arch/powerpc/include/asm/book3s/64/radix.h
> index cd481ab601b6..558fea3b2d22 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -131,7 +131,8 @@ static inline unsigned long __radix_pte_update(pte_t 
> *ptep, unsigned long clr,
>   do {
>   pte = READ_ONCE(*ptep);
>   old_pte = pte_val(pte);
> - new_pte = (old_pte | set) & ~clr;
> + new_pte = old_pte & ~clr;
> + new_pte |= set;
>  
>   } while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
>  
> @@ -153,9 +154,11 @@ static inline unsigned long radix__pte_update(struct 
> mm_struct *mm,
>  
>   old_pte = __radix_pte_update(ptep, ~0ul, 0);
>   /*
> -  * new value of pte
> +  * new value of pte. We clear all the bits in clr mask
> +  * first and set the bits in set mask.
>*/
> - new_pte = (old_pte | set) & ~clr;
> + new_pte = old_pte & ~clr;
> + new_pte |= set;
>   radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
>   if (new_pte)
>   __radix_pte_update(ptep, 0, new_pte);
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
> b/arch/powerpc/mm/pgtable-book3s64.c
> index 0bb7f824ecdd..7100b0150a2a 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -45,6 +45,16 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
> unsigned long address,
>   return changed;
>  }
>  
> +pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long addr,
> +  pmd_t *pmdp, pmd_t entry)
> +{
> + long pmdval;
> +
> + pmdval = pmd_hugepage_update(vma->vm_mm, addr, pmdp, ~0UL, 
> pmd_val(entry));
> + return __pmd(pmdval);
> +}
> +
> +
>  int pmdp_test_and_clear_young(struct vm_area_struct *vma,
> unsigned long address, pmd_t *pmdp)
>  {
> -- 
> 2.13.3
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 

-- 
Michal Hocko
SUSE Labs

Re: [RFC PATCH 1/3] powerpc/mm: update pmdp_invalidate to return old pmd value

2017-07-27 Thread Michal Hocko

EMISSING_CHANGELOG

besides that no user actually uses the return value. Please fold this
into the patch which uses the new functionality.

On Thu 27-07-17 14:07:54, Aneesh Kumar K.V wrote:
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ++--
>  arch/powerpc/mm/pgtable-book3s64.c   | 9 ++---
>  2 files changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
> b/arch/powerpc/include/asm/book3s/64/pgtable.h
> index 41d484ac0822..ece6912fae8e 100644
> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> @@ -1119,8 +1119,8 @@ static inline pgtable_t 
> pgtable_trans_huge_withdraw(struct mm_struct *mm,
>  }
>  
>  #define __HAVE_ARCH_PMDP_INVALIDATE
> -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long 
> address,
> - pmd_t *pmdp);
> +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long 
> address,
> +  pmd_t *pmdp);
>  
>  #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
>  static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
> b/arch/powerpc/mm/pgtable-book3s64.c
> index 3b65917785a5..0bb7f824ecdd 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
>   * We use this to invalidate a pmdp entry before switching from a
>   * hugepte to regular pmd entry.
>   */
> -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
> -  pmd_t *pmdp)
> +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
> +   pmd_t *pmdp)
>  {
> - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
> + unsigned long old_pmd;
> +
> + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 
> 0);
>   flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>   /*
>* This ensures that generic code that rely on IRQ disabling
>* to prevent a parallel THP split work as expected.
>*/
>   serialize_against_pte_lookup(vma->vm_mm);
> + return __pmd(old_pmd);
>  }
>  
>  static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
> -- 
> 2.13.3
> 
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: mailto:"d...@kvack.org;> em...@kvack.org 

-- 
Michal Hocko
SUSE Labs

New "merge" branch in the powerpc repo

2017-07-27 Thread Michael Ellerman

Hi folks,

tldr: There's a new branch in the powerpc repo called "merge", and if
you're doing work that's targeting the powerpc tree then it is probably
a good base to work on top of.


It's a merge of Linus' master branch, my current fixes branch, and my
current next branch. If we have significant chunks of code in another
tree, eg. akpm's then I may also merge that in, but that will be done on
a case-by-case basis.

Think of it like a mini linux-next, just for powerpc stuff :)

I'll keep it up to date whenever I update either of my branches, and
semi regularly when Linus updates his.

The branch itself will never merge with Linus' tree, but the component
parts will eventually (unless they ever need to be emergency rebased).

It's here:

  https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/log/?h=merge

And also approximately mirrored to here:

  https://github.com/linuxppc/linux/tree/merge


Feel free to ask me here or on IRC (or IBM slack) if you have any
questions.

cheers

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-27 Thread Paul E. McKenney

On Thu, Jul 27, 2017 at 02:34:00PM +1000, Nicholas Piggin wrote:
> On Wed, 26 Jul 2017 18:42:14 -0700
> "Paul E. McKenney"  wrote:
> 
> > On Wed, Jul 26, 2017 at 04:22:00PM -0700, David Miller wrote:
> 
> > > Indeed, that really wouldn't explain how we end up with a RCU stall
> > > dump listing almost all of the cpus as having missed a grace period.  
> > 
> > I have seen stranger things, but admittedly not often.
> 
> So the backtraces show the RCU gp thread in schedule_timeout.
> 
> Are you sure that it's timeout has expired and it's not being scheduled,
> or could it be a bad (large) timeout (looks unlikely) or that it's being
> scheduled but not correctly noting gps on other CPUs?
> 
> It's not in R state, so if it's not being scheduled at all, then it's
> because the timer has not fired:

Good point, Nick!

Jonathan, could you please reproduce collecting timer event tracing?

Thanx, Paul

> [ 1984.628602] rcu_preempt kthread starved for 5663 jiffies! g1566 c1565 f0x0 
> RCU_GP_WAIT_FQS(3) ->state=0x1
> [ 1984.638153] rcu_preempt S0 9  2 0x
> [ 1984.643626] Call trace:
> [ 1984.646059] [] __switch_to+0x90/0xa8
> [ 1984.651189] [] __schedule+0x19c/0x5d8
> [ 1984.656400] [] schedule+0x38/0xa0
> [ 1984.661266] [] schedule_timeout+0x124/0x218
> [ 1984.667002] [] rcu_gp_kthread+0x4fc/0x748
> [ 1984.672564] [] kthread+0xfc/0x128
> [ 1984.677429] [] ret_from_fork+0x10/0x50
>

Re: powerpc/Makefile: Fix ld version check with 64-bit LE-only toolchain

2017-07-27 Thread Michael Ellerman

On Wed, 2017-07-26 at 13:18:31 UTC, Michael Ellerman wrote:
> In commit efe0160cfd40 ("powerpc/64: Linker on-demand sfpr functions
> for modules"), we added an ld version check early in the powerpc
> top-level Makefile.
> 
> Because the Makefile runs before the kernel config is setup, the
> checks for CONFIG_CPU_LITTLE_ENDIAN etc. all take the default case. So
> we end up configuring ld for 32-bit big endian.
> 
> That would be OK, except that for historical (or perhaps no) reason,
> we use 'override LD' to add the endian flags to the LD variable
> itself, rather than the normal approach of adding them to LDFLAGS.
> 
> The end result is that when we check the ld version we run it as:
> 
>   $(CROSS_COMPILE)ld -EB -m elf32ppc --version
> 
> This often works, unless you are using a 64-bit only and/or little
> endian only, toolchain. In which case you see something like:
> 
>   $ make defconfig
>   powerpc64le-linux-ld: unrecognised emulation mode: elf32ppc
>   Supported emulations: elf64lppc elf32lppc elf32lppclinux elf32lppcsim
>   /bin/sh: 1: [: -ge: unexpected operator
> 
> The proper fix is to stop using 'override LD', but that will require a
> fair bit of testing. Instead we can fix it for now just by reordering
> the Makefile to do the version check earlier.
> 
> Fixes: efe0160cfd40 ("powerpc/64: Linker on-demand sfpr functions for 
> modules")
> Signed-off-by: Michael Ellerman 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/b40b2386bce982ad97f3683b2b34e5

cheers

Re: [v6] powerpc/mm/radix: Workaround prefetch issue with KVM

2017-07-27 Thread Michael Ellerman

On Tue, 2017-07-25 at 11:47:42 UTC, Michael Ellerman wrote:
> From: Benjamin Herrenschmidt 
> 
> There's a somewhat architectural issue with Radix MMU and KVM.
> 
> When coming out of a guest with AIL (Alternate Interrupt Location, ie,
> MMU enabled), we start executing hypervisor code with the PID register
> still containing whatever the guest has been using.
...
> 
> Signed-off-by: Benjamin Herrenschmidt 
> [mpe: Rework the asm to build with CONFIG_PPC_RADIX_MMU=n, drop
>   unneeded include of kvm_book3s_asm.h]
> Signed-off-by: Michael Ellerman 

Applied to powerpc fixes.

https://git.kernel.org/powerpc/c/a25bd72badfa793ab5aeafd50dbd9d

cheers

Re: powerpc/pseries: Fix of_node_put() underflow during pseries remove

2017-07-27 Thread Michael Ellerman

On Fri, 2017-07-21 at 14:51:39 UTC, Laurent Vivier wrote:
> As for commit 68baf692c435 ("powerpc/pseries: Fix of_node_put()
> underflow during DLPAR remove"), the call to of_node_put()
> must be removed from pSeries_reconfig_remove_node().
> 
> dlpar_detach_node() and pSeries_reconfig_remove_node() call
> of_detach_node(), and thus the node should not be released
> in this case too.
> 
> Signed-off-by: Laurent Vivier 
> Reviewed-by: David Gibson 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/4fd1bd443e80b12f0a01a45fb9a793

cheers

Re: powerpc/mm/hash: Free the subpage_prot_table correctly

2017-07-27 Thread Michael Ellerman

On Sat, 2017-06-17 at 14:30:55 UTC, "Aneesh Kumar K.V" wrote:
> Fixes: dad6f37c2602e ("powerpc: subpage_protect: Increase the array size to 
> take care of 64TB")
> Signed-off-by: Aneesh Kumar K.V 
> Tested-by: Ram Pai 

Applied to powerpc fixes, thanks.

https://git.kernel.org/powerpc/c/0da12a7a81f1e2255e89dc783c565e

cheers

Re: [v13,1/5] powerpc/platform/powernv: Update IMC OPAL APIs

2017-07-27 Thread Michael Ellerman

On Tue, 2017-07-18 at 21:36:32 UTC, Madhavan Srinivasan wrote:
> In-Memory Collection (IMC) counters are performance monitoring infrastrcuture.
> These counters need special sequence of scoms to init/start/stop which is 
> handled
> by OPAL. And OPAL provides three APIs to init and control these IMC engines.
> 
> OPAL API documentation:
>   
> https://github.com/open-power/skiboot/blob/master/doc/opal-api/opal-imc-counters.rst
> 
> Patch updates the kernel side powernv platform code to support the new OPAL 
> APIs
> 
> Signed-off-by: Hemant Kumar 
> Signed-off-by: Anju T Sudhakar 
> Signed-off-by: Madhavan Srinivasan 

Series applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/28a5db0061014c8afbbb98560cf420

cheers

Re: [v2] powerpc: allow compiling with GENERIC_MSI_IRQ_DOMAIN

2017-07-27 Thread Michael Ellerman

On Mon, 2017-07-17 at 13:12:43 UTC, laurentiu.tu...@nxp.com wrote:
> From: Laurentiu Tudor 
> 
> This allows building powerpc with the GENERIC_MSI_IRQ_DOMAIN
> Kconfig by enabling the asm-generic msi.h in Kbuild. Without
> this, there's a compilation error [1] because powerpc, as most
> arches, doesn't provide an asm/msi.h.
> 
> [1] In file included from ./include/linux/kvm_host.h:20:0,
>  from ./arch/powerpc/include/asm/kvm_ppc.h:30,
>  from arch/powerpc/kernel/dbell.c:20:
> ./include/linux/msi.h:195:21: fatal error: asm/msi.h: No such file or 
> directory
> 
> Signed-off-by: Laurentiu Tudor 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/8f36479d0eda3c078ee4e1ec79ea9a

cheers

Re: powerpc/smp: Get cpu only after validity check

2017-07-27 Thread Michael Ellerman

On Tue, 2017-07-04 at 04:22:46 UTC, Santosh Sivaraj wrote:
> Check for validity of cpu before calling get_hard_smp_processor_id.
> 
> Found with coverity.
> 
> Signed-off-by: Santosh Sivaraj 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/76d98ab4628bf8d36498e8228a5b92

cheers

Re: powerpc: Build fix for non SPARSEMEM_VMEMAP config

2017-07-27 Thread Michael Ellerman

On Wed, 2017-06-28 at 06:09:28 UTC, "Aneesh Kumar K.V" wrote:
> We can use pfn_to_page in realmode for other configs. Hence remove the
> CONFIG_FLATMEM ifdef
> 
> Fixes: 8e0861fa3c4ed (powerpc: Prepare to support kernel handling of IOMMU 
> map/unmap)
> 
> Cc: Alexey Kardashevskiy 
> Signed-off-by: Aneesh Kumar K.V 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/7e7dc66adcf490a619bc3c7763a8ce

cheers

Re: powerpc/ipic: Support edge on IRQ0

2017-07-27 Thread Michael Ellerman

On Sun, 2017-06-25 at 02:39:05 UTC, Scott Wood wrote:
> External IRQ0 has the same capabilities as the other IRQ1-7 and is
> handled by the same register IPIC_SEPNR.  When this register is not
> specified for "ack" in "ipic_info", you cannot configure this IRQ as
> IRQ_TYPE_EDGE_FALLING.  This oversight was probably due to the
> non-contiguous hwirq numbering of IRQ0 in the IPIC.
> 
> Signed-off-by: Jurgen Schindele 
> [scottwood: Cleaned up commit message and posted as a proper patch]
> Signed-off-by: Scott Wood 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/446183e4069e0b62cedfd72ccb90b8

cheers

Re: powerpc/pseries: use memdup_user_nul

2017-07-27 Thread Michael Ellerman

On Sat, 2017-04-29 at 01:45:15 UTC, Geliang Tang wrote:
> Use memdup_user_nul() helper instead of open-coding to simplify the code.
> 
> Signed-off-by: Geliang Tang 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/3783225130f01ea86fc0ee477a0e72

cheers

Re: powerpc/powernv: use memdup_user

2017-07-27 Thread Michael Ellerman

On Sat, 2017-04-29 at 01:45:14 UTC, Geliang Tang wrote:
> Use memdup_user() helper instead of open-coding to simplify the code.
> 
> Signed-off-by: Geliang Tang 

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/5588b29a5cd34aec747202e95f328a

cheers

Re: KVM guests freeze under upstream kernel

2017-07-27 Thread Michael Ellerman

Suraj Jitindar Singh  writes:
>
...
> kernel BUG at 
> /scratch/surajjs/linux/arch/powerpc/include/asm/book3s/64/radix.h:260!

Next thing to try would be something like below.

cheers

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415e283c..c749a757738e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1016,6 +1016,7 @@ static inline unsigned long
 pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp,
unsigned long clr, unsigned long set)
 {
+   BUG_ON(set & PAGE_DEVMAP);
if (radix_enabled())
return radix__pmd_hugepage_update(mm, addr, pmdp, clr, set);
return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set);
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 31eed8fa8e99..55c443a3dd5b 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -31,6 +31,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
unsigned long address,
  pmd_t *pmdp, pmd_t entry, int dirty)
 {
int changed;
+   BUG_ON(pmd & PAGE_DEVMAP);
 #ifdef CONFIG_DEBUG_VM
WARN_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
assert_spin_locked(>vm_mm->page_table_lock);
@@ -56,6 +57,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 void set_pmd_at(struct mm_struct *mm, unsigned long addr,
pmd_t *pmdp, pmd_t pmd)
 {
+   BUG_ON(pmd & PAGE_DEVMAP);
 #ifdef CONFIG_DEBUG_VM
WARN_ON(pte_present(pmd_pte(*pmdp)) && !pte_protnone(pmd_pte(*pmdp)));
assert_spin_locked(>page_table_lock);

Re: [RFC PATCH 3/3] mm/hugetlb: Remove pmd_huge_split_prepare

2017-07-27 Thread Aneesh Kumar K.V




On 07/27/2017 02:07 PM, Aneesh Kumar K.V wrote:


diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 8c8fb6fbdabe..b856e130c678 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -164,8 +164,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct 
vm_area_struct *vma,
  extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t 
*pmdp,
 pgtable_t pgtable);
  extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, 
pmd_t *pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,

@@ -1956,14 +1956,39 @@ static void __split_huge_pmd_locked(struct 
vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}







+*/
+   old_pmd = pmdp_invalidate(vma, haddr, pmd);
+
+   page = pmd_page(old_pmd);
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
-   write = pmd_write(*pmd);
-   young = pmd_young(*pmd);
-   soft_dirty = pmd_soft_dirty(*pmd);
-
-   pmdp_huge_split_prepare(vma, haddr, pmd);
+   write = pmd_write(old_pmd);
+   young = pmd_young(old_pmd);
+   dirty = pmd_dirty(*pmd);


This should be

dirty = pmd_dirty(old_pmd);



+   soft_dirty = pmd_soft_dirty(old_pmd);
+   /*
+* withdraw the table only after we mark the pmd entry invalid
+*/
pgtable = pgtable_trans_huge_withdraw(mm, pmd);


-aneesh

[RFC PATCH 3/3] mm/hugetlb: Remove pmd_huge_split_prepare

2017-07-27 Thread Aneesh Kumar K.V

Instead of marking the pmd ready for split, invalidate the pmd. This should
take care of powerpc requirement. Only side effect is that we mark the pmd
invalid early. This can result in us blocking access to the page a bit longer
if we race against a thp split.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  2 -
 arch/powerpc/include/asm/book3s/64/hash-64k.h |  2 -
 arch/powerpc/include/asm/book3s/64/pgtable.h  |  9 
 arch/powerpc/include/asm/book3s/64/radix.h|  6 ---
 arch/powerpc/mm/pgtable-hash64.c  | 22 
 include/asm-generic/pgtable.h |  8 ---
 mm/huge_memory.c  | 73 +--
 7 files changed, 35 insertions(+), 87 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..7d914f4fc534 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -100,8 +100,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct 
vm_area_struct *vma,
 extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 pgtable_t pgtable);
 extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t 
*pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
 extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 8c8fb6fbdabe..b856e130c678 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -164,8 +164,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct 
vm_area_struct *vma,
 extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
 pgtable_t pgtable);
 extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t 
*pmdp);
-extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmdp);
 extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pmd_t *pmdp);
 extern int hash__has_transparent_hugepage(void);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index ece6912fae8e..557915792214 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1122,15 +1122,6 @@ static inline pgtable_t 
pgtable_trans_huge_withdraw(struct mm_struct *mm,
 extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 pmd_t *pmdp);
 
-#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
-static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
-  unsigned long address, pmd_t *pmdp)
-{
-   if (radix_enabled())
-   return radix__pmdp_huge_split_prepare(vma, address, pmdp);
-   return hash__pmdp_huge_split_prepare(vma, address, pmdp);
-}
-
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 558fea3b2d22..a779a43b643b 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -270,12 +270,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
return __pmd(pmd_val(pmd) | _PAGE_PTE);
 }
-static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-   unsigned long address, pmd_t *pmdp)
-{
-   /* Nothing to do for radix. */
-   return;
-}
 
 extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned 
long addr,
  pmd_t *pmdp, unsigned long clr,
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index c0a7372bdaa6..00aee1485714 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct 
mm_struct *mm, pmd_t *pmdp)
return pgtable;
 }
 
-void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma,
-  unsigned long address, pmd_t *pmdp)
-{
-   VM_BUG_ON(address & ~HPAGE_PMD_MASK);
-   VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
-

[RFC PATCH 2/3] powerpc/mm: Implement pmdp_establish for ppc64

2017-07-27 Thread Aneesh Kumar K.V

We can now use this to set pmd page table entries to absolute values. THP
need to ensure that we always update pmd PTE entries such that we never mark
the pmd none. pmdp_establish helps in implementing that.

This doesn't flush the tlb. Based on the old_pmd value returned caller can
decide to call flush_pmd_tlb_range()

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/radix.h |  9 ++---
 arch/powerpc/mm/pgtable-book3s64.c | 10 ++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index cd481ab601b6..558fea3b2d22 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -131,7 +131,8 @@ static inline unsigned long __radix_pte_update(pte_t *ptep, 
unsigned long clr,
do {
pte = READ_ONCE(*ptep);
old_pte = pte_val(pte);
-   new_pte = (old_pte | set) & ~clr;
+   new_pte = old_pte & ~clr;
+   new_pte |= set;
 
} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
@@ -153,9 +154,11 @@ static inline unsigned long radix__pte_update(struct 
mm_struct *mm,
 
old_pte = __radix_pte_update(ptep, ~0ul, 0);
/*
-* new value of pte
+* new value of pte. We clear all the bits in clr mask
+* first and set the bits in set mask.
 */
-   new_pte = (old_pte | set) & ~clr;
+   new_pte = old_pte & ~clr;
+   new_pte |= set;
radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr);
if (new_pte)
__radix_pte_update(ptep, 0, new_pte);
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 0bb7f824ecdd..7100b0150a2a 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -45,6 +45,16 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
unsigned long address,
return changed;
 }
 
+pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long addr,
+pmd_t *pmdp, pmd_t entry)
+{
+   long pmdval;
+
+   pmdval = pmd_hugepage_update(vma->vm_mm, addr, pmdp, ~0UL, 
pmd_val(entry));
+   return __pmd(pmdval);
+}
+
+
 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
  unsigned long address, pmd_t *pmdp)
 {
-- 
2.13.3

[RFC PATCH 1/3] powerpc/mm: update pmdp_invalidate to return old pmd value

2017-07-27 Thread Aneesh Kumar K.V

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ++--
 arch/powerpc/mm/pgtable-book3s64.c   | 9 ++---
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 41d484ac0822..ece6912fae8e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1119,8 +1119,8 @@ static inline pgtable_t 
pgtable_trans_huge_withdraw(struct mm_struct *mm,
 }
 
 #define __HAVE_ARCH_PMDP_INVALIDATE
-extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-   pmd_t *pmdp);
+extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+pmd_t *pmdp);
 
 #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
 static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 3b65917785a5..0bb7f824ecdd 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm)
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
  */
-void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
-pmd_t *pmdp)
+pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pmd_t *pmdp)
 {
-   pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+   unsigned long old_pmd;
+
+   old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 
0);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
/*
 * This ensures that generic code that rely on IRQ disabling
 * to prevent a parallel THP split work as expected.
 */
serialize_against_pte_lookup(vma->vm_mm);
+   return __pmd(old_pmd);
 }
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
-- 
2.13.3

Re: [PATCH 1/6] powerpc: Free up four 64K PTE bits in 4K backed HPTE pages

2017-07-27 Thread Ram Pai

On Thu, Jul 27, 2017 at 07:29:32AM +0530, Aneesh Kumar K.V wrote:
> 
> 
> On 07/26/2017 09:36 PM, Ram Pai wrote:
> >On Wed, Jul 26, 2017 at 04:05:48PM +0530, Aneesh Kumar K.V wrote:
> >>Ram Pai  writes:
> >>
> 
> >>>diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
> >>>b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> >>>index 9732837..62e580c 100644
> >>>--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> >>>+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> >>>@@ -12,18 +12,14 @@
> >>>   */
> >>>  #define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */
> >>>  #define H_PAGE_4K_PFN_RPAGE_RPN1 /* PFN is for a single 4k page */
> >>>+#define H_PAGE_BUSY   _RPAGE_RPN42 /* software: PTE & hash are 
> >>>busy */
> >>
> >>
> >>Why are we moving H_PAGE_BUSY. Right now 4k and 64k linux page table
> >>format looks similar.
> >
> >The goal is to clear off all the _RPAGE_RSV* bits so that they can be
> >used for protection keys.  the aim is to keep the protection-bits in the
> >_RPAGE_RSV* bits, so that they will work as-is whenever radix MMU enables
> >protection keys.
> >
> >Yes this makes the PTE format differ from 4k PTE. Hopefully it is a
> >small inconvenience. The PTE format for 4K is anyway not exactly the
> >same compared to 64K PTE format. For example, higher RPN bits are
> >used on 4K but not on 64k. lower RPN bits are used on 64k but not
> >on 4k.
> I was wondering why in this patch ? You do in the next patch

True. because in this patch, we have not yet freed up bit
_RPAGE_RPN44. _RPAGE_RPN44 bit is still used by H_PAGE_F_GIX for 64K
backed HPTEs.  Hence I have temporarily parked H_PAGE_BUSY at
_RPAGE_RPN42.

I could leave H_PAGE_BUSY at bit _RPAGE_RSV1  and move it to 
_RPAGE_RPN44 in the next patch. But by doing so, i would have not
truely released bit _RPAGE_RSV1 for 4K backed hptes; as claimed in the title
of this patch  

> 
> --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> @@ -12,7 +12,7 @@
>   */
>  #define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */
>  #define H_PAGE_4K_PFN_RPAGE_RPN1 /* PFN is for a single 4k page */
> -#define H_PAGE_BUSY  _RPAGE_RPN42 /* software: PTE & hash are busy */
> +#define H_PAGE_BUSY  _RPAGE_RPN44 /* software: PTE & hash are busy */
> 
...
-- 
Ram Pai

RE: [RFC Part1 PATCH v3 13/17] x86/io: Unroll string I/O when SEV is active

2017-07-27 Thread David Laight

From: Brijesh Singh
> Sent: 26 July 2017 21:07
...
> I am not sure if I understand your concern.
> 
> Are you commenting on amount of code duplication ? If so, I can certainly 
> improve
> and use the similar macro used into header file to generate the functions 
> body.

If you are careful the real functions could expand the inline functions
that get used when SEV is compiled out.

Oh, if you are looking at this, can you fix memcpy_to_io()
so that it is never 'rep movsb'?

David

Re: RCU lockup issues when CONFIG_SOFTLOCKUP_DETECTOR=n - any one else seeing this?

2017-07-27 Thread Jonathan Cameron

On Wed, 26 Jul 2017 18:13:12 +0100
Jonathan Cameron  wrote:

> On Wed, 26 Jul 2017 09:54:32 -0700
> David Miller  wrote:
> 
> > From: "Paul E. McKenney" 
> > Date: Wed, 26 Jul 2017 08:49:00 -0700
> >   
> > > On Wed, Jul 26, 2017 at 04:33:40PM +0100, Jonathan Cameron wrote:
> > >> Didn't leave it long enough. Still bad on 4.10-rc7 just took over
> > >> an hour to occur.
> > > 
> > > And it is quite possible that SOFTLOCKUP_DETECTOR=y and HZ_PERIODIC=y
> > > are just greatly reducing the probability of the problem rather than
> > > completely preventing it.
> > > 
> > > Still, hopefully useful information, thank you for the testing!
> 
> Not sure it actually gives us much information, but no issues yet
> with a simple program running every cpu that wakes up every 3 seconds.
> 
> Will leave it running overnight and report back in the morning.
Perhaps unsurprisingly the above test didn't show any splats.

So it appears a userspace wakeup is enough to stop the issue happening
(or at least make it a lot less likely).

Jonathan
> 
> > 
> > I guess that invalidates my idea to test reverting recent changes to
> > the tick-sched.c code... :-/
> > 
> > In NO_HZ_IDLE mode, what is really supposed to happen on a completely
> > idle system?
> > 
> > All the cpus enter the idle loop, have no timers programmed, and they
> > all just go to sleep until an external event happens.
> > 
> > What ensures that grace periods get processed in this regime?  
> ___
> linuxarm mailing list
> linux...@huawei.com
> http://rnd-openeuler.huawei.com/mailman/listinfo/linuxarm

Re: KVM guests freeze under upstream kernel

2017-07-27 Thread Suraj Jitindar Singh

On Thu, 2017-07-27 at 13:14 +1000, Michael Ellerman wrote:
> jos...@linux.vnet.ibm.com writes:
> > On Thu, Jul 20, 2017 at 10:18:18PM -0300, jos...@linux.vnet.ibm.com
> >  wrote:
> > > On Thu, Jul 20, 2017 at 03:21:59PM +1000, Paul Mackerras wrote:
> > > > 
> > > > Did you check the host kernel logs for any oops messages?
> > > 
> > > dmesg was clean but after sometime waiting (I forgot QEMU running
> > > in
> > > another terminal) I got the oops below (after rebooting the host
> > > I 
> > > couldn't reproduce it again).
> > > 
> > > Another test that I did was:
> > > Compile with transparent huge pages disabled: KVM works fine
> > > Compile with transparent huge pages enabled: doesn't work
> > >   + disabling it in /sys/kernel/mm/transparent_hugepage: doesn't
> > > work
> > > 
> > > Just out of my own curiosity I made this small change:
> > > 
> > > diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h
> > > b/arch/powerpc/include
> > > index c0737c8..f94a3b6 100644
> > > --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
> > > +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
> > > @@ -80,7 +80,7 @@
> > >  
> > >   #define _PAGE_SOFT_DIRTY   _RPAGE_SW3 /* software: software
> > > dirty
> > >   tracking 
> > >    #define _PAGE_SPECIAL  _RPAGE_SW2 /* software: special
> > > page */
> > >    -#define _PAGE_DEVMAP   _RPAGE_SW1 /* software:
> > > ZONE_DEVICE page */
> > >    +#define _PAGE_DEVMAP   _RPAGE_RSV3
> > > #define __HAVE_ARCH_PTE_DEVMAP
> > > 
> > > and it works. I chose _RPAGE_RSV3 because it uses the same value
> > > that
> > > x86 uses (0x0400UL) but I don't if it could have any
> > > side
> > > effect
> > > 
> > 
> > Does this change make any sense to you people?
> 
> No :)
> 
> I think it's just hiding the bug somehow. Presumably we have some
> code
> somewhere that is getting confused by _RPAGE_SW1 being set, or
> setting
> that bit incorrectly.

kernel BUG at 
/scratch/surajjs/linux/arch/powerpc/include/asm/book3s/64/radix.h:260!
Oops: Exception in kernel mode, sig: 5 [#1]
SMP NR_CPUS=2048 
NUMA 
PowerNV
Modules linked in:
CPU: 3 PID: 2050 Comm: qemu-system-ppc Not tainted 
4.13.0-rc2-1-g2f3013c-dirty #1
task: c00f1ebc task.stack: c00f1ec0
NIP: c0070fd4 LR: c00e2120 CTR: c00e20d0
REGS: c00f1ec036b0 TRAP: 0700   Not tainted  
(4.13.0-rc2-1-g2f3013c-dirty)
MSR: 9282b033 
  CR: 22244824  XER: 
CFAR: c0070e74 SOFTE: 1 
GPR00: 0009 c00f1ec03930 c1067400 19cf0a05 
GPR04: c000 050acf190f80 0005 0800 
GPR08: 0015 800f19cf0a05 c00f1eb64368 0009 
GPR12: 0009 cfd80f00 c00f1eca7a30 4000 
GPR16: 5f9f1780 40002000 7fff5fff 7fff879700a6 
GPR20: 8108 c110bce0 0f61 c00e20d0 
GPR24:  c00f1c7a6008 7fff6f60 7fff5fff 
GPR28: c00f19fd 0da0  c00f1ec03990 
NIP [c0070fd4] __find_linux_pte_or_hugepte+0x1d4/0x350
LR [c00e2120] kvm_unmap_radix+0x50/0x1d0
Call Trace:
[c00f1ec03930] [c00b2554] mark_page_dirty+0x34/0xa0 (unreliable)
[c00f1ec03970] [c00e2120] kvm_unmap_radix+0x50/0x1d0
[c00f1ec039c0] [c00dbea0] kvm_handle_hva_range+0x100/0x170
[c00f1ec03a30] [c00df43c] kvm_unmap_hva_range_hv+0x6c/0x80
[c00f1ec03a70] [c00c7588] kvm_unmap_hva_range+0x48/0x60
[c00f1ec03ab0] [c00bb77c] 
kvm_mmu_notifier_invalidate_range_start+0x8c/0x130
[c00f1ec03b10] [c0316f10] 
__mmu_notifier_invalidate_range_start+0xa0/0xf0
[c00f1ec03b60] [c02e95f0] change_protection+0x840/0xe20
[c00f1ec03cb0] [c0313050] change_prot_numa+0x50/0xd0
[c00f1ec03d00] [c0143f24] task_numa_work+0x2b4/0x3b0
[c00f1ec03dc0] [c0128738] task_work_run+0xf8/0x160
[c00f1ec03e00] [c001db94] do_notify_resume+0xe4/0xf0
[c00f1ec03e30] [c000b744] ret_from_except_lite+0x70/0x74
Instruction dump:
419e00ec 6000 78a70022 54a9403e 50a9c00e 54e3403e 50a9c42e 50e3c00e 
50e3c42e 792907c6 7d291b78 55270528 <0b07> 3ce04000 3c804000 78e707c6 
---[ end trace aecf406c356566bb ]---


The bug on added was:

arch/powerpc/include/asm/book3s/64/radix.h:260:
258 static inline int radix__pmd_trans_huge(pmd_t pmd)
259 {
260 BUG_ON(pmd_val(pmd) & _PAGE_DEVMAP);
261 return (pmd_val(pmd) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE;
262 }

> 
> cheers

[PATCH v3 3/3] powerpc/mm/cxl: Add the fault handling cpu to mm cpumask

2017-07-27 Thread Aneesh Kumar K.V

We use mm cpumask for serializing against lockless page table walk. Anybody
who is doing a lockless page table walk is expected to disable irq and only
cpus in mm cpumask is expected do the lockless walk. This ensure that
a THP split can send IPI to only cpus in the mm cpumask, to make sure there
are no parallel lockless page table walk.

Add the CAPI fault handling cpu to the mm cpumask so that we can do the lockless
page table walk while inserting hash page table entries.

Reviewed-by: Frederic Barrat 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/pgtable-book3s64.c | 10 +-
 drivers/misc/cxl/fault.c   |  6 ++
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 57b947cde2bf..3b65917785a5 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -83,15 +83,7 @@ static void do_nothing(void *unused)
 void serialize_against_pte_lookup(struct mm_struct *mm)
 {
smp_mb();
-   /*
-* Cxl fault handling requires us to do a lockless page table
-* walk while inserting hash page table entry with mm tracked
-* in cxl context. Hence we need to do a global flush.
-*/
-   if (cxl_ctx_in_use())
-   smp_call_function(do_nothing, NULL, 1);
-   else
-   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
 }
 
 /*
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 6eed7d03e2b5..ab507e4ed69b 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -138,6 +138,12 @@ int cxl_handle_mm_fault(struct mm_struct *mm, u64 dsisr, 
u64 dar)
int result;
unsigned long access, flags, inv_flags = 0;
 
+   /*
+* Add the fault handling cpu to task mm cpumask so that we
+* can do a safe lockless page table walk when inserting the
+* hash page table entry.
+*/
+   cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
if ((result = copro_handle_mm_fault(mm, dar, dsisr, ))) {
pr_devel("copro_handle_mm_fault failed: %#x\n", result);
return result;
-- 
2.13.3

[PATCH v3 2/3] powerpc/mm: Don't send IPI to all cpus on THP updates

2017-07-27 Thread Aneesh Kumar K.V

Now that we made sure that lockless walk of linux page table is mostly limitted
to current task(current->mm->pgdir) we can update the THP update sequence to
only send IPI to cpus on which this task has run. This helps in reducing the IPI
overload on systems with large number of CPUs.

W.r.t kvm even though kvm is walking page table with vpc->arch.pgdir, it is
done only on secondary cpus and in that case we have primary cpu added to
task's mm cpumask. Sending an IPI to primary will force the secondary to do
a vm exit and hence this mm cpumask usage is safe here.

W.r.t CAPI, we still end up walking linux page table with capi context MM. For
now the pte lookup serialization sends an IPI to all cpus in CPI is in use. We
can further improve this by adding the CAPI interrupt handling cpu to task
mm cpumask. That will be done in a later patch.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  1 +
 arch/powerpc/mm/pgtable-book3s64.c   | 32 +++-
 arch/powerpc/mm/pgtable-hash64.c |  8 +++
 arch/powerpc/mm/pgtable-radix.c  |  8 +++
 4 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index d1da415e283c..f349f5388af6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1159,6 +1159,7 @@ static inline bool arch_needs_pgtable_deposit(void)
return false;
return true;
 }
+extern void serialize_against_pte_lookup(struct mm_struct *mm);
 
 
 static inline pmd_t pmd_mkdevmap(pmd_t pmd)
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 31eed8fa8e99..57b947cde2bf 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -9,6 +9,7 @@
 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -64,6 +65,35 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
trace_hugepage_set_pmd(addr, pmd_val(pmd));
return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
+
+static void do_nothing(void *unused)
+{
+
+}
+/*
+ * Serialize against find_current_mm_pte which does lock-less
+ * lookup in page tables with local interrupts disabled. For huge pages
+ * it casts pmd_t to pte_t. Since format of pte_t is different from
+ * pmd_t we want to prevent transit from pmd pointing to page table
+ * to pmd pointing to huge page (and back) while interrupts are disabled.
+ * We clear pmd to possibly replace it with page table pointer in
+ * different code paths. So make sure we wait for the parallel
+ * find_current_mm_pte to finish.
+ */
+void serialize_against_pte_lookup(struct mm_struct *mm)
+{
+   smp_mb();
+   /*
+* Cxl fault handling requires us to do a lockless page table
+* walk while inserting hash page table entry with mm tracked
+* in cxl context. Hence we need to do a global flush.
+*/
+   if (cxl_ctx_in_use())
+   smp_call_function(do_nothing, NULL, 1);
+   else
+   smp_call_function_many(mm_cpumask(mm), do_nothing, NULL, 1);
+}
+
 /*
  * We use this to invalidate a pmdp entry before switching from a
  * hugepte to regular pmd entry.
@@ -77,7 +107,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned 
long address,
 * This ensures that generic code that rely on IRQ disabling
 * to prevent a parallel THP split work as expected.
 */
-   kick_all_cpus_sync();
+   serialize_against_pte_lookup(vma->vm_mm);
 }
 
 static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot)
diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c
index 443a2c66a304..c0a7372bdaa6 100644
--- a/arch/powerpc/mm/pgtable-hash64.c
+++ b/arch/powerpc/mm/pgtable-hash64.c
@@ -239,7 +239,7 @@ pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, 
unsigned long addres
 * by sending an IPI to all the cpus and executing a dummy
 * function there.
 */
-   kick_all_cpus_sync();
+   serialize_against_pte_lookup(vma->vm_mm);
/*
 * Now invalidate the hpte entries in the range
 * covered by pmd. This make sure we take a
@@ -380,16 +380,16 @@ pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm,
 */
memset(pgtable, 0, PTE_FRAG_SIZE);
/*
-* Serialize against find_linux_pte_or_hugepte which does lock-less
+* Serialize against find_current_mm_pte variants which does lock-less
 * lookup in page tables with local interrupts disabled. For huge pages
 * it casts pmd_t to pte_t. Since format of pte_t is different from
 * pmd_t we want to prevent transit from pmd pointing to page table
 * to pmd pointing to huge page (and back) while interrupts are 
disabled.
 * We clear pmd to possibly

[PATCH v3 1/3] powerpc/mm: Rename find_linux_pte_or_hugepte

2017-07-27 Thread Aneesh Kumar K.V

Add newer helpers to make the function usage simpler. It is always recommended
to use find_current_mm_pte() for walking the page table. If we cannot use
find_current_mm_pte(), it should be documented why the said usage of
__find_linux_pte() is safe against a parallel THP split.

For now we have KVM code using __find_linux_pte(). This is because kvm code ends
up calling __find_linux_pte() in real mode with MSR_EE=0 but with PACA 
soft_enabled =
1. We may want to fix that later and make sure we keep the MSR_EE and  PACA
soft_enabled in sync. When we do that we can switch kvm to use find_linux_pte().

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/pgtable.h | 10 +
 arch/powerpc/include/asm/pte-walk.h| 38 ++
 arch/powerpc/kernel/eeh.c  |  4 ++--
 arch/powerpc/kernel/io-workarounds.c   |  5 +++--
 arch/powerpc/kvm/book3s_64_mmu_hv.c|  5 +++--
 arch/powerpc/kvm/book3s_64_mmu_radix.c | 28 -
 arch/powerpc/kvm/book3s_64_vio_hv.c| 12 ++-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c| 18 
 arch/powerpc/kvm/e500_mmu_host.c   |  3 ++-
 arch/powerpc/mm/hash_utils_64.c|  5 +++--
 arch/powerpc/mm/hugetlbpage.c  | 24 -
 arch/powerpc/mm/tlb_hash64.c   |  6 --
 arch/powerpc/perf/callchain.c  |  3 ++-
 13 files changed, 106 insertions(+), 55 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pte-walk.h

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index afae9a336136..eb9d57defb75 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -66,16 +66,8 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, 
unsigned long addr,
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_large(pmd) 0
 #endif
-pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-  bool *is_thp, unsigned *shift);
-static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
-  bool *is_thp, unsigned *shift)
-{
-   VM_WARN(!arch_irqs_disabled(),
-   "%s called with irq enabled\n", __func__);
-   return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
-}
 
+/* can we use this in kvm */
 unsigned long vmalloc_to_phys(void *vmalloc_addr);
 
 void pgtable_cache_add(unsigned shift, void (*ctor)(void *));
diff --git a/arch/powerpc/include/asm/pte-walk.h 
b/arch/powerpc/include/asm/pte-walk.h
new file mode 100644
index ..3a5a391a4c6d
--- /dev/null
+++ b/arch/powerpc/include/asm/pte-walk.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_POWERPC_PTE_WALK_H
+#define _ASM_POWERPC_PTE_WALK_H
+
+#ifndef __ASSEMBLY__
+#include 
+
+/* Don't use this directly */
+extern pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
+  bool *is_thp, unsigned *hshift);
+
+static inline pte_t *find_linux_pte(pgd_t *pgdir, unsigned long ea,
+   bool *is_thp, unsigned *hshift)
+{
+   VM_WARN(!arch_irqs_disabled(),
+   "%s called with irq enabled\n", __func__);
+   return __find_linux_pte(pgdir, ea, is_thp, hshift);
+}
+
+static inline pte_t *find_init_mm_pte(unsigned long ea, unsigned *hshift)
+{
+   pgd_t *pgdir = init_mm.pgd;
+   return __find_linux_pte(pgdir, ea, NULL, hshift);
+}
+/*
+ * This is what we should always use. Any other lockless page table lookup 
needs
+ * careful audit against THP split.
+ */
+static inline pte_t *find_current_mm_pte(pgd_t *pgdir, unsigned long ea,
+bool *is_thp, unsigned *hshift)
+{
+   VM_WARN(!arch_irqs_disabled(),
+   "%s called with irq enabled\n", __func__);
+   VM_WARN(pgdir != current->mm->pgd,
+   "%s lock less page table lookup called on wrong mm\n", 
__func__);
+   return __find_linux_pte(pgdir, ea, is_thp, hshift);
+}
+#endif
+#endif
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 63992b2d8e15..5e6887c40528 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 /** Overview:
@@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long 
token)
 * worried about _PAGE_SPLITTING/collapse. Also we will not hit
 * page table free, because of init_mm.
 */
-   ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token,
-  NULL, _shift);
+   ptep = find_init_mm_pte(token, _shift);
if (!ptep)
return token;
WARN_ON(hugepage_shift);
diff --git a/arch/powerpc/kernel/io-workarounds.c 
b/arch/powerpc/kernel/io-workarounds.c
index a582e0d42525..bbe85f5aea71 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++

[PATCH v3 3/3] powerpc/mm/hugetlb: Allow runtime allocation of 16G.

2017-07-27 Thread Aneesh Kumar K.V

We now have GIGANTIC_PAGE on powerpc. Currently this is enabled only on
radix with 1G as gigantic hugepage size. Enable this with hash translation mode
too (ie, with 16G hugepage size). Depending on the total system memory we may
be able to allocate 16G hugepage size. This bring parity between radix and hash
translation mode. Also reduce the confusion of the user with respect to
hugetlbfs usage.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hugetlb.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 5c28bd6f2ae1..2d1ca488ca44 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -54,9 +54,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct 
vm_area_struct *vma,
 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
 static inline bool gigantic_page_supported(void)
 {
-   if (radix_enabled())
-   return true;
-   return false;
+   return true;
 }
 #endif
 
-- 
2.13.3

[PATCH v3 2/3] powerpc/mm/hugetlb: Add support for reserving gigantic huge pages via kernel command line

2017-07-27 Thread Aneesh Kumar K.V

With commit aa888a74977a8 ("hugetlb: support larger than MAX_ORDER") we added
support for allocating gigantic hugepages via kernel command line. Switch
ppc64 arch specific code to use that.

W.r.t FSL support, we now limit our allocation range using 
BOOTMEM_ALLOC_ACCESSIBLE.

We use the kernel command line to do reservation of hugetlb pages on powernv
platforms. On pseries hash mmu mode the supported gigantic huge page size is
16GB and that can only be allocated with hypervisor assist. For pseries the
command line option doesn't do the allocation. Instead pseries does gigantic
hugepage allocation based on hypervisor hint that is specified via
"ibm,expected#pages" property of the memory node.

Cc: Scott Wood 
Cc: Christophe Leroy 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   2 +-
 arch/powerpc/include/asm/hugetlb.h|  14 --
 arch/powerpc/kernel/setup-common.c|   7 -
 arch/powerpc/mm/hash_utils_64.c   |   2 +-
 arch/powerpc/mm/hugetlbpage.c | 177 +++---
 arch/powerpc/mm/init_32.c |   2 -
 6 files changed, 22 insertions(+), 182 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 6981a52b3887..f28d21c69f79 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -468,7 +468,7 @@ extern int htab_bolt_mapping(unsigned long vstart, unsigned 
long vend,
 int psize, int ssize);
 int htab_remove_mapping(unsigned long vstart, unsigned long vend,
int psize, int ssize);
-extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages);
+extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long 
number_of_pages);
 extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr);
 
 #ifdef CONFIG_PPC_PSERIES
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index 7f4025a6c69e..b8a0fb442c64 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -218,18 +218,4 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned 
long addr,
 }
 #endif /* CONFIG_HUGETLB_PAGE */
 
-/*
- * FSL Book3E platforms require special gpage handling - the gpages
- * are reserved early in the boot process by memblock instead of via
- * the .dts as on IBM platforms.
- */
-#if defined(CONFIG_HUGETLB_PAGE) && (defined(CONFIG_PPC_FSL_BOOK3E) || \
-defined(CONFIG_PPC_8xx))
-extern void __init reserve_hugetlb_gpages(void);
-#else
-static inline void reserve_hugetlb_gpages(void)
-{
-}
-#endif
-
 #endif /* _ASM_POWERPC_HUGETLB_H */
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 94a948207cd2..0f896f17d5ab 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -916,13 +916,6 @@ void __init setup_arch(char **cmdline_p)
/* Reserve large chunks of memory for use by CMA for KVM. */
kvm_cma_reserve();
 
-   /*
-* Reserve any gigantic pages requested on the command line.
-* memblock needs to have been initialized by the time this is
-* called since this will reserve memory.
-*/
-   reserve_hugetlb_gpages();
-
klp_init_thread_info(_thread_info);
 
init_mm.start_code = (unsigned long)_stext;
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 7a20669c19e7..2f1f6bc04012 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -509,7 +509,7 @@ static int __init htab_dt_scan_hugepage_blocks(unsigned 
long node,
phys_addr, block_size, expected_pages);
if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) {
memblock_reserve(phys_addr, block_size * expected_pages);
-   add_gpage(phys_addr, block_size, expected_pages);
+   pseries_add_gpage(phys_addr, block_size, expected_pages);
}
return 0;
 }
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index e1bf5ca397fe..a0271d738a30 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -36,26 +36,6 @@
 unsigned int HPAGE_SHIFT;
 EXPORT_SYMBOL(HPAGE_SHIFT);
 
-/*
- * Tracks gpages after the device tree is scanned and before the
- * huge_boot_pages list is ready.  On non-Freescale implementations, this is
- * just used to track 16G pages and so is a single array.  FSL-based
- * implementations may have more than one gpage size, so we need multiple
- * arrays
- */
-#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
-#define MAX_NUMBER_GPAGES  128
-struct psize_gpages {
-   u64 gpage_list[MAX_NUMBER_GPAGES];
-   unsigned int nr_gpages;
-};
-static struct

[PATCH v3 1/3] mm/hugetlb: Allow arch to override and call the weak function

2017-07-27 Thread Aneesh Kumar K.V

For ppc64, we want to call this function when we are not running as guest.
Also, if we failed to allocate hugepages, let the user know.

Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/hugetlb.h | 1 +
 mm/hugetlb.c| 5 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 0ed8e41aaf11..8bbbd37ab105 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -358,6 +358,7 @@ int huge_add_to_page_cache(struct page *page, struct 
address_space *mapping,
pgoff_t idx);
 
 /* arch callback */
+int __init __alloc_bootmem_huge_page(struct hstate *h);
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
 void __init hugetlb_bad_size(void);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bc48ee783dd9..a3a7a7e6339e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2083,7 +2083,9 @@ struct page *alloc_huge_page_noerr(struct vm_area_struct 
*vma,
return page;
 }
 
-int __weak alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h)
+   __attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
+int __alloc_bootmem_huge_page(struct hstate *h)
 {
struct huge_bootmem_page *m;
int nr_nodes, node;
@@ -2104,6 +2106,7 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
goto found;
}
}
+   pr_info("Failed to allocate hugepage of size %ld\n", huge_page_size(h));
return 0;
 
 found:
-- 
2.13.3

85 matches

Mail list logo