Re: [PATCH] lib/xz: Fix powerpc build with KERNEL_XZ

2018-09-18 Thread Christophe LEROY




On 19/09/2018 at 01:07, Joel Stanley wrote:

This partially reverts faa16bc404d72a5 ("lib: Use existing define with
polynomial").

The cleanup added a dependency on include/linux, which broke the PowerPC
boot wrapper/decompressor when KERNEL_XZ is enabled:

   BOOTCC  arch/powerpc/boot/decompress.o
  In file included from arch/powerpc/boot/../../../lib/decompress_unxz.c:233,
  from arch/powerpc/boot/decompress.c:42:
  arch/powerpc/boot/../../../lib/xz/xz_crc32.c:18:10: fatal error:
  linux/crc32poly.h: No such file or directory
   #include <linux/crc32poly.h>
            ^~~~~~~~~~~~~~~~~~~

The powerpc decompressor is a hairy corner of the kernel. Even while building
a 64-bit kernel it needs to build a 32-bit binary and therefore avoid including
files from include/linux.

Fixes: faa16bc404d7 ("lib: Use existing define with polynomial")
Signed-off-by: Joel Stanley 
---
We need to clean up the powerpc boot decompressor but that work will be
more involved than we would include in a late -rc. Please consider
merging this fix for 4.19. Thanks!

  lib/xz/xz_crc32.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c
index 25a5d87e2e4c..34532d14fd4c 100644
--- a/lib/xz/xz_crc32.c
+++ b/lib/xz/xz_crc32.c
@@ -15,7 +15,6 @@
   * but they are bigger and use more memory for the lookup table.
   */
  
-#include <linux/crc32poly.h>
  #include "xz_private.h"
  
  /*

@@ -30,7 +29,7 @@ STATIC_RW_DATA uint32_t xz_crc32_table[256];
  
  XZ_EXTERN void xz_crc32_init(void)

  {
-   const uint32_t poly = CRC32_POLY_LE;
+   const uint32_t poly = 0xEDB88320;


Maybe avoid capital letters?

What about adding something like the following in xz_private.h instead:

#define CRC32_POLY_LE 0xedb88320

Christophe
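
A guarded form of that suggestion, as a sketch of what could go into
lib/xz/xz_private.h (the #ifndef guard is my assumption, for configurations
where linux/crc32poly.h is also visible; this is not the fix that was applied):

/* Fallback CRC32 polynomial so the boot wrapper needs nothing from
 * include/linux. */
#ifndef CRC32_POLY_LE
#define CRC32_POLY_LE 0xedb88320
#endif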

  
 	uint32_t i;
 	uint32_t j;



Re: [PATCH v4 16/20] powerpc/mm: Extend pte_fragment functionality to nohash/32

2018-09-18 Thread Aneesh Kumar K.V

On 9/18/18 10:27 PM, Christophe Leroy wrote:

In order to allow the 8xx to handle pte_fragments, this patch
extends the use of pte_fragments to nohash/32 platforms.

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/mmu-40x.h   |  1 +
  arch/powerpc/include/asm/mmu-44x.h   |  1 +
  arch/powerpc/include/asm/mmu-8xx.h   |  1 +
  arch/powerpc/include/asm/mmu-book3e.h|  1 +
  arch/powerpc/include/asm/mmu_context.h   |  2 +-
  arch/powerpc/include/asm/nohash/32/pgalloc.h | 43 +++-
  arch/powerpc/include/asm/nohash/32/pgtable.h |  7 +++--
  arch/powerpc/include/asm/page.h  |  6 +---
  arch/powerpc/include/asm/pgtable.h   |  8 ++
  arch/powerpc/mm/Makefile |  3 ++
  arch/powerpc/mm/mmu_context_nohash.c |  1 +
  arch/powerpc/mm/pgtable-frag.c   |  6 
  arch/powerpc/mm/pgtable_32.c |  8 --
  13 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-40x.h b/arch/powerpc/include/asm/mmu-40x.h
index 74f4edb5916e..7c77ceed71d6 100644
--- a/arch/powerpc/include/asm/mmu-40x.h
+++ b/arch/powerpc/include/asm/mmu-40x.h
@@ -58,6 +58,7 @@ typedef struct {
 	unsigned int	id;
 	unsigned int	active;
 	unsigned long	vdso_base;
+   void *pte_frag;
  } mm_context_t;

  #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h
index 295b3dbb2698..3d72e889ae7b 100644
--- a/arch/powerpc/include/asm/mmu-44x.h
+++ b/arch/powerpc/include/asm/mmu-44x.h
@@ -109,6 +109,7 @@ typedef struct {
 	unsigned int	id;
 	unsigned int	active;
 	unsigned long	vdso_base;
+   void *pte_frag;
  } mm_context_t;

  #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-8xx.h b/arch/powerpc/include/asm/mmu-8xx.h
index fa05aa566ece..750cef6f65e3 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -179,6 +179,7 @@ typedef struct {
unsigned int id;
unsigned int active;
unsigned long vdso_base;
+   void *pte_frag;
  #ifdef CONFIG_PPC_MM_SLICES
u16 user_psize; /* page size index */
unsigned char low_slices_psize[SLICE_ARRAY_SIZE];
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index e20072972e35..8e8aad5172ab 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -230,6 +230,7 @@ typedef struct {
 	unsigned int	id;
 	unsigned int	active;
 	unsigned long	vdso_base;
+   void *pte_frag;
  } mm_context_t;

  /* Page size definitions, common between 32 and 64-bit
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index b2f89b621b15..7f2c37a3f99d 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -222,7 +222,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
return 0;
  }

-#ifndef CONFIG_PPC_BOOK3S_64
+#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_BOOK3S_32)
  static inline void arch_exit_mmap(struct mm_struct *mm)
  {
  }
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index f3fec9052f31..e69423ad8e2e 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -27,6 +27,9 @@ extern void __bad_pte(pmd_t *pmd);
  extern struct kmem_cache *pgtable_cache[];
  #define PGT_CACHE(shift) pgtable_cache[shift]

+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel);
+void pte_fragment_free(unsigned long *table, int kernel);
+
  static inline pgd_t *pgd_alloc(struct mm_struct *mm)
  {
return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
@@ -58,11 +61,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
  static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pte_page)
  {
-   *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_USER |
- _PMD_PRESENT);
+   *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT);
  }

-#define pmd_pgtable(pmd) pmd_page(pmd)
+#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
  #else

  static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
@@ -74,49 +76,38 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
  static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pte_page)
  {
-   *pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
+   *pmdp = __pmd((unsigned long)pte_page | _PMD_PRESENT);
  }

-#define pmd_pgtable(pmd) pmd_page(pmd)
+#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
  #endif


Re: [PATCH v4 15/20] powerpc/mm: Avoid useless lock with single page fragments

2018-09-18 Thread Aneesh Kumar K.V

On 9/18/18 10:27 PM, Christophe Leroy wrote:

There is no point in taking the page table lock as
pte_frag is always NULL when we have only one fragment.

Signed-off-by: Christophe Leroy 
---
  arch/powerpc/mm/pgtable-frag.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index bc924822dcd6..ab4910e92aaf 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -85,6 +85,9 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
  {
void *pte_frag, *ret;

+   if (PTE_FRAG_NR == 1)
+   return NULL;
+
	spin_lock(&mm->page_table_lock);
ret = mm->context.pte_frag;
if (ret) {



Maybe update get_pmd_from_cache too?

-aneesh
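
For reference, the analogous guard in get_pmd_from_cache() might look like
the sketch below. PMD_FRAG_NR, PMD_FRAG_SIZE and mm->context.pmd_frag are
the existing book3s64 counterparts of the PTE names above; only the early
return is new:

static pmd_t *get_pmd_from_cache(struct mm_struct *mm)
{
	void *pmd_frag, *ret;

	/* With a single fragment per page there is never a cached one. */
	if (PMD_FRAG_NR == 1)
		return NULL;

	spin_lock(&mm->page_table_lock);
	ret = mm->context.pmd_frag;
	if (ret) {
		pmd_frag = ret + PMD_FRAG_SIZE;
		/* If we have taken up all the fragments, drop the cache. */
		if (!((unsigned long)pmd_frag & ~PAGE_MASK))
			pmd_frag = NULL;
		mm->context.pmd_frag = pmd_frag;
	}
	spin_unlock(&mm->page_table_lock);
	return (pmd_t *)ret;
}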



Re: [PATCH net-next] net: ibm: fix return type of ndo_start_xmit function

2018-09-18 Thread David Miller
From: YueHaibing 
Date: Tue, 18 Sep 2018 14:35:47 +0800

> The method ndo_start_xmit() is defined as returning a 'netdev_tx_t',
> which is a typedef for an enum type, so make sure the implementation in
> this driver returns a 'netdev_tx_t' value, and change the function
> return type to netdev_tx_t.
> 
> Found by coccinelle.
> 
> Signed-off-by: YueHaibing 

Applied.
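
The shape of the change, as a generic sketch rather than the ibm driver's
actual code (example_start_xmit() and example_queue_full() are hypothetical):

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	if (example_queue_full(dev))	/* hypothetical helper */
		return NETDEV_TX_BUSY;	/* enum value instead of a bare int */

	/* ... hand the skb to the hardware queue ... */
	return NETDEV_TX_OK;
}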


[PATCH crypto-next 13/23] crypto: vmx - Remove VLA usage of skcipher

2018-09-18 Thread Kees Cook
In the quest to remove all stack VLA usage from the kernel[1], this
replaces struct crypto_skcipher and SKCIPHER_REQUEST_ON_STACK() usage
with struct crypto_sync_skcipher and SYNC_SKCIPHER_REQUEST_ON_STACK(),
which uses a fixed stack size.

[1] 
https://lkml.kernel.org/r/CA+55aFzCG-zNmZwX4A2FQpadafLfEzK6CC=qpxydaacu1rq...@mail.gmail.com
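
A minimal usage sketch of the sync skcipher API this series converts to
(the function and macro names are the real crypto API ones used in the
diffs below; the src/dst scatterlists, nbytes and iv are assumed to be
set up elsewhere):

	struct crypto_sync_skcipher *tfm;
	int err;

	tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
	if (!IS_ERR(tfm)) {
		/* Fixed-size request on the stack - no VLA. */
		SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);

		skcipher_request_set_sync_tfm(req, tfm);
		skcipher_request_set_callback(req, 0, NULL, NULL);
		skcipher_request_set_crypt(req, src, dst, nbytes, iv);
		err = crypto_skcipher_encrypt(req);
		skcipher_request_zero(req);
		crypto_free_sync_skcipher(tfm);
	}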

Cc: "Leonidas S. Barbosa" 
Cc: Paulo Flabiano Smorigo 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Kees Cook 
---
 drivers/crypto/vmx/aes_cbc.c | 22 +++---
 drivers/crypto/vmx/aes_ctr.c | 18 +-
 drivers/crypto/vmx/aes_xts.c | 18 +-
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c
index b71895871be3..c5c5ff82b52e 100644
--- a/drivers/crypto/vmx/aes_cbc.c
+++ b/drivers/crypto/vmx/aes_cbc.c
@@ -32,7 +32,7 @@
 #include "aesp8-ppc.h"
 
 struct p8_aes_cbc_ctx {
-   struct crypto_skcipher *fallback;
+   struct crypto_sync_skcipher *fallback;
struct aes_key enc_key;
struct aes_key dec_key;
 };
@@ -40,11 +40,11 @@ struct p8_aes_cbc_ctx {
 static int p8_aes_cbc_init(struct crypto_tfm *tfm)
 {
const char *alg = crypto_tfm_alg_name(tfm);
-   struct crypto_skcipher *fallback;
+   struct crypto_sync_skcipher *fallback;
struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
 
-   fallback = crypto_alloc_skcipher(alg, 0,
-   CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+   fallback = crypto_alloc_sync_skcipher(alg, 0,
+ CRYPTO_ALG_NEED_FALLBACK);
 
if (IS_ERR(fallback)) {
printk(KERN_ERR
@@ -53,7 +53,7 @@ static int p8_aes_cbc_init(struct crypto_tfm *tfm)
return PTR_ERR(fallback);
}
 
-   crypto_skcipher_set_flags(
+   crypto_sync_skcipher_set_flags(
fallback,
crypto_skcipher_get_flags((struct crypto_skcipher *)tfm));
ctx->fallback = fallback;
@@ -66,7 +66,7 @@ static void p8_aes_cbc_exit(struct crypto_tfm *tfm)
struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm);
 
if (ctx->fallback) {
-   crypto_free_skcipher(ctx->fallback);
+   crypto_free_sync_skcipher(ctx->fallback);
ctx->fallback = NULL;
}
 }
@@ -86,7 +86,7 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key,
pagefault_enable();
preempt_enable();
 
-   ret += crypto_skcipher_setkey(ctx->fallback, key, keylen);
+   ret += crypto_sync_skcipher_setkey(ctx->fallback, key, keylen);
return ret;
 }
 
@@ -100,8 +100,8 @@ static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc,
crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
 
if (in_interrupt()) {
-   SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
-   skcipher_request_set_tfm(req, ctx->fallback);
+   SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
+   skcipher_request_set_sync_tfm(req, ctx->fallback);
skcipher_request_set_callback(req, desc->flags, NULL, NULL);
skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
ret = crypto_skcipher_encrypt(req);
@@ -139,8 +139,8 @@ static int p8_aes_cbc_decrypt(struct blkcipher_desc *desc,
crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm));
 
if (in_interrupt()) {
-   SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
-   skcipher_request_set_tfm(req, ctx->fallback);
+   SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback);
+   skcipher_request_set_sync_tfm(req, ctx->fallback);
skcipher_request_set_callback(req, desc->flags, NULL, NULL);
skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
ret = crypto_skcipher_decrypt(req);
diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c
index cd777c75291d..8a2fe092cb8e 100644
--- a/drivers/crypto/vmx/aes_ctr.c
+++ b/drivers/crypto/vmx/aes_ctr.c
@@ -32,18 +32,18 @@
 #include "aesp8-ppc.h"
 
 struct p8_aes_ctr_ctx {
-   struct crypto_skcipher *fallback;
+   struct crypto_sync_skcipher *fallback;
struct aes_key enc_key;
 };
 
 static int p8_aes_ctr_init(struct crypto_tfm *tfm)
 {
const char *alg = crypto_tfm_alg_name(tfm);
-   struct crypto_skcipher *fallback;
+   struct crypto_sync_skcipher *fallback;
struct p8_aes_ctr_ctx *ctx = crypto_tfm_ctx(tfm);
 
-   fallback = crypto_alloc_skcipher(alg, 0,
-   CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+   fallback = crypto_alloc_sync_skcipher(alg, 0,
+ CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(fallback)) {
printk(KERN_ERR
   "Failed to 

Re: [PATCH] MAINTAINERS: Add PPC contacts for PCI core error handling

2018-09-18 Thread Russell Currey
On Tue, 2018-09-18 at 16:58 -0500, Bjorn Helgaas wrote:
> On Wed, Sep 12, 2018 at 11:55:26AM -0500, Bjorn Helgaas wrote:
> > From: Bjorn Helgaas 
> > 
> > The original PCI error recovery functionality was for the powerpc-specific
> > IBM EEH feature.  PCIe subsequently added some similar features, including
> > AER and DPC, that can be used on any architecture.
> > 
> > We want the generic PCI core error handling support to work with all of
> > these features.  Driver error recovery callbacks should be independent of
> > which feature the platform provides.
> > 
> > Add the generic PCI core error recovery files to the powerpc EEH
> > MAINTAINERS entry so the powerpc folks will be copied on changes to the
> > generic PCI error handling strategy.
> > 
> > Signed-off-by: Bjorn Helgaas 
> 
> I applied the following to for-linus for v4.19.  Russell, if you want
> to be removed, let me know and I'll do that.

Oliver's email address for kernel stuff is ooh...@gmail.com, I think benh has
been CCing his IBM address.  But other than that,

Acked-by: Russell Currey 

Thanks for this, Bjorn.

- Russell

> 
> commit 3fed0e04026c
> Author: Bjorn Helgaas 
> Date:   Wed Sep 12 11:55:26 2018 -0500
> 
> MAINTAINERS: Update PPC contacts for PCI core error handling
> 
> The original PCI error recovery functionality was for the powerpc-specific
> IBM EEH feature.  PCIe subsequently added some similar features, including
> AER and DPC, that can be used on any architecture.
> 
> We want the generic PCI core error handling support to work with all of
> these features.  Driver error recovery callbacks should be independent of
> which feature the platform provides.
> 
> Add the generic PCI core error recovery files to the powerpc EEH
> MAINTAINERS entry so the powerpc folks will be copied on changes to the
> generic PCI error handling strategy.
> 
> Add Sam and Oliver as maintainers for this area.
> 
> Signed-off-by: Bjorn Helgaas 
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 4ece30f15777..f23244003836 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -11203,8 +11203,14 @@ F:   tools/pci/
>  
>  PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC
>  M:   Russell Currey 
> +M:   Sam Bobroff 
> +M:   Oliver O'Halloran 
>  L:   linuxppc-dev@lists.ozlabs.org
>  S:   Supported
> +F:   Documentation/PCI/pci-error-recovery.txt
> +F:   drivers/pci/pcie/aer.c
> +F:   drivers/pci/pcie/dpc.c
> +F:   drivers/pci/pcie/err.c
>  F:   Documentation/powerpc/eeh-pci-error-recovery.txt
>  F:   arch/powerpc/kernel/eeh*.c
>  F:   arch/powerpc/platforms/*/eeh*.c


Re: [PATCH v1 0/6] mm: online/offline_pages called w.o. mem_hotplug_lock

2018-09-18 Thread Balbir Singh
On Tue, Sep 18, 2018 at 01:48:16PM +0200, David Hildenbrand wrote:
> Reading through the code and studying how mem_hotplug_lock is to be used,
> I noticed that there are two places where we can end up calling
> device_online()/device_offline() - online_pages()/offline_pages() without
> the mem_hotplug_lock. And there are other places where we call
> device_online()/device_offline() without the device_hotplug_lock.
> 
> While e.g.
>   echo "online" > /sys/devices/system/memory/memory9/state
> is fine, e.g.
>   echo 1 > /sys/devices/system/memory/memory9/online
> Will not take the mem_hotplug_lock. However the device_lock() and
> device_hotplug_lock.
> 
> E.g. via memory_probe_store(), we can end up calling
> add_memory()->online_pages() without the device_hotplug_lock. So we can
> have concurrent callers in online_pages(). We e.g. touch in online_pages()
> basically unprotected zone->present_pages then.
> 
> Looks like there is a longer history to that (see Patch #2 for details),
> and fixing it to work the way it was intended is not really possible. We
> would e.g. have to take the mem_hotplug_lock in device/base/core.c, which
> sounds wrong.
> 
> Summary: We had a lock inversion on mem_hotplug_lock and device_lock().
> More details can be found in patch 3 and patch 6.
> 
> I propose the general rules (documentation added in patch 6):
> 
> 1. add_memory/add_memory_resource() must only be called with
>device_hotplug_lock.
> 2. remove_memory() must only be called with device_hotplug_lock. This is
>already documented and holds for all callers.
> 3. device_online()/device_offline() must only be called with
>device_hotplug_lock. This is already documented and true for now in core
>code. Other callers (related to memory hotplug) have to be fixed up.
> 4. mem_hotplug_lock is taken inside of add_memory/remove_memory/
>online_pages/offline_pages.
> 
> To me, this looks way cleaner than what we have right now (and easier to
> verify). And looking at the documentation of remove_memory, using
> lock_device_hotplug also for add_memory() feels natural.
>

That seems reasonable, but it also implies that device_online() would hold
back add/remove memory. Could you please also document in what mode
(read/write) the locks need to be held? For example, can the device_hotplug_lock
be held in read mode while add/remove memory (via mem_hotplug_lock) is held
in write mode?

Balbir Singh.
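
A sketch of the nesting those proposed rules imply, using the __add_memory()
variant from patch 2 (my reading of rules 1-4, not code from the series):

static int example_add_memory_locked(int nid, u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();			/* outer lock, rules 1-3 */
	rc = __add_memory(nid, start, size);	/* takes mem_hotplug_lock inside (rule 4) */
	unlock_device_hotplug();

	return rc;
}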
 


Re: MPC83xx reset status register (RSR, offset 0x910)

2018-09-18 Thread Radu Rendec
Hi Christophe,

On Thu, 2018-09-13 at 10:21 +0200, Christophe LEROY wrote:
>
> On 11/09/2018 at 00:17, Radu Rendec wrote:
> >
> > The MPC83xx also has a watchdog and the kernel driver (mpc8xxx_wdt.c)
> > could also be improved to support the WDIOC_GETBOOTSTATUS ioctl and
> > properly report if the system rebooted due to a watchdog.
>
> Very good idea.
>
> I just submitted a patch for that. Please look at it.
> I'm sure any driver which needs reset status information can do the same.

Thanks for submitting the patch and sorry for the late reply! I followed
the conversation between you and Guenter and it seems your patches are
almost accepted. That's a good thing.

> If we want to do something more central, maybe we should look at what
> was done on ARM:
>
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=04fef228fb00

Thanks for pointing out that commit. It's very similar to what I wanted
to do (for MPC83xx) in the first place: read the RSR value on start-up
into a variable and export it as a symbol to make it available to other
drivers.

I would take on the work to implement something similar for PowerPC, but
I need some guidance as to what goes where. For instance, what would be
the PowerPC equivalent of arch/arm/mach-pxa/reset.c, which defines the
reset_status variable?

Another question is if the device tree should be used. We already have
separate directories for each platform in arch/powerpc/platforms and I
guess for each platform the RSR is always there and at a fixed, well
known address.

Thanks,
Radu
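
To make the idea concrete, a sketch of what such a helper could look like
for the 83xx, modeled on the ARM commit above (every name here is
hypothetical, not existing kernel API):

#include <linux/export.h>
#include <asm/io.h>

#define MPC83XX_RSR_OFFSET	0x910

u32 mpc83xx_reset_status;		/* hypothetical exported variable */
EXPORT_SYMBOL_GPL(mpc83xx_reset_status);

static void __init mpc83xx_save_reset_status(void __iomem *immr)
{
	mpc83xx_reset_status = in_be32(immr + MPC83XX_RSR_OFFSET);
	/* Status bits are sticky; assuming write-one-to-clear per the RM. */
	out_be32(immr + MPC83XX_RSR_OFFSET, mpc83xx_reset_status);
}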



[PATCH] lib/xz: Fix powerpc build with KERNEL_XZ

2018-09-18 Thread Joel Stanley
This partially reverts faa16bc404d72a5 ("lib: Use existing define with
polynomial").

The cleanup added a dependency on include/linux, which broke the PowerPC
boot wrapper/decompressor when KERNEL_XZ is enabled:

  BOOTCC  arch/powerpc/boot/decompress.o
 In file included from arch/powerpc/boot/../../../lib/decompress_unxz.c:233,
 from arch/powerpc/boot/decompress.c:42:
 arch/powerpc/boot/../../../lib/xz/xz_crc32.c:18:10: fatal error:
 linux/crc32poly.h: No such file or directory
  #include <linux/crc32poly.h>
           ^~~~~~~~~~~~~~~~~~~

The powerpc decompressor is a hairy corner of the kernel. Even while building
a 64-bit kernel it needs to build a 32-bit binary and therefore avoid including
files from include/linux.

Fixes: faa16bc404d7 ("lib: Use existing define with polynomial")
Signed-off-by: Joel Stanley 
---
We need to clean up the powerpc boot decompressor but that work will be
more involved than we would include in a late -rc. Please consider
merging this fix for 4.19. Thanks!

 lib/xz/xz_crc32.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c
index 25a5d87e2e4c..34532d14fd4c 100644
--- a/lib/xz/xz_crc32.c
+++ b/lib/xz/xz_crc32.c
@@ -15,7 +15,6 @@
  * but they are bigger and use more memory for the lookup table.
  */
 
-#include <linux/crc32poly.h>
 #include "xz_private.h"
 
 /*
@@ -30,7 +29,7 @@ STATIC_RW_DATA uint32_t xz_crc32_table[256];
 
 XZ_EXTERN void xz_crc32_init(void)
 {
-   const uint32_t poly = CRC32_POLY_LE;
+   const uint32_t poly = 0xEDB88320;
 
uint32_t i;
uint32_t j;
-- 
2.17.1



Re: [PATCH] powerpc/mpc85xx: fix issues in clock node

2018-09-18 Thread Scott Wood
On Tue, 2018-09-11 at 10:12 +0800, andy.t...@nxp.com wrote:
> From: Yuantian Tang 
> 
> The compatible string is not correct in the clock node.
> The clocks property refers to the wrong node too.
> This patch is to fix them.
> 
> Signed-off-by: Tang Yuantian 
> ---
>  arch/powerpc/boot/dts/fsl/t1023si-post.dtsi |8 
>  1 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
> index 4908af5..763caf4 100644
> --- a/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
> +++ b/arch/powerpc/boot/dts/fsl/t1023si-post.dtsi
> @@ -348,7 +348,7 @@
>   mux0: mux0@0 {
>   #clock-cells = <0>;
>   reg = <0x0 4>;
> - compatible = "fsl,core-mux-clock";
> + compatible = "fsl,qoriq-core-mux-2.0";
>   clocks = <&pll0 0>, <&pll0 1>;
>   clock-names = "pll0_0", "pll0_1";
>   clock-output-names = "cmux0";
> @@ -356,9 +356,9 @@
>   mux1: mux1@20 {
>   #clock-cells = <0>;
>   reg = <0x20 4>;
> - compatible = "fsl,core-mux-clock";
> - clocks = <&pll0 0>, <&pll0 1>;
> - clock-names = "pll0_0", "pll0_1";
> + compatible = "fsl,qoriq-core-mux-2.0";
> + clocks = <&pll1 0>, <&pll1 1>;
> + clock-names = "pll1_0", "pll1_1";
>   clock-output-names = "cmux1";
>   };
>   };

These are the legacy nodes.  Why not just remove them instead of fixing them? 
Now that the cpufreq driver is fixed we could get rid of the legacy nodes for
all the chips.

-Scott



Re: [PATCH] MAINTAINERS: Add PPC contacts for PCI core error handling

2018-09-18 Thread Bjorn Helgaas
On Wed, Sep 12, 2018 at 11:55:26AM -0500, Bjorn Helgaas wrote:
> From: Bjorn Helgaas 
> 
> The original PCI error recovery functionality was for the powerpc-specific
> IBM EEH feature.  PCIe subsequently added some similar features, including
> AER and DPC, that can be used on any architecture.
> 
> We want the generic PCI core error handling support to work with all of
> these features.  Driver error recovery callbacks should be independent of
> which feature the platform provides.
> 
> Add the generic PCI core error recovery files to the powerpc EEH
> MAINTAINERS entry so the powerpc folks will be copied on changes to the
> generic PCI error handling strategy.
> 
> Signed-off-by: Bjorn Helgaas 

I applied the following to for-linus for v4.19.  Russell, if you want
to be removed, let me know and I'll do that.

commit 3fed0e04026c
Author: Bjorn Helgaas 
Date:   Wed Sep 12 11:55:26 2018 -0500

MAINTAINERS: Update PPC contacts for PCI core error handling

The original PCI error recovery functionality was for the powerpc-specific
IBM EEH feature.  PCIe subsequently added some similar features, including
AER and DPC, that can be used on any architecture.

We want the generic PCI core error handling support to work with all of
these features.  Driver error recovery callbacks should be independent of
which feature the platform provides.

Add the generic PCI core error recovery files to the powerpc EEH
MAINTAINERS entry so the powerpc folks will be copied on changes to the
generic PCI error handling strategy.

Add Sam and Oliver as maintainers for this area.

Signed-off-by: Bjorn Helgaas 

diff --git a/MAINTAINERS b/MAINTAINERS
index 4ece30f15777..f23244003836 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11203,8 +11203,14 @@ F: tools/pci/
 
 PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC
 M: Russell Currey 
+M: Sam Bobroff 
+M: Oliver O'Halloran 
 L: linuxppc-dev@lists.ozlabs.org
 S: Supported
+F: Documentation/PCI/pci-error-recovery.txt
+F: drivers/pci/pcie/aer.c
+F: drivers/pci/pcie/dpc.c
+F: drivers/pci/pcie/err.c
 F: Documentation/powerpc/eeh-pci-error-recovery.txt
 F: arch/powerpc/kernel/eeh*.c
 F: arch/powerpc/platforms/*/eeh*.c


Re: [PATCH v1 2/6] mm/memory_hotplug: make add_memory() take the device_hotplug_lock

2018-09-18 Thread Rafael J. Wysocki
On Tue, Sep 18, 2018 at 1:48 PM David Hildenbrand  wrote:
>
> add_memory() currently does not take the device_hotplug_lock, however
> is already called under the lock from
> arch/powerpc/platforms/pseries/hotplug-memory.c
> drivers/acpi/acpi_memhotplug.c
> to synchronize against CPU hot-remove and similar.
>
> In general, we should hold the device_hotplug_lock when adding memory
> to synchronize against online/offline request (e.g. from user space) -
> which already resulted in lock inversions due to device_lock() and
> mem_hotplug_lock - see 30467e0b3be ("mm, hotplug: fix concurrent memory
> hot-add deadlock"). add_memory()/add_memory_resource() will create memory
> block devices, so this really feels like the right thing to do.
>
> Holding the device_hotplug_lock makes sure that a memory block device
> can really only be accessed (e.g. via .online/.state) from user space,
> once the memory has been fully added to the system.
>
> The lock is not held yet in
> drivers/xen/balloon.c
> arch/powerpc/platforms/powernv/memtrace.c
> drivers/s390/char/sclp_cmd.c
> drivers/hv/hv_balloon.c
> So, let's either use the locked variants or take the lock.
>
> Don't export add_memory_resource(), as it once was exported to be used
> by XEN, which is never built as a module. If somebody requires it, we
> also have to export a locked variant (as device_hotplug_lock is never
> exported).
>
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Greg Kroah-Hartman 
> Cc: Boris Ostrovsky 
> Cc: Juergen Gross 
> Cc: Nathan Fontenot 
> Cc: John Allen 
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Dan Williams 
> Cc: Joonsoo Kim 
> Cc: Vlastimil Babka 
> Cc: Oscar Salvador 
> Cc: Mathieu Malaterre 
> Cc: Pavel Tatashin 
> Cc: YASUAKI ISHIMATSU 
> Reviewed-by: Pavel Tatashin 
> Signed-off-by: David Hildenbrand 
> ---
>  .../platforms/pseries/hotplug-memory.c|  2 +-
>  drivers/acpi/acpi_memhotplug.c|  2 +-
>  drivers/base/memory.c |  9 ++--
>  drivers/xen/balloon.c |  3 +++
>  include/linux/memory_hotplug.h|  1 +
>  mm/memory_hotplug.c   | 22 ---
>  6 files changed, 32 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index b3f54466e25f..2e6f41dc103a 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -702,7 +702,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
> nid = memory_add_physaddr_to_nid(lmb->base_addr);
>
> /* Add the memory */
> -   rc = add_memory(nid, lmb->base_addr, block_sz);
> +   rc = __add_memory(nid, lmb->base_addr, block_sz);
> if (rc) {
> dlpar_remove_device_tree_lmb(lmb);
> return rc;
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index 811148415993..8fe0960ea572 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -228,7 +228,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
> if (node < 0)
> node = memory_add_physaddr_to_nid(info->start_addr);
>
> -   result = add_memory(node, info->start_addr, info->length);
> +   result = __add_memory(node, info->start_addr, info->length);
>
> /*
>  * If the memory block has been used by the kernel, add_memory()
> diff --git a/drivers/base/memory.c b/drivers/base/memory.c
> index 817320c7c4c1..40cac122ec73 100644
> --- a/drivers/base/memory.c
> +++ b/drivers/base/memory.c
> @@ -519,15 +519,20 @@ memory_probe_store(struct device *dev, struct device_attribute *attr,
> if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
> return -EINVAL;
>
> +   ret = lock_device_hotplug_sysfs();
> +   if (ret)
> +   goto out;
> +
> nid = memory_add_physaddr_to_nid(phys_addr);
> -   ret = add_memory(nid, phys_addr,
> -MIN_MEMORY_BLOCK_SIZE * sections_per_block);
> +   ret = __add_memory(nid, phys_addr,
> +  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
>
> if (ret)
> goto out;
>
> ret = count;
>  out:
> +   unlock_device_hotplug();
> return ret;
>  }
>
> diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
> index e12bb256036f..6bab019a82b1 100644
> --- a/drivers/xen/balloon.c
> +++ b/drivers/xen/balloon.c
> @@ -395,7 +395,10 @@ static enum bp_state reserve_additional_memory(void)
>  * callers drop the mutex before trying again.
>  */
> mutex_unlock(&balloon_mutex);
> +   /* add_memory_resource() requires the device_hotplug lock */
> 

Re: [PATCH v1 1/6] mm/memory_hotplug: make remove_memory() take the device_hotplug_lock

2018-09-18 Thread Rafael J. Wysocki
On Tue, Sep 18, 2018 at 1:48 PM David Hildenbrand  wrote:
>
> remove_memory() is exported right now but requires the
> device_hotplug_lock, which is not exported. So let's provide a variant
> that takes the lock and only export that one.
>
> The lock is already held in
> arch/powerpc/platforms/pseries/hotplug-memory.c
> drivers/acpi/acpi_memhotplug.c
> So, let's use the locked variant.
>
> The lock is not held (but taken in)
> arch/powerpc/platforms/powernv/memtrace.c
> So let's keep using the (now) locked variant.
>
> Apart from that, there are not other users in the tree.
>
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: "Rafael J. Wysocki" 
> Cc: Len Brown 
> Cc: Rashmica Gupta 
> Cc: Michael Neuling 
> Cc: Balbir Singh 
> Cc: Nathan Fontenot 
> Cc: John Allen 
> Cc: Andrew Morton 
> Cc: Michal Hocko 
> Cc: Dan Williams 
> Cc: Joonsoo Kim 
> Cc: Vlastimil Babka 
> Cc: Pavel Tatashin 
> Cc: Greg Kroah-Hartman 
> Cc: Oscar Salvador 
> Cc: YASUAKI ISHIMATSU 
> Cc: Mathieu Malaterre 
> Reviewed-by: Pavel Tatashin 
> Signed-off-by: David Hildenbrand 
> ---
>  arch/powerpc/platforms/powernv/memtrace.c   | 2 --
>  arch/powerpc/platforms/pseries/hotplug-memory.c | 6 +++---
>  drivers/acpi/acpi_memhotplug.c  | 2 +-
>  include/linux/memory_hotplug.h  | 3 ++-
>  mm/memory_hotplug.c | 9 -
>  5 files changed, 14 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
> index 51dc398ae3f7..8f1cd4f3bfd5 100644
> --- a/arch/powerpc/platforms/powernv/memtrace.c
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -90,9 +90,7 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
>   change_memblock_state);
>
> -   lock_device_hotplug();
> remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
> -   unlock_device_hotplug();
>
> return true;
>  }
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index c1578f54c626..b3f54466e25f 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -334,7 +334,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size)
> nid = memory_add_physaddr_to_nid(base);
>
> for (i = 0; i < sections_per_block; i++) {
> -   remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
> +   __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
> base += MIN_MEMORY_BLOCK_SIZE;
> }
>
> @@ -423,7 +423,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
> block_sz = pseries_memory_block_size();
> nid = memory_add_physaddr_to_nid(lmb->base_addr);
>
> -   remove_memory(nid, lmb->base_addr, block_sz);
> +   __remove_memory(nid, lmb->base_addr, block_sz);
>
> /* Update memory regions for memory remove */
> memblock_remove(lmb->base_addr, block_sz);
> @@ -710,7 +710,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
>
> rc = dlpar_online_lmb(lmb);
> if (rc) {
> -   remove_memory(nid, lmb->base_addr, block_sz);
> +   __remove_memory(nid, lmb->base_addr, block_sz);
> dlpar_remove_device_tree_lmb(lmb);
> } else {
> lmb->flags |= DRCONF_MEM_ASSIGNED;
> diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
> index 6b0d3ef7309c..811148415993 100644
> --- a/drivers/acpi/acpi_memhotplug.c
> +++ b/drivers/acpi/acpi_memhotplug.c
> @@ -282,7 +282,7 @@ static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device)
> nid = memory_add_physaddr_to_nid(info->start_addr);
>
> acpi_unbind_memory_blocks(info);
> -   remove_memory(nid, info->start_addr, info->length);
> +   __remove_memory(nid, info->start_addr, info->length);
> list_del(&info->list);
> kfree(info);
> }
> diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> index 34a28227068d..1f096852f479 100644
> --- a/include/linux/memory_hotplug.h
> +++ b/include/linux/memory_hotplug.h
> @@ -301,6 +301,7 @@ extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
>  extern void try_offline_node(int nid);
>  extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
>  extern void remove_memory(int nid, u64 start, u64 size);
> +extern void __remove_memory(int nid, u64 start, u64 size);
>
>  #else
>  static inline bool is_mem_section_removable(unsigned long pfn,
> @@ -317,6 +318,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
>  }
>
>  
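
The shape of the locked wrapper this patch introduces, per the commit
message above (a sketch; the real body lives in mm/memory_hotplug.c):

/* Exported variant: takes device_hotplug_lock itself. */
void remove_memory(int nid, u64 start, u64 size)
{
	lock_device_hotplug();
	__remove_memory(nid, start, size);	/* caller must hold the lock */
	unlock_device_hotplug();
}
EXPORT_SYMBOL_GPL(remove_memory);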

Re: [PATCH RFC 1/4] PCI: hotplug: Add parameter to put devices to reset during rescan

2018-09-18 Thread Bjorn Helgaas
On Tue, Sep 18, 2018 at 05:01:48PM +0300, Sergey Miroshnichenko wrote:
> On 9/18/18 1:59 AM, Bjorn Helgaas wrote:
> > On Mon, Sep 17, 2018 at 11:55:43PM +0300, Sergey Miroshnichenko wrote:
> >> On 9/17/18 8:28 AM, Sam Bobroff wrote:
> >>> On Fri, Sep 14, 2018 at 07:14:01PM +0300, Sergey Miroshnichenko wrote:
>  Introduce a new command line option "pci=pcie_movable_bars"
>  that indicates support of PCIe hotplug without prior
>  reservation of memory regions by BIOS/bootloader.

> >>> What about devices with drivers that don't have reset_prepare()?  It
> >>> looks like it will just reconfigure them anyway. Is that right?
> >>
> >> It is possible that unprepared driver without these hooks will get BARs
> >> moved, I should put a warning message there. There three ways we can see
> >> to make this safe:
> >>  - add the reset_prepare()/reset_done() hooks to *every* PCIe driver;
> >>  - refuse BAR movement if at least one unprepared driver has been
> >> encountered during rescan;
> >>  - reduce the number of drivers which can be affected to some
> >> controllable value and prepare them on demand.
> >>
> >> Applying the second proposal as a major restriction seems fairly
> >> reasonable, but for our particular setups and use-cases it is probably
> >> too stiff:
> >>  - we've noticed that devices connected directly to the root bridge
> >> don't get moved BARs, and this covers our x86_64 servers: we only
> >> insert/remove devices into "second-level" and "lower" bridges there, but
> >> not root;
> >>  - on PowerNV we have system devices (network interfaces, USB hub, etc.)
> >> grouped into dedicated domain, with all other domains ready for hotplug,
> >> and only these domains can be rescanned.
> >>
> >> With our scenarios currently reduced to these two, we can live with just
> >> a few drivers "prepared" for now: NVME and few Ethernet adapters, this
> >> gives us a possibility to use this feature before "converting" *all* the
> >> drivers, and even have the NVidia cards running on a closed proprietary
> >> driver.
> >>
> >> Should we make this behavior adjustable with something like
> >> "pcie_movable_bars=safe" and "pcie_movable_bars=always" ?
> > 
> > I like the overall idea of this a lot.
> > 
> >   - Why do we need a command line parameter to enable this?  Can't we
> > do it unconditionally and automatically when it's possible?  We
> > could have a chicken switch to *disable* it in case this breaks
> > something horribly, but I would like this functionality to be
> > always available without a special option.
> 
> After making this feature completely safe we could activate it with the
> existing option "pci=realloc".

That *sounds* good, but in practice it never happens that we decide a
feature is completely safe and somebody makes it the default.  If
we're going to do this, I think we need to commit to making it work
100% of the time, with no option needed.

> >   - I'm not sure the existence of .reset_done() is evidence that a
> > driver is prepared for its BARs to move.  I don't see any
> > documentation that refers to BAR movement, and I doubt it's been
> > tested.  But I only see 5 implementations in the tree, so it'd be
> > easy to verify.
> 
> You are right, and we should clarify the description:
>  - drivers which have the .reset_done() already - none of them are aware
> of movable BARs yet;
>  - the rest of the drivers should both be able to pause and handle the
> changes in BARs.

This doesn't clarify it for me.  If you want to update all existing
.reset_done() methods so they deal with BAR changes, that would be
fine with me.  That would be done by preliminary patches in the series
that adds the feature.

> >   - I think your second proposal above sounds right: we should regard
> > any device whose driver lacks .reset_done() as immovable.  We will
> > likely be able to move some devices but not others.  Implementing
> > .reset_done() will increase flexibility but it shouldn't be a
> > requirement for all drivers.
> 
> Thanks for the advice! This is more flexible and doesn't have any
> prerequisites. In this case the greater the "movable"/"immovable" ratio
> of the devices that was working before the hotplug event - the higher
> the probability to free some space for new BARs. But even a single
> "immovable" device at an undesirable place can block the re-arrangement,
> in this case all we can is just give up and print an error message.

Right.  There's nothing we can do about that except make the relevant
drivers smarter.

> This patchset in its current form doesn't support marking a choosen BAR
> as immovable (just releasing all the resources of the root bridge and
> trying to sort and re-assign them back), so I'll have to implement that.

The current IORESOURCE_PCI_FIXED usage is for things that literally
*cannot* be moved because there is no BAR at all (VGA or IDE legacy,
enhanced allocation (see pci_ea_read()) or there's some platform

Re: [PATCH 3/3] scripts/dtc: Update to upstream version v1.4.7-14-gc86da84d30e4

2018-09-18 Thread Frank Rowand
On 09/18/18 11:55, Rob Herring wrote:
> On Fri, Sep 14, 2018 at 2:32 PM Frank Rowand  wrote:
>>
>> On 09/13/18 13:28, Rob Herring wrote:
>>> Major changes are I2C and SPI bus checks, YAML output format (for
>>> future validation), some new libfdt functions, and more libfdt
>>> validation of dtbs.
>>>
>>> The YAML addition adds an optional dependency on libyaml. pkg-config is
>>> used to test for it and pkg-config became a kconfig dependency in 4.18.
>>
>> For Ubuntu, the libyaml dependency is provided by the packages:
>>
>>libyaml-0-2
>>libyaml-dev
> 
> Yes, but as it is not yet required by anything in the kernel I don't
> think that needs to be documented yet. Also, offhand, I don't think we
> generally document in the kernel distro specifics like package names.
> 
> Rob
> 

Agreed.  I was providing information that might save other people a bit
of research.  It is sufficiently visible in the email thread and does
not need to be in the commit message.


[REVIEW][PATCH 0/9] signal/powerpc: siginfo cleanups

2018-09-18 Thread Eric W. Biederman


This is the continuation of my work to sort out signaling of exceptions
with siginfo.  The old functions by passing siginfo resulted in many
cases of fields of siginfo that were not initialized and then passed to
userspace, and also resulted in callers getting confused and
initializing the wrong fields.  My remedy is to have specific functions
for sending each different kind of signal with siginfo.  Those functions
take the information needed to fill in siginfo and do the work
themselves.
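
For example, where a handler used to clear a siginfo, fill it field by
field and call force_sig_info(), it now makes a single call such as:

	force_sig_fault(SIGSEGV, SEGV_MAPERR, (void __user *)address, current);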

This is my set of changes to update powerpc to use those functions.
Along with some refactoring so those functions can be cleanly used.

Folks please review and double check me.  I think I have kept these
changes simple and obviously correct but I am human and mess up
sometimes.

After these patches have had a chance to be reviewed I plan to merge
them by my siginfo tree.  If you would rather take them in the powerpc
tree let me know.   All of the prerequisites should have been merged
through Linus's tree several releases ago.

Eric W. Biederman (9):
  signal/powerpc: Use force_sig_mceerr as appropriate
  signal/powerpc: Remove pkey parameter from __bad_area
  signal/powerpc: Call _exception_pkey directly from bad_key_fault_exception
  signal/powerpc: Remove pkey parameter from __bad_area_nosemaphore
  signal/powerpc: Factor the common exception code into exception_common
  signal/powerpc: Call force_sig_fault from _exception
  signal/powerpc: Specialize _exception_pkey for handling pkey exceptions
  signal/powerpc: Simplify _exception_pkey by using force_sig_pkuerr
  signal/powerpc: Use force_sig_fault where appropriate

 arch/powerpc/include/asm/bug.h|  2 +-
 arch/powerpc/kernel/process.c |  9 +
 arch/powerpc/kernel/traps.c   | 27 ---
 arch/powerpc/mm/fault.c   | 55 +--
 arch/powerpc/platforms/cell/spu_base.c|  4 +--
 arch/powerpc/platforms/cell/spufs/fault.c | 26 +--
 6 files changed, 57 insertions(+), 66 deletions(-)

Eric


Re: [PATCH 3/3] scripts/dtc: Update to upstream version v1.4.7-14-gc86da84d30e4

2018-09-18 Thread Rob Herring
On Fri, Sep 14, 2018 at 2:32 PM Frank Rowand  wrote:
>
> On 09/13/18 13:28, Rob Herring wrote:
> > Major changes are I2C and SPI bus checks, YAML output format (for
> > future validation), some new libfdt functions, and more libfdt
> > validation of dtbs.
> >
> > The YAML addition adds an optional dependency on libyaml. pkg-config is
> > used to test for it and pkg-config became a kconfig dependency in 4.18.
>
> For Ubuntu, the libyaml dependency is provided by the packages:
>
>libyaml-0-2
>libyaml-dev

Yes, but as it is not yet required by anything in the kernel I don't
think that needs to be documented yet. Also, offhand, I don't think we
generally document in the kernel distro specifics like package names.

Rob


[REVIEW][PATCH 9/9] signal/powerpc: Use force_sig_fault where appropriate

2018-09-18 Thread Eric W. Biederman
Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/kernel/process.c |  9 +---
 arch/powerpc/mm/fault.c   |  9 +---
 arch/powerpc/platforms/cell/spu_base.c|  4 ++--
 arch/powerpc/platforms/cell/spufs/fault.c | 26 +++
 4 files changed, 12 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 913c5725cdb2..553a396e7fc1 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -620,8 +620,6 @@ void do_send_trap(struct pt_regs *regs, unsigned long address,
 void do_break (struct pt_regs *regs, unsigned long address,
unsigned long error_code)
 {
-   siginfo_t info;
-
current->thread.trap_nr = TRAP_HWBKPT;
if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
11, SIGSEGV) == NOTIFY_STOP)
@@ -634,12 +632,7 @@ void do_break (struct pt_regs *regs, unsigned long address,
hw_breakpoint_disable();
 
/* Deliver the signal to userspace */
-   clear_siginfo(&info);
-   info.si_signo = SIGTRAP;
-   info.si_errno = 0;
-   info.si_code = TRAP_HWBKPT;
-   info.si_addr = (void __user *)address;
-   force_sig_info(SIGTRAP, &info, current);
+   force_sig_fault(SIGTRAP, TRAP_HWBKPT, (void __user *)address, current);
 }
 #endif /* CONFIG_PPC_ADV_DEBUG_REGS */
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 406d0e0ef096..1697e903bbf2 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -165,17 +165,10 @@ static noinline int bad_access(struct pt_regs *regs, unsigned long address)
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
 vm_fault_t fault)
 {
-   siginfo_t info;
-
if (!user_mode(regs))
return SIGBUS;
 
current->thread.trap_nr = BUS_ADRERR;
-   clear_siginfo(&info);
-   info.si_signo = SIGBUS;
-   info.si_errno = 0;
-   info.si_code = BUS_ADRERR;
-   info.si_addr = (void __user *)address;
 #ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
unsigned int lsb = 0; /* shutup gcc */
@@ -194,7 +187,7 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
}
 
 #endif
-   force_sig_info(SIGBUS, &info, current);
+   force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, current);
return 0;
 }
 
diff --git a/arch/powerpc/platforms/cell/spu_base.c b/arch/powerpc/platforms/cell/spu_base.c
index 0c45cdbac4cf..7f12c7b78c0f 100644
--- a/arch/powerpc/platforms/cell/spu_base.c
+++ b/arch/powerpc/platforms/cell/spu_base.c
@@ -50,11 +50,11 @@ struct cbe_spu_info cbe_spu_info[MAX_NUMNODES];
 EXPORT_SYMBOL_GPL(cbe_spu_info);
 
 /*
- * The spufs fault-handling code needs to call force_sig_info to raise signals
+ * The spufs fault-handling code needs to call force_sig_fault to raise signals
  * on DMA errors. Export it here to avoid general kernel-wide access to this
  * function
  */
-EXPORT_SYMBOL_GPL(force_sig_info);
+EXPORT_SYMBOL_GPL(force_sig_fault);
 
 /*
  * Protects cbe_spu_info and spu->number.
diff --git a/arch/powerpc/platforms/cell/spufs/fault.c b/arch/powerpc/platforms/cell/spufs/fault.c
index 83cf58daaa79..971ac43b5d60 100644
--- a/arch/powerpc/platforms/cell/spufs/fault.c
+++ b/arch/powerpc/platforms/cell/spufs/fault.c
@@ -36,42 +36,32 @@
 static void spufs_handle_event(struct spu_context *ctx,
unsigned long ea, int type)
 {
-   siginfo_t info;
-
if (ctx->flags & SPU_CREATE_EVENTS_ENABLED) {
ctx->event_return |= type;
		wake_up_all(&ctx->stop_wq);
return;
}
 
-   clear_siginfo(&info);
-
switch (type) {
case SPE_EVENT_INVALID_DMA:
-   info.si_signo = SIGBUS;
-   info.si_code = BUS_OBJERR;
+   force_sig_fault(SIGBUS, BUS_OBJERR, NULL, current);
break;
case SPE_EVENT_SPE_DATA_STORAGE:
-   info.si_signo = SIGSEGV;
-   info.si_addr = (void __user *)ea;
-   info.si_code = SEGV_ACCERR;
ctx->ops->restart_dma(ctx);
+   force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *)ea,
+   current);
break;
case SPE_EVENT_DMA_ALIGNMENT:
-   info.si_signo = SIGBUS;
/* DAR isn't set for an alignment fault :( */
-   info.si_code = BUS_ADRALN;
+   force_sig_fault(SIGBUS, BUS_ADRALN, NULL, current);
break;
case SPE_EVENT_SPE_ERROR:
-   info.si_signo = SIGILL;
-   info.si_addr = (void __user *)(unsigned long)
-   ctx->ops->npc_read(ctx) - 4;
-   info.si_code = ILL_ILLOPC;
+   force_sig_fault(
+   SIGILL, ILL_ILLOPC,
+   

[REVIEW][PATCH 8/9] signal/powerpc: Simplify _exception_pkey by using force_sig_pkuerr

2018-09-18 Thread Eric W. Biederman
Call force_sig_pkuerr directly instead of rolling it by hand
in _exception_pkey.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/kernel/traps.c | 10 +-
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index e5ea69222459..ab1bd06d7c44 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -364,18 +364,10 @@ static bool exception_common(int signr, struct pt_regs *regs, int code,
 
 void _exception_pkey(struct pt_regs *regs, unsigned long addr, int key)
 {
-   siginfo_t info;
-
if (!exception_common(SIGSEGV, regs, SEGV_PKUERR, addr))
return;
 
-   clear_siginfo(&info);
-   info.si_signo = SIGSEGV;
-   info.si_code = SEGV_PKUERR;
-   info.si_addr = (void __user *) addr;
-   info.si_pkey = key;
-
-   force_sig_info(info.si_signo, &info, current);
+   force_sig_pkuerr((void __user *) addr, key);
 }
 
 void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
-- 
2.17.1



[REVIEW][PATCH 7/9] signal/powerpc: Specialize _exception_pkey for handling pkey exceptions

2018-09-18 Thread Eric W. Biederman
Now that _exception no longer calls _exception_pkey it is no longer
necessary to handle any signal with any si_code.  All pkey exceptions
are SIGSEGV with paired with SEGV_PKUERR.  So just handle
that case and remove the now unnecessary parameters from _exception_pkey.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/include/asm/bug.h |  2 +-
 arch/powerpc/kernel/traps.c| 10 +-
 arch/powerpc/mm/fault.c|  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index fd06dbe7d7d3..fed7e6241349 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -133,7 +133,7 @@ struct pt_regs;
 extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
 extern void bad_page_fault(struct pt_regs *, unsigned long, int);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
-extern void _exception_pkey(int, struct pt_regs *, int, unsigned long, int);
+extern void _exception_pkey(struct pt_regs *, unsigned long, int);
 extern void die(const char *, struct pt_regs *, long);
 extern bool die_will_crash(void);
 extern void panic_flush_kmsg_start(void);
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index c38bec51dd84..e5ea69222459 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -362,20 +362,20 @@ static bool exception_common(int signr, struct pt_regs *regs, int code,
return true;
 }
 
-void _exception_pkey(int signr, struct pt_regs *regs, int code, unsigned long addr, int key)
+void _exception_pkey(struct pt_regs *regs, unsigned long addr, int key)
 {
siginfo_t info;
 
-   if (!exception_common(signr, regs, code, addr))
+   if (!exception_common(SIGSEGV, regs, SEGV_PKUERR, addr))
return;
 
 	clear_siginfo(&info);
-   info.si_signo = signr;
-   info.si_code = code;
+   info.si_signo = SIGSEGV;
+   info.si_code = SEGV_PKUERR;
info.si_addr = (void __user *) addr;
info.si_pkey = key;
 
-   force_sig_info(signr, &info, current);
+   force_sig_info(info.si_signo, &info, current);
 }
 
 void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index a84d06b7d50d..406d0e0ef096 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -152,7 +152,7 @@ static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
if (!user_mode(regs))
return SIGSEGV;
 
-   _exception_pkey(SIGSEGV, regs, SEGV_PKUERR, address, pkey);
+   _exception_pkey(regs, address, pkey);
 
return 0;
 }
-- 
2.17.1



Re: [PATCH v2 05/17] compat_ioctl: move more drivers to generic_compat_ioctl_ptrarg

2018-09-18 Thread Jason Gunthorpe
On Tue, Sep 18, 2018 at 10:51:08AM -0700, Darren Hart wrote:
> On Fri, Sep 14, 2018 at 09:57:48PM +0100, Al Viro wrote:
> > On Fri, Sep 14, 2018 at 01:35:06PM -0700, Darren Hart wrote:
> >  
> > > Acked-by: Darren Hart (VMware) 
> > > 
> > > As for a longer term solution, would it be possible to init fops in such
> > > a way that the compat_ioctl call defaults to generic_compat_ioctl_ptrarg
> > > so we don't have to duplicate this boilerplate for every ioctl fops
> > > structure?
> > 
> > Bad idea, that...  Because several years down the road somebody will add
> > an ioctl that takes an unsigned int for argument.  Without so much as 
> > looking
> > at your magical mystery macro being used to initialize file_operations.
> 
> Fair, being explicit in the declaration as it is currently may be
> preferable then.

It would be much cleaner and safer if you could arrange things to add
something like this to struct file_operations:

  long (*ptr_ioctl) (struct file *, unsigned int, void __user *);

Where the core code automatically converts the unsigned long to the
void __user * as appropriate.

Then it just works right always and the compiler will help address
Al's concern down the road.

Cheers,
Jason
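
A sketch of the core-side dispatch that suggestion implies (ptr_ioctl is
Jason's proposed member, not an existing field; compat_ptr() is real):

static long vfs_compat_ptr_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	if (file->f_op->ptr_ioctl)
		return file->f_op->ptr_ioctl(file, cmd, compat_ptr(arg));

	return -ENOIOCTLCMD;
}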


[REVIEW][PATCH 6/9] signal/powerpc: Call force_sig_fault from _exception

2018-09-18 Thread Eric W. Biederman
The callers of _exception don't need the pkey exception logic because
they are not processing a pkey exception.  So just call exception_common
directly and then call force_sig_fault to generate the appropriate siginfo
and deliver the appropriate signal.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/kernel/traps.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f6c778b5144f..c38bec51dd84 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -380,7 +380,10 @@ void _exception_pkey(int signr, struct pt_regs *regs, int code, unsigned long addr, int key)
 
 void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
 {
-   _exception_pkey(signr, regs, code, addr, 0);
+   if (!exception_common(signr, regs, code, addr))
+   return;
+
+   force_sig_fault(signr, code, (void __user *)addr, current);
 }
 
 void system_reset_exception(struct pt_regs *regs)
-- 
2.17.1



[REVIEW][PATCH 5/9] signal/powerpc: Factor the common exception code into exception_common

2018-09-18 Thread Eric W. Biederman
It is brittle and wrong to populate si_pkey when there was not a pkey
exception.  The field does not exist for all si_codes and in some
cases another field exists in the same memory location.

So factor out the code that all exceptions handlers must run
into exception_common, leaving the individual exception handlers
to generate the signals themselves.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/kernel/traps.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f651fa91cdc9..f6c778b5144f 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -338,14 +338,12 @@ static void show_signal_msg(int signr, struct pt_regs *regs, int code,
show_user_instructions(regs);
 }
 
-void _exception_pkey(int signr, struct pt_regs *regs, int code,
-unsigned long addr, int key)
+static bool exception_common(int signr, struct pt_regs *regs, int code,
+ unsigned long addr)
 {
-   siginfo_t info;
-
if (!user_mode(regs)) {
die("Exception in kernel mode", regs, signr);
-   return;
+   return false;
}
 
show_signal_msg(signr, regs, code, addr);
@@ -361,6 +359,16 @@ void _exception_pkey(int signr, struct pt_regs *regs, int code,
 */
 	thread_pkey_regs_save(&current->thread);
 
+   return true;
+}
+
+void _exception_pkey(int signr, struct pt_regs *regs, int code, unsigned long addr, int key)
+{
+   siginfo_t info;
+
+   if (!exception_common(signr, regs, code, addr))
+   return;
+
 	clear_siginfo(&info);
info.si_signo = signr;
info.si_code = code;
-- 
2.17.1



[REVIEW][PATCH 4/9] signal/powerpc: Remove pkey parameter from __bad_area_nosemaphore

2018-09-18 Thread Eric W. Biederman
Now that bad_key_fault_exception no longer calls __bad_area_nosemaphore
there is no reason for __bad_area_nosemaphore to handle pkeys.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/mm/fault.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 5afc1ee55043..a84d06b7d50d 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -103,8 +103,7 @@ static bool store_updates_sp(unsigned int inst)
  */
 
 static int
-__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code,
-		       int pkey)
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
 {
/*
 * If we are in kernel mode, bail out with a SEGV, this will
@@ -114,14 +113,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code,
if (!user_mode(regs))
return SIGSEGV;
 
-   _exception_pkey(SIGSEGV, regs, si_code, address, pkey);
+   _exception(SIGSEGV, regs, si_code, address);
 
return 0;
 }
 
 static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
 {
-   return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0);
+   return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
 }
 
 static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
@@ -134,7 +133,7 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
 */
	up_read(&mm->mmap_sem);
 
-   return __bad_area_nosemaphore(regs, address, si_code, 0);
+   return __bad_area_nosemaphore(regs, address, si_code);
 }
 
 static noinline int bad_area(struct pt_regs *regs, unsigned long address)
-- 
2.17.1



[REVIEW][PATCH 3/9] signal/powerpc: Call _exception_pkey directly from bad_key_fault_exception

2018-09-18 Thread Eric W. Biederman
This removes the need for other code paths to deal with pkey exceptions.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/mm/fault.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index e5725fa96a48..5afc1ee55043 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -145,7 +145,17 @@ static noinline int bad_area(struct pt_regs *regs, unsigned long address)
 static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
int pkey)
 {
-   return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey);
+   /*
+* If we are in kernel mode, bail out with a SEGV, this will
+* be caught by the assembly which will restore the non-volatile
+* registers before calling bad_page_fault()
+*/
+   if (!user_mode(regs))
+   return SIGSEGV;
+
+   _exception_pkey(SIGSEGV, regs, SEGV_PKUERR, address, pkey);
+
+   return 0;
 }
 
 static noinline int bad_access(struct pt_regs *regs, unsigned long address)
-- 
2.17.1



[REVIEW][PATCH 2/9] signal/powerpc: Remove pkey parameter from __bad_area

2018-09-18 Thread Eric W. Biederman
There are no callers of __bad_area that pass in a pkey parameter so it makes
no sense to take one.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/mm/fault.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 22d7f8748cd7..e5725fa96a48 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -124,8 +124,7 @@ static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long add
return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0);
 }
 
-static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
-   int pkey)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
 {
struct mm_struct *mm = current->mm;
 
@@ -135,12 +134,12 @@ static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
 */
	up_read(&mm->mmap_sem);
 
-   return __bad_area_nosemaphore(regs, address, si_code, pkey);
+   return __bad_area_nosemaphore(regs, address, si_code, 0);
 }
 
 static noinline int bad_area(struct pt_regs *regs, unsigned long address)
 {
-   return __bad_area(regs, address, SEGV_MAPERR, 0);
+   return __bad_area(regs, address, SEGV_MAPERR);
 }
 
 static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
@@ -151,7 +150,7 @@ static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
 
 static noinline int bad_access(struct pt_regs *regs, unsigned long address)
 {
-   return __bad_area(regs, address, SEGV_ACCERR, 0);
+   return __bad_area(regs, address, SEGV_ACCERR);
 }
 
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
-- 
2.17.1



[REVIEW][PATCH 1/9] signal/powerpc: Use force_sig_mceerr as appropriate

2018-09-18 Thread Eric W. Biederman
In do_sigbus isolate the mceerr signaling code and call
force_sig_mceerr instead of falling through to the force_sig_info that
works for all of the other signals.

Signed-off-by: "Eric W. Biederman" 
---
 arch/powerpc/mm/fault.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index d51cf5f4e45e..22d7f8748cd7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -158,7 +158,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 vm_fault_t fault)
 {
siginfo_t info;
-   unsigned int lsb = 0;
 
if (!user_mode(regs))
return SIGBUS;
@@ -171,17 +170,22 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
info.si_addr = (void __user *)address;
 #ifdef CONFIG_MEMORY_FAILURE
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
+   unsigned int lsb = 0; /* shutup gcc */
+
pr_err("MCE: Killing %s:%d due to hardware memory corruption 
fault at %lx\n",
current->comm, current->pid, address);
-   info.si_code = BUS_MCEERR_AR;
+
+   if (fault & VM_FAULT_HWPOISON_LARGE)
+   lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
+   if (fault & VM_FAULT_HWPOISON)
+   lsb = PAGE_SHIFT;
+
+   force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb,
+current);
+   return 0;
}
 
-   if (fault & VM_FAULT_HWPOISON_LARGE)
-   lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
-   if (fault & VM_FAULT_HWPOISON)
-   lsb = PAGE_SHIFT;
 #endif
-   info.si_addr_lsb = lsb;
	force_sig_info(SIGBUS, &info, current);
return 0;
 }
-- 
2.17.1



Re: [PATCH v2 05/17] compat_ioctl: move more drivers to generic_compat_ioctl_ptrarg

2018-09-18 Thread Darren Hart
On Fri, Sep 14, 2018 at 09:57:48PM +0100, Al Viro wrote:
> On Fri, Sep 14, 2018 at 01:35:06PM -0700, Darren Hart wrote:
>  
> > Acked-by: Darren Hart (VMware) 
> > 
> > As for a longer term solution, would it be possible to init fops in such
> > a way that the compat_ioctl call defaults to generic_compat_ioctl_ptrarg
> > so we don't have to duplicate this boilerplate for every ioctl fops
> > structure?
> 
>   Bad idea, that...  Because several years down the road somebody will add
> an ioctl that takes an unsigned int for argument.  Without so much as looking
> at your magical mystery macro being used to initialize file_operations.

Fair enough; being explicit in the declaration, as it is currently, may
be preferable then.
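
To make the hazard concrete, a hypothetical initializer macro (no such
macro exists today; generic_compat_ioctl_ptrarg is the helper from this
series) might look like the sketch below, and would silently misroute
any future ioctl whose argument is a plain integer through compat_ptr():

/* Hypothetical sketch only - not an existing kernel interface. */
#define DEFINE_FOPS_WITH_COMPAT(name, ioctl_fn)			\
	static const struct file_operations name = {		\
		.owner		= THIS_MODULE,			\
		.unlocked_ioctl	= ioctl_fn,			\
		/* assumes every ioctl argument is a pointer */	\
		.compat_ioctl	= generic_compat_ioctl_ptrarg,	\
	}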

-- 
Darren Hart
VMware Open Source Technology Center


Re: [PATCH v2 07/24] powerpc: handover page flags with a pgprot_t parameter

2018-09-18 Thread Christophe LEROY

Christophe Leroy  writes:


In order to avoid multiple conversions, handover directly a
pgprot_t to map_kernel_page() as already done for radix.

Do the same for __ioremap_caller() and __ioremap_at().



Reviewed-by: Aneesh Kumar K.V 


Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  2 +-
 arch/powerpc/include/asm/book3s/64/hash.h|  3 +--
 arch/powerpc/include/asm/book3s/64/pgtable.h |  7 +++---
 arch/powerpc/include/asm/fixmap.h|  2 +-
 arch/powerpc/include/asm/io.h|  4 +--
 arch/powerpc/include/asm/machdep.h   |  2 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h |  2 +-
 arch/powerpc/include/asm/nohash/64/pgtable.h |  3 +--
 arch/powerpc/kernel/io-workarounds.c |  4 +--
 arch/powerpc/kernel/isa-bridge.c |  6 ++---
 arch/powerpc/kernel/pci_64.c |  2 +-
 arch/powerpc/lib/code-patching.c |  3 +--
 arch/powerpc/mm/8xx_mmu.c|  3 +--
 arch/powerpc/mm/dma-noncoherent.c|  2 +-
 arch/powerpc/mm/mem.c|  4 +--
 arch/powerpc/mm/pgtable-book3e.c |  9 +++
 arch/powerpc/mm/pgtable-hash64.c |  7 +++---
 arch/powerpc/mm/pgtable_32.c | 37 +---
 arch/powerpc/mm/pgtable_64.c | 37 ++--
 drivers/pcmcia/electra_cf.c  |  2 +-
 20 files changed, 64 insertions(+), 77 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 751cf931bb3f..7a9f0ed599ff 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -292,7 +292,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) >> 3 })
 #define __swp_entry_to_pte(x)  ((pte_t) { (x).val << 3 })
 
-int map_kernel_page(unsigned long va, phys_addr_t pa, int flags);
+int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
 
 /* Generic accessors to PTE bits */

 static inline int pte_write(pte_t pte)	{ return !!(pte_val(pte) & _PAGE_RW);}
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index d52a51b2ce7b..62b8b89e24c0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -196,8 +196,7 @@ static inline void hpte_do_hugepage_flush(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 
-extern int hash__map_kernel_page(unsigned long ea, unsigned long pa,
-				 unsigned long flags);
+int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot);
 extern int __meminit hash__vmemmap_create_mapping(unsigned long start,
  unsigned long page_size,
  unsigned long phys);
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 13a688fc8cd0..91999cd2deb9 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1020,17 +1020,16 @@ extern struct page *pgd_page(pgd_t pgd);
 #define pgd_ERROR(e) \
pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
-static inline int map_kernel_page(unsigned long ea, unsigned long pa,
-				  unsigned long flags)
+static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot)
 {
if (radix_enabled()) {
 #if defined(CONFIG_PPC_RADIX_MMU) && defined(DEBUG_VM)
		unsigned long page_size = 1 << mmu_psize_defs[mmu_io_psize].shift;
WARN((page_size != PAGE_SIZE), "I/O page size != PAGE_SIZE");
 #endif
-		return radix__map_kernel_page(ea, pa, __pgprot(flags), PAGE_SIZE);
+   return radix__map_kernel_page(ea, pa, prot, PAGE_SIZE);
}
-   return hash__map_kernel_page(ea, pa, flags);
+   return hash__map_kernel_page(ea, pa, prot);
 }
 
 static inline int __meminit vmemmap_create_mapping(unsigned long start,

diff --git a/arch/powerpc/include/asm/fixmap.h 
b/arch/powerpc/include/asm/fixmap.h
index 41cc15c14eee..b9fbed84ddca 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -72,7 +72,7 @@ enum fixed_addresses {
 static inline void __set_fixmap(enum fixed_addresses idx,
phys_addr_t phys, pgprot_t flags)
 {
-   map_kernel_page(fix_to_virt(idx), phys, pgprot_val(flags));
+   map_kernel_page(fix_to_virt(idx), phys, flags);
 }
 
 #endif /* !__ASSEMBLY__ */

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index cdccab3938db..0a034519957d 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -786,12 +786,12 

[PATCH v4 20/20] powerpc/8xx: set GUARDED attribute in the PMD directly

2018-09-18 Thread Christophe Leroy
On the 8xx, the GUARDED attribute of the pages is managed in the
L1 entry; therefore, to avoid having to copy it into the L1 entry
at each TLB miss, we set it in the PMD.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 3 ++-
 arch/powerpc/kernel/head_8xx.S   | 9 -
 arch/powerpc/platforms/Kconfig.cputype   | 1 +
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 8c9872d93257..20d4c1c04726 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -62,10 +62,11 @@
 
 #define _PMD_PRESENT   0x0001
 #define _PMD_PRESENT_MASK  _PMD_PRESENT
-#define _PMD_BAD   0x0fd0
+#define _PMD_BAD   0x0fc0
 #define _PMD_PAGE_MASK 0x000c
 #define _PMD_PAGE_8M   0x000c
 #define _PMD_PAGE_512K 0x0004
+#define _PMD_GUARDED   0x0010
 #define _PMD_USER  0x0020  /* APG 1 */
 
 #define _PTE_NONE_MASK 0
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 3e38af7489a9..89974c938617 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -452,15 +452,6 @@ DataStoreTLBMiss:
mfspr   r10, SPRN_MD_TWC
lwz r10, 0(r10) /* Get the pte */
 
-   /* Insert the Guarded flag into the TWC from the Linux PTE.
-* It is bit 27 of both the Linux PTE and the TWC (at least
-* I got that right :-).  It will be better when we can put
-* this into the Linux pgd/pmd and load it in the operation
-* above.
-*/
-   rlwimi  r11, r10, 0, _PAGE_GUARDED
-   mtspr   SPRN_MD_TWC, r11
-
/* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set.
 * We also need to know if the insn is a load/store, so:
 * Clear _PAGE_PRESENT and load that which will
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index d0984546fbec..c92d084a5a23 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -34,6 +34,7 @@ config PPC_8xx
bool "Freescale 8xx"
select FSL_SOC
select SYS_SUPPORTS_HUGETLBFS
+   select PPC_PMD_GUARDED
 
 config 40x
bool "AMCC 40x"
-- 
2.13.3



[PATCH v4 19/20] powerpc/nohash32: allow setting GUARDED attribute in the PMD directly

2018-09-18 Thread Christophe Leroy
On the 8xx, the GUARDED attribute of the pages is managed in the
L1 entry; therefore, to avoid having to copy it into the L1 entry
at each TLB miss, we have to set it in the PMD.

In order to allow this, this patch splits the VM alloc space into two
parts: one for VM alloc and non-guarded IO, and one for guarded IO.
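
For illustration, with example boundaries (the real values depend on
the platform and on high_memory; all EX_* names below are assumptions
for the sketch, not values from the patch), the split computed by the
new IOREMAP_BASE definition works out as follows:

/* Example only: assumed boundaries, not taken from a real board. */
#define EX_VMALLOC_START	0xd0000000UL
#define EX_IOREMAP_TOP		0xf0000000UL
#define EX_PGDIR_SIZE		(1UL << 22)	/* 4M per PGD entry */
#define EX_ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

/* Half-way point rounded up to a PGD boundary: 0xe0000000. Below it:
 * vmalloc() and non-guarded ioremap; above it: guarded ioremap, so
 * _PMD_GUARDED can be set once per PMD entry.
 */
#define EX_IOREMAP_BASE	EX_ALIGN_UP(EX_VMALLOC_START + \
		(EX_IOREMAP_TOP - EX_VMALLOC_START) / 2, EX_PGDIR_SIZE)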

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/pgtable.h |  2 +
 arch/powerpc/include/asm/nohash/32/pgalloc.h |  8 
 arch/powerpc/include/asm/nohash/32/pgtable.h | 19 -
 arch/powerpc/mm/dump_linuxpagetables.c   | 21 +-
 arch/powerpc/mm/mem.c|  7 
 arch/powerpc/mm/pgtable_32.c | 60 
 arch/powerpc/platforms/Kconfig.cputype   |  2 +
 7 files changed, 108 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 7a8a590f6b4c..28001d5eaa89 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -156,6 +156,8 @@ static inline bool pte_user(pte_t pte)
 #define IOREMAP_TOP	KVIRT_TOP
 #endif
 
+#define IOREMAP_BASE   VMALLOC_START
+
 /*
  * Just any arbitrary offset to the start of the vmalloc VM area: the
  * current 16MB value just means that there will be a 64MB "hole" after the
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h 
b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index e69423ad8e2e..7d8de0b73aad 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -58,6 +58,14 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
*pmdp = __pmd(__pa(pte) | _PMD_PRESENT);
 }
 
+#ifdef CONFIG_PPC_PMD_GUARDED
+static inline void pmd_populate_kernel_g(struct mm_struct *mm, pmd_t *pmdp,
+pte_t *pte)
+{
+   *pmdp = __pmd(__pa(pte) | _PMD_PRESENT | _PMD_GUARDED);
+}
+#endif
+
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pte_page)
 {
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 6f2b35af7a28..9a328eda89a5 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -81,9 +81,14 @@ extern int icache_44x_need_flush;
  * virtual space that goes below PKMAP and FIXMAP
  */
 #ifdef CONFIG_HIGHMEM
-#define KVIRT_TOP  PKMAP_BASE
+#define _KVIRT_TOP PKMAP_BASE
 #else
-#define KVIRT_TOP	(0xfe000000UL)	/* for now, could be FIXMAP_BASE ? */
+#define _KVIRT_TOP	(0xfe000000UL)	/* for now, could be FIXMAP_BASE ? */
+#endif
+#ifdef CONFIG_PPC_PMD_GUARDED
+#define KVIRT_TOP  _ALIGN_DOWN(_KVIRT_TOP, PGDIR_SIZE)
+#else
+#define KVIRT_TOP  _KVIRT_TOP
 #endif
 
 /*
@@ -96,6 +101,12 @@ extern int icache_44x_need_flush;
 #else
 #define IOREMAP_TOP	KVIRT_TOP
 #endif
+#ifdef CONFIG_PPC_PMD_GUARDED
+#define IOREMAP_BASE	_ALIGN_UP(VMALLOC_START + (IOREMAP_TOP - VMALLOC_START) / 2, \
+			  PGDIR_SIZE)
+#else
+#define IOREMAP_BASE   VMALLOC_START
+#endif
 
 /*
  * Just any arbitrary offset to the start of the vmalloc VM area: the
@@ -120,7 +131,11 @@ extern int icache_44x_need_flush;
 #else
 #define VMALLOC_START ((((long)high_memory + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1)))
 #endif
+#ifdef CONFIG_PPC_PMD_GUARDED
+#define VMALLOC_END	IOREMAP_BASE
+#else
 #define VMALLOC_END	ioremap_bot
+#endif
 
 /*
  * Bits in a linux-style PTE.  These match the bits in the
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c 
b/arch/powerpc/mm/dump_linuxpagetables.c
index e60aa6d7456d..105d0118f735 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -76,9 +76,9 @@ struct addr_marker {
 
 static struct addr_marker address_markers[] = {
{ 0,"Start of kernel VM" },
+#ifdef CONFIG_PPC64
{ 0,"vmalloc() Area" },
{ 0,"vmalloc() End" },
-#ifdef CONFIG_PPC64
{ 0,"isa I/O start" },
{ 0,"isa I/O end" },
{ 0,"phb I/O start" },
@@ -87,8 +87,19 @@ static struct addr_marker address_markers[] = {
{ 0,"I/O remap end" },
{ 0,"vmemmap start" },
 #else
+#ifdef CONFIG_PPC_PMD_GUARDED
+   { 0,"vmalloc() Area" },
+   { 0,"vmalloc() End" },
+   { 0,"Early I/O remap start" },
+   { 0,"Early I/O remap end" },
+   { 0,"I/O remap start" },
+   { 0,"I/O remap end" },
+#else
{ 0,"Early I/O remap start" },
{ 0,"Early I/O remap end" },
+   { 0,"vmalloc() I/O remap start" },
+   { 0,"vmalloc() I/O remap end" },
+#endif
 #ifdef CONFIG_NOT_COHERENT_CACHE
{ 0,"Consistent mem start" },
{ 0,"Consistent mem end" },
@@ -286,9 +297,9 @@ static void populate_markers(void)
int i = 0;
 

[PATCH v4 18/20] powerpc/mm: reintroduce 16K pages with HW assistance on 8xx

2018-09-18 Thread Christophe Leroy
Using this HW assistance implies some constraints on the
page table structure:
- Regardless of the main page size used (4k or 16k), the
level 1 table (PGD) contains 1024 entries and each PGD entry covers
a 4Mbytes area which is managed by a level 2 table (PTE) containing
also 1024 entries each describing a 4k page.
- 16k pages require 4 identical entries in the L2 table
- 512k pages PTE have to be spread every 128 bytes in the L2 table
- 8M pages PTE are at the address pointed by the L1 entry and each
8M page requires 2 identical entries in the PGD.

In order to use hardware assistance with 16K pages, this patch does
the following modifications:
- Make PGD size independent of the main page size
- In 16k pages mode, redefine pte_t as a struct with 4 elements,
and populate those 4 elements in __set_pte_at() and pte_update()
- Adapt the size of the hugepage tables.
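
As a minimal sketch derived from the hunks below (types reduced to the
essentials), this is how one logical 16k PTE ends up replicated into
the four 4k hardware slots:

/* Reduced model of the 4-word pte_t introduced by this patch. */
typedef unsigned long pte_basic_t;
typedef struct { pte_basic_t pte, pte1, pte2, pte3; } pte_16k_t;

/* 16k / 4k = 4: the HW walks 4k slots, so all four must match. */
static inline void set_pte_16k(pte_16k_t *p, pte_basic_t val)
{
	p->pte = p->pte1 = p->pte2 = p->pte3 = val;
}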

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig |  2 +-
 arch/powerpc/include/asm/nohash/32/pgtable.h | 19 ++-
 arch/powerpc/include/asm/nohash/pgtable.h|  4 
 arch/powerpc/include/asm/pgtable-types.h |  4 
 4 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 33931804c46f..a80669209155 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -698,7 +698,7 @@ config PPC_4K_PAGES
 
 config PPC_16K_PAGES
bool "16k page size"
-   depends on 44x
+   depends on 44x || PPC_8xx
 
 config PPC_64K_PAGES
bool "64k page size"
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h 
b/arch/powerpc/include/asm/nohash/32/pgtable.h
index 73e2b1fbdb36..6f2b35af7a28 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -19,7 +19,14 @@ extern int icache_44x_need_flush;
 
 #endif /* __ASSEMBLY__ */
 
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+#define PTE_INDEX_SIZE  (PTE_SHIFT - 2)
+#define PTE_FRAG_NR4
+#define PTE_FRAG_SIZE_SHIFT12
+#define PTE_FRAG_SIZE  (1UL << PTE_FRAG_SIZE_SHIFT)
+#else
 #define PTE_INDEX_SIZE PTE_SHIFT
+#endif
 #define PMD_INDEX_SIZE 0
 #define PUD_INDEX_SIZE 0
 #define PGD_INDEX_SIZE (32 - PGDIR_SHIFT)
@@ -48,7 +55,11 @@ extern int icache_44x_need_flush;
  * -Matt
  */
 /* PGDIR_SHIFT determines what a top-level page table entry can map */
+#ifdef CONFIG_PPC_8xx
+#define PGDIR_SHIFT22
+#else
 #define PGDIR_SHIFT(PAGE_SHIFT + PTE_INDEX_SIZE)
+#endif
 #define PGDIR_SIZE (1UL << PGDIR_SHIFT)
 #define PGDIR_MASK (~(PGDIR_SIZE-1))
 
@@ -229,7 +240,13 @@ static inline unsigned long pte_update(pte_t *p,
: "cc" );
 #else /* PTE_ATOMIC_UPDATES */
unsigned long old = pte_val(*p);
-   *p = __pte((old & ~clr) | set);
+   unsigned long new = (old & ~clr) | set;
+
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+   p->pte = p->pte1 = p->pte2 = p->pte3 = new;
+#else
+   *p = __pte(new);
+#endif
 #endif /* !PTE_ATOMIC_UPDATES */
 
 #ifdef CONFIG_44x
diff --git a/arch/powerpc/include/asm/nohash/pgtable.h 
b/arch/powerpc/include/asm/nohash/pgtable.h
index aa968d87337b..883f69e6cdf7 100644
--- a/arch/powerpc/include/asm/nohash/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/pgtable.h
@@ -204,7 +204,11 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
/* Anything else just stores the PTE normally. That covers all 64-bit
 * cases, and 32-bit non-hash with 32-bit PTEs.
 */
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+   ptep->pte = ptep->pte1 = ptep->pte2 = ptep->pte3 = pte_val(pte);
+#else
*ptep = pte;
+#endif
 
/*
 * With hardware tablewalk, a sync is needed to ensure that
diff --git a/arch/powerpc/include/asm/pgtable-types.h 
b/arch/powerpc/include/asm/pgtable-types.h
index eccb30b38b47..3b0edf041b2e 100644
--- a/arch/powerpc/include/asm/pgtable-types.h
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -3,7 +3,11 @@
 #define _ASM_POWERPC_PGTABLE_TYPES_H
 
 /* PTE level */
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+typedef struct { pte_basic_t pte, pte1, pte2, pte3; } pte_t;
+#else
 typedef struct { pte_basic_t pte; } pte_t;
+#endif
 #define __pte(x)   ((pte_t) { (x) })
 static inline pte_basic_t pte_val(pte_t x)
 {
-- 
2.13.3



[PATCH v4 17/20] powerpc/8xx: Remove PTE_ATOMIC_UPDATES

2018-09-18 Thread Christophe Leroy
commit 1bc54c03117b9 ("powerpc: rework 4xx PTE access and TLB miss")
introduced non atomic PTE updates and started the work of removing
PTE updates in TLB miss handlers, but kept PTE_ATOMIC_UPDATES for the
8xx with the following comment:
/* Until my rework is finished, 8xx still needs atomic PTE updates */

commit fe11dc3f9628e ("powerpc/8xx: Update TLB asm so it behaves as
linux mm expects") removed all PTE updates done in TLB miss handlers.

Therefore, atomic PTE updates are not needed anymore for the 8xx.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/pte-8xx.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h 
b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
index 1c57efac089d..8c9872d93257 100644
--- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h
@@ -70,9 +70,6 @@
 
 #define _PTE_NONE_MASK 0
 
-/* Until my rework is finished, 8xx still needs atomic PTE updates */
-#define PTE_ATOMIC_UPDATES 1
-
 #ifdef CONFIG_PPC_16K_PAGES
 #define _PAGE_PSIZE_PAGE_SPS
 #else
-- 
2.13.3



[PATCH v4 16/20] powerpc/mm: Extend pte_fragment functionality to nohash/32

2018-09-18 Thread Christophe Leroy
In order to allow the 8xx to handle pte_fragments, this patch
extends the use of pte_fragments to nohash/32 platforms.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-40x.h   |  1 +
 arch/powerpc/include/asm/mmu-44x.h   |  1 +
 arch/powerpc/include/asm/mmu-8xx.h   |  1 +
 arch/powerpc/include/asm/mmu-book3e.h|  1 +
 arch/powerpc/include/asm/mmu_context.h   |  2 +-
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 43 +++-
 arch/powerpc/include/asm/nohash/32/pgtable.h |  7 +++--
 arch/powerpc/include/asm/page.h  |  6 +---
 arch/powerpc/include/asm/pgtable.h   |  8 ++
 arch/powerpc/mm/Makefile |  3 ++
 arch/powerpc/mm/mmu_context_nohash.c |  1 +
 arch/powerpc/mm/pgtable-frag.c   |  6 
 arch/powerpc/mm/pgtable_32.c |  8 --
 13 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-40x.h 
b/arch/powerpc/include/asm/mmu-40x.h
index 74f4edb5916e..7c77ceed71d6 100644
--- a/arch/powerpc/include/asm/mmu-40x.h
+++ b/arch/powerpc/include/asm/mmu-40x.h
@@ -58,6 +58,7 @@ typedef struct {
unsigned intid;
unsigned intactive;
unsigned long   vdso_base;
+   void *pte_frag;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-44x.h 
b/arch/powerpc/include/asm/mmu-44x.h
index 295b3dbb2698..3d72e889ae7b 100644
--- a/arch/powerpc/include/asm/mmu-44x.h
+++ b/arch/powerpc/include/asm/mmu-44x.h
@@ -109,6 +109,7 @@ typedef struct {
unsigned intid;
unsigned intactive;
unsigned long   vdso_base;
+   void *pte_frag;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index fa05aa566ece..750cef6f65e3 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -179,6 +179,7 @@ typedef struct {
unsigned int id;
unsigned int active;
unsigned long vdso_base;
+   void *pte_frag;
 #ifdef CONFIG_PPC_MM_SLICES
u16 user_psize; /* page size index */
unsigned char low_slices_psize[SLICE_ARRAY_SIZE];
diff --git a/arch/powerpc/include/asm/mmu-book3e.h 
b/arch/powerpc/include/asm/mmu-book3e.h
index e20072972e35..8e8aad5172ab 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -230,6 +230,7 @@ typedef struct {
unsigned intid;
unsigned intactive;
unsigned long   vdso_base;
+   void *pte_frag;
 } mm_context_t;
 
 /* Page size definitions, common between 32 and 64-bit
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index b2f89b621b15..7f2c37a3f99d 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -222,7 +222,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm,
return 0;
 }
 
-#ifndef CONFIG_PPC_BOOK3S_64
+#if defined(CONFIG_PPC_BOOK3E_64) || defined(CONFIG_PPC_BOOK3S_32)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 }
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h 
b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index f3fec9052f31..e69423ad8e2e 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -27,6 +27,9 @@ extern void __bad_pte(pmd_t *pmd);
 extern struct kmem_cache *pgtable_cache[];
 #define PGT_CACHE(shift) pgtable_cache[shift]
 
+pte_t *pte_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr, int kernel);
+void pte_fragment_free(unsigned long *table, int kernel);
+
 static inline pgd_t *pgd_alloc(struct mm_struct *mm)
 {
return kmem_cache_alloc(PGT_CACHE(PGD_INDEX_SIZE),
@@ -58,11 +61,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pte_page)
 {
-   *pmdp = __pmd((page_to_pfn(pte_page) << PAGE_SHIFT) | _PMD_USER |
- _PMD_PRESENT);
+   *pmdp = __pmd(__pa(pte_page) | _PMD_USER | _PMD_PRESENT);
 }
 
-#define pmd_pgtable(pmd) pmd_page(pmd)
+#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
 #else
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
@@ -74,49 +76,38 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pte_page)
 {
-	*pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
+   *pmdp = __pmd((unsigned long)pte_page | _PMD_PRESENT);
 }
 
-#define pmd_pgtable(pmd) pmd_page(pmd)
+#define pmd_pgtable(pmd) ((pgtable_t)pmd_page_vaddr(pmd))
 #endif
 
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned 

[PATCH v4 15/20] powerpc/mm: Avoid useless lock with single page fragments

2018-09-18 Thread Christophe Leroy
There is no point in taking the page table lock as
pte_frag is always NULL when we have only one fragment.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/pgtable-frag.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c
index bc924822dcd6..ab4910e92aaf 100644
--- a/arch/powerpc/mm/pgtable-frag.c
+++ b/arch/powerpc/mm/pgtable-frag.c
@@ -85,6 +85,9 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm)
 {
void *pte_frag, *ret;
 
+   if (PTE_FRAG_NR == 1)
+   return NULL;
+
	spin_lock(&mm->page_table_lock);
ret = mm->context.pte_frag;
if (ret) {
-- 
2.13.3



[PATCH v4 14/20] powerpc/mm: Move pte_fragment_alloc() to a common location

2018-09-18 Thread Christophe Leroy
In preparation of the next patch, which generalises the use of
pte_fragment_alloc() for all, this patch moves the related functions
to a place that is common to all subarches.

The 8xx will need that for supporting 16k pages, as in that mode
page tables still have a size of 4k.

Since a pte_fragment setup with only one fragment is not different
from what is done in the general case, we can easily migrate all
subarchs to pte fragments.
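
For readers new to pte fragments, here is a small sketch of the
bookkeeping being moved (the geometry below is an example; each
platform defines its own PTE_FRAG_SIZE_SHIFT and PTE_FRAG_NR):

/* Example geometry: a 64k page carved into 16 fragments of 4k. */
#define EX_PAGE_SHIFT		16
#define EX_PAGE_SIZE		(1UL << EX_PAGE_SHIFT)
#define EX_FRAG_SIZE_SHIFT	12
#define EX_FRAG_NR		(EX_PAGE_SIZE >> EX_FRAG_SIZE_SHIFT)

/* mm->context.pte_frag points at the next free fragment inside the
 * page, so the offset within the page encodes how many fragments
 * have already been handed out.
 */
static inline int ex_frags_consumed(void *pte_frag)
{
	return ((unsigned long)pte_frag & (EX_PAGE_SIZE - 1))
		>> EX_FRAG_SIZE_SHIFT;
}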

For the time being, it is only a code move. We enclose it inside

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/Makefile   |   4 +-
 arch/powerpc/mm/mmu_context.c  |   1 -
 arch/powerpc/mm/mmu_context_book3s64.c |  67 -
 arch/powerpc/mm/pgtable-book3s64.c |  85 -
 arch/powerpc/mm/pgtable-frag.c | 167 +
 5 files changed, 170 insertions(+), 154 deletions(-)
 create mode 100644 arch/powerpc/mm/pgtable-frag.c

diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3c844bdd16c4..bd43b3ee52cb 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 obj-$(CONFIG_PPC_BOOK3E)   += tlb_low_$(BITS)e.o
 hash64-$(CONFIG_PPC_NATIVE):= hash_native_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)   += pgtable-book3e.o
-obj-$(CONFIG_PPC_BOOK3S_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
+obj-$(CONFIG_PPC_BOOK3S_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o \
+				   $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o \
+				   pgtable-frag.o
 obj-$(CONFIG_PPC_RADIX_MMU)+= pgtable-radix.o tlb-radix.o
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
 obj-$(CONFIG_PPC_STD_MMU)  += tlb_hash$(BITS).o
diff --git a/arch/powerpc/mm/mmu_context.c b/arch/powerpc/mm/mmu_context.c
index f84e14f23e50..b89e7dcc14cc 100644
--- a/arch/powerpc/mm/mmu_context.c
+++ b/arch/powerpc/mm/mmu_context.c
@@ -96,4 +96,3 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 */
switch_mmu_context(prev, next, tsk);
 }
-
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
b/arch/powerpc/mm/mmu_context_book3s64.c
index dbd8f762140b..417b0cb67584 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -155,50 +155,6 @@ static void destroy_contexts(mm_context_t *ctx)
}
 }
 
-static void pte_frag_destroy(void *pte_frag)
-{
-   int count;
-   struct page *page;
-
-   page = virt_to_page(pte_frag);
-   /* drop all the pending references */
-   count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT;
-   /* We allow PTE_FRAG_NR fragments from a PTE page */
-	if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) {
-   pgtable_page_dtor(page);
-   __free_page(page);
-   }
-}
-
-static void pmd_frag_destroy(void *pmd_frag)
-{
-   int count;
-   struct page *page;
-
-   page = virt_to_page(pmd_frag);
-   /* drop all the pending references */
-   count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT;
-   /* We allow PTE_FRAG_NR fragments from a PTE page */
-	if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) {
-   pgtable_pmd_page_dtor(page);
-   __free_page(page);
-   }
-}
-
-static void destroy_pagetable_cache(struct mm_struct *mm)
-{
-   void *frag;
-
-   frag = mm->context.pte_frag;
-   if (frag)
-   pte_frag_destroy(frag);
-
-   frag = mm->context.pmd_frag;
-   if (frag)
-   pmd_frag_destroy(frag);
-   return;
-}
-
 void destroy_context(struct mm_struct *mm)
 {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
@@ -212,29 +168,6 @@ void destroy_context(struct mm_struct *mm)
mm->context.id = MMU_NO_CONTEXT;
 }
 
-void arch_exit_mmap(struct mm_struct *mm)
-{
-   destroy_pagetable_cache(mm);
-
-   if (radix_enabled()) {
-   /*
-* Radix doesn't have a valid bit in the process table
-* entries. However we know that at least P9 implementation
-* will avoid caching an entry with an invalid RTS field,
-* and 0 is invalid. So this will do.
-*
-* This runs before the "fullmm" tlb flush in exit_mmap,
-* which does a RIC=2 tlbie to clear the process table
-* entry. See the "fullmm" comments in tlb-radix.c.
-*
-* No barrier required here after the store because
-* this process will do the invalidate, which starts with
-* ptesync.
-*/
-   process_tb[mm->context.id].prtb0 = 0;
-   }
-}
-
 #ifdef CONFIG_PPC_RADIX_MMU
 void 

[PATCH v4 13/20] powerpc/book3s32: Remove CONFIG_BOOKE dependent code

2018-09-18 Thread Christophe Leroy
BOOK3S/32 cannot be BOOKE, so remove the useless code.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 18 --
 arch/powerpc/include/asm/book3s/32/pgtable.h | 14 --
 2 files changed, 32 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 701748132442..2639b4b7d67c 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -47,8 +47,6 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 #define __pmd_free_tlb(tlb,x,a)	do { } while (0)
 /* #define pgd_populate(mm, pmd, pte)  BUG() */
 
-#ifndef CONFIG_BOOKE
-
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
   pte_t *pte)
 {
@@ -62,22 +60,6 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
 }
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
-#else
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp,
-  pte_t *pte)
-{
-   *pmdp = __pmd((unsigned long)pte | _PMD_PRESENT);
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
-   pgtable_t pte_page)
-{
-	*pmdp = __pmd((unsigned long)lowmem_page_address(pte_page) | _PMD_PRESENT);
-}
-
-#define pmd_pgtable(pmd) pmd_page(pmd)
-#endif
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h 
b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 5ffb7e3b211f..7a8a590f6b4c 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -334,24 +334,10 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)  (((pte_val(A) ^ pte_val(B)) & ~_PAGE_HASHPTE) == 0)
 
-/*
- * Note that on Book E processors, the pmd contains the kernel virtual
- * (lowmem) address of the pte page.  The physical address is less useful
- * because everything runs with translation enabled (even the TLB miss
- * handler).  On everything else the pmd contains the physical address
- * of the pte page.  -- paulus
- */
-#ifndef CONFIG_BOOKE
 #define pmd_page_vaddr(pmd)\
((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
 #define pmd_page(pmd)  \
pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
-#else
-#define pmd_page_vaddr(pmd)\
-   ((unsigned long) (pmd_val(pmd) & PAGE_MASK))
-#define pmd_page(pmd)  \
-   pfn_to_page((__pa(pmd_val(pmd)) >> PAGE_SHIFT))
-#endif
 
 /* to find an entry in a kernel page-table-directory */
 #define pgd_offset_k(address) pgd_offset(&init_mm, address)
-- 
2.13.3



[PATCH v4 12/20] powerpc/mm: inline pte_alloc_one() and pte_alloc_one_kernel() in PPC32

2018-09-18 Thread Christophe Leroy
As in PPC64, inline pte_alloc_one() and pte_alloc_one_kernel()
in PPC32. This will allow switching nohash/32 to pte_fragment
without impacting hash/32.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/book3s/32/pgalloc.h | 22 --
 arch/powerpc/include/asm/nohash/32/pgalloc.h | 22 --
 arch/powerpc/mm/pgtable_32.c | 21 -
 3 files changed, 40 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h 
b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index 96138ab3ddd6..701748132442 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -79,8 +79,26 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
 #define pmd_pgtable(pmd) pmd_page(pmd)
 #endif
 
-extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+   return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+   struct page *ptepage;
+
+   gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT;
+
+   ptepage = alloc_pages(flags, 0);
+   if (!ptepage)
+   return NULL;
+   if (!pgtable_page_ctor(ptepage)) {
+   __free_page(ptepage);
+   return NULL;
+   }
+   return ptepage;
+}
 
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
diff --git a/arch/powerpc/include/asm/nohash/32/pgalloc.h 
b/arch/powerpc/include/asm/nohash/32/pgalloc.h
index 6fbbb90043c0..f3fec9052f31 100644
--- a/arch/powerpc/include/asm/nohash/32/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/32/pgalloc.h
@@ -80,8 +80,26 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmdp,
 #define pmd_pgtable(pmd) pmd_page(pmd)
 #endif
 
-extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr);
-extern pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addr);
+static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+   return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+}
+
+static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+   struct page *ptepage;
+
+   gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT;
+
+   ptepage = alloc_pages(flags, 0);
+   if (!ptepage)
+   return NULL;
+   if (!pgtable_page_ctor(ptepage)) {
+   __free_page(ptepage);
+   return NULL;
+   }
+   return ptepage;
+}
 
 static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
 {
diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 6c8a07624773..7900b613e6e5 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -43,27 +43,6 @@ EXPORT_SYMBOL(ioremap_bot);  /* aka VMALLOC_END */
 
 extern char etext[], _stext[], _sinittext[], _einittext[];
 
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
-   return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
-   struct page *ptepage;
-
-   gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT;
-
-   ptepage = alloc_pages(flags, 0);
-   if (!ptepage)
-   return NULL;
-   if (!pgtable_page_ctor(ptepage)) {
-   __free_page(ptepage);
-   return NULL;
-   }
-   return ptepage;
-}
-
 void __iomem *
 ioremap(phys_addr_t addr, unsigned long size)
 {
-- 
2.13.3



[PATCH v4 11/20] powerpc/mm: don't use pte_alloc_one_kernel() before slab is available

2018-09-18 Thread Christophe Leroy
In the same way as PPC64, let's handle pte allocation directly
in map_kernel_page() when slab is not available.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/mm/pgtable_32.c | 34 +-
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index 5877f5aa8f5d..6c8a07624773 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -43,18 +43,9 @@ EXPORT_SYMBOL(ioremap_bot);  /* aka VMALLOC_END */
 
 extern char etext[], _stext[], _sinittext[], _einittext[];
 
-__ref pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-   pte_t *pte;
-
-   if (slab_is_available()) {
-   pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
-   } else {
-   pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
-   if (pte)
-   clear_page(pte);
-   }
-   return pte;
+   return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
 }
 
 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -222,7 +213,21 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
-int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
+static __init pte_t *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va)
+{
+   if (!pmd_present(*pmdp)) {
+   pte_t *ptep = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE));
+
+   if (!ptep)
+   return NULL;
+
+   clear_page(ptep);
+		pmd_populate_kernel(&init_mm, pmdp, ptep);
+   }
+   return pte_offset_kernel(pmdp, va);
+}
+
+__ref int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
 {
pmd_t *pd;
pte_t *pg;
@@ -231,7 +236,10 @@ int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot)
/* Use upper 10 bits of VA to index the first level map */
pd = pmd_offset(pud_offset(pgd_offset_k(va), va), va);
/* Use middle 10 bits of VA to index the second-level map */
-   pg = pte_alloc_kernel(pd, va);
+   if (slab_is_available())
+   pg = pte_alloc_kernel(pd, va);
+   else
+   pg = early_pte_alloc_kernel(pd, va);
if (pg != 0) {
err = 0;
/* The PTE should never be already set nor present in the
-- 
2.13.3



[PATCH v4 10/20] powerpc/8xx: regroup TLB handler routines

2018-09-18 Thread Christophe Leroy
As this is running with MMU off, the CPU only does speculative
fetch for code in the same page.

Following the significant size reduction of the TLB handler routines,
the side handlers can be brought back close to the main part,
i.e. into the same page.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 112 -
 1 file changed, 54 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index d69c6e3d5cc1..3e38af7489a9 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -400,6 +400,23 @@ InstructionTLBMiss:
rfi
 #endif
 
+#ifndef CONFIG_PIN_TLB_TEXT
+ITLBMissLinear:
+   mtcrr11
+   /* Set 8M byte page and mark it valid */
+   li  r11, MI_PS8MEG | MI_SVALID
+   mtspr   SPRN_MI_TWC, r11
+	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
+   ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
+ _PAGE_PRESENT
+   mtspr   SPRN_MI_RPN, r10/* Update TLB entry */
+
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
+   mfspr   r11, SPRN_SPRG_SCRATCH1
+   rfi
+   patch_site  0b, patch__itlbmiss_exit_2
+#endif
+
. = 0x1200
 DataStoreTLBMiss:
mtspr   SPRN_SPRG_SCRATCH0, r10
@@ -485,6 +502,43 @@ DataStoreTLBMiss:
rfi
 #endif
 
+DTLBMissIMMR:
+   mtcrr11
+   /* Set 512k byte guarded page and mark it valid */
+   li  r10, MD_PS512K | MD_GUARDED | MD_SVALID
+   mtspr   SPRN_MD_TWC, r10
+   mfspr   r10, SPRN_IMMR  /* Get current IMMR */
+	rlwinm	r10, r10, 0, 0xfff80000	/* Get 512 kbytes boundary */
+   ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
+ _PAGE_PRESENT | _PAGE_NO_CACHE
+   mtspr   SPRN_MD_RPN, r10/* Update TLB entry */
+
+   li  r11, RPN_PATTERN
+   mtspr   SPRN_DAR, r11   /* Tag DAR */
+
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
+   mfspr   r11, SPRN_SPRG_SCRATCH1
+   rfi
+   patch_site  0b, patch__dtlbmiss_exit_2
+
+DTLBMissLinear:
+   mtcrr11
+   /* Set 8M byte page and mark it valid */
+   li  r11, MD_PS8MEG | MD_SVALID
+   mtspr   SPRN_MD_TWC, r11
+	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
+   ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
+ _PAGE_PRESENT
+   mtspr   SPRN_MD_RPN, r10/* Update TLB entry */
+
+   li  r11, RPN_PATTERN
+   mtspr   SPRN_DAR, r11   /* Tag DAR */
+
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
+   mfspr   r11, SPRN_SPRG_SCRATCH1
+   rfi
+   patch_site  0b, patch__dtlbmiss_exit_3
+
 /* This is an instruction TLB error on the MPC8xx.  This could be due
  * to many reasons, such as executing guarded memory or illegal instruction
  * addresses.  There is nothing to do but handle a big time error fault.
@@ -584,64 +638,6 @@ InstructionBreakpoint:
 
. = 0x2000
 
-/*
- * Bottom part of DataStoreTLBMiss handlers for IMMR area and linear RAM.
- * not enough space in the DataStoreTLBMiss area.
- */
-DTLBMissIMMR:
-   mtcrr11
-   /* Set 512k byte guarded page and mark it valid */
-   li  r10, MD_PS512K | MD_GUARDED | MD_SVALID
-   mtspr   SPRN_MD_TWC, r10
-   mfspr   r10, SPRN_IMMR  /* Get current IMMR */
-	rlwinm	r10, r10, 0, 0xfff80000	/* Get 512 kbytes boundary */
-   ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
- _PAGE_PRESENT | _PAGE_NO_CACHE
-   mtspr   SPRN_MD_RPN, r10/* Update TLB entry */
-
-   li  r11, RPN_PATTERN
-   mtspr   SPRN_DAR, r11   /* Tag DAR */
-
-0: mfspr   r10, SPRN_SPRG_SCRATCH0
-   mfspr   r11, SPRN_SPRG_SCRATCH1
-   rfi
-   patch_site  0b, patch__dtlbmiss_exit_2
-
-DTLBMissLinear:
-   mtcrr11
-   /* Set 8M byte page and mark it valid */
-   li  r11, MD_PS8MEG | MD_SVALID
-   mtspr   SPRN_MD_TWC, r11
-	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
-   ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
- _PAGE_PRESENT
-   mtspr   SPRN_MD_RPN, r10/* Update TLB entry */
-
-   li  r11, RPN_PATTERN
-   mtspr   SPRN_DAR, r11   /* Tag DAR */
-
-0: mfspr   r10, SPRN_SPRG_SCRATCH0
-   mfspr   r11, SPRN_SPRG_SCRATCH1
-   rfi
-   patch_site  0b, patch__dtlbmiss_exit_3
-
-#ifndef CONFIG_PIN_TLB_TEXT
-ITLBMissLinear:
-   mtcrr11
-   /* Set 8M byte page and mark it valid */
-   li  r11, MI_PS8MEG | MI_SVALID
-   mtspr   SPRN_MI_TWC, r11
-	rlwinm	r10, r10, 20, 0x0f800000	/* 8xx supports max 256Mb RAM */
-   ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SH | _PAGE_DIRTY | \
-   

[PATCH v4 09/20] powerpc/8xx: don't use r12/SPRN_SPRG_SCRATCH2 in TLB Miss handlers

2018-09-18 Thread Christophe Leroy
This patch reworks the TLB Miss handlers so that they no longer use
the r12 register, hence avoiding having to save it into SPRN_SPRG_SCRATCH2.

In the DAR Fixup code we can now use SPRN_M_TW, freeing
SPRN_SPRG_SCRATCH2.

Then SPRN_SPRG_SCRATCH2 may be used for something else in the future.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 110 ++---
 1 file changed, 49 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 50e97027b507..d69c6e3d5cc1 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -302,91 +302,88 @@ SystemCall:
  */
 
 #ifdef CONFIG_8xx_CPU15
-#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr) \
-   additmp, addr, PAGE_SIZE;   \
-   tlbie   tmp;\
-   additmp, addr, -PAGE_SIZE;  \
-   tlbie   tmp
+#define INVALIDATE_ADJACENT_PAGES_CPU15(addr)  \
+   addiaddr, addr, PAGE_SIZE;  \
+   tlbie   addr;   \
+   addiaddr, addr, -(PAGE_SIZE << 1);  \
+   tlbie   addr;   \
+   addiaddr, addr, PAGE_SIZE
 #else
-#define INVALIDATE_ADJACENT_PAGES_CPU15(tmp, addr)
+#define INVALIDATE_ADJACENT_PAGES_CPU15(addr)
 #endif
 
 InstructionTLBMiss:
mtspr   SPRN_SPRG_SCRATCH0, r10
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP)
mtspr   SPRN_SPRG_SCRATCH1, r11
-#ifdef ITLB_MISS_KERNEL
-   mtspr   SPRN_SPRG_SCRATCH2, r12
 #endif
 
/* If we are faulting a kernel address, we have to use the
 * kernel page tables.
 */
mfspr   r10, SPRN_SRR0  /* Get effective address of fault */
-   INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
+   INVALIDATE_ADJACENT_PAGES_CPU15(r10)
mtspr   SPRN_MD_EPN, r10
/* Only modules will cause ITLB Misses as we always
 * pin the first 8MB of kernel memory */
 #ifdef ITLB_MISS_KERNEL
-   mfcrr12
+   mfcrr11
 #if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
-	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
+	cmpi	cr0, r10, 0	/* Address >= 0x80000000 */
 #else
-   rlwinm  r11, r10, 16, 0xfff8
-   cmpli   cr0, r11, PAGE_OFFSET@h
+   rlwinm  r10, r10, 16, 0xfff8
+   cmpli   cr0, r10, PAGE_OFFSET@h
 #ifndef CONFIG_PIN_TLB_TEXT
/* It is assumed that kernel code fits into the first 8M page */
-0:	cmpli	cr7, r11, (PAGE_OFFSET + 0x0800000)@h
+0:	cmpli	cr7, r10, (PAGE_OFFSET + 0x0800000)@h
patch_site  0b, patch__itlbmiss_linmem_top
 #endif
 #endif
 #endif
-   mfspr   r11, SPRN_M_TWB /* Get level 1 table */
+   mfspr   r10, SPRN_M_TWB /* Get level 1 table */
 #ifdef ITLB_MISS_KERNEL
 #if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
-   beq+3f
+   bge+3f
 #else
blt+3f
 #endif
 #ifndef CONFIG_PIN_TLB_TEXT
blt cr7, ITLBMissLinear
 #endif
-   rlwinm  r11, r11, 0, 20, 31
-   orisr11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha
+   rlwinm  r10, r10, 0, 20, 31
+   orisr10, r10, (swapper_pg_dir - PAGE_OFFSET)@ha
 3:
 #endif
-	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
+	lwz	r10, (swapper_pg_dir-PAGE_OFFSET)@l(r10)	/* Get level 1 entry */
+   mtspr   SPRN_MI_TWC, r10/* Set segment attributes */
 
-   mtspr   SPRN_MD_TWC, r11
+   mtspr   SPRN_MD_TWC, r10
mfspr   r10, SPRN_MD_TWC
lwz r10, 0(r10) /* Get the pte */
 
 #ifdef ITLB_MISS_KERNEL
-   mtcrr12
+   mtcrr11
 #endif
-   /* Load the MI_TWC with the attributes for this "segment." */
-   mtspr   SPRN_MI_TWC, r11/* Set segment attributes */
-
 #ifdef CONFIG_SWAP
rlwinm  r11, r10, 32-5, _PAGE_PRESENT
and r11, r11, r10
rlwimi  r10, r11, 0, _PAGE_PRESENT
 #endif
-   li  r11, RPN_PATTERN | 0x200
/* The Linux PTE won't go exactly into the MMU TLB.
 * Software indicator bits 20 and 23 must be clear.
 * Software indicator bits 22, 24, 25, 26, and 27 must be
 * set.  All other Linux PTE bits control the behavior
 * of the MMU.
 */
-   rlwimi  r11, r10, 4, 0x0400 /* Copy _PAGE_EXEC into bit 21 */
-   rlwimi  r10, r11, 0, 0x0ff0 /* Set 22, 24-27, clear 20,23 */
+   rlwimi  r10, r10, 0, 0x0f00 /* Clear bits 20-23 */
+   rlwimi  r10, r10, 4, 0x0400 /* Copy _PAGE_EXEC into bit 21 */
+   ori r10, r10, RPN_PATTERN | 0x200 /* Set 22 and 24-27 */
mtspr   SPRN_MI_RPN, r10/* Update TLB entry */
 
/* Restore registers */
 0: mfspr   r10, SPRN_SPRG_SCRATCH0
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_SWAP)
mfspr   r11, SPRN_SPRG_SCRATCH1
-#ifdef ITLB_MISS_KERNEL
-   mfspr   r12, SPRN_SPRG_SCRATCH2
 #endif
rfi
patch_site  0b, 

[PATCH v4 08/20] powerpc/mm: Enable 512k hugepage support with HW assistance on the 8xx

2018-09-18 Thread Christophe Leroy
To use 512k pages with hardware assistance, the PTEs have to be spread
every 128 bytes in the L2 table.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hugetlb.h |  4 +++-
 arch/powerpc/mm/hugetlbpage.c  | 13 +
 arch/powerpc/mm/tlb_nohash.c   |  3 +++
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index e13843556414..b22f164216ad 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -75,7 +75,9 @@ static inline pte_t *hugepte_offset(hugepd_t hpd, unsigned 
long addr,
unsigned long idx = 0;
 
pte_t *dir = hugepd_page(hpd);
-#ifndef CONFIG_PPC_FSL_BOOK3E
+#ifdef CONFIG_PPC_8xx
+   idx = (addr & ((1UL << pdshift) - 1)) >> PAGE_SHIFT;
+#elif !defined(CONFIG_PPC_FSL_BOOK3E)
idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(hpd);
 #endif
 
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 16846649499b..527ea2451cc2 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -66,7 +66,11 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
cachep = PGT_CACHE(PTE_T_ORDER);
num_hugepd = 1 << (pshift - pdshift);
} else {
+#ifdef CONFIG_PPC_8xx
+   cachep = PGT_CACHE(PTE_SHIFT);
+#else
cachep = PGT_CACHE(pdshift - pshift);
+#endif
num_hugepd = 1;
}
 
@@ -330,8 +334,13 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
if (shift >= pdshift)
hugepd_free(tlb, hugepte);
else
+#ifdef CONFIG_PPC_8xx
+   pgtable_free_tlb(tlb, hugepte,
+get_hugepd_cache_index(PTE_SHIFT));
+#else
pgtable_free_tlb(tlb, hugepte,
 get_hugepd_cache_index(pdshift - shift));
+#endif
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -699,7 +708,11 @@ static int __init hugetlbpage_init(void)
 * use pgt cache for hugepd.
 */
if (pdshift > shift)
+#ifdef CONFIG_PPC_8xx
+   pgtable_cache_add(PTE_SHIFT);
+#else
pgtable_cache_add(pdshift - shift);
+#endif
 #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx)
else
pgtable_cache_add(PTE_T_ORDER);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 49441963d285..15fe5f0c8665 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -97,6 +97,9 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
.shift  = 14,
},
 #endif
+   [MMU_PAGE_512K] = {
+   .shift  = 19,
+   },
[MMU_PAGE_8M] = {
.shift  = 23,
},
-- 
2.13.3



[PATCH v4 07/20] powerpc/mm: Use hardware assistance in TLB handlers on the 8xx

2018-09-18 Thread Christophe Leroy
Today, on the 8xx the TLB handlers do a SW tablewalk, doing all
the calculation in ASM in order to match the Linux page
table structure.

The 8xx offers hardware assistance which allows a significant size
reduction of the TLB handlers, and hence also reduces the time spent
in the handlers.

However, using this HW assistance implies some constraints on the
page table structure:
- Regardless of the main page size used (4k or 16k), the
level 1 table (PGD) contains 1024 entries and each PGD entry covers
a 4Mbytes area which is managed by a level 2 table (PTE) containing
also 1024 entries each describing a 4k page.
- 16k pages require 4 identical entries in the L2 table
- 512k pages PTE have to be spread every 128 bytes in the L2 table
- 8M pages PTE are at the address pointed by the L1 entry and each
8M page requires 2 identical entries in the PGD.

This patch modifies the TLB handlers to use HW assistance for 4K PAGES.

Before this patch, the mean time spent in the TLB miss handlers was:
- ITLB miss: 80 ticks
- DTLB miss: 62 ticks
After this patch, the mean time spent in the TLB miss handlers is:
- ITLB miss: 72 ticks
- DTLB miss: 54 ticks
So the improvement is 10% for ITLB misses and 13% for DTLB misses.
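
For reference, here is a sketch of the index arithmetic that the
removed SW tablewalk performed by hand for the 4k geometry described
above (with HW assistance, M_TWB/MD_TWC derive these indices in
hardware; the ex_* helpers are illustrative, not kernel code):

/* 32-bit EA split for the two-level walk with 4k pages:
 * top 10 bits -> PGD index (1024 entries of 4M each),
 * next 10 bits -> PTE index (1024 entries of 4k each).
 */
static inline unsigned long ex_l1_index(unsigned long ea)
{
	return (ea >> 22) & 0x3ff;
}

static inline unsigned long ex_l2_index(unsigned long ea)
{
	return (ea >> 12) & 0x3ff;
}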

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 97 +-
 arch/powerpc/mm/8xx_mmu.c  |  4 +-
 2 files changed, 32 insertions(+), 69 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 9b31721b522c..50e97027b507 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -292,7 +292,7 @@ SystemCall:
. = 0x1100
 /*
  * For the MPC8xx, this is a software tablewalk to load the instruction
- * TLB.  The task switch loads the M_TW register with the pointer to the first
+ * TLB.  The task switch loads the M_TWB register with the pointer to the first
  * level table.
  * If we discover there is no second level table (value is zero) or if there
  * is an invalid pte, we load that into the TLB, which causes another fault
@@ -314,7 +314,7 @@ SystemCall:
 InstructionTLBMiss:
mtspr   SPRN_SPRG_SCRATCH0, r10
mtspr   SPRN_SPRG_SCRATCH1, r11
-#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+#ifdef ITLB_MISS_KERNEL
mtspr   SPRN_SPRG_SCRATCH2, r12
 #endif
 
@@ -323,12 +323,11 @@ InstructionTLBMiss:
 */
mfspr   r10, SPRN_SRR0  /* Get effective address of fault */
INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
+   mtspr   SPRN_MD_EPN, r10
/* Only modules will cause ITLB Misses as we always
 * pin the first 8MB of kernel memory */
-#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
-   mfcrr12
-#endif
 #ifdef ITLB_MISS_KERNEL
+   mfcrr12
 #if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
	andis.	r11, r10, 0x8000	/* Address >= 0x80000000 */
 #else
@@ -341,7 +340,7 @@ InstructionTLBMiss:
 #endif
 #endif
 #endif
-   mfspr   r11, SPRN_M_TW  /* Get level 1 table */
+   mfspr   r11, SPRN_M_TWB /* Get level 1 table */
 #ifdef ITLB_MISS_KERNEL
 #if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
beq+3f
@@ -351,23 +350,17 @@ InstructionTLBMiss:
 #ifndef CONFIG_PIN_TLB_TEXT
blt cr7, ITLBMissLinear
 #endif
-   lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha
+   rlwinm  r11, r11, 0, 20, 31
+   orisr11, r11, (swapper_pg_dir - PAGE_OFFSET)@ha
 3:
 #endif
-   /* Insert level 1 index */
-	rlwimi	r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
 	lwz	r11, (swapper_pg_dir-PAGE_OFFSET)@l(r11)	/* Get the level 1 entry */
 
-   /* Extract level 2 index */
-   rlwinm  r10, r10, 32 - (PAGE_SHIFT - 2), 32 - PAGE_SHIFT, 29
-#ifdef CONFIG_HUGETLB_PAGE
-   mtcrr11
-   bt- 28, 10f /* bit 28 = Large page (8M) */
-#endif
-   rlwimi  r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */
+   mtspr   SPRN_MD_TWC, r11
+   mfspr   r10, SPRN_MD_TWC
lwz r10, 0(r10) /* Get the pte */
-4:
-#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+
+#ifdef ITLB_MISS_KERNEL
mtcrr12
 #endif
/* Load the MI_TWC with the attributes for this "segment." */
@@ -392,7 +385,7 @@ InstructionTLBMiss:
/* Restore registers */
 0: mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
-#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+#ifdef ITLB_MISS_KERNEL
mfspr   r12, SPRN_SPRG_SCRATCH2
 #endif
rfi
@@ -405,20 +398,12 @@ InstructionTLBMiss:
stw r10, (itlb_miss_counter - PAGE_OFFSET)@l(0)
mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
-#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
+#ifdef ITLB_MISS_KERNEL
mfspr   r12, SPRN_SPRG_SCRATCH2
 #endif
rfi
 #endif
 
-#ifdef CONFIG_HUGETLB_PAGE
-10:

[PATCH v4 06/20] powerpc/8xx: Temporarily disable 16k pages and 512k hugepages

2018-09-18 Thread Christophe Leroy
In preparation for making use of hardware assistance in the TLB
handlers, this patch temporarily disables 16K pages and 512K pages.
The reason is that when using HW assistance in 4K pages mode, the
Linux model fits the HW model for 4K pages and 8M pages.

However, for 16K pages and 512K pages some additional work is needed
to make the Linux model fit the HW model.

Therefore the 4K pages mode will be implemented first, without
support for 512K hugepages. Then 512K hugepages will be brought
back, and 16K pages will be implemented in further steps.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig   |  2 +-
 arch/powerpc/kernel/head_8xx.S | 36 
 arch/powerpc/mm/tlb_nohash.c   |  3 ---
 3 files changed, 1 insertion(+), 40 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a80669209155..33931804c46f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -698,7 +698,7 @@ config PPC_4K_PAGES
 
 config PPC_16K_PAGES
bool "16k page size"
-   depends on 44x || PPC_8xx
+   depends on 44x
 
 config PPC_64K_PAGES
bool "64k page size"
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index c203defe49a4..9b31721b522c 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -363,7 +363,6 @@ InstructionTLBMiss:
 #ifdef CONFIG_HUGETLB_PAGE
	mtcr    r11
bt- 28, 10f /* bit 28 = Large page (8M) */
-   bt- 29, 20f /* bit 29 = Large page (8M or 512k) */
 #endif
rlwimi  r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */
lwz r10, 0(r10) /* Get the pte */
@@ -414,23 +413,8 @@ InstructionTLBMiss:
 
 #ifdef CONFIG_HUGETLB_PAGE
 10:/* 8M pages */
-#ifdef CONFIG_PPC_16K_PAGES
-   /* Extract level 2 index */
-   rlwinm  r10, r10, 32 - (PAGE_SHIFT_8M - PAGE_SHIFT), 32 + PAGE_SHIFT_8M 
- (PAGE_SHIFT << 1), 29
-   /* Add level 2 base */
-   rlwimi  r10, r11, 0, 0, 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1) - 1
-#else
/* Level 2 base */
rlwinm  r10, r11, 0, ~HUGEPD_SHIFT_MASK
-#endif
-   lwz r10, 0(r10) /* Get the pte */
-   b   4b
-
-20:/* 512k pages */
-   /* Extract level 2 index */
-   rlwinm  r10, r10, 32 - (PAGE_SHIFT_512K - PAGE_SHIFT), 32 + 
PAGE_SHIFT_512K - (PAGE_SHIFT << 1), 29
-   /* Add level 2 base */
-   rlwimi  r10, r11, 0, 0, 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1) - 1
lwz r10, 0(r10) /* Get the pte */
b   4b
 #endif
@@ -475,7 +459,6 @@ DataStoreTLBMiss:
 #ifdef CONFIG_HUGETLB_PAGE
	mtcr    r11
bt- 28, 10f /* bit 28 = Large page (8M) */
-   bt- 29, 20f /* bit 29 = Large page (8M or 512k) */
 #endif
rlwimi  r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */
lwz r10, 0(r10) /* Get the pte */
@@ -537,22 +520,8 @@ DataStoreTLBMiss:
 #ifdef CONFIG_HUGETLB_PAGE
 10:/* 8M pages */
/* Extract level 2 index */
-#ifdef CONFIG_PPC_16K_PAGES
-   rlwinm  r10, r10, 32 - (PAGE_SHIFT_8M - PAGE_SHIFT), 32 + PAGE_SHIFT_8M 
- (PAGE_SHIFT << 1), 29
-   /* Add level 2 base */
-   rlwimi  r10, r11, 0, 0, 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1) - 1
-#else
/* Level 2 base */
rlwinm  r10, r11, 0, ~HUGEPD_SHIFT_MASK
-#endif
-   lwz r10, 0(r10) /* Get the pte */
-   b   4b
-
-20:/* 512k pages */
-   /* Extract level 2 index */
-   rlwinm  r10, r10, 32 - (PAGE_SHIFT_512K - PAGE_SHIFT), 32 + 
PAGE_SHIFT_512K - (PAGE_SHIFT << 1), 29
-   /* Add level 2 base */
-   rlwimi  r10, r11, 0, 0, 32 + PAGE_SHIFT_512K - (PAGE_SHIFT << 1) - 1
lwz r10, 0(r10) /* Get the pte */
b   4b
 #endif
@@ -773,12 +742,7 @@ FixupDAR:/* Entry point for dcbx workaround. */
 
/* concat physical page address(r11) and page offset(r10) */
 200:
-#ifdef CONFIG_PPC_16K_PAGES
-   rlwinm  r11, r11, 0, 0, 32 + PAGE_SHIFT_8M - (PAGE_SHIFT << 1) - 1
-   rlwimi  r11, r10, 32 - (PAGE_SHIFT_8M - 2), 32 + PAGE_SHIFT_8M - 
(PAGE_SHIFT << 1), 29
-#else
rlwinm  r11, r10, 0, ~HUGEPD_SHIFT_MASK
-#endif
lwz r11, 0(r11) /* Get the pte */
/* concat physical page address(r11) and page offset(r10) */
rlwimi  r11, r10, 0, 32 - PAGE_SHIFT_8M, 31
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index 15fe5f0c8665..49441963d285 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -97,9 +97,6 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
.shift  = 14,
},
 #endif
-   [MMU_PAGE_512K] = {
-   .shift  = 19,
-   },
[MMU_PAGE_8M] = {
.shift  = 23,
},
-- 
2.13.3



[PATCH v4 05/20] powerpc/8xx: Move SW perf counters in first 32kb of memory

2018-09-18 Thread Christophe Leroy
In order to simplify the time-critical exception handling of the 8xx
specific SW perf counters, this patch moves the counters to
the beginning of memory. This is possible because .text is readable
and the counters are never modified outside of the handlers.

By doing this, we avoid having to set up a second register with
the upper part of the address of the counters.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/kernel/head_8xx.S | 58 --
 1 file changed, 28 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 3b67b9533c82..c203defe49a4 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -106,6 +106,23 @@ turn_on_mmu:
mtspr   SPRN_SRR0,r0
rfi /* enables MMU */
 
+
+#ifdef CONFIG_PERF_EVENTS
+   .align  4
+
+   .globl  itlb_miss_counter
+itlb_miss_counter:
+   .space  4
+
+   .globl  dtlb_miss_counter
+dtlb_miss_counter:
+   .space  4
+
+   .globl  instruction_counter
+instruction_counter:
+   .space  4
+#endif
+
 /*
  * Exception entry code.  This code runs with address translation
  * turned off, i.e. using physical addresses.
@@ -384,17 +401,16 @@ InstructionTLBMiss:
 
 #ifdef CONFIG_PERF_EVENTS
patch_site  0f, patch__itlbmiss_perf
-0: lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha
-   lwz r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
-   addi    r11, r11, 1
-   stw r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
-#endif
+0: lwz r10, (itlb_miss_counter - PAGE_OFFSET)@l(0)
+   addi    r10, r10, 1
+   stw r10, (itlb_miss_counter - PAGE_OFFSET)@l(0)
mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mfspr   r12, SPRN_SPRG_SCRATCH2
 #endif
rfi
+#endif
 
 #ifdef CONFIG_HUGETLB_PAGE
 10:/* 8M pages */
@@ -509,15 +525,14 @@ DataStoreTLBMiss:
 
 #ifdef CONFIG_PERF_EVENTS
patch_site  0f, patch__dtlbmiss_perf
-0: lis r10, (dtlb_miss_counter - PAGE_OFFSET)@ha
-   lwz r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
-   addi    r11, r11, 1
-   stw r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
-#endif
+0: lwz r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
+   addi    r10, r10, 1
+   stw r10, (dtlb_miss_counter - PAGE_OFFSET)@l(0)
mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
mfspr   r12, SPRN_SPRG_SCRATCH2
rfi
+#endif
 
 #ifdef CONFIG_HUGETLB_PAGE
 10:/* 8M pages */
@@ -625,16 +640,13 @@ DataBreakpoint:
. = 0x1d00
 InstructionBreakpoint:
mtspr   SPRN_SPRG_SCRATCH0, r10
-   mtspr   SPRN_SPRG_SCRATCH1, r11
-   lis r10, (instruction_counter - PAGE_OFFSET)@ha
-   lwz r11, (instruction_counter - PAGE_OFFSET)@l(r10)
-   addi    r11, r11, -1
-   stw r11, (instruction_counter - PAGE_OFFSET)@l(r10)
+   lwz r10, (instruction_counter - PAGE_OFFSET)@l(0)
+   addi    r10, r10, -1
+   stw r10, (instruction_counter - PAGE_OFFSET)@l(0)
	lis r10, 0xffff
ori r10, r10, 0x01
mtspr   SPRN_COUNTA, r10
mfspr   r10, SPRN_SPRG_SCRATCH0
-   mfspr   r11, SPRN_SPRG_SCRATCH1
rfi
 #else
EXCEPTION(0x1d00, Trap_1d, unknown_exception, EXC_XFER_EE)
@@ -1065,17 +1077,3 @@ swapper_pg_dir:
  */
 abatron_pteptrs:
.space  8
-
-#ifdef CONFIG_PERF_EVENTS
-   .globl  itlb_miss_counter
-itlb_miss_counter:
-   .space  4
-
-   .globl  dtlb_miss_counter
-dtlb_miss_counter:
-   .space  4
-
-   .globl  instruction_counter
-instruction_counter:
-   .space  4
-#endif
-- 
2.13.3
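The trick that saves a register, spelled out: with the counters placed in the
first 32kb, (counter - PAGE_OFFSET)@l fits in the signed 16-bit displacement
of a D-form load/store, and using 0 as the base (r0 reads as the literal value
0 in D-form addressing) removes the need for a lis into a second scratch
register. The resulting three-instruction sequence, as used in the patch:

0:	lwz	r10, (itlb_miss_counter - PAGE_OFFSET)@l(0)	/* load counter */
	addi	r10, r10, 1					/* increment */
	stw	r10, (itlb_miss_counter - PAGE_OFFSET)@l(0)	/* store back */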



[PATCH v4 04/20] powerpc/8xx: Use patch_site for perf counters setup

2018-09-18 Thread Christophe Leroy
The 8xx TLB miss routines are patched when (de)activating
perf counters.

This patch uses the new patch_site functionality in order
to improve code readability and avoid a label mess when
dumping the code with 'objdump -d'.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-8xx.h |  4 
 arch/powerpc/kernel/head_8xx.S | 33 +++--
 arch/powerpc/perf/8xx-pmu.c| 27 ---
 3 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index 3a15d6647d47..fa05aa566ece 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -234,6 +234,10 @@ extern s32 patch__itlbmiss_linmem_top;
 extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp;
 extern s32 patch__fixupdar_linmem_top;
 
+extern s32 patch__itlbmiss_exit_1, patch__itlbmiss_exit_2;
+extern s32 patch__dtlbmiss_exit_1, patch__dtlbmiss_exit_2, 
patch__dtlbmiss_exit_3;
+extern s32 patch__itlbmiss_perf, patch__dtlbmiss_perf;
+
 #endif /* !__ASSEMBLY__ */
 
 #if defined(CONFIG_PPC_4K_PAGES)
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 0425571a533d..3b67b9533c82 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -374,16 +374,17 @@ InstructionTLBMiss:
mtspr   SPRN_MI_RPN, r10/* Update TLB entry */
 
/* Restore registers */
-_ENTRY(itlb_miss_exit_1)
-   mfspr   r10, SPRN_SPRG_SCRATCH0
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mfspr   r12, SPRN_SPRG_SCRATCH2
 #endif
rfi
+   patch_site  0b, patch__itlbmiss_exit_1
+
 #ifdef CONFIG_PERF_EVENTS
-_ENTRY(itlb_miss_perf)
-   lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha
+   patch_site  0f, patch__itlbmiss_perf
+0: lis r10, (itlb_miss_counter - PAGE_OFFSET)@ha
lwz r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
	addi    r11, r11, 1
stw r11, (itlb_miss_counter - PAGE_OFFSET)@l(r10)
@@ -499,14 +500,16 @@ DataStoreTLBMiss:
 
/* Restore registers */
mtspr   SPRN_DAR, r11   /* Tag DAR */
-_ENTRY(dtlb_miss_exit_1)
-   mfspr   r10, SPRN_SPRG_SCRATCH0
+
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
mfspr   r12, SPRN_SPRG_SCRATCH2
rfi
+   patch_site  0b, patch__dtlbmiss_exit_1
+
 #ifdef CONFIG_PERF_EVENTS
-_ENTRY(dtlb_miss_perf)
-   lis r10, (dtlb_miss_counter - PAGE_OFFSET)@ha
+   patch_site  0f, patch__dtlbmiss_perf
+0: lis r10, (dtlb_miss_counter - PAGE_OFFSET)@ha
lwz r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
	addi    r11, r11, 1
stw r11, (dtlb_miss_counter - PAGE_OFFSET)@l(r10)
@@ -658,11 +661,12 @@ DTLBMissIMMR:
 
li  r11, RPN_PATTERN
mtspr   SPRN_DAR, r11   /* Tag DAR */
-_ENTRY(dtlb_miss_exit_2)
-   mfspr   r10, SPRN_SPRG_SCRATCH0
+
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
mfspr   r12, SPRN_SPRG_SCRATCH2
rfi
+   patch_site  0b, patch__dtlbmiss_exit_2
 
 DTLBMissLinear:
	mtcr    r12
@@ -676,11 +680,12 @@ DTLBMissLinear:
 
li  r11, RPN_PATTERN
mtspr   SPRN_DAR, r11   /* Tag DAR */
-_ENTRY(dtlb_miss_exit_3)
-   mfspr   r10, SPRN_SPRG_SCRATCH0
+
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
mfspr   r12, SPRN_SPRG_SCRATCH2
rfi
+   patch_site  0b, patch__dtlbmiss_exit_3
 
 #ifndef CONFIG_PIN_TLB_TEXT
 ITLBMissLinear:
@@ -693,11 +698,11 @@ ITLBMissLinear:
  _PAGE_PRESENT
mtspr   SPRN_MI_RPN, r10/* Update TLB entry */
 
-_ENTRY(itlb_miss_exit_2)
-   mfspr   r10, SPRN_SPRG_SCRATCH0
+0: mfspr   r10, SPRN_SPRG_SCRATCH0
mfspr   r11, SPRN_SPRG_SCRATCH1
mfspr   r12, SPRN_SPRG_SCRATCH2
rfi
+   patch_site  0b, patch__itlbmiss_exit_2
 #endif
 
 /* This is the procedure to calculate the data EA for buggy dcbx,dcbi 
instructions
diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c
index 6c0020d1c561..808f1873de61 100644
--- a/arch/powerpc/perf/8xx-pmu.c
+++ b/arch/powerpc/perf/8xx-pmu.c
@@ -31,9 +31,6 @@
 
 extern unsigned long itlb_miss_counter, dtlb_miss_counter;
 extern atomic_t instruction_counter;
-extern unsigned int itlb_miss_perf, dtlb_miss_perf;
-extern unsigned int itlb_miss_exit_1, itlb_miss_exit_2;
-extern unsigned int dtlb_miss_exit_1, dtlb_miss_exit_2, dtlb_miss_exit_3;
 
 static atomic_t insn_ctr_ref;
 static atomic_t itlb_miss_ref;
@@ -103,22 +100,22 @@ static int mpc8xx_pmu_add(struct perf_event *event, int 
flags)
break;
case PERF_8xx_ID_ITLB_LOAD_MISS:
			if (atomic_inc_return(&itlb_miss_ref) == 1) {
-   

[PATCH v4 03/20] powerpc/8xx: Use patch_site for memory setup patching

2018-09-18 Thread Christophe Leroy
The 8xx TLB miss routines are patched at startup at several places.

This patch uses the new patch_site functionality in order
to improve code readability and avoid a label mess when
dumping the code with 'objdump -d'.

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-8xx.h |  5 +
 arch/powerpc/kernel/head_8xx.S | 19 +++
 arch/powerpc/mm/8xx_mmu.c  | 23 +++
 3 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index 193f53116c7a..3a15d6647d47 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -229,6 +229,11 @@ static inline unsigned int mmu_psize_to_shift(unsigned int 
mmu_psize)
BUG();
 }
 
+/* patch sites */
+extern s32 patch__itlbmiss_linmem_top;
+extern s32 patch__dtlbmiss_linmem_top, patch__dtlbmiss_immr_jmp;
+extern s32 patch__fixupdar_linmem_top;
+
 #endif /* !__ASSEMBLY__ */
 
 #if defined(CONFIG_PPC_4K_PAGES)
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 12c92a483fb1..0425571a533d 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include 
 
#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000
/* By simply checking Address >= 0x80000000, we know if its a kernel address */
@@ -318,8 +319,8 @@ InstructionTLBMiss:
cmpli   cr0, r11, PAGE_OFFSET@h
 #ifndef CONFIG_PIN_TLB_TEXT
/* It is assumed that kernel code fits into the first 8M page */
-_ENTRY(ITLBMiss_cmp)
-   cmpli   cr7, r11, (PAGE_OFFSET + 0x0800000)@h
+0: cmpli   cr7, r11, (PAGE_OFFSET + 0x0800000)@h
+   patch_site  0b, patch__itlbmiss_linmem_top
 #endif
 #endif
 #endif
@@ -436,11 +437,11 @@ DataStoreTLBMiss:
 #ifndef CONFIG_PIN_TLB_IMMR
cmpli   cr0, r11, VIRT_IMMR_BASE@h
 #endif
-_ENTRY(DTLBMiss_cmp)
-   cmpli   cr7, r11, (PAGE_OFFSET + 0x1800000)@h
+0: cmpli   cr7, r11, (PAGE_OFFSET + 0x1800000)@h
+   patch_site  0b, patch__dtlbmiss_linmem_top
 #ifndef CONFIG_PIN_TLB_IMMR
-_ENTRY(DTLBMiss_jmp)
-   beq-    DTLBMissIMMR
+0: beq-    DTLBMissIMMR
+   patch_site  0b, patch__dtlbmiss_immr_jmp
 #endif
blt cr7, DTLBMissLinear
lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha
@@ -714,8 +715,10 @@ FixupDAR:/* Entry point for dcbx workaround. */
mfspr   r11, SPRN_M_TW  /* Get level 1 table */
	blt+    3f
rlwinm  r11, r10, 16, 0xfff8
-_ENTRY(FixupDAR_cmp)
-   cmpli   cr7, r11, (PAGE_OFFSET + 0x1800000)@h
+
+0: cmpli   cr7, r11, (PAGE_OFFSET + 0x1800000)@h
+   patch_site  0b, patch__fixupdar_linmem_top
+
/* create physical page address from effective address */
tophys(r11, r10)
blt-cr7, 201f
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index fee599cf3bc3..d39f3af03221 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -97,22 +97,13 @@ static void __init mmu_mapin_immr(void)
map_kernel_page(v + offset, p + offset, PAGE_KERNEL_NCG);
 }
 
-/* Address of instructions to patch */
-#ifndef CONFIG_PIN_TLB_IMMR
-extern unsigned int DTLBMiss_jmp;
-#endif
-extern unsigned int DTLBMiss_cmp, FixupDAR_cmp;
-#ifndef CONFIG_PIN_TLB_TEXT
-extern unsigned int ITLBMiss_cmp;
-#endif
-
-static void __init mmu_patch_cmp_limit(unsigned int *addr, unsigned long 
mapped)
+static void __init mmu_patch_cmp_limit(s32 *site, unsigned long mapped)
 {
-   unsigned int instr = *addr;
+   unsigned int instr = *(unsigned int *)site_addr(site);
 
	instr &= 0xffff0000;
instr |= (unsigned long)__va(mapped) >> 16;
-   patch_instruction(addr, instr);
+   patch_instruction_site(site, instr);
 }
 
 unsigned long __init mmu_mapin_ram(unsigned long top)
@@ -123,17 +114,17 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
mapped = 0;
mmu_mapin_immr();
 #ifndef CONFIG_PIN_TLB_IMMR
-   patch_instruction(&DTLBMiss_jmp, PPC_INST_NOP);
+   patch_instruction_site(&patch__dtlbmiss_immr_jmp, PPC_INST_NOP);
 #endif
 #ifndef CONFIG_PIN_TLB_TEXT
-   mmu_patch_cmp_limit(&ITLBMiss_cmp, 0);
+   mmu_patch_cmp_limit(&patch__itlbmiss_linmem_top, 0);
 #endif
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
}
 
-   mmu_patch_cmp_limit(&DTLBMiss_cmp, mapped);
-   mmu_patch_cmp_limit(&FixupDAR_cmp, mapped);
+   mmu_patch_cmp_limit(&patch__dtlbmiss_linmem_top, mapped);
+   mmu_patch_cmp_limit(&patch__fixupdar_linmem_top, mapped);
 
/* If the size of RAM is not an exact power of two, we may not
 * have covered RAM in its entirety with 8 MiB
-- 
2.13.3
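A worked example of the immediate patching done by mmu_patch_cmp_limit()
above, assuming the usual 32-bit PAGE_OFFSET of 0xc0000000 (an assumption for
the example, not stated in the patch):

	/* sketch of the arithmetic only */
	unsigned long mapped = 24 << 20;			/* 24 MiB of RAM mapped */
	unsigned int  imm = (0xc0000000UL + mapped) >> 16;	/* __va(mapped) >> 16 */
	/* imm == 0xc180: the cmpli at the patch site now compares the
	 * upper half of the faulting address against the new top of the
	 * linear mapping */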



[PATCH v4 01/20] Revert "powerpc/8xx: Use L1 entry APG to handle _PAGE_ACCESSED for CONFIG_SWAP"

2018-09-18 Thread Christophe Leroy
This reverts commit 4f94b2c7462d9720b2afa7e8e8d4c19446bb31ce.

That commit was buggy, as it used rlwinm instead of rlwimi.
Instead of fixing that bug, we revert the commit in order to
reduce the dependency between L1 entries and L2 entries.

Fixes: 4f94b2c7462d9 ("powerpc/8xx: Use L1 entry APG to handle _PAGE_ACCESSED 
for CONFIG_SWAP")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/mmu-8xx.h | 34 +---
 arch/powerpc/kernel/head_8xx.S | 45 +++---
 arch/powerpc/mm/8xx_mmu.c  |  2 +-
 3 files changed, 34 insertions(+), 47 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu-8xx.h 
b/arch/powerpc/include/asm/mmu-8xx.h
index 4f547752ae79..193f53116c7a 100644
--- a/arch/powerpc/include/asm/mmu-8xx.h
+++ b/arch/powerpc/include/asm/mmu-8xx.h
@@ -34,20 +34,12 @@
  * respectively NA for All or X for Supervisor and no access for User.
  * Then we use the APG to say whether accesses are according to Page rules or
  * "all Supervisor" rules (Access to all)
- * We also use the 2nd APG bit for _PAGE_ACCESSED when having SWAP:
- * When that bit is not set access is done iaw "all user"
- * which means no access iaw page rules.
- * Therefore, we define 4 APG groups. lsb is _PMD_USER, 2nd is _PAGE_ACCESSED
- * 0x => No access => 11 (all accesses performed as user iaw page definition)
- * 10 => No user => 01 (all accesses performed according to page definition)
- * 11 => User => 00 (all accesses performed as supervisor iaw page definition)
+ * Therefore, we define 2 APG groups. lsb is _PMD_USER
+ * 0 => No user => 01 (all accesses performed according to page definition)
+ * 1 => User => 00 (all accesses performed as supervisor iaw page definition)
  * We define all 16 groups so that all other bits of APG can take any value
  */
-#ifdef CONFIG_SWAP
-#define MI_APG_INIT	0xf4f4f4f4
-#else
 #define MI_APG_INIT	0x44444444
-#endif
 
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MI_RPN is written, bits in
@@ -115,20 +107,12 @@
  * Supervisor and no access for user and NA for ALL.
  * Then we use the APG to say whether accesses are according to Page rules or
  * "all Supervisor" rules (Access to all)
- * We also use the 2nd APG bit for _PAGE_ACCESSED when having SWAP:
- * When that bit is not set access is done iaw "all user"
- * which means no access iaw page rules.
- * Therefore, we define 4 APG groups. lsb is _PMD_USER, 2nd is _PAGE_ACCESSED
- * 0x => No access => 11 (all accesses performed as user iaw page definition)
- * 10 => No user => 01 (all accesses performed according to page definition)
- * 11 => User => 00 (all accesses performed as supervisor iaw page definition)
+ * Therefore, we define 2 APG groups. lsb is _PMD_USER
+ * 0 => No user => 01 (all accesses performed according to page definition)
+ * 1 => User => 00 (all accesses performed as supervisor iaw page definition)
  * We define all 16 groups so that all other bits of APG can take any value
  */
-#ifdef CONFIG_SWAP
-#define MD_APG_INIT	0xf4f4f4f4
-#else
 #define MD_APG_INIT	0x44444444
-#endif
 
 /* The effective page number register.  When read, contains the information
  * about the last instruction TLB miss.  When MD_RPN is written, bits in
@@ -180,12 +164,6 @@
  */
 #define SPRN_M_TW  799
 
-/* APGs */
-#define M_APG0		0x00000000
-#define M_APG1		0x00000020
-#define M_APG2		0x00000040
-#define M_APG3		0x00000060
-
 #ifdef CONFIG_PPC_MM_SLICES
 #include 
 #define SLICE_ARRAY_SIZE   (1 << (32 - SLICE_LOW_SHIFT - 1))
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 134a573a9f2d..12c92a483fb1 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -353,13 +353,14 @@ _ENTRY(ITLBMiss_cmp)
 #if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
	mtcr    r12
 #endif
-
-#ifdef CONFIG_SWAP
-   rlwinm  r11, r10, 31, _PAGE_ACCESSED >> 1
-#endif
/* Load the MI_TWC with the attributes for this "segment." */
mtspr   SPRN_MI_TWC, r11/* Set segment attributes */
 
+#ifdef CONFIG_SWAP
+   rlwinm  r11, r10, 32-5, _PAGE_PRESENT
+   and r11, r11, r10
+   rlwimi  r10, r11, 0, _PAGE_PRESENT
+#endif
li  r11, RPN_PATTERN | 0x200
/* The Linux PTE won't go exactly into the MMU TLB.
 * Software indicator bits 20 and 23 must be clear.
@@ -470,14 +471,22 @@ _ENTRY(DTLBMiss_jmp)
 * above.
 */
rlwimi  r11, r10, 0, _PAGE_GUARDED
-#ifdef CONFIG_SWAP
-   /* _PAGE_ACCESSED has to be set. We use second APG bit for that, 0
-* on that bit will represent a Non Access group
-*/
-   rlwinm  r11, r10, 31, _PAGE_ACCESSED >> 1
-#endif
mtspr   SPRN_MD_TWC, r11
 
+   /* Both _PAGE_ACCESSED and _PAGE_PRESENT has to be set.
+* We also need to know if the insn is a load/store, so:

[PATCH v4 02/20] powerpc/code-patching: add a helper to get the address of a patch_site

2018-09-18 Thread Christophe Leroy
This patch adds a helper to get the address of a patch_site

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/code-patching.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index 31733a95bbd0..bca48cc1b6ad 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -36,6 +36,11 @@ int raw_patch_instruction(unsigned int *addr, unsigned int 
instr);
 int patch_instruction_site(s32 *addr, unsigned int instr);
 int patch_branch_site(s32 *site, unsigned long target, int flags);
 
+static inline unsigned long site_addr(s32 *site)
+{
+   return (unsigned long)site + *site;
+}
+
 int instr_is_relative_branch(unsigned int instr);
 int instr_is_relative_link_branch(unsigned int instr);
 int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr);
-- 
2.13.3



[PATCH v4 00/20] Implement use of HW assistance on TLB table walk on 8xx

2018-09-18 Thread Christophe Leroy
The purpose of this series is to implement hardware assistance for the TLB
table walk on the 8xx.

The first part switches to patch_site instead of patch_instruction,
as it makes the code clearer and avoids pollution with global symbols.

It also optimises access to the perf counters (hence reducing the number of
registers used).

The second part implements HW assistance in the TLB routines.

The last part makes L1 entries and L2 entries independent. For that,
we need to alter the ioremap functions in order to handle the GUARD attribute
at the PGD/PMD level.

Tested successfully on 8xx.

This series applies after the following two series:
- [v2 00/24] ban the use of _PAGE_XXX flags outside platform specific code 
(https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=65376)
- [v2,1/4] powerpc/mm: enable the use of page table cache of order 0 
(https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=60777)

Successful compilation on kisskb (v4)
http://kisskb.ellerman.id.au/kisskb/branch/chleroy/head/cfdf3349e3877df4cbfa9193ad1f4f4e4ada52de/

Successful compilation on the following defconfigs (v3):
ppc64_defconfig
ppc64e_defconfig

Successful compilation on the following defconfigs (v2):
ppc64_defconfig
ppc64e_defconfig
pseries_defconfig
pmac32_defconfig
linkstation_defconfig
corenet32_smp_defconfig
ppc40x_defconfig
storcenter_defconfig
ppc44x_defconfig

Changes in v4:
 - Reordered the series to put at the end the modifications which make
   L1 and L2 entries independent.
 - No modifications to ppc64 ioremap (we still have an opportunity to
   merge them, for a future patch serie)
 - 8xx code modified to use patch_site instead of patch_instruction
   to get a clearer code and avoid object pollution with global symbols
 - Moved perf counters in first 32kb of memory to optimise access
 - Split the big bang to HW assistance into several steps:
   1. Temporarily removes support of 16k pages and 512k hugepages
   2. Change TLB routines to use HW assistance for 4k pages and 8M hugepages
   3. Add back support for 512k hugepages
   4. Add back support for 16k pages (using pte_fragment as page tables are 
still 4k)

Changes in v3:
 - Fixed an issue in the 09/14 when CONFIG_PIN_TLB_TEXT was not enabled
 - Added performance measurement in the 09/14 commit log
 - Rebased on latest 'powerpc/merge' tree, which conflicted with 13/14

Changes in v2:
 - Removed the 3 first patchs which have been applied already
 - Fixed compilation errors reported by Michael
 - Squashed the commonalisation of ioremap functions into a single patch
 - Fixed the use of pte_fragment
 - Added a patch optimising perf counting of TLB misses and instructions

Christophe Leroy (20):
  Revert "powerpc/8xx: Use L1 entry APG to handle _PAGE_ACCESSED for
CONFIG_SWAP"
  powerpc/code-patching: add a helper to get the address of a patch_site
  powerpc/8xx: Use patch_site for memory setup patching
  powerpc/8xx: Use patch_site for perf counters setup
  powerpc/8xx: Move SW perf counters in first 32kb of memory
  powerpc/8xx: Temporarily disable 16k pages and 512k hugepages
  powerpc/mm: Use hardware assistance in TLB handlers on the 8xx
  powerpc/mm: Enable 512k hugepage support with HW assistance on the 8xx
  powerpc/8xx: don't use r12/SPRN_SPRG_SCRATCH2 in TLB Miss handlers
  powerpc/8xx: regroup TLB handler routines
  powerpc/mm: don't use pte_alloc_one_kernel() before slab is available
  powerpc/mm: inline pte_alloc_one() and pte_alloc_one_kernel() in PPC32
  powerpc/book3s32: Remove CONFIG_BOOKE dependent code
  powerpc/mm: Move pte_fragment_alloc() to a common location
  powerpc/mm: Avoid useless lock with single page fragments
  powerpc/mm: Extend pte_fragment functionality to nohash/32
  powerpc/8xx: Remove PTE_ATOMIC_UPDATES
  powerpc/mm: reintroduce 16K pages with HW assistance on 8xx
  powerpc/nohash32: allow setting GUARDED attribute in the PMD directly
  powerpc/8xx: set GUARDED attribute in the PMD directly

 arch/powerpc/include/asm/book3s/32/pgalloc.h |  28 +-
 arch/powerpc/include/asm/book3s/32/pgtable.h |  16 +-
 arch/powerpc/include/asm/code-patching.h |   5 +
 arch/powerpc/include/asm/hugetlb.h   |   4 +-
 arch/powerpc/include/asm/mmu-40x.h   |   1 +
 arch/powerpc/include/asm/mmu-44x.h   |   1 +
 arch/powerpc/include/asm/mmu-8xx.h   |  44 +--
 arch/powerpc/include/asm/mmu-book3e.h|   1 +
 arch/powerpc/include/asm/mmu_context.h   |   2 +-
 arch/powerpc/include/asm/nohash/32/pgalloc.h |  43 ++-
 arch/powerpc/include/asm/nohash/32/pgtable.h |  45 ++-
 arch/powerpc/include/asm/nohash/32/pte-8xx.h |   6 +-
 arch/powerpc/include/asm/nohash/pgtable.h|   4 +
 arch/powerpc/include/asm/page.h  |   6 +-
 arch/powerpc/include/asm/pgtable-types.h |   4 +
 arch/powerpc/include/asm/pgtable.h   |   8 +
 arch/powerpc/kernel/head_8xx.S   | 425 +++
 arch/powerpc/mm/8xx_mmu.c|  29 +-
 arch/powerpc/mm/Makefile |   7 +-
 

Re: [PATCH] powerpc/fadump: re-register firmware-assisted dump if already registered

2018-09-18 Thread Mahesh Jagannath Salgaonkar
On 09/14/2018 07:36 PM, Hari Bathini wrote:
> Firmware-Assisted Dump (FADump) needs to be registered again after any
> memory hot add/remove operation to update the crash memory ranges. But
> currently, the kernel returns '-EEXIST' if we try to register without
> unregistering it first. This could expose the system to racing issues
> while unregistering and registering FADump from userspace during udev
> events. Spare the userspace of this and let it be taken care of in the
> kernel space for a simpler interface.
> 
> Since this change, running 'echo 1 > /sys/kernel/fadump_registered'
> would result in re-registering (unregistering and registering) FADump,
> if it was already registered.
> 
> Signed-off-by: Hari Bathini 

Looks good to me.

Acked-by: Mahesh Salgaonkar 

Thanks,
-Mahesh.

> ---
>  arch/powerpc/kernel/fadump.c |4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index a711d22..761b28b 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -1444,8 +1444,8 @@ static ssize_t fadump_register_store(struct kobject 
> *kobj,
>   break;
>   case 1:
>   if (fw_dump.dump_registered == 1) {
> - ret = -EEXIST;
> - goto unlock_out;
> + /* Un-register Firmware-assisted dump */
> + fadump_unregister_dump();
>   }
>   /* Register Firmware-assisted dump */
>   ret = register_fadump();
> 
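
The resulting userspace flow, for illustration (using the sysfs file named in
the commit message):

  # after a memory hot add/remove event, a single write now refreshes
  # the crash memory ranges; before this patch it returned -EEXIST
  # unless preceded by 'echo 0':
  echo 1 > /sys/kernel/fadump_registered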



Re: [PATCH v3 2/3] watchdog: mpc8xxx: provide boot status

2018-09-18 Thread Guenter Roeck
On Mon, Sep 17, 2018 at 06:22:50AM +, Christophe Leroy wrote:
> mpc8xxx watchdog driver supports the following platforms:
> - mpc8xx
> - mpc83xx
> - mpc86xx
> 
> Those three platforms have a 32-bit register which provides the
> reason for the last boot, including whether it was caused by the
> watchdog.
> 
> mpc8xx: Register RSR, bit SWRS (bit 3)
> mpc83xx: Register RSR, bit SWRS (bit 28)
> mpc86xx: Register RSTRSCR, bit WDT_RR (bit 11)
> 
> This patch maps the register as defined in the device tree and updates
> wdt.bootstatus based on the value of the watchdog related bit. Then
> the information can be retrieved via the WDIOC_GETBOOTSTATUS ioctl.
> 
> Hereunder is an example of devicetree for mpc8xx,
> the Reset Status Register being at offset 0x288:
> 
>   WDT: watchdog@0 {
>   compatible = "fsl,mpc823-wdt";
>   reg = <0x0 0x10 0x288 0x4>;
>   };
> 
> On the mpc83xx, RSR is at offset 0x910
> On the mpc86xx, RSTRSCR is at offset 0xe0094
> 
> Suggested-by: Radu Rendec 
> Tested-by: Christophe Leroy  # On mpc885
> Signed-off-by: Christophe Leroy 
> ---

I am ok with the patch, though I would really appreciate change logs
in the future. Consider this an informal Reviewed-by:.
Waiting for DT review before final approval.

Guenter

>  drivers/watchdog/mpc8xxx_wdt.c | 22 ++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c
> index 1dcf5f10cdd9..069072e6747d 100644
> --- a/drivers/watchdog/mpc8xxx_wdt.c
> +++ b/drivers/watchdog/mpc8xxx_wdt.c
> @@ -47,6 +47,7 @@ struct mpc8xxx_wdt {
>  struct mpc8xxx_wdt_type {
>   int prescaler;
>   bool hw_enabled;
> + u32 rsr_mask;
>  };
>  
>  struct mpc8xxx_wdt_ddata {
> @@ -159,6 +160,24 @@ static int mpc8xxx_wdt_probe(struct platform_device 
> *ofdev)
>   return -ENODEV;
>   }
>  
> + res = platform_get_resource(ofdev, IORESOURCE_MEM, 1);
> + if (res) {
> + bool status;
> + u32 __iomem *rsr = ioremap(res->start, resource_size(res));
> +
> + if (!rsr)
> + return -ENOMEM;
> +
> + status = in_be32(rsr) & wdt_type->rsr_mask;
> + ddata->wdd.bootstatus = status ? WDIOF_CARDRESET : 0;
> +  /* clear reset status bits related to watchdog timer */
> + out_be32(rsr, wdt_type->rsr_mask);
> + iounmap(rsr);
> +
> + dev_info(dev, "Last boot was %scaused by watchdog\n",
> +  status ? "" : "not ");
> + }
> +
>   spin_lock_init(&ddata->lock);
>  
>   ddata->wdd.info = &mpc8xxx_wdt_info,
> @@ -216,6 +235,7 @@ static const struct of_device_id mpc8xxx_wdt_match[] = {
>   .compatible = "mpc83xx_wdt",
>   .data = &(struct mpc8xxx_wdt_type) {
>   .prescaler = 0x1,
> + .rsr_mask = BIT(3), /* RSR Bit SWRS */
>   },
>   },
>   {
> @@ -223,6 +243,7 @@ static const struct of_device_id mpc8xxx_wdt_match[] = {
>   .data = &(struct mpc8xxx_wdt_type) {
>   .prescaler = 0x1,
>   .hw_enabled = true,
> + .rsr_mask = BIT(20), /* RSTRSCR Bit WDT_RR */
>   },
>   },
>   {
> @@ -230,6 +251,7 @@ static const struct of_device_id mpc8xxx_wdt_match[] = {
>   .data = &(struct mpc8xxx_wdt_type) {
>   .prescaler = 0x800,
>   .hw_enabled = true,
> + .rsr_mask = BIT(28), /* RSR Bit SWRS */
>   },
>   },
>   {},
> -- 
> 2.13.3
> 
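
For completeness, the bootstatus set above is what userspace retrieves with
the WDIOC_GETBOOTSTATUS ioctl; a minimal (hypothetical) reader could look like
this; note that opening /dev/watchdog may start the timer on some drivers:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/watchdog.h>

int main(void)
{
	int flags = 0;
	int fd = open("/dev/watchdog", O_RDONLY);

	if (fd < 0)
		return 1;
	ioctl(fd, WDIOC_GETBOOTSTATUS, &flags);
	printf("last boot %s caused by the watchdog\n",
	       (flags & WDIOF_CARDRESET) ? "was" : "was not");
	return 0;
}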


Re: [PATCH RFC 1/4] PCI: hotplug: Add parameter to put devices to reset during rescan

2018-09-18 Thread Sergey Miroshnichenko
On 9/18/18 1:59 AM, Bjorn Helgaas wrote:
> [+cc Russell, Ben, Oliver, linuxppc-dev]
> 
> On Mon, Sep 17, 2018 at 11:55:43PM +0300, Sergey Miroshnichenko wrote:
>> Hello Sam,
>>
>> On 9/17/18 8:28 AM, Sam Bobroff wrote:
>>> Hi Sergey,
>>>
>>> On Fri, Sep 14, 2018 at 07:14:01PM +0300, Sergey Miroshnichenko wrote:
 Introduce a new command line option "pci=pcie_movable_bars" that indicates
 support of PCIe hotplug without prior reservation of memory regions by
 BIOS/bootloader.

 If a new PCIe device has been hot-plugged between two active ones, which
 have no (or not big enough) gap between their BARs, allocating new BARs
 requires moving the BARs of the following working devices:

 1)   dev 4
|
v
 .. |  dev 3  |  dev 3  |  dev 5  |  dev 7  |
 .. |  BAR 0  |  BAR 1  |  BAR 0  |  BAR 0  |

 2) dev 4
  |
  v
 .. |  dev 3  |  dev 3  | -->   --> |  dev 5  |  dev 7  |
 .. |  BAR 0  |  BAR 1  | -->   --> |  BAR 0  |  BAR 0  |

 3)

 .. |  dev 3  |  dev 3  |  dev 4  |  dev 4  |  dev 5  |  dev 7  |
 .. |  BAR 0  |  BAR 1  |  BAR 0  |  BAR 1  |  BAR 0  |  BAR 0  |

 Not only BARs, but also bridge windows can be updated during a PCIe rescan,
 threatening all memory transactions during this procedure, so the PCI
 subsystem will instruct the drivers to pause by calling the reset_prepare()
 and reset_done() callbacks.

 If a device may be affected by BAR movement, tracking of BAR changes must
 be implemented in its driver.

 Signed-off-by: Sergey Miroshnichenko 
 ---
  .../admin-guide/kernel-parameters.txt |  6 +++
  drivers/pci/pci.c |  2 +
  drivers/pci/probe.c   | 43 +++
  include/linux/pci.h   |  1 +
  4 files changed, 52 insertions(+)

 diff --git a/Documentation/admin-guide/kernel-parameters.txt 
 b/Documentation/admin-guide/kernel-parameters.txt
 index 64a3bf54b974..f8132a709061 100644
 --- a/Documentation/admin-guide/kernel-parameters.txt
 +++ b/Documentation/admin-guide/kernel-parameters.txt
 @@ -3311,6 +3311,12 @@
bridges without forcing it upstream. Note:
this removes isolation between devices and
may put more devices in an IOMMU group.
 +  pcie_movable_bars   Arrange a space at runtime for BARs of
 +  hotplugged PCIe devices - usable if bootloader
 +  doesn't reserve memory regions for them. Freeing
 +  a space may require moving BARs of active 
 devices
 +  to higher addresses, so device drivers will be
 +  paused during rescan.
  
pcie_aspm=  [PCIE] Forcibly enable or disable PCIe Active State 
 Power
Management.
 diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
 index 1835f3a7aa8d..5f07a59b5924 100644
 --- a/drivers/pci/pci.c
 +++ b/drivers/pci/pci.c
 @@ -6105,6 +6105,8 @@ static int __init pci_setup(char *str)
pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS);
} else if (!strncmp(str, "disable_acs_redir=", 18)) {
disable_acs_redir_param = str + 18;
 +  } else if (!strncmp(str, "pcie_movable_bars", 17)) {
 +  pci_add_flags(PCI_MOVABLE_BARS);
} else {
printk(KERN_ERR "PCI: Unknown option `%s'\n",
str);
 diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
 index 201f9e5ff55c..bdaafc48dc4c 100644
 --- a/drivers/pci/probe.c
 +++ b/drivers/pci/probe.c
 @@ -3138,6 +3138,45 @@ unsigned int pci_rescan_bus_bridge_resize(struct 
 pci_dev *bridge)
return max;
  }
  
 +/*
 + * Put all devices of the bus and its children to reset
 + */
 +static void pci_bus_reset_prepare(struct pci_bus *bus)
 +{
 +  struct pci_dev *dev;
 +
 +  list_for_each_entry(dev, &bus->devices, bus_list) {
 +  struct pci_bus *child = dev->subordinate;
 +
 +  if (child) {
 +  pci_bus_reset_prepare(child);
 +  } else if (dev->driver &&
 + dev->driver->err_handler &&
 + dev->driver->err_handler->reset_prepare) {
 +  dev->driver->err_handler->reset_prepare(dev);
 +  }
>>>
>>> What about devices with drivers that don't have reset_prepare()?  It
>>> looks 
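
For context on what "drivers will be paused" means here: a driver opting in
would wire up the existing reset_prepare()/reset_done() members of struct
pci_error_handlers roughly as below. This is a sketch with made-up foo_*
names, not code from the series:

#include <linux/pci.h>

struct foo_priv {
	void __iomem *regs;
};

static void foo_quiesce(struct foo_priv *priv) { /* stop DMA/MMIO */ }
static void foo_resume(struct foo_priv *priv)  { /* restart the device */ }

static void foo_reset_prepare(struct pci_dev *pdev)
{
	struct foo_priv *priv = pci_get_drvdata(pdev);

	foo_quiesce(priv);		/* no accesses while BARs may move */
	pci_iounmap(pdev, priv->regs);
}

static void foo_reset_done(struct pci_dev *pdev)
{
	struct foo_priv *priv = pci_get_drvdata(pdev);

	priv->regs = pci_iomap(pdev, 0, 0);	/* re-map the possibly moved BAR */
	foo_resume(priv);
}

static const struct pci_error_handlers foo_err_handler = {
	.reset_prepare	= foo_reset_prepare,
	.reset_done	= foo_reset_done,
};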

Re: [RFC PATCH 3/3] powerpc/mm/iommu: Allow migration of cma allocated pages during mm_iommu_get

2018-09-18 Thread Aneesh Kumar K.V

On 9/18/18 9:21 AM, David Gibson wrote:

On Mon, Sep 03, 2018 at 10:07:33PM +0530, Aneesh Kumar K.V wrote:

The current code doesn't do page migration if the allocated page is a compound page.
With HugeTLB migration support, we can end up allocating hugetlb pages from
the CMA region. THP pages can also be allocated from the CMA region. This patch
updates the code to handle compound pages correctly.

This adds a new helper, get_user_pages_cma_migrate. It does one get_user_pages
with the right count, instead of doing one get_user_pages per page. That avoids
reading the page table multiple times. The helper could possibly be used by other
subsystems if we get more users.

The patch also converts the hpas member of mm_iommu_table_group_mem_t to a union.
We use the same storage location to store pointers to struct page. We cannot
update all the code paths to use struct page *, because we access hpas in real
mode and we can't do the struct page * to pfn conversion in real mode.

Signed-off-by: Aneesh Kumar K.V 


This approach doesn't seem quite right to me.  It's specific to pages
mapped into the IOMMU.  It's true that it will address the obvious case
we have, of vfio-using guests fragmenting the CMA for other guests.

But AFAICT, fragmenting the CMA could happen with *any* locked memory,
not just things that are IOMMU mapped for VFIO.  So, for example a
guest not using vfio, but using -realtime mlock=on, or an unrelated
program using locked memory (e.g. gpg or something else that locks
memory for security reasons).

AFAICT this approach won't fix the problem for that case.



Yes, and we should migrate away pages that were allocated out of the CMA
region before we pin/mlock them. This handles the long-term pin w.r.t.
vfio. For mlock we should do that too.


-aneesh



Re: [PATCH 2/3] powerpc: Add system call table generation support

2018-09-18 Thread Firoz Khan
On 14 September 2018 at 15:31, Arnd Bergmann  wrote:
> On Fri, Sep 14, 2018 at 10:33 AM Firoz Khan  wrote:
>
>> ---
>>  arch/powerpc/kernel/syscalls/Makefile   |  51 
>>  arch/powerpc/kernel/syscalls/syscall_32.tbl | 378 
>> 
>>  arch/powerpc/kernel/syscalls/syscall_64.tbl | 372 
>> +++
>>  arch/powerpc/kernel/syscalls/syscallhdr.sh  |  37 +++
>>  arch/powerpc/kernel/syscalls/syscalltbl.sh  |  38 +++
>
> I think you should only need a single .tbl  input file here.

Yes, we can do it that way also. As I mentioned, it will add
more complexity to the script.

The script has to be smart enough to parse the
.tbl file if we add more things to it. That needs more
logic in the scripts, and it is not common. So if we keep
separate .tbl files we can avoid this.

The ABI flag serves *nothing* in all other architectures, including
SPARC.

But as I said in the cover letter, I followed the x86/arm/
s390 architectures' system call table generation implementation.
They keep the ABI flag. In our case we could delete this
flag completely from all architectures.

In most architectures this 32/64 similarity is absent.
So it would be better to keep separate files in order to maintain a
generic script across all architectures.

>
>
>> +
>> +systbl_abi_syscall_table_32 := 32
>> +$(out)/syscall_table_32.h: $(syscall32) $(systbl)
>> +   $(call if_changed,systbl)
>> +
>> +systbl_abi_syscall_table_64 := 64
>> +$(out)/syscall_table_64.h: $(syscall64) $(systbl)
>> +   $(call if_changed,systbl)
>> +
>> +systbl_abi_syscall_table_c32 := c32
>> +$(out)/syscall_table_c32.h: $(syscall32) $(systbl)
>> +   $(call if_changed,systbl)
>
> And here you need a fourth output file for the SPU table on ppc64.

Hmm. Let me have a look where things went wrong.

>
>> +383 common  statx   sys_statx
>> +384 common  pkey_alloc  sys_pkey_alloc
>> +385 common  pkey_free   sys_pkey_free
>> +386 common  pkey_mprotect   sys_pkey_mprotect
>
> This also misses rseq and io_pgetevents.

As I mentioned in the cover letter:
"I started working system call table generation on 4.17-rc1. I used
marcin's script - https://github.com/hrw/syscalls-table to generate
the syscall.tbl file. And this will be the input to the system call
table generation script. But there are couple system call got add
in the latest rc release. If run Marcin's script on latest release,
It will generate a new syscall.tbl. But I still use the old file -
syscall.tbl and once all review got over I'll update syscall.tbl
alone w.r.to the tip of the kernel. The impact of this thing, few
of the system call won't work."

Hopefully, the next version does have this change. Thanks!

- Firoz
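
For readers unfamiliar with the input format: each .tbl row is
<number> <abi> <name> <entry point>, and the two shell scripts expand a row
into a numbered define and a table entry, conceptually like this (illustrative
output only, not the scripts' exact text):

# row in syscall_32.tbl:
#   383	common	statx	sys_statx
# syscallhdr.sh emits something like:
#   #define __NR_statx 383
# syscalltbl.sh emits something like:
#   __SYSCALL(383, sys_statx)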


[PATCH V3] powerpc/mm/iommu: Allow large IOMMU page size only for hugetlb backing

2018-09-18 Thread Aneesh Kumar K.V
THP pages can get split along different code paths. An incremented reference
count does imply we will not split the compound page. But the pmd entry can be
converted to level 4 pte entries. Keep the code simpler by allowing a large
IOMMU page size only if the guest RAM is backed by hugetlb pages.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/mmu_context_iommu.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index f0d8645872cb..c3172126bc0d 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -91,8 +91,6 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
struct mm_iommu_table_group_mem_t *mem;
long i, ret = 0, locked_entries = 0;
unsigned int pageshift;
-   unsigned long flags;
-   unsigned long cur_ua;
 
	mutex_lock(&mem_list_mutex);
 
@@ -155,23 +153,15 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
pageshift = PAGE_SHIFT;
for (i = 0; i < entries; ++i) {
struct page *page = mem->hpages[i];
-   cur_ua = ua + (i << PAGE_SHIFT);
 
-   if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
-   pte_t *pte;
+   /*
+* Allow to use larger than 64k IOMMU pages. Only do that
+* if we are backed by hugetlb.
+*/
+   if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
struct page *head = compound_head(page);
-   unsigned int compshift = compound_order(head);
-   unsigned int pteshift;
-
-   local_irq_save(flags); /* disables as well */
-   pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
-
-   /* Double check it is still the same pinned page */
-   if (pte && pte_page(*pte) == head &&
-   pteshift == compshift + PAGE_SHIFT)
-   pageshift = max_t(unsigned int, pteshift,
-   PAGE_SHIFT);
-   local_irq_restore(flags);
+
+   pageshift = compound_order(head) + PAGE_SHIFT;
}
mem->pageshift = min(mem->pageshift, pageshift);
/*
-- 
2.17.1
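For reference, a worked example of the computation above, assuming a 64K base
page size (PAGE_SHIFT = 16) and the guest RAM backed by 16M hugetlb pages:

	/* a 16M hugetlb page is a compound page of order 8 with 64K base pages */
	pageshift = compound_order(head) + PAGE_SHIFT;	/* 8 + 16 = 24 */
	/* mem->pageshift is then capped at 24, allowing up to 16M IOMMU pages */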



[PATCH] powerpc/mm/iommu: Allow large IOMMU page size only for hugetlb backing

2018-09-18 Thread Aneesh Kumar K.V
THP pages can get split along different code paths. An incremented reference
count does imply we will not split the compound page. But the pmd entry can be
converted to level 4 pte entries. Keep the code simpler by allowing a large
IOMMU page size only if the guest RAM is backed by hugetlb pages.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/mmu_context_iommu.c | 24 +++-
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index f0d8645872cb..c3172126bc0d 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -91,8 +91,6 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
struct mm_iommu_table_group_mem_t *mem;
long i, ret = 0, locked_entries = 0;
unsigned int pageshift;
-   unsigned long flags;
-   unsigned long cur_ua;
 
	mutex_lock(&mem_list_mutex);
 
@@ -155,23 +153,15 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
pageshift = PAGE_SHIFT;
for (i = 0; i < entries; ++i) {
struct page *page = mem->hpages[i];
-   cur_ua = ua + (i << PAGE_SHIFT);
 
-   if (mem->pageshift > PAGE_SHIFT && PageCompound(page)) {
-   pte_t *pte;
+   /*
+* Allow to use larger than 64k IOMMU pages. Only do that
+* if we are backed by hugetlb.
+*/
+   if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
struct page *head = compound_head(page);
-   unsigned int compshift = compound_order(head);
-   unsigned int pteshift;
-
-   local_irq_save(flags); /* disables as well */
-   pte = find_linux_pte(mm->pgd, cur_ua, NULL, &pteshift);
-
-   /* Double check it is still the same pinned page */
-   if (pte && pte_page(*pte) == head &&
-   pteshift == compshift + PAGE_SHIFT)
-   pageshift = max_t(unsigned int, pteshift,
-   PAGE_SHIFT);
-   local_irq_restore(flags);
+
+   pageshift = compound_order(head) + PAGE_SHIFT;
}
mem->pageshift = min(mem->pageshift, pageshift);
/*
-- 
2.17.1



[PATCH V3 2/2] powerpc/mm/iommu: Allow migration of cma allocated pages during mm_iommu_get

2018-09-18 Thread Aneesh Kumar K.V
The current code doesn't do page migration if the allocated page is a compound page.
With HugeTLB migration support, we can end up allocating hugetlb pages from
the CMA region. THP pages can also be allocated from the CMA region. This patch
updates the code to handle compound pages correctly.

This uses the new helper get_user_pages_cma_migrate. It does one get_user_pages
with the right count, instead of doing one get_user_pages per page. That avoids
reading the page table multiple times.

The patch also converts the hpas member of mm_iommu_table_group_mem_t to a union.
We use the same storage location to store pointers to struct page. We cannot
update all the code paths to use struct page *, because we access hpas in real
mode and we can't do the struct page * to pfn conversion in real mode.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/mmu_context_iommu.c | 120 
 1 file changed, 35 insertions(+), 85 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c 
b/arch/powerpc/mm/mmu_context_iommu.c
index c9ee9e23845f..f0d8645872cb 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static DEFINE_MUTEX(mem_list_mutex);
 
@@ -30,8 +31,18 @@ struct mm_iommu_table_group_mem_t {
atomic64_t mapped;
unsigned int pageshift;
u64 ua; /* userspace address */
-   u64 entries;/* number of entries in hpas[] */
-   u64 *hpas;  /* vmalloc'ed */
+   u64 entries;/* number of entries in hpages[] */
+   /*
+* in mm_iommu_get we temporarily use this to store
+* struct page address.
+*
+* We need to convert ua to hpa in real mode. Make it
+* simpler by storing the physical address.
+*/
+   union {
+   struct page **hpages;   /* vmalloc'ed */
+   phys_addr_t *hpas;
+   };
 };
 
 static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
@@ -74,63 +85,14 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
-/*
- * Taken from alloc_migrate_target with changes to remove CMA allocations
- */
-struct page *new_iommu_non_cma_page(struct page *page, unsigned long private)
-{
-   gfp_t gfp_mask = GFP_USER;
-   struct page *new_page;
-
-   if (PageCompound(page))
-   return NULL;
-
-   if (PageHighMem(page))
-   gfp_mask |= __GFP_HIGHMEM;
-
-   /*
-* We don't want the allocation to force an OOM if possibe
-*/
-   new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
-   return new_page;
-}
-
-static int mm_iommu_move_page_from_cma(struct page *page)
-{
-   int ret = 0;
-   LIST_HEAD(cma_migrate_pages);
-
-   /* Ignore huge pages for now */
-   if (PageCompound(page))
-   return -EBUSY;
-
-   lru_add_drain();
-   ret = isolate_lru_page(page);
-   if (ret)
-   return ret;
-
-   list_add(&page->lru, &cma_migrate_pages);
-   put_page(page); /* Drop the gup reference */
-
-   ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
-   NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE);
-   if (ret) {
-   if (!list_empty(&cma_migrate_pages))
-   putback_movable_pages(&cma_migrate_pages);
-   }
-
-   return 0;
-}
-
 long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long 
entries,
struct mm_iommu_table_group_mem_t **pmem)
 {
struct mm_iommu_table_group_mem_t *mem;
-   long i, j, ret = 0, locked_entries = 0;
+   long i, ret = 0, locked_entries = 0;
unsigned int pageshift;
unsigned long flags;
unsigned long cur_ua;
-   struct page *page = NULL;
 
	mutex_lock(&mem_list_mutex);
 
@@ -177,41 +139,24 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, 
unsigned long entries,
goto unlock_exit;
}
 
+   ret = get_user_pages_cma_migrate(ua, entries, 1, mem->hpages);
+   if (ret != entries) {
+   /* free the reference taken */
+   for (i = 0; i < ret; i++)
+   put_page(mem->hpages[i]);
+
+   vfree(mem->hpas);
+   kfree(mem);
+   ret = -EFAULT;
+   goto unlock_exit;
+   } else
+   ret = 0;
+
+   pageshift = PAGE_SHIFT;
for (i = 0; i < entries; ++i) {
+   struct page *page = mem->hpages[i];
cur_ua = ua + (i << PAGE_SHIFT);
-   if (1 != get_user_pages_fast(cur_ua,
-   1/* pages */, 1/* iswrite */, &page)) {
-   ret = -EFAULT;
-   for (j = 0; j < i; ++j)
-   put_page(pfn_to_page(mem->hpas[j] >>
-   PAGE_SHIFT));
- 

[PATCH V3 1/2] mm: Add get_user_pages_cma_migrate

2018-09-18 Thread Aneesh Kumar K.V
This helper does a get_user_pages_fast and, if it finds pages in the CMA area,
it will try to migrate them before taking the page references. This makes sure
that we don't keep non-movable pages (due to the page reference count) in the
CMA area. Not being able to move pages out of the CMA area results in CMA
allocation failures.

Signed-off-by: Aneesh Kumar K.V 
---
 include/linux/hugetlb.h |   2 +
 include/linux/migrate.h |   3 +
 mm/hugetlb.c|   4 +-
 mm/migrate.c| 132 
 4 files changed, 139 insertions(+), 2 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6b68e345f0ca..1abccb1a1ecc 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -357,6 +357,8 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int 
preferred_nid,
nodemask_t *nmask);
 struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
unsigned long address);
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+int nid, nodemask_t *nmask);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx);
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f2b4abbca55e..d82b35afd2eb 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -286,6 +286,9 @@ static inline int migrate_vma(const struct migrate_vma_ops 
*ops,
 }
 #endif /* IS_ENABLED(CONFIG_MIGRATE_VMA_HELPER) */
 
+extern int get_user_pages_cma_migrate(unsigned long start, int nr_pages, int 
write,
+ struct page **pages);
+
 #endif /* CONFIG_MIGRATION */
 
 #endif /* _LINUX_MIGRATE_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3c21775f196b..1abbfcb84f66 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1585,8 +1585,8 @@ static struct page *alloc_surplus_huge_page(struct hstate 
*h, gfp_t gfp_mask,
return page;
 }
 
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
-   int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+int nid, nodemask_t *nmask)
 {
struct page *page;
 
diff --git a/mm/migrate.c b/mm/migrate.c
index d6a2e89b086a..2f92534ea7a1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -3006,3 +3006,135 @@ int migrate_vma(const struct migrate_vma_ops *ops,
 }
 EXPORT_SYMBOL(migrate_vma);
 #endif /* defined(MIGRATE_VMA_HELPER) */
+
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+   /*
+* We want to make sure we allocate the new page from the same node
+* as the source page.
+*/
+   int nid = page_to_nid(page);
+   gfp_t gfp_mask = GFP_USER | __GFP_THISNODE;
+
+   if (PageHighMem(page))
+   gfp_mask |= __GFP_HIGHMEM;
+
+   if (PageTransHuge(page)) {
+   struct page *thp;
+   gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_THISNODE;
+
+   /*
+* Remove the movable mask so that we don't allocate from
+* CMA area again.
+*/
+   thp_gfpmask &= ~__GFP_MOVABLE;
+   thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+   if (!thp)
+   return NULL;
+   prep_transhuge_page(thp);
+   return thp;
+
+#ifdef  CONFIG_HUGETLB_PAGE
+   } else if (PageHuge(page)) {
+
+   struct hstate *h = page_hstate(page);
+   /*
+* We don't want to dequeue from the pool because pool pages 
will
+* mostly be from the CMA region.
+*/
+   return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+#endif
+   }
+
+   return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+/**
+ * get_user_pages_cma_migrate() - pin user pages in memory by migrating pages 
in CMA region
+ * @start: starting user address
+ * @nr_pages:  number of pages from start to pin
+ * @write: whether pages will be written to
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * If not successful, it will fall back to taking the lock and
+ * calling get_user_pages().
+ *
+ * If the pinned pages are backed by CMA region, we migrate those pages out,
+ * allocating new pages from non-CMA region. This helps in avoiding keeping
+ * pages pinned in the CMA region for a long time thereby resulting in
+ * CMA allocation failures.
+ *
+ * Returns number of pages pinned. This may be fewer than the number
+ * requested. If nr_pages is 0 or negative, returns 0. If no pages
+ * were pinned, returns -errno.
+ */
+
+int get_user_pages_cma_migrate(unsigned long start, int nr_pages, 

[PATCH V3 0/2] mm/kvm/vfio/ppc64: Migrate compound pages out of CMA region

2018-09-18 Thread Aneesh Kumar K.V
ppc64 uses the CMA area for the allocation of the guest page table (hash page
table). We won't be able to start a guest if we fail to allocate the hash page
table. We have observed hash table allocation failures because we failed to
migrate pages out of the CMA region because they were pinned. This happens when
we are using VFIO. VFIO on ppc64 pins the entire guest RAM. If the guest RAM
pages get allocated out of the CMA region, we won't be able to migrate those
pages. The pages are also pinned for the lifetime of the guest.

Currently we support migration of non-compound pages. With THP and with the
addition of hugetlb migration we can end up allocating compound pages from the
CMA region. This patch series adds support for migrating compound pages. The
first patch adds the helper get_user_pages_cma_migrate(), which pins the pages,
making sure we migrate them out of the CMA region before incrementing the
reference count.

Aneesh Kumar K.V (2):
  mm: Add get_user_pages_cma_migrate
  powerpc/mm/iommu: Allow migration of cma allocated pages during
mm_iommu_get

 arch/powerpc/mm/mmu_context_iommu.c | 120 -
 include/linux/hugetlb.h |   2 +
 include/linux/migrate.h |   3 +
 mm/hugetlb.c|   4 +-
 mm/migrate.c| 132 
 5 files changed, 174 insertions(+), 87 deletions(-)

-- 
2.17.1



Re: How to handle PTE tables with non contiguous entries ?

2018-09-18 Thread Christophe LEROY




On 18/09/2018 at 13:47, Aneesh Kumar K.V wrote:

Christophe LEROY  writes:


On 17/09/2018 at 11:03, Aneesh Kumar K.V wrote:

Christophe Leroy  writes:


Hi,

I'm having a hard time figuring out the best way to handle the following
situation:

On the powerpc 8xx, handling 16k size pages requires page tables
with 4 identical entries.


I assume that hugetlb page size? If so isn't that similar to FSL hugetlb
page table layout?


No, it is not for 16k hugepage size with a standard page size of 4k.

Here I'm trying to handle the case of CONFIG_PPC_16K_PAGES.
As of today, it is implemented by using the standard Linux page layout,
ie one PTE entry for each 16k page. This forbids the use of the 8xx HW
assistance.





Initially I was thinking about handling this by simply modifying
pte_index() and changing the pte_t type in order to have one entry every
16 bytes, then replicating the PTE value at *ptep, *ptep+1, *ptep+2 and
*ptep+3 in both set_pte_at() and pte_update().

However, this doesn't work because many, many places in the core mm part
of the kernel loop on ptep with a single ptep++ increment.

Therefore did it with the following hack:

/* PTE level */
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+typedef struct { pte_basic_t pte, pte1, pte2, pte3; } pte_t;
+#else
typedef struct { pte_basic_t pte; } pte_t;
+#endif

@@ -181,7 +192,13 @@ static inline unsigned long pte_update(pte_t *p,
   : "cc" );
#else /* PTE_ATOMIC_UPDATES */
   unsigned long old = pte_val(*p);
-   *p = __pte((old & ~clr) | set);
+   unsigned long new = (old & ~clr) | set;
+
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+   p->pte = p->pte1 = p->pte2 = p->pte3 = new;
+#else
+   *p = __pte(new);
+#endif
#endif /* !PTE_ATOMIC_UPDATES */

#ifdef CONFIG_44x


@@ -161,7 +161,11 @@ static inline void __set_pte_at(struct mm_struct
*mm, unsigned long addr,
   /* Anything else just stores the PTE normally. That covers all
64-bit
* cases, and 32-bit non-hash with 32-bit PTEs.
*/
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
+   ptep->pte = ptep->pte1 = ptep->pte2 = ptep->pte3 = pte_val(pte);
+#else
   *ptep = pte;
+#endif



But I'm not too happy with it as it means pte_t is not a single type
anymore so passing it from one function to the other is quite heavy.


Would someone have an idea of an elegent way to handle that ?

Thanks
Christophe


Why would pte_update bother about updating all the 4 entries?. Can you
help me understand the issue?


Because the 8xx HW assistance expects 4 identical entries for each 16k
page, so every time a PTE is updated the 4 entries have to be updated.



What you suggested in the original mail is what matches that best isn't it?
That is a linux pte update involves updating 4 slot. Hence a linux pte
consist of 4 unsigned long?



Yes indeed.
It seems hopeless to avoid carrying the 4 longs from one function to the
other, although it's four times the same value.
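For reference, the hack quoted above can at least be factored so that
set_pte_at() and pte_update() share one replication helper. pte_set_all() is
a hypothetical name, and the layout assumes CONFIG_PPC_8xx &&
CONFIG_PPC_16K_PAGES as discussed:

/* Hypothetical helper factoring out the four-slot store: the 8xx HW
 * tablewalk assist requires all four entries covering a 16k page to
 * stay identical, so every update writes all of them.
 */
typedef struct { pte_basic_t pte, pte1, pte2, pte3; } pte_t;

static inline void pte_set_all(pte_t *p, pte_basic_t val)
{
	p->pte  = val;
	p->pte1 = val;
	p->pte2 = val;
	p->pte3 = val;
}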


Christophe


[PATCH v1 6/6] memory-hotplug.txt: Add some details about locking internals

2018-09-18 Thread David Hildenbrand
Let's document the magic a bit, especially why device_hotplug_lock is
required when adding/removing memory and how it all plays together with
requests to online/offline memory from user space.

Cc: Jonathan Corbet 
Cc: Michal Hocko 
Cc: Andrew Morton 
Reviewed-by: Pavel Tatashin 
Signed-off-by: David Hildenbrand 
---
 Documentation/memory-hotplug.txt | 39 +++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 7f49ebf3ddb2..03aaad7d7373 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -3,7 +3,7 @@ Memory Hotplug
 ==
 
 :Created:  Jul 28 2007
-:Updated: Add description of notifier of memory hotplug:   Oct 11 2007
+:Updated: Add some details about locking internals:Aug 20 2018
 
 This document is about memory hotplug including how-to-use and current status.
 Because Memory Hotplug is still under development, contents of this text will
@@ -495,6 +495,43 @@ further processing of the notification queue.
 
 NOTIFY_STOP stops further processing of the notification queue.
 
+
+Locking Internals
+=
+
+When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
+the device_hotplug_lock should be held to:
+
+- synchronize against online/offline requests (e.g. via sysfs). This way, memory
+  block devices can only be accessed (.online/.state attributes) by user
+  space once memory has been fully added. And when removing memory, we
+  know nobody is in critical sections.
+- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC)
+
+Especially, there is a possible lock inversion that is avoided using
+device_hotplug_lock when adding memory and user space tries to online that
+memory faster than expected:
+
+- device_online() will first take the device_lock(), followed by
+  mem_hotplug_lock
+- add_memory_resource() will first take the mem_hotplug_lock, followed by
+  the device_lock() (while creating the devices, during bus_add_device()).
+
+As the device is visible to user space before taking the device_lock(), this
+can result in a lock inversion.
+
+onlining/offlining of memory should be done via device_online()/
+device_offline() - to make sure it is properly synchronized to actions
+via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
+
+When adding/removing/onlining/offlining memory or adding/removing
+heterogeneous/device memory, we should always hold the mem_hotplug_lock to
+serialise memory hotplug (e.g. access to global/zone variables).
+
+In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) allows
+for a quite efficient get_online_mems/put_online_mems implementation.
+
+
 Future Work
 ===
 
-- 
2.17.1



[PATCH v1 5/6] powerpc/powernv: hold device_hotplug_lock in memtrace_offline_pages()

2018-09-18 Thread David Hildenbrand
Let's perform all checking + offlining + removing under
device_hotplug_lock, so nobody can mess with these devices via
sysfs concurrently.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Rashmica Gupta 
Cc: Balbir Singh 
Cc: Michael Neuling 
Reviewed-by: Pavel Tatashin 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index ef7181d4fe68..473e59842ec5 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -74,9 +74,13 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
u64 end_pfn = start_pfn + nr_pages - 1;
 
+   lock_device_hotplug();
+
if (walk_memory_range(start_pfn, end_pfn, NULL,
-   check_memblock_online))
+   check_memblock_online)) {
+   unlock_device_hotplug();
return false;
+   }
 
walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
  change_memblock_state);
@@ -84,14 +88,16 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
if (offline_pages(start_pfn, nr_pages)) {
walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
  change_memblock_state);
+   unlock_device_hotplug();
return false;
}
 
walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
  change_memblock_state);
 
-   remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
+   __remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
 
+   unlock_device_hotplug();
return true;
 }
 
-- 
2.17.1



[PATCH v1 4/6] powerpc/powernv: hold device_hotplug_lock when calling device_online()

2018-09-18 Thread David Hildenbrand
device_online() should be called with device_hotplug_lock() held.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Rashmica Gupta 
Cc: Balbir Singh 
Cc: Michael Neuling 
Reviewed-by: Pavel Tatashin 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 8f1cd4f3bfd5..ef7181d4fe68 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -229,9 +229,11 @@ static int memtrace_online(void)
 * we need to online the memory ourselves.
 */
if (!memhp_auto_online) {
+   lock_device_hotplug();
walk_memory_range(PFN_DOWN(ent->start),
  PFN_UP(ent->start + ent->size - 1),
  NULL, online_mem_block);
+   unlock_device_hotplug();
}
 
/*
-- 
2.17.1



[PATCH v1 3/6] mm/memory_hotplug: fix online/offline_pages called w.o. mem_hotplug_lock

2018-09-18 Thread David Hildenbrand
There seem to be some problems as a result of 30467e0b3be ("mm, hotplug:
fix concurrent memory hot-add deadlock"), which tried to fix a possible
lock inversion reported and discussed in [1] due to the two locks
a) device_lock()
b) mem_hotplug_lock

While add_memory() first takes b), followed by a) during
bus_probe_device(), onlining of memory from user space first took a),
followed by b), exposing a possible deadlock.

In [1], it was decided to not make use of device_hotplug_lock, but
rather to enforce a locking order.

The problems I spotted related to this:

1. Memory block device attributes: While .state first calls
   mem_hotplug_begin() and then calls device_online() - which takes
   device_lock() - .online no longer calls mem_hotplug_begin(), so it
   effectively calls online_pages() without mem_hotplug_lock.

2. device_online() should be called under device_hotplug_lock, however
   onlining memory during add_memory() does not take care of that.

In addition, I think there is also something wrong about the locking in

3. arch/powerpc/platforms/powernv/memtrace.c calls offline_pages()
   without locks. This was introduced after 30467e0b3be. And skimming over
   the code, I assume it could need some more care in regards to locking
   (e.g. device_online() called without device_hotplug_lock). This will
   be addressed in the following patches.

Now that we hold the device_hotplug_lock when
- adding memory (e.g. via add_memory()/add_memory_resource())
- removing memory (e.g. via remove_memory())
- device_online()/device_offline()

We can move mem_hotplug_lock usage back into
online_pages()/offline_pages().

Why is mem_hotplug_lock still needed? Essentially to make
get_online_mems()/put_online_mems() be very fast (relying on
device_hotplug_lock would be very slow), and to serialize against
addition of memory that does not create memory block devices (hmm).

[1] http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2015-February/065324.html

This patch is partly based on a patch by Vitaly Kuznetsov.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Rashmica Gupta 
Cc: Michael Neuling 
Cc: Balbir Singh 
Cc: Kate Stewart 
Cc: Thomas Gleixner 
Cc: Philippe Ombredanne 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Pavel Tatashin 
Cc: Vlastimil Babka 
Cc: Dan Williams 
Cc: Oscar Salvador 
Cc: YASUAKI ISHIMATSU 
Cc: Mathieu Malaterre 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rashmica Gupta 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c | 13 +
 mm/memory_hotplug.c   | 28 
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 40cac122ec73..0e5985682642 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -228,7 +228,6 @@ static bool pages_correctly_probed(unsigned long start_pfn)
 /*
  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  * OK to have direct references to sparsemem variables in here.
- * Must already be protected by mem_hotplug_begin().
  */
 static int
memory_block_action(unsigned long phys_index, unsigned long action, int online_type)
@@ -294,7 +293,6 @@ static int memory_subsys_online(struct device *dev)
if (mem->online_type < 0)
mem->online_type = MMOP_ONLINE_KEEP;
 
-   /* Already under protection of mem_hotplug_begin() */
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 
/* clear online_type */
@@ -341,19 +339,11 @@ store_mem_state(struct device *dev,
goto err;
}
 
-   /*
-* Memory hotplug needs to hold mem_hotplug_begin() for probe to find
-* the correct memory block to online before doing device_online(dev),
-* which will take dev->mutex.  Take the lock early to prevent an
-* inversion, memory_subsys_online() callbacks will be implemented by
-* assuming it's already protected.
-*/
-   mem_hotplug_begin();
-
switch (online_type) {
case MMOP_ONLINE_KERNEL:
case MMOP_ONLINE_MOVABLE:
case MMOP_ONLINE_KEEP:
+   /* mem->online_type is protected by device_hotplug_lock */
mem->online_type = online_type;
ret = device_online(&mem->dev);
break;
@@ -364,7 +354,6 @@ store_mem_state(struct device *dev,
ret = -EINVAL; /* should never happen */
}
 
-   mem_hotplug_done();
 err:
unlock_device_hotplug();
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ef5444145c88..497e9315ca6f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -881,7 +881,6 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid,
  

[PATCH v1 2/6] mm/memory_hotplug: make add_memory() take the device_hotplug_lock

2018-09-18 Thread David Hildenbrand
add_memory() currently does not take the device_hotplug_lock, however
it is already called under the lock from
arch/powerpc/platforms/pseries/hotplug-memory.c
drivers/acpi/acpi_memhotplug.c
to synchronize against CPU hot-remove and similar.

In general, we should hold the device_hotplug_lock when adding memory
to synchronize against online/offline request (e.g. from user space) -
which already resulted in lock inversions due to device_lock() and
mem_hotplug_lock - see 30467e0b3be ("mm, hotplug: fix concurrent memory
hot-add deadlock"). add_memory()/add_memory_resource() will create memory
block devices, so this really feels like the right thing to do.

Holding the device_hotplug_lock makes sure that a memory block device
can really only be accessed (e.g. via .online/.state) from user space,
once the memory has been fully added to the system.

The lock is not held yet in
drivers/xen/balloon.c
arch/powerpc/platforms/powernv/memtrace.c
drivers/s390/char/sclp_cmd.c
drivers/hv/hv_balloon.c
So, let's either use the locked variants or take the lock.

Don't export add_memory_resource(), as it once was exported to be used
by XEN, which is never built as a module. If somebody requires it, we
also have to export a locked variant (as device_hotplug_lock is never
exported).

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Nathan Fontenot 
Cc: John Allen 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Joonsoo Kim 
Cc: Vlastimil Babka 
Cc: Oscar Salvador 
Cc: Mathieu Malaterre 
Cc: Pavel Tatashin 
Cc: YASUAKI ISHIMATSU 
Reviewed-by: Pavel Tatashin 
Signed-off-by: David Hildenbrand 
---
 .../platforms/pseries/hotplug-memory.c|  2 +-
 drivers/acpi/acpi_memhotplug.c|  2 +-
 drivers/base/memory.c |  9 ++--
 drivers/xen/balloon.c |  3 +++
 include/linux/memory_hotplug.h|  1 +
 mm/memory_hotplug.c   | 22 ---
 6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index b3f54466e25f..2e6f41dc103a 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -702,7 +702,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = memory_add_physaddr_to_nid(lmb->base_addr);
 
/* Add the memory */
-   rc = add_memory(nid, lmb->base_addr, block_sz);
+   rc = __add_memory(nid, lmb->base_addr, block_sz);
if (rc) {
dlpar_remove_device_tree_lmb(lmb);
return rc;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 811148415993..8fe0960ea572 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -228,7 +228,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
if (node < 0)
node = memory_add_physaddr_to_nid(info->start_addr);
 
-   result = add_memory(node, info->start_addr, info->length);
+   result = __add_memory(node, info->start_addr, info->length);
 
/*
 * If the memory block has been used by the kernel, add_memory()
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 817320c7c4c1..40cac122ec73 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -519,15 +519,20 @@ memory_probe_store(struct device *dev, struct device_attribute *attr,
if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
return -EINVAL;
 
+   ret = lock_device_hotplug_sysfs();
+   if (ret)
+   goto out;
+
nid = memory_add_physaddr_to_nid(phys_addr);
-   ret = add_memory(nid, phys_addr,
-MIN_MEMORY_BLOCK_SIZE * sections_per_block);
+   ret = __add_memory(nid, phys_addr,
+  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
 
if (ret)
goto out;
 
ret = count;
 out:
+   unlock_device_hotplug();
return ret;
 }
 
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index e12bb256036f..6bab019a82b1 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -395,7 +395,10 @@ static enum bp_state reserve_additional_memory(void)
 * callers drop the mutex before trying again.
 */
mutex_unlock(&balloon_mutex);
+   /* add_memory_resource() requires the device_hotplug lock */
+   lock_device_hotplug();
rc = add_memory_resource(nid, resource, memhp_auto_online);
+   unlock_device_hotplug();
mutex_lock(&balloon_mutex);
 
if (rc) {
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 1f096852f479..ffd9cd10fcf3 

[PATCH v1 1/6] mm/memory_hotplug: make remove_memory() take the device_hotplug_lock

2018-09-18 Thread David Hildenbrand
remove_memory() is exported right now but requires the
device_hotplug_lock, which is not exported. So let's provide a variant
that takes the lock and only export that one.

The lock is already held in
arch/powerpc/platforms/pseries/hotplug-memory.c
drivers/acpi/acpi_memhotplug.c
So, let's use the locked variant.

The lock is not held (but taken in)
arch/powerpc/platforms/powernv/memtrace.c
So let's keep using the (now) locked variant.

Apart from that, there are not other users in the tree.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Rashmica Gupta 
Cc: Michael Neuling 
Cc: Balbir Singh 
Cc: Nathan Fontenot 
Cc: John Allen 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Joonsoo Kim 
Cc: Vlastimil Babka 
Cc: Pavel Tatashin 
Cc: Greg Kroah-Hartman 
Cc: Oscar Salvador 
Cc: YASUAKI ISHIMATSU 
Cc: Mathieu Malaterre 
Reviewed-by: Pavel Tatashin 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c   | 2 --
 arch/powerpc/platforms/pseries/hotplug-memory.c | 6 +++---
 drivers/acpi/acpi_memhotplug.c  | 2 +-
 include/linux/memory_hotplug.h  | 3 ++-
 mm/memory_hotplug.c | 9 -
 5 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 51dc398ae3f7..8f1cd4f3bfd5 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -90,9 +90,7 @@ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
  change_memblock_state);
 
-   lock_device_hotplug();
remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
-   unlock_device_hotplug();
 
return true;
 }
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index c1578f54c626..b3f54466e25f 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -334,7 +334,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz
nid = memory_add_physaddr_to_nid(base);
 
for (i = 0; i < sections_per_block; i++) {
-   remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+   __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
}
 
@@ -423,7 +423,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
block_sz = pseries_memory_block_size();
nid = memory_add_physaddr_to_nid(lmb->base_addr);
 
-   remove_memory(nid, lmb->base_addr, block_sz);
+   __remove_memory(nid, lmb->base_addr, block_sz);
 
/* Update memory regions for memory remove */
memblock_remove(lmb->base_addr, block_sz);
@@ -710,7 +710,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
rc = dlpar_online_lmb(lmb);
if (rc) {
-   remove_memory(nid, lmb->base_addr, block_sz);
+   __remove_memory(nid, lmb->base_addr, block_sz);
dlpar_remove_device_tree_lmb(lmb);
} else {
lmb->flags |= DRCONF_MEM_ASSIGNED;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 6b0d3ef7309c..811148415993 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -282,7 +282,7 @@ static void acpi_memory_remove_memory(struct 
acpi_memory_device *mem_device)
nid = memory_add_physaddr_to_nid(info->start_addr);
 
acpi_unbind_memory_blocks(info);
-   remove_memory(nid, info->start_addr, info->length);
+   __remove_memory(nid, info->start_addr, info->length);
list_del(>list);
kfree(info);
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 34a28227068d..1f096852f479 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -301,6 +301,7 @@ extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern void remove_memory(int nid, u64 start, u64 size);
+extern void __remove_memory(int nid, u64 start, u64 size);
 
 #else
 static inline bool is_mem_section_removable(unsigned long pfn,
@@ -317,6 +318,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
 }
 
 static inline void remove_memory(int nid, u64 start, u64 size) {}
+static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOTREMOVE */
 
 extern void __ref free_area_init_core_hotplug(int nid);
@@ -330,7 +332,6 @@ extern void 

[PATCH v1 0/6] mm: online/offline_pages called w.o. mem_hotplug_lock

2018-09-18 Thread David Hildenbrand
Reading through the code and studying how mem_hotplug_lock is to be used,
I noticed that there are two places where we can end up calling
device_online()/device_offline() - online_pages()/offline_pages() without
the mem_hotplug_lock. And there are other places where we call
device_online()/device_offline() without the device_hotplug_lock.

While e.g.
echo "online" > /sys/devices/system/memory/memory9/state
is fine, e.g.
echo 1 > /sys/devices/system/memory/memory9/online
will not take the mem_hotplug_lock. It will, however, take the device_lock() and
device_hotplug_lock.

E.g. via memory_probe_store(), we can end up calling
add_memory()->online_pages() without the device_hotplug_lock. So we can
have concurrent callers in online_pages(), where we then touch e.g.
zone->present_pages basically unprotected.

Looks like there is a longer history to that (see Patch #2 for details),
and fixing it to work the way it was intended is not really possible. We
would e.g. have to take the mem_hotplug_lock in device/base/core.c, which
sounds wrong.

Summary: We had a lock inversion on mem_hotplug_lock and device_lock().
More details can be found in patch 3 and patch 6.

I propose the general rules (documentation added in patch 6):

1. add_memory/add_memory_resource() must only be called with
   device_hotplug_lock.
2. remove_memory() must only be called with device_hotplug_lock. This is
   already documented and holds for all callers.
3. device_online()/device_offline() must only be called with
   device_hotplug_lock. This is already documented and true for now in core
   code. Other callers (related to memory hotplug) have to be fixed up.
4. mem_hotplug_lock is taken inside of add_memory/remove_memory/
   online_pages/offline_pages.

To me, this looks way cleaner than what we have right now (and easier to
verify). And looking at the documentation of remove_memory, using
lock_device_hotplug also for add_memory() feels natural.
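Put as code, a caller following rules 1-4 would look roughly like this (a
sketch against the __add_memory()/__remove_memory() variants from patches 1
and 2, not an excerpt from the series; the function name is hypothetical):

/* Sketch: the caller takes device_hotplug_lock (rules 1-3);
 * mem_hotplug_lock is taken inside the mm helpers (rule 4).
 */
static int example_add_and_remove(int nid, u64 start, u64 size)
{
	int rc;

	lock_device_hotplug();
	rc = __add_memory(nid, start, size);
	if (!rc) {
		/* ... online via device_online(), use, offline ... */
		__remove_memory(nid, start, size);
	}
	unlock_device_hotplug();
	return rc;
}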


RFCv2 -> v1:
- Dropped an unnecessary _ref from remove_memory() in patch #1
- Minor patch description fixes.
- Added rb's

RFC -> RFCv2:
- Don't export device_hotplug_lock, provide proper remove_memory/add_memory
  wrappers.
- Split up the patches a bit.
- Try to improve powernv memtrace locking
- Add some documentation for locking that matches my knowledge

David Hildenbrand (6):
  mm/memory_hotplug: make remove_memory() take the device_hotplug_lock
  mm/memory_hotplug: make add_memory() take the device_hotplug_lock
  mm/memory_hotplug: fix online/offline_pages called w.o.
mem_hotplug_lock
  powerpc/powernv: hold device_hotplug_lock when calling device_online()
  powerpc/powernv: hold device_hotplug_lock in memtrace_offline_pages()
  memory-hotplug.txt: Add some details about locking internals

 Documentation/memory-hotplug.txt  | 39 +++-
 arch/powerpc/platforms/powernv/memtrace.c | 14 +++--
 .../platforms/pseries/hotplug-memory.c|  8 +--
 drivers/acpi/acpi_memhotplug.c|  4 +-
 drivers/base/memory.c | 22 +++
 drivers/xen/balloon.c |  3 +
 include/linux/memory_hotplug.h|  4 +-
 mm/memory_hotplug.c   | 59 +++
 8 files changed, 115 insertions(+), 38 deletions(-)

-- 
2.17.1



Re: How to handle PTE tables with non contiguous entries ?

2018-09-18 Thread Aneesh Kumar K.V
Christophe LEROY  writes:

> On 17/09/2018 at 11:03, Aneesh Kumar K.V wrote:
>> Christophe Leroy  writes:
>> 
>>> Hi,
>>>
>>> I'm having a hard time figuring out the best way to handle the following
>>> situation:
>>>
>>> On the powerpc8xx, handling 16k size pages requires to have page tables
>>> with 4 identical entries.
>> 
>> I assume that hugetlb page size? If so isn't that similar to FSL hugetlb
>> page table layout?
>
> No, it is not: this is not for a 16k hugepage size with a standard page size of 4k.
>
> Here I'm trying to handle the case of CONFIG_PPC_16K_PAGES.
> As of today, it is implemented by using the standard Linux page layout,
> i.e. one PTE entry for each 16k page. This forbids the use of the 8xx HW
> assistance.
>
>> 
>>>
>>> Initially I was thinking about handling this by simply modifying
>>> pte_index() which changing pte_t type in order to have one entry every
>>> 16 bytes, then replicate the PTE value at *ptep, *ptep+1,*ptep+2 and
>>> *ptep+3 both in set_pte_at() and pte_update().
>>>
>>> However, this doesn't work because many many places in the mm core part
>>> of the kernel use loops on ptep with single ptep++ increment.
>>>
>>> Therefore did it with the following hack:
>>>
>>>/* PTE level */
>>> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>>> +typedef struct { pte_basic_t pte, pte1, pte2, pte3; } pte_t;
>>> +#else
>>>typedef struct { pte_basic_t pte; } pte_t;
>>> +#endif
>>>
>>> @@ -181,7 +192,13 @@ static inline unsigned long pte_update(pte_t *p,
>>>   : "cc" );
>>>#else /* PTE_ATOMIC_UPDATES */
>>>   unsigned long old = pte_val(*p);
>>> -   *p = __pte((old & ~clr) | set);
>>> +   unsigned long new = (old & ~clr) | set;
>>> +
>>> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>>> +   p->pte = p->pte1 = p->pte2 = p->pte3 = new;
>>> +#else
>>> +   *p = __pte(new);
>>> +#endif
>>>#endif /* !PTE_ATOMIC_UPDATES */
>>>
>>>#ifdef CONFIG_44x
>>>
>>>
>>> @@ -161,7 +161,11 @@ static inline void __set_pte_at(struct mm_struct
>>> *mm, unsigned long addr,
>>>   /* Anything else just stores the PTE normally. That covers all
>>> 64-bit
>>>* cases, and 32-bit non-hash with 32-bit PTEs.
>>>*/
>>> +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PPC_16K_PAGES)
>>> +   ptep->pte = ptep->pte1 = ptep->pte2 = ptep->pte3 = pte_val(pte);
>>> +#else
>>>   *ptep = pte;
>>> +#endif
>>>
>>>
>>>
>>> But I'm not too happy with it as it means pte_t is not a single type
>>> anymore so passing it from one function to the other is quite heavy.
>>>
>>>
>>> Would someone have an idea of an elegent way to handle that ?
>>>
>>> Thanks
>>> Christophe
>> 
>> Why would pte_update bother about updating all the 4 entries?. Can you
>> help me understand the issue?
>
> Because the 8xx HW assistance expects 4 identical entries for each 16k
> page, so every time a PTE is updated the 4 entries have to be updated.
>

What you suggested in the original mail is what matches that best isn't it?
That is a linux pte update involves updating 4 slot. Hence a linux pte
consist of 4 unsigned long?

-aneesh



Re: [PATCH v4] powerpc: Avoid code patching freed init sections

2018-09-18 Thread Michal Suchánek
On Tue, 18 Sep 2018 10:52:09 +0200
Christophe LEROY  wrote:

> 
> 
> On 14/09/2018 at 06:22, Nicholas Piggin wrote:
> > On Fri, 14 Sep 2018 11:14:11 +1000
> > Michael Neuling  wrote:
> > 
> >> This stops us from doing code patching in init sections after
> >> they've been freed.
> >>
> >> In this chain:
> >>kvm_guest_init() ->
> >>  kvm_use_magic_page() ->
> >>fault_in_pages_readable() ->
> >> __get_user() ->
> >>   __get_user_nocheck() ->
> >> barrier_nospec();
> >>
> >> We have a code patching location at barrier_nospec() and
> >> kvm_guest_init() is an init function. This whole chain gets
> >> inlined, so when we free the init section (hence
> >> kvm_guest_init()), this code goes away and hence should no longer
> >> be patched.
> >>
> >> We've seen this as userspace memory corruption when using a memory
> >> checker while doing partition migration testing on powervm (this
> >> starts the code patching post migration via
> >> /sys/kernel/mobility/migration). In theory, it could also happen
> >> when using /sys/kernel/debug/powerpc/barrier_nospec.
> >>
> >> cc: sta...@vger.kernel.org # 4.13+
> >> Signed-off-by: Michael Neuling 
> >>
> >> ---
> >> For stable I've marked this as v4.13+ since that's when we
> >> refactored code-patching.c but it could go back even further than
> >> that. In reality though, I think we can only hit this since the
> >> first spectre/meltdown changes.
> >>
> >> v4:
> >>   Feedback from Christophe Leroy:
> >> - init_mem_free -> init_mem_is_free
> >> - prlog %lx -> %px
> >>
> >> v3:
> >>   Add init_mem_free flag to avoid potential race.
> >>   Feedback from Christophe Leroy:
> >> - use init_section_contains()
> >> - change order of init test for performance
> >> - use pr_debug()
> >> - remove blank line
> >>
> >> v2:
> >>Print when we skip an address
> >> ---
> >>   arch/powerpc/include/asm/setup.h | 1 +
> >>   arch/powerpc/lib/code-patching.c | 6 ++
> >>   arch/powerpc/mm/mem.c| 2 ++
> >>   3 files changed, 9 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
> >> index 1a951b0046..1fffbba8d6 100644
> >> --- a/arch/powerpc/include/asm/setup.h
> >> +++ b/arch/powerpc/include/asm/setup.h
> >> @@ -9,6 +9,7 @@ extern void ppc_printk_progress(char *s, unsigned
> >> short hex); 
> >>   extern unsigned int rtas_data;
> >>   extern unsigned long long memory_limit;
> >> +extern bool init_mem_is_free;
> >>   extern unsigned long klimit;
> >>   extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
> >>   
> >> diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
> >> index 850f3b8f4d..6ae2777c22 100644
> >> --- a/arch/powerpc/lib/code-patching.c
> >> +++ b/arch/powerpc/lib/code-patching.c
> >> @@ -28,6 +28,12 @@ static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
> >> {
> >>int err;
> >>   
> >> +  /* Make sure we aren't patching a freed init section */
> >> +  if (init_mem_is_free && init_section_contains(exec_addr, 4)) {
> >> +  pr_debug("Skipping init section patching addr: 0x%px\n", exec_addr);
> >> +  return 0;
> >> +  }
> > 
> > What we should do is a whitelist, make sure it's only patching the
> > sections we want it to.
> > 
> > That's a bigger job when you consider modules and things too though,
> > so this looks good for now. Thanks,
> 
> What about using kernel_text_address() for it then ? It also handles 
> modules, is it more complicated than that ?

Modules are patched separately so should not need to be excluded here.

There is a different problem with modules: when the mitigation type
changes the modules are not re-patched with the new settings.
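For what it's worth, a sketch of the kernel_text_address() variant suggested
above (untested; it assumes freed init text stops being reported as kernel
text once initmem is released):

static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
			       unsigned int *patch_addr)
{
	/* Reject anything that is not live kernel/module text; freed
	 * init sections would then fall out automatically, without a
	 * separate init_mem_is_free flag.
	 */
	if (!kernel_text_address((unsigned long)exec_addr)) {
		pr_debug("Skipping non-text patching addr: 0x%px\n", exec_addr);
		return 0;
	}
	/* ... existing patching path unchanged ... */
	return 0;
}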

Thanks

Michal


Re: [PATCH v2 2/5] powerpc/boot: Fix crt0.S syntax for clang

2018-09-18 Thread Michael Ellerman
Joel Stanley  writes:

> On Tue, 18 Sep 2018 at 06:11, Nick Desaulniers wrote:
>>
>> On Fri, Sep 14, 2018 at 2:08 PM Segher Boessenkool wrote:
>> >
>> > On Fri, Sep 14, 2018 at 10:47:08AM -0700, Nick Desaulniers wrote:
>> > > On Thu, Sep 13, 2018 at 9:07 PM Joel Stanley  wrote:
>> > > >  10:addis   r12,r12,(-RELACOUNT)@ha
>> > > > -   cmpdi   r12,RELACOUNT@l
>> > > > +   cmpdi   r12,(RELACOUNT)@l
>> > >
>> > > Yep, as we can see above, when RELACOUNT is negated, it's wrapped in
>> > > parens.
>>
>> Looks like this was just fixed in Clang-8:
>> https://bugs.llvm.org/show_bug.cgi?id=38945
>> https://reviews.llvm.org/D52188
>
> Nice!
>
> mpe, given we need the local references to labels fix which is also in
> clang-8 I suggest we drop this patch.

OK, no worries.

cheers


Re: [PATCH] powerpc/makefile: remove check on obsolete GCC versions

2018-09-18 Thread Michael Ellerman
Christophe LEROY  writes:
> On 18/09/2018 at 07:48, Joel Stanley wrote:
>> Hey Christophe,
>> 
>> On Tue, 18 Sep 2018 at 15:13, Christophe Leroy wrote:
>>>
>>> Since commit cafa0010cd51 ("Raise the minimum required gcc version
>>> to 4.6"), it is not possible to build the kernel with GCC lower than 4.6.
>>>
>>> This patch removes checkbin tests addressing older versions of GCC.
>> 
>> This is the same as Nick's patch:
>> 
>>   https://patchwork.ozlabs.org/patch/969624/
>> 
>
> Oops, thanks, I missed that.
>
> And even before Nick's, there is this one 
> https://patchwork.ozlabs.org/patch/962319/
>
> So I missed twice :(

Haha, I missed that one too.

Everyone loves deleting code ;)

I have Nick's patch queued.

cheers


Re: [PATCH] powerpc/pseries: Disable CPU hotplug across migrations

2018-09-18 Thread Gautham R Shenoy
Hi Nathan,
On Tue, Sep 18, 2018 at 1:05 AM Nathan Fontenot wrote:
>
> When performing partition migrations all present CPUs must be online
> as all present CPUs must make the H_JOIN call as part of the migration
> process. Once all present CPUs make the H_JOIN call, one CPU is returned
> to make the rtas call to perform the migration to the destination system.
>
> During testing of migration and changing the SMT state we have found
> instances where CPUs are offlined, as part of the SMT state change,
> before they make the H_JOIN call. This results in a hung system where
> every CPU is either in H_JOIN or offline.
>
> To prevent this, this patch disables CPU hotplug during the migration
> process.
>
> Signed-off-by: Nathan Fontenot 
> ---
>  arch/powerpc/kernel/rtas.c |2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 8afd146bc9c7..2c7ed31c736e 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -981,6 +981,7 @@ int rtas_ibm_suspend_me(u64 handle)
> goto out;
> }
>
> +   cpu_hotplug_disable();

So, some of the onlined CPUs (via rtas_online_cpus_mask(offline_mask))
can still go offline if userspace issues an offline command just before
we execute cpu_hotplug_disable().

So we are narrowing down the race, but it still exists. Am I missing something ?
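One way to close that remaining window would be to disable hotplug before
re-onlining the present CPUs rather than after. A sketch only, against the
rtas_ibm_suspend_me() flow quoted below; the function name is hypothetical:

static int rtas_ibm_suspend_me_sketch(u64 handle,
				      cpumask_var_t offline_mask)
{
	int cpuret;

	cpu_hotplug_disable();	/* taken before onlining, not after */
	cpuret = rtas_online_cpus_mask(offline_mask);
	if (cpuret) {
		cpu_hotplug_enable();
		return cpuret;
	}
	stop_topology_update();
	/* ... H_JOIN on all CPUs, then the migration RTAS call ... */
	start_topology_update();
	cpu_hotplug_enable();
	return 0;
}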

> stop_topology_update();
>
> /* Call function on all CPUs.  One of us will make the
> @@ -995,6 +996,7 @@ int rtas_ibm_suspend_me(u64 handle)
> printk(KERN_ERR "Error doing global join\n");
>
> start_topology_update();
> +   cpu_hotplug_enable();
>
> /* Take down CPUs not online prior to suspend */
> cpuret = rtas_offline_cpus_mask(offline_mask);
>


-- 
Thanks and Regards
gautham.


Checkpatch bad Warning (Re: [PATCH] powerpc/kgdb: add kgdb_arch_set/remove_breakpoint())

2018-09-18 Thread Christophe Leroy

On the patch below, checkpatch reports

WARNING: struct kgdb_arch should normally be const
#127: FILE: arch/powerpc/kernel/kgdb.c:480:
+struct kgdb_arch arch_kgdb_ops;

But when I add 'const', I get a compilation failure:

  CC  arch/powerpc/kernel/kgdb.o
arch/powerpc/kernel/kgdb.c:480:24: error: conflicting type qualifiers 
for ‘arch_kgdb_ops’

 const struct kgdb_arch arch_kgdb_ops;
^
In file included from arch/powerpc/kernel/kgdb.c:18:0:
./include/linux/kgdb.h:284:26: note: previous declaration of 
‘arch_kgdb_ops’ was here

 extern struct kgdb_arch  arch_kgdb_ops;
  ^
make[1]: *** [arch/powerpc/kernel/kgdb.o] Error 1
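The conflict is just that the extern declaration in include/linux/kgdb.h lacks
the qualifier; satisfying checkpatch would need both sides changed together,
roughly like this (a sketch, not a posted patch):

/* include/linux/kgdb.h: the declaration needs the qualifier too */
extern const struct kgdb_arch arch_kgdb_ops;

/* arch/powerpc/kernel/kgdb.c: the definition then matches */
const struct kgdb_arch arch_kgdb_ops;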

Christophe

On 09/18/2018 09:26 AM, Christophe Leroy wrote:

Generic implementation fails to remove breakpoints after init
when CONFIG_STRICT_KERNEL_RWX is selected:

[   13.251285] KGDB: BP remove failed: c001c338
[   13.259587] kgdbts: ERROR PUT: end of test buffer on 'do_fork_test' line 8 expected OK got $E14#aa
[   13.268969] KGDB: re-enter exception: ALL breakpoints killed
[   13.275099] CPU: 0 PID: 1 Comm: init Not tainted 4.18.0-g82bbb913ffd8 #860
[   13.282836] Call Trace:
[   13.285313] [c60e1ba0] [c0080ef0] kgdb_handle_exception+0x6f4/0x720 (unreliable)
[   13.292618] [c60e1c30] [c000e97c] kgdb_handle_breakpoint+0x3c/0x98
[   13.298709] [c60e1c40] [c000af54] program_check_exception+0x104/0x700
[   13.305083] [c60e1c60] [c000e45c] ret_from_except_full+0x0/0x4
[   13.310845] [c60e1d20] [c02a22ac] run_simple_test+0x2b4/0x2d4
[   13.316532] [c60e1d30] [c0081698] put_packet+0xb8/0x158
[   13.321694] [c60e1d60] [c00820b4] gdb_serial_stub+0x230/0xc4c
[   13.327374] [c60e1dc0] [c0080af8] kgdb_handle_exception+0x2fc/0x720
[   13.333573] [c60e1e50] [c000e928] kgdb_singlestep+0xb4/0xcc
[   13.339068] [c60e1e70] [c000ae1c] single_step_exception+0x90/0xac
[   13.345100] [c60e1e80] [c000e45c] ret_from_except_full+0x0/0x4
[   13.350865] [c60e1f40] [c000e11c] ret_from_syscall+0x0/0x38
[   13.356346] Kernel panic - not syncing: Recursive entry to debugger

This patch creates powerpc specific version of
kgdb_arch_set_breakpoint() and kgdb_arch_remove_breakpoint()
using patch_instruction()

Fixes: 1e0fc9d1eb2b ("powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs")
Signed-off-by: Christophe Leroy 
---
  arch/powerpc/include/asm/kgdb.h |  5 -
  arch/powerpc/kernel/kgdb.c  | 43 +
  2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h
index 9db24e77b9f4..a9e098a3b881 100644
--- a/arch/powerpc/include/asm/kgdb.h
+++ b/arch/powerpc/include/asm/kgdb.h
@@ -26,9 +26,12 @@
  #define BREAK_INSTR_SIZE  4
  #define BUFMAX((NUMREGBYTES * 2) + 512)
  #define OUTBUFMAX ((NUMREGBYTES * 2) + 512)
+
+#define BREAK_INSTR	0x7d821008	/* twge r2, r2 */
+
  static inline void arch_kgdb_breakpoint(void)
  {
-   asm(".long 0x7d821008"); /* twge r2, r2 */
+   asm(stringify_in_c(.long BREAK_INSTR));
  }
  #define CACHE_FLUSH_IS_SAFE   1
  #define DBG_MAX_REG_NUM 70
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 35e240a0a408..59c578f865aa 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -24,6 +24,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  
  /*

@@ -144,7 +145,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0)
return 0;
  
-	if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))

+   if (*(u32 *)regs->nip == BREAK_INSTR)
regs->nip += BREAK_INSTR_SIZE;
  
  	return 1;

@@ -441,16 +442,42 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
return -1;
  }
  
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)

+{
+   int err;
+   unsigned int instr;
+   unsigned int *addr = (unsigned int *)bpt->bpt_addr;
+
+   err = probe_kernel_address(addr, instr);
+   if (err)
+   return err;
+
+   err = patch_instruction(addr, BREAK_INSTR);
+   if (err)
+   return -EFAULT;
+
+   *(unsigned int *)bpt->saved_instr = instr;
+
+   return 0;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+   int err;
+   unsigned int instr = *(unsigned int *)bpt->saved_instr;
+   unsigned int *addr = (unsigned int *)bpt->bpt_addr;
+
+   err = patch_instruction(addr, instr);
+   if (err)
+   return -EFAULT;
+
+   return 0;
+}
+
  /*
   * Global data
   */
-struct kgdb_arch arch_kgdb_ops = {
-#ifdef __LITTLE_ENDIAN__
-   .gdb_bpt_instr = {0x08, 0x10, 0x82, 0x7d},
-#else
-   .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
-#endif
-};
+struct kgdb_arch arch_kgdb_ops;
  
  static int kgdb_not_implemented(struct pt_regs *regs)

  {



[PATCH] powerpc/kgdb: add kgdb_arch_set/remove_breakpoint()

2018-09-18 Thread Christophe Leroy
Generic implementation fails to remove breakpoints after init
when CONFIG_STRICT_KERNEL_RWX is selected:

[   13.251285] KGDB: BP remove failed: c001c338
[   13.259587] kgdbts: ERROR PUT: end of test buffer on 'do_fork_test' line 8 expected OK got $E14#aa
[   13.268969] KGDB: re-enter exception: ALL breakpoints killed
[   13.275099] CPU: 0 PID: 1 Comm: init Not tainted 4.18.0-g82bbb913ffd8 #860
[   13.282836] Call Trace:
[   13.285313] [c60e1ba0] [c0080ef0] kgdb_handle_exception+0x6f4/0x720 (unreliable)
[   13.292618] [c60e1c30] [c000e97c] kgdb_handle_breakpoint+0x3c/0x98
[   13.298709] [c60e1c40] [c000af54] program_check_exception+0x104/0x700
[   13.305083] [c60e1c60] [c000e45c] ret_from_except_full+0x0/0x4
[   13.310845] [c60e1d20] [c02a22ac] run_simple_test+0x2b4/0x2d4
[   13.316532] [c60e1d30] [c0081698] put_packet+0xb8/0x158
[   13.321694] [c60e1d60] [c00820b4] gdb_serial_stub+0x230/0xc4c
[   13.327374] [c60e1dc0] [c0080af8] kgdb_handle_exception+0x2fc/0x720
[   13.333573] [c60e1e50] [c000e928] kgdb_singlestep+0xb4/0xcc
[   13.339068] [c60e1e70] [c000ae1c] single_step_exception+0x90/0xac
[   13.345100] [c60e1e80] [c000e45c] ret_from_except_full+0x0/0x4
[   13.350865] [c60e1f40] [c000e11c] ret_from_syscall+0x0/0x38
[   13.356346] Kernel panic - not syncing: Recursive entry to debugger

This patch creates powerpc specific version of
kgdb_arch_set_breakpoint() and kgdb_arch_remove_breakpoint()
using patch_instruction()

Fixes: 1e0fc9d1eb2b ("powerpc/Kconfig: Enable STRICT_KERNEL_RWX for some configs")
Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/kgdb.h |  5 -
 arch/powerpc/kernel/kgdb.c  | 43 +
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/kgdb.h b/arch/powerpc/include/asm/kgdb.h
index 9db24e77b9f4..a9e098a3b881 100644
--- a/arch/powerpc/include/asm/kgdb.h
+++ b/arch/powerpc/include/asm/kgdb.h
@@ -26,9 +26,12 @@
 #define BREAK_INSTR_SIZE   4
 #define BUFMAX ((NUMREGBYTES * 2) + 512)
 #define OUTBUFMAX  ((NUMREGBYTES * 2) + 512)
+
+#define BREAK_INSTR	0x7d821008	/* twge r2, r2 */
+
 static inline void arch_kgdb_breakpoint(void)
 {
-   asm(".long 0x7d821008"); /* twge r2, r2 */
+   asm(stringify_in_c(.long BREAK_INSTR));
 }
 #define CACHE_FLUSH_IS_SAFE	1
 #define DBG_MAX_REG_NUM 70
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index 35e240a0a408..59c578f865aa 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 /*
@@ -144,7 +145,7 @@ static int kgdb_handle_breakpoint(struct pt_regs *regs)
if (kgdb_handle_exception(1, SIGTRAP, 0, regs) != 0)
return 0;
 
-   if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
+   if (*(u32 *)regs->nip == BREAK_INSTR)
regs->nip += BREAK_INSTR_SIZE;
 
return 1;
@@ -441,16 +442,42 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
return -1;
 }
 
+int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt)
+{
+   int err;
+   unsigned int instr;
+   unsigned int *addr = (unsigned int *)bpt->bpt_addr;
+
+   err = probe_kernel_address(addr, instr);
+   if (err)
+   return err;
+
+   err = patch_instruction(addr, BREAK_INSTR);
+   if (err)
+   return -EFAULT;
+
+   *(unsigned int *)bpt->saved_instr = instr;
+
+   return 0;
+}
+
+int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt)
+{
+   int err;
+   unsigned int instr = *(unsigned int *)bpt->saved_instr;
+   unsigned int *addr = (unsigned int *)bpt->bpt_addr;
+
+   err = patch_instruction(addr, instr);
+   if (err)
+   return -EFAULT;
+
+   return 0;
+}
+
 /*
  * Global data
  */
-struct kgdb_arch arch_kgdb_ops = {
-#ifdef __LITTLE_ENDIAN__
-   .gdb_bpt_instr = {0x08, 0x10, 0x82, 0x7d},
-#else
-   .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
-#endif
-};
+struct kgdb_arch arch_kgdb_ops;
 
 static int kgdb_not_implemented(struct pt_regs *regs)
 {
-- 
2.13.3



Re: [PATCH v4] powerpc: Avoid code patching freed init sections

2018-09-18 Thread Christophe LEROY




Le 14/09/2018 à 06:22, Nicholas Piggin a écrit :

On Fri, 14 Sep 2018 11:14:11 +1000
Michael Neuling  wrote:


This stops us from doing code patching in init sections after they've
been freed.

In this chain:
   kvm_guest_init() ->
 kvm_use_magic_page() ->
   fault_in_pages_readable() ->
 __get_user() ->
   __get_user_nocheck() ->
 barrier_nospec();

We have a code patching location at barrier_nospec() and
kvm_guest_init() is an init function. This whole chain gets inlined,
so when we free the init section (hence kvm_guest_init()), this code
goes away and hence should no longer be patched.

We've seen this as userspace memory corruption when using a memory
checker while doing partition migration testing on powervm (this
starts the code patching post migration via
/sys/kernel/mobility/migration). In theory, it could also happen when
using /sys/kernel/debug/powerpc/barrier_nospec.

cc: sta...@vger.kernel.org # 4.13+
Signed-off-by: Michael Neuling 

---
For stable I've marked this as v4.13+ since that's when we refactored
code-patching.c but it could go back even further than that. In
reality though, I think we can only hit this since the first
spectre/meltdown changes.

v4:
  Feedback from Christophe Leroy:
- init_mem_free -> init_mem_is_free
- prlog %lx -> %px

v3:
  Add init_mem_free flag to avoid potential race.
  Feedback from Christophe Leroy:
- use init_section_contains()
- change order of init test for performance
- use pr_debug()
- remove blank line

v2:
   Print when we skip an address
---
  arch/powerpc/include/asm/setup.h | 1 +
  arch/powerpc/lib/code-patching.c | 6 ++
  arch/powerpc/mm/mem.c| 2 ++
  3 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h
index 1a951b0046..1fffbba8d6 100644
--- a/arch/powerpc/include/asm/setup.h
+++ b/arch/powerpc/include/asm/setup.h
@@ -9,6 +9,7 @@ extern void ppc_printk_progress(char *s, unsigned short hex);
  
  extern unsigned int rtas_data;

  extern unsigned long long memory_limit;
+extern bool init_mem_is_free;
  extern unsigned long klimit;
  extern void *zalloc_maybe_bootmem(size_t size, gfp_t mask);
  
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 850f3b8f4d..6ae2777c22 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -28,6 +28,12 @@ static int __patch_instruction(unsigned int *exec_addr, unsigned int instr,
  {
int err;
  
+	/* Make sure we aren't patching a freed init section */

+   if (init_mem_is_free && init_section_contains(exec_addr, 4)) {
+   pr_debug("Skipping init section patching addr: 0x%px\n", exec_addr);
+   return 0;
+   }


What we should do is a whitelist, make sure it's only patching the
sections we want it to.

That's a bigger job when you consider modules and things too though,
so this looks good for now. Thanks,


What about using kernel_text_address() for it then? It also handles
modules; is it more complicated than that?


Christophe


Re: [PATCH] powerpc/time: Calculate proper wday

2018-09-18 Thread Joakim Tjernlund
On Tue, 2018-09-18 at 10:08 +0200, Mathieu Malaterre wrote:
> 
> 
> On Wed, Aug 29, 2018 at 10:03 AM Joakim Tjernlund wrote:
> >
> > to_tm() hardcodes wday to -1 as "No-one uses the day of the week".
> > But recently rtc driver ds1307 does care and tries to correct wday.
> >
> > Add wday calculation (stolen from rtc_time64_to_tm) to to_tm() to please
> > ds1307.
> 
> Is this still an issue after:
> 
> 34efabe41895 powerpc: remove unused to_tm() helper

No, it is not an issue anymore. You can drop this patch.

 Jocke

> 
> ?
> 
> > Signed-off-by: Joakim Tjernlund 
> > ---
> >  arch/powerpc/kernel/time.c | 8 +++-
> >  1 file changed, 3 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> > index fe6f3a285455..f4a09ee01944 100644
> > --- a/arch/powerpc/kernel/time.c
> > +++ b/arch/powerpc/kernel/time.c
> > @@ -1160,6 +1160,9 @@ void to_tm(int tim, struct rtc_time * tm)
> > day = tim / SECDAY;
> > hms = tim % SECDAY;
> >
> > +   /* day of the week, 1970-01-01 was a Thursday */
> > +   tm->tm_wday = (day + 4) % 7;
> > +
> > /* Hours, minutes, seconds are easy */
> > tm->tm_hour = hms / 3600;
> > tm->tm_min = (hms % 3600) / 60;
> > @@ -1180,11 +1183,6 @@ void to_tm(int tim, struct rtc_time * tm)
> >
> > /* Days are what is left over (+1) from all that. */
> > tm->tm_mday = day + 1;
> > -
> > -   /*
> > -* No-one uses the day of the week.
> > -*/
> > -   tm->tm_wday = -1;
> >  }
> >  EXPORT_SYMBOL(to_tm);
> >
> > --
> > 2.16.4
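For what it's worth, the (day + 4) % 7 formula is easy to sanity-check outside
the kernel: 1970-01-01 (day 0) was a Thursday, and tm_wday uses 0 = Sunday.
A standalone sketch, not part of the patch; the day count 17792 for
2018-09-18 is computed by hand:

#include <assert.h>

int main(void)
{
	assert((0 + 4) % 7 == 4);	/* day 0: 1970-01-01, a Thursday */
	assert((17792 + 4) % 7 == 2);	/* day 17792: 2018-09-18, a Tuesday */
	return 0;
}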



RE: [PATCH v11 00/26] Speculative page faults

2018-09-18 Thread Song, HaiyanX
Hi Laurent,

I am sorry for replying to you so late.
The previous LKP tests for this case were run on the same Intel Skylake 4S
platform, but it has needed maintenance recently. So I changed to another
test box to run the page_fault3 test case; it is an Intel Skylake 2S platform
(nr_cpu: 104, memory: 64G).

I applied your patch to the SPF kernel (commit:
a7a8993bfe3ccb54ad468b9f1799649e4ad1ff12), then triggered the 2 test cases below.
a) Turn on the SPF handler with the command below, then run the
page_fault3-thp-always test.
echo 1 > /proc/sys/vm/speculative_page_fault

b) Turn off the SPF handler with the command below, then run the
page_fault3-thp-always test.
echo 0 > /proc/sys/vm/speculative_page_fault

Every test was run 3 times; then the test result was taken and perf data captured.
Here is the average result for will-it-scale.per_thread_ops:

                                                      SPF_turn_off   SPF_turn_on
page_fault3-THP-Always.will-it-scale.per_thread_ops   31963          26285

Best regards,
Haiyan Song


From: owner-linux...@kvack.org [owner-linux...@kvack.org] on behalf of Laurent 
Dufour [lduf...@linux.vnet.ibm.com]
Sent: Wednesday, August 22, 2018 10:23 PM
To: Song, HaiyanX
Cc: a...@linux-foundation.org; mho...@kernel.org; pet...@infradead.org; 
kir...@shutemov.name; a...@linux.intel.com; d...@stgolabs.net; j...@suse.cz; 
Matthew Wilcox; khand...@linux.vnet.ibm.com; aneesh.ku...@linux.vnet.ibm.com; 
b...@kernel.crashing.org; m...@ellerman.id.au; pau...@samba.org; Thomas 
Gleixner; Ingo Molnar; h...@zytor.com; Will Deacon; Sergey Senozhatsky; 
sergey.senozhatsky.w...@gmail.com; Andrea Arcangeli; Alexei Starovoitov; Wang, 
Kemi; Daniel Jordan; David Rientjes; Jerome Glisse; Ganesh Mahendran; Minchan 
Kim; Punit Agrawal; vinayak menon; Yang Shi; linux-ker...@vger.kernel.org; 
linux...@kvack.org; ha...@linux.vnet.ibm.com; npig...@gmail.com; 
bsinghar...@gmail.com; paul...@linux.vnet.ibm.com; Tim Chen; 
linuxppc-dev@lists.ozlabs.org; x...@kernel.org
Subject: Re: [PATCH v11 00/26] Speculative page faults

On 03/08/2018 08:36, Song, HaiyanX wrote:
> Hi Laurent,

Hi Haiyan,

Sorry for the late answer, I was off a couple of days.

>
> Thanks for your analysis for the last perf results.
> You mentioned "the major differences at the head of the perf report is the
> 92% testcase which is weirdly not reported on the head side"; that is a bug
> in 0-day, and it caused the item not to be counted in perf.
>
> I've triggered the page_fault2 and page_fault3 tests again with only the
> thread mode of will-it-scale on 0-day (on the same test box, every case
> tested 3 times).
> I checked that the perf reports do not have the above-mentioned problem.
>
> I have compared them and found some items that differ, such as the cases below:
>page_fault2-thp-always: handle_mm_fault, base: 45.22%head: 29.41%
>page_fault3-thp-always: handle_mm_fault, base: 22.95%head: 14.15%

These would mean that the system spends less time running handle_mm_fault()
when SPF is in the picture in these 2 cases, which is good. This should lead to
better results with the SPF series, and I can't find any values higher on the
head side.

>
> So I attached the perf results in the mail again; could you have a look again
> to check the difference between the base and head commits.

I took a close look at all the perf results you sent, but I can't identify any
major difference. But compiler optimization is getting rid of the
handle_pte_fault() symbol on the base kernel, which adds complexity to checking
the differences.

To get rid of that, I'm proposing that you apply the attached patch to the
SPF kernel. This patch allows turning the SPF handler on/off through
/proc/sys/vm/speculative_page_fault.

This should ease the testing by limiting reboots and avoiding kernel symbol
mismatches. Obviously there is still a small overhead due to the check, but it
should not be noticeable.

With this patch applied you can simply run
echo 1 > /proc/sys/vm/speculative_page_fault
to run a test with the speculative page fault handler activated. Or run
echo 0 > /proc/sys/vm/speculative_page_fault
to run a test without it.

I'm really sorry to be asking this again, but could you please run the test
page_fault3_base_THP-Always with and without SPF and capture the perf output.

I think we should focus on that test which showed the biggest regression.

Thanks,
Laurent.


>
> Thanks,
> Haiyan, Song
>
> 
> From: owner-linux...@kvack.org [owner-linux...@kvack.org] on behalf of 
> Laurent Dufour [lduf...@linux.vnet.ibm.com]
> Sent: Tuesday, July 17, 2018 5:36 PM
> To: Song, HaiyanX
> Cc: a...@linux-foundation.org; mho...@kernel.org; pet...@infradead.org; 
> kir...@shutemov.name; a...@linux.intel.com; d...@stgolabs.net; j...@suse.cz; 
> Matthew Wilcox; khand...@linux.vnet.ibm.com; aneesh.ku...@linux.vnet.ibm.com; 
> 

Re: [RFC PATCH 11/11] selftests/powerpc: Adapt the test

2018-09-18 Thread Michael Neuling
On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
> The Documentation/powerpc/transactional_memory.txt says:
> 
>  "Syscalls made from within a suspended transaction are performed as normal
>   and the transaction is not explicitly doomed by the kernel.  However,
>   what the kernel does to perform the syscall may result in the transaction
>   being doomed by the hardware."
> 
> With this new TM mechanism, the syscall will continue to be executed if the
> syscall happens on a suspended syscall, but, the syscall will *fail* if the
> transaction is still active during the syscall invocation.

Not sure I get this. This doesn't seem any different to before.

An active (not suspended) transaction *will* result in the syscall failing and
the transaction being doomed.  

A syscall in a suspended transaction should succeed, though the transaction may fail as a result.

You might need to clean up the language. I try to use:

   Active == transactional but not suspended (ie MSR[TS] = T)
   Suspended == suspended (ie MSR [TS] = S)
   Doomed == transaction to be rolled back at next opportunity (i.e. tcheck
returns doomed)

(note: the kernel MSR_TM_ACTIVE() macro is not consistent with this since it's
MSR[TS] == S or T).
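For reference, the MSR[TS] predicates look roughly like this (paraphrased
from arch/powerpc/include/asm/reg.h; note how MSR_TM_ACTIVE() matches both
T and S, unlike "Active" as defined above):

#define MSR_TM_TRANSACTIONAL(x)	(((x) & MSR_TS_MASK) == MSR_TS_T) /* Active */
#define MSR_TM_SUSPENDED(x)	(((x) & MSR_TS_MASK) == MSR_TS_S) /* Suspended */
#define MSR_TM_ACTIVE(x)	(((x) & MSR_TS_MASK) != 0)	  /* T or S */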

> On the syscall path, if the transaction is active and not suspended, it
> will call TM_KERNEL_ENTRY which will reclaim and recheckpoint the
> transaction, thus, dooming the transaction on userspace return, with
> failure code TM_CAUSE_SYSCALL.

But the test below is on a suspend transaction?

> This new model will break part of this test, but I understand that the
> documentation above didn't guarantee that the syscall would succeed, and it
> will never succeed from now on.

The syscall should pass in suspend (modulo the normal syscall checks). The
transaction may fail as a result.

> In fact, glibc is calling 'tabort' before every syscall, thus any syscall
> called through glibc from inside a transaction will be doomed anyhow.
> 
> This patch updates the test case to not assume that a syscall inside an
> active transaction will succeed, because it will not anymore.
> 
> Signed-off-by: Breno Leitao 
> ---
>  tools/testing/selftests/powerpc/tm/tm-syscall.c | 6 --
>  1 file changed, 6 deletions(-)
> 
> diff --git a/tools/testing/selftests/powerpc/tm/tm-syscall.c
> b/tools/testing/selftests/powerpc/tm/tm-syscall.c
> index 454b965a2db3..1439a87eba3a 100644
> --- a/tools/testing/selftests/powerpc/tm/tm-syscall.c
> +++ b/tools/testing/selftests/powerpc/tm/tm-syscall.c
> @@ -78,12 +78,6 @@ int tm_syscall(void)
>   timeradd(, , );
>  
>   for (count = 0; timercmp(, , <); count++) {
> - /*
> -  * Test a syscall within a suspended transaction and verify
> -  * that it succeeds.
> -  */
> - FAIL_IF(getppid_tm(true) == -1); /* Should succeed. */
> -
>   /*
>* Test a syscall within an active transaction and verify that
>* it fails with the correct failure code.


[PATCH net-next] net: ibm: fix return type of ndo_start_xmit function

2018-09-18 Thread YueHaibing
The method ndo_start_xmit() is defined as returning a 'netdev_tx_t',
which is a typedef for an enum type, so make sure the implementations in
these drivers return a 'netdev_tx_t' value, and change the function
return type to netdev_tx_t.

Found by coccinelle.

Signed-off-by: YueHaibing 
---
 drivers/net/ethernet/ibm/ehea/ehea_main.c | 2 +-
 drivers/net/ethernet/ibm/emac/core.c  | 7 ---
 drivers/net/ethernet/ibm/ibmvnic.c| 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c
index ba580bf..88128d3 100644
--- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
+++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
@@ -2038,7 +2038,7 @@ static void ehea_xmit3(struct sk_buff *skb, struct net_device *dev,
dev_consume_skb_any(skb);
 }
 
-static int ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t ehea_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
struct ehea_port *port = netdev_priv(dev);
struct ehea_swqe *swqe;
diff --git a/drivers/net/ethernet/ibm/emac/core.c b/drivers/net/ethernet/ibm/emac/core.c
index 7410a1d..5107c94 100644
--- a/drivers/net/ethernet/ibm/emac/core.c
+++ b/drivers/net/ethernet/ibm/emac/core.c
@@ -1409,7 +1409,7 @@ static inline u16 emac_tx_csum(struct emac_instance *dev,
return 0;
 }
 
-static inline int emac_xmit_finish(struct emac_instance *dev, int len)
+static inline netdev_tx_t emac_xmit_finish(struct emac_instance *dev, int len)
 {
struct emac_regs __iomem *p = dev->emacp;
struct net_device *ndev = dev->ndev;
@@ -1436,7 +1436,7 @@ static inline int emac_xmit_finish(struct emac_instance *dev, int len)
 }
 
 /* Tx lock BH */
-static int emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
+static netdev_tx_t emac_start_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
struct emac_instance *dev = netdev_priv(ndev);
unsigned int len = skb->len;
@@ -1494,7 +1494,8 @@ static inline int emac_xmit_split(struct emac_instance *dev, int slot,
 }
 
 /* Tx lock BH disabled (SG version for TAH equipped EMACs) */
-static int emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
+static netdev_tx_t
+emac_start_xmit_sg(struct sk_buff *skb, struct net_device *ndev)
 {
struct emac_instance *dev = netdev_priv(ndev);
int nr_frags = skb_shinfo(skb)->nr_frags;
diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c
index 4f0daf6..a8369ad 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -1428,7 +1428,7 @@ static int ibmvnic_xmit_workarounds(struct sk_buff *skb,
return 0;
 }
 
-static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
+static netdev_tx_t ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
 {
struct ibmvnic_adapter *adapter = netdev_priv(netdev);
int queue_num = skb_get_queue_mapping(skb);
@@ -1452,7 +1452,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev)
u64 *handle_array;
int index = 0;
u8 proto = 0;
-   int ret = 0;
+   netdev_tx_t ret = NETDEV_TX_OK;
 
if (adapter->resetting) {
if (!netif_subqueue_stopped(netdev, skb))
-- 
1.8.3.1
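For reference, the contract the conversion enforces looks like this:
ndo_start_xmit() must return one of the netdev_tx_t enumerators instead of a
bare int. A minimal sketch; queue_has_room() is a hypothetical helper:

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	if (!queue_has_room(dev)) {	/* hypothetical capacity check */
		netif_stop_queue(dev);
		return NETDEV_TX_BUSY;	/* core requeues the skb */
	}
	/* ... hand the skb to the hardware ... */
	return NETDEV_TX_OK;
}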