[PATCH] powerpc/book3s: Remove a few page table update interfaces.

2019-02-13 Thread Aneesh Kumar K.V
When updating page tables, we need to make sure we set the valid bit
in the page table entry, so the page table populate interfaces should
be used for updating table entries. The page table 'set' interfaces
allow writing a raw value into an entry, which makes it easy to update
an entry incorrectly. Remove the 'set' interfaces so that they cannot
be misused in the future.
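
For illustration only (not part of this patch), a caller using the raw
'set' interface can easily drop the bits that mark an entry valid,
whereas the populate helpers always OR them in:

	/* hypothetical misuse vs. the populate helper on book3s64 */
	pmd_set(pmdp, __pgtable_ptr_val(pte));	/* wrong: PMD_VAL_BITS (valid/type bits) lost */
	pmd_populate_kernel(mm, pmdp, pte);	/* right: always ORs in PMD_VAL_BITS */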

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/pgalloc.h |  8 
 arch/powerpc/include/asm/book3s/64/pgtable.h | 14 --
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h 
b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index 9c1173283b96..138bc2ecc0c4 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -111,7 +111,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t 
*pgd)
 
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
-   pgd_set(pgd, __pgtable_ptr_val(pud) | PGD_VAL_BITS);
+   *pgd =  __pgd(__pgtable_ptr_val(pud) | PGD_VAL_BITS);
 }
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -138,7 +138,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t 
*pud)
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
-   pud_set(pud, __pgtable_ptr_val(pmd) | PUD_VAL_BITS);
+   *pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS);
 }
 
 static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
@@ -176,13 +176,13 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, 
pmd_t *pmd,
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
   pte_t *pte)
 {
-   pmd_set(pmd, __pgtable_ptr_val(pte) | PMD_VAL_BITS);
+   *pmd = __pmd(__pgtable_ptr_val(pte) | PMD_VAL_BITS);
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
pgtable_t pte_page)
 {
-   pmd_set(pmd, __pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
+   *pmd = __pmd(__pgtable_ptr_val(pte_page) | PMD_VAL_BITS);
 }
 
 static inline pgtable_t pmd_pgtable(pmd_t pmd)
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index dc71e2b92003..a24e00fb7fa7 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -853,11 +853,6 @@ static inline bool pte_ci(pte_t pte)
return false;
 }
 
-static inline void pmd_set(pmd_t *pmdp, unsigned long val)
-{
-   *pmdp = __pmd(val);
-}
-
 static inline void pmd_clear(pmd_t *pmdp)
 {
*pmdp = __pmd(0);
@@ -889,11 +884,6 @@ static inline int pmd_bad(pmd_t pmd)
return hash__pmd_bad(pmd);
 }
 
-static inline void pud_set(pud_t *pudp, unsigned long val)
-{
-   *pudp = __pud(val);
-}
-
 static inline void pud_clear(pud_t *pudp)
 {
*pudp = __pud(0);
@@ -936,10 +926,6 @@ static inline bool pud_access_permitted(pud_t pud, bool 
write)
 }
 
 #define pgd_write(pgd) pte_write(pgd_pte(pgd))
-static inline void pgd_set(pgd_t *pgdp, unsigned long val)
-{
-   *pgdp = __pgd(val);
-}
 
 static inline void pgd_clear(pgd_t *pgdp)
 {
-- 
2.20.1



[PATCH] powerpc/64s: Fix possible corruption on big endian due to pgd/pud_present()

2019-02-13 Thread Michael Ellerman
In v4.20 we changed our pgd/pud_present() to check for _PAGE_PRESENT
rather than just checking that the value is non-zero, e.g.:

  static inline int pgd_present(pgd_t pgd)
  {
 -   return !pgd_none(pgd);
 +   return (pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
  }

Unfortunately this is broken on big endian, as the result of the
bitwise & is truncated to int, which is always zero because
_PAGE_PRESENT is 0x8000000000000000ul. This means pgd_present() and
pud_present() are always false at compile time, and the compiler
elides the subsequent code.

Remarkably with that bug present we are still able to boot and run
with few noticeable effects. However under some workloads we are able
to trigger a warning in the ext4 code:

  WARNING: CPU: 11 PID: 29593 at fs/ext4/inode.c:3927 
.ext4_set_page_dirty+0x70/0xb0
  CPU: 11 PID: 29593 Comm: debugedit Not tainted 4.20.0-rc1 #1
  ...
  NIP .ext4_set_page_dirty+0x70/0xb0
  LR  .set_page_dirty+0xa0/0x150
  Call Trace:
   .set_page_dirty+0xa0/0x150
   .unmap_page_range+0xbf0/0xe10
   .unmap_vmas+0x84/0x130
   .unmap_region+0xe8/0x190
   .__do_munmap+0x2f0/0x510
   .__vm_munmap+0x80/0x110
   .__se_sys_munmap+0x14/0x30
   system_call+0x5c/0x70

The fix is simple, we need to convert the result of the bitwise & to
an int before returning it.
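
As an aside, the truncation is easy to reproduce outside the kernel.
The following standalone snippet is illustration only; the names mirror
the kernel ones but everything here is self-contained:

  #include <stdio.h>
  #include <stdint.h>

  #define _PAGE_PRESENT 0x8000000000000000ul

  /* On big endian cpu_to_be64() is a no-op, so bit 63 stays in place. */
  static int pgd_present_broken(uint64_t pgd_raw)
  {
          return (pgd_raw & _PAGE_PRESENT);   /* truncated to int => always 0 */
  }

  static int pgd_present_fixed(uint64_t pgd_raw)
  {
          return !!(pgd_raw & _PAGE_PRESENT); /* 0 or 1, no truncation */
  }

  int main(void)
  {
          uint64_t pgd = _PAGE_PRESENT | 0x1000;

          printf("broken=%d fixed=%d\n",
                 pgd_present_broken(pgd), pgd_present_fixed(pgd));
          /* prints: broken=0 fixed=1 */
          return 0;
  }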

Thanks to Jan Kara and Aneesh for help with debugging.

Fixes: da7ad366b497 ("powerpc/mm/book3s: Update pmd_present to look at 
_PAGE_PRESENT bit")
Cc: sta...@vger.kernel.org # v4.20+
Reported-by: Erhard F. 
Reviewed-by: Aneesh Kumar K.V 
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c9bfe526ca9d..d8c8d7c9df15 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -904,7 +904,7 @@ static inline int pud_none(pud_t pud)
 
 static inline int pud_present(pud_t pud)
 {
-   return (pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT));
+   return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PRESENT));
 }
 
 extern struct page *pud_page(pud_t pud);
@@ -951,7 +951,7 @@ static inline int pgd_none(pgd_t pgd)
 
 static inline int pgd_present(pgd_t pgd)
 {
-   return (pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
+   return !!(pgd_raw(pgd) & cpu_to_be64(_PAGE_PRESENT));
 }
 
 static inline pte_t pgd_pte(pgd_t pgd)
-- 
2.20.1



[PATCH 1/1] powerpc/64: Adjust order in pcibios_init()

2019-02-13 Thread Sam Bobroff
The pcibios_init() function for 64 bit PowerPC currently calls
pci_bus_add_devices() before pcibios_resource_survey(), which seems
incorrect because it adds devices and attempts to bind their drivers
before allocating their resources (although no problems are apparent).

So move the call to pci_bus_add_devices() to after
pcibios_resource_survey().

This will also allow the ppc_md.pcibios_bus_add_device() hooks to
perform actions that depend on PCI resources, both during rescanning
(where this is already the case) and at boot time, which should
support improvements and refactoring.

Signed-off-by: Sam Bobroff 
---
Hi everyone,

I've tested this on a P9 for both the host and a KVM guest, and the change
hasn't caused any differences in PCI resource assignments or the general boot
messages.

I've also had a go at inspecting most of the code used by pci_bus_add_devices()
and pcibios_resource_survey() and it doesn't look like there are going to be
any changes in behaviour caused by reordering.  It might be worth mentioning
that the hotplug path (see pcibios_finish_adding_to_bus()) already does
resource allocation before calling pci_bus_add_devices().

However, it would be great if someone could test this change on some older
hardware or comment on whether we should make the same change on 32-bit machines.

Cheers,
Sam.

 arch/powerpc/kernel/pci_64.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index 9d8c10d55407..1ce2dbdb 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -58,14 +58,16 @@ static int __init pcibios_init(void)
pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0);
 
/* Scan all of the recorded PCI controllers.  */
-   list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+   list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
pcibios_scan_phb(hose);
-   pci_bus_add_devices(hose->bus);
-   }
 
/* Call common code to handle resource allocation */
pcibios_resource_survey();
 
+   /* Add devices. */
+   list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+   pci_bus_add_devices(hose->bus);
+
printk(KERN_DEBUG "PCI: Probing PCI hardware done\n");
 
return 0;
-- 
2.19.0.2.gcad72f5712



Re: [PATCH v3 2/2] drivers/mtd: Fix device registration error

2019-02-13 Thread Aneesh Kumar K.V

On 2/13/19 6:58 PM, Boris Brezillon wrote:

Subject prefix should be "mtd: powernv_flash: "

On Mon, 11 Feb 2019 19:03:38 +0530
"Aneesh Kumar K.V"  wrote:


This change helps me to get multiple mtd devices registered. Without this
I get

sysfs: cannot create duplicate filename '/bus/nvmem/devices/flash0'
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc2-00557-g1ef20ef21f22 #13
Call Trace:
[c000b38e3220] [c0b58fe4] dump_stack+0xe8/0x164 (unreliable)
[c000b38e3270] [c04cf074] sysfs_warn_dup+0x84/0xb0
[c000b38e32f0] [c04cf6c4] sysfs_do_create_link_sd.isra.0+0x114/0x150
[c000b38e3340] [c0726a84] bus_add_device+0x94/0x1e0
[c000b38e33c0] [c07218f0] device_add+0x4d0/0x830
[c000b38e3480] [c09d54a8] nvmem_register.part.2+0x1c8/0xb30
[c000b38e3560] [c0834530] mtd_nvmem_add+0x90/0x120
[c000b38e3650] [c0835bc8] add_mtd_device+0x198/0x4e0
[c000b38e36f0] [c083619c] mtd_device_parse_register+0x11c/0x280
[c000b38e3780] [c0840830] powernv_flash_probe+0x180/0x250
[c000b38e3820] [c072c120] platform_drv_probe+0x60/0xf0
[c000b38e38a0] [c07283c8] really_probe+0x138/0x4d0
[c000b38e3930] [c0728acc] driver_probe_device+0x13c/0x1b0
[c000b38e39b0] [c0728c7c] __driver_attach+0x13c/0x1c0
[c000b38e3a30] [c0725130] bus_for_each_dev+0xa0/0x120
[c000b38e3a90] [c0727b2c] driver_attach+0x2c/0x40
[c000b38e3ab0] [c07270f8] bus_add_driver+0x228/0x360
[c000b38e3b40] [c072a2e0] driver_register+0x90/0x1a0
[c000b38e3bb0] [c072c020] __platform_driver_register+0x50/0x70
[c000b38e3bd0] [c105c984] powernv_flash_driver_init+0x24/0x38
[c000b38e3bf0] [c0010904] do_one_initcall+0x84/0x464
[c000b38e3cd0] [c1004548] kernel_init_freeable+0x530/0x634
[c000b38e3db0] [c0011154] kernel_init+0x1c/0x168
[c000b38e3e20] [c000bed4] ret_from_kernel_thread+0x5c/0x68
mtd mtd1: Failed to register NVMEM device

With the change we now have

root@(none):/sys/bus/nvmem/devices# ls -al
total 0
drwxr-xr-x 2 root root 0 Feb  6 20:49 .
drwxr-xr-x 4 root root 0 Feb  6 20:49 ..
lrwxrwxrwx 1 root root 0 Feb  6 20:49 flash@0 -> 
../../../devices/platform/ibm,opal:flash@0/mtd/mtd0/flash@0
lrwxrwxrwx 1 root root 0 Feb  6 20:49 flash@1 -> 
../../../devices/platform/ibm,opal:flash@1/mtd/mtd1/flash@1

Fixes: acfe63ec1c59 ("mtd: Convert to using %pOFn instead of device_node.name")


Actually it's not this commit that is at fault as mtd->name was already
given the value of device_node->name before that. I think you're
actually fixing 1cbb4a1c433a ("mtd: powernv: Add powernv flash MTD
abstraction driver").

No need to send a new version, I can fix that when applying, just let
me know if you're okay with the changes I suggested.



The suggested changes look good.

Thanks
-aneesh



Re: [PATCH 0/5] use pinned_vm instead of locked_vm to account pinned pages

2019-02-13 Thread Ira Weiny
On Mon, Feb 11, 2019 at 03:54:47PM -0700, Jason Gunthorpe wrote:
> On Mon, Feb 11, 2019 at 05:44:32PM -0500, Daniel Jordan wrote:
> 
> > All five of these places, and probably some of Davidlohr's conversions,
> > probably want to be collapsed into a common helper in the core mm for
> > accounting pinned pages.  I tried, and there are several details that
> > likely need discussion, so this can be done as a follow-on.
> 
> I've wondered the same..

I'm really thinking this would be a nice way to ensure it gets cleaned up and
does not happen again.

Also, by moving it to the core we could better manage any user visible changes.

From a high level, pinned is a subset of locked, so it seems like we need 2
sets of helpers.

try_increment_locked_vm(...)
decrement_locked_vm(...)

try_increment_pinned_vm(...)
decrement_pinned_vm(...)

Where try_increment_pinned_vm() also increments locked_vm...  Of course this
may end up reverting the improvement of Davidlohr  Bueso's atomic work...  :-(
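
A rough sketch of what such helpers might look like (illustration only,
not existing code: the RLIMIT check and the lack of locking/atomicity
here are assumptions, and callers would still need to serialize updates):

	static inline bool try_increment_pinned_vm(struct mm_struct *mm,
						   unsigned long npages)
	{
		unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

		if (mm->locked_vm + npages > limit)
			return false;
		/* pinned is a subset of locked, so bump both counters */
		mm->locked_vm += npages;
		mm->pinned_vm += npages;
		return true;
	}

	static inline void decrement_pinned_vm(struct mm_struct *mm,
					       unsigned long npages)
	{
		mm->pinned_vm -= npages;
		mm->locked_vm -= npages;
	}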

Furthermore, it would seem better (although I don't know if it is at all
possible) if this were accounted for in core calls which tracked the pages
based on how they are being used, so that drivers can't call
try_increment_locked_vm() and then pin the pages...  thus getting the
accounting wrong vs what actually happened.

And then in the end we can go back to locked_vm being the value checked against
RLIMIT_MEMLOCK.

Ira



Re: [PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-13 Thread Daniel Jordan
On Wed, Feb 13, 2019 at 01:03:30PM -0700, Alex Williamson wrote:
> Daniel Jordan  wrote:
> > On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> > > This still makes me nervous because we have userspace dependencies on
> > > setting process locked memory.  
> > 
> > Could you please expand on this?  Trying to get more context.
> 
> VFIO is a userspace driver interface and the pinned/locked page
> accounting we're doing here is trying to prevent a user from exceeding
> their locked memory limits.  Thus a VM management tool or unprivileged
> userspace driver needs to have appropriate locked memory limits
> configured for their use case.  Currently we do not have a unified
> accounting scheme, so if a page is mlock'd by the user and also mapped
> through VFIO for DMA, it's accounted twice, these both increment
> locked_vm and userspace needs to manage that.  If pinned memory
> and locked memory are now two separate buckets and we're only comparing
> one of them against the locked memory limit, then it seems we have
> effectively doubled the user's locked memory for this use case, as
> Jason questioned.  The user could mlock one page and DMA map another,
> they're both "locked", but now they only take one slot in each bucket.

Right, yes.  Should have been more specific.  I was after a concrete use case
where this would happen (sounded like you may have had a specific tool in
mind).

But it doesn't matter.  I understand your concern and agree that, given the
possibility that accounting in _some_ tool can be affected, we should fix
accounting before changing user visible behavior.  I can start a separate
discussion, having opened the can of worms again :)

> If we continue forward with using a separate bucket here, userspace
> could infer that accounting is unified and lower the user's locked
> memory limit, or exploit the gap that their effective limit might
> actually exceed system memory.  In the former case, if we do eventually
> correct to compare the total of the combined buckets against the user's
> locked memory limits, we'll break users that have adapted their locked
> memory limits to meet the apparent needs.  In the latter case, the
> inconsistent accounting is potentially an attack vector.

Makes sense.

> > > There's a user visible difference if we
> > > account for them in the same bucket vs separate.  Perhaps we're
> > > counting in the wrong bucket now, but if we "fix" that and userspace
> > > adapts, how do we ever go back to accounting both mlocked and pinned
> > > memory combined against rlimit?  Thanks,  
> > 
> > PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> > locked_vm accounting separate, but allowed the two to be added safely to be
> > compared against RLIMIT_MEMLOCK.
> 
> Unless I'm incorrect in the concerns above, I don't see how we can
> convert vfio before this occurs.
>  
> > Anyway, until some solution is agreed on, are there objections to converting
> > locked_vm to an atomic, to avoid user-visible changes, instead of switching
> > locked_vm users to pinned_vm?
> 
> Seems that as long as we have separate buckets that are compared
> individually to rlimit that we've got problems, it's just a matter of
> where they're exposed based on which bucket is used for which
> interface.  Thanks,

Indeed.  But for now, any concern with simply changing the type of the
currently used counter to an atomic, to reduce mmap_sem usage?  This is just an
implementation detail, invisible to userspace.
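
For illustration, the sort of accounting that becomes possible once the
counter is atomic (a hypothetical sketch assuming locked_vm were
converted to an atomic64_t; the helper name is made up and this is not
code from the series):

	static int account_locked_vm(struct mm_struct *mm, unsigned long npages)
	{
		unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
		s64 locked;

		/* no mmap_sem needed for the accounting itself */
		locked = atomic64_add_return(npages, &mm->locked_vm);
		if (locked > limit && !capable(CAP_IPC_LOCK)) {
			atomic64_sub(npages, &mm->locked_vm);
			return -ENOMEM;
		}
		return 0;
	}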


Re: [PATCH V2 3/7] mm/gup: Change GUP fast to use flags rather than a write 'bool'

2019-02-13 Thread Ira Weiny
On Wed, Feb 13, 2019 at 04:11:10PM -0700, Jason Gunthorpe wrote:
> On Wed, Feb 13, 2019 at 03:04:51PM -0800, ira.we...@intel.com wrote:
> > From: Ira Weiny 
> > 
> > To facilitate additional options to get_user_pages_fast() change the
> > singular write parameter to be gup_flags.
> 
> So now we have:
> 
> long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
>   struct page **pages, unsigned int gup_flags);
> 
> and 
> 
> int get_user_pages_fast(unsigned long start, int nr_pages,
>   unsigned int gup_flags, struct page **pages)
> 
> Does this make any sense? At least the arguments should be in the same
> order, I think.

Yes...  and no.  see below.

> 
> Also this comment:
> /*
>  * get_user_pages_unlocked() is suitable to replace the form:
>  *
>  *  down_read(&mm->mmap_sem);
>  *  get_user_pages(tsk, mm, ..., pages, NULL);
>  *  up_read(&mm->mmap_sem);
>  *
>  *  with:
>  *
>  *  get_user_pages_unlocked(tsk, mm, ..., pages);
>  *
>  * It is functionally equivalent to get_user_pages_fast so
>  * get_user_pages_fast should be used instead if specific gup_flags
>  * (e.g. FOLL_FORCE) are not required.
>  */
> 
> Needs some attention as the recommendation is now nonsense.

IMO they are not functionally equivalent.

We can't remove *_unlocked() as it is used as both a helper for the arch
specific *_fast() calls, _and_ in drivers.  Again I don't know the history here
but it could be that the drivers should never have used the call in the first
place???  Or been converted at some point?

I could change the comment to be something like

/*
 * get_user_pages_unlocked() is only to be used by arch specific
 * get_user_pages_fast() calls.  Drivers should be calling
 * get_user_pages_fast()
 */

Instead of the current comment.

And change the drivers to get_user_pages_fast().

However, I'm not sure if these drivers need the FOLL_TOUCH flag which
*_unlocked() adds for them.  And adding FOLL_TOUCH to *_fast() is not going to
give the same functionality.

It _looks_ like we can add FOLL_TOUCH functionality to the fast path in the
generic code.  I'm not sure about the arch-specific paths.

If we did that then we can have those drivers use FOLL_TOUCH or not in *_fast()
if they want/need.
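
For illustration, a driver conversion would then look roughly like this
(hypothetical; note the different argument order between the two calls,
as pointed out above):

	/* before: *_unlocked() quietly adds FOLL_TOUCH for the driver */
	ret = get_user_pages_unlocked(addr, npages, pages, FOLL_WRITE);

	/*
	 * after (hypothetical): the driver calls the fast variant and asks
	 * for FOLL_TOUCH itself, assuming the fast path learns to honour it
	 */
	ret = get_user_pages_fast(addr, npages, FOLL_WRITE | FOLL_TOUCH, pages);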

> 
> Honestly a proper explanation of why two functions exist would be
> great at this point :)

I've not researched it.  I do agree that there seem to be a lot of calls in
this file and the differences between them are subtle.

Ira

> 
> Jason


Re: [PATCH V2 3/7] mm/gup: Change GUP fast to use flags rather than a write 'bool'

2019-02-13 Thread Jason Gunthorpe
On Wed, Feb 13, 2019 at 03:04:51PM -0800, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> To facilitate additional options to get_user_pages_fast() change the
> singular write parameter to be gup_flags.

So now we have:

long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);

and 

int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)

Does this make any sense? At least the arguments should be in the same
order, I think.

Also this comment:
/*
 * get_user_pages_unlocked() is suitable to replace the form:
 *
 *  down_read(&mm->mmap_sem);
 *  get_user_pages(tsk, mm, ..., pages, NULL);
 *  up_read(&mm->mmap_sem);
 *
 *  with:
 *
 *  get_user_pages_unlocked(tsk, mm, ..., pages);
 *
 * It is functionally equivalent to get_user_pages_fast so
 * get_user_pages_fast should be used instead if specific gup_flags
 * (e.g. FOLL_FORCE) are not required.
 */

Needs some attention as the recommendation is now nonsense.

Honestly a proper explanation of why two functions exist would be
great at this point :)

Jason


[PATCH V2 7/7] IB/mthca: Use the new FOLL_LONGTERM flag to get_user_pages_fast()

2019-02-13 Thread ira . weiny
From: Ira Weiny 

Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against
FS DAX pages being mapped.

Signed-off-by: Ira Weiny 
---
 drivers/infiniband/hw/mthca/mthca_memfree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c 
b/drivers/infiniband/hw/mthca/mthca_memfree.c
index 112d2f38e0de..8ff0e90d7564 100644
--- a/drivers/infiniband/hw/mthca/mthca_memfree.c
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -472,7 +472,8 @@ int mthca_map_user_db(struct mthca_dev *dev, struct 
mthca_uar *uar,
goto out;
}
 
-   ret = get_user_pages_fast(uaddr & PAGE_MASK, 1, FOLL_WRITE, pages);
+   ret = get_user_pages_fast(uaddr & PAGE_MASK, 1,
+ FOLL_WRITE | FOLL_LONGTERM, pages);
if (ret < 0)
goto out;
 
-- 
2.20.1



[PATCH V2 6/7] IB/qib: Use the new FOLL_LONGTERM flag to get_user_pages_fast()

2019-02-13 Thread ira . weiny
From: Ira Weiny 

Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against
FS DAX pages being mapped.

Signed-off-by: Ira Weiny 
---
 drivers/infiniband/hw/qib/qib_user_sdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c 
b/drivers/infiniband/hw/qib/qib_user_sdma.c
index 31c523b2a9f5..b53cc0240e02 100644
--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -673,7 +673,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata 
*dd,
else
j = npages;
 
-   ret = get_user_pages_fast(addr, j, 0, pages);
+   ret = get_user_pages_fast(addr, j, FOLL_LONGTERM, pages);
if (ret != j) {
i = 0;
j = ret;
-- 
2.20.1



[PATCH V2 5/7] IB/hfi1: Use the new FOLL_LONGTERM flag to get_user_pages_fast()

2019-02-13 Thread ira . weiny
From: Ira Weiny 

Use the new FOLL_LONGTERM to get_user_pages_fast() to protect against
FS DAX pages being mapped.

Signed-off-by: Ira Weiny 
---
 drivers/infiniband/hw/hfi1/user_pages.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/user_pages.c 
b/drivers/infiniband/hw/hfi1/user_pages.c
index 78ccacaf97d0..6a7f9cd5a94e 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -104,9 +104,11 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned 
long vaddr, size_t np
bool writable, struct page **pages)
 {
int ret;
+   unsigned int gup_flags = writable ? FOLL_WRITE : 0;
 
-   ret = get_user_pages_fast(vaddr, npages, writable ? FOLL_WRITE : 0,
- pages);
+   gup_flags |= FOLL_LONGTERM;
+
+   ret = get_user_pages_fast(vaddr, npages, gup_flags, pages);
if (ret < 0)
return ret;
 
-- 
2.20.1



[PATCH V2 4/7] mm/gup: Add FOLL_LONGTERM capability to GUP fast

2019-02-13 Thread ira . weiny
From: Ira Weiny 

DAX pages were previously unprotected from longterm pins when users
called get_user_pages_fast().

Use the new FOLL_LONGTERM flag to check for DEVMAP pages and fall
back to regular GUP processing if a DEVMAP page is encountered.

Signed-off-by: Ira Weiny 
---
 mm/gup.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 6f32d36b3c5b..f7e759c523bb 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1439,6 +1439,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
goto pte_unmap;
 
if (pte_devmap(pte)) {
+   if (unlikely(flags & FOLL_LONGTERM))
+   goto pte_unmap;
+
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
@@ -1578,8 +1581,11 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
unsigned long addr,
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
 
-   if (pmd_devmap(orig))
+   if (pmd_devmap(orig)) {
+   if (unlikely(flags & FOLL_LONGTERM))
+   return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+   }
 
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1904,8 +1910,20 @@ int get_user_pages_fast(unsigned long start, int 
nr_pages,
start += nr << PAGE_SHIFT;
pages += nr;
 
-   ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- gup_flags);
+   if (gup_flags & FOLL_LONGTERM) {
+   down_read(&current->mm->mmap_sem);
+   ret = __gup_longterm_locked(current, current->mm,
+   start, nr_pages - nr,
+   pages, NULL, gup_flags);
+   up_read(&current->mm->mmap_sem);
+   } else {
+   /*
+* retain FAULT_FOLL_ALLOW_RETRY optimization if
+* possible
+*/
+   ret = get_user_pages_unlocked(start, nr_pages - nr,
+ pages, gup_flags);
+   }
 
/* Have to be a bit careful with return values */
if (nr > 0) {
-- 
2.20.1



[PATCH V2 3/7] mm/gup: Change GUP fast to use flags rather than a write 'bool'

2019-02-13 Thread ira . weiny
From: Ira Weiny 

To facilitate additional options to get_user_pages_fast() change the
singular write parameter to be gup_flags.

This patch does not change any functionality.  New functionality will
follow in subsequent patches.

Some of the get_user_pages_fast() call sites were unchanged because they
already passed FOLL_WRITE or 0 for the write parameter.

Signed-off-by: Ira Weiny 
---
 arch/mips/mm/gup.c | 11 ++-
 arch/powerpc/kvm/book3s_64_mmu_hv.c|  4 ++--
 arch/powerpc/kvm/e500_mmu.c|  2 +-
 arch/powerpc/mm/mmu_context_iommu.c|  4 ++--
 arch/s390/kvm/interrupt.c  |  2 +-
 arch/s390/mm/gup.c | 12 ++--
 arch/sh/mm/gup.c   | 11 ++-
 arch/sparc/mm/gup.c|  9 +
 arch/x86/kvm/paging_tmpl.h |  2 +-
 arch/x86/kvm/svm.c |  2 +-
 drivers/fpga/dfl-afu-dma-region.c  |  2 +-
 drivers/gpu/drm/via/via_dmablit.c  |  3 ++-
 drivers/infiniband/hw/hfi1/user_pages.c|  3 ++-
 drivers/misc/genwqe/card_utils.c   |  2 +-
 drivers/misc/vmw_vmci/vmci_host.c  |  2 +-
 drivers/misc/vmw_vmci/vmci_queue_pair.c|  6 --
 drivers/platform/goldfish/goldfish_pipe.c  |  3 ++-
 drivers/rapidio/devices/rio_mport_cdev.c   |  4 +++-
 drivers/sbus/char/oradax.c |  2 +-
 drivers/scsi/st.c  |  3 ++-
 drivers/staging/gasket/gasket_page_table.c |  4 ++--
 drivers/tee/tee_shm.c  |  2 +-
 drivers/vfio/vfio_iommu_spapr_tce.c|  3 ++-
 drivers/vhost/vhost.c  |  2 +-
 drivers/video/fbdev/pvr2fb.c   |  2 +-
 drivers/virt/fsl_hypervisor.c  |  2 +-
 drivers/xen/gntdev.c   |  2 +-
 fs/orangefs/orangefs-bufmap.c  |  2 +-
 include/linux/mm.h |  4 ++--
 kernel/futex.c |  2 +-
 lib/iov_iter.c |  7 +--
 mm/gup.c   | 10 +-
 mm/util.c  |  8 
 net/ceph/pagevec.c |  2 +-
 net/rds/info.c |  2 +-
 net/rds/rdma.c |  3 ++-
 36 files changed, 81 insertions(+), 65 deletions(-)

diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 0d14e0d8eacf..4c2b4483683c 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -235,7 +235,7 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
  * get_user_pages_fast() - pin user pages in memory
  * @start: starting user address
  * @nr_pages:  number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
  * @pages: array that receives pointers to the pages pinned.
  * Should be at least nr_pages long.
  *
@@ -247,8 +247,8 @@ int __get_user_pages_fast(unsigned long start, int 
nr_pages, int write,
  * requested. If nr_pages is 0 or negative, returns 0. If no pages
  * were pinned, returns -errno.
  */
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
-   struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+   unsigned int gup_flags, struct page **pages)
 {
struct mm_struct *mm = current->mm;
unsigned long addr, len, end;
@@ -273,7 +273,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, 
int write,
next = pgd_addr_end(addr, end);
if (pgd_none(pgd))
goto slow;
-   if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
+   if (!gup_pud_range(pgd, addr, next, gup_flags & FOLL_WRITE,
+  pages, &nr))
goto slow;
} while (pgdp++, addr = next, addr != end);
local_irq_enable();
@@ -289,7 +290,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, 
int write,
pages += nr;
 
ret = get_user_pages_unlocked(start, (end - start) >> PAGE_SHIFT,
- pages, write ? FOLL_WRITE : 0);
+ pages, gup_flags);
 
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c 
b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index bd2dcfbf00cd..8fcb0a921e46 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -582,7 +582,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn);
-   npages = get_user_pages_fast(hva, 1, writing, pages);
+   npages = get_user_pages_fast(hva, 1, 

[PATCH V2 1/7] mm/gup: Replace get_user_pages_longterm() with FOLL_LONGTERM

2019-02-13 Thread ira . weiny
From: Ira Weiny 

Rather than have a separate get_user_pages_longterm() call,
introduce FOLL_LONGTERM and change the longterm callers to use
it.

This patch does not change any functionality.

FOLL_LONGTERM can only be supported with get_user_pages() as it
requires vmas to determine if DAX is in use.
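
(As a rough illustration of why the vmas are needed: the FS DAX test is
per-vma, along the lines of the sketch below, which loosely mirrors the
checks done for long-term pins in mm/gup.c; the helper name here is made
up and details are simplified.)

	static bool longterm_pin_disallowed(struct vm_area_struct **vmas,
					    long nr_pages)
	{
		long i;

		for (i = 0; i < nr_pages; i++) {
			/* FS DAX pages must not be held long term */
			if (vma_is_fsdax(vmas[i]))
				return true;
		}
		return false;
	}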

Signed-off-by: Ira Weiny 
---
 drivers/infiniband/core/umem.c |   5 +-
 drivers/infiniband/hw/qib/qib_user_pages.c |   8 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c   |   9 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c  |   6 +-
 drivers/vfio/vfio_iommu_type1.c|   3 +-
 include/linux/mm.h |  13 +-
 mm/gup.c   | 138 -
 mm/gup_benchmark.c |   5 +-
 8 files changed, 101 insertions(+), 86 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index b69d3efa8712..120a40df91b4 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -185,10 +185,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
unsigned long addr,
 
while (npages) {
	down_read(&mm->mmap_sem);
-   ret = get_user_pages_longterm(cur_base,
+   ret = get_user_pages(cur_base,
 min_t(unsigned long, npages,
   PAGE_SIZE / sizeof (struct page *)),
-gup_flags, page_list, vma_list);
+gup_flags | FOLL_LONGTERM,
+page_list, vma_list);
if (ret < 0) {
	up_read(&mm->mmap_sem);
goto umem_release;
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c 
b/drivers/infiniband/hw/qib/qib_user_pages.c
index ef8bcf366ddc..1b9368261035 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -114,10 +114,10 @@ int qib_get_user_pages(unsigned long start_page, size_t 
num_pages,
 
	down_read(&current->mm->mmap_sem);
for (got = 0; got < num_pages; got += ret) {
-   ret = get_user_pages_longterm(start_page + got * PAGE_SIZE,
- num_pages - got,
- FOLL_WRITE | FOLL_FORCE,
- p + got, NULL);
+   ret = get_user_pages(start_page + got * PAGE_SIZE,
+num_pages - got,
+FOLL_LONGTERM | FOLL_WRITE | FOLL_FORCE,
+p + got, NULL);
if (ret < 0) {
	up_read(&current->mm->mmap_sem);
goto bail_release;
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c 
b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 06862a6af185..1d9a182ac163 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -143,10 +143,11 @@ static int usnic_uiom_get_pages(unsigned long addr, 
size_t size, int writable,
ret = 0;
 
while (npages) {
-   ret = get_user_pages_longterm(cur_base,
-   min_t(unsigned long, npages,
-   PAGE_SIZE / sizeof(struct page *)),
-   gup_flags, page_list, NULL);
+   ret = get_user_pages(cur_base,
+min_t(unsigned long, npages,
+PAGE_SIZE / sizeof(struct page *)),
+gup_flags | FOLL_LONGTERM,
+page_list, NULL);
 
if (ret < 0)
goto out;
diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c 
b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 08929c087e27..870a2a526e0b 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -186,12 +186,12 @@ static int videobuf_dma_init_user_locked(struct 
videobuf_dmabuf *dma,
dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",
data, size, dma->nr_pages);
 
-   err = get_user_pages_longterm(data & PAGE_MASK, dma->nr_pages,
-flags, dma->pages, NULL);
+   err = get_user_pages(data & PAGE_MASK, dma->nr_pages,
+flags | FOLL_LONGTERM, dma->pages, NULL);
 
if (err != dma->nr_pages) {
dma->nr_pages = (err >= 0) ? err : 0;
-   dprintk(1, "get_user_pages_longterm: err=%d [%d]\n", err,
+   dprintk(1, "get_user_pages: err=%d [%d]\n", err,
dma->nr_pages);
return err < 0 ? err : -EINVAL;
}
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..1500bd0bb6da 100644
--- 

[PATCH V2 0/7] Add FOLL_LONGTERM to GUP fast and use it

2019-02-13 Thread ira . weiny
From: Ira Weiny 

NOTE: This series depends on my clean up patch to remove the write parameter
from gup_fast_permitted()[1]

HFI1, qib, and mthca use get_user_pages_fast() due to its performance
advantages.  These pages can be held for a significant time.  But
get_user_pages_fast() does not protect against mapping of FS DAX pages.

Introduce FOLL_LONGTERM and use this flag in get_user_pages_fast() which
retains the performance while also adding the FS DAX checks.  XDP has also
shown interest in using this functionality.[2]

In addition we change the get_user_pages_longterm() callers to use
get_user_pages() with the new FOLL_LONGTERM flag and remove the specialized
get_user_pages_longterm() call.

[1] https://lkml.org/lkml/2019/2/11/237
[2] https://lkml.org/lkml/2019/2/11/1789

Ira Weiny (7):
  mm/gup: Replace get_user_pages_longterm() with FOLL_LONGTERM
  mm/gup: Change write parameter to flags in fast walk
  mm/gup: Change GUP fast to use flags rather than a write 'bool'
  mm/gup: Add FOLL_LONGTERM capability to GUP fast
  IB/hfi1: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
  IB/qib: Use the new FOLL_LONGTERM flag to get_user_pages_fast()
  IB/mthca: Use the new FOLL_LONGTERM flag to get_user_pages_fast()

 arch/mips/mm/gup.c  |  11 +-
 arch/powerpc/kvm/book3s_64_mmu_hv.c |   4 +-
 arch/powerpc/kvm/e500_mmu.c |   2 +-
 arch/powerpc/mm/mmu_context_iommu.c |   4 +-
 arch/s390/kvm/interrupt.c   |   2 +-
 arch/s390/mm/gup.c  |  12 +-
 arch/sh/mm/gup.c|  11 +-
 arch/sparc/mm/gup.c |   9 +-
 arch/x86/kvm/paging_tmpl.h  |   2 +-
 arch/x86/kvm/svm.c  |   2 +-
 drivers/fpga/dfl-afu-dma-region.c   |   2 +-
 drivers/gpu/drm/via/via_dmablit.c   |   3 +-
 drivers/infiniband/core/umem.c  |   5 +-
 drivers/infiniband/hw/hfi1/user_pages.c |   5 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c |   3 +-
 drivers/infiniband/hw/qib/qib_user_pages.c  |   8 +-
 drivers/infiniband/hw/qib/qib_user_sdma.c   |   2 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c|   9 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c   |   6 +-
 drivers/misc/genwqe/card_utils.c|   2 +-
 drivers/misc/vmw_vmci/vmci_host.c   |   2 +-
 drivers/misc/vmw_vmci/vmci_queue_pair.c |   6 +-
 drivers/platform/goldfish/goldfish_pipe.c   |   3 +-
 drivers/rapidio/devices/rio_mport_cdev.c|   4 +-
 drivers/sbus/char/oradax.c  |   2 +-
 drivers/scsi/st.c   |   3 +-
 drivers/staging/gasket/gasket_page_table.c  |   4 +-
 drivers/tee/tee_shm.c   |   2 +-
 drivers/vfio/vfio_iommu_spapr_tce.c |   3 +-
 drivers/vfio/vfio_iommu_type1.c |   3 +-
 drivers/vhost/vhost.c   |   2 +-
 drivers/video/fbdev/pvr2fb.c|   2 +-
 drivers/virt/fsl_hypervisor.c   |   2 +-
 drivers/xen/gntdev.c|   2 +-
 fs/orangefs/orangefs-bufmap.c   |   2 +-
 include/linux/mm.h  |  17 +-
 kernel/futex.c  |   2 +-
 lib/iov_iter.c  |   7 +-
 mm/gup.c| 220 
 mm/gup_benchmark.c  |   5 +-
 mm/util.c   |   8 +-
 net/ceph/pagevec.c  |   2 +-
 net/rds/info.c  |   2 +-
 net/rds/rdma.c  |   3 +-
 44 files changed, 232 insertions(+), 180 deletions(-)

-- 
2.20.1



[PATCH V2 2/7] mm/gup: Change write parameter to flags in fast walk

2019-02-13 Thread ira . weiny
From: Ira Weiny 

In order to support more options in the GUP fast walk, change
the write parameter to flags throughout the call stack.

This patch does not change functionality and passes FOLL_WRITE
where write was previously used.

Signed-off-by: Ira Weiny 
---
 mm/gup.c | 52 ++--
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index ee96eaff118c..681388236106 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1417,7 +1417,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, 
struct page **pages)
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-int write, struct page **pages, int *nr)
+unsigned int flags, struct page **pages, int *nr)
 {
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1435,7 +1435,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
if (pte_protnone(pte))
goto pte_unmap;
 
-   if (!pte_access_permitted(pte, write))
+   if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
 
if (pte_devmap(pte)) {
@@ -1487,7 +1487,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
  * useful to have gup_huge_pmd even if we can't operate on ptes.
  */
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-int write, struct page **pages, int *nr)
+unsigned int flags, struct page **pages, int *nr)
 {
return 0;
 }
@@ -1570,12 +1570,12 @@ static int __gup_device_huge_pud(pud_t pud, pud_t 
*pudp, unsigned long addr,
 #endif
 
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-   unsigned long end, int write, struct page **pages, int *nr)
+   unsigned long end, unsigned int flags, struct page **pages, int 
*nr)
 {
struct page *head, *page;
int refs;
 
-   if (!pmd_access_permitted(orig, write))
+   if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
 
if (pmd_devmap(orig))
@@ -1608,12 +1608,12 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
unsigned long addr,
 }
 
 static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
-   unsigned long end, int write, struct page **pages, int *nr)
+   unsigned long end, unsigned int flags, struct page **pages, int 
*nr)
 {
struct page *head, *page;
int refs;
 
-   if (!pud_access_permitted(orig, write))
+   if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
 
if (pud_devmap(orig))
@@ -1646,13 +1646,13 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, 
unsigned long addr,
 }
 
 static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
-   unsigned long end, int write,
+   unsigned long end, unsigned int flags,
struct page **pages, int *nr)
 {
int refs;
struct page *head, *page;
 
-   if (!pgd_access_permitted(orig, write))
+   if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
 
BUILD_BUG_ON(pgd_devmap(orig));
@@ -1683,7 +1683,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned 
long addr,
 }
 
 static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
-   int write, struct page **pages, int *nr)
+   unsigned int flags, struct page **pages, int *nr)
 {
unsigned long next;
pmd_t *pmdp;
@@ -1705,7 +1705,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, 
unsigned long end,
if (pmd_protnone(pmd))
return 0;
 
-   if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+   if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
 
@@ -1715,9 +1715,9 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, 
unsigned long end,
 * pmd format and THP pmd format
 */
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
-PMD_SHIFT, next, write, pages, nr))
+PMD_SHIFT, next, flags, pages, nr))
return 0;
-   } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+   } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
 
@@ -1725,7 +1725,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, 
unsigned long end,
 }
 
 static int 

Re: [PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-13 Thread Jason Gunthorpe
On Wed, Feb 13, 2019 at 01:03:30PM -0700, Alex Williamson wrote:
> > PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> > locked_vm accounting separate, but allowed the two to be added safely to be
> > compared against RLIMIT_MEMLOCK.
> 
> Unless I'm incorrect in the concerns above, I don't see how we can
> convert vfio before this occurs.

RDMA was converted to this pinned_vm scheme a long time ago, arguably
it is a mistake that VFIO did something different... This was to fix
some other bug where reporting of pages was wrong.

You are not wrong that this approach doesn't entirely make sense
though. :)

Jason


[PATCH v4 3/3] locking/rwsem: Optimize down_read_trylock()

2019-02-13 Thread Waiman Long
Modify __down_read_trylock() to optimize for an unlocked rwsem and make
it generate slightly better code.

Before this patch, down_read_trylock:

   0x <+0>: callq  0x5 
   0x0005 <+5>: jmp0x18 
   0x0007 <+7>: lea0x1(%rdx),%rcx
   0x000b <+11>:mov%rdx,%rax
   0x000e <+14>:lock cmpxchg %rcx,(%rdi)
   0x0013 <+19>:cmp%rax,%rdx
   0x0016 <+22>:je 0x23 
   0x0018 <+24>:mov(%rdi),%rdx
   0x001b <+27>:test   %rdx,%rdx
   0x001e <+30>:jns0x7 
   0x0020 <+32>:xor%eax,%eax
   0x0022 <+34>:retq
   0x0023 <+35>:mov%gs:0x0,%rax
   0x002c <+44>:or $0x3,%rax
   0x0030 <+48>:mov%rax,0x20(%rdi)
   0x0034 <+52>:mov$0x1,%eax
   0x0039 <+57>:retq

After patch, down_read_trylock:

   0x <+0>: callq  0x5 
   0x0005 <+5>: xor%eax,%eax
   0x0007 <+7>: lea0x1(%rax),%rdx
   0x000b <+11>:lock cmpxchg %rdx,(%rdi)
   0x0010 <+16>:jne0x29 
   0x0012 <+18>:mov%gs:0x0,%rax
   0x001b <+27>:or $0x3,%rax
   0x001f <+31>:mov%rax,0x20(%rdi)
   0x0023 <+35>:mov$0x1,%eax
   0x0028 <+40>:retq
   0x0029 <+41>:test   %rax,%rax
   0x002c <+44>:jns0x7 
   0x002e <+46>:xor%eax,%eax
   0x0030 <+48>:retq

By using a rwsem microbenchmark, the down_read_trylock() rate (with a
load of 10 to lengthen the lock critical section) on an x86-64 system
before and after the patch were:

 Before PatchAfter Patch
   # of Threads rlock   rlock
    -   -
1   14,496  14,716
28,644   8,453
46,799   6,983
85,664   7,190

On an ARM64 system, the performance results were:

 Before PatchAfter Patch
   # of Threads rlock   rlock
    -   -
1   23,676  24,488
27,697   9,502
44,945   3,440
82,641   1,603

For the uncontended case (1 thread), the new down_read_trylock() is a
little bit faster. For the contended cases, the new down_read_trylock()
performs pretty well on x86-64, but performance degrades at high
contention levels on ARM64.

Suggested-by: Linus Torvalds 
Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem.h | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 45ee002..1f5775a 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -174,14 +174,17 @@ static inline int __down_read_killable(struct 
rw_semaphore *sem)
 
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
-   long tmp;
+   /*
+* Optimize for the case when the rwsem is not locked at all.
+*/
+   long tmp = RWSEM_UNLOCKED_VALUE;
 
-   while ((tmp = atomic_long_read(&sem->count)) >= 0) {
-   if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp,
-  tmp + RWSEM_ACTIVE_READ_BIAS)) {
+   do {
+   if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+   tmp + RWSEM_ACTIVE_READ_BIAS)) {
return 1;
}
-   }
+   } while (tmp >= 0);
return 0;
 }
 
-- 
1.8.3.1



[PATCH v4 2/3] locking/rwsem: Remove rwsem-spinlock.c & use rwsem-xadd.c for all archs

2019-02-13 Thread Waiman Long
Currently, we have two different implementations of rwsem:
 1) CONFIG_RWSEM_GENERIC_SPINLOCK (rwsem-spinlock.c)
 2) CONFIG_RWSEM_XCHGADD_ALGORITHM (rwsem-xadd.c)

As we are going to use a single generic implementation for rwsem-xadd.c
and no architecture-specific code will be needed, there is no point
in keeping two different implementations of rwsem. In most cases, the
performance of rwsem-spinlock.c will be worse. It also doesn't get all
the performance tuning and optimizations that had been implemented in
rwsem-xadd.c over the years.

For simplification, we are going to remove rwsem-spinlock.c and make all
architectures use a single implementation of rwsem - rwsem-xadd.c.

All references to RWSEM_GENERIC_SPINLOCK and RWSEM_XCHGADD_ALGORITHM
in the code are removed.

Suggested-by: Peter Zijlstra 
Signed-off-by: Waiman Long 
---
 arch/alpha/Kconfig  |   7 -
 arch/arc/Kconfig|   3 -
 arch/arm/Kconfig|   4 -
 arch/arm64/Kconfig  |   3 -
 arch/c6x/Kconfig|   3 -
 arch/csky/Kconfig   |   3 -
 arch/h8300/Kconfig  |   3 -
 arch/hexagon/Kconfig|   6 -
 arch/ia64/Kconfig   |   4 -
 arch/m68k/Kconfig   |   7 -
 arch/microblaze/Kconfig |   6 -
 arch/mips/Kconfig   |   7 -
 arch/nds32/Kconfig  |   3 -
 arch/nios2/Kconfig  |   3 -
 arch/openrisc/Kconfig   |   6 -
 arch/parisc/Kconfig |   6 -
 arch/powerpc/Kconfig|   7 -
 arch/riscv/Kconfig  |   3 -
 arch/s390/Kconfig   |   6 -
 arch/sh/Kconfig |   6 -
 arch/sparc/Kconfig  |   8 -
 arch/unicore32/Kconfig  |   6 -
 arch/x86/Kconfig|   3 -
 arch/x86/um/Kconfig |   6 -
 arch/xtensa/Kconfig |   3 -
 include/linux/rwsem-spinlock.h  |  47 --
 include/linux/rwsem.h   |   5 -
 kernel/Kconfig.locks|   2 +-
 kernel/locking/Makefile |   4 +-
 kernel/locking/rwsem-spinlock.c | 339 
 kernel/locking/rwsem.h  |   3 -
 31 files changed, 2 insertions(+), 520 deletions(-)
 delete mode 100644 include/linux/rwsem-spinlock.h
 delete mode 100644 kernel/locking/rwsem-spinlock.c

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 584a6e1..27c8712 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -49,13 +49,6 @@ config MMU
bool
default y
 
-config RWSEM_GENERIC_SPINLOCK
-   bool
-
-config RWSEM_XCHGADD_ALGORITHM
-   bool
-   default y
-
 config ARCH_HAS_ILOG2_U32
bool
default n
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 376366a..c0dd229 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -63,9 +63,6 @@ config SCHED_OMIT_FRAME_POINTER
 config GENERIC_CSUM
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config ARCH_DISCONTIGMEM_ENABLE
def_bool n
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 664e918..4d81f69 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -176,10 +176,6 @@ config TRACE_IRQFLAGS_SUPPORT
bool
default !CPU_V7M
 
-config RWSEM_XCHGADD_ALGORITHM
-   bool
-   default y
-
 config ARCH_HAS_ILOG2_U32
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d3..24bbcfa 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -238,9 +238,6 @@ config LOCKDEP_SUPPORT
 config TRACE_IRQFLAGS_SUPPORT
def_bool y
 
-config RWSEM_XCHGADD_ALGORITHM
-   def_bool y
-
 config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 456e154..f114655 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -26,9 +26,6 @@ config MMU
 config FPU
def_bool n
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 398113c..90279a1 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -93,9 +93,6 @@ config GENERIC_HWEIGHT
 config MMU
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config STACKTRACE_SUPPORT
def_bool y
 
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 6472a06..ba33326 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -26,9 +26,6 @@ config H8300
 config CPU_BIG_ENDIAN
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index fb2fbfc..5d07c8d 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -64,12 +64,6 @@ config GENERIC_CSUM
 config GENERIC_IRQ_PROBE
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool n
-
-config RWSEM_XCHGADD_ALGORITHM
-   def_bool y
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig

[PATCH v4 1/3] locking/rwsem: Remove arch specific rwsem files

2019-02-13 Thread Waiman Long
As the generic rwsem-xadd code is using the appropriate acquire and
release versions of the atomic operations, the arch specific rwsem.h
files will not be that much faster than the generic code as long as the
atomic functions are properly implemented. So we can remove those arch
specific rwsem.h and stop building asm/rwsem.h to reduce maintenance
effort.

Currently, only x86, alpha and ia64 have implemented architecture
specific fast paths. I don't have access to alpha and ia64 systems for
testing, but they are legacy systems that are not likely to be updated
to the latest kernel anyway.

By using a rwsem microbenchmark, the total locking rates on a 4-socket
56-core 112-thread x86-64 system before and after the patch were as
follows (mixed means equal # of read and write locks):

  Before Patch  After Patch
   # of Threads  wlock   rlock   mixed wlock   rlock   mixed
     -   -   - -   -   -
129,201  30,143  29,45828,615  30,172  29,201
2 6,807  13,299   1,171 7,725  15,025   1,804
4 6,504  12,755   1,520 7,127  14,286   1,345
8 6,762  13,412 764 6,826  13,652 726
   16 6,693  15,408 662 6,599  15,938 626
   32 6,145  15,286 496 5,549  15,487 511
   64 5,812  15,495  60 5,858  15,572  60

There were some run-to-run variations for the multi-thread tests. For
x86-64, using the generic C code fast path seems to be a little bit
faster than the assembly version with low lock contention.  Looking at
the assembly version of the fast paths, there are assembly to/from C
code wrappers that save and restore all the callee-clobbered registers
(7 registers on x86-64). The assembly generated from the generic C
code doesn't need to do that. That may explain the slight performance
gain here.

The generic asm rwsem.h can also be merged into kernel/locking/rwsem.h
with no code change, as no code other than that under kernel/locking
needs to access the internal rwsem macros and functions.

Signed-off-by: Waiman Long 
---
 MAINTAINERS |   1 -
 arch/alpha/include/asm/rwsem.h  | 211 ---
 arch/arm/include/asm/Kbuild |   1 -
 arch/arm64/include/asm/Kbuild   |   1 -
 arch/hexagon/include/asm/Kbuild |   1 -
 arch/ia64/include/asm/rwsem.h   | 172 -
 arch/powerpc/include/asm/Kbuild |   1 -
 arch/s390/include/asm/Kbuild|   1 -
 arch/sh/include/asm/Kbuild  |   1 -
 arch/sparc/include/asm/Kbuild   |   1 -
 arch/x86/include/asm/rwsem.h| 237 
 arch/x86/lib/Makefile   |   1 -
 arch/x86/lib/rwsem.S| 156 --
 arch/x86/um/Makefile|   1 -
 arch/xtensa/include/asm/Kbuild  |   1 -
 include/asm-generic/rwsem.h | 140 
 include/linux/rwsem.h   |   4 +-
 kernel/locking/percpu-rwsem.c   |   2 +
 kernel/locking/rwsem.h  | 130 ++
 19 files changed, 133 insertions(+), 930 deletions(-)
 delete mode 100644 arch/alpha/include/asm/rwsem.h
 delete mode 100644 arch/ia64/include/asm/rwsem.h
 delete mode 100644 arch/x86/include/asm/rwsem.h
 delete mode 100644 arch/x86/lib/rwsem.S
 delete mode 100644 include/asm-generic/rwsem.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 9919840..053f536 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8926,7 +8926,6 @@ F:arch/*/include/asm/spinlock*.h
 F: include/linux/rwlock*.h
 F: include/linux/mutex*.h
 F: include/linux/rwsem*.h
-F: arch/*/include/asm/rwsem.h
 F: include/linux/seqlock.h
 F: lib/locking*.[ch]
 F: kernel/locking/
diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
deleted file mode 100644
index cf8fc8f9..000
--- a/arch/alpha/include/asm/rwsem.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ALPHA_RWSEM_H
-#define _ALPHA_RWSEM_H
-
-/*
- * Written by Ivan Kokshaysky , 2001.
- * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h
- */
-
-#ifndef _LINUX_RWSEM_H
-#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
-#endif
-
-#ifdef __KERNEL__
-
-#include 
-
-#define RWSEM_UNLOCKED_VALUE   0xL
-#define RWSEM_ACTIVE_BIAS  0x0001L
-#define RWSEM_ACTIVE_MASK  0xL
-#define RWSEM_WAITING_BIAS (-0x0001L)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS(RWSEM_WAITING_BIAS + 
RWSEM_ACTIVE_BIAS)
-
-static inline int ___down_read(struct rw_semaphore *sem)
-{
-   long oldcount;
-#ifndefCONFIG_SMP
-   oldcount = sem->count.counter;
-   sem->count.counter += RWSEM_ACTIVE_READ_BIAS;
-#else
-   long temp;
-   __asm__ __volatile__(
- 

[PATCH v4 0/3] locking/rwsem: Rwsem rearchitecture part 0

2019-02-13 Thread Waiman Long
v4:
 - Remove rwsem-spinlock.c and make all archs use rwsem-xadd.c.

v3:
 - Optimize __down_read_trylock() for the uncontended case as suggested
   by Linus.

v2:
 - Add patch 2 to optimize __down_read_trylock() as suggested by PeterZ.
 - Update performance test data in patch 1.

The goal of this patchset is to remove the architecture specific files
for rwsem-xadd to make it easier to add enhancements in the later rwsem
patches. It also removes the legacy rwsem-spinlock.c file and makes all
the architectures use a single implementation of rwsem - rwsem-xadd.c.

Waiman Long (3):
  locking/rwsem: Remove arch specific rwsem files
  locking/rwsem: Remove rwsem-spinlock.c & use rwsem-xadd.c for all
archs
  locking/rwsem: Optimize down_read_trylock()

 MAINTAINERS |   1 -
 arch/alpha/Kconfig  |   7 -
 arch/alpha/include/asm/rwsem.h  | 211 -
 arch/arc/Kconfig|   3 -
 arch/arm/Kconfig|   4 -
 arch/arm/include/asm/Kbuild |   1 -
 arch/arm64/Kconfig  |   3 -
 arch/arm64/include/asm/Kbuild   |   1 -
 arch/c6x/Kconfig|   3 -
 arch/csky/Kconfig   |   3 -
 arch/h8300/Kconfig  |   3 -
 arch/hexagon/Kconfig|   6 -
 arch/hexagon/include/asm/Kbuild |   1 -
 arch/ia64/Kconfig   |   4 -
 arch/ia64/include/asm/rwsem.h   | 172 
 arch/m68k/Kconfig   |   7 -
 arch/microblaze/Kconfig |   6 -
 arch/mips/Kconfig   |   7 -
 arch/nds32/Kconfig  |   3 -
 arch/nios2/Kconfig  |   3 -
 arch/openrisc/Kconfig   |   6 -
 arch/parisc/Kconfig |   6 -
 arch/powerpc/Kconfig|   7 -
 arch/powerpc/include/asm/Kbuild |   1 -
 arch/riscv/Kconfig  |   3 -
 arch/s390/Kconfig   |   6 -
 arch/s390/include/asm/Kbuild|   1 -
 arch/sh/Kconfig |   6 -
 arch/sh/include/asm/Kbuild  |   1 -
 arch/sparc/Kconfig  |   8 -
 arch/sparc/include/asm/Kbuild   |   1 -
 arch/unicore32/Kconfig  |   6 -
 arch/x86/Kconfig|   3 -
 arch/x86/include/asm/rwsem.h| 237 
 arch/x86/lib/Makefile   |   1 -
 arch/x86/lib/rwsem.S| 156 --
 arch/x86/um/Kconfig |   6 -
 arch/x86/um/Makefile|   1 -
 arch/xtensa/Kconfig |   3 -
 arch/xtensa/include/asm/Kbuild  |   1 -
 include/asm-generic/rwsem.h | 140 -
 include/linux/rwsem-spinlock.h  |  47 --
 include/linux/rwsem.h   |   9 +-
 kernel/Kconfig.locks|   2 +-
 kernel/locking/Makefile |   4 +-
 kernel/locking/percpu-rwsem.c   |   2 +
 kernel/locking/rwsem-spinlock.c | 339 
 kernel/locking/rwsem.h  | 130 +++
 48 files changed, 135 insertions(+), 1447 deletions(-)
 delete mode 100644 arch/alpha/include/asm/rwsem.h
 delete mode 100644 arch/ia64/include/asm/rwsem.h
 delete mode 100644 arch/x86/include/asm/rwsem.h
 delete mode 100644 arch/x86/lib/rwsem.S
 delete mode 100644 include/asm-generic/rwsem.h
 delete mode 100644 include/linux/rwsem-spinlock.h
 delete mode 100644 kernel/locking/rwsem-spinlock.c

-- 
1.8.3.1



Re: [PATCH v4 1/2] dt-bindings: soc: fsl: Document Qixis FPGA usage

2019-02-13 Thread Rob Herring
On Tue, Feb 05, 2019 at 10:14:40AM +, Pankaj Bansal wrote:
> an FPGA-based system controller, called “Qixis”, which
> manages several critical system features, including:
> • Reset sequencing
> • Power supply configuration
> • Board configuration
> • hardware configuration
> 
> The qixis registers are accessible over one or more system-specific
> interfaces, typically I2C, JTAG or an embedded processor.
> 
> Signed-off-by: Pankaj Bansal 
> ---
> 
> Notes:
> V4:
> - No Change
> V3:
> - Added boardname based compatible field in bindings
> - Added bindings for MMIO based FPGA
> V2:
> - No change
> 
>  .../bindings/soc/fsl/qixis_ctrl.txt  | 53 ++
>  1 file changed, 53 insertions(+)
> 
> diff --git a/Documentation/devicetree/bindings/soc/fsl/qixis_ctrl.txt 
> b/Documentation/devicetree/bindings/soc/fsl/qixis_ctrl.txt
> new file mode 100644
> index ..5d510df14be8
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/soc/fsl/qixis_ctrl.txt
> @@ -0,0 +1,53 @@
> +* QIXIS FPGA block
> +
> +an FPGA-based system controller, called “Qixis”, which
> +manages several critical system features, including:
> +• Configuration switch monitoring
> +• Power on/off sequencing
> +• Reset sequencing
> +• Power supply configuration
> +• Board configuration
> +• hardware configuration
> +• Background power data collection (DCM)
> +• Fault monitoring
> +• RCW bypass SRAM (replace flash RCW with internal RCW) (NOR only)
> +• Dedicated functional validation blocks (POSt/IRS, triggered event, and so 
> on)
> +• I2C master for remote board control even with no DUT available
> +
> +The qixis registers are accessible over one or more system-specific 
> interfaces,
> +typically I2C, JTAG or an embedded processor.
> +
> +FPGA connected to I2C:
> +Required properties:
> +
> + - compatible: should be a board-specific string followed by a string
> +   indicating the type of FPGA.  Example:
> + "fsl,-fpga", "fsl,fpga-qixis-i2c"

You don't really need the '-i2c' part because it will only get bound to 
an i2c-based driver when it is a child of an i2c controller.

> + - reg : i2c address of the qixis device.
> +
> +Example (LX2160A-QDS):
> + /* The FPGA node */
> +fpga@66 {
> + compatible = "fsl,lx2160aqds-fpga", "fsl,fpga-qixis-i2c";
> + reg = <0x66>;

> + #address-cells = <1>;
> + #size-cells = <0>;

You don't need this unless you have child nodes with 'reg'.

> + }
> +
> +* Freescale on-board FPGA
> +
> +This is the memory-mapped registers for on board FPGA.
> +
> +Required properties:
> +- compatible: should be a board-specific string followed by a string
> +  indicating the type of FPGA.  Example:
> + "fsl,-fpga", "fsl,fpga-qixis"
> +- reg: should contain the address and the length of the FPGA register set.
> +
> +Example (LS2080A-RDB):
> +
> +cpld@3,0 {
> +compatible = "fsl,ls2080ardb-fpga", "fsl,fpga-qixis";
> +reg = <0x3 0 0x1>;
> +};
> +
> -- 
> 2.17.1
> 


Re: [QUESTION] powerpc, libseccomp, and spu

2019-02-13 Thread Paul Moore
On Tue, Feb 12, 2019 at 9:50 AM Tom Hromatka  wrote:
> On 2/11/19 11:54 AM, Tom Hromatka wrote:
> > PowerPC experts,
> >
> > Paul Moore and I are working on the v2.4 release of libseccomp,
> > and as part of this work I need to update the syscall table for
> > each architecture.
> >
> > I have incorporated the new ppc syscall.tbl into libseccomp, but
> > I am not familiar with the value of "spu" in the ABI column.  For
> > example:
> >
> > > 22   32      umount   sys_oldumount
> > > 22   64      umount   sys_ni_syscall
> > > 22   spu     umount   sys_ni_syscall
> >
> > In libseccomp, we maintain a 32-bit ppc syscall table and a 64-bit
> > ppc syscall table.  Do we also need to add a "spu" ppc syscall
> > table?  Some clarification on the syscalls marked "spu" and "nospu"
> > would be greatly appreciated.
>
> Thanks for the awesome responses, Ben and Michael.  I'll definitely
> get Paul's input as well, but it sounds reasonable to exclude SPUs
> from the newest libseccomp release.

Based on this thread, I don't think we need to worry about "spu" at
this point in time.  Thanks everyone.

> Michael's recommendation to replace "nospu" with "common" and ignore
> "spu" entirely has allowed ppc and ppc64 to pass all of our internal
> checks.
>
> Thanks again!
>
> Tom

-- 
paul moore
www.paul-moore.com


[PATCH v3 2/2] locking/rwsem: Optimize down_read_trylock()

2019-02-13 Thread Waiman Long
Modify __down_read_trylock() to optimize for an unlocked rwsem and make
it generate slightly better code.

Before this patch, down_read_trylock:

   0x <+0>: callq  0x5 
   0x0005 <+5>: jmp0x18 
   0x0007 <+7>: lea0x1(%rdx),%rcx
   0x000b <+11>:mov%rdx,%rax
   0x000e <+14>:lock cmpxchg %rcx,(%rdi)
   0x0013 <+19>:cmp%rax,%rdx
   0x0016 <+22>:je 0x23 
   0x0018 <+24>:mov(%rdi),%rdx
   0x001b <+27>:test   %rdx,%rdx
   0x001e <+30>:jns0x7 
   0x0020 <+32>:xor%eax,%eax
   0x0022 <+34>:retq
   0x0023 <+35>:mov%gs:0x0,%rax
   0x002c <+44>:or $0x3,%rax
   0x0030 <+48>:mov%rax,0x20(%rdi)
   0x0034 <+52>:mov$0x1,%eax
   0x0039 <+57>:retq

After patch, down_read_trylock:

   0x <+0>: callq  0x5 
   0x0005 <+5>: xor%eax,%eax
   0x0007 <+7>: lea0x1(%rax),%rdx
   0x000b <+11>:lock cmpxchg %rdx,(%rdi)
   0x0010 <+16>:jne0x29 
   0x0012 <+18>:mov%gs:0x0,%rax
   0x001b <+27>:or $0x3,%rax
   0x001f <+31>:mov%rax,0x20(%rdi)
   0x0023 <+35>:mov$0x1,%eax
   0x0028 <+40>:retq
   0x0029 <+41>:test   %rax,%rax
   0x002c <+44>:jns0x7 
   0x002e <+46>:xor%eax,%eax
   0x0030 <+48>:retq

By using a rwsem microbenchmark, the down_read_trylock() rate (with a
load of 10 to lengthen the lock critical section) on an x86-64 system
before and after the patch were:

                   Before Patch    After Patch
   # of Threads       rlock           rlock
   ------------       ------          ------
        1             14,496          14,716
        2              8,644           8,453
        4              6,799           6,983
        8              5,664           7,190

On an ARM64 system, the performance results were:

                   Before Patch    After Patch
   # of Threads       rlock           rlock
   ------------       ------          ------
        1             23,676          24,488
        2              7,697           9,502
        4              4,945           3,440
        8              2,641           1,603

For the uncontended case (1 thread), the new down_read_trylock() is a
little bit faster. For the contended cases, the new down_read_trylock()
performs pretty well on x86-64, but performance degrades at high
contention levels on ARM64.
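
To make the uncontended-case argument concrete, here is a rough,
self-contained C11 model of the two strategies (a sketch only, not the
kernel code: the function names and the simplified count protocol --
0 = unlocked, positive = readers, negative = writer -- are made up for
illustration):

#include <stdatomic.h>
#include <stdbool.h>

#define READ_BIAS	1L

/* Old style: load the count first, then attempt the cmpxchg. */
static bool trylock_read_first(atomic_long *count)
{
	long tmp;

	while ((tmp = atomic_load(count)) >= 0) {
		if (atomic_compare_exchange_strong(count, &tmp,
						   tmp + READ_BIAS))
			return true;		/* got the read lock */
	}
	return false;				/* writer active */
}

/* New style: assume the rwsem is unlocked and cmpxchg right away. */
static bool trylock_cmpxchg_first(atomic_long *count)
{
	long tmp = 0;				/* speculate: unlocked */

	do {
		/* on failure, tmp is updated to the current count */
		if (atomic_compare_exchange_strong(count, &tmp,
						   tmp + READ_BIAS))
			return true;		/* one atomic op when unlocked */
	} while (tmp >= 0);
	return false;
}

When the rwsem is free, the second variant skips the initial load and
succeeds with a single atomic operation, which is where the small
uncontended-case win above comes from.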

Suggested-by: Linus Torvalds 
Signed-off-by: Waiman Long 
---
 kernel/locking/rwsem.h | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
index 067e265..e0bcc11 100644
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -175,14 +175,17 @@ static inline int __down_read_killable(struct 
rw_semaphore *sem)
 
 static inline int __down_read_trylock(struct rw_semaphore *sem)
 {
-   long tmp;
+   /*
+* Optimize for the case when the rwsem is not locked at all.
+*/
+   long tmp = RWSEM_UNLOCKED_VALUE;
 
-   while ((tmp = atomic_long_read(&sem->count)) >= 0) {
-   if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp,
-  tmp + RWSEM_ACTIVE_READ_BIAS)) {
+   do {
+   if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp,
+   tmp + RWSEM_ACTIVE_READ_BIAS)) {
return 1;
}
-   }
+   } while (tmp >= 0);
return 0;
 }
 
-- 
1.8.3.1



[PATCH v3 1/2] locking/rwsem: Remove arch specific rwsem files

2019-02-13 Thread Waiman Long
As the generic rwsem-xadd code is using the appropriate acquire and
release versions of the atomic operations, the arch specific rwsem.h
files will not be that much faster than the generic code as long as the
atomic functions are properly implemented. So we can remove those arch
specific rwsem.h and stop building asm/rwsem.h to reduce maintenance
effort.
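
For reference, the generic read-lock fast path boils down to something
like the following (a simplified sketch, not the exact asm-generic code):

static inline void __down_read(struct rw_semaphore *sem)
{
	/* acquire ordering on success; fall back to the slow path */
	if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0))
		rwsem_down_read_failed(sem);
}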

Currently, only x86, alpha and ia64 have implemented architecture
specific fast paths. I don't have access to alpha and ia64 systems for
testing, but they are legacy systems that are not likely to be updated
to the latest kernel anyway.

By using a rwsem microbenchmark, the total locking rates on a 4-socket
56-core 112-thread x86-64 system before and after the patch were as
follows (mixed means equal # of read and write locks):

                         Before Patch              After Patch
   # of Threads   wlock   rlock   mixed     wlock   rlock   mixed
   ------------   -----   -----   -----     -----   -----   -----
        1        29,201  30,143  29,458    28,615  30,172  29,201
        2         6,807  13,299   1,171     7,725  15,025   1,804
        4         6,504  12,755   1,520     7,127  14,286   1,345
        8         6,762  13,412     764     6,826  13,652     726
       16         6,693  15,408     662     6,599  15,938     626
       32         6,145  15,286     496     5,549  15,487     511
       64         5,812  15,495      60     5,858  15,572      60

There were some run-to-run variations for the multi-thread tests. For
x86-64, using the generic C code fast path seems to be a little bit
faster than the assembly version with low lock contention.  Looking at
the assembly version of the fast paths, there are assembly to/from C
code wrappers that save and restore all the callee-clobbered registers
(7 registers on x86-64). The assembly generated from the generic C
code doesn't need to do that. That may explain the slight performance
gain here.

The generic asm rwsem.h can also be merged into kernel/locking/rwsem.h
with no code change as no other code other than those under
kernel/locking needs to access the internal rwsem macros and functions.

Signed-off-by: Waiman Long 
---
 MAINTAINERS |   1 -
 arch/alpha/include/asm/rwsem.h  | 211 ---
 arch/arm/include/asm/Kbuild |   1 -
 arch/arm64/include/asm/Kbuild   |   1 -
 arch/hexagon/include/asm/Kbuild |   1 -
 arch/ia64/include/asm/rwsem.h   | 172 -
 arch/powerpc/include/asm/Kbuild |   1 -
 arch/s390/include/asm/Kbuild|   1 -
 arch/sh/include/asm/Kbuild  |   1 -
 arch/sparc/include/asm/Kbuild   |   1 -
 arch/x86/include/asm/rwsem.h| 237 
 arch/x86/lib/Makefile   |   1 -
 arch/x86/lib/rwsem.S| 156 --
 arch/xtensa/include/asm/Kbuild  |   1 -
 include/asm-generic/rwsem.h | 140 
 include/linux/rwsem.h   |   4 +-
 kernel/locking/percpu-rwsem.c   |   2 +
 kernel/locking/rwsem.h  | 130 ++
 18 files changed, 133 insertions(+), 929 deletions(-)
 delete mode 100644 arch/alpha/include/asm/rwsem.h
 delete mode 100644 arch/ia64/include/asm/rwsem.h
 delete mode 100644 arch/x86/include/asm/rwsem.h
 delete mode 100644 arch/x86/lib/rwsem.S
 delete mode 100644 include/asm-generic/rwsem.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 9919840..053f536 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8926,7 +8926,6 @@ F:arch/*/include/asm/spinlock*.h
 F: include/linux/rwlock*.h
 F: include/linux/mutex*.h
 F: include/linux/rwsem*.h
-F: arch/*/include/asm/rwsem.h
 F: include/linux/seqlock.h
 F: lib/locking*.[ch]
 F: kernel/locking/
diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h
deleted file mode 100644
index cf8fc8f9..000
--- a/arch/alpha/include/asm/rwsem.h
+++ /dev/null
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ALPHA_RWSEM_H
-#define _ALPHA_RWSEM_H
-
-/*
- * Written by Ivan Kokshaysky , 2001.
- * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h
- */
-
-#ifndef _LINUX_RWSEM_H
-#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
-#endif
-
-#ifdef __KERNEL__
-
-#include 
-
-#define RWSEM_UNLOCKED_VALUE   0xL
-#define RWSEM_ACTIVE_BIAS  0x0001L
-#define RWSEM_ACTIVE_MASK  0xL
-#define RWSEM_WAITING_BIAS (-0x0001L)
-#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
-#define RWSEM_ACTIVE_WRITE_BIAS(RWSEM_WAITING_BIAS + 
RWSEM_ACTIVE_BIAS)
-
-static inline int ___down_read(struct rw_semaphore *sem)
-{
-   long oldcount;
-#ifndefCONFIG_SMP
-   oldcount = sem->count.counter;
-   sem->count.counter += RWSEM_ACTIVE_READ_BIAS;
-#else
-   long temp;
-   __asm__ __volatile__(
-   "1: ldq_l   %0,%1\n"
-   " 

[PATCH v3 0/2] locking/rwsem: Remove arch specific rwsem files

2019-02-13 Thread Waiman Long
v3:
 - Optimize __down_read_trylock() for the uncontended case as suggested
   by Linus.

v2:
 - Add patch 2 to optimize __down_read_trylock() as suggested by PeterZ.
 - Update performance test data in patch 1.

This is part 0 of my rwsem patchset. It just removes the architecture
specific files to make it easier to add enhancements in the upcoming
rwsem patches.

Since the two ll/sc platforms that I can test on (arm64 & ppc) are
both using the generic C code, the rwsem performance shouldn't be
affected by this patch, except for the down_read_trylock() code, which
is covered in patch 2 for arm64.

Waiman Long (2):
  locking/rwsem: Remove arch specific rwsem files
  locking/rwsem: Optimize down_read_trylock()

 MAINTAINERS |   1 -
 arch/alpha/include/asm/rwsem.h  | 211 ---
 arch/arm/include/asm/Kbuild |   1 -
 arch/arm64/include/asm/Kbuild   |   1 -
 arch/hexagon/include/asm/Kbuild |   1 -
 arch/ia64/include/asm/rwsem.h   | 172 -
 arch/powerpc/include/asm/Kbuild |   1 -
 arch/s390/include/asm/Kbuild|   1 -
 arch/sh/include/asm/Kbuild  |   1 -
 arch/sparc/include/asm/Kbuild   |   1 -
 arch/x86/include/asm/rwsem.h| 237 
 arch/x86/lib/Makefile   |   1 -
 arch/x86/lib/rwsem.S| 156 --
 arch/xtensa/include/asm/Kbuild  |   1 -
 include/asm-generic/rwsem.h | 140 
 include/linux/rwsem.h   |   4 +-
 kernel/locking/percpu-rwsem.c   |   2 +
 kernel/locking/rwsem.h  | 133 ++
 18 files changed, 136 insertions(+), 929 deletions(-)
 delete mode 100644 arch/alpha/include/asm/rwsem.h
 delete mode 100644 arch/ia64/include/asm/rwsem.h
 delete mode 100644 arch/x86/include/asm/rwsem.h
 delete mode 100644 arch/x86/lib/rwsem.S
 delete mode 100644 include/asm-generic/rwsem.h

-- 
1.8.3.1



Re: [PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-13 Thread Alex Williamson
On Tue, 12 Feb 2019 19:26:50 -0500
Daniel Jordan  wrote:

> On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> > Daniel Jordan  wrote:  
> > > On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:  
> > > > I haven't looked at this super closely, but how does this stuff work?
> > > > 
> > > > do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> > > > 
> > > > Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
> > > >
> > > > Otherwise MEMLOCK is really doubled..
> > > 
> > > So this has been a problem for some time, but it's not as easy as adding 
> > > them
> > > together, see [1][2] for a start.
> > > 
> > > The locked_vm/pinned_vm issue definitely needs fixing, but all this 
> > > series is
> > > trying to do is account to the right counter.  
> 
> Thanks for taking a look, Alex.
> 
> > This still makes me nervous because we have userspace dependencies on
> > setting process locked memory.  
> 
> Could you please expand on this?  Trying to get more context.

VFIO is a userspace driver interface and the pinned/locked page
accounting we're doing here is trying to prevent a user from exceeding
their locked memory limits.  Thus a VM management tool or unprivileged
userspace driver needs to have appropriate locked memory limits
configured for their use case.  Currently we do not have a unified
accounting scheme, so if a page is mlock'd by the user and also mapped
through VFIO for DMA, it's accounted twice: both paths increment
locked_vm, and userspace needs to manage that.  If pinned memory
and locked memory are now two separate buckets and we're only comparing
one of them against the locked memory limit, then it seems we have
effectively doubled the user's locked memory for this use case, as
Jason questioned.  The user could mlock one page and DMA map another,
they're both "locked", but now they only take one slot in each bucket.

If we continue forward with using a separate bucket here, userspace
could infer that accounting is unified and lower the user's locked
memory limit, or exploit the gap that their effective limit might
actually exceed system memory.  In the former case, if we do eventually
correct to compare the total of the combined buckets against the user's
locked memory limits, we'll break users that have adapted their locked
memory limits to meet the apparent needs.  In the latter case, the
inconsistent accounting is potentially an attack vector.
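
To make that concrete, the kind of combined check being discussed would
look roughly like the sketch below (purely illustrative: mm_charge_pinned
is a made-up name, and races between the check and the update are ignored
here):

static int mm_charge_pinned(struct mm_struct *mm, unsigned long npages)
{
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

	/* compare the *sum* of both buckets against the single limit */
	if (mm->locked_vm + mm->pinned_vm + npages > limit)
		return -ENOMEM;

	mm->pinned_vm += npages;	/* locking/atomicity omitted */
	return 0;
}

Until something along those lines exists, either bucket on its own can be
pushed up to the full limit, which is the doubling Jason pointed out.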

> > There's a user visible difference if we
> > account for them in the same bucket vs separate.  Perhaps we're
> > counting in the wrong bucket now, but if we "fix" that and userspace
> > adapts, how do we ever go back to accounting both mlocked and pinned
> > memory combined against rlimit?  Thanks,  
> 
> PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> locked_vm accounting separate, but allowed the two to be added safely to be
> compared against RLIMIT_MEMLOCK.

Unless I'm incorrect in the concerns above, I don't see how we can
convert vfio before this occurs.
 
> Anyway, until some solution is agreed on, are there objections to converting
> locked_vm to an atomic, to avoid user-visible changes, instead of switching
> locked_vm users to pinned_vm?

Seems that as long as we have separate buckets that are compared
individually to the rlimit, we've got problems; it's just a matter of
where they're exposed, based on which bucket is used for which
interface.  Thanks,

Alex


Re: [PATCH-tip 00/22] locking/rwsem: Rework rwsem-xadd & enable new rwsem features

2019-02-13 Thread Linus Torvalds
Ok, those test robot reports are hard to read, but trying to distill it down:

On Wed, Feb 13, 2019 at 1:19 AM Chen Rong  wrote:
>
>  %stddev %change %stddev
>  \  |\
> 196250 ±  8%    -64.1%      70494   will-it-scale.per_thread_ops

That's the original 64% regression..

And then with the patch set:

>  %stddev  change %stddev
>  \  |\
>  71190 180% 199232 ±  4%  will-it-scale.per_thread_ops

looks like it's back up where it used to be.

So I guess we have numbers for the regression now. Thanks.

And that closes my biggest question for the new model, and with the
new organization that gets rid of the arch-specific asm separately
first and makes it a bit more legible that way, I guess I'll just Ack
the whole series.

 Linus


Re: [PATCH v2] hugetlb: allow to free gigantic pages regardless of the configuration

2019-02-13 Thread Dave Hansen
> -#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || 
> defined(CONFIG_CMA)
> +#ifdef CONFIG_COMPACTION_CORE
>  static __init int gigantic_pages_init(void)
>  {
>   /* With compaction or CMA we can allocate gigantic pages at runtime */
> diff --git a/fs/Kconfig b/fs/Kconfig
> index ac474a61be37..8fecd3ea5563 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -207,8 +207,9 @@ config HUGETLB_PAGE
>  config MEMFD_CREATE
>   def_bool TMPFS || HUGETLBFS
>  
> -config ARCH_HAS_GIGANTIC_PAGE
> +config COMPACTION_CORE
>   bool
> + default y if (MEMORY_ISOLATION && MIGRATION) || CMA

This takes a hard dependency (#if) and turns it into a Kconfig *default*
that can be overridden.  That seems like trouble.

Shouldn't it be:

config COMPACTION_CORE
def_bool y
depends on (MEMORY_ISOLATION && MIGRATION) || CMA

?


[PATCH v2] hugetlb: allow to free gigantic pages regardless of the configuration

2019-02-13 Thread Alexandre Ghiti
On systems without CMA or (MEMORY_ISOLATION && COMPACTION) activated but
that support gigantic pages, boottime reserved gigantic pages cannot be
freed at all. This patch simply makes it possible to hand those pages
back to the memory allocator.

This patch also renames:

- the triplet CMA or (MEMORY_ISOLATION && COMPACTION) into COMPACTION_CORE,
and gets rid of all use of it in architecture specific code (and then
removes ARCH_HAS_GIGANTIC_PAGE config).
- gigantic_page_supported to make it more accurate: this value being false
does not mean that the system cannot use gigantic pages; it just means that
runtime allocation of gigantic pages is not supported, and one can still
allocate boottime gigantic pages if the architecture supports it.

Signed-off-by: Alexandre Ghiti 
---

Changes in v2 as suggested by Vlastimil Babka:
- Get rid of ARCH_HAS_GIGANTIC_PAGE
- Get rid of architecture specific gigantic_page_supported
- Factorize CMA or (MEMORY_ISOLATION && COMPACTION) into COMPACTION_CORE

Compiles on all arches and validated on riscv.

 arch/arm64/Kconfig   |  1 -
 arch/arm64/include/asm/hugetlb.h |  4 --
 arch/powerpc/include/asm/book3s/64/hugetlb.h |  7 
 arch/powerpc/platforms/Kconfig.cputype   |  1 -
 arch/s390/Kconfig|  1 -
 arch/s390/include/asm/hugetlb.h  |  3 --
 arch/x86/Kconfig |  1 -
 arch/x86/include/asm/hugetlb.h   |  4 --
 arch/x86/mm/hugetlbpage.c|  2 +-
 fs/Kconfig   |  3 +-
 include/linux/gfp.h  |  4 +-
 mm/hugetlb.c | 44 +++-
 mm/page_alloc.c  |  7 ++--
 13 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d366127..6c778046b9f7 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -18,7 +18,6 @@ config ARM64
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index fb6609875455..59893e766824 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -65,8 +65,4 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, 
unsigned long addr,
 
 #include 
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
-
 #endif /* __ASM_HUGETLB_H */
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h 
b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index 5b0177733994..d04a0bcc2f1c 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -32,13 +32,6 @@ static inline int hstate_get_psize(struct hstate *hstate)
}
 }
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void)
-{
-   return true;
-}
-#endif
-
 /* hugepd entry valid bit */
 #define HUGEPD_VAL_BITS(0x8000UL)
 
diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 8c7464c3f27f..3e629dfb5efa 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -319,7 +319,6 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 config PPC_RADIX_MMU
bool "Radix MMU Support"
depends on PPC_BOOK3S_64
-   select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
default y
help
  Enable support for the Power ISA 3.0 Radix style MMU. Currently this
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index ed554b09eb3f..556860f290e9 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -69,7 +69,6 @@ config S390
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
-   select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
select ARCH_HAS_KCOV
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 2d1afa58a4b6..bd191560efcf 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -116,7 +116,4 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t 
newprot)
return pte_modify(pte, newprot);
 }
 
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static inline bool gigantic_page_supported(void) { return true; }
-#endif
 #endif /* _ASM_S390_HUGETLB_H */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 68261430fe6e..2fd983e2b2f6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,7 +23,6 @@ config X86_64

Re: [PATCH] hugetlb: allow to free gigantic pages regardless of the configuration

2019-02-13 Thread Alex Ghiti



On 2/13/19 6:27 AM, Vlastimil Babka wrote:

On 1/17/19 7:39 PM, Alexandre Ghiti wrote:

From: Alexandre Ghiti 

On systems without CMA or (MEMORY_ISOLATION && COMPACTION) activated but
that support gigantic pages, boottime reserved gigantic pages can not be
freed at all. This patch simply enables the possibility to hand back
those pages to memory allocator.

This commit then renames gigantic_page_supported and
ARCH_HAS_GIGANTIC_PAGE to make them more accurate. Indeed, those values
being false does not mean that the system cannot use gigantic pages: it
just means that runtime allocation of gigantic pages is not supported,
one can still allocate boottime gigantic pages if the architecture supports
it.

Signed-off-by: Alexandre Ghiti 

I'm fine with the change, but wonder if this can be structured better in a way
which would remove the duplicated "if (MEMORY_ISOLATION && COMPACTION) || CMA"
from all arches, as well as the duplicated
gigantic_page_runtime_allocation_supported()



Yeah, totally, we can factorize more than what I did. I prepared a v2 of
this patch that does exactly that: remove the triplet from arch specific
code and the duplicated gigantic_page_runtime_allocation_supported.



something like:

- "select ARCH_HAS_GIGANTIC_PAGE" has no conditions, it just says the arch can
support them either at boottime or runtime (but runtime is usable only if other
conditions are met)



And the v2 gets rid of ARCH_HAS_GIGANTIC_PAGE entirely, since an arch
does not need it to advertise that it supports gigantic pages. Actually,
when selected, it really just means that an arch has the means to
allocate gigantic pages at runtime: it is equivalent to
(MEMORY_ISOLATION && COMPACTION) || CMA.



- gigantic_page_runtime_allocation_supported() is a function that returns true
if ARCH_HAS_GIGANTIC_PAGE && ((MEMORY_ISOLATION && COMPACTION) || CMA) and
there's a single instance, not per-arch.
- code for freeing gigantic pages can probably still be conditional on
ARCH_HAS_GIGANTIC_PAGE
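
For illustration, that single generic helper could be as simple as the
following sketch (assuming the Kconfig symbols keep their current names):

static inline bool gigantic_page_runtime_allocation_supported(void)
{
	if (!IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE))
		return false;
	/* runtime allocation needs alloc_contig_range(), i.e. one of: */
	return IS_ENABLED(CONFIG_CMA) ||
	       (IS_ENABLED(CONFIG_MEMORY_ISOLATION) &&
		IS_ENABLED(CONFIG_COMPACTION));
}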

BTW I also wanted to do something about the "(MEMORY_ISOLATION && COMPACTION) ||
CMA" ugliness itself, i.e. put the common parts behind some new kconfig
(COMPACTION_CORE ?) and expose it better to users, but I can take a stab at that
once the above part is settled.
Vlastimil



I'll send the v2 right away; if you can take a look, Vlastimil, that would
be great.

Note that Andrew already picked this patch up in his tree; I'm not sure how to
proceed.


Thanks for your remarks !


Alex



Re: [PATCH 06/12] dma-mapping: improve selection of dma_declare_coherent availability

2019-02-13 Thread Rob Herring
On Wed, Feb 13, 2019 at 12:24 PM Christoph Hellwig  wrote:
>
> On Tue, Feb 12, 2019 at 02:40:23PM -0600, Rob Herring wrote:
> > > diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
> > > index 3607fd2810e4..f8c66a9472a4 100644
> > > --- a/drivers/of/Kconfig
> > > +++ b/drivers/of/Kconfig
> > > @@ -43,6 +43,7 @@ config OF_FLATTREE
> > >
> > >  config OF_EARLY_FLATTREE
> > > bool
> > > +   select DMA_DECLARE_COHERENT
> >
> > Is selecting DMA_DECLARE_COHERENT okay on UML? We run the unittests with 
> > UML.
>
> No, that will fail with undefined references to memunmap.
>
> I guess this needs to be
>
> select DMA_DECLARE_COHERENT if HAS_DMA
>
> > Maybe we should just get rid of OF_RESERVED_MEM. If we support booting
> > from DT, then it should always be enabled anyways.
>
> Fine with me.  Do you want me to respin the series to just remove
> it?

Either now or it can wait. I don't want to hold this up any.

Rob


Re: [PATCH 06/12] dma-mapping: improve selection of dma_declare_coherent availability

2019-02-13 Thread Christoph Hellwig
On Tue, Feb 12, 2019 at 02:40:23PM -0600, Rob Herring wrote:
> > diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
> > index 3607fd2810e4..f8c66a9472a4 100644
> > --- a/drivers/of/Kconfig
> > +++ b/drivers/of/Kconfig
> > @@ -43,6 +43,7 @@ config OF_FLATTREE
> >
> >  config OF_EARLY_FLATTREE
> > bool
> > +   select DMA_DECLARE_COHERENT
> 
> Is selecting DMA_DECLARE_COHERENT okay on UML? We run the unittests with UML.

No, that will fail with undefined references to memunmap.

I guess this needs to be

select DMA_DECLARE_COHERENT if HAS_DMA

> Maybe we should just get rid of OF_RESERVED_MEM. If we support booting
> from DT, then it should always be enabled anyways.

Fine with me.  Do you want me to respin the series to just remove
it?


Re: [PATCH 01/12] mfd/sm501: depend on HAS_DMA

2019-02-13 Thread Christoph Hellwig
On Wed, Feb 13, 2019 at 07:29:31AM +, Lee Jones wrote:
> I would normally have taken this, but I fear it will conflict with
> [PATCH 06/12].  For that reason, just take my:
> 
>   Acked-by: Lee Jones 

Yes, I'll need it for the later patches in the series.

Thanks for the review.


Re: [PATCH 03/12] of: mark early_init_dt_alloc_reserved_memory_arch static

2019-02-13 Thread Christoph Hellwig
On Tue, Feb 12, 2019 at 02:24:19PM -0600, Rob Herring wrote:
> Looks like this one isn't a dependency, so I can take it if you want.

Sure, please go ahead.


[PATCH 11/11] s390: don't redefined the HAS_IOMEM symbol

2019-02-13 Thread Christoph Hellwig
Rely on the common definition instead.

Signed-off-by: Christoph Hellwig 
---
 arch/s390/Kconfig | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9a25e19364f5..0f62e33ffcb2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -159,6 +159,7 @@ config S390
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE   if PCI
select NEED_SG_DMA_LENGTH   if PCI
+   select NO_IOMEM if !PCI
select OLD_SIGACTION
select OLD_SIGSUSPEND3
select PCI_DOMAINS  if PCI
@@ -708,9 +709,6 @@ config PCI_NR_FUNCTIONS
 
 endif  # PCI
 
-config HAS_IOMEM
-   def_bool PCI
-
 config CHSC_SCH
def_tristate m
prompt "Support for CHSC subchannels"
-- 
2.20.1



[PATCH 10/11] lib: consolidate the GENERIC_HWEIGHT symbol

2019-02-13 Thread Christoph Hellwig
Introduce a new ARCH_HAS_HWEIGHT symbol for alpha and ia64, and just
default to the generic version otherwise.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/Kconfig  | 5 +
 arch/arc/Kconfig| 3 ---
 arch/arm/Kconfig| 4 
 arch/arm64/Kconfig  | 3 ---
 arch/c6x/Kconfig| 3 ---
 arch/csky/Kconfig   | 3 ---
 arch/h8300/Kconfig  | 3 ---
 arch/hexagon/Kconfig| 3 ---
 arch/ia64/Kconfig   | 1 +
 arch/m68k/Kconfig   | 4 
 arch/microblaze/Kconfig | 3 ---
 arch/mips/Kconfig   | 4 
 arch/nds32/Kconfig  | 3 ---
 arch/nios2/Kconfig  | 3 ---
 arch/openrisc/Kconfig   | 3 ---
 arch/parisc/Kconfig | 4 
 arch/powerpc/Kconfig| 4 
 arch/riscv/Kconfig  | 3 ---
 arch/s390/Kconfig   | 3 ---
 arch/sh/Kconfig | 3 ---
 arch/sparc/Kconfig  | 4 
 arch/unicore32/Kconfig  | 3 ---
 arch/x86/Kconfig| 3 ---
 arch/x86/um/Kconfig | 3 ---
 arch/xtensa/Kconfig | 3 ---
 lib/Kconfig | 7 +++
 26 files changed, 9 insertions(+), 79 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 65b5514e5a7f..a549c53563e8 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -2,6 +2,7 @@
 config ALPHA
bool
default y
+   select ARCH_HAS_HWEIGHT if ALPHA_EV67
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_NO_PREEMPT
@@ -446,10 +447,6 @@ config ALPHA_IRONGATE
depends on ALPHA_NAUTILUS
default y
 
-config GENERIC_HWEIGHT
-   bool
-   default y if !ALPHA_EV67
-
 config ALPHA_AVANTI
bool
depends on ALPHA_XL || ALPHA_AVANTI_CH
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 6476404b98b8..8bf4c0f7cc1d 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -72,9 +72,6 @@ config MMU
 config NO_IOPORT_MAP
def_bool y
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
depends on ARC_MMU_V4
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index c230fb1e09ba..b47825767e3d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -182,10 +182,6 @@ config ARCH_HAS_BANDGAP
 config FIX_EARLYCON_MEM
def_bool y if MMU
 
-config GENERIC_HWEIGHT
-   bool
-   default y
-
 config ARCH_MAY_HAVE_PC_FDC
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7cc3334aba29..98c3776ccf6b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -237,9 +237,6 @@ config ILLEGAL_POINTER_VALUE
hex
default 0xdead
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 config ZONE_DMA32
def_bool y
 
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 19b145ef7d92..c439d2f46af0 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -28,9 +28,6 @@ config MMU
 config FPU
def_bool n
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 config C6X_BIG_KERNEL
bool "Build a big kernel"
help
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index c0a49cbd3df0..14a9905e99a4 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -84,9 +84,6 @@ config CPU_NO_USER_BKPT
  instruction exception.
  In kernel we parse the *regs->pc to determine whether to send SIGTRAP 
or not.
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 config MMU
def_bool y
 
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 4f5a1efab822..77ce104bb42e 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -27,9 +27,6 @@ config H8300
 config CPU_BIG_ENDIAN
def_bool y
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 config NO_IOPORT_MAP
def_bool y
 
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 7a01f4c5a4f6..fb2996063d5a 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -61,9 +61,6 @@ config MMU
 config GENERIC_IRQ_PROBE
def_bool y
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 menu "Machine selection"
 
 choice
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index fc44c046953e..c9eb106b1f4c 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -8,6 +8,7 @@ menu "Processor type and features"
 
 config IA64
bool
+   select ARCH_HAS_HWEIGHT
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ACPI if (!IA64_HP_SIM)
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 1bf6abaea604..91b150b6572c 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -38,10 +38,6 @@ config ARCH_HAS_ILOG2_U32
 config ARCH_HAS_ILOG2_U64
bool
 
-config GENERIC_HWEIGHT
-   bool
-   default y
-
 config TIME_LOW_RES
bool
default y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index d8907d6f969c..0185ac1f0268 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -71,9 +71,6 @@ config ARCH_HAS_ILOG2_U32
 config ARCH_HAS_ILOG2_U64
def_bool n
 
-config GENERIC_HWEIGHT
-   def_bool y
-
 

[PATCH 09/11] lib: consolidate the GENERIC_CSUM symbol

2019-02-13 Thread Christoph Hellwig
Add one definition to lib/Kconfig and let the architectures
select it if supported.

Signed-off-by: Christoph Hellwig 
---
 arch/arc/Kconfig| 4 +---
 arch/arm64/Kconfig  | 4 +---
 arch/csky/Kconfig   | 4 +---
 arch/h8300/Kconfig  | 4 +---
 arch/hexagon/Kconfig| 4 +---
 arch/m68k/Kconfig   | 3 ---
 arch/microblaze/Kconfig | 4 +---
 arch/mips/Kconfig   | 5 +
 arch/nds32/Kconfig  | 4 +---
 arch/nios2/Kconfig  | 4 +---
 arch/openrisc/Kconfig   | 6 +-
 arch/powerpc/Kconfig| 3 ---
 arch/riscv/Kconfig  | 4 +---
 arch/sh/Kconfig | 5 +
 arch/unicore32/Kconfig  | 3 ---
 lib/Kconfig | 3 +++
 16 files changed, 15 insertions(+), 49 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index e965383c05d7..6476404b98b8 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -20,6 +20,7 @@ config ARC
select GENERIC_CALIBRATE_DELAY
select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
select GENERIC_CLOCKEVENTS
+   select GENERIC_CSUM
select GENERIC_FIND_FIRST_BIT
# for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP
select GENERIC_IRQ_SHOW
@@ -59,9 +60,6 @@ config ARCH_HAS_CACHE_LINE_SIZE
 config SCHED_OMIT_FRAME_POINTER
def_bool y
 
-config GENERIC_CSUM
-   def_bool y
-
 config ARCH_DISCONTIGMEM_ENABLE
def_bool n
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 913b2ca7ec22..7cc3334aba29 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -91,6 +91,7 @@ config ARM64
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_CPU_AUTOPROBE
+   select GENERIC_CSUM
select GENERIC_EARLY_IOREMAP
select GENERIC_IDLE_POLL_SETUP
select GENERIC_IRQ_MULTI_HANDLER
@@ -239,9 +240,6 @@ config ILLEGAL_POINTER_VALUE
 config GENERIC_HWEIGHT
def_bool y
 
-config GENERIC_CSUM
-def_bool y
-
 config ZONE_DMA32
def_bool y
 
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 4085ba807e0c..c0a49cbd3df0 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -12,6 +12,7 @@ config CSKY
select HANDLE_DOMAIN_IRQ
select DW_APB_TIMER_OF
select GENERIC_CALIBRATE_DELAY
+   select GENERIC_CSUM
select GENERIC_LIB_ASHLDI3
select GENERIC_LIB_ASHRDI3
select GENERIC_LIB_LSHRDI3
@@ -83,9 +84,6 @@ config CPU_NO_USER_BKPT
  instruction exception.
  In kernel we parse the *regs->pc to determine whether to send SIGTRAP 
or not.
 
-config GENERIC_CSUM
-   def_bool y
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index ba33326e7c54..4f5a1efab822 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -9,6 +9,7 @@ config H8300
select GENERIC_CPU_DEVICES
select MODULES_USE_ELF_RELA
select GENERIC_CLOCKEVENTS
+   select GENERIC_CSUM
select CLKDEV_LOOKUP
select COMMON_CLK
select ARCH_WANT_FRAME_POINTERS
@@ -32,9 +33,6 @@ config GENERIC_HWEIGHT
 config NO_IOPORT_MAP
def_bool y
 
-config GENERIC_CSUM
-def_bool y
-
 config HZ
int
default 100
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 5eb4f48506b6..7a01f4c5a4f6 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -19,6 +19,7 @@ config HEXAGON
select HAVE_PERF_EVENTS
# GENERIC_ALLOCATOR is used by dma_alloc_coherent()
select GENERIC_ALLOCATOR
+   select GENERIC_CSUM
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
@@ -54,9 +55,6 @@ config EARLY_PRINTK
 config MMU
def_bool y
 
-config GENERIC_CSUM
-   def_bool y
-
 #
 # Use the generic interrupt handling code in kernel/irq/:
 #
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index ed03da5430d9..1bf6abaea604 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -42,9 +42,6 @@ config GENERIC_HWEIGHT
bool
default y
 
-config GENERIC_CSUM
-   bool
-
 config TIME_LOW_RES
bool
default y
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 65a44727a7a2..d8907d6f969c 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -16,6 +16,7 @@ config MICROBLAZE
select GENERIC_CALIBRATE_DELAY
select GENERIC_CLOCKEVENTS
select GENERIC_CPU_DEVICES
+   select GENERIC_CSUM
select GENERIC_IDLE_POLL_SETUP
select GENERIC_IRQ_PROBE
select GENERIC_IRQ_SHOW
@@ -73,9 +74,6 @@ config ARCH_HAS_ILOG2_U64
 config GENERIC_HWEIGHT
def_bool y
 
-config GENERIC_CSUM
-   def_bool y
-
 source "arch/microblaze/Kconfig.platform"
 
 menu "Processor type and features"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 530eaf950744..bd0c9be7e7cf 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -23,6 +23,7 @@ config MIPS

[PATCH 08/11] lib: consolidate the GENERIC_BUG symbol

2019-02-13 Thread Christoph Hellwig
And just let the architectures that want it select the symbol.
Same for GENERIC_BUG_RELATIVE_POINTERS.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/Kconfig |  5 +
 arch/arm64/Kconfig   | 10 ++
 arch/c6x/Kconfig |  5 +
 arch/hexagon/Kconfig |  5 +
 arch/parisc/Kconfig  |  6 +-
 arch/powerpc/Kconfig |  6 +-
 arch/riscv/Kconfig   | 10 ++
 arch/s390/Kconfig|  8 ++--
 arch/sh/Kconfig  |  5 +
 arch/x86/Kconfig | 10 ++
 lib/Kconfig  |  6 ++
 11 files changed, 20 insertions(+), 56 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 96780ab64a2e..c230fb1e09ba 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -36,6 +36,7 @@ config ARM
select GENERIC_ALLOCATOR
select GENERIC_ARCH_TOPOLOGY if ARM_CPU_TOPOLOGY
select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI
+   select GENERIC_BUG if BUG
select GENERIC_CALIBRATE_DELAY
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_CPU_AUTOPROBE
@@ -256,10 +257,6 @@ config PHYS_OFFSET
  Please provide the physical address corresponding to the
  location of main memory in your system.
 
-config GENERIC_BUG
-   def_bool y
-   depends on BUG
-
 config PGTABLE_LEVELS
int
default 3 if ARM_LPAE
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c39dac831f08..913b2ca7ec22 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -85,6 +85,8 @@ config ARM64
select FRAME_POINTER
select GENERIC_ALLOCATOR
select GENERIC_ARCH_TOPOLOGY
+   select GENERIC_BUG if BUG
+   select GENERIC_BUG_RELATIVE_POINTERS
select GENERIC_CALIBRATE_DELAY
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST
@@ -234,14 +236,6 @@ config ILLEGAL_POINTER_VALUE
hex
default 0xdead
 
-config GENERIC_BUG
-   def_bool y
-   depends on BUG
-
-config GENERIC_BUG_RELATIVE_POINTERS
-   def_bool y
-   depends on GENERIC_BUG
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index d5f382830f49..19b145ef7d92 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -10,6 +10,7 @@ config C6X
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select CLKDEV_LOOKUP
select GENERIC_ATOMIC64
+   select GENERIC_BUG if BUG
select GENERIC_CALIBRATE_DELAY
select GENERIC_IRQ_SHOW
select HAVE_ARCH_TRACEHOOK
@@ -30,10 +31,6 @@ config FPU
 config GENERIC_HWEIGHT
def_bool y
 
-config GENERIC_BUG
-   def_bool y
-   depends on BUG
-
 config C6X_BIG_KERNEL
bool "Build a big kernel"
help
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 425217c98a77..5eb4f48506b6 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -14,6 +14,7 @@ config HEXAGON
# select GPIOLIB
# select HAVE_CLK
# select GENERIC_PENDING_IRQ if SMP
+   select GENERIC_BUG if BUG
select GENERIC_ATOMIC64
select HAVE_PERF_EVENTS
# GENERIC_ALLOCATOR is used by dma_alloc_coherent()
@@ -65,10 +66,6 @@ config GENERIC_IRQ_PROBE
 config GENERIC_HWEIGHT
def_bool y
 
-config GENERIC_BUG
-   def_bool y
-   depends on BUG
-
 menu "Machine selection"
 
 choice
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fcbc67b6b830..42282b8e086d 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -27,6 +27,7 @@ config PARISC
select HAVE_KERNEL_LZO
select HAVE_KERNEL_XZ
select GENERIC_ATOMIC64 if !64BIT
+   select GENERIC_BUG if BUG
select GENERIC_CALIBRATE_DELAY
select GENERIC_IRQ_PROBE
select GENERIC_PCI_IOMAP
@@ -85,11 +86,6 @@ config ARCH_HAS_ILOG2_U64
bool
default n
 
-config GENERIC_BUG
-   bool
-   default y
-   depends on BUG
-
 config GENERIC_HWEIGHT
bool
default y
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8b9f3639555f..1684017fa496 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -142,6 +142,7 @@ config PPC
select EDAC_ATOMIC_SCRUB
select EDAC_SUPPORT
select GENERIC_ATOMIC64 if PPC32
+   select GENERIC_BUG  if BUG
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCASTif SMP
select GENERIC_CMOS_UPDATE
@@ -283,11 +284,6 @@ config AUDIT_ARCH
bool
default y
 
-config GENERIC_BUG
-   bool
-   default y
-   depends on BUG
-
 config SYS_SUPPORTS_APM_EMULATION
default y if PMAC_APM_EMU
bool
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 732614eb3683..c410ed896567 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -19,6 +19,8 @@ config RISCV
select ARCH_WANT_FRAME_POINTERS
select CLONE_BACKWARDS
select COMMON_CLK
+   

[PATCH 07/11] init: consolidate the GENERIC_CALIBRATE_DELAY symbol

2019-02-13 Thread Christoph Hellwig
Add one definition to init/Kconfig and let the architectures select it
if supported.  The only complication is xtensa, where it is a
user-visible option - we introduce an xtensa-specific symbol instead to work
around this.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/Kconfig  | 5 +
 arch/arc/Kconfig| 4 +---
 arch/arm/Kconfig| 5 +
 arch/arm64/Kconfig  | 4 +---
 arch/c6x/Kconfig| 4 +---
 arch/csky/Kconfig   | 4 +---
 arch/ia64/Kconfig   | 5 +
 arch/m68k/Kconfig   | 5 +
 arch/microblaze/Kconfig | 4 +---
 arch/mips/Kconfig   | 5 +
 arch/nds32/Kconfig  | 4 +---
 arch/nios2/Kconfig  | 4 +---
 arch/parisc/Kconfig | 5 +
 arch/riscv/Kconfig  | 4 +---
 arch/sh/Kconfig | 3 ---
 arch/sparc/Kconfig  | 5 +
 arch/um/Kconfig | 5 +
 arch/unicore32/Kconfig  | 4 +---
 arch/x86/Kconfig| 4 +---
 arch/xtensa/Kconfig | 3 ++-
 init/Kconfig| 3 +++
 21 files changed, 23 insertions(+), 66 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 474202d89b25..65b5514e5a7f 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -25,6 +25,7 @@ config ALPHA
select ARCH_WANT_IPC_PARSE_VERSION
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select AUDIT_ARCH
+   select GENERIC_CALIBRATE_DELAY
select GENERIC_CLOCKEVENTS
select GENERIC_CPU_VULNERABILITIES
select GENERIC_SMP_IDLE_THREAD
@@ -58,10 +59,6 @@ config ARCH_HAS_ILOG2_U64
bool
default n
 
-config GENERIC_CALIBRATE_DELAY
-   bool
-   default y
-
 config ZONE_DMA
bool
default y
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index b1d6f297e448..e965383c05d7 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -17,6 +17,7 @@ config ARC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
select COMMON_CLK
+   select GENERIC_CALIBRATE_DELAY
select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
select GENERIC_CLOCKEVENTS
select GENERIC_FIND_FIRST_BIT
@@ -73,9 +74,6 @@ config MMU
 config NO_IOPORT_MAP
def_bool y
 
-config GENERIC_CALIBRATE_DELAY
-   def_bool y
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d6e7713a71ae..96780ab64a2e 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -36,6 +36,7 @@ config ARM
select GENERIC_ALLOCATOR
select GENERIC_ARCH_TOPOLOGY if ARM_CPU_TOPOLOGY
select GENERIC_ATOMIC64 if CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI
+   select GENERIC_CALIBRATE_DELAY
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select GENERIC_CPU_AUTOPROBE
select GENERIC_EARLY_IOREMAP
@@ -184,10 +185,6 @@ config GENERIC_HWEIGHT
bool
default y
 
-config GENERIC_CALIBRATE_DELAY
-   bool
-   default y
-
 config ARCH_MAY_HAVE_PC_FDC
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index f3d3e48aff26..c39dac831f08 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -85,6 +85,7 @@ config ARM64
select FRAME_POINTER
select GENERIC_ALLOCATOR
select GENERIC_ARCH_TOPOLOGY
+   select GENERIC_CALIBRATE_DELAY
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_CPU_AUTOPROBE
@@ -247,9 +248,6 @@ config GENERIC_HWEIGHT
 config GENERIC_CSUM
 def_bool y
 
-config GENERIC_CALIBRATE_DELAY
-   def_bool y
-
 config ZONE_DMA32
def_bool y
 
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index f11465554ecf..d5f382830f49 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -10,6 +10,7 @@ config C6X
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
select CLKDEV_LOOKUP
select GENERIC_ATOMIC64
+   select GENERIC_CALIBRATE_DELAY
select GENERIC_IRQ_SHOW
select HAVE_ARCH_TRACEHOOK
select SPARSE_IRQ
@@ -26,9 +27,6 @@ config MMU
 config FPU
def_bool n
 
-config GENERIC_CALIBRATE_DELAY
-   def_bool y
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 3c3de7ac95bf..4085ba807e0c 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -11,6 +11,7 @@ config CSKY
select IRQ_DOMAIN
select HANDLE_DOMAIN_IRQ
select DW_APB_TIMER_OF
+   select GENERIC_CALIBRATE_DELAY
select GENERIC_LIB_ASHLDI3
select GENERIC_LIB_ASHRDI3
select GENERIC_LIB_LSHRDI3
@@ -82,9 +83,6 @@ config CPU_NO_USER_BKPT
  instruction exception.
  In kernel we parse the *regs->pc to determine whether to send SIGTRAP 
or not.
 
-config GENERIC_CALIBRATE_DELAY
-   def_bool y
-
 config GENERIC_CSUM
def_bool y
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 8c14b669d194..fc44c046953e 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -34,6 +34,7 @@ config IA64
select 

[PATCH 06/11] lockdep: consolidate the LOCKDEP_SUPPORT symbol

2019-02-13 Thread Christoph Hellwig
Add one definition to lib/Kconfig.debug and let the architectures
select it if supported.

Signed-off-by: Christoph Hellwig 
---
 arch/arc/Kconfig| 4 +---
 arch/arm/Kconfig| 5 +
 arch/arm64/Kconfig  | 4 +---
 arch/hexagon/Kconfig| 4 +---
 arch/microblaze/Kconfig | 4 +---
 arch/mips/Kconfig   | 5 +
 arch/openrisc/Kconfig   | 4 +---
 arch/powerpc/Kconfig| 5 +
 arch/s390/Kconfig   | 4 +---
 arch/sh/Kconfig | 4 +---
 arch/sparc/Kconfig  | 5 +
 arch/um/Kconfig | 5 +
 arch/unicore32/Kconfig  | 4 +---
 arch/x86/Kconfig| 4 +---
 arch/xtensa/Kconfig | 4 +---
 lib/Kconfig.debug   | 3 +++
 16 files changed, 18 insertions(+), 50 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 065fbd55dcc4..b1d6f297e448 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -42,6 +42,7 @@ config ARC
select HANDLE_DOMAIN_IRQ
select IRQ_DOMAIN
select MODULES_USE_ELF_RELA
+   select LOCKDEP_SUPPORT
select OF
select OF_EARLY_FLATTREE
select OF_RESERVED_MEM
@@ -54,9 +55,6 @@ config ARC
 config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
 
-config LOCKDEP_SUPPORT
-   def_bool y
-
 config SCHED_OMIT_FRAME_POINTER
def_bool y
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1995e1b24506..d6e7713a71ae 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -98,6 +98,7 @@ config ARM
select HAVE_UID16
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select IRQ_FORCED_THREADING
+   select LOCKDEP_SUPPORT
select MODULES_USE_ELF_REL
select NEED_DMA_MAP_STATE
select OF_EARLY_FLATTREE if OF
@@ -167,10 +168,6 @@ config NO_IOPORT_MAP
 config SBUS
bool
 
-config LOCKDEP_SUPPORT
-   bool
-   default y
-
 config ARCH_HAS_ILOG2_U32
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index ecbe481ce064..f3d3e48aff26 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -157,6 +157,7 @@ config ARM64
select IOMMU_DMA if IOMMU_SUPPORT
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
+   select LOCKDEP_SUPPORT
select MODULES_USE_ELF_RELA
select MULTI_IRQ_HANDLER
select NEED_DMA_MAP_STATE
@@ -232,9 +233,6 @@ config ILLEGAL_POINTER_VALUE
hex
default 0xdead
 
-config LOCKDEP_SUPPORT
-   def_bool y
-
 config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index b6c3111ec5f9..425217c98a77 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -21,6 +21,7 @@ config HEXAGON
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
+   select LOCKDEP_SUPPORT
select ARCH_DISCARD_MEMBLOCK
select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
@@ -46,9 +47,6 @@ config HEXAGON_PHYS_OFFSET
 config FRAME_POINTER
def_bool y
 
-config LOCKDEP_SUPPORT
-   def_bool y
-
 config EARLY_PRINTK
def_bool y
 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 238f8b410331..1989ba1d1798 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -31,6 +31,7 @@ config MICROBLAZE
select HAVE_OPROFILE
select HAVE_PCI
select IRQ_DOMAIN
+   select LOCKDEP_SUPPORT
select XILINX_INTC
select MODULES_USE_ELF_RELA
select OF
@@ -77,9 +78,6 @@ config GENERIC_CALIBRATE_DELAY
 config GENERIC_CSUM
def_bool y
 
-config LOCKDEP_SUPPORT
-   def_bool y
-
 source "arch/microblaze/Kconfig.platform"
 
 menu "Processor type and features"
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index d1c89635a459..88792685687c 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -76,6 +76,7 @@ config MIPS
select HAVE_VIRT_CPU_ACCOUNTING_GEN if 64BIT || !SMP
select IRQ_FORCED_THREADING
select ISA if EISA
+   select LOCKDEP_SUPPORT
select MODULES_USE_ELF_RELA if MODULES && 64BIT
select MODULES_USE_ELF_REL if MODULES
select PERF_USE_VMALLOC
@@ -3051,10 +3052,6 @@ endchoice
 
 endmenu
 
-config LOCKDEP_SUPPORT
-   bool
-   default y
-
 config HAVE_LATENCYTOP_SUPPORT
bool
default y
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 514787c0c469..6cb7632fa5b0 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -28,6 +28,7 @@ config OPENRISC
select GENERIC_SMP_IDLE_THREAD
select MODULES_USE_ELF_RELA
select HAVE_DEBUG_STACKOVERFLOW
+   select LOCKDEP_SUPPORT
select OR1K_PIC
select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
select ARCH_USE_QUEUED_SPINLOCKS
@@ -55,9 +56,6 @@ config NO_IOPORT_MAP
 config GENERIC_CSUM
 def_bool y
 
-config LOCKDEP_SUPPORT
-   def_bool  y
-
 menu "Processor type and features"
 
 choice
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig

[PATCH 05/11] tracing: consolidate the TRACE_IRQFLAGS_SUPPORT symbol

2019-02-13 Thread Christoph Hellwig
Add one definition to kernel/trace/Kconfig and let the architectures
select it if supported.

Signed-off-by: Christoph Hellwig 
---
 arch/arc/Kconfig  | 4 +---
 arch/arm/Kconfig  | 5 +
 arch/arm64/Kconfig| 4 +---
 arch/csky/Kconfig | 4 +---
 arch/hexagon/Kconfig  | 4 +---
 arch/microblaze/Kconfig   | 1 +
 arch/microblaze/Kconfig.debug | 2 --
 arch/mips/Kconfig | 1 +
 arch/mips/Kconfig.debug   | 4 
 arch/nds32/Kconfig| 4 +---
 arch/nios2/Kconfig| 4 +---
 arch/nios2/Kconfig.debug  | 3 ---
 arch/openrisc/Kconfig | 4 +---
 arch/parisc/Kconfig   | 1 +
 arch/parisc/Kconfig.debug | 2 --
 arch/powerpc/Kconfig  | 5 +
 arch/riscv/Kconfig| 4 +---
 arch/s390/Kconfig | 1 +
 arch/s390/Kconfig.debug   | 3 ---
 arch/sh/Kconfig   | 1 +
 arch/sh/Kconfig.debug | 3 ---
 arch/sparc/Kconfig| 1 +
 arch/sparc/Kconfig.debug  | 4 
 arch/um/Kconfig   | 5 +
 arch/x86/Kconfig  | 1 +
 arch/x86/Kconfig.debug| 3 ---
 arch/xtensa/Kconfig   | 4 +---
 kernel/trace/Kconfig  | 3 +++
 28 files changed, 22 insertions(+), 63 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index e8e3776fc5fa..065fbd55dcc4 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -49,13 +49,11 @@ config ARC
select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
select STACKTRACE_SUPPORT
select STACKTRACE
+   select TRACE_IRQFLAGS_SUPPORT
 
 config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
 
-config TRACE_IRQFLAGS_SUPPORT
-   def_bool y
-
 config LOCKDEP_SUPPORT
def_bool y
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 68a891f3ffa0..1995e1b24506 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -111,6 +111,7 @@ config ARM
select RWSEM_XCHGADD_ALGORITHM
select STACKTRACE_SUPPORT
select SYS_SUPPORTS_APM_EMULATION
+   select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M
# Above selects are sorted alphabetically; please add new ones
# according to that.  Thanks.
help
@@ -170,10 +171,6 @@ config LOCKDEP_SUPPORT
bool
default y
 
-config TRACE_IRQFLAGS_SUPPORT
-   bool
-   default !CPU_V7M
-
 config ARCH_HAS_ILOG2_U32
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a6a0bb868369..ecbe481ce064 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -176,6 +176,7 @@ config ARM64
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
+   select TRACE_IRQFLAGS_SUPPORT
help
  ARM 64-bit (AArch64) Linux support.
 
@@ -234,9 +235,6 @@ config ILLEGAL_POINTER_VALUE
 config LOCKDEP_SUPPORT
def_bool y
 
-config TRACE_IRQFLAGS_SUPPORT
-   def_bool y
-
 config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 90279a11fcf7..3c3de7ac95bf 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -46,6 +46,7 @@ config CSKY
select PERF_USE_VMALLOC if CPU_CK610
select RTC_LIB
select TIMER_OF
+   select TRACE_IRQFLAGS_SUPPORT
select USB_ARCH_HAS_EHCI
select USB_ARCH_HAS_OHCI
 
@@ -99,9 +100,6 @@ config STACKTRACE_SUPPORT
 config TIME_LOW_RES
def_bool y
 
-config TRACE_IRQFLAGS_SUPPORT
-   def_bool y
-
 config CPU_TLB_SIZE
int
default "128"   if (CPU_CK610 || CPU_CK807 || CPU_CK810)
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 615693b62ea0..b6c3111ec5f9 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -33,6 +33,7 @@ config HEXAGON
select MODULES_USE_ELF_RELA
select GENERIC_CPU_DEVICES
select RWSEM_XCHGADD_ALGORITHM
+   select TRACE_IRQFLAGS_SUPPORT
---help---
  Qualcomm Hexagon is a processor architecture designed for high
  performance and low power across a wide variety of applications.
@@ -54,9 +55,6 @@ config EARLY_PRINTK
 config MMU
def_bool y
 
-config TRACE_IRQFLAGS_SUPPORT
-   def_bool y
-
 config GENERIC_CSUM
def_bool y
 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index ecccf8651caa..238f8b410331 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -38,6 +38,7 @@ config MICROBLAZE
select PCI_DOMAINS_GENERIC if PCI
select PCI_SYSCALL if PCI
select STACKTRACE_SUPPORT
+   select TRACE_IRQFLAGS_SUPPORT
select TRACING_SUPPORT
select VIRT_TO_BUS
select CPU_NO_EFFICIENT_FFS
diff --git a/arch/microblaze/Kconfig.debug b/arch/microblaze/Kconfig.debug
index dc2e3c45e8a2..617df4f48a5d 100644
--- a/arch/microblaze/Kconfig.debug
+++ b/arch/microblaze/Kconfig.debug
@@ -1,5 +1,3 @@
 # For a description of the syntax of this configuration file,
 # see 

[PATCH 04/11] tracing: consolidate the STACKTRACE_SUPPORT symbol

2019-02-13 Thread Christoph Hellwig
Add one definition to kernel/trace/Kconfig and let the architectures
select it if supported.

Signed-off-by: Christoph Hellwig 
---
 arch/arc/Kconfig| 6 ++
 arch/arm/Kconfig| 5 +
 arch/arm64/Kconfig  | 4 +---
 arch/hexagon/Kconfig| 5 +
 arch/ia64/Kconfig   | 4 +---
 arch/microblaze/Kconfig | 4 +---
 arch/mips/Kconfig   | 5 +
 arch/nds32/Kconfig  | 4 +---
 arch/openrisc/Kconfig   | 4 +---
 arch/parisc/Kconfig | 4 +---
 arch/powerpc/Kconfig| 5 +
 arch/riscv/Kconfig  | 4 +---
 arch/s390/Kconfig   | 4 +---
 arch/sh/Kconfig | 4 +---
 arch/sparc/Kconfig  | 5 +
 arch/um/Kconfig | 7 ++-
 arch/unicore32/Kconfig  | 4 +---
 arch/x86/Kconfig| 4 +---
 arch/xtensa/Kconfig | 4 +---
 kernel/trace/Kconfig| 3 +++
 20 files changed, 24 insertions(+), 65 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index c0dd229af534..e8e3776fc5fa 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -47,6 +47,8 @@ config ARC
select OF_RESERVED_MEM
select PCI_SYSCALL if PCI
select PERF_USE_VMALLOC if ARC_CACHE_VIPT_ALIASING
+   select STACKTRACE_SUPPORT
+   select STACKTRACE
 
 config ARCH_HAS_CACHE_LINE_SIZE
def_bool y
@@ -81,10 +83,6 @@ config GENERIC_CALIBRATE_DELAY
 config GENERIC_HWEIGHT
def_bool y
 
-config STACKTRACE_SUPPORT
-   def_bool y
-   select STACKTRACE
-
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
def_bool y
depends on ARC_MMU_V4
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 1ed4c0560b50..68a891f3ffa0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -109,6 +109,7 @@ config ARM
select REFCOUNT_FULL
select RTC_LIB
select RWSEM_XCHGADD_ALGORITHM
+   select STACKTRACE_SUPPORT
select SYS_SUPPORTS_APM_EMULATION
# Above selects are sorted alphabetically; please add new ones
# according to that.  Thanks.
@@ -165,10 +166,6 @@ config NO_IOPORT_MAP
 config SBUS
bool
 
-config STACKTRACE_SUPPORT
-   bool
-   default y
-
 config LOCKDEP_SUPPORT
bool
default y
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 2f8da4e18a1e..a6a0bb868369 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -172,6 +172,7 @@ config ARM64
select REFCOUNT_FULL
select RWSEM_XCHGADD_ALGORITHM
select SPARSE_IRQ
+   select STACKTRACE_SUPPORT
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
@@ -226,9 +227,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MAX
 config NO_IOPORT_MAP
def_bool y if !PCI
 
-config STACKTRACE_SUPPORT
-   def_bool y
-
 config ILLEGAL_POINTER_VALUE
hex
default 0xdead
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 49f364ea18d4..615693b62ea0 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -26,6 +26,7 @@ config HEXAGON
select NO_IOPORT_MAP
select GENERIC_IOMAP
select GENERIC_SMP_IDLE_THREAD
+   select STACKTRACE
select STACKTRACE_SUPPORT
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST
@@ -68,10 +69,6 @@ config GENERIC_IRQ_PROBE
 config GENERIC_HWEIGHT
def_bool y
 
-config STACKTRACE_SUPPORT
-   def_bool y
-   select STACKTRACE
-
 config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 99a629f05de4..8c14b669d194 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -54,6 +54,7 @@ config IA64
select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
select RWSEM_XCHGADD_ALGORITHM
+   select STACKTRACE_SUPPORT
default y
help
  The Itanium Processor Family is Intel's 64-bit successor to
@@ -78,9 +79,6 @@ config MMU
bool
default y
 
-config STACKTRACE_SUPPORT
-   def_bool y
-
 config GENERIC_LOCKBREAK
def_bool n
 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index d459c70d9a66..ecccf8651caa 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -37,6 +37,7 @@ config MICROBLAZE
select OF_EARLY_FLATTREE
select PCI_DOMAINS_GENERIC if PCI
select PCI_SYSCALL if PCI
+   select STACKTRACE_SUPPORT
select TRACING_SUPPORT
select VIRT_TO_BUS
select CPU_NO_EFFICIENT_FFS
@@ -75,9 +76,6 @@ config GENERIC_CALIBRATE_DELAY
 config GENERIC_CSUM
def_bool y
 
-config STACKTRACE_SUPPORT
-   def_bool y
-
 config LOCKDEP_SUPPORT
def_bool y
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 6ad1882a8db9..0645e7b96493 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -80,6 +80,7 @@ config MIPS
select MODULES_USE_ELF_REL if MODULES
select PERF_USE_VMALLOC
select RTC_LIB
+   select STACKTRACE_SUPPORT
select SYSCTL_EXCEPTION_TRACE
select VIRT_TO_BUS
 

[PATCH 03/11] kernel/locks: consolidate RWSEM_GENERIC_* options

2019-02-13 Thread Christoph Hellwig
Introduce one central definition of RWSEM_XCHGADD_ALGORITHM and
RWSEM_GENERIC_SPINLOCK in kernel/Kconfig.locks and let architectures
select RWSEM_XCHGADD_ALGORITHM if they want it, otherwise default to
the spinlock version.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/Kconfig  | 8 +---
 arch/arc/Kconfig| 3 ---
 arch/arm/Kconfig| 5 +
 arch/arm64/Kconfig  | 4 +---
 arch/c6x/Kconfig| 3 ---
 arch/csky/Kconfig   | 3 ---
 arch/h8300/Kconfig  | 3 ---
 arch/hexagon/Kconfig| 7 +--
 arch/ia64/Kconfig   | 5 +
 arch/m68k/Kconfig   | 7 ---
 arch/microblaze/Kconfig | 6 --
 arch/mips/Kconfig   | 7 ---
 arch/nds32/Kconfig  | 3 ---
 arch/nios2/Kconfig  | 3 ---
 arch/openrisc/Kconfig   | 6 --
 arch/parisc/Kconfig | 6 --
 arch/powerpc/Kconfig| 8 +---
 arch/riscv/Kconfig  | 3 ---
 arch/s390/Kconfig   | 7 +--
 arch/sh/Kconfig | 6 --
 arch/sparc/Kconfig  | 9 +
 arch/unicore32/Kconfig  | 6 --
 arch/x86/Kconfig| 4 +---
 arch/x86/um/Kconfig | 7 +--
 arch/xtensa/Kconfig | 4 +---
 kernel/Kconfig.locks| 7 +++
 26 files changed, 18 insertions(+), 122 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 584a6e114853..474202d89b25 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -35,6 +35,7 @@ config ALPHA
select MODULES_USE_ELF_RELA
select ODD_RT_SIGACTION
select OLD_SIGSUSPEND
+   select RWSEM_XCHGADD_ALGORITHM
select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
help
  The Alpha is a 64-bit general-purpose processor designed and
@@ -49,13 +50,6 @@ config MMU
bool
default y
 
-config RWSEM_GENERIC_SPINLOCK
-   bool
-
-config RWSEM_XCHGADD_ALGORITHM
-   bool
-   default y
-
 config ARCH_HAS_ILOG2_U32
bool
default n
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 376366a7db81..c0dd229af534 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -63,9 +63,6 @@ config SCHED_OMIT_FRAME_POINTER
 config GENERIC_CSUM
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config ARCH_DISCONTIGMEM_ENABLE
def_bool n
 
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 664e918e2624..1ed4c0560b50 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -108,6 +108,7 @@ config ARM
select PERF_USE_VMALLOC
select REFCOUNT_FULL
select RTC_LIB
+   select RWSEM_XCHGADD_ALGORITHM
select SYS_SUPPORTS_APM_EMULATION
# Above selects are sorted alphabetically; please add new ones
# according to that.  Thanks.
@@ -176,10 +177,6 @@ config TRACE_IRQFLAGS_SUPPORT
bool
default !CPU_V7M
 
-config RWSEM_XCHGADD_ALGORITHM
-   bool
-   default y
-
 config ARCH_HAS_ILOG2_U32
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index a4168d366127..2f8da4e18a1e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -170,6 +170,7 @@ config ARM64
select POWER_RESET
select POWER_SUPPLY
select REFCOUNT_FULL
+   select RWSEM_XCHGADD_ALGORITHM
select SPARSE_IRQ
select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
@@ -238,9 +239,6 @@ config LOCKDEP_SUPPORT
 config TRACE_IRQFLAGS_SUPPORT
def_bool y
 
-config RWSEM_XCHGADD_ALGORITHM
-   def_bool y
-
 config GENERIC_BUG
def_bool y
depends on BUG
diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig
index 456e154674d1..f11465554ecf 100644
--- a/arch/c6x/Kconfig
+++ b/arch/c6x/Kconfig
@@ -26,9 +26,6 @@ config MMU
 config FPU
def_bool n
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config GENERIC_CALIBRATE_DELAY
def_bool y
 
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 398113c845f5..90279a11fcf7 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -93,9 +93,6 @@ config GENERIC_HWEIGHT
 config MMU
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config STACKTRACE_SUPPORT
def_bool y
 
diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 6472a0685470..ba33326e7c54 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -26,9 +26,6 @@ config H8300
 config CPU_BIG_ENDIAN
def_bool y
 
-config RWSEM_GENERIC_SPINLOCK
-   def_bool y
-
 config GENERIC_HWEIGHT
def_bool y
 
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index fb2fbfcfc532..49f364ea18d4 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -31,6 +31,7 @@ config HEXAGON
select GENERIC_CLOCKEVENTS_BROADCAST
select MODULES_USE_ELF_RELA
select GENERIC_CPU_DEVICES
+   select RWSEM_XCHGADD_ALGORITHM
---help---
  Qualcomm Hexagon is a processor architecture designed for high
  performance and low power across a wide variety of applications.
@@ -64,12 +65,6 @@ config GENERIC_CSUM
 config GENERIC_IRQ_PROBE

consolidate a few more arch support config options

2019-02-13 Thread Christoph Hellwig
Hi all,

this series moves various config options that are defined in multiple
arch Kconfig files into common files, usually close to the code
supporting such features.


[PATCH 01/11] powerpc: remove dead ifdefs in

2019-02-13 Thread Christoph Hellwig
__KERNEL__ is always defined for non-uapi headers, and GENERIC_CSUM
is never set for powerpc either.

Signed-off-by: Christoph Hellwig 
---
 arch/powerpc/include/asm/checksum.h | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/checksum.h 
b/arch/powerpc/include/asm/checksum.h
index a78a57e5058d..37c309500260 100644
--- a/arch/powerpc/include/asm/checksum.h
+++ b/arch/powerpc/include/asm/checksum.h
@@ -1,6 +1,5 @@
 #ifndef _ASM_POWERPC_CHECKSUM_H
 #define _ASM_POWERPC_CHECKSUM_H
-#ifdef __KERNEL__
 
 /*
  * This program is free software; you can redistribute it and/or
@@ -9,9 +8,6 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifdef CONFIG_GENERIC_CSUM
-#include 
-#else
 #include 
 #include 
 /*
@@ -217,6 +213,4 @@ __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
__u32 len, __u8 proto, __wsum sum);
 
-#endif
-#endif /* __KERNEL__ */
-#endif
+#endif /* _ASM_POWERPC_CHECKSUM_H */
-- 
2.20.1



[PATCH 02/11] riscv: remove the HAVE_KPROBES option

2019-02-13 Thread Christoph Hellwig
HAVE_KPROBES is defined generically in arch/Kconfig and architectures
should just select it if supported.

Signed-off-by: Christoph Hellwig 
---
 arch/riscv/Kconfig | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 515fc3cc9687..b60f4e3e36f4 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -94,9 +94,6 @@ config PGTABLE_LEVELS
default 3 if 64BIT
default 2
 
-config HAVE_KPROBES
-   def_bool n
-
 menu "Platform type"
 
 choice
-- 
2.20.1



Re: [PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Michal Hocko
On Wed 13-02-19 08:14:50, Dave Hansen wrote:
> On 2/13/19 1:43 AM, Michal Hocko wrote:
> > 
> > We have seen several bugs where zonelists have not been initialized
> > properly and it is not really straightforward to track those bugs down.
> > One way to help a bit at least is to dump zonelists of each node when
> > they are (re)initialized.
> 
> Were you thinking of boot-time bugs and crashes, or just stuff going
> wonky after boot?

Mostly boot time. I haven't seen hotplug related bugs in this direction.
All the issues I have seen so far are that we forget a node altogether
and it ends up with no zonelists at all. But who knows, maybe we have
some hidden bugs where the zonelist is initialized only partially for some
reason and there is no real way to find out.
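
For illustration, a small two-node machine would then print something
along these lines at boot (the node/zone layout here is made up, the
format just follows the pr_info()/pr_cont() added by the patch):

  node[0] zonelist: 0:Normal 0:DMA32 0:DMA 1:Normal
  node[1] zonelist: 1:Normal 0:Normal 0:DMA32 0:DMA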

> We don't have the zonelists dumped in /proc anywhere, do we?  Would that
> help?

I would prefer to not export such an implementation detail into proc

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Dave Hansen
On 2/13/19 1:43 AM, Michal Hocko wrote:
> 
> We have seen several bugs where zonelists have not been initialized
> properly and it is not really straightforward to track those bugs down.
> One way to help a bit at least is to dump zonelists of each node when
> they are (re)initialized.

Were you thinking of boot-time bugs and crashes, or just stuff going
wonky after boot?

We don't have the zonelists dumped in /proc anywhere, do we?  Would that
help?


[PATCH v3 2/2] powerpc/8xx: Map 32Mb of RAM at init.

2019-02-13 Thread Christophe Leroy
At the time being, the initial MMU setup allows 24 Mbytes
of data and 8 Mbytes of code.

Some debug setups like CONFIG_KASAN generate huge
kernels with text size over the 8M limit and data over the
24 Mbytes limit.

Here is an 8xx kernel compiled with CONFIG_KASAN_INLINE for
one of my boards:

[root@po16846vm linux-powerpc]# size -x vmlinux
      text      data       bss       dec      hex  filename
 0x111019c  0x41b0d4  0x490de0  26984528  19bc050  vmlinux

This patch maps up to 32 Mbytes of code based on the _einittext symbol
and allows 32 Mbytes of memory instead of 24.

Signed-off-by: Christophe Leroy 
---
 v3: Maps 32M of both data and text.

 v2: Using IS_ENABLED() instead of #ifdef in 8xx_mmu.c

 arch/powerpc/kernel/head_8xx.S | 51 +-
 arch/powerpc/mm/8xx_mmu.c  |  7 --
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 20cc816b3508..fe2857ef0309 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -337,8 +337,8 @@ InstructionTLBMiss:
rlwinm  r10, r10, 16, 0xfff8
cmpli   cr0, r10, PAGE_OFFSET@h
 #ifndef CONFIG_PIN_TLB_TEXT
-   /* It is assumed that kernel code fits into the first 8M page */
-0: cmpli   cr7, r10, (PAGE_OFFSET + 0x080)@h
+   /* It is assumed that kernel code fits into the first 32M */
+0: cmpli   cr7, r10, (PAGE_OFFSET + 0x200)@h
patch_site  0b, patch__itlbmiss_linmem_top
 #endif
 #endif
@@ -434,7 +434,7 @@ DataStoreTLBMiss:
 #ifndef CONFIG_PIN_TLB_IMMR
cmpli   cr6, r10, VIRT_IMMR_BASE@h
 #endif
-0: cmpli   cr7, r10, (PAGE_OFFSET + 0x180)@h
+0: cmpli   cr7, r10, (PAGE_OFFSET + 0x200)@h
patch_site  0b, patch__dtlbmiss_linmem_top
 
mfspr   r10, SPRN_M_TWB /* Get level 1 table */
@@ -886,28 +886,11 @@ initial_mmu:
mtspr   SPRN_MD_CTR, r10/* remove PINNED DTLB entries */
 
tlbia   /* Invalidate all TLB entries */
-#ifdef CONFIG_PIN_TLB_TEXT
-   lis r8, MI_RSV4I@h
-   ori r8, r8, 0x1c00
-
-   mtspr   SPRN_MI_CTR, r8 /* Set instruction MMU control */
-#endif
-
 #ifdef CONFIG_PIN_TLB_DATA
 	oris	r10, r10, MD_RSV4I@h
 	mtspr	SPRN_MD_CTR, r10	/* Set data TLB control */
 #endif
 
-   /* Now map the lower 8 Meg into the ITLB. */
-   lis r8, KERNELBASE@h/* Create vaddr for TLB */
-   ori r8, r8, MI_EVALID   /* Mark it valid */
-   mtspr   SPRN_MI_EPN, r8
-   li  r8, MI_PS8MEG /* Set 8M byte page */
-   ori r8, r8, MI_SVALID   /* Make it valid */
-   mtspr   SPRN_MI_TWC, r8
-   li  r8, MI_BOOTINIT /* Create RPN for address 0 */
-   mtspr   SPRN_MI_RPN, r8 /* Store TLB entry */
-
lis r8, MI_APG_INIT@h   /* Set protection modes */
ori r8, r8, MI_APG_INIT@l
mtspr   SPRN_MI_AP, r8
@@ -937,6 +920,34 @@ initial_mmu:
mtspr   SPRN_MD_RPN, r8
 #endif
 
+   /* Now map the lower RAM (up to 32 Mbytes) into the ITLB. */
+#ifdef CONFIG_PIN_TLB_TEXT
+   lis r8, MI_RSV4I@h
+   ori r8, r8, 0x1c00
+#endif
+   li  r9, 4   /* up to 4 pages of 8M */
+   mtctr   r9
+   lis r9, KERNELBASE@h/* Create vaddr for TLB */
+   li  r10, MI_PS8MEG | MI_SVALID  /* Set 8M byte page */
+   li  r11, MI_BOOTINIT/* Create RPN for address 0 */
+   lis r12, _einittext@h
+   ori r12, r12, _einittext@l
+1:
+#ifdef CONFIG_PIN_TLB_TEXT
+   mtspr   SPRN_MI_CTR, r8 /* Set instruction MMU control */
+	addi	r8, r8, 0x100
+#endif
+
+   ori r0, r9, MI_EVALID   /* Mark it valid */
+   mtspr   SPRN_MI_EPN, r0
+   mtspr   SPRN_MI_TWC, r10
+   mtspr   SPRN_MI_RPN, r11/* Store TLB entry */
+   addis   r9, r9, 0x80
+   addis   r11, r11, 0x80
+
+   cmplcr0, r9, r12
+   bdnzf   gt, 1b
+
/* Since the cache is enabled according to the information we
 * just loaded into the TLB, invalidate and enable the caches here.
	 * We should probably check/set other modes later.
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index 92b677faea8c..50c8cd8d3cb9 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -112,6 +112,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
mmu_patch_cmp_limit(__itlbmiss_linmem_top, 0);
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
+   if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+   mmu_patch_cmp_limit(__itlbmiss_linmem_top,
+   _ALIGN(__pa(_einittext), 8 << 20));
}
 
mmu_patch_cmp_limit(__dtlbmiss_linmem_top, mapped);
@@ -140,8 +143,8 @@ void __init 

[PATCH v3 1/2] powerpc/8xx: replace most #ifdef by IS_ENABLED() in 8xx_mmu.c

2019-02-13 Thread Christophe Leroy
This patch replaces most #ifdef mess by IS_ENABLED() in 8xx_mmu.c
This has the advantage of allowing syntax verification at compile
time regardless of selected options.

Signed-off-by: Christophe Leroy 
---
 v3: no change

 v2: left CONFIG_BDI_SWITCH change aside as it goes away in another patch

 arch/powerpc/mm/8xx_mmu.c | 44 +++-
 1 file changed, 19 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index bfa503cff351..92b677faea8c 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -66,26 +66,22 @@ unsigned long p_block_mapped(phys_addr_t pa)
 void __init MMU_init_hw(void)
 {
/* PIN up to the 3 first 8Mb after IMMR in DTLB table */
-#ifdef CONFIG_PIN_TLB_DATA
-   unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe00;
-   unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
-#ifdef CONFIG_PIN_TLB_IMMR
-   int i = 29;
-#else
-   int i = 28;
-#endif
-   unsigned long addr = 0;
-   unsigned long mem = total_lowmem;
-
-   for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
-   mtspr(SPRN_MD_CTR, ctr | (i << 8));
-   mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
-   mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
-   mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
-   addr += LARGE_PAGE_SIZE_8M;
-   mem -= LARGE_PAGE_SIZE_8M;
+   if (IS_ENABLED(CONFIG_PIN_TLB_DATA)) {
+   unsigned long ctr = mfspr(SPRN_MD_CTR) & 0xfe00;
+   unsigned long flags = 0xf0 | MD_SPS16K | _PAGE_SH | _PAGE_DIRTY;
+   int i = IS_ENABLED(CONFIG_PIN_TLB_IMMR) ? 29 : 28;
+   unsigned long addr = 0;
+   unsigned long mem = total_lowmem;
+
+   for (; i < 32 && mem >= LARGE_PAGE_SIZE_8M; i++) {
+   mtspr(SPRN_MD_CTR, ctr | (i << 8));
+			mtspr(SPRN_MD_EPN, (unsigned long)__va(addr) | MD_EVALID);
+   mtspr(SPRN_MD_TWC, MD_PS8MEG | MD_SVALID);
+   mtspr(SPRN_MD_RPN, addr | flags | _PAGE_PRESENT);
+   addr += LARGE_PAGE_SIZE_8M;
+   mem -= LARGE_PAGE_SIZE_8M;
+   }
}
-#endif
 }
 
 static void __init mmu_mapin_immr(void)
@@ -110,12 +106,10 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
if (__map_without_ltlbs) {
mapped = 0;
mmu_mapin_immr();
-#ifndef CONFIG_PIN_TLB_IMMR
-   patch_instruction_site(__dtlbmiss_immr_jmp, PPC_INST_NOP);
-#endif
-#ifndef CONFIG_PIN_TLB_TEXT
-   mmu_patch_cmp_limit(__itlbmiss_linmem_top, 0);
-#endif
+   if (!IS_ENABLED(CONFIG_PIN_TLB_IMMR))
+			patch_instruction_site(__dtlbmiss_immr_jmp, PPC_INST_NOP);
+   if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+   mmu_patch_cmp_limit(__itlbmiss_linmem_top, 0);
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
}
-- 
2.13.3



Re: [PATCH v2 2/2] locking/rwsem: Optimize down_read_trylock()

2019-02-13 Thread Waiman Long
On 02/13/2019 02:45 AM, Ingo Molnar wrote:
> * Waiman Long  wrote:
>
>> I looked at the assembly code in arch/x86/include/asm/rwsem.h. For both
>> trylocks (read & write), the count is read first before attempting to
>> lock it. We did the same for all trylock functions in other locks.
>> Depending on how the trylock is used and how contended the lock is, it
>> may help or hurt performance. Changing down_read_trylock to do an
>> unconditional cmpxchg will change the performance profile of existing
>> code. So I would prefer keeping the current code.
>>
>> I do notice now that the generic down_write_trylock() code is doing an
>> unconditional compxchg. So I wonder if we should change it to read the
>> lock first like other trylocks or just leave it as it is.
> No, I think we should instead move the other trylocks to the 
> try-for-ownership model as well, like Linus suggested.
>
> That's the general assumption we make in locking primitives, that we 
> optimize for the common, expected case - which would be that the trylock 
> succeeds, and I don't see why trylock primitives should be different.
>
> In fact I can see more ways for read-for-sharing to perform suboptimally 
> on larger systems.

I don't mind changing to the try-for-ownership model for rwsem and
mutex. I do have some concerns about doing that for spinlocks. Some of the
lock slowpath code does optimistic trylocks. Making them unconditional
cmpxchg will impact lock contention performance.

I will update this rwsem patch to make the change while I am working on
it. For other locks, I will suggest we go slow and carefully evaluate
the performance implication before we make the changes.
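
To make the two shapes being discussed concrete, here is a rough
userspace sketch (illustration only, not the kernel's rwsem code;
READER_BIAS and WRITER_BIT are made-up names and a 64-bit long is
assumed):

#include <stdatomic.h>
#include <stdbool.h>

#define READER_BIAS	1L
#define WRITER_BIT	(1L << 62)

/* read the count first, only attempt the cmpxchg when the lock looks free */
static bool trylock_read_check_first(atomic_long *count)
{
	long c = atomic_load(count);

	if (c & WRITER_BIT)
		return false;
	return atomic_compare_exchange_strong(count, &c, c + READER_BIAS);
}

/* try-for-ownership: start from the expected uncontended value and let a
 * failed cmpxchg report the real state */
static bool trylock_read_optimistic(atomic_long *count)
{
	long c = 0;

	do {
		if (atomic_compare_exchange_weak(count, &c, c + READER_BIAS))
			return true;
	} while (!(c & WRITER_BIT));
	return false;
}

The first variant avoids any store when a writer holds the lock; the
second goes straight for the cmpxchg, which is the cheaper path when the
trylock is expected to succeed.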

Cheers,
Longman




Re: Kernel panic when loading the IDE controller driver

2019-02-13 Thread Christophe Leroy




Le 13/02/2019 à 13:53, sgosavi1 a écrit :

Why using 4.15.13 which is obsolete instead of using one of the Long
Term Support versions which are still maintained, like 4.14 or 4.19 ?
(see the complete list at https://www.kernel.org/category/releases.html)


Well, when I started this task 4.15.13 was probably the latest stable
release and hence we decided to port this version. In the older kernel, we
have the m8260_setup.c source file for our board where the function
"io_block_mapping" was used to configure the non-standard IO port address
starting at 0xe000 location. This address was passed as the base address
followed by control address and IRQ number to the ide-core.ko module. In the
new kernel we do not have an option to send these addresses and IRQ numbers
as arguments to the driver. Instead the ide-generic.c source file in the new
kernel uses the standard IO port values and IRQ values. I modified the code
in the above file to use the addresses and IRQ number we used in the past.
Also, added code in the "MMU_init" function call available under
arch/PowerPC/init_32.c to setup the IO port address range by adding the
"io_block_mapping" call and the required IO port address range.

Is there anything else that needs to be added or how can we configure the
desired IO address range in the new kernel?



Maybe look around 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=9a0e77f28b50128df0c9e26ae489e44e29a7270a


Also look at ide_platform.c. I imagine there must be some way to set it 
up in your device tree.


Maybe Bartlomiej Zolnierkiewicz can help ?

Christophe


Re: [PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Michal Hocko
On Wed 13-02-19 14:11:31, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 12:50:14PM +0100, Michal Hocko wrote:
> > On Wed 13-02-19 11:32:31, Peter Zijlstra wrote:
> > > On Wed, Feb 13, 2019 at 10:43:15AM +0100, Michal Hocko wrote:
> > > > @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
> > > >  
> > > > build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> > > > build_thisnode_zonelists(pgdat);
> > > > +
> > > > +   pr_info("node[%d] zonelist: ", pgdat->node_id);
> > > > +	for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> > > > +   pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> > > > +   pr_cont("\n");
> > > >  }
> > > 
> > > Have you run this by the SGI and other stupid large machine vendors?
> > 
> > I do not have such a large machine handy. The biggest I have has
> > handfull (say dozen) of NUMA nodes.
> > 
> > > Traditionally they tend to want to remove such things instead of adding
> > > them.
> > 
> > I do not insist on this patch but I find it handy. If there is an
> > opposition I will not miss it much.
> 
> Well, I don't have machines like that either and don't mind the patch.
> Just raising the issue; I've had the big iron boys complain about
> similar things (typically printing something for every CPU, which gets
> out of hand much faster than zones, but still).

Maybe we can try to push this through and revert if somebody complains
about an excessive output.

-- 
Michal Hocko
SUSE Labs


Re: [PATCH v3 2/2] drivers/mtd: Fix device registration error

2019-02-13 Thread Boris Brezillon
Subject prefix should be "mtd: powernv_flash: "

On Mon, 11 Feb 2019 19:03:38 +0530
"Aneesh Kumar K.V"  wrote:

> This change helps me to get multiple mtd devices registered. Without this
> I get
> 
> sysfs: cannot create duplicate filename '/bus/nvmem/devices/flash0'
> CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.0.0-rc2-00557-g1ef20ef21f22 #13
> Call Trace:
> [c000b38e3220] [c0b58fe4] dump_stack+0xe8/0x164 (unreliable)
> [c000b38e3270] [c04cf074] sysfs_warn_dup+0x84/0xb0
> [c000b38e32f0] [c04cf6c4] 
> sysfs_do_create_link_sd.isra.0+0x114/0x150
> [c000b38e3340] [c0726a84] bus_add_device+0x94/0x1e0
> [c000b38e33c0] [c07218f0] device_add+0x4d0/0x830
> [c000b38e3480] [c09d54a8] nvmem_register.part.2+0x1c8/0xb30
> [c000b38e3560] [c0834530] mtd_nvmem_add+0x90/0x120
> [c000b38e3650] [c0835bc8] add_mtd_device+0x198/0x4e0
> [c000b38e36f0] [c083619c] mtd_device_parse_register+0x11c/0x280
> [c000b38e3780] [c0840830] powernv_flash_probe+0x180/0x250
> [c000b38e3820] [c072c120] platform_drv_probe+0x60/0xf0
> [c000b38e38a0] [c07283c8] really_probe+0x138/0x4d0
> [c000b38e3930] [c0728acc] driver_probe_device+0x13c/0x1b0
> [c000b38e39b0] [c0728c7c] __driver_attach+0x13c/0x1c0
> [c000b38e3a30] [c0725130] bus_for_each_dev+0xa0/0x120
> [c000b38e3a90] [c0727b2c] driver_attach+0x2c/0x40
> [c000b38e3ab0] [c07270f8] bus_add_driver+0x228/0x360
> [c000b38e3b40] [c072a2e0] driver_register+0x90/0x1a0
> [c000b38e3bb0] [c072c020] __platform_driver_register+0x50/0x70
> [c000b38e3bd0] [c105c984] powernv_flash_driver_init+0x24/0x38
> [c000b38e3bf0] [c0010904] do_one_initcall+0x84/0x464
> [c000b38e3cd0] [c1004548] kernel_init_freeable+0x530/0x634
> [c000b38e3db0] [c0011154] kernel_init+0x1c/0x168
> [c000b38e3e20] [c000bed4] ret_from_kernel_thread+0x5c/0x68
> mtd mtd1: Failed to register NVMEM device
> 
> With the change we now have
> 
> root@(none):/sys/bus/nvmem/devices# ls -al
> total 0
> drwxr-xr-x 2 root root 0 Feb  6 20:49 .
> drwxr-xr-x 4 root root 0 Feb  6 20:49 ..
> lrwxrwxrwx 1 root root 0 Feb  6 20:49 flash@0 -> 
> ../../../devices/platform/ibm,opal:flash@0/mtd/mtd0/flash@0
> lrwxrwxrwx 1 root root 0 Feb  6 20:49 flash@1 -> 
> ../../../devices/platform/ibm,opal:flash@1/mtd/mtd1/flash@1
> 
> Fixes: acfe63ec1c59 ("mtd: Convert to using %pOFn instead of 
> device_node.name")

Actually it's not this commit that is at fault as mtd->name was already
given the value of device_node->name before that. I think you're
actually fixing 1cbb4a1c433a ("mtd: powernv: Add powernv flash MTD
abstraction driver").

No need to send a new version, I can fix that when applying, just let
me know if you're okay with the changes I suggested.

> Signed-off-by: Aneesh Kumar K.V 
> ---
>  drivers/mtd/devices/powernv_flash.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/mtd/devices/powernv_flash.c 
> b/drivers/mtd/devices/powernv_flash.c
> index 22f753e555ac..83f88b8b5d9f 100644
> --- a/drivers/mtd/devices/powernv_flash.c
> +++ b/drivers/mtd/devices/powernv_flash.c
> @@ -212,7 +212,7 @@ static int powernv_flash_set_driver_info(struct device 
> *dev,
>* Going to have to check what details I need to set and how to
>* get them
>*/
> - mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%pOFn", dev->of_node);
> + mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%pOFP", dev->of_node);
>   mtd->type = MTD_NORFLASH;
>   mtd->flags = MTD_WRITEABLE;
>   mtd->size = size;



Re: [PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Peter Zijlstra
On Wed, Feb 13, 2019 at 12:50:14PM +0100, Michal Hocko wrote:
> On Wed 13-02-19 11:32:31, Peter Zijlstra wrote:
> > On Wed, Feb 13, 2019 at 10:43:15AM +0100, Michal Hocko wrote:
> > > @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
> > >  
> > >   build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> > >   build_thisnode_zonelists(pgdat);
> > > +
> > > + pr_info("node[%d] zonelist: ", pgdat->node_id);
> > > +	for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> > > + pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> > > + pr_cont("\n");
> > >  }
> > 
> > Have you run this by the SGI and other stupid large machine vendors?
> 
> I do not have such a large machine handy. The biggest I have has
> handfull (say dozen) of NUMA nodes.
> 
> > Traditionally they tend to want to remove such things instead of adding
> > them.
> 
> I do not insist on this patch but I find it handy. If there is an
> opposition I will not miss it much.

Well, I don't have machines like that either and don't mind the patch.
Just raising the issue; I've had the big iron boys complain about
similar things (typically printing something for every CPU, which gets
out of hand much faster than zones, but still).


Re: Kernel panic when loading the IDE controller driver

2019-02-13 Thread sgosavi1
> Why using 4.15.13 which is obsolete instead of using one of the Long 
> Term Support versions which are still maintained, like 4.14 or 4.19 ? 
> (see the complete list at https://www.kernel.org/category/releases.html)

Well, when I started this task 4.15.13 was probably the latest stable
release and hence we decided to port this version. In the older kernel, we
have the m8260_setup.c source file for our board where the function
"io_block_mapping" was used to configure the non-standard IO port address
starting at 0xe000 location. This address was passed as the base address
followed by control address and IRQ number to the ide-core.ko module. In the
new kernel we do not have an option to send these addresses and IRQ numbers
as arguments to the driver. Instead the ide-generic.c source file in the new
kernel uses the standard IO port values and IRQ values. I modified the code
in the above file to use the addresses and IRQ number we used in the past.
Also, added code in the "MMU_init" function call available under
arch/PowerPC/init_32.c to setup the IO port address range by adding the
"io_block_mapping" call and the required IO port address range.

Is there anything else that needs to be added or how can we configure the
desired IO address range in the new kernel?

Thanks,
Sachin



--
Sent from: http://linuxppc.10917.n7.nabble.com/linuxppc-dev-f3.html


Re: [PATCH] mmap.2: describe the 5level paging hack

2019-02-13 Thread Will Deacon
Hi Jann,

On Mon, Feb 11, 2019 at 05:36:53PM +0100, Jann Horn wrote:
> The manpage is missing information about the compatibility hack for
> 5-level paging that went in in 4.14, around commit ee00f4a32a76 ("x86/mm:
> Allow userspace have mappings above 47-bit"). Add some information about
> that.
> 
> While I don't think any hardware supporting this is shipping yet (?), I
> think it's useful to try to write a manpage for this API, partly to
> figure out how usable that API actually is, and partly because when this
> hardware does ship, it'd be nice if distro manpages had information about
> how to use it.
> 
> Signed-off-by: Jann Horn 
> ---
> This patch goes on top of the patch "[PATCH] mmap.2: fix description of
> treatment of the hint" that I just sent, but I'm not sending them in a
> series because I want the first one to go in, and I think this one might
> be a bit more controversial.
> 
> It would be nice if the architecture maintainers and mm folks could have
> a look at this and check that what I wrote is right - I only looked at
> the source for this, I haven't tried it.
> 
>  man2/mmap.2 | 15 +++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/man2/mmap.2 b/man2/mmap.2
> index 8556bbfeb..977782fa8 100644
> --- a/man2/mmap.2
> +++ b/man2/mmap.2
> @@ -67,6 +67,8 @@ is NULL,
>  then the kernel chooses the (page-aligned) address
>  at which to create the mapping;
>  this is the most portable method of creating a new mapping.
> +On Linux, in this case, the kernel may limit the maximum address that can be
> +used for allocations to a legacy limit for compatibility reasons.
>  If
>  .I addr
>  is not NULL,
> @@ -77,6 +79,19 @@ or equal to the value specified by
>  and attempt to create the mapping there.
>  If another mapping already exists there, the kernel picks a new
>  address, independent of the hint.
> +However, if a hint above the architecture's legacy address limit is provided
> +(on x86-64: above 0x7000, on arm64: above 0x1, on ppc64 
> with
> +book3s: above 0x7fff or 0x3fff, depending on page size), the
> +kernel is permitted to allocate mappings beyond the architecture's legacy
> +address limit.

On arm64 we support 36-bit, 39-bit, 42-bit, 47-bit, 48-bit and 52-bit user
virtual addresses, some of which also enforce a particular page size of 4k,
16k or 64k. With the exception of 52-bit, the user virtual address size is
fixed at compile time and mmap() can allocate up to the maximum address
size.

When 52-bit virtual addressing is configured, we continue to allocate up to
48 bits unless either a hint is passed to mmap() as you describe, or
CONFIG_ARM64_FORCE_52BIT=y (this is really intended as a debug option and is
hidden behind EXPERT as well as being off by default).
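
As a concrete illustration of the opt-in behaviour (sketch only; the
hint below is an arbitrary high address, a 64-bit process is assumed,
and the exact legacy limit is per-architecture as noted above):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* NULL hint: the kernel keeps the mapping below the legacy limit */
	void *lo = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* a hint above the legacy limit opts this allocation in to the
	 * full address space, where hardware and kernel support it */
	void *hi = mmap((void *)(1UL << 52), 4096, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	printf("lo=%p hi=%p\n", lo, hi);
	return 0;
}

On a kernel or CPU without the larger address space the second call
still succeeds, it just returns an address below the legacy limit, since
the address is only a hint.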

One thing that just occurred to me is that our ASLR code is probably pretty
weak for addresses greater than 48 bits because I don't think it was updated
when we added 52-bit support. I'll take a deeper look when I get some time.

Will


Re: Kernel panic when loading the IDE controller driver

2019-02-13 Thread Christophe Leroy




Le 13/02/2019 à 13:24, sgosavi1 a écrit :

What it the last linux version known to work properly?



We have used it successfully in the Linux-2.6.17.6 version.


Oh, ok, there's a big gap between the two versions.

Why using 4.15.13 which is obsolete instead of using one of the Long 
Term Support versions which are still maintained, like 4.14 or 4.19 ? 
(see the complete list at https://www.kernel.org/category/releases.html)


Christophe




Thanks,
Sachin.



--
Sent from: http://linuxppc.10917.n7.nabble.com/linuxppc-dev-f3.html



Re: Kernel panic when loading the IDE controller driver

2019-02-13 Thread sgosavi1
> What it the last linux version known to work properly?


We have used it successfully in the Linux-2.6.17.6 version. 


Thanks,
Sachin.



--
Sent from: http://linuxppc.10917.n7.nabble.com/linuxppc-dev-f3.html


Re: [PATCH v2 2/2] powerpc/8xx: Map a second 8M text page at startup when needed.

2019-02-13 Thread Christophe Leroy



Le 21/01/2019 à 12:34, Christophe Leroy a écrit :

Some debug setups like CONFIG_KASAN generate huge
kernels with text size over the 8M limit.

This patch maps a second 8M page when _einittext is over 8M.


This is not enough for CONFIG_KASAN_INLINE. I'll send a v3 which maps up 
to 32M based on _einittext.





Signed-off-by: Christophe Leroy 
---
  v2: Using IS_ENABLED() instead of #ifdef in 8xx_mmu.c

  arch/powerpc/kernel/head_8xx.S | 27 +--
  arch/powerpc/mm/8xx_mmu.c  |  3 +++
  2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index 20cc816b3508..3b3b7846247f 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -337,8 +337,8 @@ InstructionTLBMiss:
rlwinm  r10, r10, 16, 0xfff8
cmpli   cr0, r10, PAGE_OFFSET@h
  #ifndef CONFIG_PIN_TLB_TEXT
-   /* It is assumed that kernel code fits into the first 8M page */
-0: cmpli   cr7, r10, (PAGE_OFFSET + 0x080)@h
+   /* It is assumed that kernel code fits into the two first 8M pages */
+0: cmpli   cr7, r10, (PAGE_OFFSET + 0x100)@h
patch_site  0b, patch__itlbmiss_linmem_top
  #endif
  #endif
@@ -908,6 +908,29 @@ initial_mmu:
li  r8, MI_BOOTINIT /* Create RPN for address 0 */
mtspr   SPRN_MI_RPN, r8 /* Store TLB entry */
  
+	/* Map a second 8M page if needed */

+   lis r9, _einittext@h
+	oris	r9, r9, _einittext@l
+   cmpli   cr0, r9, (PAGE_OFFSET + 0x800)@h


Should be 0x80 here

Christophe


+   blt 1f
+
+#ifdef CONFIG_PIN_TLB_TEXT
+   lis r8, MI_RSV4I@h
+   ori r8, r8, 0x1d00
+
+   mtspr   SPRN_MI_CTR, r8 /* Set instruction MMU control */
+#endif
+
+   lis r8, (KERNELBASE + 0x80)@h   /* Create vaddr for TLB */
+   ori r8, r8, MI_EVALID   /* Mark it valid */
+   mtspr   SPRN_MI_EPN, r8
+   li  r8, MI_PS8MEG /* Set 8M byte page */
+   ori r8, r8, MI_SVALID   /* Make it valid */
+   mtspr   SPRN_MI_TWC, r8
+   li  r8, MI_BOOTINIT /* Create RPN for address 0 */
+   addis   r8, r8, 0x80
+   mtspr   SPRN_MI_RPN, r8 /* Store TLB entry */
+1:
lis r8, MI_APG_INIT@h   /* Set protection modes */
ori r8, r8, MI_APG_INIT@l
mtspr   SPRN_MI_AP, r8
diff --git a/arch/powerpc/mm/8xx_mmu.c b/arch/powerpc/mm/8xx_mmu.c
index 92b677faea8c..b5f6d794281d 100644
--- a/arch/powerpc/mm/8xx_mmu.c
+++ b/arch/powerpc/mm/8xx_mmu.c
@@ -112,6 +112,9 @@ unsigned long __init mmu_mapin_ram(unsigned long top)
mmu_patch_cmp_limit(__itlbmiss_linmem_top, 0);
} else {
mapped = top & ~(LARGE_PAGE_SIZE_8M - 1);
+   if (!IS_ENABLED(CONFIG_PIN_TLB_TEXT))
+   mmu_patch_cmp_limit(__itlbmiss_linmem_top,
+   _ALIGN(__pa(_einittext), 8 << 20));
}
  
  	mmu_patch_cmp_limit(__dtlbmiss_linmem_top, mapped);




Re: [PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Michal Hocko
On Wed 13-02-19 11:32:31, Peter Zijlstra wrote:
> On Wed, Feb 13, 2019 at 10:43:15AM +0100, Michal Hocko wrote:
> > @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
> >  
> > build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> > build_thisnode_zonelists(pgdat);
> > +
> > +   pr_info("node[%d] zonelist: ", pgdat->node_id);
> > +	for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> > +   pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> > +   pr_cont("\n");
> >  }
> 
> Have you run this by the SGI and other stupid large machine vendors?

I do not have such a large machine handy. The biggest I have has
handfull (say dozen) of NUMA nodes.

> Traditionally they tend to want to remove such things instead of adding
> them.

I do not insist on this patch but I find it handy. If there is an
opposition I will not miss it much.
-- 
Michal Hocko
SUSE Labs


Re: Kernel panic when loading the IDE controller driver

2019-02-13 Thread Christophe Leroy

Hi,

Le 13/02/2019 à 12:01, sgosavi1 a écrit :

Hi All,

I have been working on porting Linux-4.15.13 kernel on our existing MPC8270
processor board. For this exercise, I have used pq2fads as a reference
board, its associated device tree and used cuImage for building the kernel.


What it the last linux version known to work properly ?

Christophe



I am facing an issue with the generic IDE flash controller driver in the new
kernel source where it fails to detect the Flash controller connected to 2GB
NAND flash available on the board. We have used non-standard IO port
addresses for this driver in the older kernel and I have added the required
code in the new kernel to setup the set of addresses that we need as IO
ports. Also, modified the code in the drivers/ide/ source to use the
non-standard IO port address. However, during boot up while inserting the
module I continue to get the below errors.

[4.116587] ide: forcing hda as a disk (3543/255/63)
[4.184190] Probing IDE interface ide0...
[4.226330] Machine check in kernel mode.
[4.233809] Caused by (from SRR1=49030):
[4.233826] Transfer error ack signal
[4.249263] IN from bad port e00e at 004123ef

I understand that this is an addressing issue but not sure exactly what am I
missing to fix the problem. Can you provide me your inputs on debugging this
issue?

Thanks,
Sachin.




--
Sent from: http://linuxppc.10917.n7.nabble.com/linuxppc-dev-f3.html



Re: [PATCH] hugetlb: allow to free gigantic pages regardless of the configuration

2019-02-13 Thread Vlastimil Babka
On 1/17/19 7:39 PM, Alexandre Ghiti wrote:
> From: Alexandre Ghiti 
> 
> On systems without CMA or (MEMORY_ISOLATION && COMPACTION) activated but
> that support gigantic pages, boottime reserved gigantic pages can not be
> freed at all. This patchs simply enables the possibility to hand back
> those pages to memory allocator.
> 
> This commit then renames gigantic_page_supported and
> ARCH_HAS_GIGANTIC_PAGE to make them more accurate. Indeed, those values
> being false does not mean that the system cannot use gigantic pages: it
> just means that runtime allocation of gigantic pages is not supported,
> one can still allocate boottime gigantic pages if the architecture supports
> it.
> 
> Signed-off-by: Alexandre Ghiti 

I'm fine with the change, but wonder if this can be structured better in a way
which would remove the duplicated "if (MEMORY_ISOLATION && COMPACTION) || CMA"
from all arches, as well as the duplicated
gigantic_page_runtime_allocation_supported()

something like:

- "select ARCH_HAS_GIGANTIC_PAGE" has no conditions, it just says the arch can
support them either at boottime or runtime (but runtime is usable only if other
conditions are met)
- gigantic_page_runtime_allocation_supported() is a function that returns true
if ARCH_HAS_GIGANTIC_PAGE && ((MEMORY_ISOLATION && COMPACTION) || CMA) and
there's a single instance, not per-arch (see the sketch after this list).
- code for freeing gigantic pages can probably still be conditional on
ARCH_HAS_GIGANTIC_PAGE
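
A minimal sketch of what that single generic helper could look like
(untested, just to make the suggestion concrete; the helper name is the
proposed one and the Kconfig symbols are the existing ones):

#include <linux/kconfig.h>
#include <linux/types.h>

/* one common definition instead of per-arch copies */
static inline bool gigantic_page_runtime_allocation_supported(void)
{
	if (!IS_ENABLED(CONFIG_ARCH_HAS_GIGANTIC_PAGE))
		return false;

	return (IS_ENABLED(CONFIG_MEMORY_ISOLATION) &&
		IS_ENABLED(CONFIG_COMPACTION)) ||
	       IS_ENABLED(CONFIG_CMA);
}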

BTW I wanted also to do something about the "(MEMORY_ISOLATION && COMPACTION) ||
CMA" ugliness itself, i.e. put the common parts behind some new kconfig
(COMPACTION_CORE ?) and expose it better to users, but I can take a stab on that
once the above part is settled.

Vlastimil


[PATCH] powerpc/mm/hash: Increase vmalloc space with hash translation mode

2019-02-13 Thread Aneesh Kumar K.V
From: Michael Ellerman 

This patch updates the kernel non-linear virtual map area size to 512TB
with 64K page size and hash translation mode. We allocate one context
for the vmalloc region and hence the max virtual area size is limited
by the context map size (512TB for 64K and 64TB for 4K page size).

This patch fixes boot failures with large amounts of system RAM where we
need large vmalloc space to handle per cpu allocation.

Signed-off-by: Michael Ellerman 
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/hash.h  | 30 +++---
 arch/powerpc/include/asm/book3s/64/radix.h |  5 +---
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 247aff9cc6ba..0a7b7d5bfa86 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -40,22 +40,34 @@
 #else
 #define H_PUD_CACHE_INDEX  (H_PUD_INDEX_SIZE)
 #endif
+
 /*
- * Define the address range of the kernel non-linear virtual area
+ * Define the address range of the kernel non-linear virtual area. In contrast
+ * to the linear mapping, this is managed using the kernel page tables and then
+ * inserted into the hash page table to actually take effect, similarly to user
+ * mappings.
  */
 #define H_KERN_VIRT_START ASM_CONST(0xD000)
-#define H_KERN_VIRT_SIZE  ASM_CONST(0x4000) /* 64T */
+/*
+ * Allow virtual mapping of one context size.
+ * 512TB for 64K page size
+ * 64TB for 4K page size
+ */
+#define H_KERN_VIRT_SIZE (1UL << MAX_EA_BITS_PER_CONTEXT)
+/*
+ * 8TB IO mapping size
+ */
+#define H_KERN_IO_SIZE ASM_CONST(0x800) /* 8T */
 
 /*
- * The vmalloc space starts at the beginning of that region, and
- * occupies half of it on hash CPUs and a quarter of it on Book3E
- * (we keep a quarter for the virtual memmap)
+ * The vmalloc space starts at the beginning of the kernel non-linear virtual
+ * region, and occupies 504T (64K) or 56T (4K)
  */
-#define H_VMALLOC_START	H_KERN_VIRT_START
-#define H_VMALLOC_SIZE ASM_CONST(0x3800) /* 56T */
-#define H_VMALLOC_END  (H_VMALLOC_START + H_VMALLOC_SIZE)
+#define H_VMALLOC_START H_KERN_VIRT_START
+#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE - H_KERN_IO_SIZE)
+#define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE)
 
-#define H_KERN_IO_START	H_VMALLOC_END
+#define H_KERN_IO_START H_VMALLOC_END
 
 /*
  * Region IDs
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index 7d1a3d1543fc..c7a0feaa1013 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -104,10 +104,7 @@
 #define RADIX_VMALLOC_START	RADIX_KERN_VIRT_START
 #define RADIX_VMALLOC_SIZE (RADIX_KERN_VIRT_SIZE >> 2)
 #define RADIX_VMALLOC_END  (RADIX_VMALLOC_START + RADIX_VMALLOC_SIZE)
-/*
- * Defines the address of the vmemap area, in its own region on
- * hash table CPUs.
- */
+
 #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END)
 
 #define RADIX_KERN_IO_START	(RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1))
-- 
2.20.1



Kernel panic when loading the IDE controller driver

2019-02-13 Thread sgosavi1
Hi All,

I have been working on porting Linux-4.15.13 kernel on our existing MPC8270
processor board. For this exercise, I have used pq2fads as a reference
board, its associated device tree and used cuImage for building the kernel.
I am facing an issue with the generic IDE flash controller driver in the new
kernel source where it fails to detect the Flash controller connected to 2GB
NAND flash available on the board. We have used non-standard IO port
addresses for this driver in the older kernel and I have added the required
code in the new kernel to setup the set of addresses that we need as IO
ports. Also, modified the code in the drivers/ide/ source to use the
non-standard IO port address. However, during boot up while inserting the
module I continue to get the below errors.

[4.116587] ide: forcing hda as a disk (3543/255/63)
[4.184190] Probing IDE interface ide0...
[4.226330] Machine check in kernel mode.
[4.233809] Caused by (from SRR1=49030):
[4.233826] Transfer error ack signal
[4.249263] IN from bad port e00e at 004123ef

I understand that this is an addressing issue but not sure exactly what am I
missing to fix the problem. Can you provide me your inputs on debugging this
issue?

Thanks,
Sachin.




--
Sent from: http://linuxppc.10917.n7.nabble.com/linuxppc-dev-f3.html


Re: [PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Peter Zijlstra
On Wed, Feb 13, 2019 at 10:43:15AM +0100, Michal Hocko wrote:
> @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
>  
>   build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
>   build_thisnode_zonelists(pgdat);
> +
> + pr_info("node[%d] zonelist: ", pgdat->node_id);
> +	for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> + pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> + pr_cont("\n");
>  }

Have you run this by the SGI and other stupid large machine vendors?
Traditionally they tend to want to remove such things instead of adding
them.




[PATCH v3 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Michal Hocko
From: Michal Hocko 

We have seen several bugs where zonelists have not been initialized
properly and it is not really straightforward to track those bugs down.
One way to help a bit at least is to dump zonelists of each node when
they are (re)initialized.

Signed-off-by: Michal Hocko 
---

Sorry for spamming. I have screwed up ammending the previous version.

 mm/page_alloc.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e097f336126..52e54d16662a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
+   struct zone *zone;
+   struct zoneref *z;
 
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
 
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+
+   pr_info("node[%d] zonelist: ", pgdat->node_id);
+	for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
+   pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
+   pr_cont("\n");
 }
 
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
-- 
2.20.1



[PATCH v2 2/2] mm: be more verbose about zonelist initialization

2019-02-13 Thread Michal Hocko
From: Michal Hocko 

We have seen several bugs where zonelists have not been initialized
properly and it is not really straightforward to track those bugs down.
One way to help a bit at least is to dump zonelists of each node when
they are (re)initialized.

Signed-off-by: Michal Hocko 
---
 mm/page_alloc.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2e097f336126..02c843f0db4f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5234,6 +5234,7 @@ static void build_zonelists(pg_data_t *pgdat)
int node, load, nr_nodes = 0;
nodemask_t used_mask;
int local_node, prev_node;
+   struct zone *zone;
 
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
@@ -5259,6 +5260,11 @@ static void build_zonelists(pg_data_t *pgdat)
 
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+
+   pr_info("node[%d] zonelist: ", pgdat->node_id);
+	for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
+   pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
+   pr_cont("\n");
 }
 
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
-- 
2.20.1



Re: [PATCH-tip 00/22] locking/rwsem: Rework rwsem-xadd & enable new rwsem features

2019-02-13 Thread Chen Rong
Hi all,

Kernel test robot reported a will-it-scale.per_thread_ops -64.1% regression on 
IVB-desktop for v4.20-rc1.
The first bad commit is: 9bc8039e715da3b53dbac89525323a9f2f69b7b5, Yang Shi 
: mm: brk: downgrade mmap_sem to read when shrinking
(https://lists.01.org/pipermail/lkp/2018-November/009335.html).

=
compiler/cpufreq_governor/kconfig/mode/nr_task/rootfs/tbox_group/test/testcase/ucode:
  
gcc-7/performance/x86_64-rhel-7.2/thread/100%/debian-x86_64-2018-04-03.cgz/lkp-ivb-d01/brk1/will-it-scale/0x20

commit: 
  85a06835f6 ("mm: mremap: downgrade mmap_sem to read when shrinking")
  9bc8039e71 ("mm: brk: downgrade mmap_sem to read when shrinking")

  85a06835f6f1ba79            9bc8039e715da3b53dbac89525
  ----------------            --------------------------
           %stddev     %change          %stddev
               \           |                \
      196250 ±  8%     -64.1%      70494         will-it-scale.per_thread_ops
      127330 ± 19%     -98.0%       2525 ± 24%   will-it-scale.time.involuntary_context_switches
      727.50 ±  2%     -77.0%     167.25         will-it-scale.time.percent_of_cpu_this_job_got
        2141 ±  2%     -77.6%     479.12         will-it-scale.time.system_time
       50.48 ±  7%     -48.5%      25.98         will-it-scale.time.user_time
    34925294 ± 18%    +270.3%  1.293e+08 ±  4%   will-it-scale.time.voluntary_context_switches
     1570007 ±  8%     -64.1%     563958         will-it-scale.workload
        6435 ±  2%      -6.4%       6024         proc-vmstat.nr_shmem
        1298 ± 16%     -44.5%     721.00 ± 18%   proc-vmstat.pgactivate
        2341           +16.4%       2724         slabinfo.kmalloc-96.active_objs
        2341           +16.4%       2724         slabinfo.kmalloc-96.num_objs
        6346 ±150%     -87.8%     776.25 ±  9%   softirqs.NET_RX
      160107 ±  8%    +151.9%     403273         softirqs.SCHED
     1097999           -13.0%     955526         softirqs.TIMER
        5.50 ±  9%     -81.8%       1.00         vmstat.procs.r
      230700 ± 19%    +269.9%     853292 ±  4%   vmstat.system.cs
       26706 ±  3%     +15.7%      30910 ±  5%   vmstat.system.in
       11.24 ± 23%      +72.2      83.39         mpstat.cpu.idle%
        0.00 ±131%       +0.0       0.04 ± 99%   mpstat.cpu.iowait%
       86.32 ±  2%      -70.8      15.54         mpstat.cpu.sys%
        2.44 ±  7%       -1.4       1.04 ±  8%   mpstat.cpu.usr%
    20610709 ± 15%   +2376.0%  5.103e+08 ± 34%   cpuidle.C1.time
     3233399 ±  8%    +241.5%   11042785 ± 25%   cpuidle.C1.usage
    36172040 ±  6%    +931.3%   3.73e+08 ± 15%   cpuidle.C1E.time
      783605 ±  4%    +548.7%    5083041 ± 18%   cpuidle.C1E.usage
    28753819 ± 39%   +1054.5%  3.319e+08 ± 49%   cpuidle.C3.time
      283912 ± 25%    +688.4%    2238225 ± 34%   cpuidle.C3.usage
   1.507e+08 ± 47%    +292.3%  5.913e+08 ± 28%   cpuidle.C6.time
      339861 ± 37%    +549.7%    2208222 ± 24%   cpuidle.C6.usage
     2709719 ±  5%    +824.2%   25043444         cpuidle.POLL.time
    28602864 ± 18%    +173.7%   78276116 ± 10%   cpuidle.POLL.usage


We found that the patchset could fix the regression.

tests: 1
testcase/path_params/tbox_group/run: 
will-it-scale/performance-thread-100%-brk1-ucode=0x20/lkp-ivb-d01

commit: 
  85a06835f6 ("mm: mremap: downgrade mmap_sem to read when shrinking")
  fb835fe7f0 ("locking/rwsem: Ensure an RT task will not spin on reader")

  85a06835f6f1ba79            fb835fe7f0adbd7c2c074b98ec
  ----------------            --------------------------
           %stddev      change          %stddev
               \           |                \
      120736 ± 22%        56%      188019 ±  6%   will-it-scale.time.involuntary_context_switches
        2126 ±  3%         4%        2215         will-it-scale.time.system_time
         722 ±  3%         4%         752         will-it-scale.time.percent_of_cpu_this_job_got
    36256485 ± 27%       -35%    23682989 ±  3%   will-it-scale.time.voluntary_context_switches
        3151 ±  9%        11%        3504         turbostat.Avg_MHz
      229285 ± 32%       -30%      160660 ±  3%   vmstat.system.cs
      120736 ± 22%        56%      188019 ±  6%   time.involuntary_context_switches
        2126 ±  3%         4%        2215         time.system_time
         722 ±  3%         4%         752         time.percent_of_cpu_this_job_got
    36256485 ± 27%       -35%    23682989 ±  3%   time.voluntary_context_switches
          23             643%         171 ±  3%   proc-vmstat.nr_zone_inactive_file
          23             643%         171 ±  3%   proc-vmstat.nr_inactive_file
        3664              12%        4121         proc-vmstat.nr_kernel_stack
        6392               6%        6785         proc-vmstat.nr_slab_unreclaimable
        9991                        10176         proc-vmstat.nr_slab_reclaimable
       63938                        62394         proc-vmstat.nr_zone_active_anon
       63938                        62394         proc-vmstat.nr_active_anon
      386388 ±  9%        -6%      362272         proc-vmstat.pgfree
      368296 ±  9%       -10%      333074

Re: [PATCH v2 2/2] locking/rwsem: Optimize down_read_trylock()

2019-02-13 Thread Ingo Molnar


* Waiman Long  wrote:

> I looked at the assembly code in arch/x86/include/asm/rwsem.h. For both
> trylocks (read & write), the count is read first before attempting to
> lock it. We did the same for all trylock functions in other locks.
> Depending on how the trylock is used and how contended the lock is, it
> may help or hurt performance. Changing down_read_trylock to do an
> unconditional cmpxchg will change the performance profile of existing
> code. So I would prefer keeping the current code.
> 
> I do notice now that the generic down_write_trylock() code is doing an
> unconditional compxchg. So I wonder if we should change it to read the
> lock first like other trylocks or just leave it as it is.

No, I think we should instead move the other trylocks to the 
try-for-ownership model as well, like Linus suggested.

That's the general assumption we make in locking primitives, that we 
optimize for the common, expected case - which would be that the trylock 
succeeds, and I don't see why trylock primitives should be different.

In fact I can see more ways for read-for-sharing to perform suboptimally 
on larger systems.

Thanks,

Ingo


Re: [PATCH 01/12] mfd/sm501: depend on HAS_DMA

2019-02-13 Thread Lee Jones
On Mon, 11 Feb 2019, Christoph Hellwig wrote:

> Currently the sm501 mfd driver can be compiled without any dependencies,
> but through the use of dma_declare_coherent it really depends on
> having DMA and iomem support.  Normally we don't explicitly require DMA
> support as we have stubs for it if on UML, but in this case the driver
> selects support for dma_declare_coherent and thus also requires
> memmap support.  Guard this by an explicit dependency.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/mfd/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
> index f461460a2aeb..f15f6489803d 100644
> --- a/drivers/mfd/Kconfig
> +++ b/drivers/mfd/Kconfig
> @@ -1066,6 +1066,7 @@ config MFD_SI476X_CORE
>  
>  config MFD_SM501
>   tristate "Silicon Motion SM501"
> + depends on HAS_DMA
>---help---
> This is the core driver for the Silicon Motion SM501 multimedia
> companion chip. This device is a multifunction device which may

I would normally have taken this, but I fear it will conflict with
[PATCH 06/12].  For that reason, just take my:

  Acked-by: Lee Jones 

-- 
Lee Jones [李琼斯]
Linaro Services Technical Lead
Linaro.org │ Open source software for ARM SoCs
Follow Linaro: Facebook | Twitter | Blog


Re: [PATCH 06/12] dma-mapping: improve selection of dma_declare_coherent availability

2019-02-13 Thread Lee Jones
On Mon, 11 Feb 2019, Christoph Hellwig wrote:

> This API is primarily used through DT entries, but two architectures
> and two drivers call it directly.  So instead of selecting the config
> symbol for random architectures pull it in implicitly for the actual
> users.  Also rename the Kconfig option to describe the feature better.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/arc/Kconfig| 1 -
>  arch/arm/Kconfig| 2 +-
>  arch/arm64/Kconfig  | 1 -
>  arch/csky/Kconfig   | 1 -
>  arch/mips/Kconfig   | 1 -
>  arch/riscv/Kconfig  | 1 -
>  arch/sh/Kconfig | 2 +-
>  arch/unicore32/Kconfig  | 1 -
>  arch/x86/Kconfig| 1 -

>  drivers/mfd/Kconfig | 2 ++

If everyone else is happy with these changes, then so am I.

  Acked-by: Lee Jones 

>  drivers/of/Kconfig  | 3 ++-
>  include/linux/device.h  | 2 +-
>  include/linux/dma-mapping.h | 8 
>  kernel/dma/Kconfig  | 2 +-
>  kernel/dma/Makefile | 2 +-
>  15 files changed, 13 insertions(+), 17 deletions(-)

-- 
Lee Jones [李琼斯]
Linaro Services Technical Lead
Linaro.org │ Open source software for ARM SoCs
Follow Linaro: Facebook | Twitter | Blog


[PATCH 32/32] powerpc/dma: trim the fat from

2019-02-13 Thread Christoph Hellwig
There is no need to provide anything but get_arch_dma_ops to
<linux/dma-mapping.h>.  Move the remaining declarations to <asm/iommu.h>
and drop all the includes.

Signed-off-by: Christoph Hellwig 
Tested-by: Christian Zigotzky 
---
 arch/powerpc/include/asm/dma-mapping.h| 29 ---
 arch/powerpc/include/asm/iommu.h  | 10 +++
 arch/powerpc/platforms/44x/ppc476.c   |  1 +
 arch/powerpc/platforms/85xx/corenet_generic.c |  1 +
 arch/powerpc/platforms/85xx/qemu_e500.c   |  1 +
 arch/powerpc/sysdev/fsl_pci.c |  1 +
 6 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/include/asm/dma-mapping.h 
b/arch/powerpc/include/asm/dma-mapping.h
index a59c42879194..565d6f74b189 100644
--- a/arch/powerpc/include/asm/dma-mapping.h
+++ b/arch/powerpc/include/asm/dma-mapping.h
@@ -1,37 +1,9 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 /*
  * Copyright (C) 2004 IBM
- *
- * Implements the generic device dma API for powerpc.
- * the pci and vio busses
  */
 #ifndef _ASM_DMA_MAPPING_H
 #define _ASM_DMA_MAPPING_H
-#ifdef __KERNEL__
-
-#include 
-#include 
-/* need struct page definitions */
-#include 
-#include 
-#include 
-#include 
-#include 
-
-static inline unsigned long device_to_mask(struct device *dev)
-{
-   if (dev->dma_mask && *dev->dma_mask)
-   return *dev->dma_mask;
-   /* Assume devices without mask can take 32 bit addresses */
-   return 0xul;
-}
-
-/*
- * Available generic sets of operations
- */
-#ifdef CONFIG_PPC64
-extern const struct dma_map_ops dma_iommu_ops;
-#endif
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
 {
@@ -43,5 +15,4 @@ static inline const struct dma_map_ops 
*get_arch_dma_ops(struct bus_type *bus)
return NULL;
 }
 
-#endif /* __KERNEL__ */
 #endif /* _ASM_DMA_MAPPING_H */
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 6f00a892ebdf..0ac52392ed99 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -325,5 +325,15 @@ extern bool iommu_fixed_is_weak;
 #define iommu_fixed_is_weak false
 #endif
 
+extern const struct dma_map_ops dma_iommu_ops;
+
+static inline unsigned long device_to_mask(struct device *dev)
+{
+   if (dev->dma_mask && *dev->dma_mask)
+   return *dev->dma_mask;
+   /* Assume devices without mask can take 32 bit addresses */
+   return 0xul;
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_IOMMU_H */
diff --git a/arch/powerpc/platforms/44x/ppc476.c 
b/arch/powerpc/platforms/44x/ppc476.c
index e55933f9cd55..a5e61e5c16e2 100644
--- a/arch/powerpc/platforms/44x/ppc476.c
+++ b/arch/powerpc/platforms/44x/ppc476.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c 
b/arch/powerpc/platforms/85xx/corenet_generic.c
index 808da1e9c0a7..785e9641220d 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
diff --git a/arch/powerpc/platforms/85xx/qemu_e500.c 
b/arch/powerpc/platforms/85xx/qemu_e500.c
index 27631c607f3d..c52c8f9e8385 100644
--- a/arch/powerpc/platforms/85xx/qemu_e500.c
+++ b/arch/powerpc/platforms/85xx/qemu_e500.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "smp.h"
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index a04c6dde6ed0..f49aec251a5a 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
-- 
2.20.1