[PATCH 1/6] mm: change locked_vm's type from unsigned long to atomic64_t

2019-04-02 Thread Daniel Jordan
Taking and dropping mmap_sem to modify a single counter, locked_vm, is
overkill when the counter could be synchronized separately.

Make mmap_sem a little less coarse by changing locked_vm to an atomic,
the 64-bit variety to avoid issues with overflow on 32-bit systems.

Signed-off-by: Daniel Jordan 
Cc: Alan Tull 
Cc: Alexey Kardashevskiy 
Cc: Alex Williamson 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Davidlohr Bueso 
Cc: Michael Ellerman 
Cc: Moritz Fischer 
Cc: Paul Mackerras 
Cc: Wu Hao 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
---
 arch/powerpc/kvm/book3s_64_vio.c| 14 --
 arch/powerpc/mm/mmu_context_iommu.c | 15 ---
 drivers/fpga/dfl-afu-dma-region.c   | 18 ++
 drivers/vfio/vfio_iommu_spapr_tce.c | 17 +
 drivers/vfio/vfio_iommu_type1.c | 10 ++
 fs/proc/task_mmu.c  |  2 +-
 include/linux/mm_types.h|  2 +-
 kernel/fork.c   |  2 +-
 mm/debug.c  |  5 +++--
 mm/mlock.c  |  4 ++--
 mm/mmap.c   | 18 +-
 mm/mremap.c |  6 +++---
 12 files changed, 61 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..e7fdb6d10eeb 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -59,32 +59,34 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
 {
long ret = 0;
+   s64 locked_vm;
 
if (!current || !current->mm)
return ret; /* process exited */
 
down_write(&current->mm->mmap_sem);
 
+   locked_vm = atomic64_read(&current->mm->locked_vm);
if (inc) {
unsigned long locked, lock_limit;
 
-   locked = current->mm->locked_vm + stt_pages;
+   locked = locked_vm + stt_pages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
-   current->mm->locked_vm += stt_pages;
+   atomic64_add(stt_pages, &current->mm->locked_vm);
} else {
-   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-   stt_pages = current->mm->locked_vm;
+   if (WARN_ON_ONCE(stt_pages > locked_vm))
+   stt_pages = locked_vm;
 
-   current->mm->locked_vm -= stt_pages;
+   atomic64_sub(stt_pages, &current->mm->locked_vm);
}
 
pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
inc ? '+' : '-',
stt_pages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
+   atomic64_read(&current->mm->locked_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK),
ret ? " - exceeded" : "");
 
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index e7a9c4f6bfca..8038ac24a312 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -55,30 +55,31 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
unsigned long npages, bool incr)
 {
long ret = 0, locked, lock_limit;
+   s64 locked_vm;
 
if (!npages)
return 0;
 
down_write(&mm->mmap_sem);
-
+   locked_vm = atomic64_read(&mm->locked_vm);
if (incr) {
-   locked = mm->locked_vm + npages;
+   locked = locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
if (locked > lock_limit && !capable(CAP_IPC_LOCK))
ret = -ENOMEM;
else
-   mm->locked_vm += npages;
+   atomic64_add(npages, &mm->locked_vm);
} else {
-   if (WARN_ON_ONCE(npages > mm->locked_vm))
-   npages = mm->locked_vm;
-   mm->locked_vm -= npages;
+   if (WARN_ON_ONCE(npages > locked_vm))
+   npages = locked_vm;
+   atomic64_sub(npages, &mm->locked_vm);
}
 
pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
current ? current->pid : 0,
incr ? '+' : '-',
npages << PAGE_SHIFT,
-   mm->locked_vm << PAGE_SHIFT,
+   atomic64_read(&mm->locked_vm) << PAGE_SHIFT,

[PATCH 5/6] powerpc/mmu: drop mmap_sem now that locked_vm is atomic

2019-04-02 Thread Daniel Jordan
With locked_vm now an atomic, there is no need to take mmap_sem as
writer.  Delete and refactor accordingly.

Signed-off-by: Daniel Jordan 
Cc: Alexey Kardashevskiy 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Davidlohr Bueso 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: 
Cc: 
Cc: 
---
 arch/powerpc/mm/mmu_context_iommu.c | 27 +++
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 8038ac24a312..a4ef22b67c07 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -54,34 +54,29 @@ struct mm_iommu_table_group_mem_t {
 static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
unsigned long npages, bool incr)
 {
-   long ret = 0, locked, lock_limit;
+   long ret = 0;
+   unsigned long lock_limit;
s64 locked_vm;
 
if (!npages)
return 0;
 
-   down_write(&mm->mmap_sem);
-   locked_vm = atomic64_read(&mm->locked_vm);
if (incr) {
-   locked = locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   locked_vm = atomic64_add_return(npages, &mm->locked_vm);
+   if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
-   else
-   atomic64_add(npages, &mm->locked_vm);
+   atomic64_sub(npages, &mm->locked_vm);
+   }
} else {
-   if (WARN_ON_ONCE(npages > locked_vm))
-   npages = locked_vm;
-   atomic64_sub(npages, &mm->locked_vm);
+   locked_vm = atomic64_sub_return(npages, &mm->locked_vm);
+   WARN_ON_ONCE(locked_vm < 0);
}
 
-   pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-   current ? current->pid : 0,
-   incr ? '+' : '-',
-   npages << PAGE_SHIFT,
-   atomic64_read(&mm->locked_vm) << PAGE_SHIFT,
+   pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%lu %lld/%lu\n",
+   current ? current->pid : 0, incr ? '+' : '-',
+   npages << PAGE_SHIFT, locked_vm << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
-   up_write(&mm->mmap_sem);
 
return ret;
 }
-- 
2.21.0



[PATCH 6/6] kvm/book3s: drop mmap_sem now that locked_vm is atomic

2019-04-02 Thread Daniel Jordan
With locked_vm now an atomic, there is no need to take mmap_sem as
writer.  Delete and refactor accordingly.

Signed-off-by: Daniel Jordan 
Cc: Alexey Kardashevskiy 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Davidlohr Bueso 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: 
Cc: 
Cc: 
Cc: 
---
 arch/powerpc/kvm/book3s_64_vio.c | 34 +++-
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index e7fdb6d10eeb..8e034c3a5d25 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -56,7 +56,7 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
+static long kvmppc_account_memlimit(unsigned long pages, bool inc)
 {
long ret = 0;
s64 locked_vm;
@@ -64,33 +64,23 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
if (!current || !current->mm)
return ret; /* process exited */
 
-   down_write(&current->mm->mmap_sem);
-
-   locked_vm = atomic64_read(&current->mm->locked_vm);
if (inc) {
-   unsigned long locked, lock_limit;
+   unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-   locked = locked_vm + stt_pages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   locked_vm = atomic64_add_return(pages, &current->mm->locked_vm);
+   if (locked_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
-   else
-   atomic64_add(stt_pages, &current->mm->locked_vm);
+   atomic64_sub(pages, &current->mm->locked_vm);
+   }
} else {
-   if (WARN_ON_ONCE(stt_pages > locked_vm))
-   stt_pages = locked_vm;
-
-   atomic64_sub(stt_pages, &current->mm->locked_vm);
+   locked_vm = atomic64_sub_return(pages, &current->mm->locked_vm);
+   WARN_ON_ONCE(locked_vm < 0);
}
 
-   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-   inc ? '+' : '-',
-   stt_pages << PAGE_SHIFT,
-   atomic64_read(&current->mm->locked_vm) << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(&current->mm->mmap_sem);
+   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%lu %lld/%lu%s\n", current->pid,
+   inc ? '+' : '-', pages << PAGE_SHIFT,
+   locked_vm << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
 
return ret;
 }
-- 
2.21.0



[PATCH 0/6] convert locked_vm from unsigned long to atomic64_t

2019-04-02 Thread Daniel Jordan
Hi,

From patch 1:

  Taking and dropping mmap_sem to modify a single counter, locked_vm, is
  overkill when the counter could be synchronized separately.
  
  Make mmap_sem a little less coarse by changing locked_vm to an atomic,
  the 64-bit variety to avoid issues with overflow on 32-bit systems.

This is a more conservative alternative to [1] with no user-visible
effects.  Thanks to Alexey Kardashevskiy for pointing out the racy
atomics and to Alex Williamson, Christoph Lameter, Ira Weiny, and Jason
Gunthorpe for their comments on [1].

Davidlohr Bueso recently did a similar conversion for pinned_vm[2].

Testing
 1. passes LTP mlock[all], munlock[all], fork, mmap, and mremap tests in an
x86 kvm guest
 2. a VFIO-enabled x86 kvm guest shows the same VmLck in
/proc/pid/status before and after this change
 3. cross-compiles on powerpc

The series is based on v5.1-rc3.  Please consider for 5.2.

Daniel

[1] 
https://lore.kernel.org/linux-mm/20190211224437.25267-1-daniel.m.jor...@oracle.com/
[2] https://lore.kernel.org/linux-mm/20190206175920.31082-1-d...@stgolabs.net/

Daniel Jordan (6):
  mm: change locked_vm's type from unsigned long to atomic64_t
  vfio/type1: drop mmap_sem now that locked_vm is atomic
  vfio/spapr_tce: drop mmap_sem now that locked_vm is atomic
  fpga/dlf/afu: drop mmap_sem now that locked_vm is atomic
  powerpc/mmu: drop mmap_sem now that locked_vm is atomic
  kvm/book3s: drop mmap_sem now that locked_vm is atomic

 arch/powerpc/kvm/book3s_64_vio.c| 34 ++--
 arch/powerpc/mm/mmu_context_iommu.c | 28 +---
 drivers/fpga/dfl-afu-dma-region.c   | 40 -
 drivers/vfio/vfio_iommu_spapr_tce.c | 37 --
 drivers/vfio/vfio_iommu_type1.c | 31 +-
 fs/proc/task_mmu.c  |  2 +-
 include/linux/mm_types.h|  2 +-
 kernel/fork.c   |  2 +-
 mm/debug.c  |  5 ++--
 mm/mlock.c  |  4 +--
 mm/mmap.c   | 18 ++---
 mm/mremap.c |  6 ++---
 12 files changed, 89 insertions(+), 120 deletions(-)


base-commit: 79a3aaa7b82e3106be97842dedfd8429248896e6
-- 
2.21.0



Re: [PATCH 1/6] mm: change locked_vm's type from unsigned long to atomic64_t

2019-04-03 Thread Daniel Jordan
On Tue, Apr 02, 2019 at 03:04:24PM -0700, Andrew Morton wrote:
> On Tue,  2 Apr 2019 16:41:53 -0400 Daniel Jordan  wrote:
> >  static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
> >  {
> > long ret = 0;
> > +   s64 locked_vm;
> >  
> > if (!current || !current->mm)
> > return ret; /* process exited */
> >  
> > down_write(&current->mm->mmap_sem);
> >  
> > +   locked_vm = atomic64_read(&current->mm->locked_vm);
> > if (inc) {
> > unsigned long locked, lock_limit;
> >  
> > -   locked = current->mm->locked_vm + stt_pages;
> > +   locked = locked_vm + stt_pages;
> > lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > if (locked > lock_limit && !capable(CAP_IPC_LOCK))
> > ret = -ENOMEM;
> > else
> > -   current->mm->locked_vm += stt_pages;
> > +   atomic64_add(stt_pages, &current->mm->locked_vm);
> > } else {
> > -   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
> > -   stt_pages = current->mm->locked_vm;
> > +   if (WARN_ON_ONCE(stt_pages > locked_vm))
> > +   stt_pages = locked_vm;
> >  
> > -   current->mm->locked_vm -= stt_pages;
> > +   atomic64_sub(stt_pages, &current->mm->locked_vm);
> > }
> 
> With the current code, current->mm->locked_vm cannot go negative. 
> After the patch, it can go negative.  If someone else decreased
> current->mm->locked_vm between this function's atomic64_read() and
> atomic64_sub().
> 
> I guess this is a can't-happen in this case because the racing code
> which performed the modification would have taken it negative anyway.
> 
> But this all makes me rather queasy.

mmap_sem is still held in this patch, so updates to locked_vm are still
serialized and I don't think what you describe can happen.  A later patch
removes mmap_sem, of course, but it also rewrites the code to do something
different.  This first patch is just a mechanical type change from unsigned
long to atomic64_t.
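For reference, the mm_types.h side of that mechanical change isn't visible in the
quote above; it boils down to the field's type, with the open-coded arithmetic
switching to the atomic64 helpers.  A sketch (the wrapper name is made up, not part
of the patch):

        /*
         * Sketch of the mechanical conversion, not the literal hunk: the field
         *      unsigned long locked_vm;
         * becomes
         *      atomic64_t locked_vm;
         * and updates that used to be done under mmap_sem become atomic ops:
         */
        static inline void mm_locked_vm_add(struct mm_struct *mm, long npages)
        {
                atomic64_add(npages, &mm->locked_vm);   /* was: mm->locked_vm += npages */
        }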

So...does this alleviate your symptoms?

> Also, we didn't remove any down_write(mmap_sem)s from core code so I'm
> thinking that the benefit of removing a few mmap_sem-takings from a few
> obscure drivers (sorry ;)) is pretty small.

Not sure about the other drivers, but vfio type1 isn't obscure.  We use it
extensively in our cloud, and from Andrea's __GFP_THISNODE thread a few months
back it seems Red Hat also uses it:

  https://lore.kernel.org/linux-mm/20180820032204.9591-3-aarca...@redhat.com/

> Also, the argument for switching 32-bit arches to a 64-bit counter was
> suspiciously vague.  What overflow issues?  Or are we just being lazy?

If user-controlled values are used to increase locked_vm, multiple threads
doing it at once on a 32-bit system could theoretically cause overflow, so in
the absence of atomic overflow checking the 64-bit counter on 32-bit systems is
defensive programming.

I wouldn't have thought to do it, but Jason Gunthorpe raised the same issue in
the pinned_vm series:

  https://lore.kernel.org/linux-mm/20190115205311.gd22...@mellanox.com/

I'm fine with changing it to atomic_long_t if the scenario is too theoretical
for people.


Anyway, thanks for looking at this.


Re: [PATCH 1/6] mm: change locked_vm's type from unsigned long to atomic64_t

2019-04-03 Thread Daniel Jordan
On Wed, Apr 03, 2019 at 06:46:07AM +0200, Christophe Leroy wrote:
> 
> 
> Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
> > Taking and dropping mmap_sem to modify a single counter, locked_vm, is
> > overkill when the counter could be synchronized separately.
> > 
> > Make mmap_sem a little less coarse by changing locked_vm to an atomic,
> > the 64-bit variety to avoid issues with overflow on 32-bit systems.
> 
> Can you elaborate on the above ? Previously it was 'unsigned long', what
> were the issues ?

Sure, I responded to this in another thread from this series.

> If there was such issues, shouldn't there be a first patch
> moving it from unsigned long to u64 before this atomic64_t change ? Or at
> least it should be clearly explain here what the issues are and how
> switching to a 64 bit counter fixes them.

Yes, I can explain the motivation in the next version.


Re: [PATCH 1/6] mm: change locked_vm's type from unsigned long to atomic64_t

2019-04-03 Thread Daniel Jordan
On Tue, Apr 02, 2019 at 04:43:57PM -0700, Davidlohr Bueso wrote:
> On Tue, 02 Apr 2019, Andrew Morton wrote:
> 
> > Also, we didn't remove any down_write(mmap_sem)s from core code so I'm
> > thinking that the benefit of removing a few mmap_sem-takings from a few
> > obscure drivers (sorry ;)) is pretty small.
> 
> afaik porting the remaining incorrect users of locked_vm to pinned_vm was
> the next step before this one, which made converting locked_vm to atomic
> hardly worth it. Daniel?
 
Right, as you know I tried converting those incorrect users first, but there were
concerns about making user-visible changes to how RLIMIT_MEMLOCK, pinned_vm, and
locked_vm interact before the accounting problem among the three is solved.

To my knowledge no one has a solution for that, so in the meantime I'm taking
the incremental step of getting rid of mmap_sem for locked_vm users.  The
locked_vm -> pinned_vm conversion can happen later.


Re: [PATCH 5/6] powerpc/mmu: drop mmap_sem now that locked_vm is atomic

2019-04-03 Thread Daniel Jordan
On Wed, Apr 03, 2019 at 06:58:45AM +0200, Christophe Leroy wrote:
> Le 02/04/2019 à 22:41, Daniel Jordan a écrit :
> > With locked_vm now an atomic, there is no need to take mmap_sem as
> > writer.  Delete and refactor accordingly.
> 
> Could you please detail the change ?

Ok, I'll be more specific in the next version, using some of your language in
fact.  :)

> It looks like this is not the only
> change. I'm wondering what the consequences are.
> 
> Before we did:
> - lock
> - calculate future value
> - check the future value is acceptable
> - update value if future value acceptable
> - return error if future value non acceptable
> - unlock
> 
> Now we do:
> - atomic update with future (possibly too high) value
> - check the new value is acceptable
> - atomic update back with older value if new value not acceptable and return
> error
> 
> So if a concurrent action wants to increase locked_vm with an acceptable
> step while another one has temporarily set it too high, it will now fail.
> 
> I think we should keep the previous approach and do a cmpxchg after
> validating the new value.

That's a good idea, and especially worth doing considering that an arbitrary
number of threads that charge a low amount of locked_vm can fail just because
one thread charges lots of it.
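For concreteness, a rough sketch of that approach (names made up, not the eventual
patch): validate the prospective value first, then publish it with a
compare-and-exchange, so one thread's temporarily-too-high attempt can't fail
another thread's valid charge.

        static int charge_locked_vm(struct mm_struct *mm, unsigned long npages)
        {
                unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
                s64 old, new;

                old = atomic64_read(&mm->locked_vm);
                do {
                        new = old + npages;
                        if (new > limit && !capable(CAP_IPC_LOCK))
                                return -ENOMEM;
                } while (!atomic64_try_cmpxchg(&mm->locked_vm, &old, new));

                return 0;
        }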

pinned_vm appears to be broken the same way, so I can fix it too unless someone
beats me to it.


Re: [PATCH 0/6] convert locked_vm from unsigned long to atomic64_t

2019-04-03 Thread Daniel Jordan
On Wed, Apr 03, 2019 at 08:51:13AM -0400, Steven Sistare wrote:
> On 4/2/2019 4:41 PM, Daniel Jordan wrote:
> > [1] 
> > https://lore.kernel.org/linux-mm/20190211224437.25267-1-daniel.m.jor...@oracle.com/
> 
>   You could clean all 6 patches up nicely with a common subroutine that
> increases locked_vm subject to the rlimit.  Pass a bool arg that is true if
> the  limit should be enforced, !dma->lock_cap for one call site, and
> !capable(CAP_IPC_LOCK) for the rest.  Push the warnings and debug statements
> to the subroutine as well.  One patch could refactor, and a second could
> change the locking method.

Yes, I tried writing such a subroutine for [1] but didn't end up including it.  The
devil was in the details, but with the cmpxchg business it's now more worthwhile to
iron those details out.  I'll give it a try.
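A rough idea of the shape such a helper could take (a sketch under the assumptions
in Steve's suggestion; the name and exact interface are placeholders):

        /* Charge or uncharge @npages to @mm->locked_vm; enforce the rlimit if asked. */
        static int account_locked_vm_pages(struct mm_struct *mm, long npages,
                                           bool inc, bool enforce)
        {
                unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
                s64 locked_vm;

                if (!inc) {
                        locked_vm = atomic64_sub_return(npages, &mm->locked_vm);
                        WARN_ON_ONCE(locked_vm < 0);
                        return 0;
                }

                locked_vm = atomic64_add_return(npages, &mm->locked_vm);
                if (enforce && locked_vm > limit) {
                        atomic64_sub(npages, &mm->locked_vm);
                        return -ENOMEM;
                }
                return 0;
        }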


Re: [PATCH 1/6] mm: change locked_vm's type from unsigned long to atomic64_t

2019-04-11 Thread Daniel Jordan
On Thu, Apr 11, 2019 at 10:55:43AM +0100, Mark Rutland wrote:
> On Thu, Apr 11, 2019 at 02:22:23PM +1000, Alexey Kardashevskiy wrote:
> > On 03/04/2019 07:41, Daniel Jordan wrote:
> 
> > > - dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
> > > + dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %lld/%lu%s\n", current->pid,
> > >   incr ? '+' : '-', npages << PAGE_SHIFT,
> > > - current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
> > > - ret ? "- exceeded" : "");
> > > + (s64)atomic64_read(&current->mm->locked_vm) << PAGE_SHIFT,
> > > + rlimit(RLIMIT_MEMLOCK), ret ? "- exceeded" : "");
> > 
> > 
> > 
> > atomic64_read() returns "long" which matches "%ld", why this change (and
> > similar below)? You did not do this in the two pr_debug()s above anyway.
> 
> Unfortunately, architectures return inconsistent types for atomic64 ops.
> 
> Some return long (e..g. powerpc), some return long long (e.g. arc), and
> some return s64 (e.g. x86).

Yes, Mark said it all, I'm just chiming in to confirm that's why I added the
cast.
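A minimal illustration of what the cast buys: it pins the printed type to s64 no
matter which flavor the architecture's atomic64_read() returns, so one format
specifier works everywhere.

        pr_debug("locked_vm %lld\n",
                 (s64)atomic64_read(&mm->locked_vm) << PAGE_SHIFT);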

Btw, thanks for doing this, Mark.


Re: [PATCH 1/6] mm: change locked_vm's type from unsigned long to atomic64_t

2019-04-22 Thread Daniel Jordan
On Tue, Apr 16, 2019 at 04:33:51PM -0700, Andrew Morton wrote:

Sorry for the delay, I was on vacation all last week.

> What's the status of this patchset, btw?
> 
> I have a note here that
> powerpc-mmu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch is to be
> updated.

Yes, the series needs a few updates.  v2 should appear in the next day or two.


[PATCH 2/5] vfio/spapr_tce: use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
pages"), locked and pinned pages are accounted separately.  The SPAPR
TCE VFIO IOMMU driver accounts pinned pages to locked_vm; use pinned_vm
instead.

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete.

Signed-off-by: Daniel Jordan 
---
 Documentation/vfio.txt  |  6 +--
 drivers/vfio/vfio_iommu_spapr_tce.c | 64 ++---
 2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index f1a4d3c3ba0b..fa37d65363f9 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -308,7 +308,7 @@ This implementation has some specifics:
currently there is no way to reduce the number of calls. In order to make
things faster, the map/unmap handling has been implemented in real mode
which provides an excellent performance which has limitations such as
-   inability to do locked pages accounting in real time.
+   inability to do pinned pages accounting in real time.
 
 4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
subtree that can be treated as a unit for the purposes of partitioning and
@@ -324,7 +324,7 @@ This implementation has some specifics:
returns the size and the start of the DMA window on the PCI bus.
 
VFIO_IOMMU_ENABLE
-   enables the container. The locked pages accounting
+   enables the container. The pinned pages accounting
is done at this point. This lets user first to know what
the DMA window is and adjust rlimit before doing any real job.
 
@@ -454,7 +454,7 @@ This implementation has some specifics:
 
PPC64 paravirtualized guests generate a lot of map/unmap requests,
and the handling of those includes pinning/unpinning pages and updating
-   mm::locked_vm counter to make sure we do not exceed the rlimit.
+   mm::pinned_vm counter to make sure we do not exceed the rlimit.
The v2 IOMMU splits accounting and pinning into separate operations:
 
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY/VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY ioctls
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index c424913324e3..f47e020dc5e4 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -34,9 +34,11 @@
 static void tce_iommu_detach_group(void *iommu_data,
struct iommu_group *iommu_group);
 
-static long try_increment_locked_vm(struct mm_struct *mm, long npages)
+static long try_increment_pinned_vm(struct mm_struct *mm, long npages)
 {
-   long ret = 0, locked, lock_limit;
+   long ret = 0;
+   s64 pinned;
+   unsigned long lock_limit;
 
if (WARN_ON_ONCE(!mm))
return -EPERM;
@@ -44,39 +46,33 @@ static long try_increment_locked_vm(struct mm_struct *mm, long npages)
if (!npages)
return 0;
 
-   down_write(&mm->mmap_sem);
-   locked = mm->locked_vm + npages;
+   pinned = atomic64_add_return(npages, &mm->pinned_vm);
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   if (pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
-   else
-   mm->locked_vm += npages;
+   atomic64_sub(npages, &mm->pinned_vm);
+   }
 
-   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
+   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%lu%s\n", current->pid,
npages << PAGE_SHIFT,
-   mm->locked_vm << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(&mm->mmap_sem);
+   atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
 
return ret;
 }
 
-static void decrement_locked_vm(struct mm_struct *mm, long npages)
+static void decrement_pinned_vm(struct mm_struct *mm, long npages)
 {
if (!mm || !npages)
return;
 
-   down_write(&mm->mmap_sem);
-   if (WARN_ON_ONCE(npages > mm->locked_vm))
-   npages = mm->locked_vm;
-   mm->locked_vm -= npages;
-   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
+   if (WARN_ON_ONCE(npages > atomic64_read(&mm->pinned_vm)))
+   npages = atomic64_read(&mm->pinned_vm);
+   atomic64_sub(npages, &mm->pinned_vm);
+   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%lu\n", current->pid,
npages << PAGE_SHIFT,
-   mm->locked_vm << PAGE_SHIFT,

[PATCH 0/5] use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
Hi,

This series converts users that account pinned pages with locked_vm to
account with pinned_vm instead, pinned_vm being the correct counter to
use.  It's based on a similar patch I posted recently[0].

The patches are based on rdma/for-next to build on Davidlohr Bueso's
recent conversion of pinned_vm to an atomic64_t[1].  Seems to make some
sense for these to be routed the same way, despite lack of rdma content?

All five of these places, and probably some of Davidlohr's conversions,
probably want to be collapsed into a common helper in the core mm for
accounting pinned pages.  I tried, and there are several details that
likely need discussion, so this can be done as a follow-on.

I'd appreciate a look at patch 5 especially since the accounting is
unusual no matter whether locked_vm or pinned_vm are used.

On powerpc, this was cross-compile tested only.

[0] http://lkml.kernel.org/r/20181105165558.11698-8-daniel.m.jor...@oracle.com
[1] http://lkml.kernel.org/r/20190206175920.31082-1-d...@stgolabs.net

Daniel Jordan (5):
  vfio/type1: use pinned_vm instead of locked_vm to account pinned pages
  vfio/spapr_tce: use pinned_vm instead of locked_vm to account pinned
pages
  fpga/dlf/afu: use pinned_vm instead of locked_vm to account pinned
pages
  powerpc/mmu: use pinned_vm instead of locked_vm to account pinned
pages
  kvm/book3s: use pinned_vm instead of locked_vm to account pinned pages

 Documentation/vfio.txt  |  6 +--
 arch/powerpc/kvm/book3s_64_vio.c| 35 +++-
 arch/powerpc/mm/mmu_context_iommu.c | 43 ++-
 drivers/fpga/dfl-afu-dma-region.c   | 50 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c | 64 ++---
 drivers/vfio/vfio_iommu_type1.c | 31 ++
 6 files changed, 104 insertions(+), 125 deletions(-)

-- 
2.20.1



[PATCH 4/5] powerpc/mmu: use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
pages"), locked and pinned pages are accounted separately.  The IOMMU
MMU helpers on powerpc account pinned pages to locked_vm; use pinned_vm
instead.

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete.

Signed-off-by: Daniel Jordan 
---
 arch/powerpc/mm/mmu_context_iommu.c | 43 ++---
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index a712a650a8b6..fdf670542847 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -40,36 +40,35 @@ struct mm_iommu_table_group_mem_t {
u64 dev_hpa;/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
+static long mm_iommu_adjust_pinned_vm(struct mm_struct *mm,
unsigned long npages, bool incr)
 {
-   long ret = 0, locked, lock_limit;
+   long ret = 0;
+   unsigned long lock_limit;
+   s64 pinned_vm;
 
if (!npages)
return 0;
 
-   down_write(&mm->mmap_sem);
-
if (incr) {
-   locked = mm->locked_vm + npages;
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   pinned_vm = atomic64_add_return(npages, &mm->pinned_vm);
+   if (pinned_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
-   else
-   mm->locked_vm += npages;
+   atomic64_sub(npages, &mm->pinned_vm);
+   }
} else {
-   if (WARN_ON_ONCE(npages > mm->locked_vm))
-   npages = mm->locked_vm;
-   mm->locked_vm -= npages;
+   pinned_vm = atomic64_read(&mm->pinned_vm);
+   if (WARN_ON_ONCE(npages > pinned_vm))
+   npages = pinned_vm;
+   atomic64_sub(npages, &mm->pinned_vm);
}
 
-   pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-   current ? current->pid : 0,
-   incr ? '+' : '-',
+   pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%lu %ld/%lu\n",
+   current ? current->pid : 0, incr ? '+' : '-',
npages << PAGE_SHIFT,
-   mm->locked_vm << PAGE_SHIFT,
+   atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
-   up_write(&mm->mmap_sem);
 
return ret;
 }
@@ -133,7 +132,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
struct mm_iommu_table_group_mem_t **pmem)
 {
struct mm_iommu_table_group_mem_t *mem;
-   long i, j, ret = 0, locked_entries = 0;
+   long i, j, ret = 0, pinned_entries = 0;
unsigned int pageshift;
unsigned long flags;
unsigned long cur_ua;
@@ -154,11 +153,11 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
}
 
if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-   ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+   ret = mm_iommu_adjust_pinned_vm(mm, entries, true);
if (ret)
goto unlock_exit;
 
-   locked_entries = entries;
+   pinned_entries = entries;
}
 
mem = kzalloc(sizeof(*mem), GFP_KERNEL);
@@ -252,8 +251,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
 
 unlock_exit:
-   if (locked_entries && ret)
-   mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+   if (pinned_entries && ret)
+   mm_iommu_adjust_pinned_vm(mm, pinned_entries, false);
 
mutex_unlock(&mem_list_mutex);
 
@@ -352,7 +351,7 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
mm_iommu_release(mem);
 
if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
-   mm_iommu_adjust_locked_vm(mm, entries, false);
+   mm_iommu_adjust_pinned_vm(mm, entries, false);
 
 unlock_exit:
mutex_unlock(&mem_list_mutex);
-- 
2.20.1



[PATCH 3/5] fpga/dlf/afu: use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
pages"), locked and pinned pages are accounted separately.  The FPGA AFU
driver accounts pinned pages to locked_vm; use pinned_vm instead.

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete.

Signed-off-by: Daniel Jordan 
---
 drivers/fpga/dfl-afu-dma-region.c | 50 ++-
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/drivers/fpga/dfl-afu-dma-region.c b/drivers/fpga/dfl-afu-dma-region.c
index e18a786fc943..a9a6b317fe2e 100644
--- a/drivers/fpga/dfl-afu-dma-region.c
+++ b/drivers/fpga/dfl-afu-dma-region.c
@@ -32,47 +32,43 @@ void afu_dma_region_init(struct dfl_feature_platform_data *pdata)
 }
 
 /**
- * afu_dma_adjust_locked_vm - adjust locked memory
+ * afu_dma_adjust_pinned_vm - adjust pinned memory
  * @dev: port device
  * @npages: number of pages
- * @incr: increase or decrease locked memory
  *
- * Increase or decrease the locked memory size with npages input.
+ * Increase or decrease the pinned memory size with npages input.
  *
  * Return 0 on success.
- * Return -ENOMEM if locked memory size is over the limit and no CAP_IPC_LOCK.
+ * Return -ENOMEM if pinned memory size is over the limit and no CAP_IPC_LOCK.
  */
-static int afu_dma_adjust_locked_vm(struct device *dev, long npages, bool incr)
+static int afu_dma_adjust_pinned_vm(struct device *dev, long pages)
 {
-   unsigned long locked, lock_limit;
+   unsigned long lock_limit;
+   s64 pinned_vm;
int ret = 0;
 
/* the task is exiting. */
-   if (!current->mm)
+   if (!current->mm || !pages)
return 0;
 
-   down_write(&current->mm->mmap_sem);
-
-   if (incr) {
-   locked = current->mm->locked_vm + npages;
+   if (pages > 0) {
lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   pinned_vm = atomic64_add_return(pages, &current->mm->pinned_vm);
+   if (pinned_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
-   else
-   current->mm->locked_vm += npages;
+   atomic64_sub(pages, &current->mm->pinned_vm);
+   }
} else {
-   if (WARN_ON_ONCE(npages > current->mm->locked_vm))
-   npages = current->mm->locked_vm;
-   current->mm->locked_vm -= npages;
+   pinned_vm = atomic64_read(&current->mm->pinned_vm);
+   if (WARN_ON_ONCE(pages > pinned_vm))
+   pages = pinned_vm;
+   atomic64_sub(pages, &current->mm->pinned_vm);
}
 
-   dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %ld/%ld%s\n", current->pid,
-   incr ? '+' : '-', npages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK),
-   ret ? "- exceeded" : "");
-
-   up_write(&current->mm->mmap_sem);
+   dev_dbg(dev, "[%d] RLIMIT_MEMLOCK %c%ld %lld/%lu%s\n", current->pid,
+   (pages > 0) ? '+' : '-', pages << PAGE_SHIFT,
+   (s64)atomic64_read(&current->mm->pinned_vm) << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK), ret ? "- exceeded" : "");
 
return ret;
 }
@@ -92,7 +88,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
struct device *dev = &pdata->dev->dev;
int ret, pinned;
 
-   ret = afu_dma_adjust_locked_vm(dev, npages, true);
+   ret = afu_dma_adjust_pinned_vm(dev, npages);
if (ret)
return ret;
 
@@ -121,7 +117,7 @@ static int afu_dma_pin_pages(struct dfl_feature_platform_data *pdata,
 free_pages:
kfree(region->pages);
 unlock_vm:
-   afu_dma_adjust_locked_vm(dev, npages, false);
+   afu_dma_adjust_pinned_vm(dev, -npages);
return ret;
 }
 
@@ -141,7 +137,7 @@ static void afu_dma_unpin_pages(struct dfl_feature_platform_data *pdata,
 
put_all_pages(region->pages, npages);
kfree(region->pages);
-   afu_dma_adjust_locked_vm(dev, npages, false);
+   afu_dma_adjust_pinned_vm(dev, -npages);
 
dev_dbg(dev, "%ld pages unpinned\n", npages);
 }
-- 
2.20.1



[PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
Beginning with bc3e53f682d9 ("mm: distinguish between mlocked and pinned
pages"), locked and pinned pages are accounted separately.  Type1
accounts pinned pages to locked_vm; use pinned_vm instead.

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete.

Signed-off-by: Daniel Jordan 
---
 drivers/vfio/vfio_iommu_type1.c | 31 ---
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 73652e21efec..a56cc341813f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -257,7 +257,8 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 {
struct mm_struct *mm;
-   int ret;
+   s64 pinned_vm;
+   int ret = 0;
 
if (!npage)
return 0;
@@ -266,24 +267,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
if (!mm)
return -ESRCH; /* process exited */
 
-   ret = down_write_killable(&mm->mmap_sem);
-   if (!ret) {
-   if (npage > 0) {
-   if (!dma->lock_cap) {
-   unsigned long limit;
-
-   limit = task_rlimit(dma->task,
-   RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+   pinned_vm = atomic64_add_return(npage, &mm->pinned_vm);
 
-   if (mm->locked_vm + npage > limit)
-   ret = -ENOMEM;
-   }
+   if (npage > 0 && !dma->lock_cap) {
+   unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
+  PAGE_SHIFT;
+   if (pinned_vm > limit) {
+   atomic64_sub(npage, &mm->pinned_vm);
+   ret = -ENOMEM;
}
-
-   if (!ret)
-   mm->locked_vm += npage;
-
-   up_write(&mm->mmap_sem);
}
 
if (async)
@@ -401,6 +393,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
long ret, pinned = 0, lock_acct = 0;
bool rsvd;
dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
+   atomic64_t *pinned_vm = &current->mm->pinned_vm;
 
/* This code path is only user initiated */
if (!current->mm)
@@ -418,7 +411,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 * pages are already counted against the user.
 */
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
-   if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
+   if (!dma->lock_cap && atomic64_read(pinned_vm) + 1 > limit) {
put_pfn(*pfn_base, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
limit << PAGE_SHIFT);
@@ -445,7 +438,7 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 
if (!rsvd && !vfio_find_vpfn(dma, iova)) {
if (!dma->lock_cap &&
-   current->mm->locked_vm + lock_acct + 1 > limit) {
+   atomic64_read(pinned_vm) + lock_acct + 1 > limit) {
put_pfn(pfn, dma->prot);
pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
__func__, limit << PAGE_SHIFT);
-- 
2.20.1



[PATCH 5/5] kvm/book3s: use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
Memory used for TCE tables in kvm_vm_ioctl_create_spapr_tce is currently
accounted to locked_vm because it stays resident and its allocation is
directly triggered from userspace as explained in f8626985c7c2 ("KVM:
PPC: Account TCE-containing pages in locked_vm").

However, since the memory comes straight from the page allocator (and to
a lesser extent unreclaimable slab) and is effectively pinned, it should
be accounted with pinned_vm (see bc3e53f682d9 ("mm: distinguish between
mlocked and pinned pages")).

pinned_vm recently became atomic and so no longer relies on mmap_sem
held as writer: delete.

Signed-off-by: Daniel Jordan 
---
 arch/powerpc/kvm/book3s_64_vio.c | 35 ++--
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 532ab79734c7..2f8d7c051e4e 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -56,39 +56,34 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
+static long kvmppc_account_memlimit(unsigned long pages, bool inc)
 {
long ret = 0;
+   s64 pinned_vm;
 
if (!current || !current->mm)
return ret; /* process exited */
 
-   down_write(&current->mm->mmap_sem);
-
if (inc) {
-   unsigned long locked, lock_limit;
+   unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
 
-   locked = current->mm->locked_vm + stt_pages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+   pinned_vm = atomic64_add_return(pages, &current->mm->pinned_vm);
+   if (pinned_vm > lock_limit && !capable(CAP_IPC_LOCK)) {
ret = -ENOMEM;
-   else
-   current->mm->locked_vm += stt_pages;
+   atomic64_sub(pages, &current->mm->pinned_vm);
+   }
} else {
-   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-   stt_pages = current->mm->locked_vm;
+   pinned_vm = atomic64_read(&current->mm->pinned_vm);
+   if (WARN_ON_ONCE(pages > pinned_vm))
+   pages = pinned_vm;
 
-   current->mm->locked_vm -= stt_pages;
+   atomic64_sub(pages, &current->mm->pinned_vm);
}
 
-   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-   inc ? '+' : '-',
-   stt_pages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(&current->mm->mmap_sem);
+   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%lu %ld/%lu%s\n", current->pid,
+   inc ? '+' : '-', pages << PAGE_SHIFT,
+   atomic64_read(&current->mm->pinned_vm) << PAGE_SHIFT,
+   rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
 
return ret;
 }
-- 
2.20.1



Re: [PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:
> On Mon, Feb 11, 2019 at 05:44:33PM -0500, Daniel Jordan wrote:
> > @@ -266,24 +267,15 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
> > if (!mm)
> > return -ESRCH; /* process exited */
> >  
> > -   ret = down_write_killable(&mm->mmap_sem);
> > -   if (!ret) {
> > -   if (npage > 0) {
> > -   if (!dma->lock_cap) {
> > -   unsigned long limit;
> > -
> > -   limit = task_rlimit(dma->task,
> > -   RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +   pinned_vm = atomic64_add_return(npage, &mm->pinned_vm);
> >  
> > -   if (mm->locked_vm + npage > limit)
> > -   ret = -ENOMEM;
> > -   }
> > +   if (npage > 0 && !dma->lock_cap) {
> > +   unsigned long limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >>
> > +  PAGE_SHIFT;
> 
> I haven't looked at this super closely, but how does this stuff work?
> 
> do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> 
> Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
>
> Otherwise MEMLOCK is really doubled..

So this has been a problem for some time, but it's not as easy as adding them
together, see [1][2] for a start.

The locked_vm/pinned_vm issue definitely needs fixing, but all this series is
trying to do is account to the right counter.

Daniel

[1] 
http://lkml.kernel.org/r/20130523104154.ga23...@twins.programming.kicks-ass.net
[2] 
http://lkml.kernel.org/r/20130524140114.gk23...@twins.programming.kicks-ass.net


Re: [PATCH 0/5] use pinned_vm instead of locked_vm to account pinned pages

2019-02-11 Thread Daniel Jordan
On Mon, Feb 11, 2019 at 03:54:47PM -0700, Jason Gunthorpe wrote:
> On Mon, Feb 11, 2019 at 05:44:32PM -0500, Daniel Jordan wrote:
> > Hi,
> > 
> > This series converts users that account pinned pages with locked_vm to
> > account with pinned_vm instead, pinned_vm being the correct counter to
> > use.  It's based on a similar patch I posted recently[0].
> > 
> > The patches are based on rdma/for-next to build on Davidlohr Bueso's
> > recent conversion of pinned_vm to an atomic64_t[1].  Seems to make some
> > sense for these to be routed the same way, despite lack of rdma content?
> 
> Oy.. I'd be willing to accumulate a branch with acks to send to Linus
> *separately* from RDMA to Linus, but this is very abnormal.
> 
> Better to wait a few weeks for -rc1 and send patches through the
> subsystem trees.

Ok, I can do that.  It did seem strange, so I made it a question...


Re: [PATCH 2/5] vfio/spapr_tce: use pinned_vm instead of locked_vm to account pinned pages

2019-02-12 Thread Daniel Jordan
On Tue, Feb 12, 2019 at 04:50:11PM +, Christopher Lameter wrote:
> On Tue, 12 Feb 2019, Alexey Kardashevskiy wrote:
> 
> > Now it is 3 independent accesses (actually 4 but the last one is
> > diagnostic) with no locking around them. Why do not we need a lock
> > anymore precisely? Thanks,
> 
> Updating a regular counter is racy and requires a lock. It was converted
> to be an atomic which can be incremented without a race.

Yes, though Alexey may have meant that the multiple reads of the atomic in
decrement_pinned_vm are racy.  It only matters when there's a bug that would
make the counter go negative, but it's there.

And FWIW the debug print in try_increment_pinned_vm is also racy.

This fixes all that.  It doesn't try to correct the negative pinned_vm as the
old code did because it's already a bug and adjusting the value by the negative
amount seems to do nothing but make debugging harder.

If it's ok, I'll respin the whole series this way (another point in favor of a
common helper).

diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index f47e020dc5e4..b79257304de6 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -53,25 +53,24 @@ static long try_increment_pinned_vm(struct mm_struct *mm, 
long npages)
atomic64_sub(npages, &mm->pinned_vm);
}
 
-   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%lu%s\n", current->pid,
-   npages << PAGE_SHIFT,
-   atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : "");
+   pr_debug("[%d] RLIMIT_MEMLOCK +%ld %lld/%lu%s\n", current->pid,
+   npages << PAGE_SHIFT, pinned << PAGE_SHIFT,
+   lock_limit, ret ? " - exceeded" : "");
 
return ret;
 }
 
 static void decrement_pinned_vm(struct mm_struct *mm, long npages)
 {
+   s64 pinned;
+
if (!mm || !npages)
return;
 
-   if (WARN_ON_ONCE(npages > atomic64_read(&mm->pinned_vm)))
-   npages = atomic64_read(&mm->pinned_vm);
-   atomic64_sub(npages, &mm->pinned_vm);
-   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%lu\n", current->pid,
-   npages << PAGE_SHIFT,
-   atomic64_read(&mm->pinned_vm) << PAGE_SHIFT,
+   pinned = atomic64_sub_return(npages, &mm->pinned_vm);
+   WARN_ON_ONCE(pinned < 0);
+   pr_debug("[%d] RLIMIT_MEMLOCK -%ld %lld/%lu\n", current->pid,
+   npages << PAGE_SHIFT, pinned << PAGE_SHIFT,
rlimit(RLIMIT_MEMLOCK));
 }
 



Re: [PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-12 Thread Daniel Jordan
On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> Daniel Jordan  wrote:
> > On Mon, Feb 11, 2019 at 03:56:20PM -0700, Jason Gunthorpe wrote:
> > > I haven't looked at this super closely, but how does this stuff work?
> > > 
> > > do_mlock doesn't touch pinned_vm, and this doesn't touch locked_vm...
> > > 
> > > Shouldn't all this be 'if (locked_vm + pinned_vm < RLIMIT_MEMLOCK)' ?
> > >
> > > Otherwise MEMLOCK is really doubled..  
> > 
> > So this has been a problem for some time, but it's not as easy as adding them
> > together, see [1][2] for a start.
> > 
> > The locked_vm/pinned_vm issue definitely needs fixing, but all this series is
> > trying to do is account to the right counter.

Thanks for taking a look, Alex.

> This still makes me nervous because we have userspace dependencies on
> setting process locked memory.

Could you please expand on this?  Trying to get more context.

> There's a user visible difference if we
> account for them in the same bucket vs separate.  Perhaps we're
> counting in the wrong bucket now, but if we "fix" that and userspace
> adapts, how do we ever go back to accounting both mlocked and pinned
> memory combined against rlimit?  Thanks,

PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
locked_vm accounting separate, but allowed the two to be added safely to be
compared against RLIMIT_MEMLOCK.
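The gist, as I read it (sketch only, not PeterZ's actual code), is to leave the two
counters separate but compare their sum against the limit:

        /* Sketch: separate counters, one combined RLIMIT_MEMLOCK check. */
        static bool mlock_limit_ok(struct mm_struct *mm, unsigned long npages)
        {
                unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
                s64 sum = atomic64_read(&mm->pinned_vm) + mm->locked_vm;

                return sum + npages <= limit || capable(CAP_IPC_LOCK);
        }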

Anyway, until some solution is agreed on, are there objections to converting
locked_vm to an atomic, to avoid user-visible changes, instead of switching
locked_vm users to pinned_vm?

Daniel

[1] 
http://lkml.kernel.org/r/20130524140114.gk23...@twins.programming.kicks-ass.net


Re: [PATCH 1/5] vfio/type1: use pinned_vm instead of locked_vm to account pinned pages

2019-02-13 Thread Daniel Jordan
On Wed, Feb 13, 2019 at 01:03:30PM -0700, Alex Williamson wrote:
> Daniel Jordan  wrote:
> > On Tue, Feb 12, 2019 at 11:41:10AM -0700, Alex Williamson wrote:
> > > This still makes me nervous because we have userspace dependencies on
> > > setting process locked memory.  
> > 
> > Could you please expand on this?  Trying to get more context.
> 
> VFIO is a userspace driver interface and the pinned/locked page
> accounting we're doing here is trying to prevent a user from exceeding
> their locked memory limits.  Thus a VM management tool or unprivileged
> userspace driver needs to have appropriate locked memory limits
> configured for their use case.  Currently we do not have a unified
> accounting scheme, so if a page is mlock'd by the user and also mapped
> through VFIO for DMA, it's accounted twice, these both increment
> locked_vm and userspace needs to manage that.  If pinned memory
> and locked memory are now two separate buckets and we're only comparing
> one of them against the locked memory limit, then it seems we have
> effectively doubled the user's locked memory for this use case, as
> Jason questioned.  The user could mlock one page and DMA map another,
> they're both "locked", but now they only take one slot in each bucket.

Right, yes.  Should have been more specific.  I was after a concrete use case
where this would happen (sounded like you may have had a specific tool in
mind).

But it doesn't matter.  I understand your concern and agree that, given the
possibility that accounting in _some_ tool can be affected, we should fix
accounting before changing user visible behavior.  I can start a separate
discussion, having opened the can of worms again :)

> If we continue forward with using a separate bucket here, userspace
> could infer that accounting is unified and lower the user's locked
> memory limit, or exploit the gap that their effective limit might
> actually exceed system memory.  In the former case, if we do eventually
> correct to compare the total of the combined buckets against the user's
> locked memory limits, we'll break users that have adapted their locked
> memory limits to meet the apparent needs.  In the latter case, the
> inconsistent accounting is potentially an attack vector.

Makes sense.

> > > There's a user visible difference if we
> > > account for them in the same bucket vs separate.  Perhaps we're
> > > counting in the wrong bucket now, but if we "fix" that and userspace
> > > adapts, how do we ever go back to accounting both mlocked and pinned
> > > memory combined against rlimit?  Thanks,  
> > 
> > PeterZ posted an RFC that addresses this point[1].  It kept pinned_vm and
> > locked_vm accounting separate, but allowed the two to be added safely to be
> > compared against RLIMIT_MEMLOCK.
> 
> Unless I'm incorrect in the concerns above, I don't see how we can
> convert vfio before this occurs.
>  
> > Anyway, until some solution is agreed on, are there objections to converting
> > locked_vm to an atomic, to avoid user-visible changes, instead of switching
> > locked_vm users to pinned_vm?
> 
> Seems that as long as we have separate buckets that are compared
> individually to rlimit that we've got problems, it's just a matter of
> where they're exposed based on which bucket is used for which
> interface.  Thanks,

Indeed.  But for now, any concern with simply changing the type of the
currently used counter to an atomic, to reduce mmap_sem usage?  This is just an
implementation detail, invisible to userspace.


[PATCH v2 4/7] padata: add basic support for multithreaded jobs

2020-05-20 Thread Daniel Jordan
Sometimes the kernel doesn't take full advantage of system memory
bandwidth, leading to a single CPU spending excessive time in
initialization paths where the data scales with memory size.

Multithreading naturally addresses this problem.

Extend padata, a framework that handles many parallel yet singlethreaded
jobs, to also handle multithreaded jobs by adding support for splitting
up the work evenly, specifying a minimum amount of work that's
appropriate for one helper thread to do, load balancing between helpers,
and coordinating them.

This is inspired by work from Pavel Tatashin and Steve Sistare.

Signed-off-by: Daniel Jordan 
---
 include/linux/padata.h |  29 
 kernel/padata.c| 152 -
 2 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 3bfa503503ac5..b0affa466a841 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -4,6 +4,9 @@
  *
  * Copyright (C) 2008, 2009 secunet Security Networks AG
  * Copyright (C) 2008, 2009 Steffen Klassert 
+ *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan 
  */
 
 #ifndef PADATA_H
@@ -130,6 +133,31 @@ struct padata_shell {
struct list_headlist;
 };
 
+/**
+ * struct padata_mt_job - represents one multithreaded job
+ *
+ * @thread_fn: Called for each chunk of work that a padata thread does.
+ * @fn_arg: The thread function argument.
+ * @start: The start of the job (units are job-specific).
+ * @size: size of this node's work (units are job-specific).
+ * @align: Ranges passed to the thread function fall on this boundary, with the
+ * possible exceptions of the beginning and end of the job.
+ * @min_chunk: The minimum chunk size in job-specific units.  This allows
+ * the client to communicate the minimum amount of work that's
+ * appropriate for one worker thread to do at once.
+ * @max_threads: Max threads to use for the job, actual number may be less
+ *   depending on task size and minimum chunk size.
+ */
+struct padata_mt_job {
+   void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
+   void*fn_arg;
+   unsigned long   start;
+   unsigned long   size;
+   unsigned long   align;
+   unsigned long   min_chunk;
+   int max_threads;
+};
+
 /**
  * struct padata_instance - The overall control structure.
  *
@@ -171,6 +199,7 @@ extern void padata_free_shell(struct padata_shell *ps);
 extern int padata_do_parallel(struct padata_shell *ps,
  struct padata_priv *padata, int *cb_cpu);
 extern void padata_do_serial(struct padata_priv *padata);
+extern void __init padata_do_multithreaded(struct padata_mt_job *job);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
  cpumask_var_t cpumask);
 extern int padata_start(struct padata_instance *pinst);
diff --git a/kernel/padata.c b/kernel/padata.c
index 78ff9aa529204..e78f57d9aef90 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -7,6 +7,9 @@
  * Copyright (C) 2008, 2009 secunet Security Networks AG
  * Copyright (C) 2008, 2009 Steffen Klassert 
  *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan 
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * version 2, as published by the Free Software Foundation.
@@ -21,6 +24,7 @@
  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -32,6 +36,8 @@
 #include 
 #include 
 
+#definePADATA_WORK_ONSTACK 1   /* Work's memory is on stack */
+
 struct padata_work {
struct work_struct  pw_work;
struct list_headpw_list;  /* padata_free_works linkage */
@@ -42,7 +48,17 @@ static DEFINE_SPINLOCK(padata_works_lock);
 static struct padata_work *padata_works;
 static LIST_HEAD(padata_free_works);
 
+struct padata_mt_job_state {
+   spinlock_t  lock;
+   struct completion   completion;
+   struct padata_mt_job*job;
+   int nworks;
+   int nworks_fini;
+   unsigned long   chunk_size;
+};
+
 static void padata_free_pd(struct parallel_data *pd);
+static void __init padata_mt_helper(struct work_struct *work);
 
 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
@@ -81,18 +97,56 @@ static struct padata_work *padata_work_alloc(void)
 }
 
 static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
-void *data)
+void *data, int flags)
 {
-   INIT_WORK(&pw->pw_work, work_fn);
+   if (flags & PADATA_WORK_ONSTACK)
+
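For reference, a caller fills in a padata_mt_job and hands it to
padata_do_multithreaded(); a minimal sketch using the fields defined above, with a
made-up thread function:

        /* Sketch: square a large array in parallel during boot. */
        static void __init square_chunk(unsigned long start, unsigned long end,
                                        void *arg)
        {
                unsigned long *data = arg;
                unsigned long i;

                for (i = start; i < end; i++)
                        data[i] *= data[i];
        }

        static void __init square_all(unsigned long *data, unsigned long len)
        {
                struct padata_mt_job job = {
                        .thread_fn   = square_chunk,
                        .fn_arg      = data,
                        .start       = 0,
                        .size        = len,
                        .align       = 1,
                        .min_chunk   = 64 * 1024,  /* not worth a thread below this */
                        .max_threads = num_online_cpus(),
                };

                padata_do_multithreaded(&job);
        }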

[PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-20 Thread Daniel Jordan
[First benchmark table truncated; its surviving final row: 100% of CPUs (16 threads)
gave a 19.8% kernel boot speedup (805.3 ms) and a 76.4% deferred init speedup (57.3 ms).]

Server-oriented distros that enable deferred page init sometimes run in
small VMs, and they still benefit even though the fraction of boot time
saved is smaller:

AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
  1 node * 2 cores * 2 threads = 4 CPUs
  16G/node = 16G memory

   kernel boot deferred init
   
node% (thr)speedup  time_ms (stdev)speedup  time_ms (stdev)
  (  0) --722.3 (  9.5) -- 50.7 (  0.6)
  25% (  1)  -3.3%746.3 (  4.7)  -2.0% 51.7 (  1.2)
  50% (  2)   0.2%721.0 ( 11.3)  29.6% 35.7 (  4.9)
  75% (  3)  -0.3%724.3 ( 11.2)  48.7% 26.0 (  0.0)
 100% (  4)   3.0%700.3 ( 13.6)  55.9% 22.3 (  0.6)

Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
  1 node * 2 cores * 2 threads = 4 CPUs
  14G/node = 14G memory

   kernel boot deferred init
   
node% (thr)speedup  time_ms (stdev)speedup  time_ms (stdev)
  (  0) --673.0 (  6.9) -- 57.0 (  1.0)
  25% (  1)  -0.6%677.3 ( 19.8)   1.8% 56.0 (  1.0)
  50% (  2)   3.4%650.0 (  3.6)  36.8% 36.0 (  5.2)
  75% (  3)   4.2%644.7 (  7.6)  56.1% 25.0 (  1.0)
 100% (  4)   5.3%637.0 (  5.6)  63.2% 21.0 (  0.0)

On Josh's 96-CPU and 192G memory system:

Without this patch series:
[0.487132] node 0 initialised, 23398907 pages in 292ms
[0.499132] node 1 initialised, 24189223 pages in 304ms
...
[0.629376] Run /sbin/init as init process

With this patch series:
[0.227868] node 0 initialised, 23398907 pages in 28ms
[0.230019] node 1 initialised, 24189223 pages in 28ms
...
[0.361069] Run /sbin/init as init process

[1] 
https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf

Signed-off-by: Daniel Jordan 
---
 mm/Kconfig  |  6 ++---
 mm/page_alloc.c | 60 -
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c358..04c1da3f9f44c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -750,13 +750,13 @@ config DEFERRED_STRUCT_PAGE_INIT
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
+   select PADATA
help
  Ordinarily all struct pages are initialised during early boot in a
  single thread. On very large machines this can take a considerable
  amount of time. If this option is set, large machines will bring up
- a subset of memmap at boot and then initialise the rest in parallel
- by starting one-off "pgdatinitX" kernel thread for each node X. This
- has a potential performance impact on processes running early in the
+ a subset of memmap at boot and then initialise the rest in parallel.
+ This has a potential performance impact on tasks running early in the
  lifetime of the system until these kthreads finish the
  initialisation.
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d0c0d9364aa6d..9cb780e8dec78 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1814,16 +1815,44 @@ deferred_init_maxorder(u64 *i, struct zone *zone, 
unsigned long *start_pfn,
return nr_pages;
 }
 
+struct definit_args {
+   struct zone *zone;
+   atomic_long_t nr_pages;
+};
+
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+  void *arg)
+{
+   unsigned long spfn, epfn, nr_pages = 0;
+   struct definit_args *args = arg;
+   struct zone *zone = args->zone;
+   u64 i;
+
+   deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+   /*
+* Initialize and free pages in MAX_ORDER sized increments so that we
+* can avoid introducing any issues with the buddy allocator.
+*/
+   while (spfn < end_pfn) {
+   nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+   cond_resched();
+   }
+
+   atomic_long_add(nr_pages, &args->nr_pages);
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
unsigned long spfn = 0, epfn = 0, nr_pages = 0;
-   unsigned long first_init_pfn, flags;
+   unsigned long fi

[PATCH v2 6/7] mm: make deferred init's max threads arch-specific

2020-05-20 Thread Daniel Jordan
Using padata during deferred init has only been tested on x86, so for
now limit it to this architecture.

If another arch wants this, it can find the max thread limit that's best
for it and override deferred_page_init_max_threads().

Signed-off-by: Daniel Jordan 
---
 arch/x86/mm/init_64.c| 12 
 include/linux/memblock.h |  3 +++
 mm/page_alloc.c  | 13 -
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8b5f73f5e207c..2d749ec12ea8a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1260,6 +1260,18 @@ void __init mem_init(void)
mem_init_print_info(NULL);
 }
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+   /*
+* More CPUs always led to greater speedups on tested systems, up to
+* all the nodes' CPUs.  Use all since the system is otherwise idle
+* now.
+*/
+   return max_t(int, cpumask_weight(node_cpumask), 1);
+}
+#endif
+
 int kernel_set_to_readonly;
 
 void mark_rodata_ro(void)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6bc37a731d27b..2b289df44194f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -275,6 +275,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone 
*zone,
 #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
for (; i != U64_MAX;  \
 __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9cb780e8dec78..0d7d805f98b2d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1843,6 +1843,13 @@ deferred_init_memmap_chunk(unsigned long start_pfn, 
unsigned long end_pfn,
atomic_long_add(nr_pages, &args->nr_pages);
 }
 
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+   return 1;
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
@@ -1891,11 +1898,7 @@ static int __init deferred_init_memmap(void *data)
 first_init_pfn))
goto zone_empty;
 
-   /*
-* More CPUs always led to greater speedups on tested systems, up to
-* all the nodes' CPUs.  Use all since the system is otherwise idle now.
-*/
-   max_threads = max(cpumask_weight(cpumask), 1u);
+   max_threads = deferred_page_init_max_threads(cpumask);
 
while (spfn < epfn) {
epfn_align = ALIGN_DOWN(epfn, PAGES_PER_SECTION);
-- 
2.26.2
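
As a purely illustrative sketch of what such an override could look like
on another architecture (the clamp to 4 threads is an invented example,
not a recommendation from this series):

    #include <linux/cpumask.h>
    #include <linux/kernel.h>       /* clamp_t() */
    #include <linux/memblock.h>     /* deferred_page_init_max_threads() */

    #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
    /*
     * Hypothetical arch override: use the node's CPUs for deferred init,
     * but cap the thread count at an arbitrary limit for this example.
     */
    int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
    {
            return clamp_t(int, cpumask_weight(node_cpumask), 1, 4);
    }
    #endif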



[PATCH v2 2/7] padata: initialize earlier

2020-05-20 Thread Daniel Jordan
padata will soon initialize the system's struct pages in parallel, so it
needs to be ready by page_alloc_init_late().

The error return from padata_driver_init() triggers an initcall warning,
so add a warning to padata_init() to avoid silent failure.

Signed-off-by: Daniel Jordan 
---
 include/linux/padata.h |  6 ++
 init/main.c|  2 ++
 kernel/padata.c| 17 -
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index a0d8b41850b25..476ecfa41f363 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -164,6 +164,12 @@ struct padata_instance {
 #definePADATA_INVALID  4
 };
 
+#ifdef CONFIG_PADATA
+extern void __init padata_init(void);
+#else
+static inline void __init padata_init(void) {}
+#endif
+
 extern struct padata_instance *padata_alloc_possible(const char *name);
 extern void padata_free(struct padata_instance *pinst);
 extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
diff --git a/init/main.c b/init/main.c
index 03371976d3872..8ab521f7af5d2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -94,6 +94,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1482,6 +1483,7 @@ static noinline void __init kernel_init_freeable(void)
smp_init();
sched_init_smp();
 
+   padata_init();
page_alloc_init_late();
/* Initialize page ext after all struct pages are initialized. */
page_ext_init();
diff --git a/kernel/padata.c b/kernel/padata.c
index 835919c745266..6f709bc0fc413 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #define MAX_OBJ_NUM 1000
 
@@ -1050,26 +1049,26 @@ void padata_free_shell(struct padata_shell *ps)
 }
 EXPORT_SYMBOL(padata_free_shell);
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static __init int padata_driver_init(void)
+void __init padata_init(void)
 {
+#ifdef CONFIG_HOTPLUG_CPU
int ret;
 
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
  padata_cpu_online, NULL);
if (ret < 0)
-   return ret;
+   goto err;
hp_online = ret;
 
ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
  NULL, padata_cpu_dead);
if (ret < 0) {
cpuhp_remove_multi_state(hp_online);
-   return ret;
+   goto err;
}
-   return 0;
-}
-module_init(padata_driver_init);
 
+   return;
+err:
+   pr_warn("padata: initialization failed\n");
 #endif
+}
-- 
2.26.2



[PATCH v2 3/7] padata: allocate work structures for parallel jobs from a pool

2020-05-20 Thread Daniel Jordan
padata allocates per-CPU, per-instance work structs for parallel jobs.
A do_parallel call assigns a job to a sequence number and hashes the
number to a CPU, where the job will eventually run using the
corresponding work.

This approach fit with how padata used to bind a job to each CPU
round-robin, but it makes less sense after commit bfde23ce200e6 ("padata:
unbind parallel jobs from specific CPUs") because a work isn't bound to
a particular CPU anymore, and it isn't needed at all for multithreaded
jobs because they don't have sequence numbers.

Replace the per-CPU works with a preallocated pool, which allows sharing
them between existing padata users and the upcoming multithreaded user.
The pool will also facilitate setting NUMA-aware concurrency limits with
later users.

The pool is sized according to the number of possible CPUs.  With this
limit, MAX_OBJ_NUM no longer makes sense, so remove it.

If the global pool is exhausted, a parallel job is run in the current
task instead to throttle a system trying to do too much in parallel.

Signed-off-by: Daniel Jordan 
---
 include/linux/padata.h |   8 +--
 kernel/padata.c| 118 +++--
 2 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 476ecfa41f363..3bfa503503ac5 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -24,7 +24,6 @@
  * @list: List entry, to attach to the padata lists.
  * @pd: Pointer to the internal control structure.
  * @cb_cpu: Callback cpu for serializatioon.
- * @cpu: Cpu for parallelization.
  * @seq_nr: Sequence number of the parallelized data object.
  * @info: Used to pass information from the parallel to the serial function.
  * @parallel: Parallel execution function.
@@ -34,7 +33,6 @@ struct padata_priv {
struct list_headlist;
struct parallel_data*pd;
int cb_cpu;
-   int cpu;
unsigned intseq_nr;
int info;
void(*parallel)(struct padata_priv *padata);
@@ -68,15 +66,11 @@ struct padata_serial_queue {
 /**
  * struct padata_parallel_queue - The percpu padata parallel queue
  *
- * @parallel: List to wait for parallelization.
  * @reorder: List to wait for reordering after parallel processing.
- * @work: work struct for parallelization.
  * @num_obj: Number of objects that are processed by this cpu.
  */
 struct padata_parallel_queue {
-   struct padata_listparallel;
struct padata_listreorder;
-   struct work_structwork;
atomic_t  num_obj;
 };
 
@@ -111,7 +105,7 @@ struct parallel_data {
struct padata_parallel_queue__percpu *pqueue;
struct padata_serial_queue  __percpu *squeue;
atomic_trefcnt;
-   atomic_tseq_nr;
+   unsigned intseq_nr;
unsigned intprocessed;
int cpu;
struct padata_cpumask   cpumask;
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f709bc0fc413..78ff9aa529204 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -32,7 +32,15 @@
 #include 
 #include 
 
-#define MAX_OBJ_NUM 1000
+struct padata_work {
+   struct work_struct  pw_work;
+   struct list_headpw_list;  /* padata_free_works linkage */
+   void*pw_data;
+};
+
+static DEFINE_SPINLOCK(padata_works_lock);
+static struct padata_work *padata_works;
+static LIST_HEAD(padata_free_works);
 
 static void padata_free_pd(struct parallel_data *pd);
 
@@ -58,30 +66,44 @@ static int padata_cpu_hash(struct parallel_data *pd, 
unsigned int seq_nr)
return padata_index_to_cpu(pd, cpu_index);
 }
 
-static void padata_parallel_worker(struct work_struct *parallel_work)
+static struct padata_work *padata_work_alloc(void)
 {
-   struct padata_parallel_queue *pqueue;
-   LIST_HEAD(local_list);
+   struct padata_work *pw;
 
-   local_bh_disable();
-   pqueue = container_of(parallel_work,
- struct padata_parallel_queue, work);
+   lockdep_assert_held(&padata_works_lock);
 
-   spin_lock(&pqueue->parallel.lock);
-   list_replace_init(&pqueue->parallel.list, &local_list);
-   spin_unlock(&pqueue->parallel.lock);
+   if (list_empty(&padata_free_works))
+   return NULL;/* No more work items allowed to be queued. */
 
-   while (!list_empty(&local_list)) {
-   struct padata_priv *padata;
+   pw = list_first_entry(&padata_free_works, struct padata_work, pw_list);
+   list_del(&pw->pw_list);
+   return pw;
+}
 
-   padata = list_entry(local_list.next,
-   struct padata_priv, list);
+static void padata_work_init(struc

[PATCH v2 7/7] padata: document multithreaded jobs

2020-05-20 Thread Daniel Jordan
Add Documentation for multithreaded jobs.

Signed-off-by: Daniel Jordan 
---
 Documentation/core-api/padata.rst | 41 +++
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/Documentation/core-api/padata.rst 
b/Documentation/core-api/padata.rst
index 9a24c111781d9..b7e047af993e8 100644
--- a/Documentation/core-api/padata.rst
+++ b/Documentation/core-api/padata.rst
@@ -4,23 +4,26 @@
 The padata parallel execution mechanism
 ===
 
-:Date: December 2019
+:Date: April 2020
 
 Padata is a mechanism by which the kernel can farm jobs out to be done in
-parallel on multiple CPUs while retaining their ordering.  It was developed for
-use with the IPsec code, which needs to be able to perform encryption and
-decryption on large numbers of packets without reordering those packets.  The
-crypto developers made a point of writing padata in a sufficiently general
-fashion that it could be put to other uses as well.
+parallel on multiple CPUs while optionally retaining their ordering.
 
-Usage
-=
+It was originally developed for IPsec, which needs to perform encryption and
+decryption on large numbers of packets without reordering those packets.  This
+is currently the sole consumer of padata's serialized job support.
+
+Padata also supports multithreaded jobs, splitting up the job evenly while load
+balancing and coordinating between threads.
+
+Running Serialized Jobs
+===
 
 Initializing
 
 
-The first step in using padata is to set up a padata_instance structure for
-overall control of how jobs are to be run::
+The first step in using padata to run parallel jobs is to set up a
+padata_instance structure for overall control of how jobs are to be run::
 
 #include 
 
@@ -162,6 +165,24 @@ functions that correspond to the allocation in reverse::
 It is the user's responsibility to ensure all outstanding jobs are complete
 before any of the above are called.
 
+Running Multithreaded Jobs
+==
+
+A multithreaded job has a main thread and zero or more helper threads, with the
+main thread participating in the job and then waiting until all helpers have
+finished.  padata splits the job into units called chunks, where a chunk is a
+piece of the job that one thread completes in one call to the thread function.
+
+A user has to do three things to run a multithreaded job.  First, describe the
+job by defining a padata_mt_job structure, which is explained in the Interface
+section.  This includes a pointer to the thread function, which padata will
+call each time it assigns a job chunk to a thread.  Then, define the thread
+function, which accepts three arguments, ``start``, ``end``, and ``arg``, where
+the first two delimit the range that the thread operates on and the last is a
+pointer to the job's shared state, if any.  Prepare the shared state, which is
+typically a stack-allocated structure that wraps the required data.  Last, call
+padata_do_multithreaded(), which will return once the job is finished.
+
 Interface
 =
 
-- 
2.26.2
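
As a rough sketch of the thread function and shared state described
above (every name below is invented for illustration and is not part of
padata itself):

    #include <linux/atomic.h>

    /* Hypothetical shared state for one multithreaded job. */
    struct zero_job_state {
            int *array;                /* data the job operates on */
            atomic_long_t nr_zeroed;   /* results accumulated across threads */
    };

    /* Called once per chunk; [start, end) is the range this call handles. */
    static void zero_chunk(unsigned long start, unsigned long end, void *arg)
    {
            struct zero_job_state *state = arg;
            unsigned long i;

            for (i = start; i < end; i++)
                    state->array[i] = 0;

            atomic_long_add(end - start, &state->nr_zeroed);
    }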



[PATCH v2 1/7] padata: remove exit routine

2020-05-20 Thread Daniel Jordan
padata_driver_exit() is unnecessary because padata isn't built as a
module and doesn't exit.

padata's init routine will soon allocate memory, so getting rid of the
exit function now avoids pointless code to free it.

Signed-off-by: Daniel Jordan 
---
 kernel/padata.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index a6afa12fb75ee..835919c745266 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1072,10 +1072,4 @@ static __init int padata_driver_init(void)
 }
 module_init(padata_driver_init);
 
-static __exit void padata_driver_exit(void)
-{
-   cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
-   cpuhp_remove_multi_state(hp_online);
-}
-module_exit(padata_driver_exit);
 #endif
-- 
2.26.2



[PATCH v2 0/7] padata: parallelize deferred page init

2020-05-20 Thread Daniel Jordan
Deferred struct page init is a bottleneck in kernel boot--the biggest
for us and probably others.  Optimizing it maximizes availability for
large-memory systems and allows spinning up short-lived VMs as needed
without having to leave them running.  It also benefits bare metal
machines hosting VMs that are sensitive to downtime.  In projects such
as VMM Fast Restart[1], where guest state is preserved across kexec
reboot, it helps prevent application and network timeouts in the guests.

So, multithread deferred init to take full advantage of system memory
bandwidth.

Extend padata, a framework that handles many parallel singlethreaded
jobs, to handle multithreaded jobs as well by adding support for
splitting up the work evenly, specifying a minimum amount of work that's
appropriate for one helper thread to do, load balancing between helpers,
and coordinating them.  More documentation in patches 4 and 7.

This series is the first step in a project to address other memory
proportional bottlenecks in the kernel such as pmem struct page init,
vfio page pinning, hugetlb fallocate, and munmap.  Deferred page init
doesn't require concurrency limits, resource control, or priority
adjustments like these other users will because it happens during boot
when the system is otherwise idle and waiting for page init to finish.

This has been run on a variety of x86 systems and speeds up kernel boot
by 3% to 49%, saving up to 1.6 out of 4 seconds.  Patch 5 has more
numbers.

Please review and test, and thanks to Alex, Andrew, Josh, and Pavel for
their feedback in the last version.

The powerpc and s390 lists are included in case they want to give this a
try; they had enabled this feature when it was configured per arch.

Series based on 5.7-rc6 plus these three from mmotm

  mm-call-touch_nmi_watchdog-on-max-order-boundaries-in-deferred-init.patch
  mm-initialize-deferred-pages-with-interrupts-enabled.patch
  mm-call-cond_resched-from-deferred_init_memmap.patch

and it's available here:

  git://oss.oracle.com/git/linux-dmjordan.git padata-mt-definit-v2
  
https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=shortlog;h=refs/heads/padata-mt-definit-v2

and the future users and related features are available as
work-in-progress:

  git://oss.oracle.com/git/linux-dmjordan.git padata-mt-wip-v0.4
  
https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=shortlog;h=refs/heads/padata-mt-wip-v0.4

v2:
 - Improve the problem statement (Andrew, Josh, Pavel)
 - Add T-b's to unchanged patches (Josh)
 - Fully initialize max-order blocks to avoid buddy issues (Alex)
 - Parallelize on section-aligned boundaries to avoid potential
   false sharing (Alex)
 - Return the maximum thread count from a function that architectures
   can override, with the generic version returning 1 (current
   behavior).  Override for x86 since that's the only arch this series
   has been tested on so far.  Other archs can test with more threads
   by dropping patch 6.
 - Rebase to v5.7-rc6, rerun tests

RFC v4 [2] -> v1:
 - merged with padata (Peter)
 - got rid of the 'task' nomenclature (Peter, Jon)

future work branch:
 - made lockdep-aware (Jason, Peter)
 - adjust workqueue worker priority with renice_or_cancel() (Tejun)
 - fixed undo problem in VFIO (Alex)

The remaining feedback, mainly resource control awareness (cgroup etc),
is TODO for later series.

[1] 
https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf
https://www.youtube.com/watch?v=pBsHnf93tcQ

https://lore.kernel.org/linux-mm/1588812129-8596-1-git-send-email-anthony.yzn...@oracle.com/

[2] 
https://lore.kernel.org/linux-mm/20181105165558.11698-1-daniel.m.jor...@oracle.com/

Daniel Jordan (7):
  padata: remove exit routine
  padata: initialize earlier
  padata: allocate work structures for parallel jobs from a pool
  padata: add basic support for multithreaded jobs
  mm: parallelize deferred_init_memmap()
  mm: make deferred init's max threads arch-specific
  padata: document multithreaded jobs

 Documentation/core-api/padata.rst |  41 +++--
 arch/x86/mm/init_64.c |  12 ++
 include/linux/memblock.h  |   3 +
 include/linux/padata.h|  43 -
 init/main.c   |   2 +
 kernel/padata.c   | 277 --
 mm/Kconfig|   6 +-
 mm/page_alloc.c   |  67 +++-
 8 files changed, 373 insertions(+), 78 deletions(-)


base-commit: b9bbe6ed63b2b9f2c9ee5cbd0f2c946a2723f4ce
prerequisite-patch-id: 4ad522141e1119a325a9799dad2bd982fbac8b7c
prerequisite-patch-id: 169273327e56f5461101a71dfbd6b4cfd4570cf0
prerequisite-patch-id: 0f34692c8a9673d4c4f6a3545cf8ec3a2abf8620
-- 
2.26.2



Re: [PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-21 Thread Daniel Jordan
On Wed, May 20, 2020 at 06:29:32PM -0700, Alexander Duyck wrote:
> On Wed, May 20, 2020 at 11:27 AM Daniel Jordan
> > @@ -1814,16 +1815,44 @@ deferred_init_maxorder(u64 *i, struct zone *zone, 
> > unsigned long *start_pfn,
> > return nr_pages;
> >  }
> >
> > +struct definit_args {
> > +   struct zone *zone;
> > +   atomic_long_t nr_pages;
> > +};
> > +
> > +static void __init
> > +deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
> > +  void *arg)
> > +{
> > +   unsigned long spfn, epfn, nr_pages = 0;
> > +   struct definit_args *args = arg;
> > +   struct zone *zone = args->zone;
> > +   u64 i;
> > +
> > +   deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, 
> > start_pfn);
> > +
> > +   /*
> > +* Initialize and free pages in MAX_ORDER sized increments so that 
> > we
> > +* can avoid introducing any issues with the buddy allocator.
> > +*/
> > +   while (spfn < end_pfn) {
> > +   nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> > +   cond_resched();
> > +   }
> > +
> > +   atomic_long_add(nr_pages, &args->nr_pages);
> > +}
> > +
> 
> Personally I would get rid of nr_pages entirely. It isn't worth the
> cache thrash to have this atomic variable bouncing around.

One of the things I tried to optimize was the managed_pages atomic adds in
__free_pages_core, but performance stayed the same on the biggest machine I
tested when it was done once at the end of page init instead of in every thread
for every pageblock.

I'm not sure this atomic would matter either, given it's less frequent.

> You could
> probably just have this function return void since all nr_pages is
> used for is a pr_info  statement at the end of the initialization
> which will be completely useless now anyway since we really have the
> threads running in parallel anyway.

The timestamp is still useful for observability; page init is a significant
part of kernel boot on big machines, over 10% sometimes with these patches.

It's mostly the time that matters, though; I agree the number of pages is less
important and is probably worth removing just to simplify the code.  I'll do it
if no one sees a reason to keep it.

> We only really need the nr_pages logic in deferred_grow_zone in order
> to track if we have freed enough pages to allow us to go back to what
> we were doing.
>
> > @@ -1863,11 +1892,32 @@ static int __init deferred_init_memmap(void *data)
> > goto zone_empty;
> >
> > /*
> > -* Initialize and free pages in MAX_ORDER sized increments so
> > -* that we can avoid introducing any issues with the buddy
> > -* allocator.
> > +* More CPUs always led to greater speedups on tested systems, up to
> > +* all the nodes' CPUs.  Use all since the system is otherwise idle 
> > now.
> >  */
> > +   max_threads = max(cpumask_weight(cpumask), 1u);
> > +
> > while (spfn < epfn) {
> > +   epfn_align = ALIGN_DOWN(epfn, PAGES_PER_SECTION);
> > +
> > +   if (IS_ALIGNED(spfn, PAGES_PER_SECTION) &&
> > +   epfn_align - spfn >= PAGES_PER_SECTION) {
> > +   struct definit_args arg = { zone, 
> > ATOMIC_LONG_INIT(0) };
> > +   struct padata_mt_job job = {
> > +   .thread_fn   = deferred_init_memmap_chunk,
> > +   .fn_arg  = &arg,
> > +   .start   = spfn,
> > +   .size= epfn_align - spfn,
> > +   .align   = PAGES_PER_SECTION,
> > +   .min_chunk   = PAGES_PER_SECTION,
> > +   .max_threads = max_threads,
> > +   };
> > +
> > +   padata_do_multithreaded(&job);
> > +   nr_pages += atomic_long_read(&arg.nr_pages);
> > +   spfn = epfn_align;
> > +   }
> > +
> > nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
> > cond_resched();
> > }
> 
> This doesn't look right. You are basically adding threads in addition
> to calls to deferred_init_maxorder.

The deferred_init_maxorder call is the

Re: [PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-21 Thread Daniel Jordan
On Thu, May 21, 2020 at 08:00:31AM -0700, Alexander Duyck wrote:
> So I was thinking about my suggestion further and the loop at the end
> isn't quite correct as I believe it could lead to gaps. The loop on
> the end should probably be:
> for_each_free_mem_pfn_range_in_zone_from(i, zone, spfn, epfn) 
> {
> if (epfn <= epfn_align)
> continue;
> if (spfn < epfn_align)
> spfn = epfn_align;
> break;
> }
> 
> That would generate a new range where epfn_align has actually ended
> and there is a range of new PFNs to process.

Whoops, my email crossed with yours.  Agreed, but see the other message.


Re: [PATCH v2 5/7] mm: parallelize deferred_init_memmap()

2020-05-21 Thread Daniel Jordan
On Thu, May 21, 2020 at 09:46:35AM -0700, Alexander Duyck wrote:
> It is more about not bothering with the extra tracking. We don't
> really need it and having it doesn't really add much in the way of
> value.

Yeah, it can probably go.

> > > > @@ -1863,11 +1892,32 @@ static int __init deferred_init_memmap(void 
> > > > *data)
> > > > goto zone_empty;
> > > >
> > > > /*
> > > > -* Initialize and free pages in MAX_ORDER sized increments so
> > > > -* that we can avoid introducing any issues with the buddy
> > > > -* allocator.
> > > > +* More CPUs always led to greater speedups on tested systems, 
> > > > up to
> > > > +* all the nodes' CPUs.  Use all since the system is otherwise 
> > > > idle now.
> > > >  */
> > > > +   max_threads = max(cpumask_weight(cpumask), 1u);
> > > > +
> > > > while (spfn < epfn) {
> > > > +   epfn_align = ALIGN_DOWN(epfn, PAGES_PER_SECTION);
> > > > +
> > > > +   if (IS_ALIGNED(spfn, PAGES_PER_SECTION) &&
> > > > +   epfn_align - spfn >= PAGES_PER_SECTION) {
> > > > +   struct definit_args arg = { zone, 
> > > > ATOMIC_LONG_INIT(0) };
> > > > +   struct padata_mt_job job = {
> > > > +   .thread_fn   = 
> > > > deferred_init_memmap_chunk,
> > > > +   .fn_arg  = &arg,
> > > > +   .start   = spfn,
> > > > +   .size= epfn_align - spfn,
> > > > +   .align   = PAGES_PER_SECTION,
> > > > +   .min_chunk   = PAGES_PER_SECTION,
> > > > +   .max_threads = max_threads,
> > > > +   };
> > > > +
> > > > +   padata_do_multithreaded(&job);
> > > > +   nr_pages += atomic_long_read(&arg.nr_pages);
> > > > +   spfn = epfn_align;
> > > > +   }
> > > > +
> > > > nr_pages += deferred_init_maxorder(&i, zone, &spfn, 
> > > > &epfn);
> > > > cond_resched();
> > > > }
> > >
> > > This doesn't look right. You are basically adding threads in addition
> > > to calls to deferred_init_maxorder.
> >
> > The deferred_init_maxorder call is there to do the remaining, non-section
> > aligned part of a range.  It doesn't have to be done this way.
> 
> It is also doing the advancing though isn't it?

Yes.  Not sure what you're getting at.  There's the 'spfn = epfn_align' before
so nothing is skipped.  It's true that the nonaligned part is done outside of
padata when it could be done by a thread that'd otherwise be waiting or idle,
which should be addressed in the next version.

> I think I resolved this with the fix for it I described in the other
> email. We just need to swap out spfn for epfn and make sure we align
> spfn with epfn_align. Then I think that takes care of possible skips.

Right, though your fix looks a lot like deferred_init_mem_pfn_range_in_zone().
Seems better to just use that and not repeat ourselves.  Lame that it's
starting at the beginning of the ranges every time, maybe it could be
generalized somehow, but I think it should be fast enough.

> > We could use deferred_init_mem_pfn_range_in_zone() instead of the for_each
> > loop.
> >
> > What I was trying to avoid by aligning down is creating a discontiguous pfn
> > range that get passed to padata.  We already discussed how those are handled
> > by the zone iterator in the thread function, but job->size can be 
> > exaggerated
> > to include parts of the range that are never touched.  Thinking more about 
> > it
> > though, it's a small fraction of the total work and shouldn't matter.
> 
> So the problem with aligning down is that you are going to be slowed
> up as you have to go single threaded to initialize whatever remains.
> So worst case scenario is that you have a section aligned block and
> you will process all but 1 section in parallel, and then have to
> process the remaining section one max order block at a time.

Yes, aligning up is better.

> > > This should accomplish the same thing, but much more efficiently.
> >
> > Well, more cleanly.  I'll give it a try.
> 
> I agree I am not sure if it will make a big difference on x86, however
> the more ranges you have to process the faster this approach should be
> as it stays parallel the entire time rather than having to drop out
> and process the last section one max order block at a time.

Right.


[PATCH v3 2/8] padata: initialize earlier

2020-05-27 Thread Daniel Jordan
padata will soon initialize the system's struct pages in parallel, so it
needs to be ready by page_alloc_init_late().

The error return from padata_driver_init() triggers an initcall warning,
so add a warning to padata_init() to avoid silent failure.

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 include/linux/padata.h |  6 ++
 init/main.c|  2 ++
 kernel/padata.c| 17 -
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index a0d8b41850b25..476ecfa41f363 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -164,6 +164,12 @@ struct padata_instance {
 #definePADATA_INVALID  4
 };
 
+#ifdef CONFIG_PADATA
+extern void __init padata_init(void);
+#else
+static inline void __init padata_init(void) {}
+#endif
+
 extern struct padata_instance *padata_alloc_possible(const char *name);
 extern void padata_free(struct padata_instance *pinst);
 extern struct padata_shell *padata_alloc_shell(struct padata_instance *pinst);
diff --git a/init/main.c b/init/main.c
index 03371976d3872..df32f67214d23 100644
--- a/init/main.c
+++ b/init/main.c
@@ -63,6 +63,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1482,6 +1483,7 @@ static noinline void __init kernel_init_freeable(void)
smp_init();
sched_init_smp();
 
+   padata_init();
page_alloc_init_late();
/* Initialize page ext after all struct pages are initialized. */
page_ext_init();
diff --git a/kernel/padata.c b/kernel/padata.c
index 835919c745266..6f709bc0fc413 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -31,7 +31,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #define MAX_OBJ_NUM 1000
 
@@ -1050,26 +1049,26 @@ void padata_free_shell(struct padata_shell *ps)
 }
 EXPORT_SYMBOL(padata_free_shell);
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static __init int padata_driver_init(void)
+void __init padata_init(void)
 {
+#ifdef CONFIG_HOTPLUG_CPU
int ret;
 
ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "padata:online",
  padata_cpu_online, NULL);
if (ret < 0)
-   return ret;
+   goto err;
hp_online = ret;
 
ret = cpuhp_setup_state_multi(CPUHP_PADATA_DEAD, "padata:dead",
  NULL, padata_cpu_dead);
if (ret < 0) {
cpuhp_remove_multi_state(hp_online);
-   return ret;
+   goto err;
}
-   return 0;
-}
-module_init(padata_driver_init);
 
+   return;
+err:
+   pr_warn("padata: initialization failed\n");
 #endif
+}
-- 
2.26.2



[PATCH v3 4/8] padata: add basic support for multithreaded jobs

2020-05-27 Thread Daniel Jordan
Sometimes the kernel doesn't take full advantage of system memory
bandwidth, leading to a single CPU spending excessive time in
initialization paths where the data scales with memory size.

Multithreading naturally addresses this problem.

Extend padata, a framework that handles many parallel yet singlethreaded
jobs, to also handle multithreaded jobs by adding support for splitting
up the work evenly, specifying a minimum amount of work that's
appropriate for one helper thread to do, load balancing between helpers,
and coordinating them.

This is inspired by work from Pavel Tatashin and Steve Sistare.

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 include/linux/padata.h |  29 
 kernel/padata.c| 152 -
 2 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 3bfa503503ac5..b0affa466a841 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -4,6 +4,9 @@
  *
  * Copyright (C) 2008, 2009 secunet Security Networks AG
  * Copyright (C) 2008, 2009 Steffen Klassert 
+ *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan 
  */
 
 #ifndef PADATA_H
@@ -130,6 +133,31 @@ struct padata_shell {
struct list_headlist;
 };
 
+/**
+ * struct padata_mt_job - represents one multithreaded job
+ *
+ * @thread_fn: Called for each chunk of work that a padata thread does.
+ * @fn_arg: The thread function argument.
+ * @start: The start of the job (units are job-specific).
+ * @size: size of this node's work (units are job-specific).
+ * @align: Ranges passed to the thread function fall on this boundary, with the
+ * possible exceptions of the beginning and end of the job.
+ * @min_chunk: The minimum chunk size in job-specific units.  This allows
+ * the client to communicate the minimum amount of work that's
+ * appropriate for one worker thread to do at once.
+ * @max_threads: Max threads to use for the job, actual number may be less
+ *   depending on task size and minimum chunk size.
+ */
+struct padata_mt_job {
+   void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
+   void*fn_arg;
+   unsigned long   start;
+   unsigned long   size;
+   unsigned long   align;
+   unsigned long   min_chunk;
+   int max_threads;
+};
+
 /**
  * struct padata_instance - The overall control structure.
  *
@@ -171,6 +199,7 @@ extern void padata_free_shell(struct padata_shell *ps);
 extern int padata_do_parallel(struct padata_shell *ps,
  struct padata_priv *padata, int *cb_cpu);
 extern void padata_do_serial(struct padata_priv *padata);
+extern void __init padata_do_multithreaded(struct padata_mt_job *job);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
  cpumask_var_t cpumask);
 extern int padata_start(struct padata_instance *pinst);
diff --git a/kernel/padata.c b/kernel/padata.c
index 78ff9aa529204..e78f57d9aef90 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -7,6 +7,9 @@
  * Copyright (C) 2008, 2009 secunet Security Networks AG
  * Copyright (C) 2008, 2009 Steffen Klassert 
  *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan 
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * version 2, as published by the Free Software Foundation.
@@ -21,6 +24,7 @@
  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -32,6 +36,8 @@
 #include 
 #include 
 
+#definePADATA_WORK_ONSTACK 1   /* Work's memory is on stack */
+
 struct padata_work {
struct work_struct  pw_work;
struct list_headpw_list;  /* padata_free_works linkage */
@@ -42,7 +48,17 @@ static DEFINE_SPINLOCK(padata_works_lock);
 static struct padata_work *padata_works;
 static LIST_HEAD(padata_free_works);
 
+struct padata_mt_job_state {
+   spinlock_t  lock;
+   struct completion   completion;
+   struct padata_mt_job*job;
+   int nworks;
+   int nworks_fini;
+   unsigned long   chunk_size;
+};
+
 static void padata_free_pd(struct parallel_data *pd);
+static void __init padata_mt_helper(struct work_struct *work);
 
 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
@@ -81,18 +97,56 @@ static struct padata_work *padata_work_alloc(void)
 }
 
 static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
-void *data)
+void *data, int flags)
 {
-   INIT_WORK(&pw->pw_work, work_fn);
+  

[PATCH v3 0/8] padata: parallelize deferred page init

2020-05-27 Thread Daniel Jordan
Thanks to Alex for his continued review and Josh for running v2!  Please
continue to review and test, and acks for the padata parts would be
appreciated.

Daniel

--

Deferred struct page init is a bottleneck in kernel boot--the biggest
for us and probably others.  Optimizing it maximizes availability for
large-memory systems and allows spinning up short-lived VMs as needed
without having to leave them running.  It also benefits bare metal
machines hosting VMs that are sensitive to downtime.  In projects such
as VMM Fast Restart[1], where guest state is preserved across kexec
reboot, it helps prevent application and network timeouts in the guests.

So, multithread deferred init to take full advantage of system memory
bandwidth.

Extend padata, a framework that handles many parallel singlethreaded
jobs, to handle multithreaded jobs as well by adding support for
splitting up the work evenly, specifying a minimum amount of work that's
appropriate for one helper thread to do, load balancing between helpers,
and coordinating them.  More documentation in patches 4 and 8.

This series is the first step in a project to address other memory
proportional bottlenecks in the kernel such as pmem struct page init,
vfio page pinning, hugetlb fallocate, and munmap.  Deferred page init
doesn't require concurrency limits, resource control, or priority
adjustments like these other users will because it happens during boot
when the system is otherwise idle and waiting for page init to finish.

This has been run on a variety of x86 systems and speeds up kernel boot
by 4% to 49%, saving up to 1.6 out of 4 seconds.  Patch 6 has more
numbers.

The powerpc and s390 lists are included in case they want to give this a
try; they had enabled this feature when it was configured per arch.

Series based on v5.7-rc7 plus these three from mmotm

  mm-call-touch_nmi_watchdog-on-max-order-boundaries-in-deferred-init.patch
  mm-initialize-deferred-pages-with-interrupts-enabled.patch
  mm-call-cond_resched-from-deferred_init_memmap.patch

and it's available here:

  git://oss.oracle.com/git/linux-dmjordan.git padata-mt-definit-v3
  
https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=shortlog;h=refs/heads/padata-mt-definit-v3

and the future users and related features are available as
work-in-progress:

  git://oss.oracle.com/git/linux-dmjordan.git padata-mt-wip-v0.5
  
https://oss.oracle.com/git/gitweb.cgi?p=linux-dmjordan.git;a=shortlog;h=refs/heads/padata-mt-wip-v0.5

v3:
 - Remove nr_pages accounting as suggested by Alex, adding a new patch
 - Align deferred init ranges up not down, simplify surrounding code (Alex)
 - Add Josh's T-b's from v2 (Josh's T-b's for v1 lost in rebase, apologies!)
 - Move padata.h include up in init/main.c to reduce patch collisions (Andrew)
 - Slightly reword Documentation patch
 - Rebase on v5.7-rc7 and retest

v2:
 - Improve the problem statement (Andrew, Josh, Pavel)
 - Add T-b's to unchanged patches (Josh)
 - Fully initialize max-order blocks to avoid buddy issues (Alex)
 - Parallelize on section-aligned boundaries to avoid potential
   false sharing (Alex)
 - Return the maximum thread count from a function that architectures
   can override, with the generic version returning 1 (current
   behavior).  Override for x86 since that's the only arch this series
   has been tested on so far.  Other archs can test with more threads
   by dropping patch 6.
 - Rebase to v5.7-rc6, rerun tests

RFC v4 [2] -> v1:
 - merged with padata (Peter)
 - got rid of the 'task' nomenclature (Peter, Jon)

future work branch:
 - made lockdep-aware (Jason, Peter)
 - adjust workqueue worker priority with renice_or_cancel() (Tejun)
 - fixed undo problem in VFIO (Alex)

The remaining feedback, mainly resource control awareness (cgroup etc),
is TODO for later series.

[1] 
https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf
https://www.youtube.com/watch?v=pBsHnf93tcQ

https://lore.kernel.org/linux-mm/1588812129-8596-1-git-send-email-anthony.yzn...@oracle.com/

[2] 
https://lore.kernel.org/linux-mm/20181105165558.11698-1-daniel.m.jor...@oracle.com/

Daniel Jordan (8):
  padata: remove exit routine
  padata: initialize earlier
  padata: allocate work structures for parallel jobs from a pool
  padata: add basic support for multithreaded jobs
  mm: don't track number of pages during deferred initialization
  mm: parallelize deferred_init_memmap()
  mm: make deferred init's max threads arch-specific
  padata: document multithreaded jobs

 Documentation/core-api/padata.rst |  41 +++--
 arch/x86/mm/init_64.c |  12 ++
 include/linux/memblock.h  |   3 +
 include/linux/padata.h|  43 -
 init/main.c   |   2 +
 kernel/padata.c   | 277 --
 mm/Kconfig|   6 +-
 mm/page_alloc.c   |  59 +-

[PATCH v3 6/8] mm: parallelize deferred_init_memmap()

2020-05-27 Thread Daniel Jordan
 29.2)  79.8% 48.7 (  7.4)
 100% ( 16)  21.0%813.7 ( 21.0)  80.5% 47.0 (  5.2)

Server-oriented distros that enable deferred page init sometimes run in
small VMs, and they still benefit even though the fraction of boot time
saved is smaller:

AMD EPYC 7551 32-Core Processor (Zen, kvm guest)
  1 node * 2 cores * 2 threads = 4 CPUs
  16G/node = 16G memory

                     kernel boot                deferred init
node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
      (  0)         --    716.0 ( 14.0)         --     49.7 (  0.6)
  25% (  1)       1.8%    703.0 (  5.3)      -4.0%     51.7 (  0.6)
  50% (  2)       1.6%    704.7 (  1.2)      43.0%     28.3 (  0.6)
  75% (  3)       2.7%    696.7 ( 13.1)      49.7%     25.0 (  0.0)
 100% (  4)       4.1%    687.0 ( 10.4)      55.7%     22.0 (  0.0)

Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz (Haswell, kvm guest)
  1 node * 2 cores * 2 threads = 4 CPUs
  14G/node = 14G memory

                     kernel boot                deferred init
node% (thr)    speedup  time_ms (stdev)    speedup  time_ms (stdev)
      (  0)         --    787.7 (  6.4)         --    122.3 (  0.6)
  25% (  1)       0.2%    786.3 ( 10.8)      -2.5%    125.3 (  2.1)
  50% (  2)       5.9%    741.0 ( 13.9)      37.6%     76.3 ( 19.7)
  75% (  3)       8.3%    722.0 ( 19.0)      49.9%     61.3 (  3.2)
 100% (  4)       9.3%    714.7 (  9.5)      56.4%     53.3 (  1.5)

On Josh's 96-CPU and 192G memory system:

Without this patch series:
[0.487132] node 0 initialised, 23398907 pages in 292ms
[0.499132] node 1 initialised, 24189223 pages in 304ms
...
[0.629376] Run /sbin/init as init process

With this patch series:
[0.231435] node 1 initialised, 24189223 pages in 32ms
[0.236718] node 0 initialised, 23398907 pages in 36ms

[1] 
https://static.sched.com/hosted_files/kvmforum2019/66/VMM-fast-restart_kvmforum2019.pdf

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 mm/Kconfig  |  6 +++---
 mm/page_alloc.c | 46 --
 2 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c358..04c1da3f9f44c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -750,13 +750,13 @@ config DEFERRED_STRUCT_PAGE_INIT
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
+   select PADATA
help
  Ordinarily all struct pages are initialised during early boot in a
  single thread. On very large machines this can take a considerable
  amount of time. If this option is set, large machines will bring up
- a subset of memmap at boot and then initialise the rest in parallel
- by starting one-off "pgdatinitX" kernel thread for each node X. This
- has a potential performance impact on processes running early in the
+ a subset of memmap at boot and then initialise the rest in parallel.
+ This has a potential performance impact on tasks running early in the
  lifetime of the system until these kthreads finish the
  initialisation.
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d64f3027fdfa6..1d47016849531 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1814,6 +1815,26 @@ deferred_init_maxorder(u64 *i, struct zone *zone, 
unsigned long *start_pfn,
return nr_pages;
 }
 
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+  void *arg)
+{
+   unsigned long spfn, epfn;
+   struct zone *zone = arg;
+   u64 i;
+
+   deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+   /*
+* Initialize and free pages in MAX_ORDER sized increments so that we
+* can avoid introducing any issues with the buddy allocator.
+*/
+   while (spfn < end_pfn) {
+   deferred_init_maxorder(&i, zone, &spfn, &epfn);
+   cond_resched();
+   }
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
@@ -1823,7 +1844,7 @@ static int __init deferred_init_memmap(void *data)
unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
-   int zid;
+   int zid, max_threads;
u64 i;
 
/* Bind memory initialisation thread to a local node if possible */
@@ -1863,13 +1884,26 @@ static int __init deferred_init_memmap(void *data)
goto zone_empty;
 
/*
-* Initialize and free pages in MAX_ORDER s

[PATCH v3 3/8] padata: allocate work structures for parallel jobs from a pool

2020-05-27 Thread Daniel Jordan
padata allocates per-CPU, per-instance work structs for parallel jobs.
A do_parallel call assigns a job to a sequence number and hashes the
number to a CPU, where the job will eventually run using the
corresponding work.

This approach fit with how padata used to bind a job to each CPU
round-robin, but it makes less sense after commit bfde23ce200e6 ("padata:
unbind parallel jobs from specific CPUs") because a work isn't bound to
a particular CPU anymore, and it isn't needed at all for multithreaded
jobs because they don't have sequence numbers.

Replace the per-CPU works with a preallocated pool, which allows sharing
them between existing padata users and the upcoming multithreaded user.
The pool will also facilitate setting NUMA-aware concurrency limits with
later users.

The pool is sized according to the number of possible CPUs.  With this
limit, MAX_OBJ_NUM no longer makes sense, so remove it.

If the global pool is exhausted, a parallel job is run in the current
task instead to throttle a system trying to do too much in parallel.

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 include/linux/padata.h |   8 +--
 kernel/padata.c| 118 +++--
 2 files changed, 78 insertions(+), 48 deletions(-)

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 476ecfa41f363..3bfa503503ac5 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -24,7 +24,6 @@
  * @list: List entry, to attach to the padata lists.
  * @pd: Pointer to the internal control structure.
  * @cb_cpu: Callback cpu for serializatioon.
- * @cpu: Cpu for parallelization.
  * @seq_nr: Sequence number of the parallelized data object.
  * @info: Used to pass information from the parallel to the serial function.
  * @parallel: Parallel execution function.
@@ -34,7 +33,6 @@ struct padata_priv {
struct list_headlist;
struct parallel_data*pd;
int cb_cpu;
-   int cpu;
unsigned intseq_nr;
int info;
void(*parallel)(struct padata_priv *padata);
@@ -68,15 +66,11 @@ struct padata_serial_queue {
 /**
  * struct padata_parallel_queue - The percpu padata parallel queue
  *
- * @parallel: List to wait for parallelization.
  * @reorder: List to wait for reordering after parallel processing.
- * @work: work struct for parallelization.
  * @num_obj: Number of objects that are processed by this cpu.
  */
 struct padata_parallel_queue {
-   struct padata_listparallel;
struct padata_listreorder;
-   struct work_structwork;
atomic_t  num_obj;
 };
 
@@ -111,7 +105,7 @@ struct parallel_data {
struct padata_parallel_queue__percpu *pqueue;
struct padata_serial_queue  __percpu *squeue;
atomic_trefcnt;
-   atomic_tseq_nr;
+   unsigned intseq_nr;
unsigned intprocessed;
int cpu;
struct padata_cpumask   cpumask;
diff --git a/kernel/padata.c b/kernel/padata.c
index 6f709bc0fc413..78ff9aa529204 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -32,7 +32,15 @@
 #include 
 #include 
 
-#define MAX_OBJ_NUM 1000
+struct padata_work {
+   struct work_struct  pw_work;
+   struct list_headpw_list;  /* padata_free_works linkage */
+   void*pw_data;
+};
+
+static DEFINE_SPINLOCK(padata_works_lock);
+static struct padata_work *padata_works;
+static LIST_HEAD(padata_free_works);
 
 static void padata_free_pd(struct parallel_data *pd);
 
@@ -58,30 +66,44 @@ static int padata_cpu_hash(struct parallel_data *pd, 
unsigned int seq_nr)
return padata_index_to_cpu(pd, cpu_index);
 }
 
-static void padata_parallel_worker(struct work_struct *parallel_work)
+static struct padata_work *padata_work_alloc(void)
 {
-   struct padata_parallel_queue *pqueue;
-   LIST_HEAD(local_list);
+   struct padata_work *pw;
 
-   local_bh_disable();
-   pqueue = container_of(parallel_work,
- struct padata_parallel_queue, work);
+   lockdep_assert_held(&padata_works_lock);
 
-   spin_lock(&pqueue->parallel.lock);
-   list_replace_init(&pqueue->parallel.list, &local_list);
-   spin_unlock(&pqueue->parallel.lock);
+   if (list_empty(&padata_free_works))
+   return NULL;/* No more work items allowed to be queued. */
 
-   while (!list_empty(&local_list)) {
-   struct padata_priv *padata;
+   pw = list_first_entry(&padata_free_works, struct padata_work, pw_list);
+   list_del(&pw->pw_list);
+   return pw;
+}
 
-   padata = list_entry(local_list.next,
-   struct padata_priv, list);

[PATCH v3 5/8] mm: don't track number of pages during deferred initialization

2020-05-27 Thread Daniel Jordan
Deferred page init used to report the number of pages initialized:

  node 0 initialised, 32439114 pages in 97ms

Tracking this makes the code more complicated when using multiple
threads.  Given that the statistic probably has limited value,
especially since a zone grows on demand so that the page count can vary,
just remove it.

The boot message now looks like

  node 0 deferred pages initialised in 97ms

Signed-off-by: Daniel Jordan 
Suggested-by: Alexander Duyck 
---
 mm/page_alloc.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d0c0d9364aa6d..d64f3027fdfa6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1819,7 +1819,7 @@ static int __init deferred_init_memmap(void *data)
 {
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
-   unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+   unsigned long spfn = 0, epfn = 0;
unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
@@ -1868,15 +1868,15 @@ static int __init deferred_init_memmap(void *data)
 * allocator.
 */
while (spfn < epfn) {
-   nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+   deferred_init_maxorder(&i, zone, &spfn, &epfn);
cond_resched();
}
 zone_empty:
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
 
-   pr_info("node %d initialised, %lu pages in %ums\n",
-   pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
+   pr_info("node %d deferred pages initialised in %ums\n",
+   pgdat->node_id, jiffies_to_msecs(jiffies - start));
 
pgdat_init_report_one_done();
return 0;
-- 
2.26.2



[PATCH v3 8/8] padata: document multithreaded jobs

2020-05-27 Thread Daniel Jordan
Add Documentation for multithreaded jobs.

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 Documentation/core-api/padata.rst | 41 +++
 1 file changed, 31 insertions(+), 10 deletions(-)

diff --git a/Documentation/core-api/padata.rst 
b/Documentation/core-api/padata.rst
index 9a24c111781d9..0830e5b0e8211 100644
--- a/Documentation/core-api/padata.rst
+++ b/Documentation/core-api/padata.rst
@@ -4,23 +4,26 @@
 The padata parallel execution mechanism
 ===
 
-:Date: December 2019
+:Date: May 2020
 
 Padata is a mechanism by which the kernel can farm jobs out to be done in
-parallel on multiple CPUs while retaining their ordering.  It was developed for
-use with the IPsec code, which needs to be able to perform encryption and
-decryption on large numbers of packets without reordering those packets.  The
-crypto developers made a point of writing padata in a sufficiently general
-fashion that it could be put to other uses as well.
+parallel on multiple CPUs while optionally retaining their ordering.
 
-Usage
-=
+It was originally developed for IPsec, which needs to perform encryption and
+decryption on large numbers of packets without reordering those packets.  This
+is currently the sole consumer of padata's serialized job support.
+
+Padata also supports multithreaded jobs, splitting up the job evenly while load
+balancing and coordinating between threads.
+
+Running Serialized Jobs
+===
 
 Initializing
 
 
-The first step in using padata is to set up a padata_instance structure for
-overall control of how jobs are to be run::
+The first step in using padata to run serialized jobs is to set up a
+padata_instance structure for overall control of how jobs are to be run::
 
 #include 
 
@@ -162,6 +165,24 @@ functions that correspond to the allocation in reverse::
 It is the user's responsibility to ensure all outstanding jobs are complete
 before any of the above are called.
 
+Running Multithreaded Jobs
+==
+
+A multithreaded job has a main thread and zero or more helper threads, with the
+main thread participating in the job and then waiting until all helpers have
+finished.  padata splits the job into units called chunks, where a chunk is a
+piece of the job that one thread completes in one call to the thread function.
+
+A user has to do three things to run a multithreaded job.  First, describe the
+job by defining a padata_mt_job structure, which is explained in the Interface
+section.  This includes a pointer to the thread function, which padata will
+call each time it assigns a job chunk to a thread.  Then, define the thread
+function, which accepts three arguments, ``start``, ``end``, and ``arg``, where
+the first two delimit the range that the thread operates on and the last is a
+pointer to the job's shared state, if any.  Prepare the shared state, which is
+typically allocated on the main thread's stack.  Last, call
+padata_do_multithreaded(), which will return once the job is finished.
+
 Interface
 =
 
-- 
2.26.2
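
To make the steps above concrete, here is a minimal sketch of a caller;
the thread function, the state structure, and the chunk and thread
limits are invented for illustration and are not part of this series:

    #include <linux/padata.h>
    #include <linux/sizes.h>        /* SZ_1M */
    #include <linux/string.h>       /* memset() */

    /* Hypothetical shared state, prepared on the main thread's stack. */
    struct fill_state {
            unsigned char *buf;     /* buffer the job fills */
    };

    /* The thread function; [start, end) is the chunk assigned to this call. */
    static void __init fill_chunk(unsigned long start, unsigned long end,
                                  void *arg)
    {
            struct fill_state *state = arg;

            memset(state->buf + start, 0, end - start);
    }

    static void __init fill_buffer_mt(unsigned char *buf, unsigned long len)
    {
            struct fill_state state = { .buf = buf };
            struct padata_mt_job job = {
                    .thread_fn   = fill_chunk,
                    .fn_arg      = &state,
                    .start       = 0,
                    .size        = len,
                    .align       = 1,       /* no alignment requirement */
                    .min_chunk   = SZ_1M,   /* smallest worthwhile piece */
                    .max_threads = 4,       /* arbitrary cap for the example */
            };

            /* Returns once the main thread and all helpers have finished. */
            padata_do_multithreaded(&job);
    }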



[PATCH v3 1/8] padata: remove exit routine

2020-05-27 Thread Daniel Jordan
padata_driver_exit() is unnecessary because padata isn't built as a
module and doesn't exit.

padata's init routine will soon allocate memory, so getting rid of the
exit function now avoids pointless code to free it.

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 kernel/padata.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/kernel/padata.c b/kernel/padata.c
index a6afa12fb75ee..835919c745266 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1072,10 +1072,4 @@ static __init int padata_driver_init(void)
 }
 module_init(padata_driver_init);
 
-static __exit void padata_driver_exit(void)
-{
-   cpuhp_remove_multi_state(CPUHP_PADATA_DEAD);
-   cpuhp_remove_multi_state(hp_online);
-}
-module_exit(padata_driver_exit);
 #endif
-- 
2.26.2



[PATCH v3 7/8] mm: make deferred init's max threads arch-specific

2020-05-27 Thread Daniel Jordan
Using padata during deferred init has only been tested on x86, so for
now limit it to this architecture.

If another arch wants this, it can find the max thread limit that's best
for it and override deferred_page_init_max_threads().

Signed-off-by: Daniel Jordan 
Tested-by: Josh Triplett 
---
 arch/x86/mm/init_64.c| 12 
 include/linux/memblock.h |  3 +++
 mm/page_alloc.c  | 13 -
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 8b5f73f5e207c..2d749ec12ea8a 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1260,6 +1260,18 @@ void __init mem_init(void)
mem_init_print_info(NULL);
 }
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+   /*
+* More CPUs always led to greater speedups on tested systems, up to
+* all the nodes' CPUs.  Use all since the system is otherwise idle
+* now.
+*/
+   return max_t(int, cpumask_weight(node_cpumask), 1);
+}
+#endif
+
 int kernel_set_to_readonly;
 
 void mark_rodata_ro(void)
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 6bc37a731d27b..2b289df44194f 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -275,6 +275,9 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone 
*zone,
 #define for_each_free_mem_pfn_range_in_zone_from(i, zone, p_start, p_end) \
for (; i != U64_MAX;  \
 __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end))
+
+int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask);
+
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 /**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d47016849531..329fd1a809c59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1835,6 +1835,13 @@ deferred_init_memmap_chunk(unsigned long start_pfn, 
unsigned long end_pfn,
}
 }
 
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+   return 1;
+}
+
 /* Initialise remaining memory on a node */
 static int __init deferred_init_memmap(void *data)
 {
@@ -1883,11 +1890,7 @@ static int __init deferred_init_memmap(void *data)
 first_init_pfn))
goto zone_empty;
 
-   /*
-* More CPUs always led to greater speedups on tested systems, up to
-* all the nodes' CPUs.  Use all since the system is otherwise idle now.
-*/
-   max_threads = max(cpumask_weight(cpumask), 1u);
+   max_threads = deferred_page_init_max_threads(cpumask);
 
while (spfn < epfn) {
unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
-- 
2.26.2
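
To illustrate the override hook this patch adds, here is a hedged sketch of
what another architecture's init code might do instead of the __weak default.
The architecture, the divisor, and the policy are made up for illustration and
are not part of the patch; the usual arch init includes are assumed.

/* Hypothetical override in some other arch's mm init code. */
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
	unsigned int ncpus = cpumask_weight(node_cpumask);

	/*
	 * Imaginary policy: this platform prefers to leave most of the
	 * node's CPUs alone during boot, so cap at a quarter of them.
	 */
	return max_t(int, ncpus / 4, 1);
}
#endif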



Re: [PATCH 5/6] powerpc/mmu: drop mmap_sem now that locked_vm is atomic

2019-04-24 Thread Daniel Jordan
On Wed, Apr 24, 2019 at 11:10:24AM +, Jason Gunthorpe wrote:
> On Tue, Apr 23, 2019 at 07:15:44PM -0700, Davidlohr Bueso wrote:
> > Wouldn't the cmpxchg alternative also be exposed the locked_vm changing 
> > between
> > validating the new value and the cmpxchg() and we'd bogusly fail even when 
> > there
> > is still just because the value changed (I'm assuming we don't hold any 
> > locks,
> > otherwise all this is pointless).

That's true, I hadn't considered that we could retry even when there's enough
locked_vm.  Seems like another one is that RLIMIT_MEMLOCK could change after
it's read.  I guess nothing's going to be perfect.  :/

> Well it needs a loop..
> 
> again:
>current_locked = atomic_read(&mm->locked_vm);
>new_locked = current_locked + npages;
>if (new_locked < lock_limit)
>   if (cmpxchg(&mm->locked_vm, current_locked, new_locked) != 
> current_locked)
> goto again;
> 
> So it won't have bogus failures as there is no unwind after
> error. Basically this is a load locked/store conditional style of
> locking pattern.

This is basically what I have so far.

> > > That's a good idea, and especially worth doing considering that an 
> > > arbitrary
> > > number of threads that charge a low amount of locked_vm can fail just 
> > > because
> > > one thread charges lots of it.
> > 
> > Yeah but the window for this is quite small, I doubt it would be a real 
> > issue.
>
> > What if before doing the atomic_add_return(), we first did the racy 
> > new_locked
> > check for ENOMEM, then do the speculative add and cleanup, if necessary. 
> > This
> > would further reduce the scope of the window where false ENOMEM can occur.

So the upside of this is that there's no retry loop so tasks don't spin under
heavy contention?  Seems better to always guard against false ENOMEM, at least
from the locked_vm side if not from the rlimit changing.

> > > pinned_vm appears to be broken the same way, so I can fix it too unless 
> > > someone
> > > beats me to it.
> > 
> > This should not be a surprise for the rdma folks. Cc'ing Jason nonetheless.
> 
> I think we accepted this tiny race as a side effect of removing the
> lock, which was very beneficial. Really the time window between the
> atomic failing and unwind is very small, and there are enough other
> ways a hostile user could DOS locked_vm that I don't think it really
> matters in practice..
> 
> However, the cmpxchg seems better, so a helper to implement that would
> probably be the best thing to do.

I've collapsed all the locked_vm users into such a helper and am now working on
converting the pinned_vm users to the same helper.  Taking longer than I
thought.
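
For reference, a self-contained sketch of the retry-loop style of charging
Jason describes above, assuming the atomic64_t locked_vm from patch 1 of the
series.  The helper name is made up, and this is not the code that was
ultimately posted (the eventual account_locked_vm() helper keeps mmap_sem
instead).

static int locked_vm_try_charge(struct mm_struct *mm, unsigned long npages)
{
	s64 limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	s64 locked, new_locked;

	do {
		locked = atomic64_read(&mm->locked_vm);
		new_locked = locked + (s64)npages;
		if (new_locked > limit && !capable(CAP_IPC_LOCK))
			return -ENOMEM;	/* nothing charged, no unwind needed */
		/* Raced with another updater?  Reread and try again. */
	} while (atomic64_cmpxchg(&mm->locked_vm, locked, new_locked) != locked);

	return 0;
}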


[PATCH] mm: add account_locked_vm utility function

2019-05-03 Thread Daniel Jordan
locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.  Standardize the debug prints, which vary
slightly.  Error codes stay the same, so user-visible behavior does too.

Signed-off-by: Daniel Jordan 
Cc: Alan Tull 
Cc: Alexey Kardashevskiy 
Cc: Alex Williamson 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Christophe Leroy 
Cc: Davidlohr Bueso 
Cc: Jason Gunthorpe 
Cc: Mark Rutland 
Cc: Michael Ellerman 
Cc: Moritz Fischer 
Cc: Paul Mackerras 
Cc: Steve Sistare 
Cc: Wu Hao 
Cc: linux...@kvack.org
Cc: k...@vger.kernel.org
Cc: kvm-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-f...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
---

Based on v5.1-rc7.  Tested with the vfio type1 driver.  Any feedback
welcome.

Andrew, this one patch replaces these six from [1]:

mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch
vfio-type1-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
vfio-spapr_tce-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
fpga-dlf-afu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
kvm-book3s-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
powerpc-mmu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch

That series converts locked_vm to an atomic, but on closer inspection it
causes at least one accounting race in mremap, and fixing it just for this
type conversion came with too much ugliness in the core mm to justify,
especially when the right long-term fix is making these drivers use pinned_vm instead.

Christophe's suggestion of cmpxchg[2] does prevent the races he
mentioned for pinned_vm, but others would still remain.  In perf_mmap
and the hfi1 driver, pinned_vm is checked against the rlimit racily and
then later increased when the pinned_vm originally read may have gone
stale.  None of the fixes I could think of seemed better than what's there
now, so I left it.  I have a patch that uses cmpxchg with
pinned_vm if others feel strongly that the aforementioned races should
be fixed.

Daniel

[1] https://lore.kernel.org/linux-mm/20190402204158.27582-1-daniel.m.jor...@oracle.com/
[2] https://lore.kernel.org/linux-mm/964bd5b0-f1e5-7bf0-5c58-18e75c550...@c-s.fr/

 arch/powerpc/kvm/book3s_64_vio.c| 44 +++-
 arch/powerpc/mm/mmu_context_iommu.c | 41 +++---
 drivers/fpga/dfl-afu-dma-region.c   | 53 +++--
 drivers/vfio/vfio_iommu_spapr_tce.c | 52 +---
 drivers/vfio/vfio_iommu_type1.c | 23 -
 include/linux/mm.h  | 19 +++
 mm/util.c   | 48 ++
 7 files changed, 94 insertions(+), 186 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index f02b04973710..f7d37fa6003a 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-   long ret = 0;
-
-   if (!current || !current->mm)
-   return ret; /* process exited */
-
-   down_write(¤t->mm->mmap_sem);
-
-   if (inc) {
-   unsigned long locked, lock_limit;
-
-   locked = current->mm->locked_vm + stt_pages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-   ret = -ENOMEM;
-   else
-   current->mm->locked_vm += stt_pages;
-   } else {
-   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-   stt_pages = current->mm->locked_vm;
-
-   current->mm->locked_vm -= stt_pages;
-   }
-
-   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-   inc ? '+' : '-',
-   stt_pages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(¤t->mm->mmap_sem);
-
-   return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -277,7 +241,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 
kvm_put_kvm(stt->kvm);
 
-   kvmppc_account_memlimit(
+   account_locked_vm(current->mm,
kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
call_rcu(&

Re: [PATCH] mm: add account_locked_vm utility function

2019-05-06 Thread Daniel Jordan
On Fri, May 03, 2019 at 11:28:22PM +, Jason Gunthorpe wrote:
> On Fri, May 03, 2019 at 01:16:30PM -0700, Daniel Jordan wrote:
> > Andrew, this one patch replaces these six from [1]:
> > 
> > mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch
> > vfio-type1-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> > vfio-spapr_tce-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> > fpga-dlf-afu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> > kvm-book3s-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> > powerpc-mmu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch
> > 
> > That series converts locked_vm to an atomic, but on closer inspection 
> > causes at
> > least one accounting race in mremap, and fixing it just for this type
> > conversion came with too much ugly in the core mm to justify, especially 
> > when
> > the right long-term fix is making these drivers use pinned_vm instead.
> 
> Did we ever decide what to do here? Should all these drivers be
> switched to pinned_vm anyhow?

Well, there were the concerns about switching in [1].  Alex, is there an
example of an application or library that would break or be exploitable?  If
there were particular worries (qemu for vfio type1, for example), perhaps some
coordinated changes across the kernel and userspace would be possible,
especially given the amount of effort it's likely going to take to get the
locked_vm/pinned_vm accounting sorted out.

[1] https://lore.kernel.org/linux-mm/20190213130330.76ef1...@w520.home/


Re: [PATCH] mm: add account_locked_vm utility function

2019-05-20 Thread Daniel Jordan
On Mon, May 20, 2019 at 04:19:34PM +1000, Alexey Kardashevskiy wrote:
> On 04/05/2019 06:16, Daniel Jordan wrote:
> > locked_vm accounting is done roughly the same way in five places, so
> > unify them in a helper.  Standardize the debug prints, which vary
> > slightly.
> 
> And I rather liked that prints were different and tell precisely which
> one of three each printk is.

I'm not following.  One of three...callsites?  But there were five callsites.

Anyway, I added a _RET_IP_ to the debug print so you can differentiate.

> I commented below but in general this seems working.
> 
> Tested-by: Alexey Kardashevskiy 

Thanks!  And for the review as well.

> > diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> > b/drivers/vfio/vfio_iommu_spapr_tce.c
> > index 6b64e45a5269..d39a1b830d82 100644
> > --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> > +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> > @@ -34,49 +35,13 @@
> >  static void tce_iommu_detach_group(void *iommu_data,
> > struct iommu_group *iommu_group);
> >  
> > -static long try_increment_locked_vm(struct mm_struct *mm, long npages)
> > +static int tce_account_locked_vm(struct mm_struct *mm, unsigned long 
> > npages,
> > +bool inc)
> >  {
> > -   long ret = 0, locked, lock_limit;
> > -
> > if (WARN_ON_ONCE(!mm))
> > return -EPERM;
> 
> 
> If this WARN_ON is the only reason for having tce_account_locked_vm()
> instead of calling account_locked_vm() directly, you can then ditch the
> check as I have never ever seen this triggered.

Great, will do.

> > diff --git a/drivers/vfio/vfio_iommu_type1.c 
> > b/drivers/vfio/vfio_iommu_type1.c
> > index d0f731c9920a..15ac76171ccd 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -273,25 +273,14 @@ static int vfio_lock_acct(struct vfio_dma *dma, long 
> > npage, bool async)
> > return -ESRCH; /* process exited */
> >  
> > ret = down_write_killable(&mm->mmap_sem);
> > -   if (!ret) {
> > -   if (npage > 0) {
> > -   if (!dma->lock_cap) {
> > -   unsigned long limit;
> > -
> > -   limit = task_rlimit(dma->task,
> > -   RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > -
> > -   if (mm->locked_vm + npage > limit)
> > -   ret = -ENOMEM;
> > -   }
> > -   }
> > +   if (ret)
> > +   goto out;
> 
> 
> A single "goto" to jump just 3 lines below seems unnecessary.

No strong preference here, I'll take out the goto.

> > +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool 
> > inc,
> > +   struct task_struct *task, bool bypass_rlim)
> > +{
> > +   unsigned long locked_vm, limit;
> > +   int ret = 0;
> > +
> > +   locked_vm = mm->locked_vm;
> > +   if (inc) {
> > +   if (!bypass_rlim) {
> > +   limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > +   if (locked_vm + pages > limit) {
> > +   ret = -ENOMEM;
> > +   goto out;
> > +   }
> > +   }
> 
> Nit:
> 
> if (!ret)
> 
> and then you don't need "goto out".

Ok, sure.

> > +   mm->locked_vm = locked_vm + pages;
> > +   } else {
> > +   WARN_ON_ONCE(pages > locked_vm);
> > +   mm->locked_vm = locked_vm - pages;
> 
> 
> Can go negative here. Not a huge deal but inaccurate imo.

I hear you, but setting a negative value to zero, as we had done previously,
doesn't make much sense to me.


[PATCH v2] mm: add account_locked_vm utility function

2019-05-24 Thread Daniel Jordan
locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.  Standardize the debug prints, which vary
slightly, but include the helper's caller to disambiguate between
callsites.

Error codes stay the same, so user-visible behavior does too.  The one
exception is that the -EPERM case in tce_account_locked_vm is removed
because Alexey has never seen it triggered.

Signed-off-by: Daniel Jordan 
Tested-by: Alexey Kardashevskiy 
Cc: Alan Tull 
Cc: Alex Williamson 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Christophe Leroy 
Cc: Davidlohr Bueso 
Cc: Jason Gunthorpe 
Cc: Mark Rutland 
Cc: Michael Ellerman 
Cc: Moritz Fischer 
Cc: Paul Mackerras 
Cc: Steve Sistare 
Cc: Wu Hao 
Cc: linux...@kvack.org
Cc: k...@vger.kernel.org
Cc: kvm-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-f...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
---

Against v5.2-rc1.

v2:
 - applied review comments from Alexey
 - added _RET_IP_ to debug print to disambiguate callers

 arch/powerpc/kvm/book3s_64_vio.c | 44 +++
 arch/powerpc/mm/book3s64/iommu_api.c | 41 +++--
 drivers/fpga/dfl-afu-dma-region.c| 53 +++
 drivers/vfio/vfio_iommu_spapr_tce.c  | 54 +++-
 drivers/vfio/vfio_iommu_type1.c  | 17 ++---
 include/linux/mm.h   | 19 ++
 mm/util.c| 46 
 7 files changed, 84 insertions(+), 190 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 66270e07449a..768b645c7edf 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-   long ret = 0;
-
-   if (!current || !current->mm)
-   return ret; /* process exited */
-
-   down_write(¤t->mm->mmap_sem);
-
-   if (inc) {
-   unsigned long locked, lock_limit;
-
-   locked = current->mm->locked_vm + stt_pages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-   ret = -ENOMEM;
-   else
-   current->mm->locked_vm += stt_pages;
-   } else {
-   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-   stt_pages = current->mm->locked_vm;
-
-   current->mm->locked_vm -= stt_pages;
-   }
-
-   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-   inc ? '+' : '-',
-   stt_pages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(¤t->mm->mmap_sem);
-
-   return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -302,7 +266,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 
kvm_put_kvm(stt->kvm);
 
-   kvmppc_account_memlimit(
+   account_locked_vm(current->mm,
kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
call_rcu(&stt->rcu, release_spapr_tce_table);
 
@@ -327,7 +291,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
return -EINVAL;
 
npages = kvmppc_tce_pages(size);
-   ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+   ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
if (ret)
return ret;
 
@@ -373,7 +337,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
kfree(stt);
  fail_acct:
-   kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+   account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 5c521f3924a5..18d22eec0ebd 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -51,40 +52,6 @@ struct mm_iommu_table_group_mem_t {
u64 dev_hpa;/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
-   unsigned long npage

Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-28 Thread Daniel Jordan
On Sat, May 25, 2019 at 02:51:18PM -0700, Andrew Morton wrote:
> On Fri, 24 May 2019 13:50:45 -0400 Daniel Jordan  
> wrote:
> 
> > locked_vm accounting is done roughly the same way in five places, so
> > unify them in a helper.  Standardize the debug prints, which vary
> > slightly, but include the helper's caller to disambiguate between
> > callsites.
> > 
> > Error codes stay the same, so user-visible behavior does too.  The one
> > exception is that the -EPERM case in tce_account_locked_vm is removed
> > because Alexey has never seen it triggered.
> > 
> > ...
> >
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1564,6 +1564,25 @@ long get_user_pages_unlocked(unsigned long start, 
> > unsigned long nr_pages,
> >  int get_user_pages_fast(unsigned long start, int nr_pages,
> > unsigned int gup_flags, struct page **pages);
> >  
> > +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool 
> > inc,
> > +   struct task_struct *task, bool bypass_rlim);
> > +
> > +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> > pages,
> > +   bool inc)
> > +{
> > +   int ret;
> > +
> > +   if (pages == 0 || !mm)
> > +   return 0;
> > +
> > +   down_write(&mm->mmap_sem);
> > +   ret = __account_locked_vm(mm, pages, inc, current,
> > + capable(CAP_IPC_LOCK));
> > +   up_write(&mm->mmap_sem);
> > +
> > +   return ret;
> > +}
> 
> That's quite a mouthful for an inlined function.  How about uninlining
> the whole thing and fiddling drivers/vfio/vfio_iommu_type1.c to suit. 
> I wonder why it does down_write_killable and whether it really needs
> to...

Sure, I can uninline it.  vfio changelogs don't show a particular reason for
_killable[1].  Maybe Alex has something to add.  Otherwise I'll respin without
it since the simplification seems worth removing _killable.

[1] 0cfef2b7410b ("vfio/type1: Remove locked page accounting workqueue")
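
As it turned out, v3 (below) retained down_write_killable() in the vfio type1
driver.  For context, here is a hedged sketch of how vfio_lock_acct() might
look when calling the uninlined helper directly; it is reconstructed from the
hunks quoted in this thread rather than copied from the posted patch.

static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
	struct mm_struct *mm;
	int ret;

	if (!npage)
		return 0;

	mm = async ? get_task_mm(dma->task) : dma->task->mm;
	if (!mm)
		return -ESRCH; /* process exited */

	/* Keep the killable write lock; only the accounting moves to the helper. */
	ret = down_write_killable(&mm->mmap_sem);
	if (!ret) {
		ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
					  dma->lock_cap);
		up_write(&mm->mmap_sem);
	}

	if (async)
		mmput(mm);

	return ret;
}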


Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-29 Thread Daniel Jordan
On Wed, May 29, 2019 at 11:05:48AM -0700, Ira Weiny wrote:
> On Fri, May 24, 2019 at 01:50:45PM -0400, Daniel Jordan wrote:
> > +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> > pages,
> > +   bool inc)
> > +{
> > +   int ret;
> > +
> > +   if (pages == 0 || !mm)
> > +   return 0;
> > +
> > +   down_write(&mm->mmap_sem);
> > +   ret = __account_locked_vm(mm, pages, inc, current,
> > + capable(CAP_IPC_LOCK));
> > +   up_write(&mm->mmap_sem);
> > +
> > +   return ret;
> > +}
> > +
...snip...
> > +/**
> > + * __account_locked_vm - account locked pages to an mm's locked_vm
> > + * @mm:  mm to account against, may be NULL
> 
> This kernel doc is wrong.  You dereference mm straight away...
...snip...
> > +
> > +   locked_vm = mm->locked_vm;
> 
> here...
> 
> Perhaps the comment was meant to document account_locked_vm()?

Yes, the comment got out of sync when I moved the !mm check outside
__account_locked_vm.  Thanks for catching, will fix.


[PATCH v3] mm: add account_locked_vm utility function

2019-05-29 Thread Daniel Jordan
locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.

Include the helper's caller in the debug print to distinguish between
callsites.

Error codes stay the same, so user-visible behavior does too.  The one
exception is that the -EPERM case in tce_account_locked_vm is removed
because Alexey has never seen it triggered.

Signed-off-by: Daniel Jordan 
Tested-by: Alexey Kardashevskiy 
Cc: Alan Tull 
Cc: Alex Williamson 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Christophe Leroy 
Cc: Davidlohr Bueso 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Cc: Mark Rutland 
Cc: Michael Ellerman 
Cc: Moritz Fischer 
Cc: Paul Mackerras 
Cc: Steve Sistare 
Cc: Wu Hao 
Cc: linux...@kvack.org
Cc: k...@vger.kernel.org
Cc: kvm-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-f...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
---
v3:
 - uninline account_locked_vm (Andrew)
 - fix doc comment (Ira)
 - retain down_write_killable in vfio type1 (Alex)
 - leave Alexey's T-b since the code is the same aside from uninlining
 - sanity tested with vfio type1, sanity-built on ppc

 arch/powerpc/kvm/book3s_64_vio.c | 44 ++--
 arch/powerpc/mm/book3s64/iommu_api.c | 41 ++-
 drivers/fpga/dfl-afu-dma-region.c| 53 ++--
 drivers/vfio/vfio_iommu_spapr_tce.c  | 54 ++--
 drivers/vfio/vfio_iommu_type1.c  | 17 +--
 include/linux/mm.h   |  4 ++
 mm/util.c| 75 
 7 files changed, 98 insertions(+), 190 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 66270e07449a..768b645c7edf 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-   long ret = 0;
-
-   if (!current || !current->mm)
-   return ret; /* process exited */
-
-   down_write(¤t->mm->mmap_sem);
-
-   if (inc) {
-   unsigned long locked, lock_limit;
-
-   locked = current->mm->locked_vm + stt_pages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-   ret = -ENOMEM;
-   else
-   current->mm->locked_vm += stt_pages;
-   } else {
-   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-   stt_pages = current->mm->locked_vm;
-
-   current->mm->locked_vm -= stt_pages;
-   }
-
-   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-   inc ? '+' : '-',
-   stt_pages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(¤t->mm->mmap_sem);
-
-   return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -302,7 +266,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 
kvm_put_kvm(stt->kvm);
 
-   kvmppc_account_memlimit(
+   account_locked_vm(current->mm,
kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
call_rcu(&stt->rcu, release_spapr_tce_table);
 
@@ -327,7 +291,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
return -EINVAL;
 
npages = kvmppc_tce_pages(size);
-   ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+   ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
if (ret)
return ret;
 
@@ -373,7 +337,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
kfree(stt);
  fail_acct:
-   kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+   account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c
index 5c521f3924a5..18d22eec0ebd 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -51,40 +52,6 @@ struct mm_iommu_table_group_mem_t {
u64 dev_hpa;/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locke
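
The mm/util.c hunk of v3 is cut off in this archive.  For reference, here is a
hedged reconstruction of the two helpers, pieced together from the fragments
quoted during review (the uninlined account_locked_vm(), the _RET_IP_ added to
the debug print, the corrected kerneldoc).  Treat it as a sketch of the posted
code; the exact debug format and export annotations are assumptions.

/**
 * __account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 * @task:        task used to check RLIMIT_MEMLOCK
 * @bypass_rlim: %true if the task should bypass the RLIMIT_MEMLOCK check
 *
 * Assumes @task and @mm are valid (i.e. at least one reference on each), and
 * that mmap_sem is held as writer.
 */
int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
			struct task_struct *task, bool bypass_rlim)
{
	unsigned long locked_vm, limit;
	int ret = 0;

	locked_vm = mm->locked_vm;
	if (inc) {
		if (!bypass_rlim) {
			limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
			if (locked_vm + pages > limit)
				ret = -ENOMEM;
		}
		if (!ret)
			mm->locked_vm = locked_vm + pages;
	} else {
		WARN_ON_ONCE(pages > locked_vm);
		mm->locked_vm = locked_vm - pages;
	}

	/* Include the caller so the standardized print still disambiguates sites. */
	pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
		 (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
		 locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
		 ret ? " - exceeded" : "");

	return ret;
}
EXPORT_SYMBOL_GPL(__account_locked_vm);

/**
 * account_locked_vm - account locked pages to an mm's locked_vm
 * @mm:          mm to account against, may be NULL
 * @pages:       number of pages to account
 * @inc:         %true if @pages should be considered positive, %false if not
 *
 * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
 */
int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
	int ret;

	if (pages == 0 || !mm)
		return 0;

	down_write(&mm->mmap_sem);
	ret = __account_locked_vm(mm, pages, inc, current,
				  capable(CAP_IPC_LOCK));
	up_write(&mm->mmap_sem);

	return ret;
}
EXPORT_SYMBOL_GPL(account_locked_vm);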

Re: [PATCH v8 18/24] mm: Provide speculative fault infrastructure

2018-02-26 Thread Daniel Jordan

Hi Laurent,

This series doesn't build for me[*] when CONFIG_TRANSPARENT_HUGEPAGE is unset.

The problem seems to be that the BUILD_BUG() version of pmd_same is called in 
pte_map_lock:

On 02/16/2018 10:25 AM, Laurent Dufour wrote:

+static bool pte_map_lock(struct vm_fault *vmf)
+{

...snip...

+   if (!pmd_same(pmdval, vmf->orig_pmd))
+   goto out;


Since SPF can now call pmd_same without THP, maybe the way to fix it is just

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 2cfa3075d148..e130692db24a 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -375,7 +375,8 @@ static inline int pte_unused(pte_t pte)
 #endif
 
 #ifndef __HAVE_ARCH_PMD_SAME

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
+defined(CONFIG_SPECULATIVE_PAGE_FAULT)
 static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 {
return pmd_val(pmd_a) == pmd_val(pmd_b);

?

Daniel


[*]  The errors are:

In file included from /home/dmjordan/src/linux/include/linux/kernel.h:10:0,
 from /home/dmjordan/src/linux/include/linux/list.h:9,
 from /home/dmjordan/src/linux/include/linux/smp.h:12,
 from /home/dmjordan/src/linux/include/linux/kernel_stat.h:5,
 from /home/dmjordan/src/linux/mm/memory.c:41:
In function ‘pmd_same.isra.104’,
inlined from ‘pte_map_lock’ at /home/dmjordan/src/linux/mm/memory.c:2380:7:
/home/dmjordan/src/linux/include/linux/compiler.h:324:38: error: call to 
‘__compiletime_assert_391’ declared with attribute error: BUILD_BUG failed
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^
/home/dmjordan/src/linux/include/linux/compiler.h:304:4: note: in definition of 
macro ‘__compiletime_assert’
prefix ## suffix();\
^~
/home/dmjordan/src/linux/include/linux/compiler.h:324:2: note: in expansion of 
macro ‘_compiletime_assert’
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^~~
/home/dmjordan/src/linux/include/linux/build_bug.h:45:37: note: in expansion of 
macro ‘compiletime_assert’
 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
 ^~
/home/dmjordan/src/linux/include/linux/build_bug.h:79:21: note: in expansion of 
macro ‘BUILD_BUG_ON_MSG’
 #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
 ^~~~
/home/dmjordan/src/linux/include/asm-generic/pgtable.h:391:2: note: in 
expansion of macro ‘BUILD_BUG’
  BUILD_BUG();
  ^
  CC  block/elevator.o
  CC  crypto/crypto_wq.o
In function ‘pmd_same.isra.104’,
inlined from ‘pte_spinlock’ at /home/dmjordan/src/linux/mm/memory.c:2326:7,
inlined from ‘handle_pte_fault’ at 
/home/dmjordan/src/linux/mm/memory.c:4181:7:
/home/dmjordan/src/linux/include/linux/compiler.h:324:38: error: call to 
‘__compiletime_assert_391’ declared with attribute error: BUILD_BUG failed
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^
/home/dmjordan/src/linux/include/linux/compiler.h:304:4: note: in definition of 
macro ‘__compiletime_assert’
prefix ## suffix();\
^~
/home/dmjordan/src/linux/include/linux/compiler.h:324:2: note: in expansion of 
macro ‘_compiletime_assert’
  _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
  ^~~
/home/dmjordan/src/linux/include/linux/build_bug.h:45:37: note: in expansion of 
macro ‘compiletime_assert’
 #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
 ^~
/home/dmjordan/src/linux/include/linux/build_bug.h:79:21: note: in expansion of 
macro ‘BUILD_BUG_ON_MSG’
 #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
 ^~~~
/home/dmjordan/src/linux/include/asm-generic/pgtable.h:391:2: note: in 
expansion of macro ‘BUILD_BUG’
  BUILD_BUG();
  ^
...
make[2]: *** [/home/dmjordan/src/linux/scripts/Makefile.build:316: mm/memory.o] 
Error 1
make[1]: *** [/home/dmjordan/src/linux/Makefile:1047: mm] Error 2