Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-29 Thread Alex Williamson
On Tue, 28 May 2019 11:04:24 -0400
Daniel Jordan  wrote:

> On Sat, May 25, 2019 at 02:51:18PM -0700, Andrew Morton wrote:
> > On Fri, 24 May 2019 13:50:45 -0400 Daniel Jordan 
> >  wrote:
> >   
> > > locked_vm accounting is done roughly the same way in five places, so
> > > unify them in a helper.  Standardize the debug prints, which vary
> > > slightly, but include the helper's caller to disambiguate between
> > > callsites.
> > > 
> > > Error codes stay the same, so user-visible behavior does too.  The one
> > > exception is that the -EPERM case in tce_account_locked_vm is removed
> > > because Alexey has never seen it triggered.
> > > 
> > > ...
> > >
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -1564,6 +1564,25 @@ long get_user_pages_unlocked(unsigned long start, 
> > > unsigned long nr_pages,
> > >  int get_user_pages_fast(unsigned long start, int nr_pages,
> > >   unsigned int gup_flags, struct page **pages);
> > >  
> > > +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool 
> > > inc,
> > > + struct task_struct *task, bool bypass_rlim);
> > > +
> > > +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> > > pages,
> > > + bool inc)
> > > +{
> > > + int ret;
> > > +
> > > + if (pages == 0 || !mm)
> > > + return 0;
> > > +
> > > + down_write(>mmap_sem);
> > > + ret = __account_locked_vm(mm, pages, inc, current,
> > > +   capable(CAP_IPC_LOCK));
> > > + up_write(>mmap_sem);
> > > +
> > > + return ret;
> > > +}  
> > 
> > That's quite a mouthful for an inlined function.  How about uninlining
> > the whole thing and fiddling drivers/vfio/vfio_iommu_type1.c to suit. 
> > I wonder why it does down_write_killable and whether it really needs
> > to...  
> 
> Sure, I can uninline it.  vfio changelogs don't show a particular reason for
> _killable[1].  Maybe Alex has something to add.  Otherwise I'll respin without
> it since the simplification seems worth removing _killable.
> 
> [1] 0cfef2b7410b ("vfio/type1: Remove locked page accounting workqueue")

A userspace vfio driver maps DMA via an ioctl through this path, so I
believe I used killable here just to be friendly that it could be
interrupted and we could fall out with an errno if it were stuck here.
No harm, no foul, the user's mapping is aborted and unwound.  If we're
deadlocked or seriously contended on mmap_sem, maybe we're already in
trouble, but it seemed like a valid and low hanging use case for
killable.  Thanks,

Alex


Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-29 Thread Daniel Jordan
On Wed, May 29, 2019 at 11:05:48AM -0700, Ira Weiny wrote:
> On Fri, May 24, 2019 at 01:50:45PM -0400, Daniel Jordan wrote:
> > +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> > pages,
> > +   bool inc)
> > +{
> > +   int ret;
> > +
> > +   if (pages == 0 || !mm)
> > +   return 0;
> > +
> > +   down_write(>mmap_sem);
> > +   ret = __account_locked_vm(mm, pages, inc, current,
> > + capable(CAP_IPC_LOCK));
> > +   up_write(>mmap_sem);
> > +
> > +   return ret;
> > +}
> > +
...snip...
> > +/**
> > + * __account_locked_vm - account locked pages to an mm's locked_vm
> > + * @mm:  mm to account against, may be NULL
> 
> This kernel doc is wrong.  You dereference mm straight away...
...snip...
> > +
> > +   locked_vm = mm->locked_vm;
> 
> here...
> 
> Perhaps the comment was meant to document account_locked_vm()?

Yes, the comment got out of sync when I moved the !mm check outside
__account_locked_vm.  Thanks for catching, will fix.


Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-29 Thread Ira Weiny
On Fri, May 24, 2019 at 01:50:45PM -0400, Daniel Jordan wrote:

[snip]

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 0e8834ac32b7..72c1034d2ec7 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1564,6 +1564,25 @@ long get_user_pages_unlocked(unsigned long start, 
> unsigned long nr_pages,
>  int get_user_pages_fast(unsigned long start, int nr_pages,
>   unsigned int gup_flags, struct page **pages);
>  
> +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
> + struct task_struct *task, bool bypass_rlim);
> +
> +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> pages,
> + bool inc)
> +{
> + int ret;
> +
> + if (pages == 0 || !mm)
> + return 0;
> +
> + down_write(>mmap_sem);
> + ret = __account_locked_vm(mm, pages, inc, current,
> +   capable(CAP_IPC_LOCK));
> + up_write(>mmap_sem);
> +
> + return ret;
> +}
> +
>  /* Container for pinned pfns / pages */
>  struct frame_vector {
>   unsigned int nr_allocated;  /* Number of frames we have space for */
> diff --git a/mm/util.c b/mm/util.c
> index e2e4f8c3fa12..bd3bdf16a084 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -6,6 +6,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -346,6 +347,51 @@ int __weak get_user_pages_fast(unsigned long start,
>  }
>  EXPORT_SYMBOL_GPL(get_user_pages_fast);
>  
> +/**
> + * __account_locked_vm - account locked pages to an mm's locked_vm
> + * @mm:  mm to account against, may be NULL

This kernel doc is wrong.  You dereference mm straight away...

> + * @pages:   number of pages to account
> + * @inc: %true if @pages should be considered positive, %false if not
> + * @task:task used to check RLIMIT_MEMLOCK
> + * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
> + *
> + * Assumes @task and @mm are valid (i.e. at least one reference on each), and
> + * that mmap_sem is held as writer.
> + *
> + * Return:
> + * * 0   on success
> + * * 0   if @mm is NULL (can happen for example if the task is exiting)
> + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
> + */
> +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
> + struct task_struct *task, bool bypass_rlim)
> +{
> + unsigned long locked_vm, limit;
> + int ret = 0;
> +
> + locked_vm = mm->locked_vm;

here...

Perhaps the comment was meant to document account_locked_vm()?  Or should the
parameter checks be moved here?

Ira

> + if (inc) {
> + if (!bypass_rlim) {
> + limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + if (locked_vm + pages > limit)
> + ret = -ENOMEM;
> + }
> + if (!ret)
> + mm->locked_vm = locked_vm + pages;
> + } else {
> + WARN_ON_ONCE(pages > locked_vm);
> + mm->locked_vm = locked_vm - pages;
> + }
> +
> + pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
> +  (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
> +  locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
> +  ret ? " - exceeded" : "");
> +
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(__account_locked_vm);
>

> +
>  unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
>   unsigned long len, unsigned long prot,
>   unsigned long flag, unsigned long pgoff)
> 
> base-commit: a188339ca5a396acc588e5851ed7e19f66b0ebd9
> -- 
> 2.21.0
> 


Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-28 Thread Daniel Jordan
On Sat, May 25, 2019 at 02:51:18PM -0700, Andrew Morton wrote:
> On Fri, 24 May 2019 13:50:45 -0400 Daniel Jordan  
> wrote:
> 
> > locked_vm accounting is done roughly the same way in five places, so
> > unify them in a helper.  Standardize the debug prints, which vary
> > slightly, but include the helper's caller to disambiguate between
> > callsites.
> > 
> > Error codes stay the same, so user-visible behavior does too.  The one
> > exception is that the -EPERM case in tce_account_locked_vm is removed
> > because Alexey has never seen it triggered.
> > 
> > ...
> >
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1564,6 +1564,25 @@ long get_user_pages_unlocked(unsigned long start, 
> > unsigned long nr_pages,
> >  int get_user_pages_fast(unsigned long start, int nr_pages,
> > unsigned int gup_flags, struct page **pages);
> >  
> > +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool 
> > inc,
> > +   struct task_struct *task, bool bypass_rlim);
> > +
> > +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> > pages,
> > +   bool inc)
> > +{
> > +   int ret;
> > +
> > +   if (pages == 0 || !mm)
> > +   return 0;
> > +
> > +   down_write(>mmap_sem);
> > +   ret = __account_locked_vm(mm, pages, inc, current,
> > + capable(CAP_IPC_LOCK));
> > +   up_write(>mmap_sem);
> > +
> > +   return ret;
> > +}
> 
> That's quite a mouthful for an inlined function.  How about uninlining
> the whole thing and fiddling drivers/vfio/vfio_iommu_type1.c to suit. 
> I wonder why it does down_write_killable and whether it really needs
> to...

Sure, I can uninline it.  vfio changelogs don't show a particular reason for
_killable[1].  Maybe Alex has something to add.  Otherwise I'll respin without
it since the simplification seems worth removing _killable.

[1] 0cfef2b7410b ("vfio/type1: Remove locked page accounting workqueue")


Re: [PATCH v2] mm: add account_locked_vm utility function

2019-05-25 Thread Andrew Morton
On Fri, 24 May 2019 13:50:45 -0400 Daniel Jordan  
wrote:

> locked_vm accounting is done roughly the same way in five places, so
> unify them in a helper.  Standardize the debug prints, which vary
> slightly, but include the helper's caller to disambiguate between
> callsites.
> 
> Error codes stay the same, so user-visible behavior does too.  The one
> exception is that the -EPERM case in tce_account_locked_vm is removed
> because Alexey has never seen it triggered.
> 
> ...
>
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1564,6 +1564,25 @@ long get_user_pages_unlocked(unsigned long start, 
> unsigned long nr_pages,
>  int get_user_pages_fast(unsigned long start, int nr_pages,
>   unsigned int gup_flags, struct page **pages);
>  
> +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
> + struct task_struct *task, bool bypass_rlim);
> +
> +static inline int account_locked_vm(struct mm_struct *mm, unsigned long 
> pages,
> + bool inc)
> +{
> + int ret;
> +
> + if (pages == 0 || !mm)
> + return 0;
> +
> + down_write(>mmap_sem);
> + ret = __account_locked_vm(mm, pages, inc, current,
> +   capable(CAP_IPC_LOCK));
> + up_write(>mmap_sem);
> +
> + return ret;
> +}

That's quite a mouthful for an inlined function.  How about uninlining
the whole thing and fiddling drivers/vfio/vfio_iommu_type1.c to suit. 
I wonder why it does down_write_killable and whether it really needs
to...



[PATCH v2] mm: add account_locked_vm utility function

2019-05-24 Thread Daniel Jordan
locked_vm accounting is done roughly the same way in five places, so
unify them in a helper.  Standardize the debug prints, which vary
slightly, but include the helper's caller to disambiguate between
callsites.

Error codes stay the same, so user-visible behavior does too.  The one
exception is that the -EPERM case in tce_account_locked_vm is removed
because Alexey has never seen it triggered.

Signed-off-by: Daniel Jordan 
Tested-by: Alexey Kardashevskiy 
Cc: Alan Tull 
Cc: Alex Williamson 
Cc: Andrew Morton 
Cc: Benjamin Herrenschmidt 
Cc: Christoph Lameter 
Cc: Christophe Leroy 
Cc: Davidlohr Bueso 
Cc: Jason Gunthorpe 
Cc: Mark Rutland 
Cc: Michael Ellerman 
Cc: Moritz Fischer 
Cc: Paul Mackerras 
Cc: Steve Sistare 
Cc: Wu Hao 
Cc: linux...@kvack.org
Cc: k...@vger.kernel.org
Cc: kvm-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-f...@vger.kernel.org
Cc: linux-ker...@vger.kernel.org
---

Against v5.2-rc1.

v2:
 - applied review comments from Alexey
 - added _RET_IP_ to debug print to disambiguate callers

 arch/powerpc/kvm/book3s_64_vio.c | 44 +++
 arch/powerpc/mm/book3s64/iommu_api.c | 41 +++--
 drivers/fpga/dfl-afu-dma-region.c| 53 +++
 drivers/vfio/vfio_iommu_spapr_tce.c  | 54 +++-
 drivers/vfio/vfio_iommu_type1.c  | 17 ++---
 include/linux/mm.h   | 19 ++
 mm/util.c| 46 
 7 files changed, 84 insertions(+), 190 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 66270e07449a..768b645c7edf 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long 
tce_pages)
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
-   long ret = 0;
-
-   if (!current || !current->mm)
-   return ret; /* process exited */
-
-   down_write(>mm->mmap_sem);
-
-   if (inc) {
-   unsigned long locked, lock_limit;
-
-   locked = current->mm->locked_vm + stt_pages;
-   lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-   if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-   ret = -ENOMEM;
-   else
-   current->mm->locked_vm += stt_pages;
-   } else {
-   if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
-   stt_pages = current->mm->locked_vm;
-
-   current->mm->locked_vm -= stt_pages;
-   }
-
-   pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
-   inc ? '+' : '-',
-   stt_pages << PAGE_SHIFT,
-   current->mm->locked_vm << PAGE_SHIFT,
-   rlimit(RLIMIT_MEMLOCK),
-   ret ? " - exceeded" : "");
-
-   up_write(>mm->mmap_sem);
-
-   return ret;
-}
-
 static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
 {
struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -302,7 +266,7 @@ static int kvm_spapr_tce_release(struct inode *inode, 
struct file *filp)
 
kvm_put_kvm(stt->kvm);
 
-   kvmppc_account_memlimit(
+   account_locked_vm(current->mm,
kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
call_rcu(>rcu, release_spapr_tce_table);
 
@@ -327,7 +291,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
return -EINVAL;
 
npages = kvmppc_tce_pages(size);
-   ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+   ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
if (ret)
return ret;
 
@@ -373,7 +337,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
kfree(stt);
  fail_acct:
-   kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+   account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
return ret;
 }
 
diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
b/arch/powerpc/mm/book3s64/iommu_api.c
index 5c521f3924a5..18d22eec0ebd 100644
--- a/arch/powerpc/mm/book3s64/iommu_api.c
+++ b/arch/powerpc/mm/book3s64/iommu_api.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -51,40 +52,6 @@ struct mm_iommu_table_group_mem_t {
u64 dev_hpa;/* Device memory base address */
 };
 
-static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
-   unsigned long npages, bool incr)
-{
-   long ret = 0, locked, lock_limit;
-
-   if (!npages)
-   return 0;
-
-   down_write(>mmap_sem);
-
-   if (incr) {
-   locked