[PATCH v2 0/3] mm/gup: introduce vaddr_pin_pages_remote(), FOLL_PIN

2019-08-20 Thread John Hubbard
Hi Ira,

This is for your tree. I'm dropping the RFC because this aspect is
starting to firm up pretty well.

I've moved FOLL_PIN inside the vaddr_pin_*() routines, and moved
FOLL_LONGTERM outside, based on our recent discussions. This is
documented pretty well within the patches.

Note that there are a lot of references in comments and commit
logs, to vaddr_pin_pages(). We'll want to catch all of those if
we rename that. I am pushing pretty hard to rename it to
vaddr_pin_user_pages().

v1 of this may be found here:
https://lore.kernel.org/r/20190812015044.26176-1-jhubb...@nvidia.com
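
To make that split concrete, here is a caller-side sketch (illustration
only, not part of the patches; it assumes the vaddr_pin_pages() signature
from patch 2/3):

	/*
	 * Sketch: a long-term pin caller. FOLL_PIN is deliberately *not*
	 * set here, because vaddr_pin_pages() ORs it in internally;
	 * FOLL_LONGTERM, by contrast, is the caller's decision.
	 */
	static long example_longterm_pin(unsigned long addr,
					 unsigned long nr_pages,
					 struct page **pages,
					 struct vaddr_pin *vaddr_pin)
	{
		unsigned int gup_flags = FOLL_WRITE | FOLL_LONGTERM;

		return vaddr_pin_pages(addr, nr_pages, gup_flags, pages,
				       vaddr_pin);
	}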

John Hubbard (3):
  For Ira: tiny formatting tweak to kerneldoc
  mm/gup: introduce FOLL_PIN flag for get_user_pages()
  mm/gup: introduce vaddr_pin_pages_remote(), and invoke it

 drivers/infiniband/core/umem.c |  1 +
 include/linux/mm.h | 61 ++
 mm/gup.c   | 40 --
 mm/process_vm_access.c | 23 +++--
 4 files changed, 106 insertions(+), 19 deletions(-)

-- 
2.22.1



[PATCH v2 1/3] For Ira: tiny formatting tweak to kerneldoc

2019-08-20 Thread John Hubbard
For your vaddr_pin_pages() and vaddr_unpin_pages().
Just merge it into wherever it goes please. Didn't want to
cause merge problems so it's a separate patch-let.

Signed-off-by: John Hubbard 
---
 mm/gup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 56421b880325..e49096d012ea 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2465,7 +2465,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages,
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 /**
- * vaddr_pin_pages pin pages by virtual address and return the pages to the
+ * vaddr_pin_pages() - pin pages by virtual address and return the pages to the
  * user.
  *
  * @addr: start address
@@ -2505,7 +2505,7 @@ long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
 EXPORT_SYMBOL(vaddr_pin_pages);
 
 /**
- * vaddr_unpin_pages - counterpart to vaddr_pin_pages
+ * vaddr_unpin_pages() - counterpart to vaddr_pin_pages
  *
  * @pages: array of pages returned
  * @nr_pages: number of pages in pages
-- 
2.22.1



disregard: [PATCH 1/4] checkpatch: revert broken NOTIFIER_HEAD check

2019-08-20 Thread John Hubbard

On 8/20/19 9:03 PM, John Hubbard wrote:

commit 1a47005dd5aa ("checkpatch: add *_NOTIFIER_HEAD as var
definition") causes the following warning when run on some
patches:



Please disregard this series. It's stale.

thanks,
--
John Hubbard
NVIDIA


[PATCH v2 3/3] mm/gup: introduce vaddr_pin_pages_remote(), and invoke it

2019-08-20 Thread John Hubbard
vaddr_pin_user_pages_remote() is the vaddr_pin_pages() counterpart to
get_user_pages_remote(), except that:
   a) it sets FOLL_PIN, and
   b) it can handle FOLL_LONGTERM (and the associated vaddr_pin arg).

Change process_vm_rw_single_vec() to invoke the new function.

Signed-off-by: John Hubbard 
Cc: Ira Weiny 
---
 include/linux/mm.h |  5 +
 mm/gup.c   | 34 ++
 mm/process_vm_access.c | 23 +--
 3 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6e7de424bf5e..849b509e9f89 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1606,6 +1606,11 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
 long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
 unsigned int gup_flags, struct page **pages,
 struct vaddr_pin *vaddr_pin);
+long vaddr_pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+unsigned long start, unsigned long nr_pages,
+unsigned int gup_flags, struct page **pages,
+struct vm_area_struct **vmas, int *locked,
+struct vaddr_pin *vaddr_pin);
 void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
   struct vaddr_pin *vaddr_pin, bool make_dirty);
 bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page);
diff --git a/mm/gup.c b/mm/gup.c
index ba316d960d7a..d713ed9d4b9a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2522,3 +2522,37 @@ void vaddr_unpin_pages(struct page **pages, unsigned long nr_pages,
__put_user_pages_dirty_lock(pages, nr_pages, make_dirty, vaddr_pin);
 }
 EXPORT_SYMBOL(vaddr_unpin_pages);
+
+/**
+ * vaddr_pin_user_pages_remote() - pin pages by virtual address and return the
+ * pages to the user.
+ *
+ * @tsk:   the task_struct to use for page fault accounting, or
+ * NULL if faults are not to be recorded.
+ * @mm:        mm_struct of target mm
+ * @start:     start address
+ * @nr_pages:  number of pages to pin
+ * @gup_flags: flags to use for the pin. Please see FOLL_* documentation in
+ * mm.h.
+ * @pages:     array of pages returned
+ * @vaddr_pin: If FOLL_LONGTERM is set, then vaddr_pin should point to an
+ * initialized struct that contains the owning mm and file. Otherwise,
+ * vaddr_pin should be set to NULL.
+ *
+ * This is the vaddr_pin_pages() counterpart to
+ * get_user_pages_remote(), except that:
+ *a) it sets FOLL_PIN, and
+ *b) it can handle FOLL_LONGTERM (and the associated vaddr_pin arg).
+ */
+long vaddr_pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+unsigned long start, unsigned long nr_pages,
+unsigned int gup_flags, struct page **pages,
+struct vm_area_struct **vmas, int *locked,
+struct vaddr_pin *vaddr_pin)
+{
+   gup_flags |= FOLL_TOUCH | FOLL_REMOTE | FOLL_PIN;
+
+   return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+  locked, gup_flags, vaddr_pin);
+}
+EXPORT_SYMBOL(vaddr_pin_user_pages_remote);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7bef6c0..28e0a17b6080 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -44,7 +44,6 @@ static int process_vm_rw_pages(struct page **pages,
 
if (vm_write) {
copied = copy_page_from_iter(page, offset, copy, iter);
-   set_page_dirty_lock(page);
} else {
copied = copy_page_to_iter(page, offset, copy, iter);
}
@@ -96,7 +95,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;
 
while (!rc && nr_pages && iov_iter_count(iter)) {
-   int pages = min(nr_pages, max_pages_per_loop);
+   int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;
 
@@ -106,14 +105,17 @@ static int process_vm_rw_single_vec(unsigned long addr,
 * current/current->mm
 */
 		down_read(&mm->mmap_sem);
-		pages = get_user_pages_remote(task, mm, pa, pages, flags,
-					      process_pages, NULL, &locked);
+
+		pinned_pages = vaddr_pin_user_pages_remote(task, mm, pa,
+							   pinned_pages, flags,
+							   process_pages, NULL,
+							   &locked, NULL);
 		if (locked)
 			up_read(&mm->mmap_sem);

[PATCH v2 2/3] mm/gup: introduce FOLL_PIN flag for get_user_pages()

2019-08-20 Thread John Hubbard
As explained in the newly added documentation for FOLL_PIN and
FOLL_LONGTERM, in every case where vaddr_pin_pages() is required,
FOLL_PIN must be set. That reason, plus a desire to keep FOLL_PIN
an internal (to get_user_pages() and follow_page()) detail, is why
vaddr_pin_pages() sets FOLL_PIN.

FOLL_LONGTERM, on the other hand, is only set in *some* cases, but
not all. For that reason, this patch moves the setting of FOLL_LONGTERM
out to the caller.

Also add fairly extensive documentation of the meaning and use
of both FOLL_PIN and FOLL_LONGTERM.

Thanks to Jan Kara and Vlastimil Babka for explaining the 4 cases
in this documentation. (I've reworded it and expanded on it slightly.)

The motivation behind moving away from "bare" get_user_pages() calls
is described in more detail in commit fc1d8e7cca2d
("mm: introduce put_user_page*(), placeholder versions").

Cc: Vlastimil Babka 
Cc: Jan Kara 
Cc: Michal Hocko 
Cc: Ira Weiny 
Signed-off-by: John Hubbard 
---
 drivers/infiniband/core/umem.c |  1 +
 include/linux/mm.h | 56 ++
 mm/gup.c   |  2 +-
 3 files changed, 52 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index e69eecb0023f..d84f1bfb8d21 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -300,6 +300,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
 
while (npages) {
 		down_read(&mm->mmap_sem);
+   gup_flags |= FOLL_LONGTERM;
ret = vaddr_pin_pages(cur_base,
 min_t(unsigned long, npages,
   PAGE_SIZE / sizeof (struct page *)),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc675e94ddf8..6e7de424bf5e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2644,6 +2644,8 @@ static inline vm_fault_t vmf_error(int err)
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 unsigned int foll_flags);
 
+/* Flags for follow_page(), get_user_pages ("GUP"), and vaddr_pin_pages(): */
+
 #define FOLL_WRITE 0x01/* check pte is writable */
 #define FOLL_TOUCH 0x02/* mark page accessed */
 #define FOLL_GET   0x04/* do get_page on page */
@@ -2663,13 +2665,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_ANON  0x8000  /* don't do file mappings */
 #define FOLL_LONGTERM	0x10000	/* mapping lifetime is indefinite: see below */
 #define FOLL_SPLIT_PMD	0x20000	/* split huge pmd before returning */
+#define FOLL_PIN	0x40000	/* pages must be released via put_user_page() */
 
 /*
- * NOTE on FOLL_LONGTERM:
+ * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
+ * other. Here is what they mean, and how to use them:
  *
  * FOLL_LONGTERM indicates that the page will be held for an indefinite time
- * period _often_ under userspace control.  This is contrasted with
- * iov_iter_get_pages() where usages which are transient.
+ * period _often_ under userspace control.  This is in contrast to
+ * iov_iter_get_pages(), where usages are transient.
  *
  * FIXME: For pages which are part of a filesystem, mappings are subject to the
  * lifetime enforced by the filesystem and we need guarantees that longterm
@@ -2684,11 +2688,51 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
  * Currently only get_user_pages() and get_user_pages_fast() support this flag
  * and calls to get_user_pages_[un]locked are specifically not allowed.  This
  * is due to an incompatibility with the FS DAX check and
- * FAULT_FLAG_ALLOW_RETRY
+ * FAULT_FLAG_ALLOW_RETRY.
  *
- * In the CMA case: longterm pins in a CMA region would unnecessarily fragment
- * that region.  And so CMA attempts to migrate the page before pinning when
+ * In the CMA case: long term pins in a CMA region would unnecessarily fragment
+ * that region.  And so, CMA attempts to migrate the page before pinning, when
  * FOLL_LONGTERM is specified.
+ *
+ * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount,
+ * but an additional pin counting system) will be invoked. This is intended for
+ * anything that gets a page reference and then touches page data (for example,
+ * Direct IO). This lets the filesystem know that some non-file-system entity is
+ * potentially changing the pages' data. FOLL_PIN pages must be released,
+ * ultimately, by a call to put_user_page(). Typically that will be via one of
+ * the vaddr_unpin_pages() variants.
+ *
+ * FIXME: note that this special tracking is not in place yet. However, the
+ * pages should still be released by put_user_page().
+ *
+ * When and where to use each flag:
+ *
+ * CASE 1: Direct IO (DIO). There are GUP references to pages that are serving
+ * as DIO buffers. These

Re: [PATCH 0/3] mm/: 3 more put_user_page() conversions

2019-08-06 Thread John Hubbard
On 8/6/19 2:59 PM, Andrew Morton wrote:
> On Mon,  5 Aug 2019 15:20:16 -0700 john.hubb...@gmail.com wrote:
> 
>> Here are a few more mm/ files that I wasn't ready to send with the
>> larger 34-patch set.
> 
> Seems that a v3 of "put_user_pages(): miscellaneous call sites" is in
> the works, so can we make that a 37 patch series?
> 

Sure, I'll add them to that.

thanks,
-- 
John Hubbard
NVIDIA


[PATCH v3 26/41] futex: convert put_page() to put_user_page*()

2019-08-06 Thread john . hubbard
From: John Hubbard 

For pages that were retained via get_user_pages*(), release those pages
via the new put_user_page*() routines, instead of via put_page() or
release_pages().

This is part of a tree-wide conversion, as described in commit fc1d8e7cca2d
("mm: introduce put_user_page*(), placeholder versions").

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: Darren Hart 
Signed-off-by: John Hubbard 
---
 kernel/futex.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 6d50728ef2e7..4b4cae58ec57 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -623,7 +623,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
lock_page(page);
shmem_swizzled = PageSwapCache(page) || page->mapping;
unlock_page(page);
-   put_page(page);
+   put_user_page(page);
 
if (shmem_swizzled)
goto again;
@@ -675,7 +675,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
 
if (READ_ONCE(page->mapping) != mapping) {
rcu_read_unlock();
-   put_page(page);
+   put_user_page(page);
 
goto again;
}
@@ -683,7 +683,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
-   put_page(page);
+   put_user_page(page);
 
goto again;
}
@@ -702,7 +702,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
 */
 	if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
-   put_page(page);
+   put_user_page(page);
 
goto again;
}
@@ -723,7 +723,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
}
 
 out:
-   put_page(page);
+   put_user_page(page);
return err;
 }
 
-- 
2.22.0



[PATCH v3 18/41] drivers/tee: convert put_page() to put_user_page*()

2019-08-06 Thread john . hubbard
From: John Hubbard 

For pages that were retained via get_user_pages*(), release those pages
via the new put_user_page*() routines, instead of via put_page() or
release_pages().

This is part of a tree-wide conversion, as described in commit fc1d8e7cca2d
("mm: introduce put_user_page*(), placeholder versions").

Acked-by: Jens Wiklander 
Signed-off-by: John Hubbard 
---
 drivers/tee/tee_shm.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 2da026fd12c9..c967d0420b67 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -31,16 +31,13 @@ static void tee_shm_release(struct tee_shm *shm)
 
poolm->ops->free(poolm, shm);
} else if (shm->flags & TEE_SHM_REGISTER) {
-   size_t n;
int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm);
 
if (rc)
dev_err(teedev->dev.parent,
"unregister shm %p failed: %d", shm, rc);
 
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
-
+   put_user_pages(shm->pages, shm->num_pages);
kfree(shm->pages);
}
 
@@ -313,16 +310,13 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,
return shm;
 err:
if (shm) {
-   size_t n;
-
if (shm->id >= 0) {
 			mutex_lock(&teedev->mutex);
 			idr_remove(&teedev->idr, shm->id);
 			mutex_unlock(&teedev->mutex);
}
if (shm->pages) {
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
+   put_user_pages(shm->pages, shm->num_pages);
kfree(shm->pages);
}
}
-- 
2.22.0



[PATCH v3 14/41] vmci: convert put_page() to put_user_page*()

2019-08-06 Thread john . hubbard
From: John Hubbard 

For pages that were retained via get_user_pages*(), release those pages
via the new put_user_page*() routines, instead of via put_page() or
release_pages().

This is part of a tree-wide conversion, as described in commit fc1d8e7cca2d
("mm: introduce put_user_page*(), placeholder versions").

Note that this effectively changes the code's behavior in
qp_release_pages(): it now ultimately calls set_page_dirty_lock(),
instead of set_page_dirty(). This is probably more accurate.

As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
dealing with a file backed page where we have reference on the inode it
hangs off." [1]

[1] https://lore.kernel.org/r/20190723153640.gb...@lst.de
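
For reference, the difference is that set_page_dirty_lock() brackets the
dirtying with the page lock, which stabilizes page->mapping. A sketch of
the mm/page-writeback.c routine (close to, but not verbatim, the real one):

	int set_page_dirty_lock(struct page *page)
	{
		int ret;

		lock_page(page);	/* stabilizes page->mapping */
		ret = set_page_dirty(page);
		unlock_page(page);
		return ret;
	}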

Cc: Arnd Bergmann 
Cc: Al Viro 
Cc: Gustavo A. R. Silva 
Cc: Kees Cook 
Signed-off-by: John Hubbard 
---
 drivers/misc/vmw_vmci/vmci_context.c|  2 +-
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 11 ++-
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/misc/vmw_vmci/vmci_context.c b/drivers/misc/vmw_vmci/vmci_context.c
index 16695366ec92..9daa52ee63b7 100644
--- a/drivers/misc/vmw_vmci/vmci_context.c
+++ b/drivers/misc/vmw_vmci/vmci_context.c
@@ -587,7 +587,7 @@ void vmci_ctx_unset_notify(struct vmci_ctx *context)
 
if (notify_page) {
kunmap(notify_page);
-   put_page(notify_page);
+   put_user_page(notify_page);
}
 }
 
diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 8531ae781195..e5434551d0ef 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -626,15 +626,8 @@ static void qp_release_queue_mutex(struct vmci_queue *queue)
 static void qp_release_pages(struct page **pages,
 u64 num_pages, bool dirty)
 {
-   int i;
-
-   for (i = 0; i < num_pages; i++) {
-   if (dirty)
-   set_page_dirty(pages[i]);
-
-   put_page(pages[i]);
-   pages[i] = NULL;
-   }
+   put_user_pages_dirty_lock(pages, num_pages, dirty);
+   memset(pages, 0, num_pages * sizeof(struct page *));
 }
 
 /*
-- 
2.22.0



Re: [PATCH 00/12] block/bio, fs: convert put_page() to put_user_page*()

2019-08-07 Thread John Hubbard

On 8/6/19 11:34 PM, Christoph Hellwig wrote:

On Mon, Aug 05, 2019 at 03:54:35PM -0700, John Hubbard wrote:

On 7/23/19 11:17 PM, Christoph Hellwig wrote:

...

I think we can do this in a simple and better way.  We have 5 ITER_*
types.  Of those ITER_DISCARD as the name suggests never uses pages, so
we can skip handling it.  ITER_PIPE is rejected in the direct I/O path,
which leaves us with three.
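
In code form, that classification might reduce to something like this
sketch (mine, not Christoph's patch; iov_iter_type() and the ITER_*
values are from include/linux/uio.h):

	/* sketch: which iov_iter types can carry user pages worth pinning */
	static bool iter_has_pinnable_pages(const struct iov_iter *i)
	{
		switch (iov_iter_type(i)) {
		case ITER_IOVEC:
			return true;	/* user memory: gup/pin territory */
		case ITER_BVEC:
		case ITER_KVEC:
			return false;	/* pages already referenced by owner */
		case ITER_PIPE:		/* rejected in the direct I/O path */
		case ITER_DISCARD:	/* never uses pages */
		default:
			return false;
		}
	}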



Hi Christoph,

Are you working on anything like this?


I was hoping I could steer you towards it.  But if you don't want to do
it yourself I'll add it to my ever growing todo list.



Sure, I'm up for this. The bvec-related items are the next logical part
of the gup/dma conversions to work on, and I just wanted to avoid solving the
same problem if you were already in the code.



Or on the put_user_bvec() idea?


I have a prototype from two months ago:

http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/gup-bvec

but that only survived the most basic testing, so it'll need more work,
which I'm not sure when I'll find time for.



I'll take a peek, and probably pester you with a few questions if I get
confused. :)

thanks,
--
John Hubbard
NVIDIA


Re: Warnings whilst building 5.2.0+

2019-08-07 Thread John Hubbard

On 8/6/19 11:30 PM, Chris Clayton wrote:

On 09/07/2019 12:39, Chris Clayton wrote:

On 09/07/2019 11:37, Enrico Weigelt, metux IT consult wrote:

On 09.07.19 08:06, Chris Clayton wrote:

...

Can you check older versions, too ? Maybe also trying older gcc ?



I see the same warnings building linux-5.2.0 with gcc9. However, I don't see
the warnings building linux-5.2.0 with the 20190705 snapshot of gcc8. So the
warnings could result from an improvement (i.e. the problem was in the kernel,
but undiscovered by gcc8) or from a regression in gcc9.



 From the discussion starting at 
https://marc.info/?l=linux-kernel&m=156401014023908, it would appear that the 
problem is
undiscovered by gcc8. Building a fresh pull of Linus' tree this morning 
(v5.3-rc3-282-g33920f1ec5bf), I see that the
warnings are still being emitted. Adding the participants in the other 
discussion to this one.



The warnings are still there because the fix has not been committed to any
tree yet.

If you could try out my proposed fix [1], and reply to that thread with perhaps 
a
Tested-by tag, that would help encourage the maintainers to accept it.

So far it hasn't made it to the top of their inboxes, but I'm hoping... :)


[1] https://lore.kernel.org/r/20190731054627.5627-2-jhubb...@nvidia.com
("x86/boot: save fields explicitly, zero out everything else")

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2] x86/boot: save fields explicitly, zero out everything else

2019-08-07 Thread John Hubbard
On 8/7/19 4:41 AM, David Laight wrote:
> From: john.hubb...@gmail.com
>> Sent: 31 July 2019 06:46
...
>>  if (boot_params->sentinel) {
>> -/* fields in boot_params are left uninitialized, clear them */
>> -boot_params->acpi_rsdp_addr = 0;
>> -		memset(&boot_params->ext_ramdisk_image, 0,
>> -		       (char *)&boot_params->efi_info -
>> -		       (char *)&boot_params->ext_ramdisk_image);
>> -		memset(&boot_params->kbd_status, 0,
>> -		       (char *)&boot_params->hdr -
>> -		       (char *)&boot_params->kbd_status);
>> -		memset(&boot_params->_pad7[0], 0,
>> -		       (char *)&boot_params->edd_mbr_sig_buffer[0] -
>> -		       (char *)&boot_params->_pad7[0]);
>> -		memset(&boot_params->_pad8[0], 0,
>> -		       (char *)&boot_params->eddbuf[0] -
>> -		       (char *)&boot_params->_pad8[0]);
>> -		memset(&boot_params->_pad9[0], 0, sizeof(boot_params->_pad9));
> ...
> 
> How about replacing the above first using:
> #define zero_struct_fields(ptr, from, to) memset(&ptr->from, 0, (char *)&ptr->to - (char *)&ptr->from)
>   zero_struct_fields(boot_params, ext_ramdisk_image, efi_info);
>   ...
> Which is absolutely identical to the original code.
> 
> The replacing the define with:
>   #define so(s, m) offsetof(typeof(*s), m)
>   #define zero_struct_fields(ptr, from, to) memset((char *)ptr + so(ptr, 
> from), 0, so(ptr, to) - so(ptr, from))
> which gcc probably doesn't complain about, but should generate identical code 
> again.
> There might be an existing define for so().
> 

Hi David,

There was discussion about that [1], but the preference ended up being to
flip this around, in order to more closely match the original intent
of this function (zero out everything except for certain carefully
selected fields), and to therefore be more likely to keep working if 
fields are added. 


[1] 
https://lore.kernel.org/lkml/alpine.deb.2.21.1907252358240.1...@nanos.tec.linutronix.de/

thanks,
-- 
John Hubbard
NVIDIA


Re: [PATCH 1/3] mm/mlock.c: convert put_page() to put_user_page*()

2019-08-07 Thread John Hubbard
On 8/7/19 4:01 AM, Michal Hocko wrote:
> On Mon 05-08-19 15:20:17, john.hubb...@gmail.com wrote:
>> From: John Hubbard 
>>
>> For pages that were retained via get_user_pages*(), release those pages
>> via the new put_user_page*() routines, instead of via put_page() or
>> release_pages().
> 
> Hmm, this is an interesting code path. There seems to be a mix of pages
> in the game. We get one page via follow_page_mask but then other pages
> in the range are filled by __munlock_pagevec_fill and that does a direct
> pte walk. Is using put_user_page correct in this case? Could you explain
> why in the changelog?
> 

Actually, I think follow_page_mask() gets all the pages, right? And the
get_page() in __munlock_pagevec_fill() is there to allow a pagevec_release() 
later.

But I still think I might have missed an error case, because the pvec_putback
in __munlock_pagevec() is never doing put_user_page() on the put-backed pages.

Let me sort through this one more time and maybe I'll need to actually
change the code. And either way, comments and changelog will need some notes, 
agreed.

thanks,
-- 
John Hubbard
NVIDIA



Re: [PATCH] powerpc: convert put_page() to put_user_page*()

2019-08-07 Thread John Hubbard
On 8/7/19 4:24 PM, kbuild test robot wrote:
> Hi,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on linus/master]
> [cannot apply to v5.3-rc3 next-20190807]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/john-hubbard-gmail-com/powerpc-convert-put_page-to-put_user_page/20190805-132131
> config: powerpc-allmodconfig (attached as .config)
> compiler: powerpc64-linux-gcc (GCC) 7.4.0
> reproduce:
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> GCC_VERSION=7.4.0 make.cross ARCH=powerpc 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kbuild test robot 
> 
> All errors (new ones prefixed by >>):
> 
>arch/powerpc/kvm/book3s_64_mmu_radix.c: In function 
> 'kvmppc_book3s_instantiate_page':
>>> arch/powerpc/kvm/book3s_64_mmu_radix.c:879:4: error: too many arguments to 
>>> function 'put_user_pages_dirty_lock'
>put_user_pages_dirty_lock(, 1, dirty);
>^

Yep, I should have included the prerequisite patch. But this is obsolete now,
please refer to the larger patchset instead:

   https://lore.kernel.org/r/20190807013340.9706-1-jhubb...@nvidia.com

thanks,
-- 
John Hubbard
NVIDIA


Re: [PATCH 1/3] mm/mlock.c: convert put_page() to put_user_page*()

2019-08-08 Thread John Hubbard
On 8/8/19 4:09 AM, Vlastimil Babka wrote:
> On 8/8/19 8:21 AM, Michal Hocko wrote:
>> On Wed 07-08-19 16:32:08, John Hubbard wrote:
>>> On 8/7/19 4:01 AM, Michal Hocko wrote:
>>>> On Mon 05-08-19 15:20:17, john.hubb...@gmail.com wrote:
>>>>> From: John Hubbard 
>>> Actually, I think follow_page_mask() gets all the pages, right? And the
>>> get_page() in __munlock_pagevec_fill() is there to allow a 
>>> pagevec_release() 
>>> later.
>>
>> Maybe I am misreading the code (looking at Linus tree) but 
>> munlock_vma_pages_range
>> calls follow_page for the start address and then if not THP tries to
>> fill up the pagevec with few more pages (up to end), do the shortcut
>> via manual pte walk as an optimization and use generic get_page there.
> 

Yes, I see it finally, thanks. :)  

> That's true. However, I'm not sure munlocking is where the
> put_user_page() machinery is intended to be used anyway? These are
> short-term pins for struct page manipulation, not e.g. dirtying of page
> contents. Reading commit fc1d8e7cca2d I don't think this case falls
> within the reasoning there. Perhaps not all GUP users should be
> converted to the planned separate GUP tracking, and instead we should
> have a GUP/follow_page_mask() variant that keeps using get_page/put_page?
>  

Interesting. So far, the approach has been to get all the gup callers to
release via put_user_page(), but if we add in Jan's and Ira's vaddr_pin_pages()
wrapper, then maybe we could leave some sites unconverted.

However, in order to do so, we would have to change things so that we have
one set of APIs (gup) that do *not* increment a pin count, and another set
(vaddr_pin_pages) that do. 

Is that where we want to go...?

I have a tracking patch that only deals with gup/pup. I could post as an RFC,
but I think it might just muddy the waters at this point, anyway it's this one:


https://github.com/johnhubbard/linux/commit/a0fb73ce0a39c74f0d1fb6bd9d866f660f762eae


thanks,
-- 
John Hubbard
NVIDIA 


Re: [PATCH 1/3] mm/mlock.c: convert put_page() to put_user_page*()

2019-08-08 Thread John Hubbard
On 8/8/19 12:20 PM, John Hubbard wrote:
> On 8/8/19 4:09 AM, Vlastimil Babka wrote:
>> On 8/8/19 8:21 AM, Michal Hocko wrote:
>>> On Wed 07-08-19 16:32:08, John Hubbard wrote:
>>>> On 8/7/19 4:01 AM, Michal Hocko wrote:
>>>>> On Mon 05-08-19 15:20:17, john.hubb...@gmail.com wrote:
>>>>>> From: John Hubbard 
>>>> Actually, I think follow_page_mask() gets all the pages, right? And the
>>>> get_page() in __munlock_pagevec_fill() is there to allow a 
>>>> pagevec_release() 
>>>> later.
>>>
>>> Maybe I am misreading the code (looking at Linus tree) but 
>>> munlock_vma_pages_range
>>> calls follow_page for the start address and then if not THP tries to
>>> fill up the pagevec with few more pages (up to end), do the shortcut
>>> via manual pte walk as an optimization and use generic get_page there.
>>
> 
> Yes, I see it finally, thanks. :)  
> 
>> That's true. However, I'm not sure munlocking is where the
>> put_user_page() machinery is intended to be used anyway? These are
>> short-term pins for struct page manipulation, not e.g. dirtying of page
>> contents. Reading commit fc1d8e7cca2d I don't think this case falls
>> within the reasoning there. Perhaps not all GUP users should be
>> converted to the planned separate GUP tracking, and instead we should
>> have a GUP/follow_page_mask() variant that keeps using get_page/put_page?
>>  
> 
> Interesting. So far, the approach has been to get all the gup callers to
> release via put_user_page(), but if we add in Jan's and Ira's 
> vaddr_pin_pages()
> wrapper, then maybe we could leave some sites unconverted.
> 
> However, in order to do so, we would have to change things so that we have
> one set of APIs (gup) that do *not* increment a pin count, and another set
> (vaddr_pin_pages) that do. 
> 
> Is that where we want to go...?
> 

Oh, and meanwhile, I'm leaning toward a cheap fix: just use gup_fast() instead
of get_page(), and also fix the releasing code. So this incremental patch, on
top of the existing one, should do it:

diff --git a/mm/mlock.c b/mm/mlock.c
index b980e6270e8a..2ea272c6fee3 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -318,18 +318,14 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
/*
 * We won't be munlocking this page in the next phase
 * but we still need to release the follow_page_mask()
-* pin. We cannot do it under lru_lock however. If it's
-* the last pin, __page_cache_release() would deadlock.
+* pin.
 */
-			pagevec_add(&pvec_putback, pvec->pages[i]);
+   put_user_page(pages[i]);
pvec->pages[i] = NULL;
}
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
 	spin_unlock_irq(&zone->zone_pgdat->lru_lock);
 
-   /* Now we can release pins of pages that we are not munlocking */
-	pagevec_release(&pvec_putback);
-
/* Phase 2: page munlock */
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -394,6 +390,8 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
start += PAGE_SIZE;
while (start < end) {
struct page *page = NULL;
+   int ret;
+
pte++;
if (pte_present(*pte))
page = vm_normal_page(vma, start, *pte);
@@ -411,7 +409,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
if (PageTransCompound(page))
break;
 
-   get_page(page);
+   /*
+* Use get_user_pages_fast(), instead of get_page() so that the
+* releasing code can unconditionally call put_user_page().
+*/
+	ret = get_user_pages_fast(start, 1, 0, &page);
+   if (ret != 1)
+   break;
/*
 * Increase the address that will be returned *before* the
 * eventual break due to pvec becoming full by adding the page


thanks,
-- 
John Hubbard
NVIDIA


Re: [Linux-kernel-mentees][PATCH v4 1/1] sgi-gru: Remove *pte_lookup functions

2019-08-08 Thread John Hubbard
On 8/8/19 11:55 AM, Bharath Vedartham wrote:
...
>  static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
>   int write, int atomic, unsigned long *gpa, int *pageshift)
>  {
>   struct mm_struct *mm = gts->ts_mm;
>   struct vm_area_struct *vma;
>   unsigned long paddr;
> - int ret, ps;
> + int ret;
> + struct page *page;
>  
>   vma = find_vma(mm, vaddr);
>   if (!vma)
> @@ -263,21 +187,33 @@ static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
>  
>   /*
>* Atomic lookup is faster & usually works even if called in non-atomic
> -  * context.
> +	 * context. get_user_pages_fast does atomic lookup before falling back to
> +	 * slow gup.
>*/
>   rmb();  /* Must/check ms_range_active before loading PTEs */
> -	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
> - if (ret) {
> - if (atomic)
> + if (atomic) {
> +		ret = __get_user_pages_fast(vaddr, 1, write, &page);
> + if (!ret)
>   goto upm;
> -		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps))
> + } else {
> +		ret = get_user_pages_fast(vaddr, 1, write ? FOLL_WRITE : 0, &page);
> + if (!ret)
>   goto inval;
>   }
> +
> + paddr = page_to_phys(page);
> + put_user_page(page);
> +
> + if (unlikely(is_vm_hugetlb_page(vma)))
> + *pageshift = HPAGE_SHIFT;
> + else
> + *pageshift = PAGE_SHIFT;
> +
>   if (is_gru_paddr(paddr))
>   goto inval;
> - paddr = paddr & ~((1UL << ps) - 1);
> + paddr = paddr & ~((1UL << *pageshift) - 1);
>   *gpa = uv_soc_phys_ram_to_gpa(paddr);
> - *pageshift = ps;

Why are you no longer setting *pageshift? There are a couple of callers
that both use this variable.


thanks,
-- 
John Hubbard
NVIDIA


Re: [Linux-kernel-mentees][PATCH v4 1/1] sgi-gru: Remove *pte_lookup functions

2019-08-08 Thread John Hubbard
On 8/8/19 4:21 PM, John Hubbard wrote:
> On 8/8/19 11:55 AM, Bharath Vedartham wrote:
> ...
>>  if (is_gru_paddr(paddr))
>>  goto inval;
>> -paddr = paddr & ~((1UL << ps) - 1);
>> +paddr = paddr & ~((1UL << *pageshift) - 1);
>>  *gpa = uv_soc_phys_ram_to_gpa(paddr);
>> -*pageshift = ps;
> 
> Why are you no longer setting *pageshift? There are a couple of callers
> that both use this variable.
> 
> 

...and once that's figured out, I can fix it up here and send it up with 
the next misc callsites series. I'm also inclined to make the commit
log read more like this:

sgi-gru: Remove *pte_lookup functions, convert to put_user_page*()

For pages that were retained via get_user_pages*(), release those pages
via the new put_user_page*() routines, instead of via put_page() or
release_pages().

This is part of a tree-wide conversion, as described in commit fc1d8e7cca2d
("mm: introduce put_user_page*(), placeholder versions").

As part of this conversion, the *pte_lookup functions can be removed and
easily replaced with get_user_pages_fast() calls. In the case of
atomic lookup, __get_user_pages_fast() is used, because it does not fall
back to the slow path: get_user_pages(). get_user_pages_fast(), on the other
hand, first calls __get_user_pages_fast(), but then falls back to the
slow path if __get_user_pages_fast() fails.

Also: remove unnecessary CONFIG_HUGETLB ifdefs.
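
(For reference, the fast/slow split relied on above is roughly the
following; a simplified sketch of mm/gup.c at the time, not a verbatim
copy:)

	int get_user_pages_fast(unsigned long start, int nr_pages,
				unsigned int gup_flags, struct page **pages)
	{
		/* irq-disabled walk of the page tables; never sleeps */
		int ret, nr = __get_user_pages_fast(start, nr_pages,
						    gup_flags & FOLL_WRITE,
						    pages);

		if (nr == nr_pages)
			return nr;

		/* slow path: may take mmap_sem and fault pages in */
		ret = get_user_pages_unlocked(start + ((unsigned long)nr << PAGE_SHIFT),
					      nr_pages - nr, pages + nr,
					      gup_flags);
		return ret < 0 ? (nr ? nr : ret) : nr + ret;
	}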


thanks,
-- 
John Hubbard
NVIDIA


Re: [PATCH 1/3] mm/mlock.c: convert put_page() to put_user_page*()

2019-08-08 Thread John Hubbard
On 8/8/19 4:41 PM, Ira Weiny wrote:
> On Thu, Aug 08, 2019 at 03:59:15PM -0700, John Hubbard wrote:
>> On 8/8/19 12:20 PM, John Hubbard wrote:
>>> On 8/8/19 4:09 AM, Vlastimil Babka wrote:
>>>> On 8/8/19 8:21 AM, Michal Hocko wrote:
>>>>> On Wed 07-08-19 16:32:08, John Hubbard wrote:
>>>>>> On 8/7/19 4:01 AM, Michal Hocko wrote:
>>>>>>> On Mon 05-08-19 15:20:17, john.hubb...@gmail.com wrote:
...
>> Oh, and meanwhile, I'm leaning toward a cheap fix: just use gup_fast() 
>> instead
>> of get_page(), and also fix the releasing code. So this incremental patch, on
>> top of the existing one, should do it:
>>
>> diff --git a/mm/mlock.c b/mm/mlock.c
>> index b980e6270e8a..2ea272c6fee3 100644
>> --- a/mm/mlock.c
>> +++ b/mm/mlock.c
>> @@ -318,18 +318,14 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
>> /*
>>  * We won't be munlocking this page in the next phase
>>  * but we still need to release the follow_page_mask()
>> -* pin. We cannot do it under lru_lock however. If it's
>> -* the last pin, __page_cache_release() would deadlock.
>> +* pin.
>>  */
>> -			pagevec_add(&pvec_putback, pvec->pages[i]);
>> +   put_user_page(pages[i]);

correction, make that:   
   put_user_page(pvec->pages[i]);

(This is not fully tested yet.)

>> pvec->pages[i] = NULL;
>> }
>> __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
>> 	spin_unlock_irq(&zone->zone_pgdat->lru_lock);
>>  
>> -   /* Now we can release pins of pages that we are not munlocking */
>> -	pagevec_release(&pvec_putback);
>> -
> 
> I'm not an expert but this skips a call to lru_add_drain().  Is that ok?

Yes: unless I'm missing something, there is no reason to go through 
lru_add_drain
in this case. These are gup'd pages that are not going to get any further
processing.

> 
>> /* Phase 2: page munlock */
>> for (i = 0; i < nr; i++) {
>> struct page *page = pvec->pages[i];
>> @@ -394,6 +390,8 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
>> start += PAGE_SIZE;
>> while (start < end) {
>> struct page *page = NULL;
>> +   int ret;
>> +
>> pte++;
>> if (pte_present(*pte))
>> page = vm_normal_page(vma, start, *pte);
>> @@ -411,7 +409,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
>> if (PageTransCompound(page))
>> break;
>>  
>> -   get_page(page);
>> +   /*
>> +	 * Use get_user_pages_fast(), instead of get_page() so that the
>> +* releasing code can unconditionally call put_user_page().
>> +*/
>> +	ret = get_user_pages_fast(start, 1, 0, &page);
>> +   if (ret != 1)
>> +   break;
> 
> I like the idea of making this a get/put pair but I'm feeling uneasy about how
> this is really supposed to work.
> 
> For sure the GUP/PUP was supposed to be separate from [get|put]_page.
> 

Actually, they both take references on the page. And it is absolutely OK to call
them both on the same page.

But anyway, we're not mixing them up here. If you follow the code paths, either 
gup or follow_page_mask() is used, and then put_user_page() releases. 
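
That is, the acquire/release pairing in this patch reduces to (sketch,
illustrative helper only):

	static int example_pin_and_release(unsigned long start)
	{
		struct page *page;

		if (get_user_pages_fast(start, 1, 0, &page) != 1)
			return -EFAULT;
		/* ... operate on the page ... */
		put_user_page(page);	/* pairs with the gup reference */
		return 0;
	}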

So...you haven't actually pointed to a bug here, right? :)


thanks,
-- 
John Hubbard
NVIDIA


Re: [RFC PATCH 2/2] mm/gup: introduce vaddr_pin_pages_remote()

2019-08-15 Thread John Hubbard
On 8/15/19 10:32 AM, Ira Weiny wrote:
> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
>>>>
>>>> Hold on, I *was* forgetting something: this was a two part thing, and
>>>> you're conflating the two points, but they need to remain separate and
>>>> distinct. There were:
>>>>
>>>> 1. FOLL_PIN is necessary because the caller is clearly in the use case that
>>>> requires it--however briefly they might be there. As Jan described it,
>>>>
>>>> "Anything that gets page reference and then touches page data (e.g.
>>>> direct IO) needs the new kind of tracking so that filesystem knows
>>>> someone is messing with the page data." [1]
>>>
>>> So when the GUP user uses MMU notifiers to stop writing to pages whenever
>>> they are writeprotected with page_mkclean(), they don't really need page
>>> pin - their access is then fully equivalent to any other mmap userspace
>>> access and filesystem knows how to deal with those. I forgot out this case
>>> when I wrote the above sentence.
>>>
>>> So to sum up there are three cases:
>>> 1) DIO case - GUP references to pages serving as DIO buffers are needed for
>>>relatively short time, no special synchronization with page_mkclean() or
>>>munmap() => needs FOLL_PIN
>>> 2) RDMA case - GUP references to pages serving as DMA buffers needed for a
>>>long time, no special synchronization with page_mkclean() or munmap()
>>>=> needs FOLL_PIN | FOLL_LONGTERM
>>>This case has also a special case when the pages are actually DAX. Then
>>>the caller additionally needs file lease and additional file_pin
>>>structure is used for tracking this usage.
>>> 3) ODP case - GUP references to pages serving as DMA buffers, MMU notifiers
>>>used to synchronize with page_mkclean() and munmap() => normal page
>>>references are fine.
>>

Thanks Jan, once again, for clarifying all of this!

>> I want to add that I'd like to convert users in cases 1) and 2) from using
>> GUP to using differently named function. Users in case 3) can stay as they
>> are for now although ultimately I'd like to denote such use cases in a
>> special way as well...
>>
> 
> Ok just to make this clear I threw up my current tree with your patches here:
> 
> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4
> 
> I'm talking about dropping the final patch:
> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP
> 
> The other 2 can stay.  I split out the *_remote() call.  We don't have a user
> but I'll keep it around for a bit.
> 
> This tree is still WIP as I work through all the comments.  So I've not 
> changed
> names or variable types etc...  Just wanted to settle this.
> 

Right. And now that ODP is not a user, I'll take a quick look through my other
call site conversions and see if I can find an easy one, to include here as
the first user of vaddr_pin_pages_remote(). I'll send it your way if that
works out.
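
To recap Jan's three cases in flag terms (my sketch; names are not final):

	/* sketch: flag selection per case, per Jan's summary above */
	static unsigned int gup_flags_for_case(int gup_case)
	{
		switch (gup_case) {
		case 1:		/* DIO: short-lived pin, touches page data */
			return FOLL_PIN;
		case 2:		/* RDMA: long-lived pin, touches page data */
			return FOLL_PIN | FOLL_LONGTERM;
		case 3:		/* ODP: MMU notifiers synchronize instead */
		default:
			return 0;	/* plain page references are fine */
		}
	}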


thanks,
-- 
John Hubbard
NVIDIA


Re: [RFC PATCH 2/2] mm/gup: introduce vaddr_pin_pages_remote()

2019-08-15 Thread John Hubbard
On 8/15/19 10:41 AM, John Hubbard wrote:
> On 8/15/19 10:32 AM, Ira Weiny wrote:
>> On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:
>>> On Thu 15-08-19 15:26:22, Jan Kara wrote:
>>>> On Wed 14-08-19 20:01:07, John Hubbard wrote:
>>>>> On 8/14/19 5:02 PM, John Hubbard wrote:
...
>> Ok just to make this clear I threw up my current tree with your patches here:
>>
>> https://github.com/weiny2/linux-kernel/commits/mmotm-rdmafsdax-b0-v4
>>
>> I'm talking about dropping the final patch:
>> 05fd2d3afa6b rdma/umem_odp: Use vaddr_pin_pages_remote() in ODP
>>
>> The other 2 can stay.  I split out the *_remote() call.  We don't have a user
>> but I'll keep it around for a bit.
>>
>> This tree is still WIP as I work through all the comments.  So I've not 
>> changed
>> names or variable types etc...  Just wanted to settle this.
>>
> 
> Right. And now that ODP is not a user, I'll take a quick look through my other
> call site conversions and see if I can find an easy one, to include here as
> the first user of vaddr_pin_pages_remote(). I'll send it your way if that
> works out.
> 

OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
patch, maybe eventually [1].  But looking at process_vm_access.c, I think 
it is one of the patches that is no longer applicable, and I can just
drop it entirely...I'd welcome a second opinion on that...

So we might be all out of potential users for vaddr_pin_pages_remote()!

For quick reference, it looks like this:
 
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 357aa7bef6c0..4d29d54ec93f 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -96,7 +96,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
flags |= FOLL_WRITE;
 
while (!rc && nr_pages && iov_iter_count(iter)) {
-   int pages = min(nr_pages, max_pages_per_loop);
+   int pinned_pages = min(nr_pages, max_pages_per_loop);
int locked = 1;
size_t bytes;
 
@@ -106,14 +106,15 @@ static int process_vm_rw_single_vec(unsigned long addr,
 * current/current->mm
 */
 		down_read(&mm->mmap_sem);
-		pages = get_user_pages_remote(task, mm, pa, pages, flags,
-					      process_pages, NULL, &locked);
+		pinned_pages = get_user_pages_remote(task, mm, pa, pinned_pages,
+						     flags, process_pages, NULL,
+						     &locked);
 		if (locked)
 			up_read(&mm->mmap_sem);
-   if (pages <= 0)
+   if (pinned_pages <= 0)
return -EFAULT;
 
-   bytes = pages * PAGE_SIZE - start_offset;
+   bytes = pinned_pages * PAGE_SIZE - start_offset;
if (bytes > len)
bytes = len;
 
@@ -122,10 +123,9 @@ static int process_vm_rw_single_vec(unsigned long addr,
 vm_write);
len -= bytes;
start_offset = 0;
-   nr_pages -= pages;
-   pa += pages * PAGE_SIZE;
-   while (pages)
-   put_page(process_pages[--pages]);
+   nr_pages -= pinned_pages;
+   pa += pinned_pages * PAGE_SIZE;
+   put_user_pages(process_pages, pinned_pages);
}
 
return rc;


[1] 
https://lore.kernel.org/r/1565379497-29266-2-git-send-email-linux.b...@gmail.com

thanks,
-- 
John Hubbard
NVIDIA


Re: [RFC PATCH 2/2] mm/gup: introduce vaddr_pin_pages_remote()

2019-08-16 Thread John Hubbard

On 8/16/19 11:33 AM, Ira Weiny wrote:

On Fri, Aug 16, 2019 at 05:41:08PM +0200, Jan Kara wrote:

On Thu 15-08-19 19:14:08, John Hubbard wrote:

On 8/15/19 10:41 AM, John Hubbard wrote:

On 8/15/19 10:32 AM, Ira Weiny wrote:

On Thu, Aug 15, 2019 at 03:35:10PM +0200, Jan Kara wrote:

On Thu 15-08-19 15:26:22, Jan Kara wrote:

On Wed 14-08-19 20:01:07, John Hubbard wrote:

On 8/14/19 5:02 PM, John Hubbard wrote:

...

OK, there was only process_vm_access.c, plus (sort of) Bharath's sgi-gru
patch, maybe eventually [1].  But looking at process_vm_access.c, I think
it is one of the patches that is no longer applicable, and I can just
drop it entirely...I'd welcome a second opinion on that...


I don't think you can drop the patch. process_vm_rw_pages() clearly touches
page contents and does not synchronize with page_mkclean(). So it is case
1) and needs FOLL_PIN semantics.


John could you send a formal patch using vaddr_pin* and I'll add it to the
tree?



Yes...hints about which struct file to use here are very welcome, btw. This part
of mm is fairly new to me.

thanks,
--
John Hubbard
NVIDIA


Re: [RFC PATCH 2/2] mm/gup: introduce vaddr_pin_pages_remote()

2019-08-16 Thread John Hubbard
On 8/16/19 2:59 PM, Ira Weiny wrote:
> On Fri, Aug 16, 2019 at 11:50:09AM -0700, John Hubbard wrote:
...
>>> John could you send a formal patch using vaddr_pin* and I'll add it to the
>>> tree?
>>>
>>
>> Yes...hints about which struct file to use here are very welcome, btw. This 
>> part
>> of mm is fairly new to me.
> 
> I'm still working out the final semantics of vaddr_pin*.  But right now you
> don't need a vaddr_pin if you don't specify FOLL_LONGTERM.
> 

ah OK.

> Since case 1, this case, does not need FOLL_LONGTERM I think it is safe to
> simply pass NULL here.
> 
> OTOH we could just track this against the mm_struct.  But I don't think we 
> need
> to because this pin should be transient.
> 

Thanks for looking at that, I'm definitely in learning mode here.

> And this is why I keep leaning toward _not_ putting these flags in the
> vaddr_pin*() calls.  I know this is what I did but I think I'm wrong.  It 
> should
> be the caller specifying what they want and the vaddr_pin*() calls check that
> what they are asking for is correct.
> 

Yes. I think we're nearly done finding the right balance of wrapper calls and
FOLL_* flags. I've seen Jan and others asking that the call sites do *not*
set the flags, but we also know that FOLL_PIN and FOLL_LONGTERM need to vary
independently.

That means either:

a) another set of trivial wrapper calls, on top of vaddr_pin_*(), for each
supported combination of FOLL_PIN and FOLL_LONGTERM, or

b) just setting FOLL_PIN and FOLL_LONGTERM at each callsite.

I think either way is easy to grep for, so it's hard to get too excited
(fortunately) about which one to pick. Let's start simple with (b) and it's 
easy to convert later if someone wants that.
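
Concretely, with (b) the call sites would look something like this (sketch
against the current vaddr_pin_pages() signature in your tree; per the
earlier discussion, vaddr_pin is only needed for the FOLL_LONGTERM case):

	/* case 1 style (transient): no FOLL_LONGTERM, no vaddr_pin */
	pinned = vaddr_pin_pages(addr, nr, FOLL_WRITE, pages, NULL);

	/* case 2 style (long-lived): the caller adds FOLL_LONGTERM */
	pinned = vaddr_pin_pages(addr, nr, FOLL_WRITE | FOLL_LONGTERM,
				 pages, &vaddr_pin);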

Meanwhile, we do need to pull the flag setting out of vaddr_pin_pages().

So I will post these small patches for your mmotm-rdmafsdax-b0-v4 branch,
shortly:

1) Add FOLL_PIN 

   --also I guess it's time to add comments documenting FOLL_PIN and
FOLL_LONGTERM use, stealing Jan's and others' wording for the 4 cases,
from earlier. :)

2) Add vaddr_pin_user_pages_remote(), which will not set FOLL_PIN or 
FOLL_LONGTERM
itself. And add the caller, which will.

thanks,
-- 
John Hubbard
NVIDIA


Re: [RFC PATCH v2 2/3] mm/gup: introduce FOLL_PIN flag for get_user_pages()

2019-08-16 Thread John Hubbard
On 8/16/19 7:24 PM, jhubb...@nvidia.com wrote:
> From: John Hubbard 
> DKIM-Signature: v a a-sha256; claxed/relaxed; d idia.com; s;
>   t66008674; bhMai0va6k/z2enpQJ4Nfvbj5WByFxGAO1JwdIBbXioh 
> PGP-Universal:From:To:CC:Subject:Date:Message-ID:X-Mailer:
>In-Reply-To:References:MIME-Version:X-NVConfidentiality:
>Content-Transfer-Encoding:Content-Type;
>   bÖUDSde9XF/IsNteBaYOBWeKiHhWmeU9ekUJNvCviHssBDCtw0T+M/2TlEPEzomIT
>fGXzIQNlGN6MXFbaBoyBmF/zjCu02TmTNExbVJ3/5N6PTyOuJFCx9ZN1/5gXsB11m1
>xAHIWE+VOZs4qqDeHDBqKZq+FaxQHNvGz0j6lyVBA70TfseNoZqZZrSil8uvaKJwKd
>TQ1ht+AGWbw9p610JmaPb4u6o/eV6Ns8Sl3EVnjWWu94T6ISNIaWCiC6wQQF6L1YCH
>G5Pjn+0rEjhk6XG4TyLudi5lWp3IVBHd8+WlWlnl+bvLCC55RUAjPJLn7LaVyVdh0F
>nLHwm3bN2Jotg

I cannot readily explain the above email glitch, but I did just now switch
back to mailgw.nvidia.com for this patchset, in order to get the nice behavior
of having "From:" really be my native NVIDIA email address. That's very nice,
but if the glitches happen again, I'll switch back to using gmail for 
git-send-email.

Sorry about the weirdness. It does still let you apply the patch, I
just now checked on that.

thanks,
-- 
John Hubbard
NVIDIA



Re: [PATCH 3/3] sgi-gru: Use __get_user_pages_fast in atomic_pte_lookup

2019-07-22 Thread John Hubbard
On 7/22/19 10:53 AM, Bharath Vedartham wrote:
> On Sun, Jul 21, 2019 at 07:32:36PM -0700, John Hubbard wrote:
>> On 7/21/19 8:58 AM, Bharath Vedartham wrote:
...

>> Also, optional: as long as you're there, atomic_pte_lookup() ought to
>> either return a bool (true == success) or an errno, rather than a
>> numeric zero or one.
> That makes sense. But the code which uses atomic_pte_lookup uses the
> return value of 1 for success and failure value of 0 in gru_vtop. That's
> why I did not mess with the return values in this code. It would require
> some change in the driver functionality which I am not ready to do :(

It's a static function with only one caller. You could just merge in
something like this, on top of what you have:

diff --git a/drivers/misc/sgi-gru/grufault.c b/drivers/misc/sgi-gru/grufault.c
index 121c9a4ccb94..2f768fc06432 100644
--- a/drivers/misc/sgi-gru/grufault.c
+++ b/drivers/misc/sgi-gru/grufault.c
@@ -189,10 +189,11 @@ static int non_atomic_pte_lookup(struct vm_area_struct *vma,
return 0;
 }
 
-/*
- * atomic_pte_lookup
+/**
+ * atomic_pte_lookup() - Convert a user virtual address to a physical address
+ * @Return: true for success, false for failure. Failure means that the page
+ * could not be pinned via gup fast.
  *
- * Convert a user virtual address to a physical address
  * Only supports Intel large pages (2MB only) on x86_64.
  * ZZZ - hugepage support is incomplete
  *
@@ -207,12 +208,12 @@ static int atomic_pte_lookup(struct vm_area_struct *vma, unsigned long vaddr,
*pageshift = is_vm_hugetlb_page(vma) ? HPAGE_SHIFT : PAGE_SHIFT;
 
 	if (!__get_user_pages_fast(vaddr, 1, write, &page))
-   return 1;
+   return false;
 
*paddr = page_to_phys(page);
put_user_page(page);
 
-   return 0;
+   return true;
 }
 
 static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
@@ -221,7 +222,8 @@ static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
struct mm_struct *mm = gts->ts_mm;
struct vm_area_struct *vma;
unsigned long paddr;
-   int ret, ps;
+   int ps;
+   bool success;
 
vma = find_vma(mm, vaddr);
if (!vma)
@@ -232,8 +234,8 @@ static int gru_vtop(struct gru_thread_state *gts, unsigned long vaddr,
 * context.
 */
rmb();  /* Must/check ms_range_active before loading PTEs */
-	ret = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
-   if (ret) {
+	success = atomic_pte_lookup(vma, vaddr, write, &paddr, &ps);
+   if (!success) {
if (atomic)
goto upm;
 		if (non_atomic_pte_lookup(vma, vaddr, write, &paddr, &ps))


thanks,
-- 
John Hubbard
NVIDIA


[PATCH 01/12] mm/gup: add make_dirty arg to put_user_pages_dirty_lock()

2019-07-23 Thread john . hubbard
From: John Hubbard 

Provide more capable variation of put_user_pages_dirty_lock(),
and delete put_user_pages_dirty(). This is based on the
following:

1. Lots of call sites become simpler if a bool is passed
into put_user_page*(), instead of making the call site
choose which put_user_page*() variant to call.

2. Christoph Hellwig's observation that set_page_dirty_lock()
is usually correct, and set_page_dirty() is usually a
bug, or at least questionable, within a put_user_page*()
calling chain.

This leads to the following API choices:

* put_user_pages_dirty_lock(page, npages, make_dirty)

* There is no put_user_pages_dirty(). You have to
  hand code that, in the rare case that it's
  required.
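
(In the rare case that the non-locking dirty behavior really is needed,
the hand-coded version is roughly this sketch:)

	/* sketch: hand-coded equivalent of the removed put_user_pages_dirty() */
	static void my_put_user_pages_dirty(struct page **pages,
					    unsigned long npages)
	{
		unsigned long i;

		for (i = 0; i < npages; i++) {
			set_page_dirty(pages[i]);	/* not _lock: see point 2 */
			put_user_page(pages[i]);
		}
	}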

Cc: Matthew Wilcox 
Cc: Jan Kara 
Cc: Christoph Hellwig 
Cc: Ira Weiny 
Cc: Jason Gunthorpe 
Signed-off-by: John Hubbard 
---
 drivers/infiniband/core/umem.c |   5 +-
 drivers/infiniband/hw/hfi1/user_pages.c|   5 +-
 drivers/infiniband/hw/qib/qib_user_pages.c |   5 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c   |   5 +-
 drivers/infiniband/sw/siw/siw_mem.c|   8 +-
 include/linux/mm.h |   5 +-
 mm/gup.c   | 115 +
 7 files changed, 58 insertions(+), 90 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 08da840ed7ee..965cf9dea71a 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -54,10 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 
 	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
 		page = sg_page_iter_page(&sg_iter);
-		if (umem->writable && dirty)
-			put_user_pages_dirty_lock(&page, 1);
-		else
-			put_user_page(page);
+		put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
 	}
 
 	sg_free_table(&umem->sg_head);
diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c
index b89a9b9aef7a..469acb961fbd 100644
--- a/drivers/infiniband/hw/hfi1/user_pages.c
+++ b/drivers/infiniband/hw/hfi1/user_pages.c
@@ -118,10 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t npages,
 void hfi1_release_user_pages(struct mm_struct *mm, struct page **p,
 size_t npages, bool dirty)
 {
-   if (dirty)
-   put_user_pages_dirty_lock(p, npages);
-   else
-   put_user_pages(p, npages);
+   put_user_pages_dirty_lock(p, npages, dirty);
 
if (mm) { /* during close after signal, mm can be NULL */
 		atomic64_sub(npages, &mm->pinned_vm);
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
index bfbfbb7e0ff4..6bf764e41891 100644
--- a/drivers/infiniband/hw/qib/qib_user_pages.c
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -40,10 +40,7 @@
 static void __qib_release_user_pages(struct page **p, size_t num_pages,
 int dirty)
 {
-   if (dirty)
-   put_user_pages_dirty_lock(p, num_pages);
-   else
-   put_user_pages(p, num_pages);
+   put_user_pages_dirty_lock(p, num_pages, dirty);
 }
 
 /**
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
index 0b0237d41613..62e6ffa9ad78 100644
--- a/drivers/infiniband/hw/usnic/usnic_uiom.c
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -75,10 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
for_each_sg(chunk->page_list, sg, chunk->nents, i) {
page = sg_page(sg);
pa = sg_phys(sg);
-   if (dirty)
-				put_user_pages_dirty_lock(&page, 1);
-			else
-				put_user_page(page);
+			put_user_pages_dirty_lock(&page, 1, dirty);
 			usnic_dbg("pa: %pa\n", &pa);
}
kfree(chunk);
diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c
index 67171c82b0c4..358d440efa11 100644
--- a/drivers/infiniband/sw/siw/siw_mem.c
+++ b/drivers/infiniband/sw/siw/siw_mem.c
@@ -65,13 +65,7 @@ static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages,
 {
struct page **p = chunk->plist;
 
-   while (num_pages--) {
-   if (!PageDirty(*p) && dirty)
-   put_user_pages_dirty_lock(p, 1);
-   else
-   put_user_page(*p);
-   p++;
-   }
+   put_user_pages_dirty_lock(chunk->plist, num_pages, dirty);
 }
 
 void siw_umem_release(struct siw_umem *umem, bool dirty)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97

Re: [linux-next PATCH v4] drivers/virt/fsl_hypervisor: Fix error handling path

2020-09-04 Thread John Hubbard

On 9/4/20 6:16 PM, Souptick Joarder wrote:

Hi Andrew,

On Wed, Sep 2, 2020 at 3:00 AM John Hubbard  wrote:


On 9/1/20 2:21 PM, Souptick Joarder wrote:

First, when memory allocation for sg_list_unaligned failed, there
was a bug: put_pages() was called even though no pages had been pinned.

Second, if get_user_pages_fast() failed we should unpin num_pinned
pages.

This will address both.

As part of these changes, minor update in documentation.

Fixes: 6db7199407ca ("drivers/virt: introduce Freescale hypervisor management driver")
Signed-off-by: Souptick Joarder 
Reviewed-by: Dan Carpenter 
Reviewed-by: John Hubbard 
---


This looks good to me.


Can you please take this patch through the mm tree ?



Is there a problem with sending it through its normal tree? It would probably
get better testing coverage there.


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 5/5] fs/ceph: use pipe_get_pages_alloc() for pipe

2020-08-24 Thread John Hubbard

On 8/24/20 3:53 AM, Jeff Layton wrote:


This looks fine to me. Let me know if you need this merged via the ceph
tree. Thanks!

Acked-by: Jeff Layton 



Yes, please! It will get proper testing that way, and it doesn't have
any prerequisites, despite being part of this series. So it would be
great to go in via the ceph tree.

For the main series here, I'll send a v2 with only patches 1-3, once
enough feedback has happened.

thanks,
--
John Hubbard
NVIDIA


[PATCH v2] tee: convert get_user_pages() --> pin_user_pages()

2020-08-24 Thread John Hubbard
This code was using get_user_pages*(), in a "Case 2" scenario
(DMA/RDMA), using the categorization from [1]. That means that it's
time to convert the get_user_pages*() + put_page() calls to
pin_user_pages*() + unpin_user_pages() calls.

There is some helpful background in [2]: basically, this is a small
part of fixing a long-standing disconnect between pinning pages, and
file systems' use of those pages.

[1] Documentation/core-api/pin_user_pages.rst

[2] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/
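
To illustrate the shape of the conversion (a sketch only; the real
changes are in the diff below):

    /* Before: acquire via get_user_pages_fast(), release via put_page() */
    rc = get_user_pages_fast(start, num_pages, FOLL_WRITE, pages);
    ...
    for (n = 0; n < num_pages; n++)
            put_page(pages[n]);

    /* After: acquire via pin_user_pages_fast(); unpin_user_pages()
     * replaces the put_page() loop entirely.
     */
    rc = pin_user_pages_fast(start, num_pages, FOLL_WRITE, pages);
    ...
    unpin_user_pages(pages, num_pages);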

Cc: Jens Wiklander 
Cc: Sumit Semwal 
Cc: tee-...@lists.linaro.org
Cc: linux-me...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linaro-mm-...@lists.linaro.org
Signed-off-by: John Hubbard 
---

OK, this should be identical to v1 [1], but now rebased against
Linux 5.9-rc2.

As before, I've compile-tested it again with a cross compiler, but that's
the only testing I'm set up for with CONFIG_TEE.

[1] https://lore.kernel.org/r/20200519051850.2845561-1-jhubb...@nvidia.com

thanks,
John Hubbard
NVIDIA

 drivers/tee/tee_shm.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 827ac3d0fea9..3c29e6c3ebe8 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -32,16 +32,13 @@ static void tee_shm_release(struct tee_shm *shm)
 
poolm->ops->free(poolm, shm);
} else if (shm->flags & TEE_SHM_REGISTER) {
-   size_t n;
int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm);
 
if (rc)
dev_err(teedev->dev.parent,
"unregister shm %p failed: %d", shm, rc);
 
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
-
+   unpin_user_pages(shm->pages, shm->num_pages);
kfree(shm->pages);
}
 
@@ -228,7 +225,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, 
unsigned long addr,
}
 
if (flags & TEE_SHM_USER_MAPPED) {
-   rc = get_user_pages_fast(start, num_pages, FOLL_WRITE,
+   rc = pin_user_pages_fast(start, num_pages, FOLL_WRITE,
 shm->pages);
} else {
struct kvec *kiov;
@@ -292,16 +289,13 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, 
unsigned long addr,
return shm;
 err:
if (shm) {
-   size_t n;
-
if (shm->id >= 0) {
mutex_lock(&teedev->mutex);
idr_remove(&teedev->idr, shm->id);
mutex_unlock(&teedev->mutex);
}
if (shm->pages) {
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
+   unpin_user_pages(shm->pages, shm->num_pages);
kfree(shm->pages);
}
}
-- 
2.28.0



Re: [PATCH v2] tee: convert convert get_user_pages() --> pin_user_pages()

2020-08-24 Thread John Hubbard

On 8/24/20 11:36 AM, John Hubbard wrote:

This code was using get_user_pages*(), in a "Case 2" scenario
(DMA/RDMA), using the categorization from [1]. That means that it's
time to convert the get_user_pages*() + put_page() calls to
pin_user_pages*() + unpin_user_pages() calls.

There is some helpful background in [2]: basically, this is a small
part of fixing a long-standing disconnect between pinning pages, and
file systems' use of those pages.

[1] Documentation/core-api/pin_user_pages.rst

[2] "Explicit pinning of user-space pages":
 https://lwn.net/Articles/807108/

Cc: Jens Wiklander 
Cc: Sumit Semwal 
Cc: tee-...@lists.linaro.org
Cc: linux-me...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linaro-mm-...@lists.linaro.org
Signed-off-by: John Hubbard 
---

OK, this should be identical to v1 [1], but now rebased against
Linux 5.9-rc2.



...ohhh, wait, I should have read the earlier message from Jens more
carefully:

"The conflict isn't trivial, I guess we need to handle the different
types of pages differently when releasing them."

So it's not good to have a logically identical patch. argghhh. Let me see
how hard it is to track these memory types separately and handle the release
accordingly, just a sec.

Sorry about the false move here.

thanks,
--
John Hubbard
NVIDIA


As before, I've compile-tested it again with a cross compiler, but that's
the only testing I'm set up for with CONFIG_TEE.

[1] https://lore.kernel.org/r/20200519051850.2845561-1-jhubb...@nvidia.com

thanks,
John Hubbard
NVIDIA

  drivers/tee/tee_shm.c | 12 +++-
  1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 827ac3d0fea9..3c29e6c3ebe8 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -32,16 +32,13 @@ static void tee_shm_release(struct tee_shm *shm)
  
  		poolm->ops->free(poolm, shm);

} else if (shm->flags & TEE_SHM_REGISTER) {
-   size_t n;
int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm);
  
  		if (rc)

dev_err(teedev->dev.parent,
"unregister shm %p failed: %d", shm, rc);
  
-		for (n = 0; n < shm->num_pages; n++)

-   put_page(shm->pages[n]);
-
+   unpin_user_pages(shm->pages, shm->num_pages);
kfree(shm->pages);
}
  
@@ -228,7 +225,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, unsigned long addr,

}
  
  	if (flags & TEE_SHM_USER_MAPPED) {

-   rc = get_user_pages_fast(start, num_pages, FOLL_WRITE,
+   rc = pin_user_pages_fast(start, num_pages, FOLL_WRITE,
 shm->pages);
} else {
struct kvec *kiov;
@@ -292,16 +289,13 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, 
unsigned long addr,
return shm;
  err:
if (shm) {
-   size_t n;
-
if (shm->id >= 0) {
mutex_lock(&teedev->mutex);
idr_remove(&teedev->idr, shm->id);
mutex_unlock(&teedev->mutex);
}
if (shm->pages) {
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
+   unpin_user_pages(shm->pages, shm->num_pages);
kfree(shm->pages);
}
}





Re: [PATCH 5/5] fs/ceph: use pipe_get_pages_alloc() for pipe

2020-08-24 Thread John Hubbard

On 8/24/20 11:54 AM, Matthew Wilcox wrote:

On Mon, Aug 24, 2020 at 02:47:53PM -0400, Jeff Layton wrote:

Ok, I'll plan to pick it up providing no one has issues with exporting that 
symbol.


_GPL, perhaps?



I looked at that, and the equivalent iov_iter_get_pages* and related stuff was
just EXPORT_SYMBOL, so I tried to match that. But if it needs to be _GPL then
that's fine too...

thanks,
--
John Hubbard
NVIDIA


[PATCH v3] tee: convert convert get_user_pages() --> pin_user_pages()

2020-08-24 Thread John Hubbard
This code was using get_user_pages*(), in a "Case 2" scenario
(DMA/RDMA), using the categorization from [1]. That means that it's
time to convert the get_user_pages*() + put_page() calls to
pin_user_pages*() + unpin_user_pages() calls.

Factor out a new, small release_registered_pages() function, in
order to consolidate the logic for discerning between
TEE_SHM_USER_MAPPED and TEE_SHM_KERNEL_MAPPED pages. This also
absorbs the kfree() call that is also required there.

There is some helpful background in [2]: basically, this is a small
part of fixing a long-standing disconnect between pinning pages, and
file systems' use of those pages.

[1] Documentation/core-api/pin_user_pages.rst

[2] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/

Cc: Jens Wiklander 
Cc: Sumit Semwal 
Cc: tee-...@lists.linaro.org
Cc: linux-me...@vger.kernel.org
Cc: dri-de...@lists.freedesktop.org
Cc: linaro-mm-...@lists.linaro.org
Signed-off-by: John Hubbard 
---

OK, one more try, this time actually handling the _USER_MAPPED vs.
_KERNEL_MAPPED pages!

thanks,
John Hubbard
NVIDIA

 drivers/tee/tee_shm.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/drivers/tee/tee_shm.c b/drivers/tee/tee_shm.c
index 827ac3d0fea9..00472f5ce22e 100644
--- a/drivers/tee/tee_shm.c
+++ b/drivers/tee/tee_shm.c
@@ -12,6 +12,22 @@
 #include <linux/uio.h>
 #include "tee_private.h"
 
+static void release_registered_pages(struct tee_shm *shm)
+{
+   if (shm->pages) {
+   if (shm->flags & TEE_SHM_USER_MAPPED) {
+   unpin_user_pages(shm->pages, shm->num_pages);
+   } else {
+   size_t n;
+
+   for (n = 0; n < shm->num_pages; n++)
+   put_page(shm->pages[n]);
+   }
+
+   kfree(shm->pages);
+   }
+}
+
 static void tee_shm_release(struct tee_shm *shm)
 {
struct tee_device *teedev = shm->ctx->teedev;
@@ -32,17 +48,13 @@ static void tee_shm_release(struct tee_shm *shm)
 
poolm->ops->free(poolm, shm);
} else if (shm->flags & TEE_SHM_REGISTER) {
-   size_t n;
int rc = teedev->desc->ops->shm_unregister(shm->ctx, shm);
 
if (rc)
dev_err(teedev->dev.parent,
"unregister shm %p failed: %d", shm, rc);
 
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
-
-   kfree(shm->pages);
+   release_registered_pages(shm);
}
 
teedev_ctx_put(shm->ctx);
@@ -228,7 +240,7 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, 
unsigned long addr,
}
 
if (flags & TEE_SHM_USER_MAPPED) {
-   rc = get_user_pages_fast(start, num_pages, FOLL_WRITE,
+   rc = pin_user_pages_fast(start, num_pages, FOLL_WRITE,
 shm->pages);
} else {
struct kvec *kiov;
@@ -292,18 +304,12 @@ struct tee_shm *tee_shm_register(struct tee_context *ctx, 
unsigned long addr,
return shm;
 err:
if (shm) {
-   size_t n;
-
if (shm->id >= 0) {
mutex_lock(&teedev->mutex);
idr_remove(&teedev->idr, shm->id);
mutex_unlock(&teedev->mutex);
}
-   if (shm->pages) {
-   for (n = 0; n < shm->num_pages; n++)
-   put_page(shm->pages[n]);
-   kfree(shm->pages);
-   }
+   release_registered_pages(shm);
}
kfree(shm);
teedev_ctx_put(ctx);
-- 
2.28.0



[PATCH v2] fs/ceph: use pipe_get_pages_alloc() for pipe

2020-08-24 Thread John Hubbard
This reduces, by one, the number of callers of iov_iter_get_pages().
That's helpful because these calls are being audited and converted over
to use iov_iter_pin_user_pages(), where applicable. And this one here is
already known by the caller to be only for ITER_PIPE, so let's just
simplify it now.

Signed-off-by: John Hubbard 
---

OK, here's a v2 that does EXPORT_SYMBOL_GPL, instead of EXPORT_SYMBOL,
that's the only change from v1. That should help give this patch a
clear bill of passage. :)

thanks,
John Hubbard
NVIDIA

 fs/ceph/file.c  | 3 +--
 include/linux/uio.h | 3 ++-
 lib/iov_iter.c  | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index d51c3f2fdca0..d3d7dd957390 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -879,8 +879,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct 
iov_iter *to,
more = len < iov_iter_count(to);
 
if (unlikely(iov_iter_is_pipe(to))) {
-   ret = iov_iter_get_pages_alloc(to, &pages, len,
-  &page_off);
+   ret = pipe_get_pages_alloc(to, &pages, len, &page_off);
if (ret <= 0) {
ceph_osdc_put_request(req);
ret = -ENOMEM;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 3835a8a8e9ea..270a4dcf5453 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -226,7 +226,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, struct page 
**pages,
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
-
+ssize_t pipe_get_pages_alloc(struct iov_iter *i, struct page ***pages,
+size_t maxsize, size_t *start);
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
 
 static inline size_t iov_iter_count(const struct iov_iter *i)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5e40786c8f12..6290998df480 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1355,9 +1355,8 @@ static struct page **get_pages_array(size_t n)
return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
 }
 
-static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
-  struct page ***pages, size_t maxsize,
-  size_t *start)
+ssize_t pipe_get_pages_alloc(struct iov_iter *i, struct page ***pages,
+size_t maxsize, size_t *start)
 {
struct page **p;
unsigned int iter_head, npages;
@@ -1387,6 +1386,7 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
kvfree(p);
return n;
 }
+EXPORT_SYMBOL_GPL(pipe_get_pages_alloc);
 
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
   struct page ***pages, size_t maxsize,
-- 
2.28.0



Re: [PATCH 0/5] bio: Direct IO: convert to pin_user_pages_fast()

2020-08-24 Thread John Hubbard

On 8/24/20 6:54 PM, Al Viro wrote:

On Fri, Aug 21, 2020 at 09:20:54PM -0700, John Hubbard wrote:


Direct IO behavior:

 ITER_IOVEC:
 pin_user_pages_fast();
 break;

 ITER_KVEC:// already elevated page refcount, leave alone
 ITER_BVEC:// already elevated page refcount, leave alone
 ITER_PIPE:// just, no :)


Why?  What's wrong with splice to O_DIRECT file?



Oh! I'll take a look. Is this the fs/splice.c stuff?  I ruled this out early
mainly based on Christoph's comment in [1] ("ITER_PIPE is rejected in the
direct I/O path"), but if it's supportable then I'll hook it up.

(As you can see, I'm still very much coming up to speed on the various things
that invoke iov_iter_get_pages*().)

[1] https://lore.kernel.org/kvm/20190724061750.ga19...@infradead.org/

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 0/5] bio: Direct IO: convert to pin_user_pages_fast()

2020-08-24 Thread John Hubbard

On 8/24/20 7:07 PM, Al Viro wrote:

On Tue, Aug 25, 2020 at 02:54:28AM +0100, Al Viro wrote:

On Fri, Aug 21, 2020 at 09:20:54PM -0700, John Hubbard wrote:


Direct IO behavior:

 ITER_IOVEC:
 pin_user_pages_fast();
 break;

 ITER_KVEC:// already elevated page refcount, leave alone
 ITER_BVEC:// already elevated page refcount, leave alone
 ITER_PIPE:// just, no :)


Why?  What's wrong with splice to O_DIRECT file?


Sorry - s/to/from/, obviously.

To spell it out: consider generic_file_splice_read() behaviour when
the source had been opened with O_DIRECT; you will get a call of
->read_iter() into ITER_PIPE destination.  And it bloody well
will hit iov_iter_get_pages() on common filesystems, to pick the
pages we want to read into.

So... what's wrong with having that "pin" primitive making sure
the pages are there and referenced by the pipe?



(our emails crossed) OK, yes, let me hook that up. I was just unaware
of that flow, I'll go off and figure it out.

Thanks for looking at this!

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 0/5] bio: Direct IO: convert to pin_user_pages_fast()

2020-08-24 Thread John Hubbard

On 8/24/20 7:22 PM, Al Viro wrote:

On Mon, Aug 24, 2020 at 07:07:02PM -0700, John Hubbard wrote:

On 8/24/20 6:54 PM, Al Viro wrote:

On Fri, Aug 21, 2020 at 09:20:54PM -0700, John Hubbard wrote:


Direct IO behavior:

  ITER_IOVEC:
  pin_user_pages_fast();
  break;

  ITER_KVEC:// already elevated page refcount, leave alone
  ITER_BVEC:// already elevated page refcount, leave alone
  ITER_PIPE:// just, no :)


Why?  What's wrong with splice to O_DIRECT file?



Oh! I'll take a look. Is this the fs/splice.c stuff?  I ruled this out early
mainly based on Christoph's comment in [1] ("ITER_PIPE is rejected in the
direct I/O path"), but if it's supportable then I'll hook it up.


; cat >a.c <<'EOF'
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>

int main()
{
 int fd = open("./a.out", O_DIRECT);
 splice(fd, NULL, 1, NULL, 4096, 0);
return 0;
}
EOF
; cc a.c
; ./a.out | wc -c
4096

and you just had ->read_iter() called with ITER_PIPE destination.



That example saves me a lot of time!  Much appreciated.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v3] tee: convert convert get_user_pages() --> pin_user_pages()

2020-08-25 Thread John Hubbard

On 8/25/20 1:32 AM, Jens Wiklander wrote:

On Mon, Aug 24, 2020 at 02:11:25PM -0700, John Hubbard wrote:

...

OK, one more try, this time actually handling the _USER_MAPPED vs.
_KERNEL_MAPPED pages!

thanks,
John Hubbard
NVIDIA


Looks good and it works too! :-) I've tested it on my Hikey board with
the OP-TEE test suite.
I'm picking this up.



Great! I see that I have, once again, somehow doubled up on the subject line:
"tee: convert convert ...". This particular typo just seems to stick to me. :)

If you get a chance to fix that up by changing it to just a single "convert"
I'd appreciate it.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH] mm/gup: don't permit users to call get_user_pages with FOLL_LONGTERM

2020-08-19 Thread John Hubbard

On 8/19/20 4:01 AM, Barry Song wrote:

gug prohibits users from calling get_user_pages() with FOLL_PIN. But it


Maybe Andrew can fix the typo above: gug --> gup.



allows users to call get_user_pages() with FOLL_LONGTERM only. It seems
insensible.

since FOLL_LONGTERM is a stricter case of FOLL_PIN, we should prohibit
users from calling get_user_pages() with FOLL_LONGTERM while not with
FOLL_PIN.

mm/gup_benchmark.c used to be the only user who did this improperly.
But it has been fixed by moving to use pin_user_pages().


For future patches, you don't have to write everything in the
commit log. Some things are better placed in a cover letter or after
the "---" line, because they don't need to be recorded forever.

Anyway, the diffs seem fine, assuming that you've audited the call sites.

thanks,
--
John Hubbard
NVIDIA



Cc: John Hubbard 
Cc: Jan Kara 
Cc: Jérôme Glisse 
Cc: "Matthew Wilcox (Oracle)" 
Cc: Al Viro 
Cc: Christoph Hellwig 
Cc: Dan Williams 
Cc: Dave Chinner 
Cc: Jason Gunthorpe 
Cc: Jonathan Corbet 
Cc: Michal Hocko 
Cc: Mike Kravetz 
Cc: Shuah Khan 
Cc: Vlastimil Babka 
Signed-off-by: Barry Song 
---
  mm/gup.c | 37 ++---
  1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index ae096ea7583f..4da669f79566 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1789,6 +1789,25 @@ static long __get_user_pages_remote(struct mm_struct *mm,
   gup_flags | FOLL_TOUCH | FOLL_REMOTE);
  }
  
+static bool is_valid_gup_flags(unsigned int gup_flags)

+{
+   /*
+* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+* never directly by the caller, so enforce that with an assertion:
+*/
+   if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+   return false;
+   /*
+* FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
+* that is, FOLL_LONGTERM is a specific case, more restrictive case of
+* FOLL_PIN.
+*/
+   if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+   return false;
+
+   return true;
+}
+
  /**
   * get_user_pages_remote() - pin user pages in memory
   * @mm:   mm_struct of target mm
@@ -1854,11 +1873,7 @@ long get_user_pages_remote(struct mm_struct *mm,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
  {
-   /*
-* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
-* never directly by the caller, so enforce that with an assertion:
-*/
-   if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+   if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
  
  	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,

@@ -1904,11 +1919,7 @@ long get_user_pages(unsigned long start, unsigned long 
nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas)
  {
-   /*
-* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
-* never directly by the caller, so enforce that with an assertion:
-*/
-   if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+   if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
  
  	return __gup_longterm_locked(current->mm, start, nr_pages,

@@ -2810,11 +2821,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
  int get_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
  {
-   /*
-* FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
-* never directly by the caller, so enforce that:
-*/
-   if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+   if (!is_valid_gup_flags(gup_flags))
return -EINVAL;
  
  	/*






Re: [PATCH] mm/gup_benchmark: update the documentation in Kconfig

2020-08-20 Thread John Hubbard

On 8/20/20 8:25 PM, Barry Song wrote:

In the beginning, mm/gup_benchmark.c supported get_user_pages_fast()
only, but right now, it supports the benchmarking of a couple of
get_user_pages() related calls like:
* get_user_pages_fast()
* get_user_pages()
* pin_user_pages_fast()
* pin_user_pages()
The documentation is confusing and needs update.


hmmm, it's not that confusing, given that pin_user_pages() and
get_user_pages() use the same underlying get_user_pages()
implementation.
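
For example, pin_user_pages() is little more than this (a sketch,
simplified from mm/gup.c, with the FOLL_GET sanity check omitted):

    long pin_user_pages(unsigned long start, unsigned long nr_pages,
                        unsigned int gup_flags, struct page **pages,
                        struct vm_area_struct **vmas)
    {
            /* The only real difference: mark the pages FOLL_PIN */
            gup_flags |= FOLL_PIN;
            return __gup_longterm_locked(current->mm, start, nr_pages,
                                         pages, vmas, gup_flags);
    }

...so benchmarking one of them exercises nearly all of the other.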



Cc: John Hubbard 
Cc: Keith Busch 
Cc: Ira Weiny 
Cc: Kirill A. Shutemov 
Signed-off-by: Barry Song 
---
  mm/Kconfig | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 6c974888f86f..f7c9374da7b3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -831,10 +831,10 @@ config PERCPU_STATS
  be used to help understand percpu memory usage.
  
  config GUP_BENCHMARK

-   bool "Enable infrastructure for get_user_pages_fast() benchmarking"
+   bool "Enable infrastructure for get_user_pages() and related calls 
benchmarking"


If we really want to go to the trouble of tweaking this, then I'd go with
something more like:

"Enable infrastructure for get_user_pages() and pin_user_pages benchmarking"

...but I don't think it really warrants a patch just yet. *However*, my
judgment is skewed right now, because I'm planning a small patchset to split
up gup_benchmark a little bit, and to add some more testing and take advantage
of parts of it to do a dump_page() test. At which point "related calls" would
make more sense, but then it would be different enough that this patch would
still need changing.

So I'm inclined to just recommend leaving this alone for a bit, but if others
want to put it in, I'm OK with that too.


help
  Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages_fast().
+ performance of get_user_pages() and related calls.
  
  	  See tools/testing/selftests/vm/gup_benchmark.c
  



thanks,
--
John Hubbard
NVIDIA


Re: [PATCH] mm/gup: don't permit users to call get_user_pages with FOLL_LONGTERM

2020-09-03 Thread John Hubbard

On 9/3/20 12:12 AM, Souptick Joarder wrote:

On Wed, Aug 19, 2020 at 11:45 PM John Hubbard  wrote:


On 8/19/20 4:01 AM, Barry Song wrote:

gug prohibits users from calling get_user_pages() with FOLL_PIN. But it


Maybe Andrew can fix the typo above: gug --> gup.



allows users to call get_user_pages() with FOLL_LONGTERM only. It seems
insensible.

since FOLL_LONGTERM is a stricter case of FOLL_PIN, we should prohibit
users from calling get_user_pages() with FOLL_LONGTERM while not with
FOLL_PIN.

mm/gup_benchmark.c used to be the only user who did this improperly.
But it has been fixed by moving to use pin_user_pages().


For future patches, you don't have to write everything in the
commit log. Some things are better placed in a cover letter or after
the "---" line, because they don't need to be recorded forever.

Anyway, the diffs seem fine, assuming that you've audited the call sites.


We can use is_valid_gup_flags() inside ->
get_user_pages_locked(),
get_user_pages_unlocked(),
pin_user_pages_locked() as well.


Probably it's best to discern between valid pup flags, and valid gup flags.
As in: separate functions for those. Maybe one is a subset of the other, but
still.



Are you planning to add it in future patches ?



It's not on my list. I don't see anything wrong with doing so, other
than avoiding the minor pitfall I called out above. So if you want to
do that, then feel free...


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 16/38] media: videobuf-dma-sg: number of pages should be unsigned long

2020-09-03 Thread John Hubbard

On 9/2/20 9:10 AM, Mauro Carvalho Chehab wrote:

As reported by smatch:

drivers/media/v4l2-core/videobuf-dma-sg.c:245 videobuf_dma_init_kernel() 
warn: should 'nr_pages << 12' be a 64 bit type?

The printk should not be using %d for the number of pages.

After looking better, the real problem here is that the
number of pages should be long int.

Signed-off-by: Mauro Carvalho Chehab 
---
  drivers/media/v4l2-core/videobuf-dma-sg.c | 22 --
  include/media/videobuf-dma-sg.h   |  2 +-
  2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c 
b/drivers/media/v4l2-core/videobuf-dma-sg.c
index 46ff19df9f53..8dd0562de287 100644
--- a/drivers/media/v4l2-core/videobuf-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf-dma-sg.c
@@ -180,7 +180,7 @@ static int videobuf_dma_init_user_locked(struct 
videobuf_dmabuf *dma,
if (rw == READ)
flags |= FOLL_WRITE;
  
-	dprintk(1, "init user [0x%lx+0x%lx => %d pages]\n",

+   dprintk(1, "init user [0x%lx+0x%lx => %lu pages]\n",
data, size, dma->nr_pages);
  
  	err = pin_user_pages(data & PAGE_MASK, dma->nr_pages,



One pre-existing detail to remember is that the gup/pup routines,
specifically pin_user_pages() in this case, use an "int" for the
incoming nr_pages. (I wonder if that should be changed? It's now
becoming a pitfall.) So it's now possible to overflow.

In other situations like this (see xsdfec_table_write() in
drivers/misc/xilinx_sdfec.c), we've added checks such as:

u32 n;
...

if (WARN_ON_ONCE(n > INT_MAX))
return -EINVAL;

nr_pages = n;

res = pin_user_pages_fast((unsigned long)src_ptr, nr_pages, 0, pages);

...in other words, check the value while it's stored in a 64-bit type,
before sending it down into a 32-bit API.

...other than that, everything else looks fine.

thanks,
--
John Hubbard
NVIDIA


[PATCH v2 0/3] bio: Direct IO: convert to pin_user_pages_fast()

2020-08-29 Thread John Hubbard
Hi,

Changes since v1:

* Now handles ITER_PIPE, by applying pin_user_page() to ITER_PIPE pages,
on the Direct IO path. Thanks to Al Viro for pointing me in the right
direction there.

* Removed the ceph and BIO_FOLL_PIN patches: the ceph improvements were
handled separately as a different patch entirely, by Jeff Layton. And
the BIO_FOLL_PIN idea turned out to be completely undesirable here.

Original cover letter, updated for v2:

This converts the Direct IO block/bio layer over to use FOLL_PIN pages
(those acquired via pin_user_pages*()). This effectively converts
several file systems (ext4, for example) that use the common Direct IO
routines. See "Remaining work", below for a bit more detail there.

Quite a few approaches have been considered over the years. This one is
inspired by Christoph Hellwig's July, 2019 observation that there are
only 5 ITER_ types, and we can simplify handling of them for Direct IO
[1]. After working through how bio submission and completion works, I
became convinced that this is the simplest and cleanest approach to
conversion.

Design notes 

This whole approach depends on certain concepts:

1) Each struct bio instance must not mix different types of pages:
FOLL_PIN and non-FOLL_PIN pages. (By FOLL_PIN I'm referring to pages
that were acquired and pinned via pin_user_page*() routines.)
Fortunately, this is already an enforced constraint for bio's, as
evidenced by the existence and use of BIO_NO_PAGE_REF.
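
(For reference, here is roughly what the release side looks like today,
simplified from block/bio.c, with the mark_dirty handling omitted:

    void bio_release_pages(struct bio *bio, bool mark_dirty)
    {
            struct bvec_iter_all iter_all;
            struct bio_vec *bvec;

            /* bio's flagged BIO_NO_PAGE_REF never took page references */
            if (bio_flagged(bio, BIO_NO_PAGE_REF))
                    return;

            bio_for_each_segment_all(bvec, bio, iter_all)
                    put_page(bvec->bv_page); /* becomes unpin_user_page()
                                              * in patch 3 of this series */
    }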

2) Christoph Hellwig's July, 2019 observation that there are
only 5 ITER_ types, and we can simplify handling of them for Direct IO
[1]. Accordingly, this series implements the following pseudocode:

Direct IO behavior:

ITER_IOVEC:
pin_user_pages_fast();
break;

ITER_PIPE:
for each page:
 pin_user_page();
break;

ITER_KVEC:// already elevated page refcount, leave alone
ITER_BVEC:// already elevated page refcount, leave alone
ITER_DISCARD: // discard
return -EFAULT or -EINVAL;

...which works for callers that already have sorted out which case they
are in. Such as, Direct IO in the block/bio layers.
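
From a Direct IO caller's perspective, the acquire/release pairing then
looks like this (a sketch, not verbatim from the diffs):

    /* acquire: always FOLL_PIN, via the new mid-level helper */
    size = iov_iter_pin_user_pages(iter, pages, LONG_MAX, nr_pages, &offset);
    if (unlikely(size <= 0))
            return size ? size : -EFAULT;
    ...
    /* release: must match the acquisition side */
    unpin_user_page(page);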

Now, this does leave ITER_KVEC and ITER_BVEC unconverted, but on the
other hand, it's not clear that these are actually affected in the real
world, by the get_user_pages()+filesystem interaction problems of [2].
If it turns out to matter, then those can be handled too, but it's just
more refactoring and surgery to do so.

Testing
===

Performance: no obvious regressions from running fio (direct=1: Direct
IO) on both SSD and NVMe drives.

Functionality: selected non-destructive bare metal xfstests on xfs,
ext4, btrfs, orangefs filesystems, plus LTP tests.

Note that I have only a single x86 64-bit test machine, though.

Remaining work
==

Non-converted call sites for iter_iov_get_pages*() at the
moment include: net, crypto, cifs, ceph, vhost, fuse, nfs/direct,
vhost/scsi. However, it's not clear which of those really have to be
converted, because some of them probably use ITER_BVEC or ITER_KVEC.

About-to-be-converted sites (in a subsequent patch) are: Direct IO for
filesystems that use the generic read/write functions.

[1] https://lore.kernel.org/kvm/20190724061750.ga19...@infradead.org/

[2] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/


John Hubbard (3):
  mm/gup: introduce pin_user_page()
  iov_iter: introduce iov_iter_pin_user_pages*() routines
  bio: convert get_user_pages_fast() --> pin_user_pages_fast()

 block/bio.c  |  24 +-
 block/blk-map.c  |   6 +--
 fs/direct-io.c   |  28 +--
 fs/iomap/direct-io.c |   2 +-
 include/linux/mm.h   |   2 +
 include/linux/uio.h  |   5 ++
 lib/iov_iter.c   | 110 +++
 mm/gup.c |  30 
 8 files changed, 169 insertions(+), 38 deletions(-)

-- 
2.28.0



[PATCH v2 2/3] iov_iter: introduce iov_iter_pin_user_pages*() routines

2020-08-29 Thread John Hubbard
The new routines are:
iov_iter_pin_user_pages()
iov_iter_pin_user_pages_alloc()

and those correspond to these pre-existing routines:
iov_iter_get_pages()
iov_iter_get_pages_alloc()

Also, pipe_get_pages() and related are changed so as to pass
down a "use_pup" (use pin_user_page() instead of get_page()) bool
argument.

Unlike the iov_iter_get_pages*() routines, the
iov_iter_pin_user_pages*() routines assert that only ITER_IOVEC or
ITER_PIPE items are passed in. They then call pin_user_page*(), instead
of get_user_pages_fast() or get_page().

Why: In order to incrementally change Direct IO callers from calling
get_user_pages_fast() and put_page(), over to calling
pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
routines that specifically call one or the other systems, for both page
acquisition and page release.

Signed-off-by: John Hubbard 
---
 include/linux/uio.h |   5 ++
 lib/iov_iter.c  | 110 
 2 files changed, 107 insertions(+), 8 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 3835a8a8e9ea..29b0504a27cc 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -229,6 +229,11 @@ int iov_iter_npages(const struct iov_iter *i, int 
maxpages);
 
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
 
+ssize_t iov_iter_pin_user_pages(struct iov_iter *i, struct page **pages,
+   size_t maxsize, unsigned int maxpages, size_t *start);
+ssize_t iov_iter_pin_user_pages_alloc(struct iov_iter *i, struct page ***pages,
+   size_t maxsize, size_t *start);
+
 static inline size_t iov_iter_count(const struct iov_iter *i)
 {
return i->count;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5e40786c8f12..f2eb3279 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1269,7 +1269,8 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
size_t maxsize,
struct page **pages,
int iter_head,
-   size_t *start)
+   size_t *start,
+   bool use_pup)
 {
struct pipe_inode_info *pipe = i->pipe;
unsigned int p_mask = pipe->ring_size - 1;
@@ -1280,7 +1281,11 @@ static inline ssize_t __pipe_get_pages(struct iov_iter 
*i,
maxsize = n;
n += *start;
while (n > 0) {
-   get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
+   if (use_pup)
+   pin_user_page(*pages++ = pipe->bufs[iter_head & 
p_mask].page);
+   else
+   get_page(*pages++ = pipe->bufs[iter_head & 
p_mask].page);
+
iter_head++;
n -= PAGE_SIZE;
}
@@ -1290,7 +1295,7 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 
 static ssize_t pipe_get_pages(struct iov_iter *i,
   struct page **pages, size_t maxsize, unsigned maxpages,
-  size_t *start)
+  size_t *start, bool use_pup)
 {
unsigned int iter_head, npages;
size_t capacity;
@@ -1306,8 +1311,51 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
capacity = min(npages, maxpages) * PAGE_SIZE - *start;
 
-   return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, 
start);
+   return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head,
+   start, use_pup);
+}
+
+ssize_t iov_iter_pin_user_pages(struct iov_iter *i,
+  struct page **pages, size_t maxsize, unsigned int maxpages,
+  size_t *start)
+{
+   size_t skip = i->iov_offset;
+   const struct iovec *iov;
+   struct iovec v;
+
+   if (unlikely(iov_iter_is_pipe(i)))
+   return pipe_get_pages(i, pages, maxsize, maxpages, start, true);
+   if (unlikely(iov_iter_is_discard(i)))
+   return -EFAULT;
+   if (WARN_ON_ONCE(!iter_is_iovec(i)))
+   return -EFAULT;
+
+   if (unlikely(!maxsize))
+   return 0;
+   maxsize = min(maxsize, i->count);
+
+   iterate_iovec(i, maxsize, v, iov, skip, ({
+   unsigned long addr = (unsigned long)v.iov_base;
+   size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+   int n;
+   int res;
+
+   if (len > maxpages * PAGE_SIZE)
+   len = maxpages * PAGE_SIZE;
+   addr &= ~(PAGE_SIZE - 1);
+   n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+   res = pin_user_pages_fast(addr, n,
+   iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
+   pages);
+   if (unlikely

[PATCH v2 3/3] bio: convert get_user_pages_fast() --> pin_user_pages_fast()

2020-08-29 Thread John Hubbard
Change generic block/bio Direct IO routines, to acquire FOLL_PIN user
pages via the recently added routines:

iov_iter_pin_user_pages()
iov_iter_pin_user_pages_alloc()
pin_user_page()

This effectively converts several file systems (ext4, for example) that
use the common Direct IO routines.

Change the corresponding page release calls from put_page() to
unpin_user_page().

Change bio_release_pages() to handle FOLL_PIN pages. In fact, that
is now the *only* type of pages it handles now.

Design notes


Quite a few approaches have been considered over the years. This one is
inspired by Christoph Hellwig's July, 2019 observation that there are
only 5 ITER_ types, and we can simplify handling of them for Direct IO
[1]. Accordingly, this patch implements the following pseudocode:

Direct IO behavior:

ITER_IOVEC:
pin_user_pages_fast();
break;

ITER_PIPE:
for each page:
 pin_user_page();
break;

ITER_KVEC:// already elevated page refcount, leave alone
ITER_BVEC:// already elevated page refcount, leave alone
ITER_DISCARD: // discard
return -EFAULT or -EINVAL;

...which works for callers that already have sorted out which case they
are in. Such as, Direct IO in the block/bio layers.

Now, this does leave ITER_KVEC and ITER_BVEC unconverted, but on the
other hand, it's not clear that these are actually affected in the real
world, by the get_user_pages()+filesystem interaction problems of [2].
If it turns out to matter, then those can be handled too, but it's just
more refactoring and surgery to do so.

Page acquisition: The iov_iter_get_pages*() routines
above are at just the right level in the call stack: the callers already
know which system to use, and so it's a small change to just drop in the
replacement routines. And it's a fan-in/fan-out point: block/bio call
sites for Direct IO funnel their page acquisitions through the
iov_iter_get_pages*() routines, and there are many other callers of
those. And we can't convert all of the callers at once--too many
subsystems are involved, and it would be a too large and too risky
patch.

Page release: there are already separate release routines: put_page()
vs. unpin_user_page(), so it's already done there.

[1] https://lore.kernel.org/kvm/20190724061750.ga19...@infradead.org/

[2] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/

Signed-off-by: John Hubbard 
---
 block/bio.c  | 24 
 block/blk-map.c  |  6 +++---
 fs/direct-io.c   | 28 ++--
 fs/iomap/direct-io.c |  2 +-
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index a9931f23d933..f54e9414e6d9 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -955,7 +955,7 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
-   put_page(bvec->bv_page);
+   unpin_user_page(bvec->bv_page);
}
 }
 EXPORT_SYMBOL_GPL(bio_release_pages);
@@ -986,9 +986,9 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct 
iov_iter *iter)
  * @iter: iov iterator describing the region to be mapped
  *
  * Pins pages from *iter and appends them to @bio's bvec array. The
- * pages will have to be released using put_page() when done.
- * For multi-segment *iter, this function only adds pages from the
- * next non-empty segment of the iov iterator.
+ * pages will have to be released using put_page() or unpin_user_page() when
+ * done. For multi-segment *iter, this function only adds pages from the next
+ * non-empty segment of the iov iterator.
  */
 static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
@@ -1009,7 +1009,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, 
struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-   size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+   size = iov_iter_pin_user_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
 
@@ -1020,7 +1020,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, 
struct iov_iter *iter)
 
if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
if (same_page)
-   put_page(page);
+   unpin_user_page(page);
} else {
if (WARN_ON_ONCE(bio_full(bio, len)))
 return -EINVAL;
@@ -1056,7 +1056,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, 
struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages +=

[PATCH v2 1/3] mm/gup: introduce pin_user_page()

2020-08-29 Thread John Hubbard
pin_user_page() is the FOLL_PIN equivalent of get_page().

This was always a missing piece of the pin/unpin API calls (early
reviewers of pin_user_pages() asked about it, in fact), but until now,
it just wasn't needed. Finally though, now that the Direct IO pieces in
block/bio are about to be converted to use FOLL_PIN, it turns out that
there are some cases in which get_page() and get_user_pages_fast() were
both used. Converting those sites requires a drop-in replacement for
get_page(), which this patch supplies.

[1] and [2] provide some background about pin_user_pages() in general.

[1] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/

[2] Documentation/core-api/pin_user_pages.rst
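
Usage is deliberately symmetric with get_page()/put_page() (sketch):

    /* take an extra, FOLL_PIN-style reference on a page we already hold */
    pin_user_page(page);
    ...
    /* and drop it via the FOLL_PIN release path */
    unpin_user_page(page);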

Signed-off-by: John Hubbard 
---
 include/linux/mm.h |  2 ++
 mm/gup.c   | 30 ++
 2 files changed, 32 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 97c83773b6f0..51c6ae4b63f7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1152,6 +1152,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
 }
 
+void pin_user_page(struct page *page);
+
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
 
 static inline __must_check bool try_get_page(struct page *page)
diff --git a/mm/gup.c b/mm/gup.c
index ae096ea7583f..2cae5bbbc862 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -123,6 +123,36 @@ static __maybe_unused struct page 
*try_grab_compound_head(struct page *page,
return NULL;
 }
 
+/*
+ * pin_user_page() - elevate the page refcount, and mark as FOLL_PIN
+ *
+ * This is the FOLL_PIN equivalent of get_page(). It is intended for use when the
+ * page will be released via unpin_user_page().
+ */
+void pin_user_page(struct page *page)
+{
+   int refs = 1;
+
+   page = compound_head(page);
+
+   VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+
+   if (hpage_pincount_available(page))
+   hpage_pincount_add(page, 1);
+   else
+   refs = GUP_PIN_COUNTING_BIAS;
+
+   /*
+* Similar to try_grab_compound_head(): even if using the
+* hpage_pincount_add/_sub() routines, be sure to
+* *also* increment the normal page refcount field at least
+* once, so that the page really is pinned.
+*/
+   page_ref_add(page, refs);
+
+   mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+}
+
 /**
  * try_grab_page() - elevate a page's refcount by a flag-dependent amount
  *
-- 
2.28.0



Re: [PATCH v2 1/3] mm/gup: introduce pin_user_page()

2020-08-29 Thread John Hubbard

On 8/29/20 7:54 AM, Christoph Hellwig wrote:

On Sat, Aug 29, 2020 at 01:08:51AM -0700, John Hubbard wrote:

pin_user_page() is the FOLL_PIN equivalent of get_page().

This was always a missing piece of the pin/unpin API calls (early
reviewers of pin_user_pages() asked about it, in fact), but until now,
it just wasn't needed. Finally though, now that the Direct IO pieces in
block/bio are about to be converted to use FOLL_PIN, it turns out that
there are some cases in which get_page() and get_user_pages_fast() were
both used. Converting those sites requires a drop-in replacement for
get_page(), which this patch supplies.


I find the name really confusing vs pin_user_pages*, as it suggests as
single version of the same.  It also seems partially wrong, at least
in the direct I/O case as the only thing pinned here is the zero page.

So maybe pin_kernel_page is a better name together with an explanation?
Or just pin_page?



Yes. Both its behavior and use are closer to get_page() than it is to
get_user_pages(). So pin_page() is just right.


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2 2/3] iov_iter: introduce iov_iter_pin_user_pages*() routines

2020-08-29 Thread John Hubbard

On 8/29/20 7:58 AM, Christoph Hellwig wrote:

On Sat, Aug 29, 2020 at 01:08:52AM -0700, John Hubbard wrote:

...

@@ -1280,7 +1281,11 @@ static inline ssize_t __pipe_get_pages(struct iov_iter 
*i,
maxsize = n;
n += *start;
while (n > 0) {
-   get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
+   if (use_pup)
+   pin_user_page(*pages++ = pipe->bufs[iter_head & 
p_mask].page);
+   else
+   get_page(*pages++ = pipe->bufs[iter_head & 
p_mask].page);


Maybe this would become a little more readable with a local variable
and a little more verbosity:

struct page *page = pipe->bufs[iter_head & p_mask].page;

if (use_pup)
pin_user_page(page);
else
get_page(page);

*pages++ = page;



Yes, that is cleaner, I'll change to that, thanks.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2 3/3] bio: convert get_user_pages_fast() --> pin_user_pages_fast()

2020-08-29 Thread John Hubbard

On 8/29/20 8:02 AM, Christoph Hellwig wrote:

-   size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, );
+   size = iov_iter_pin_user_pages(iter, pages, LONG_MAX, nr_pages, 
);


This is really a comment to the previous patch, but I only spotted it
here:  I think the right name is iov_iter_pin_pages, as bvec, kvec and
pipe aren't usually user pages.  Same as iov_iter_get_pages vs
get_user_pages.  Same for the _alloc variant.



Yes, it is clearly misnamed now! Will fix.


+ * here on.  It will run one unpin_user_page() against each page
+ * and will run one bio_put() against the BIO.


Nit: the "and" and the "will" still fit on the previous line.



Sorry about that, *usually* my text editor does the Right Thing for
those, I must have interfered with the natural flow of things. :)


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2 2/3] iov_iter: introduce iov_iter_pin_user_pages*() routines

2020-08-30 Thread John Hubbard

On 8/30/20 1:17 PM, Al Viro wrote:
...

Why: In order to incrementally change Direct IO callers from calling
get_user_pages_fast() and put_page(), over to calling
pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
routines that specifically call one or the other systems, for both page
acquisition and page release.


Hmm...  Do you plan to kill iov_iter_get_pages* off, eventually getting
rid of that use_pup argument?



Yes. That is definitely something I'm interested in doing, and in fact,
I started to write words to that effect into the v1 cover letter. I lost
confidence at the last minute, after poking around the remaining call
sites (which are mostly network file systems, plus notably io_uring),
and wondering if I really understood what the hell I was doing. :)

So I decided to reduce the scope of the proposal, until I got some
feedback. Which I now have!

Looking at this again, I see that there are actually *very* few
ITER_KVEC and ITER_BVEC callers, so...yes, maybe this can all be
collapsed down to calling the new functions, which would always use pup,
and lead to the simplification you asked about.

Any advance warnings, advice, design thoughts are definitely welcome
here.


thanks,
--
John Hubbard
NVIDIA


[PATCH v3 3/3] bio: convert get_user_pages_fast() --> pin_user_pages_fast()

2020-08-31 Thread John Hubbard
Change generic block/bio Direct IO routines, to acquire FOLL_PIN user
pages via the recently added routines:

iov_iter_pin_pages()
iov_iter_pin_pages_alloc()
pin_page()

This effectively converts several file systems (ext4, for example) that
use the common Direct IO routines.

Change the corresponding page release calls from put_page() to
unpin_user_page().

Change bio_release_pages() to handle FOLL_PIN pages. In fact, after this
patch, that is the *only* type of pages that bio_release_pages()
handles.

Design notes


Quite a few approaches have been considered over the years. This one is
inspired by Christoph Hellwig's July, 2019 observation that there are
only 5 ITER_ types, and we can simplify handling of them for Direct IO
[1]. Accordingly, this patch implements the following pseudocode:

Direct IO behavior:

ITER_IOVEC:
pin_user_pages_fast();
break;

ITER_PIPE:
for each page:
 pin_page();
break;

ITER_KVEC:// already elevated page refcount, leave alone
ITER_BVEC:// already elevated page refcount, leave alone
ITER_DISCARD: // discard
return -EFAULT or -EINVAL;

...which works for callers that already have sorted out which case they
are in. Such as, Direct IO in the block/bio layers.

Note that this does leave ITER_KVEC and ITER_BVEC unconverted, for now.

Page acquisition: The iov_iter_get_pages*() routines above are at just
the right level in the call stack: the callers already know which system
to use, and so it's a small change to just drop in the replacement
routines. And it's a fan-in/fan-out point: block/bio call sites for
Direct IO funnel their page acquisitions through the
iov_iter_get_pages*() routines, and there are many other callers of
those. And we can't convert all of the callers at once--too many
subsystems are involved, and it would be a too large and too risky
patch.

Page release: there are already separate release routines: put_page()
vs. unpin_user_page(), so it's already done there.

[1] https://lore.kernel.org/kvm/20190724061750.ga19...@infradead.org/

[2] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/

Cc: Christoph Hellwig 
Cc: Al Viro 
Signed-off-by: John Hubbard 
---
 block/bio.c  | 24 
 block/blk-map.c  |  6 +++---
 fs/direct-io.c   | 28 ++--
 fs/iomap/direct-io.c |  2 +-
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index a9931f23d933..76c5843f6050 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -955,7 +955,7 @@ void bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
-   put_page(bvec->bv_page);
+   unpin_user_page(bvec->bv_page);
}
 }
 EXPORT_SYMBOL_GPL(bio_release_pages);
@@ -986,9 +986,9 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct 
iov_iter *iter)
  * @iter: iov iterator describing the region to be mapped
  *
  * Pins pages from *iter and appends them to @bio's bvec array. The
- * pages will have to be released using put_page() when done.
- * For multi-segment *iter, this function only adds pages from the
- * next non-empty segment of the iov iterator.
+ * pages will have to be released using put_page() or unpin_user_page() when
+ * done. For multi-segment *iter, this function only adds pages from the next
+ * non-empty segment of the iov iterator.
  */
 static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 {
@@ -1009,7 +1009,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, 
struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-   size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+   size = iov_iter_pin_pages(iter, pages, LONG_MAX, nr_pages, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
 
@@ -1020,7 +1020,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, 
struct iov_iter *iter)
 
if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
if (same_page)
-   put_page(page);
+   unpin_user_page(page);
} else {
if (WARN_ON_ONCE(bio_full(bio, len)))
 return -EINVAL;
@@ -1056,7 +1056,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, 
struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-   size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+   size = iov_iter_pin_pages(iter, pages, LONG_MAX, nr_pages, &offset);
	if (unlikely(size <= 0))

[PATCH v3 2/3] iov_iter: introduce iov_iter_pin_pages*() routines

2020-08-31 Thread John Hubbard
The new routines are:
iov_iter_pin_pages()
iov_iter_pin_pages_alloc()

and those correspond to these pre-existing routines:
iov_iter_get_pages()
iov_iter_get_pages_alloc()

Also, pipe_get_pages() and related are changed so as to pass
down a "use_pup" (use pin_page() instead of get_page()) bool
argument.

Unlike the iov_iter_get_pages*() routines, the iov_iter_pin_pages*()
routines assert that only ITER_IOVEC or ITER_PIPE items are passed in.
They then call pin_user_pages_fast() or pin_page(), instead of
get_user_pages_fast() or get_page().

Why: In order to incrementally change Direct IO callers from calling
get_user_pages_fast() and put_page(), over to calling
pin_user_pages_fast() and unpin_user_page(), there need to be mid-level
routines that specifically call one or the other systems, for both page
acquisition and page release.

Cc: Christoph Hellwig 
Cc: Al Viro 
Signed-off-by: John Hubbard 
---
 include/linux/uio.h |   5 ++
 lib/iov_iter.c  | 113 
 2 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 3835a8a8e9ea..e44eed12afdf 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -229,6 +229,11 @@ int iov_iter_npages(const struct iov_iter *i, int 
maxpages);
 
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
 
+ssize_t iov_iter_pin_pages(struct iov_iter *i, struct page **pages,
+   size_t maxsize, unsigned int maxpages, size_t *start);
+ssize_t iov_iter_pin_pages_alloc(struct iov_iter *i, struct page ***pages,
+   size_t maxsize, size_t *start);
+
 static inline size_t iov_iter_count(const struct iov_iter *i)
 {
return i->count;
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5e40786c8f12..2dc1f4812fa9 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1269,7 +1269,8 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
size_t maxsize,
struct page **pages,
int iter_head,
-   size_t *start)
+   size_t *start,
+   bool use_pup)
 {
struct pipe_inode_info *pipe = i->pipe;
unsigned int p_mask = pipe->ring_size - 1;
@@ -1280,7 +1281,14 @@ static inline ssize_t __pipe_get_pages(struct iov_iter 
*i,
maxsize = n;
n += *start;
while (n > 0) {
-   get_page(*pages++ = pipe->bufs[iter_head & p_mask].page);
+   struct page *page = pipe->bufs[iter_head & p_mask].page;
+
+   if (use_pup)
+   pin_page(page);
+   else
+   get_page(page);
+
+   *pages++ = page;
iter_head++;
n -= PAGE_SIZE;
}
@@ -1290,7 +1298,7 @@ static inline ssize_t __pipe_get_pages(struct iov_iter *i,
 
 static ssize_t pipe_get_pages(struct iov_iter *i,
   struct page **pages, size_t maxsize, unsigned maxpages,
-  size_t *start)
+  size_t *start, bool use_pup)
 {
unsigned int iter_head, npages;
size_t capacity;
@@ -1306,9 +1314,52 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
npages = pipe_space_for_user(iter_head, i->pipe->tail, i->pipe);
capacity = min(npages, maxpages) * PAGE_SIZE - *start;
 
-   return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, 
start);
+   return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head,
+   start, use_pup);
 }
 
+ssize_t iov_iter_pin_pages(struct iov_iter *i,
+  struct page **pages, size_t maxsize, unsigned int maxpages,
+  size_t *start)
+{
+   size_t skip = i->iov_offset;
+   const struct iovec *iov;
+   struct iovec v;
+
+   if (unlikely(iov_iter_is_pipe(i)))
+   return pipe_get_pages(i, pages, maxsize, maxpages, start, true);
+   if (unlikely(iov_iter_is_discard(i)))
+   return -EFAULT;
+   if (WARN_ON_ONCE(!iter_is_iovec(i)))
+   return -EFAULT;
+
+   if (unlikely(!maxsize))
+   return 0;
+   maxsize = min(maxsize, i->count);
+
+   iterate_iovec(i, maxsize, v, iov, skip, ({
+   unsigned long addr = (unsigned long)v.iov_base;
+   size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
+   int n;
+   int res;
+
+   if (len > maxpages * PAGE_SIZE)
+   len = maxpages * PAGE_SIZE;
+   addr &= ~(PAGE_SIZE - 1);
+   n = DIV_ROUND_UP(len, PAGE_SIZE);
+
+   res = pin_user_pages_fast(addr, n,
+   iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
+ 

[PATCH v3 1/3] mm/gup: introduce pin_page()

2020-08-31 Thread John Hubbard
pin_page() is the FOLL_PIN equivalent of get_page().

This was always a missing piece of the pin/unpin API calls (early
reviewers of pin_user_pages() asked about it, in fact), but until now,
it just wasn't needed. Finally though, now that the Direct IO pieces in
block/bio are about to be converted to use FOLL_PIN, it turns out that
there are some cases in which get_page() and get_user_pages_fast() were
both used. Converting those sites requires a drop-in replacement for
get_page(), which this patch supplies.

[1] and [2] provide some background about the overall effort to convert
things to pin_user_page*() and unpin_user_page*().

[1] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/

[2] Documentation/core-api/pin_user_pages.rst

Cc: Christoph Hellwig 
Signed-off-by: John Hubbard 
---
 include/linux/mm.h |  2 ++
 mm/gup.c   | 33 +
 2 files changed, 35 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ca6e6a81576b..24240cf66c44 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1154,6 +1154,8 @@ static inline void get_page(struct page *page)
page_ref_inc(page);
 }
 
+void pin_page(struct page *page);
+
 bool __must_check try_grab_page(struct page *page, unsigned int flags);
 
 static inline __must_check bool try_get_page(struct page *page)
diff --git a/mm/gup.c b/mm/gup.c
index ae096ea7583f..a3a4bfae224a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -123,6 +123,39 @@ static __maybe_unused struct page 
*try_grab_compound_head(struct page *page,
return NULL;
 }
 
+/*
+ * pin_page() - elevate the page refcount, and mark as FOLL_PIN
+ *
+ * This is the FOLL_PIN equivalent of get_page(). It is intended for use when the
+ * page will be released via unpin_user_page().
+ *
+ * Unlike pin_user_page*(), pin_page() may be used on nearly any page, not just
+ * userspace-allocated pages.
+ */
+void pin_page(struct page *page)
+{
+   int refs = 1;
+
+   page = compound_head(page);
+
+   VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+
+   if (hpage_pincount_available(page))
+   hpage_pincount_add(page, 1);
+   else
+   refs = GUP_PIN_COUNTING_BIAS;
+
+   /*
+* Similar to try_grab_compound_head(): even if using the
+* hpage_pincount_add/_sub() routines, be sure to
+* *also* increment the normal page refcount field at least
+* once, so that the page really is pinned.
+*/
+   page_ref_add(page, refs);
+
+   mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_ACQUIRED, 1);
+}
+
 /**
  * try_grab_page() - elevate a page's refcount by a flag-dependent amount
  *
-- 
2.28.0



[PATCH v3 0/3] bio: Direct IO: convert to pin_user_pages_fast()

2020-08-31 Thread John Hubbard
bio: convert get_user_pages_fast() --> pin_user_pages_fast()

Change generic block/bio Direct IO routines, to acquire FOLL_PIN user
pages via the recently added routines:

iov_iter_pin_pages()
iov_iter_pin_pages_alloc()
pin_page()

This effectively converts several file systems (ext4, for example) that
use the common Direct IO routines.

Change the corresponding page release calls from put_page() to
unpin_user_page().

Change bio_release_pages() to handle FOLL_PIN pages. In fact, after this
patch, that is the *only* type of pages that bio_release_pages()
handles.

Design notes


Quite a few approaches have been considered over the years. This one is
inspired by Christoph Hellwig's July, 2019 observation that there are
only 5 ITER_ types, and we can simplify handling of them for Direct IO
[1]. Accordingly, this patch implements the following pseudocode:

Direct IO behavior:

ITER_IOVEC:
pin_user_pages_fast();
break;

ITER_PIPE:
for each page:
 pin_page();
break;

ITER_KVEC:    // already elevated page refcount, leave alone
ITER_BVEC:    // already elevated page refcount, leave alone
ITER_DISCARD: // discard
return -EFAULT or -EINVAL;

...which works for callers that already have sorted out which case they
are in. Such as, Direct IO in the block/bio layers.

Note that this does leave ITER_KVEC and ITER_BVEC unconverted, for now.
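
In approximate C, that dispatch is shaped like this. This is only a
sketch of an iov_iter_pin_pages() implementation; the two helpers
called below are invented names for "do the per-type work":

	ssize_t iov_iter_pin_pages(struct iov_iter *i, struct page **pages,
				   size_t maxsize, unsigned int maxpages,
				   size_t *start)
	{
		/* ITER_KVEC, ITER_BVEC: already elevated. ITER_DISCARD: N/A. */
		if (iov_iter_is_kvec(i) || iov_iter_is_bvec(i) ||
		    iov_iter_is_discard(i))
			return -EFAULT;

		/* ITER_PIPE: pin_page() on each page already in the pipe: */
		if (iov_iter_is_pipe(i))
			return pin_pipe_pages(i, pages, maxsize, start);

		/* ITER_IOVEC: user memory, acquired via pin_user_pages_fast(): */
		return pin_iovec_pages(i, pages, maxsize, maxpages, start);
	}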

Page acquisition: The iov_iter_get_pages*() routines above are at just
the right level in the call stack: the callers already know which system
to use, and so it's a small change to just drop in the replacement
routines. And it's a fan-in/fan-out point: block/bio call sites for
Direct IO funnel their page acquisitions through the
iov_iter_get_pages*() routines, and there are many other callers of
those. And we can't convert all of the callers at once--too many
subsystems are involved, and the patch would be too large and too
risky.

Page release: there are already separate release routines: put_page()
vs. unpin_user_page(), so it's already done there.
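
For the block layer specifically, the converted bio_release_pages()
ends up looking approximately like this (a sketch; flag handling and
other details are trimmed):

	void bio_release_pages(struct bio *bio, bool mark_dirty)
	{
		struct bvec_iter_all iter_all;
		struct bio_vec *bvec;

		bio_for_each_segment_all(bvec, bio, iter_all) {
			if (mark_dirty && !PageCompound(bvec->bv_page))
				set_page_dirty_lock(bvec->bv_page);
			/* was: put_page(bvec->bv_page); */
			unpin_user_page(bvec->bv_page);
		}
	}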

[1] https://lore.kernel.org/kvm/20190724061750.ga19...@infradead.org/

[2] "Explicit pinning of user-space pages":
https://lwn.net/Articles/807108/



John Hubbard (3):
  mm/gup: introduce pin_page()
  iov_iter: introduce iov_iter_pin_pages*() routines
  bio: convert get_user_pages_fast() --> pin_user_pages_fast()

 block/bio.c  |  24 -
 block/blk-map.c  |   6 +--
 fs/direct-io.c   |  28 +--
 fs/iomap/direct-io.c |   2 +-
 include/linux/mm.h   |   2 +
 include/linux/uio.h  |   5 ++
 lib/iov_iter.c   | 113 ---
 mm/gup.c |  33 +
 8 files changed, 175 insertions(+), 38 deletions(-)

-- 
2.28.0



[PATCH] mm, dump_page: rename head_mapcount() --> head_compound_mapcount()

2020-08-07 Thread John Hubbard
And similarly, rename head_pincount() --> head_compound_pincount().
These names are more accurate (or less misleading) than the original
ones.

Cc: Qian Cai 
Cc: Matthew Wilcox 
Cc: Vlastimil Babka 
Cc: Kirill A. Shutemov 
Signed-off-by: John Hubbard 
---

Hi,

This is a follow-up patch to v2 of "mm, dump_page: do not crash with bad 
compound_mapcount()", which I see has already been sent as part of 
Andrew's pull request. Otherwise I would have sent a v3.

Of course, if it's somehow not too late, then squashing this patch into 
that one, effectively creating a v3 with the preferred names, would be a 
nice touch.

thanks,
John Hubbard

 include/linux/mm.h | 8 
 mm/debug.c | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ab941cf73f4..98d379d9806f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -776,7 +776,7 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t 
flags)
 extern void kvfree(const void *addr);
 extern void kvfree_sensitive(const void *addr, size_t len);
 
-static inline int head_mapcount(struct page *head)
+static inline int head_compound_mapcount(struct page *head)
 {
return atomic_read(compound_mapcount_ptr(head)) + 1;
 }
@@ -790,7 +790,7 @@ static inline int compound_mapcount(struct page *page)
 {
VM_BUG_ON_PAGE(!PageCompound(page), page);
page = compound_head(page);
-   return head_mapcount(page);
+   return head_compound_mapcount(page);
 }
 
 /*
@@ -903,7 +903,7 @@ static inline bool hpage_pincount_available(struct page 
*page)
return PageCompound(page) && compound_order(page) > 1;
 }
 
-static inline int head_pincount(struct page *head)
+static inline int head_compound_pincount(struct page *head)
 {
return atomic_read(compound_pincount_ptr(head));
 }
@@ -912,7 +912,7 @@ static inline int compound_pincount(struct page *page)
 {
VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
page = compound_head(page);
-   return head_pincount(page);
+   return head_compound_pincount(page);
 }
 
 static inline void set_compound_order(struct page *page, unsigned int order)
diff --git a/mm/debug.c b/mm/debug.c
index 69b60637112b..a0c060abf1e7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -102,12 +102,12 @@ void __dump_page(struct page *page, const char *reason)
if (hpage_pincount_available(page)) {
pr_warn("head:%p order:%u compound_mapcount:%d 
compound_pincount:%d\n",
head, compound_order(head),
-   head_mapcount(head),
-   head_pincount(head));
+   head_compound_mapcount(head),
+   head_compound_pincount(head));
} else {
pr_warn("head:%p order:%u compound_mapcount:%d\n",
head, compound_order(head),
-   head_mapcount(head));
+   head_compound_mapcount(head));
}
}
if (PageKsm(page))
-- 
2.28.0



Re: [PATCH v2] mm, dump_page: do not crash with bad compound_mapcount()

2020-08-07 Thread John Hubbard

On 8/7/20 9:48 AM, Kirill A. Shutemov wrote:

[...]

+static inline int head_mapcount(struct page *head)
+{


Do we want VM_BUG_ON_PAGE(!PageHead(head), head) here?


Well, no.  That was the point of the bug report -- by the time we called
compound_mapcount, the page was no longer a head page.


Right. VM_BUG_ON_PAGE(PageTail(head), head)?


Sorry for overlooking that feedback point. Looking at it now, I'd much
rather not put any assertions at all here. This supposed to be for
implementing the failure case, and we've moved past assertions at this
point. In other words, dump_page() is part of *implementing* an
assertion, so calling assertions from within it is undesirable.

It's better to put the assertions in code that would call these inner
routines. Then, if we want to use these routines outside of mm/debug.c,
as per the other thread, then we should provide a wrapper that has such
assertions.
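
To be concrete, such a wrapper might look approximately like this
(sketch only; the "_checked" name is invented here):

	/* For use outside of mm/debug.c: assert first, then call the
	 * raw routine: */
	static inline int head_mapcount_checked(struct page *head)
	{
		VM_BUG_ON_PAGE(PageTail(head), head);
		return head_mapcount(head);
	}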


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH] mm/gup_benchmark: use pin_user_pages for FOLL_LONGTERM flag

2020-08-15 Thread John Hubbard

On 8/15/20 5:20 AM, Barry Song wrote:

According to Documentation/core-api/pin_user_pages.rst, FOLL_PIN is a
prerequisite to FOLL_LONGTERM. Another way of saying that is,
FOLL_LONGTERM is a specific, more restrictive case of FOLL_PIN.

Almost all kernel modules are using pin_user_pages() with FOLL_LONGTERM,
mm/gup_benchmark.c seems to be the only exception in which FOLL_PIN is not
a prerequisite to FOLL_LONGTERM.

Cc: John Hubbard 
Cc: Jan Kara 
Cc: Jérôme Glisse 
Cc: "Matthew Wilcox (Oracle)" 
Cc: Al Viro 
Cc: Christoph Hellwig 
Cc: Dan Williams 
Cc: Dave Chinner 
Cc: Jason Gunthorpe 
Cc: Jonathan Corbet 
Cc: Michal Hocko 
Cc: Mike Kravetz 
Cc: Shuah Khan 
Cc: Vlastimil Babka 
Signed-off-by: Barry Song 
---
  mm/gup_benchmark.c | 23 +++---
  tools/testing/selftests/vm/gup_benchmark.c | 14 ++---
  2 files changed, 19 insertions(+), 18 deletions(-)

			nr = get_user_pages(addr, nr, gup->flags, pages + i,
					    NULL);
@@ -118,6 +114,11 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
+   case PIN_LONGTERM_BENCHMARK:
+   nr = pin_user_pages(addr, nr,
+   gup->flags | FOLL_LONGTERM,
+   pages + i, NULL);
+   break;
default:
kvfree(pages);
ret = -EINVAL;
@@ -162,10 +163,10 @@ static long gup_benchmark_ioctl(struct file *filep, 
unsigned int cmd,
  
  	switch (cmd) {

case GUP_FAST_BENCHMARK:
-   case GUP_LONGTERM_BENCHMARK:
case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
case PIN_BENCHMARK:
+   case PIN_LONGTERM_BENCHMARK:
break;
default:
return -EINVAL;
diff --git a/tools/testing/selftests/vm/gup_benchmark.c 
b/tools/testing/selftests/vm/gup_benchmark.c
index 43b4dfe161a2..31f8bb086907 100644
--- a/tools/testing/selftests/vm/gup_benchmark.c
+++ b/tools/testing/selftests/vm/gup_benchmark.c
@@ -15,12 +15,12 @@
  #define PAGE_SIZE sysconf(_SC_PAGESIZE)
  
  #define GUP_FAST_BENCHMARK	_IOWR('g', 1, struct gup_benchmark)

-#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define GUP_BENCHMARK  _IOWR('g', 3, struct gup_benchmark)
+#define GUP_BENCHMARK  _IOWR('g', 2, struct gup_benchmark)
  
  /* Similar to above, but use FOLL_PIN instead of FOLL_GET. */

-#define PIN_FAST_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_BENCHMARK  _IOWR('g', 5, struct gup_benchmark)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
+#define PIN_BENCHMARK  _IOWR('g', 4, struct gup_benchmark)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
  
  /* Just the flags we need, copied from mm.h: */

  #define FOLL_WRITE 0x01  /* check pte is writable */
@@ -52,6 +52,9 @@ int main(int argc, char **argv)
case 'b':
cmd = PIN_BENCHMARK;
break;
+   case 'L':
+   cmd = PIN_LONGTERM_BENCHMARK;
+   break;
case 'm':
size = atoi(optarg) * MB;
break;
@@ -67,9 +70,6 @@ int main(int argc, char **argv)
case 'T':
thp = 0;
break;
-   case 'L':
-   cmd = GUP_LONGTERM_BENCHMARK;
-   break;
case 'U':
cmd = GUP_BENCHMARK;
break;





Re: [linux-next PATCH] rapidio: Fix error handling path

2020-09-18 Thread John Hubbard

On 9/17/20 7:21 PM, Souptick Joarder wrote:

On Thu, Sep 17, 2020 at 11:17 PM John Hubbard  wrote:

...

I sort of feel like returning partial successes is not working.  We
could easily make a wrapper which either pins everything or it returns
an error code.


Yes we could. And I have the same feeling about this API. It has generated a
remarkable number of bug fixes, several of which ended up being partial or
wrong in themselves. And mostly this is due to the complicated tristate
return code: instead of 0 or -ERRNO, it can also return "N pages, which is
less than what you requested", and there are no standard helpers in the
kernel to make that easier to deal with.


There was some discussion on removing return value 0 from one of the
gup variants [1].
I think it might be partially relevant to the current discussion.

[1] https://patchwork.kernel.org/patch/11529795/



Yes, although as I mentioned above, I'm thinking of a 0 or -ERRNO return value,
and not even returning nr_pages at all.

But in any case, as a practical matter, I'm not sure if it's a good idea to
actually change all the callsites, or not. If we just fix the remaining buggy
callers, maybe that's better than the churn associated with another API change.
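
For the record, the kind of wrapper being discussed might be roughly
this (the name is invented, and this is only a sketch):

	/* Pin exactly nr_pages, with 0-or-negative-errno semantics: */
	static int pin_user_pages_exact(unsigned long start,
					unsigned long nr_pages,
					unsigned int gup_flags,
					struct page **pages)
	{
		long ret = pin_user_pages_fast(start, nr_pages, gup_flags,
					       pages);

		if (ret == nr_pages)
			return 0;
		if (ret >= 0) {
			/* Partial success: undo it, report plain failure. */
			unpin_user_pages(pages, ret);
			return -EFAULT;
		}
		return ret;
	}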

On the other-other hand, there does seem to be more churn coming anyway, with
talk of actually doing a [get|pin]_user_bvec(), for example. So maybe it's 
better
to head off the coming mess.

This is something that should be discussed on linux-mm.





I guess the question is are there drivers which will keep working (or limp
along?) on partial pins?  A quick search of a driver I thought did this does
not apparently any more...  So it sounds good to me from 30,000 feet!  :-D


It sounds good to me too--and from just a *few hundred feet* (having touched 
most
of the call sites at some point)! haha :)

I think the wrapper should be short-term, though, just until all the callers
are converted to the simpler API. Then change the core gup/pup calls to the 
simpler
API. There are more than enough gup/pup API entry points as it is, that's for 
sure.


thanks,
--
John Hubbard
NVIDIA




Re: [PATCH 1/4] mm: Trial do_wp_page() simplification

2020-09-18 Thread John Hubbard

On 9/18/20 1:40 PM, Peter Xu wrote:

On Fri, Sep 18, 2020 at 02:32:40PM -0300, Jason Gunthorpe wrote:

On Fri, Sep 18, 2020 at 12:40:32PM -0400, Peter Xu wrote:


Firstly in the draft patch mm->has_pinned is introduced and it's written to 1
as long as FOLL_PIN is called once.  It's never reset after set.


Worth thinking about also adding FOLL_LONGTERM here, at last as long
as it is not a counter. That further limits the impact.


But theoretically we should also trigger COW here for pages even with PIN &&
!LONGTERM, am I right?  Assuming that FOLL_PIN is already a corner case.



This note, plus Linus' comment about "I'm a normal process, I've never
done any special rdma page pinning", has me a little worried. Because
page_maybe_dma_pinned() is counting both short- and long-term pins,
actually. And that includes O_DIRECT callers.

O_DIRECT pins are short-term, and RDMA systems are long-term (and should
be setting FOLL_LONGTERM). But there's no way right now to discern
between them, once the initial pin_user_pages*() call is complete. All
we can do today is to count the number of FOLL_PIN calls, not the number
of FOLL_PIN | FOLL_LONGTERM calls.
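
For reference, the check itself is approximately the following, and
nothing in it can tell a FOLL_LONGTERM pin apart from a short-term one:

	static inline bool page_maybe_dma_pinned(struct page *page)
	{
		if (hpage_pincount_available(page))
			return compound_pincount(page) > 0;

		/*
		 * Each FOLL_PIN adds GUP_PIN_COUNTING_BIAS to the refcount,
		 * so this is a heuristic: false positives are possible.
		 */
		return ((unsigned int)page_ref_count(compound_head(page))) >=
			GUP_PIN_COUNTING_BIAS;
	}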

The reason it's that way is that writeback and such can experience
problems regardless of the duration of the pin. There are ideas about
how to deal with the pins and the filesystem (layout leases...), but
there is still disagreement, which is why there are basically no
page_maybe_dma_pinned() callers yet.

Although I think we're getting closer to using it. There was a recent
attempt at using this stuff, from Chris Wilson. [1]


[1] 
https://lore.kernel.org/intel-gfx/20200624191417.16735-1-chris%40chris-wilson.co.uk/


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 5/5] mm/thp: Split huge pmds/puds if they're pinned when fork()

2020-09-23 Thread John Hubbard

On 9/23/20 8:44 AM, Peter Xu wrote:

On Wed, Sep 23, 2020 at 04:01:14PM +0200, Jan Kara wrote:

On Wed 23-09-20 09:50:04, Peter Xu wrote:

...

But the problem is that if you apply mm->has_pinned check on file pages,
you can get false negatives now. And that's not acceptable...


Do you mean the case where proc A pinned page P from a file, then proc B
mapped the same page P on the file, then fork() on proc B?


Yes.


aha, thanks for spelling out the false negative problem.




If proc B didn't explicitly pinned page P in B's address space too,
shouldn't we return "false" for page_likely_dma_pinned(P)?  Because if
proc B didn't pin the page in its own address space, I'd think it's ok to
get the page replaced at any time as long as the content keeps the same.
Or couldn't we?


So it depends on the reason why you call page_likely_dma_pinned(). For your
COW purposes the check is correct but e.g. for "can filesystem safely
writeback this page" the page_likely_dma_pinned() would be wrong. So I'm
not objecting to the mechanism as such. I'm mainly objecting to the generic
function name which suggests something else than what it really checks and
thus it could be used in wrong places in the future... That's why I'd
prefer to restrict the function to PageAnon pages where there's no risk of
confusion what the check actually does.


How about I introduce the helper as John suggested, but rename it to

   page_maybe_dma_pinned_by_mm()

?

Then we also don't need to judge on which is more likely to happen (between
"maybe" and "likely", since that will confuse me if I only read these words..).



You're right, it is too subtle of a distinction after all. I agree that sticking
with "_maybe_" avoids that confusion.



I didn't use any extra suffix like "cow" because I think it might be useful for
things besides cow.  Fundamentally the new helper will be mm-based, so "by_mm"
seems to suite better to me.

Does that sound ok?



Actually, Jan nailed it. I just wasn't understanding his scenario, but now that
I do, and considering your other point about wording, I think we end up with:

anon_page_maybe_pinned()

as a pretty good name for a helper function. (We don't want "_mm" because that
refers more to the mechanism used internally, rather than the behavior of the
function. "anon_" adds more meaning.)

...now I better go and try to grok what Jason is recommending for the new
meaning of FOLL_PIN, in another tributary of this thread. I don't *think* it 
affects
this naming point, though. :)

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 1/2] mm: reorganize internal_get_user_pages_fast()

2020-10-28 Thread John Hubbard

On 10/27/20 6:15 AM, Jason Gunthorpe wrote:

On Tue, Oct 27, 2020 at 10:33:01AM +0100, Jan Kara wrote:

On Fri 23-10-20 21:44:17, John Hubbard wrote:

On 10/23/20 5:19 PM, Jason Gunthorpe wrote:

+   start += (unsigned long)nr_pinned << PAGE_SHIFT;
+   pages += nr_pinned;
+   ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
+ pages);
+   if (ret < 0) {
/* Have to be a bit careful with return values */


...and can we move that comment up one level, so that it reads:

/* Have to be a bit careful with return values */
if (ret < 0) {
if (nr_pinned)
return nr_pinned;
return ret;
}
return ret + nr_pinned;

Thinking about this longer term, it would be nice if the whole gup/pup API
set just stopped pretending that anyone cares about partial success, because
they *don't*. If we had return values of "0 or -ERRNO" throughout, and an
additional set of API wrappers that did some sort of limited retry just like
some of the callers do, that would be a happier story.


Actually there are callers that care about partial success. See e.g.
iov_iter_get_pages() usage in fs/direct_io.c:dio_refill_pages() or
bio_iov_iter_get_pages(). These places handle partial success just fine and
not allowing partial success from GUP could regress things...


I looked through a bunch of call sites, and there are a wack that


So did I! :)


actually do only want a complete return and are carrying a bunch of
code to fix it:

	pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
	if (!pvec)
		return -ENOMEM;

	do {
		unsigned num_pages = npages - pinned;
		uint64_t ptr = userptr->ptr + pinned * PAGE_SIZE;
		struct page **pages = pvec + pinned;

		ret = pin_user_pages_fast(ptr, num_pages,
					  !userptr->ro ? FOLL_WRITE : 0, pages);
		if (ret < 0) {
			unpin_user_pages(pvec, pinned);
			kvfree(pvec);
			return ret;
		}

		pinned += ret;

	} while (pinned < npages);

Is really a lot better if written as:

	pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL);
	if (!pvec)
		return -ENOMEM;
	ret = pin_user_pages_fast(userptr->ptr, npages, FOLL_COMPLETE |
				  (!userptr->ro ? FOLL_WRITE : 0),
				  pvec);
	if (ret) {
		kvfree(pvec);
		return ret;
	}

(eg FOLL_COMPLETE says to return exactly npages or fail)



Yes, exactly. And if I reverse the polarity (to Christoph's FOLL_PARTIAL, 
instead
of FOLL_COMPLETE) it's even smaller, slightly. Which is where I am leaning now.




Some code assumes things work that way already anyhow:

	/* Pin user pages for DMA Xfer */
	err = pin_user_pages_unlocked(user_dma.uaddr, user_dma.page_count,
			dma->map, FOLL_FORCE);

	if (user_dma.page_count != err) {
		IVTV_DEBUG_WARN("failed to map user pages, returned %d instead of %d\n",
				err, user_dma.page_count);
		if (err >= 0) {
			unpin_user_pages(dma->map, err);
			return -EINVAL;
		}
		return err;
	}

Actually I'm quite surprised I didn't find too many missing the tricky
unpin_user_pages() on the error path - eg
videobuf_dma_init_user_locked() is wrong.



Well. That's not accidental. "Some People" (much thanks to Souptick Joarder, 
btw) have
been fixing up many of those sites, during the pin_user_pages() conversions. 
Otherwise
you would have found about 10 or 15 more.

I'll fix up that one above (using your Reported-by, I assume), unless someone 
else is
already taking care of it.


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 1/2] mm: reorganize internal_get_user_pages_fast()

2020-10-28 Thread John Hubbard

On 10/27/20 11:00 PM, John Hubbard wrote:

On 10/27/20 6:15 AM, Jason Gunthorpe wrote:

On Tue, Oct 27, 2020 at 10:33:01AM +0100, Jan Kara wrote:

On Fri 23-10-20 21:44:17, John Hubbard wrote:

On 10/23/20 5:19 PM, Jason Gunthorpe wrote:

...


I'll fix up that one above (using your Reported-by, I assume), unless someone 
else is
already taking care of it.



ummm, strike that--that won't need fixing, it will pick up the fix 
automatically.
Arghh. Let me blame this on the late hour. haha :)


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 1/2] mm: reorganize internal_get_user_pages_fast()

2020-10-28 Thread John Hubbard

On 10/27/20 2:55 AM, Christoph Hellwig wrote:

On Tue, Oct 27, 2020 at 10:33:01AM +0100, Jan Kara wrote:

Actually there are callers that care about partial success. See e.g.
iov_iter_get_pages() usage in fs/direct_io.c:dio_refill_pages() or
bio_iov_iter_get_pages(). These places handle partial success just fine and
not allowing partial success from GUP could regress things...


Good point. And those also happen to be the key call sites that I haven't
yet converted to pin_user_pages*(). Seeing as how I'm three versions into
attempting to convert the various *iov_iter*() routines, I should have
remembered that they are all about partial success. :)



But most users do indeed not care.  Maybe an explicit FOLL_PARTIAL to
opt into partial handling could clean up a lot of the mess.  Maybe just
for pin_user_pages for now.



That does seem like the perfect mix. IIRC, all of the pin_user_pages()
call sites today do not accept partial success (and it's easy enough to
audit and confirm). So likely no need to add FOLL_PARTIAL there, and no
huge danger of regressions. It would definitely reduce the line count at
multiple call sites, in return for adding some lines to gup.c.
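
The gup.c side would be small, something approximately like this
(FOLL_PARTIAL is hypothetical, of course, and this is only a sketch):

	/* Near the end of the pin_user_pages*() paths: */
	if (!(gup_flags & FOLL_PARTIAL) && nr_pinned > 0 &&
	    nr_pinned < nr_pages) {
		/* All-or-nothing behavior for the common callers: */
		unpin_user_pages(pages, nr_pinned);
		return -EFAULT;
	}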

And maybe it can go further, at some point, but that's a good way to start.

I'm leaning toward just sending out a small series to do that, unless there
are objections and/or better ways to improve this area...


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v5 05/15] mm/frame-vector: Use FOLL_LONGTERM

2020-11-04 Thread John Hubbard

On 11/4/20 10:17 AM, Jason Gunthorpe wrote:

On Wed, Nov 04, 2020 at 04:41:19PM +, Christoph Hellwig wrote:

On Wed, Nov 04, 2020 at 04:37:58PM +, Christoph Hellwig wrote:

On Wed, Nov 04, 2020 at 05:26:58PM +0100, Daniel Vetter wrote:

What we're discussing is whether gup_fast and pup_fast also obey this,
or fall over and can give you the struct page that's backing the
dma_mmap_* memory. Since the _fast variant doesn't check for
vma->vm_flags, and afaict that's the only thing which closes this gap.
And like you restate, that would be a bit a problem. So where's that
check which Jason aren't spotting?


remap_pte_range uses pte_mkspecial to set up the PTEs, and gup_pte_range
errors out on pte_special.  Of course this only works for the
CONFIG_ARCH_HAS_PTE_SPECIAL case, for other architectures we do have
a real problem.


Except that we don't really support pte-level gup-fast without
CONFIG_ARCH_HAS_PTE_SPECIAL, and in fact all architectures selecting
HAVE_FAST_GUP also select ARCH_HAS_PTE_SPECIAL, so we should be fine.


Mm, I thought it was probably the special flag..

Knowing that CONFIG_HAVE_FAST_GUP can't be set without
CONFIG_ARCH_HAS_PTE_SPECIAL is pretty insightful, can we put that in
the Kconfig?

config HAVE_FAST_GUP
 depends on MMU
 depends on ARCH_HAS_PTE_SPECIAL
 bool


Well, the !CONFIG_ARCH_HAS_PTE_SPECIAL case points out in a comment that
gup-fast is not *completely* unavailable there, so I don't think you want
to shut it off like that:

/*
 * If we can't determine whether or not a pte is special, then fail immediately
 * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
 * to be special.
 *
 * For a futex to be placed on a THP tail page, get_futex_key requires a
 * get_user_pages_fast_only implementation that can pin pages. Thus it's still
 * useful to have gup_huge_pmd even if we can't operate on ptes.
 */
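
(For anyone following along: the gate being discussed is the
pte_special() bail-out in gup_pte_range(), approximately this:)

	/* In gup_pte_range(), with CONFIG_ARCH_HAS_PTE_SPECIAL: */
	if (pte_special(pte))
		goto pte_unmap;	/* give up; caller falls back to slow gup */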


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 5/5] mm/thp: Split huge pmds/puds if they're pinned when fork()

2020-09-22 Thread John Hubbard

On 9/21/20 2:20 PM, Peter Xu wrote:
...

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7ff29cc3d55c..c40aac0ad87e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1074,6 +1074,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct 
mm_struct *src_mm,
  
  	src_page = pmd_page(pmd);

VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+   /*
+* If this page is a potentially pinned page, split and retry the fault
+* with smaller page size.  Normally this should not happen because the
+* userspace should use MADV_DONTFORK upon pinned regions.  This is a
+* best effort that the pinned pages won't be replaced by another
+* random page during the coming copy-on-write.
+*/
+   if (unlikely(READ_ONCE(src_mm->has_pinned) &&
+page_maybe_dma_pinned(src_page))) {


This condition would make a good static inline function. It's used in 3 places,
and the condition is quite special and worth documenting, and having a separate
function helps with that, because the function name adds to the story. I'd 
suggest
approximately:

page_likely_dma_pinned()

for the name.


+   pte_free(dst_mm, pgtable);
+   spin_unlock(src_ptl);
+   spin_unlock(dst_ptl);
+   __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+   return -EAGAIN;
+   }



Why wait until we are so deep into this routine to detect this and unwind?
It seems like if you could do a check near the beginning of this routine, and
handle it there, with less unwinding? In fact, after taking only the src_ptl,
the check could be made, right?
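
And just to spell out the helper suggestion above, approximately (a
sketch; the final name and signature are still open):

	static inline bool page_likely_dma_pinned(struct mm_struct *mm,
						  struct page *page)
	{
		/*
		 * Both parts are hints: mm->has_pinned is sticky, and the
		 * refcount check can yield false positives.
		 */
		return READ_ONCE(mm->has_pinned) &&
		       page_maybe_dma_pinned(page);
	}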


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 3/5] mm: Rework return value for copy_one_pte()

2020-09-22 Thread John Hubbard

On 9/21/20 2:17 PM, Peter Xu wrote:

There's one special path for copy_one_pte() with swap entries, in which
add_swap_count_continuation(GFP_ATOMIC) might fail.  In that case we'll return


I might be looking at the wrong place, but the existing code seems to call
add_swap_count_continuation(GFP_KERNEL), not with GFP_ATOMIC?


the swp_entry_t so that the caller will release the locks and redo the same
thing with GFP_KERNEL.

It's confusing when copy_one_pte() must return a swp_entry_t (even if all the
ptes are non-swap entries).  More importantly, we face other requirement to
extend this "we need to do something else, but without the locks held" case.

Rework the return value into something easier to understand, as defined in enum
copy_mm_ret.  We'll pass the swp_entry_t back using the newly introduced union


I like the documentation here, but it doesn't match what you did in the patch.
Actually, the documentation had the right idea (enum, rather than #define, for
COPY_MM_* items). Below...


copy_mm_data parameter.

Another trivial change is to move the reset of the "progress" counter into the
retry path, so that we'll reset it for other reasons too.

This should prepare us with adding new return codes, very soon.

Signed-off-by: Peter Xu 
---
  mm/memory.c | 42 +-
  1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 7525147908c4..1530bb1070f4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -689,16 +689,24 @@ struct page *vm_normal_page_pmd(struct vm_area_struct 
*vma, unsigned long addr,
  }
  #endif
  
+#define  COPY_MM_DONE   0
+#define  COPY_MM_SWAP_CONT  1


Those should be enums, so as to get a little type safety and other goodness from
using non-macro items.
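
That is, approximately:

	enum copy_mm_ret {
		COPY_MM_DONE,
		COPY_MM_SWAP_CONT,
	};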

...

@@ -866,13 +877,18 @@ static int copy_pte_range(struct mm_struct *dst_mm, 
struct mm_struct *src_mm,
pte_unmap_unlock(orig_dst_pte, dst_ptl);
cond_resched();
  
-	if (entry.val) {

-   if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+   switch (copy_ret) {
+   case COPY_MM_SWAP_CONT:
+   if (add_swap_count_continuation(data.entry, GFP_KERNEL) < 0)
return -ENOMEM;
-   progress = 0;


Yes. Definitely a little cleaner to reset this above, instead of here.


+   break;
+   default:
+   break;


I assume this no-op noise is to placate the compiler and/or static checkers. :)

I'm unable to find any actual problems with the diffs, aside from the nit about
using an enum.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 1/5] mm: Introduce mm_struct.has_pinned

2020-09-22 Thread John Hubbard

On 9/22/20 8:17 AM, Peter Xu wrote:

On Mon, Sep 21, 2020 at 04:53:38PM -0700, John Hubbard wrote:

On 9/21/20 2:17 PM, Peter Xu wrote:

(Commit message collected from Jason Gunthorpe)

Reduce the chance of false positive from page_maybe_dma_pinned() by keeping


Not yet, it doesn't. :)  More:


track if the mm_struct has ever been used with pin_user_pages(). mm_structs
that have never been passed to pin_user_pages() cannot have a positive
page_maybe_dma_pinned() by definition. This allows cases that might drive up
the page ref_count to avoid any penalty from handling dma_pinned pages.

Due to complexities with unpining this trivial version is a permanent sticky
bit, future work will be needed to make this a counter.


How about this instead:

Subsequent patches intend to reduce the chance of false positives from
page_maybe_dma_pinned(), by also considering whether or not a page has
even been part of an mm struct that has ever had pin_user_pages*()
applied to any of its pages.

In order to allow that, provide a boolean value (even though it's not
implemented exactly as a boolean type) within the mm struct, that is
simply set once and never cleared. This will suffice for an early, rough
implementation that fixes a few problems.

Future work is planned, to provide a more sophisticated solution, likely
involving a counter, and *not* involving something that is set and never
cleared.


This looks good, thanks.  Though I think Jason's version is good too (as long
as we remove the confusing sentence, that's the one starting with "mm_structs
that have never been passed... ").  Before I drop Jason's version, I think I'd
better figure out what's the major thing we missed so that maybe we can add
another paragraph.  E.g., "future work will be needed to make this a counter"
already means "involving a counter, and *not* involving something that is set
and never cleared" to me... Because otherwise it won't be called a counter..



That's just a bit of harmless redundancy, intended to help clarify where this
is going. But if the redundancy isn't actually helping, you could simply
truncate it to the first half of the sentence, like this:

"Future work is planned, to provide a more sophisticated solution, likely
involving a counter."


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 1/5] mm: Introduce mm_struct.has_pinned

2020-09-22 Thread John Hubbard

On 9/22/20 8:17 AM, Peter Xu wrote:

On Mon, Sep 21, 2020 at 04:53:38PM -0700, John Hubbard wrote:

On 9/21/20 2:17 PM, Peter Xu wrote:

...

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 496c3ff97cce..6f291f8b74c6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -441,6 +441,16 @@ struct mm_struct {
   #endif
int map_count;  /* number of VMAs */
+   /**
+* @has_pinned: Whether this mm has pinned any pages.  This can
+* be either replaced in the future by @pinned_vm when it
+* becomes stable, or grow into a counter on its own. We're
+* aggressive on this bit now - even if the pinned pages were
+* unpinned later on, we'll still keep this bit set for the
+* lifecycle of this mm just for simplicity.
+*/
+   int has_pinned;


I think this would be elegant as an atomic_t, and using atomic_set() and
atomic_read(), which seem even more self-documenting that what you have here.

But it's admittedly a cosmetic point, combined with my perennial fear that
I'm missing something when I look at a READ_ONCE()/WRITE_ONCE() pair. :)


Yeah but I hope I'm using it right.. :) I used READ_ONCE/WRITE_ONCE explicitly
because I think they're cheaper than atomic operations, (which will, iiuc, lock
the bus).



The "cheaper" argument vanishes if the longer-term plan is to use atomic ops.
Given the intent of this patchset, simpler is better, and "simpler that has the
same performance as the longer term solution" is definitely OK.



It's completely OK to just ignore this comment, but I didn't want to completely
miss the opportunity to make it a tiny bit cleaner to the reader.


This can always become an atomic in the future, or am I wrong?  Actually if
we're going to the counter way I feel like it's a must.



Yes it can change. But you can get the simplicity and clarity now, rather
than waiting.
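
Just to be concrete, the suggestion amounts to this (sketch):

	/* In struct mm_struct: */
	atomic_t has_pinned;

	/* Writer side, in the pin_user_pages*() paths: */
	atomic_set(&mm->has_pinned, 1);

	/* Reader side: */
	if (atomic_read(&mm->has_pinned) && page_maybe_dma_pinned(page))
		split = true;	/* or whatever the caller does here */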

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 5/5] mm/thp: Split huge pmds/puds if they're pinned when fork()

2020-09-22 Thread John Hubbard

On 9/22/20 3:33 AM, Jan Kara wrote:

On Mon 21-09-20 23:41:16, John Hubbard wrote:

On 9/21/20 2:20 PM, Peter Xu wrote:
...

+   if (unlikely(READ_ONCE(src_mm->has_pinned) &&
+page_maybe_dma_pinned(src_page))) {


This condition would make a good static inline function. It's used in 3
places, and the condition is quite special and worth documenting, and
having a separate function helps with that, because the function name
adds to the story. I'd suggest approximately:

 page_likely_dma_pinned()

for the name.


Well, but we should also capture that this really only works for anonymous
pages. For file pages mm->has_pinned does not work because the page may be
still pinned by completely unrelated process as Jann already properly
pointed out earlier in the thread. So maybe anon_page_likely_pinned()?
Possibly also assert PageAnon(page) in it if we want to be paranoid...

Honza


The file-backed case doesn't really change anything, though:
page_maybe_dma_pinned() is already a "fuzzy yes" in the same sense: you
can get a false positive. Just like here, with an mm->has_pinned that
could be a false positive for a process.

And for that reason, I'm also not sure an "assert PageAnon(page)" is
desirable. That assertion would prevent file-backed callers from being
able to call a function that provides a fuzzy answer, but I don't see
why you'd want or need to do that. The goal here is to make the fuzzy
answer a little bit more definite, but it's not "broken" just because
the result is still fuzzy, right?

Apologies if I'm missing a huge point here... :)


thanks,
--
John Hubbard
NVIDIA


[PATCH 1/8] mm/gup_benchmark: rename to mm/gup_test

2020-09-28 Thread John Hubbard
Rename nearly every "gup_benchmark" reference and file name to
"gup_test". The one exception is for the actual gup benchmark test
itself.

The current code already does a *little* bit more than benchmarking,
and definitely covers more than get_user_pages_fast(). More importantly,
however, subsequent patches are about to add some functionality that is
non-benchmark related.

Closely related changes:

* Kconfig: in addition to renaming the options from GUP_BENCHMARK to
  GUP_TEST, update the help text to reflect that it's no longer a
  benchmark-only test.

Signed-off-by: John Hubbard 
---
 Documentation/core-api/pin_user_pages.rst |  6 ++--
 arch/s390/configs/debug_defconfig |  2 +-
 arch/s390/configs/defconfig   |  2 +-
 mm/Kconfig| 15 +---
 mm/Makefile   |  2 +-
 mm/{gup_benchmark.c => gup_test.c}| 36 +--
 tools/testing/selftests/vm/.gitignore |  2 +-
 tools/testing/selftests/vm/Makefile   |  2 +-
 tools/testing/selftests/vm/config |  2 +-
 .../vm/{gup_benchmark.c => gup_test.c}| 16 -
 tools/testing/selftests/vm/run_vmtests|  8 ++---
 11 files changed, 49 insertions(+), 44 deletions(-)
 rename mm/{gup_benchmark.c => gup_test.c} (80%)
 rename tools/testing/selftests/vm/{gup_benchmark.c => gup_test.c} (85%)

diff --git a/Documentation/core-api/pin_user_pages.rst 
b/Documentation/core-api/pin_user_pages.rst
index 7ca8c7bac650..eae972b23224 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -221,12 +221,12 @@ Unit testing
 
 This file::
 
- tools/testing/selftests/vm/gup_benchmark.c
+ tools/testing/selftests/vm/gup_test.c
 
 has the following new calls to exercise the new pin*() wrapper functions:
 
-* PIN_FAST_BENCHMARK (./gup_benchmark -a)
-* PIN_BENCHMARK (./gup_benchmark -b)
+* PIN_FAST_BENCHMARK (./gup_test -a)
+* PIN_BENCHMARK (./gup_test -b)
 
 You can monitor how many total dma-pinned pages have been acquired and released
 since the system was booted, via two new /proc/vmstat entries: ::
diff --git a/arch/s390/configs/debug_defconfig 
b/arch/s390/configs/debug_defconfig
index 0784bf3caf43..c624f4b1ad33 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -100,7 +100,7 @@ CONFIG_ZSMALLOC_STAT=y
 CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
 CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_PACKET_DIAG=m
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 905bc8c4cfaf..878b89706998 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -94,7 +94,7 @@ CONFIG_ZSMALLOC_STAT=y
 CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
 CONFIG_IDLE_PAGE_TRACKING=y
 CONFIG_PERCPU_STATS=y
-CONFIG_GUP_BENCHMARK=y
+CONFIG_GUP_TEST=y
 CONFIG_NET=y
 CONFIG_PACKET=y
 CONFIG_PACKET_DIAG=m
diff --git a/mm/Kconfig b/mm/Kconfig
index 1896860cf23a..588984ee5fb4 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -834,13 +834,18 @@ config PERCPU_STATS
  information includes global and per chunk statistics, which can
  be used to help understand percpu memory usage.
 
-config GUP_BENCHMARK
-   bool "Enable infrastructure for get_user_pages() and related calls 
benchmarking"
+config GUP_TEST
+   bool "Enable infrastructure for get_user_pages()-related unit tests"
help
- Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages() and related calls.
+ Provides /sys/kernel/debug/gup_test, which in turn provides a way
+ to make ioctl calls that can launch kernel-based unit tests for
+ the get_user_pages*() and pin_user_pages*() family of API calls.
 
- See tools/testing/selftests/vm/gup_benchmark.c
+ These tests include benchmark testing of the _fast variants of
+ get_user_pages*() and pin_user_pages*(), as well as smoke tests of
+ the non-_fast variants.
+
+ See tools/testing/selftests/vm/gup_test.c
 
 config GUP_GET_PTE_LOW_HIGH
bool
diff --git a/mm/Makefile b/mm/Makefile
index cae063dc8298..21fc9af33f9b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -90,7 +90,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
 obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
 obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
-obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
+obj-$(CONFIG_GUP_TEST) += gup_test.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
 obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/gup_benchmark.c b/mm/gup_test.c
similarity index 80%
rename from mm/gup_benchmark.c
rename to mm/gup_test.c
index 464cae1fa3ea..10f41c0528de 100644
--- a/m

[PATCH 5/8] selftests/vm: only some gup_test items are really benchmarks

2020-09-28 Thread John Hubbard
Therefore, some minor cleanup and improvements are in order:

1. Rename the other items appropriately.

2. Stop reporting timing information on the non-benchmark items. It's
   still being recorded and is available, but there's no point in
   cluttering up the report with data that no one reasonably needs to
   check.

3. Don't do iterations, for non-benchmark items.

4. Print out a shorter, more appropriate report for the non-benchmark
   tests.

5. Add the command that was run, to the report. This really helps, as
   there are quite a lot of options now.

Signed-off-by: John Hubbard 
---
 Documentation/core-api/pin_user_pages.rst |  2 +-
 mm/gup_test.c | 14 +++
 mm/gup_test.h |  8 ++--
 tools/testing/selftests/vm/gup_test.c | 47 +++
 4 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/Documentation/core-api/pin_user_pages.rst 
b/Documentation/core-api/pin_user_pages.rst
index eae972b23224..fcf605be43d0 100644
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -226,7 +226,7 @@ This file::
 has the following new calls to exercise the new pin*() wrapper functions:
 
 * PIN_FAST_BENCHMARK (./gup_test -a)
-* PIN_BENCHMARK (./gup_test -b)
+* PIN_BASIC_TEST (./gup_test -b)
 
 You can monitor how many total dma-pinned pages have been acquired and released
 since the system was booted, via two new /proc/vmstat entries: ::
diff --git a/mm/gup_test.c b/mm/gup_test.c
index a3c86d0fdff7..a980c4a194f0 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -13,13 +13,13 @@ static void put_back_pages(unsigned int cmd, struct page 
**pages,
 
switch (cmd) {
case GUP_FAST_BENCHMARK:
-   case GUP_BENCHMARK:
+   case GUP_BASIC_TEST:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
break;
 
case PIN_FAST_BENCHMARK:
-   case PIN_BENCHMARK:
+   case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
@@ -34,7 +34,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page 
**pages,
 
switch (cmd) {
case PIN_FAST_BENCHMARK:
-   case PIN_BENCHMARK:
+   case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
page = pages[i];
@@ -87,7 +87,7 @@ static int __gup_test_ioctl(unsigned int cmd,
nr = get_user_pages_fast(addr, nr, gup->flags,
 pages + i);
break;
-   case GUP_BENCHMARK:
+   case GUP_BASIC_TEST:
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
@@ -95,7 +95,7 @@ static int __gup_test_ioctl(unsigned int cmd,
nr = pin_user_pages_fast(addr, nr, gup->flags,
 pages + i);
break;
-   case PIN_BENCHMARK:
+   case PIN_BASIC_TEST:
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
@@ -148,10 +148,10 @@ static long gup_test_ioctl(struct file *filep, unsigned 
int cmd,
 
switch (cmd) {
case GUP_FAST_BENCHMARK:
-   case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
-   case PIN_BENCHMARK:
case PIN_LONGTERM_BENCHMARK:
+   case GUP_BASIC_TEST:
+   case PIN_BASIC_TEST:
break;
default:
return -EINVAL;
diff --git a/mm/gup_test.h b/mm/gup_test.h
index 931c2f3f477a..921b4caad8ef 100644
--- a/mm/gup_test.h
+++ b/mm/gup_test.h
@@ -5,10 +5,10 @@
 #include <linux/types.h>
 
 #define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
-#define GUP_BENCHMARK  _IOWR('g', 2, struct gup_test)
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_test)
-#define PIN_BENCHMARK  _IOWR('g', 4, struct gup_test)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_test)
+#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test)
+#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
+#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
 
 struct gup_test {
__u64 get_delta_usec;
diff --git a/tools/testing/selftests/vm/gup_test.c 
b/tools/testing/selftests/vm/gup_test.c
index 4e9f5d0ed0fc..67d57a1cc8b6 100644
--- a/tools/testing/selftests/vm/gup_test.c
+++ b/tools/testing/selftests/vm/gup_test.c
@@ -14,12 +14,30 @@
 /* Just the flags we need, copied from mm.h: */
 #define FOLL_WRITE 0x01 /* check pte is writable */
 
+static char *cmd_to_str(unsigned long cmd)
+{
+   switch (cmd) {
+   case 

[PATCH 7/8] selftests/vm: run_vmtest.sh: update and clean up gup_test invocation

2020-09-28 Thread John Hubbard
Run benchmarks on the _fast variants of gup and pup, as originally
intended.

Run the new gup_test sub-test: dump pages. In addition to exercising the
dump_page() call, it also demonstrates the various options you can use
to specify which pages to dump, and how.

Signed-off-by: John Hubbard 
---
 tools/testing/selftests/vm/run_vmtest.sh | 24 ++--
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/tools/testing/selftests/vm/run_vmtest.sh 
b/tools/testing/selftests/vm/run_vmtest.sh
index d1843d5f3c30..e3a8b14d9df6 100755
--- a/tools/testing/selftests/vm/run_vmtest.sh
+++ b/tools/testing/selftests/vm/run_vmtest.sh
@@ -124,9 +124,9 @@ else
 fi
 
 echo ""
-echo "running 'gup_test -U' (normal/slow gup)"
+echo "running 'gup_test -u' (fast gup benchmark)"
 echo ""
-./gup_test -U
+./gup_test -u
 if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
@@ -134,10 +134,22 @@ else
echo "[PASS]"
 fi
 
-echo "--"
-echo "running gup_test -b (pin_user_pages)"
-echo "--"
-./gup_test -b
+echo "---"
+echo "running gup_test -a (pin_user_pages_fast benchmark)"
+echo "---"
+./gup_test -a
+if [ $? -ne 0 ]; then
+   echo "[FAIL]"
+   exitcode=1
+else
+   echo "[PASS]"
+fi
+
+echo "--"
+echo "running gup_test -ct -F 0x1 0 19 0x1000"
+echo "   Dumps pages 0, 19, and 4096, using pin_user_pages (-F 0x1)"
+echo "--"
+./gup_test -ct -F 0x1 0 19 0x1000
 if [ $? -ne 0 ]; then
echo "[FAIL]"
exitcode=1
-- 
2.28.0



[PATCH 3/8] selftests/vm: rename run_vmtests --> run_vmtests.sh

2020-09-28 Thread John Hubbard
Rename to *.sh, in order to match the conventions of all of the other
items in selftest/vm.

The only reason not to use a .sh suffix on a shell script like this
might be to make it look more like a normal program, but that's not an
issue here.

Signed-off-by: John Hubbard 
---
 tools/testing/selftests/vm/Makefile   | 2 +-
 tools/testing/selftests/vm/{run_vmtests => run_vmtest.sh} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename tools/testing/selftests/vm/{run_vmtests => run_vmtest.sh} (100%)

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index 9cc6bc087461..5a3bd0c497b6 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -69,7 +69,7 @@ TEST_GEN_FILES += virtual_address_range
 TEST_GEN_FILES += write_to_hugetlbfs
 endif
 
-TEST_PROGS := run_vmtests
+TEST_PROGS := run_vmtests.sh
 
 TEST_FILES := test_vmalloc.sh
 
diff --git a/tools/testing/selftests/vm/run_vmtests 
b/tools/testing/selftests/vm/run_vmtest.sh
similarity index 100%
rename from tools/testing/selftests/vm/run_vmtests
rename to tools/testing/selftests/vm/run_vmtest.sh
-- 
2.28.0



[PATCH 8/8] selftests/vm: hmm-tests: remove the libhugetlbfs dependency

2020-09-28 Thread John Hubbard
HMM selftests are incredibly useful, but they are only effective if
people actually build and run them. All the other tests in selftests/vm
can be built with very standard, always-available libraries: libpthread,
librt. The hmm-tests.c program, on the other hand, requires something
that is (much) less readily available: libhugetlbfs. And so the build
will typically fail for many developers.

A simple attempt to install libhugetlbfs will also run into
complications on some common distros these days: Fedora and Arch Linux
(yes, Arch AUR has it, but that's fragile, as always with AUR). The
library is not maintained actively enough at the moment, for distros to
deal with it. I had to build it from source, for Fedora, and that didn't
go too smoothly either.

It turns out that, out of 21 tests in hmm-tests.c, only 2 actually
require functionality from libhugetlbfs. Therefore, if libhugetlbfs is
missing, simply ifdef those two tests out and allow the developer to at
least have the other 19 tests, if they don't want to pause to work
through the above issues. Also issue a warning, so that it's clear that
there is an imperfection in the build.

In order to do that, a tiny shell script (check_config.sh) runs a quick
compile (not link, that's too prone to false failures with library
paths), and basically, if the compiler doesn't find hugetlbfs.h in its
standard locations, then the script concludes that libhugetlbfs is not
available. The output is in two files, one for inclusion in hmm-tests.c
(local_config.h), and one for inclusion in the Makefile
(local_config.mk).

Cc: Ralph Campbell 
Signed-off-by: John Hubbard 
---
 tools/testing/selftests/vm/.gitignore  |  1 +
 tools/testing/selftests/vm/Makefile| 24 +++--
 tools/testing/selftests/vm/check_config.sh | 30 ++
 tools/testing/selftests/vm/hmm-tests.c | 10 +++-
 4 files changed, 62 insertions(+), 3 deletions(-)
 create mode 100755 tools/testing/selftests/vm/check_config.sh

diff --git a/tools/testing/selftests/vm/.gitignore 
b/tools/testing/selftests/vm/.gitignore
index 2c8ddcf41c0e..e90d28bcd518 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -20,3 +20,4 @@ va_128TBswitch
 map_fixed_noreplace
 write_to_hugetlbfs
 hmm-tests
+local_config.*
diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index 2579242386ac..986a90fa9653 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -1,5 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0
 # Makefile for vm selftests
+
+include local_config.mk
+
 uname_M := $(shell uname -m 2>/dev/null || echo not)
 MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/')
 
@@ -76,8 +79,6 @@ TEST_FILES := test_vmalloc.sh
 KSFT_KHDR_INSTALL := 1
 include ../lib.mk
 
-$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs
-
 ifeq ($(ARCH),x86_64)
 BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
 BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64))
@@ -130,3 +131,22 @@ endif
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
 
 $(OUTPUT)/gup_test: ../../../../mm/gup_test.h
+
+$(OUTPUT)/hmm-tests: local_config.h
+
+# HMM_EXTRA_LIBS may get set in local_config.mk, or it may be left empty.
+$(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)
+
+local_config.mk local_config.h: check_config.sh
+   ./check_config.sh
+
+EXTRA_CLEAN += local_config.mk local_config.h
+
+ifeq ($(HMM_EXTRA_LIBS),)
+all: warn_missing_hugelibs
+
+warn_missing_hugelibs:
+   @echo ; \
+   echo "Warning: missing libhugetlbfs support. Some HMM tests will be 
skipped." ; \
+   echo
+endif
diff --git a/tools/testing/selftests/vm/check_config.sh 
b/tools/testing/selftests/vm/check_config.sh
new file mode 100755
index ..651a4b192479
--- /dev/null
+++ b/tools/testing/selftests/vm/check_config.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# Probe for libraries and create header files to record the results. Both C
+# header files and Makefile include fragments are created.
+
+OUTPUT_H_FILE=local_config.h
+OUTPUT_MKFILE=local_config.mk
+
+# libhugetlbfs
+tmpname=$(mktemp)
+tmpfile_c=${tmpname}.c
+tmpfile_o=${tmpname}.o
+
+echo "#include <sys/types.h>"> $tmpfile_c
+echo "#include <hugetlbfs.h>"   >> $tmpfile_c
+echo "int func(void) { return 0; }" >> $tmpfile_c
+
+gcc -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+
+if [ -f $tmpfile_o ]; then
+echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE
+echo "HMM_EXTRA_LIBS = -lhugetlbfs" > $OUTPUT_MKFILE
+else
+echo "// No libhugetlbfs support found"  > $OUTPUT_H_FILE
+echo "# No libhugetlbfs support found, so:"  > $OUTPUT_MKFILE
+echo "HMM_EXTRA_LIBS = ">> $OUTPUT_MKFILE
+fi
+
+rm ${tmpname}.*
diff --git a/tools/testing/selftests/vm/hm

[PATCH 0/8] selftests/vm: gup_test, hmm-tests, assorted improvements

2020-09-28 Thread John Hubbard
This is based on the latest mmotm.

Summary: This series provides two main things, and a number of smaller
supporting goodies. The two main points are:

1) Add a new sub-test to gup_test, which in turn is a renamed version of
gup_benchmark. This sub-test allows nicer testing of dump_pages(), at
least on user-space pages.

For quite a while, I was doing a quick hack to gup_test.c whenever I
wanted to try out changes to dump_page(). Then Matthew Wilcox asked me
what I meant when I said "I used my dump_page() unit test", and I
realized that it might be nice to check in a polished up version of
that.

Details about how it works and how to use it are in the commit
description for patch #6.

2) Fixes a limitation of hmm-tests: these tests are incredibly useful,
but only if people actually build and run them. And it turns out that
libhugetlbfs is a little too effective at throwing a wrench in the
works, there. So I've added a little configuration check that removes
just two of the 21 hmm-tests, if libhugetlbfs is not available.

Further details in the commit description of patch #8.

Other smaller things that this series does:

a) Remove code duplication by creating gup_test.h.

b) Clear up the sub-test organization, and their invocation within
run_vmtests.sh.

c) Other minor assorted improvements.

John Hubbard (8):
  mm/gup_benchmark: rename to mm/gup_test
  selftests/vm: use a common gup_test.h
  selftests/vm: rename run_vmtests --> run_vmtests.sh
  selftests/vm: minor cleanup: Makefile and gup_test.c
  selftests/vm: only some gup_test items are really benchmarks
  selftests/vm: gup_test: introduce the dump_pages() sub-test
  selftests/vm: run_vmtest.sh: update and clean up gup_test invocation
  selftests/vm: hmm-tests: remove the libhugetlbfs dependency

 Documentation/core-api/pin_user_pages.rst |   6 +-
 arch/s390/configs/debug_defconfig |   2 +-
 arch/s390/configs/defconfig   |   2 +-
 mm/Kconfig|  21 +-
 mm/Makefile   |   2 +-
 mm/{gup_benchmark.c => gup_test.c}| 109 ++
 mm/gup_test.h |  32 +++
 tools/testing/selftests/vm/.gitignore |   3 +-
 tools/testing/selftests/vm/Makefile   |  38 +++-
 tools/testing/selftests/vm/check_config.sh|  30 +++
 tools/testing/selftests/vm/config |   2 +-
 tools/testing/selftests/vm/gup_benchmark.c| 137 -
 tools/testing/selftests/vm/gup_test.c | 188 ++
 tools/testing/selftests/vm/hmm-tests.c|  10 +-
 .../vm/{run_vmtests => run_vmtest.sh} |  24 ++-
 15 files changed, 403 insertions(+), 203 deletions(-)
 rename mm/{gup_benchmark.c => gup_test.c} (59%)
 create mode 100644 mm/gup_test.h
 create mode 100755 tools/testing/selftests/vm/check_config.sh
 delete mode 100644 tools/testing/selftests/vm/gup_benchmark.c
 create mode 100644 tools/testing/selftests/vm/gup_test.c
 rename tools/testing/selftests/vm/{run_vmtests => run_vmtest.sh} (91%)

-- 
2.28.0



[PATCH 2/8] selftests/vm: use a common gup_test.h

2020-09-28 Thread John Hubbard
Avoid the need to copy-paste the gup_test ioctl commands and the struct
gup_test definition, between the kernel and the user space application,
by providing a new header file for these. This allows easier and safer
adding of new ioctl calls, as well as reducing the overall line count.

Details: The header file has to be able to compile independently,
because of the arguably unfortunate way that the Makefile is written:
the Makefile tries to build all of its prerequisites, when really it
should be only building the .c files, and leaving the other
prerequisites (LOCAL_HDRS) as pure dependencies.

That Makefile limitation is probably not worth fixing, but it explains
why one of the includes had to be moved into the new header file.

Also: simplify the ioctl struct (struct gup_test), by deleting the
unused __expansion[10] field. This sort of thing is what you might see
in a stable ABI, but this low-level, kernel-developer-oriented
selftests/vm system is very much not subject to ABI stability. So
"expansion" and "reserved" fields are unnecessary here.

Signed-off-by: John Hubbard 
---
 mm/gup_test.c | 17 +
 mm/gup_test.h | 22 ++
 tools/testing/selftests/vm/Makefile   |  2 ++
 tools/testing/selftests/vm/gup_test.c | 22 +-
 4 files changed, 26 insertions(+), 37 deletions(-)
 create mode 100644 mm/gup_test.h

diff --git a/mm/gup_test.c b/mm/gup_test.c
index 10f41c0528de..a3c86d0fdff7 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -4,22 +4,7 @@
 #include <linux/uaccess.h>
 #include <linux/ktime.h>
 #include <linux/debugfs.h>
-
-#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
-#define GUP_BENCHMARK  _IOWR('g', 2, struct gup_test)
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_test)
-#define PIN_BENCHMARK  _IOWR('g', 4, struct gup_test)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_test)
-
-struct gup_test {
-   __u64 get_delta_usec;
-   __u64 put_delta_usec;
-   __u64 addr;
-   __u64 size;
-   __u32 nr_pages_per_call;
-   __u32 flags;
-   __u64 expansion[10];/* For future use */
-};
+#include "gup_test.h"
 
 static void put_back_pages(unsigned int cmd, struct page **pages,
   unsigned long nr_pages)
diff --git a/mm/gup_test.h b/mm/gup_test.h
new file mode 100644
index ..931c2f3f477a
--- /dev/null
+++ b/mm/gup_test.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __GUP_TEST_H
+#define __GUP_TEST_H
+
+#include <linux/types.h>
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
+#define GUP_BENCHMARK  _IOWR('g', 2, struct gup_test)
+#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_test)
+#define PIN_BENCHMARK  _IOWR('g', 4, struct gup_test)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_test)
+
+struct gup_test {
+   __u64 get_delta_usec;
+   __u64 put_delta_usec;
+   __u64 addr;
+   __u64 size;
+   __u32 nr_pages_per_call;
+   __u32 flags;
+};
+
+#endif /* __GUP_TEST_H */
diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index d1ae706d9927..9cc6bc087461 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -130,3 +130,5 @@ endif
 $(OUTPUT)/userfaultfd: LDLIBS += -lpthread
 
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
+
+$(OUTPUT)/gup_test: ../../../../mm/gup_test.h
diff --git a/tools/testing/selftests/vm/gup_test.c 
b/tools/testing/selftests/vm/gup_test.c
index e930135727a2..70db259582c3 100644
--- a/tools/testing/selftests/vm/gup_test.c
+++ b/tools/testing/selftests/vm/gup_test.c
@@ -2,39 +2,19 @@
 #include 
 #include 
 #include 
-
 #include 
 #include 
 #include 
 #include 
 #include 
-
-#include <linux/types.h>
+#include "../../../../mm/gup_test.h"
 
 #define MB (1UL << 20)
 #define PAGE_SIZE sysconf(_SC_PAGESIZE)
 
-#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
-#define GUP_BENCHMARK  _IOWR('g', 2, struct gup_test)
-
-/* Similar to above, but use FOLL_PIN instead of FOLL_GET. */
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_test)
-#define PIN_BENCHMARK  _IOWR('g', 4, struct gup_test)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_test)
-
 /* Just the flags we need, copied from mm.h: */
 #define FOLL_WRITE 0x01 /* check pte is writable */
 
-struct gup_test {
-   __u64 get_delta_usec;
-   __u64 put_delta_usec;
-   __u64 addr;
-   __u64 size;
-   __u32 nr_pages_per_call;
-   __u32 flags;
-   __u64 expansion[10];/* For future use */
-};
-
 int main(int argc, char **argv)
 {
struct gup_test gup;
-- 
2.28.0



[PATCH 4/8] selftests/vm: minor cleanup: Makefile and gup_test.c

2020-09-28 Thread John Hubbard
A few cleanups that don't deserve separate patches, but that
also should not clutter up other functional changes:

1. Remove an unnecessary #include 

2. Restore the sorted order of TEST_GEN_FILES.

3. Add -lpthread to the common LDLIBS, as it is harmless and several
   tests use it. Including, soon, gup_test.c. This gets rid of one
   special rule already.

Signed-off-by: John Hubbard 
---
 tools/testing/selftests/vm/Makefile   | 10 --
 tools/testing/selftests/vm/gup_test.c |  1 -
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index 5a3bd0c497b6..2579242386ac 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -21,14 +21,15 @@ MACHINE ?= $(shell echo $(uname_M) | sed -e 
's/aarch64.*/arm64/')
 MAKEFLAGS += --no-builtin-rules
 
 CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS)
-LDLIBS = -lrt
+LDLIBS = -lrt -lpthread
 TEST_GEN_FILES = compaction_test
 TEST_GEN_FILES += gup_test
 TEST_GEN_FILES += hmm-tests
 TEST_GEN_FILES += hugepage-mmap
 TEST_GEN_FILES += hugepage-shm
-TEST_GEN_FILES += map_hugetlb
+TEST_GEN_FILES += khugepaged
 TEST_GEN_FILES += map_fixed_noreplace
+TEST_GEN_FILES += map_hugetlb
 TEST_GEN_FILES += map_populate
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += mlock2-tests
@@ -37,7 +38,6 @@ TEST_GEN_FILES += on-fault-limit
 TEST_GEN_FILES += thuge-gen
 TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
-TEST_GEN_FILES += khugepaged
 
 ifeq ($(ARCH),x86_64)
 CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh $(CC) 
../x86/trivial_32bit_program.c -m32)
@@ -76,7 +76,7 @@ TEST_FILES := test_vmalloc.sh
 KSFT_KHDR_INSTALL := 1
 include ../lib.mk
 
-$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs -lpthread
+$(OUTPUT)/hmm-tests: LDLIBS += -lhugetlbfs
 
 ifeq ($(ARCH),x86_64)
 BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32))
@@ -127,8 +127,6 @@ warn_32bit_failure:
 endif
 endif
 
-$(OUTPUT)/userfaultfd: LDLIBS += -lpthread
-
 $(OUTPUT)/mlock-random-test: LDLIBS += -lcap
 
 $(OUTPUT)/gup_test: ../../../../mm/gup_test.h
diff --git a/tools/testing/selftests/vm/gup_test.c 
b/tools/testing/selftests/vm/gup_test.c
index 70db259582c3..4e9f5d0ed0fc 100644
--- a/tools/testing/selftests/vm/gup_test.c
+++ b/tools/testing/selftests/vm/gup_test.c
@@ -4,7 +4,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include "../../../../mm/gup_test.h"
-- 
2.28.0



[PATCH 6/8] selftests/vm: gup_test: introduce the dump_pages() sub-test

2020-09-28 Thread John Hubbard
For quite a while, I was doing a quick hack to gup_test.c (previously,
gup_benchmark.c) whenever I wanted to try out my changes to dump_page().
This makes that hack unnecessary, and instead allows anyone to easily
get the same coverage from a user space program. That saves a lot of
time because you don't have to change the kernel, in order to test
different pages and options.

The new sub-test takes advantage of the existing gup_test
infrastructure, which already provides a simple user space program, some
allocated user space pages, an ioctl call, pinning of those pages (via
either get_user_pages or pin_user_pages) and a corresponding kernel-side
test invocation. There's not much more required, mainly just a couple of
inputs from the user.

In fact, the new test re-uses the existing command line options in order
to get various helpful combinations (THP or normal, _fast or slow gup,
gup vs. pup, and more).

New command line options are: which pages to dump, and what type of
"get/pin" to use.

In order to figure out which pages to dump, the logic is:

* If the user doesn't specify anything, the page 0 (the first page in
the address range that the program sets up for testing) is dumped.

* Or, the user can type up to 8 page indices anywhere on the command
line. If you type more than 8, then it uses the first 8 and ignores the
remaining items.

For example:

./gup_test -ct -F 1 0 19 0x1000

Meaning:
-c:  dump pages sub-test
-t:  use THP pages
-F 1:use pin_user_pages() instead of get_user_pages()
0 19 0x1000: dump pages 0, 19, and 4096

Also, invoke the new test from run_vmtests.sh. This keeps it in use, and
also provides a good example of how to invoke it.

Signed-off-by: John Hubbard 
---
 mm/Kconfig|  6 +++
 mm/gup_test.c | 54 ++-
 mm/gup_test.h | 10 +
 tools/testing/selftests/vm/gup_test.c | 47 +--
 4 files changed, 112 insertions(+), 5 deletions(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 588984ee5fb4..f7c4c21e5cb1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -845,6 +845,12 @@ config GUP_TEST
  get_user_pages*() and pin_user_pages*(), as well as smoke tests of
  the non-_fast variants.
 
+ There is also a sub-test that allows running dump_page() on any
+ of up to eight pages (selected by command line args) within the
+ range of user-space addresses. These pages are either pinned via
+ pin_user_pages*(), or pinned via get_user_pages*(), as specified
+ by other command line arguments.
+
  See tools/testing/selftests/vm/gup_test.c
 
 config GUP_GET_PTE_LOW_HIGH
diff --git a/mm/gup_test.c b/mm/gup_test.c
index a980c4a194f0..e79dc364eafb 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -7,7 +7,7 @@
 #include "gup_test.h"
 
 static void put_back_pages(unsigned int cmd, struct page **pages,
-  unsigned long nr_pages)
+  unsigned long nr_pages, unsigned int gup_test_flags)
 {
unsigned long i;
 
@@ -23,6 +23,15 @@ static void put_back_pages(unsigned int cmd, struct page 
**pages,
case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
+   case DUMP_USER_PAGES_TEST:
+   if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) {
+   unpin_user_pages(pages, nr_pages);
+   } else {
+   for (i = 0; i < nr_pages; i++)
+   put_page(pages[i]);
+
+   }
+   break;
}
 }
 
@@ -49,6 +58,37 @@ static void verify_dma_pinned(unsigned int cmd, struct page 
**pages,
}
 }
 
+static void dump_pages_test(struct gup_test *gup, struct page **pages,
+   unsigned long nr_pages)
+{
+   unsigned int index_to_dump;
+   unsigned int i;
+
+   /*
+* Zero out any user-supplied page index that is out of range. Remember:
+* .which_pages[] contains a 1-based set of page indices.
+*/
+   for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+   if (gup->which_pages[i] > nr_pages) {
+   pr_warn("ZEROING due to out of range: .which_pages[%u]: 
%u\n",
+   i, gup->which_pages[i]);
+   gup->which_pages[i] = 0;
+   }
+   }
+
+   for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+   index_to_dump = gup->which_pages[i];
+
+   if (index_to_dump) {
+   index_to_dump--; // Decode from 1-based, to 0-based
+   pr_info(" page #%u, starting from user virt addr: 
0x%llx\n",
+   index_to_dump, gup->addr);
+   dump_page(pages[index_t

Re: [PATCH 7/8] selftests/vm: run_vmtest.sh: update and clean up gup_test invocation

2020-09-28 Thread John Hubbard

On 9/28/20 12:26 PM, Ira Weiny wrote:

On Sun, Sep 27, 2020 at 11:21:58PM -0700, John Hubbard wrote:

...

+echo "--"
+echo "running gup_test -ct -F 0x1 0 19 0x1000"
+echo "   Dumps pages 0, 19, and 4096, using pin_user_pages (-F 0x1)"
+echo "--"
+./gup_test -ct -F 0x1 0 19 0x1000


Ah here it is...  Maybe just remove that from the previous commit message.

Ira


Yes, will do, thanks for spotting that.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 2/8] selftests/vm: use a common gup_test.h

2020-09-28 Thread John Hubbard

On 9/28/20 5:57 AM, Jason Gunthorpe wrote:

On Sun, Sep 27, 2020 at 11:21:53PM -0700, John Hubbard wrote:

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index d1ae706d9927..9cc6bc087461 100644
+++ b/tools/testing/selftests/vm/Makefile
@@ -130,3 +130,5 @@ endif
  $(OUTPUT)/userfaultfd: LDLIBS += -lpthread
  
  $(OUTPUT)/mlock-random-test: LDLIBS += -lcap

+
+$(OUTPUT)/gup_test: ../../../../mm/gup_test.h


There is no reason to do this, the auto depends will pick up header
files, and gup_test.h isn't a generated file



It is less capable than you might think. Without the admittedly ugly technique
above, it fails to build, and as you can see, the include paths that are fed to
gcc are just a single one: usr/include:

$ make
make --no-builtin-rules ARCH=x86 -C ../../../.. headers_install
gcc -Wall -I ../../../../usr/include gup_test.c 
/kernel_work/linux-next-github/tools/testing/selftests/kselftest_harness.h 
/kernel_work/linux-next-github/tools/testing/selftests/kselftest.h ../../../../mm/gup_test.h -lrt -o 
/kernel_work/linux-next-github/tools/testing/selftests/vm/gup_test

make[1]: Entering directory '/kernel_work/linux-next-github'
gup_test.c:10:10: fatal error: gup_test.h: No such file or directory
   10 | #include "gup_test.h"
  |  ^~~~


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 8/8] selftests/vm: hmm-tests: remove the libhugetlbfs dependency

2020-09-28 Thread John Hubbard

On 9/28/20 6:02 AM, Jason Gunthorpe wrote:

On Sun, Sep 27, 2020 at 11:21:59PM -0700, John Hubbard wrote:

...

+gcc -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1


This gcc has to come from some makefile variable


ahem, yes, that really should have just been $(CC), will change to that.



This is kind of janky :\

Could we just not use libhugetlbfs? Doesn't it all just boil down to
creating a file in /dev/huge? Eg look at 
tools/testing/selftests/vm/hugepage-mmap.c



Well, the situation is a lot worse than that, because hmm-tests.c is using a few
helper functions that end up pulling in more and more.

My first attempt was actually in your direction: just grab a bit of code from 
the
library and drop it in. But that ended up turning into several pages of code 
from
quite a few functions and definitions, and it was looking maybe excessive. (I
can look at it again, though. Maybe it feels less excessive if there are no 
other
acceptable alternatives.)

So then I thought, why not just *delete* those two routines from hmm-tests.c? 
But
Ralph didn't like that because he notes that hmm_range_fault() loses some useful
test coverage by being exercised with hugetlbfs.

So finally I landed here, which is so far, the smallest change that would be
potentially acceptable: a couple dozen lines, in order to selectively disable 
the
problematic routines.
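
Concretely, "selectively disable" means wrapping the hugetlbfs-dependent test
cases in the config symbol that check_config.sh generates. Roughly this shape
(the test name below is invented, just for illustration):

#ifdef LOCAL_CONFIG_HAVE_LIBHUGETLBFS
TEST_F(hmm, anon_write_hugetlbfs)
{
	/* ...test body that needs the libhugetlbfs helpers... */
}
#endif /* LOCAL_CONFIG_HAVE_LIBHUGETLBFS */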

Anyway, thoughts? I like the current approach but am open to anything that makes
hmm-test Just Work for more people, on the *first* try.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 8/8] selftests/vm: hmm-tests: remove the libhugetlbfs dependency

2020-09-28 Thread John Hubbard

On 9/28/20 1:18 PM, John Hubbard wrote:

On 9/28/20 6:02 AM, Jason Gunthorpe wrote:

On Sun, Sep 27, 2020 at 11:21:59PM -0700, John Hubbard wrote:

...

+gcc -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1


This gcc has to come from some makefile variable

 I plan on posting a v2 with this additional change, to fix the above point:

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index 986a90fa9653..019cbb7f3cf8 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -138,7 +138,7 @@ $(OUTPUT)/hmm-tests: local_config.h
 $(OUTPUT)/hmm-tests: LDLIBS += $(HMM_EXTRA_LIBS)

 local_config.mk local_config.h: check_config.sh
-   ./check_config.sh
+   ./check_config.sh $(CC)

 EXTRA_CLEAN += local_config.mk local_config.h

diff --git a/tools/testing/selftests/vm/check_config.sh 
b/tools/testing/selftests/vm/check_config.sh
index 651a4b192479..079c8a40b85d 100755
--- a/tools/testing/selftests/vm/check_config.sh
+++ b/tools/testing/selftests/vm/check_config.sh
@@ -16,7 +16,8 @@ echo "#include <sys/types.h>" > $tmpfile_c
 echo "#include <hugetlbfs.h>"   >> $tmpfile_c
 echo "int func(void) { return 0; }" >> $tmpfile_c

-gcc -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1
+CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"}
+$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1

 if [ -f $tmpfile_o ]; then
     echo "#define LOCAL_CONFIG_HAVE_LIBHUGETLBFS 1" > $OUTPUT_H_FILE



thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 1/5] mm: Introduce mm_struct.has_pinned

2020-09-28 Thread John Hubbard

On 9/28/20 4:57 PM, Jason Gunthorpe wrote:

On Mon, Sep 28, 2020 at 12:29:55PM -0700, Linus Torvalds wrote:

...

I think this is really hard to use and ugly. My thinking has been to
just stick:

if (flags & FOLL_LONGTERM)
flags |= FOLL_FORCE | FOLL_WRITE

In pin_user_pages(). It would make the driver API cleaner. If we can


+1, yes. The other choices so far are, as you say, really difficult to figure
out.


do a bit better somehow by not COW'ing for certain VMA's as you
explained then all the better, but not my primary goal..

Basically, I think if a driver is using FOLL_LONGTERM | FOLL_PIN we
should guarentee that driver a consistent MM and take the gup_fast
performance hit to do it.

AFAICT the giant wack of other cases not using FOLL_LONGTERM really
shouldn't care about read-decoherence. For those cases the user should
really not be racing write's with data under read-only pin, and the
new COW logic looks like it solves the other issues with this.


I hope this doesn't kill the seqcount() idea, though. That was my favorite
part of the discussion, because it neatly separates out the two racing domains
(fork, gup/pup) and allows easy reasoning about them--without really impacting
performance.

Truly elegant. We should go there.
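
To sketch the shape of it (untested, and the field name write_protect_seq is
just something I'm making up here for illustration):

	/* fork() path (writer side): */
	write_seqcount_begin(&mm->write_protect_seq);
	/* ...write-protect the parent's PTEs for COW... */
	write_seqcount_end(&mm->write_protect_seq);

	/* gup_fast() path (lockless reader side): */
	unsigned seq = raw_read_seqcount(&mm->write_protect_seq);
	/* ...walk the page tables and pin pages, no mmap_lock held... */
	if (read_seqcount_retry(&mm->write_protect_seq, seq)) {
		/* fork() raced with us: unpin and fall back to the slow path */
	}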



I know Jann/John have been careful to not have special behaviors for
the DMA case, but I think it makes sense here. It is actually different.



I think that makes sense. Everyone knew that DMA/FOLL_LONGTERM call sites
were at least potentially special, despite the spirited debates in at least
two conferences about the meaning and implications of "long term". :)

And here we are seeing an example of such a special case, which I think is
natural enough.


thanks,
--
John Hubbard
NVIDIA


Re: [resource] 22b17dc667: Kernel panic - not syncing: Fatal exception

2020-11-02 Thread John Hubbard
  0010 DS:  ES:  CR0: 80050033
[   29.017409] CR2:  CR3: 000100120003 CR4: 007706f0
[   29.024539] DR0:  DR1:  DR2: 
[   29.031671] DR3:  DR6: fffe0ff0 DR7: 0400
[   29.038804] PKRU: 5554
[   29.041508] Kernel panic - not syncing: Fatal exception
ACPI MEMORY or I/O RESET_REG.


To reproduce:

 git clone https://github.com/intel/lkp-tests.git
 cd lkp-tests
 bin/lkp install job.yaml  # job file is attached in this email
 bin/lkp run job.yaml



Thanks,
oliver.s...@intel.com



thanks,
--
John Hubbard
NVIDIA


Re: [RFC PATCH resend 3/6] mm: Add refcount for preserving mm_struct without pgd

2020-11-03 Thread John Hubbard

On 11/3/20 5:21 AM, Jason Gunthorpe wrote:

On Tue, Nov 03, 2020 at 04:19:11AM +0100, Jann Horn wrote:

On Tue, Nov 3, 2020 at 3:11 AM Jann Horn  wrote:

On Sat, Oct 17, 2020 at 2:30 AM Jann Horn  wrote:

On Sat, Oct 17, 2020 at 1:21 AM Jason Gunthorpe  wrote:

On Sat, Oct 17, 2020 at 01:09:12AM +0200, Jann Horn wrote:

Currently, mm_struct has two refcounts:

...

Either way can work, I liked the suggestion because it suggests an
good name for the ref: 'mmget_pgd' or somesuch

What I don't like is how nonsensical the names here are becoming:
mmget/mmgrab/mm_ref

Gives no impression at the callsite what is right/wrong

Names like this:
  mmget_struct
  mmget_pgd
  mmget_tables



What?! I had just resigned myself to a bimonthly exercise, re-memorizing
the mm_struct naming correlation between grab, drop, get, put, count,
and users. And now you want to make it directly understandable? :)


Make alot more sense to me..

I think this patch needs to do something about the naming..



A third counter also seems like the tipping point, to me.

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2 2/2] mm: prevent gup_fast from racing with COW during fork

2020-11-03 Thread John Hubbard

On 11/3/20 5:32 PM, Ahmed S. Darwish wrote:

On Tue, Nov 03, 2020 at 09:40:22AM -0800, Linus Torvalds wrote:

On Mon, Nov 2, 2020 at 10:52 PM Ahmed S. Darwish
 wrote:

...


>
> patch #1:
>

Subject: [RFC][PATCH 1/2] seqlock: Use __do_ prefix instead of non-standed
  _seqcount_t_ marker

The use of "*_seqcount_t_*" as a marker to denote internal seqlock.h
functions taking only plain seqcount_t instead of the whole
seqcount_LOCKNAME_t family is confusing users, as it's also not the
standard kernel pattern for denoting header file internal functions.

Use the __do_ prefix instead.

Note, a plain "__" prefix is not used since seqlock.h already uses it
for some of its exported functions; e.g. __read_seqcount_begin() and
__read_seqcount_retry().

Reported-by: Jason Gunthorpe 
Reported-by: John Hubbard 
Reported-by: Linus Torvalds 
Link: 
https://lkml.kernel.org/r/CAHk-=wgb8nyoqufpn0o6a5bpjcjpnxvh+krxapujhsgg+7q...@mail.gmail.com
Signed-off-by: Ahmed S. Darwish 
---
  include/linux/seqlock.h | 62 -
  1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index cbfc78b92b65..5de043841d33 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -425,9 +425,9 @@ SEQCOUNT_LOCKNAME(ww_mutex, struct ww_mutex, true, 
&s->lock->base, ww_mu
   * Return: true if a read section retry is required, else false
   */
  #define __read_seqcount_retry(s, start)   
\
-   __read_seqcount_t_retry(__seqcount_ptr(s), start)
+   __do___read_seqcount_retry(__seqcount_ptr(s), start)


Looking better. "do_" is clearly an internal function name prefix, so that's 
good.

A nit: while various numbers of leading underscores are sometimes used, it's a 
lot
less common to use, say, 3 consecutive underscores (as above) *within* the 
name. And
I don't think you need it for uniqueness, at least from a quick look around 
here.

In fact, if you actually need 3 vs 2 vs 1 underscore within the name, I'm going 
to
claim that that's too far afield, and the naming should be re-revisited. :)

So why not just:

__do_read_seqcount_retry()

?

...or, if you feeling bold:

do_read_seqcount_retry()

...thus taking further advantage of the "do" convention, in order to get rid of 
some
more underscores.

And similarly for other __do___*() functions.

But again, either way, I think "do" is helping a *lot* here (as is getting rid
of the _t_ idea).


thanks,
--
John Hubbard
NVIDIA



-static inline int __read_seqcount_t_retry(const seqcount_t *s, unsigned start)
+static inline int __do___read_seqcount_retry(const seqcount_t *s, unsigned 
start)
  {
kcsan_atomic_next(0);
return unlikely(READ_ONCE(s->sequence) != start);
@@ -445,12 +445,12 @@ static inline int __read_seqcount_t_retry(const 
seqcount_t *s, unsigned start)
   * Return: true if a read section retry is required, else false
   */
  #define read_seqcount_retry(s, start) \
-   read_seqcount_t_retry(__seqcount_ptr(s), start)
+   __do_read_seqcount_retry(__seqcount_ptr(s), start)

-static inline int read_seqcount_t_retry(const seqcount_t *s, unsigned start)
+static inline int __do_read_seqcount_retry(const seqcount_t *s, unsigned start)
  {
smp_rmb();
-   return __read_seqcount_t_retry(s, start);
+   return __do___read_seqcount_retry(s, start);
  }

  /**
@@ -462,10 +462,10 @@ do {  
\
if (__seqcount_lock_preemptible(s)) \
preempt_disable();  \
\
-   raw_write_seqcount_t_begin(__seqcount_ptr(s));  \
+   __do_raw_write_seqcount_begin(__seqcount_ptr(s));   \
  } while (0)

-static inline void raw_write_seqcount_t_begin(seqcount_t *s)
+static inline void __do_raw_write_seqcount_begin(seqcount_t *s)
  {
kcsan_nestable_atomic_begin();
s->sequence++;
@@ -478,13 +478,13 @@ static inline void raw_write_seqcount_t_begin(seqcount_t 
*s)
   */
  #define raw_write_seqcount_end(s) \
  do {  \
-   raw_write_seqcount_t_end(__seqcount_ptr(s));\
+   __do_raw_write_seqcount_end(__seqcount_ptr(s)); \
\
if (__seqcount_lock_preemptible(s)) \
preempt_enable();   \
  } while (0)

-static inline void raw_write_seqcount_t_end(seqcount_t *s)
+static inline void __do_raw_write_seqcount_end(seqcount_t *s)

Re: [PATCH v5 05/15] mm/frame-vector: Use FOLL_LONGTERM

2020-10-31 Thread John Hubbard

On 10/31/20 7:45 AM, Daniel Vetter wrote:

On Sat, Oct 31, 2020 at 3:55 AM John Hubbard  wrote:

On 10/30/20 3:08 AM, Daniel Vetter wrote:

...

By removing this check from this location, and changing from
pin_user_pages_locked() to pin_user_pages_fast(), I *think* we end up
losing the check entirely. Is that intended? If so it could use a comment
somewhere to explain why.


Yeah this wasn't intentional. I think I needed to drop the _locked
version to prep for FOLL_LONGTERM, and figured _fast is always better.
But I didn't realize that _fast doesn't have the vma checks, gup.c got
me a bit confused.


Actually, I thought that the change to _fast was a very nice touch, btw.



I'll remedy this in all the patches where this applies (because a
VM_IO | VM_PFNMAP can point at struct page backed memory, and that
exact use-case is what we want to stop with the unsafe_follow_pfn work
since it wreaks things like cma or security).

Aside: I do wonder whether the lack for that check isn't a problem.
VM_IO | VM_PFNMAP generally means driver managed, which means the
driver isn't going to consult the page pin count or anything like that
(at least not necessarily) when revoking or moving that memory, since
we're assuming it's totally under driver control. So if pup_fast can
get into such a mapping, we might have a problem.
-Daniel



Yes. I don't know why that check is missing from the _fast path.
Probably just an oversight, seeing as how it's in the slow path. Maybe
the appropriate response here is to add a separate patch that adds the
check.

I wonder if I'm overlooking something, but it certainly seems correct to
do that.

 thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v5 05/15] mm/frame-vector: Use FOLL_LONGTERM

2020-11-01 Thread John Hubbard

On 11/1/20 2:30 AM, Daniel Vetter wrote:

On Sun, Nov 1, 2020 at 6:22 AM John Hubbard  wrote:


On 10/31/20 7:45 AM, Daniel Vetter wrote:

On Sat, Oct 31, 2020 at 3:55 AM John Hubbard  wrote:

On 10/30/20 3:08 AM, Daniel Vetter wrote:

...

By removing this check from this location, and changing from
pin_user_pages_locked() to pin_user_pages_fast(), I *think* we end up
losing the check entirely. Is that intended? If so it could use a comment
somewhere to explain why.


Yeah this wasn't intentional. I think I needed to drop the _locked
version to prep for FOLL_LONGTERM, and figured _fast is always better.
But I didn't realize that _fast doesn't have the vma checks, gup.c got
me a bit confused.


Actually, I thought that the change to _fast was a very nice touch, btw.



I'll remedy this in all the patches where this applies (because a
VM_IO | VM_PFNMAP can point at struct page backed memory, and that
exact use-case is what we want to stop with the unsafe_follow_pfn work
since it wreaks things like cma or security).

Aside: I do wonder whether the lack for that check isn't a problem.
VM_IO | VM_PFNMAP generally means driver managed, which means the
driver isn't going to consult the page pin count or anything like that
(at least not necessarily) when revoking or moving that memory, since
we're assuming it's totally under driver control. So if pup_fast can
get into such a mapping, we might have a problem.
-Daniel



Yes. I don't know why that check is missing from the _fast path.
Probably just an oversight, seeing as how it's in the slow path. Maybe
the appropriate response here is to add a separate patch that adds the
check.

I wonder if I'm overlooking something, but it certainly seems correct to
do that.


You'll need the mmap_sem to get at the vma to be able to do this
check. If you add that to _fast, you made it as fast as the slow one.


Arggh, yes of course. Strike that, please. :)


Plus there's _fast_only due to locking recurion issues in fast-paths
(I assume, I didn't check all the callers).

I'm just wondering whether we have a bug somewhere with device
drivers. For CMA regions we always check in try_grab_page, but for dax


OK, so here you're talking about a different bug than the VM_IO | VM_PFNMAP
pages, I think. This is about the "FOLL_LONGTERM + CMA + gup/pup _fast"
combination that is not allowed, right?

For that: try_grab_page() doesn't check anything, but try_grab_compound_head()
does, but only for pup_fast, not gup_fast. That was added by commit
df3a0a21b698d ("mm/gup: fix omission of check on FOLL_LONGTERM in gup fast
path") in April.

I recall that the patch was just plugging a very specific hole, as opposed
to locking down the API against mistakes or confused callers. And it does
seem that there are some holes.


I'm not seeing where the checks in the _fast fastpaths are, and that
all still leaves random device driver mappings behind which aren't
backed by CMA but still point to something with a struct page behind
it. I'm probably just missing something, but no idea what.
-Daniel



Certainly we've established that we can't check VMA flags by that time,
so I'm not sure that there is much we can check by the time we get to
gup/pup _fast. Seems like the device drivers have to avoid calling _fast
with pages that live in VM_IO | VM_PFNMAP, by design, right? Or maybe
you're talking about CMA checks only?


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2 2/2] mm: prevent gup_fast from racing with COW during fork

2020-11-02 Thread John Hubbard

On 11/2/20 4:41 PM, Ahmed S. Darwish wrote:

On Mon, Nov 02, 2020 at 08:25:32PM -0400, Jason Gunthorpe wrote:

On Tue, Nov 03, 2020 at 01:17:12AM +0100, Ahmed S. Darwish wrote:


Please stick with the official exported API: raw_write_seqcount_begin().


How did you know this was 'offical exported API' ??



All the official exported seqlock.h APIs are marked with verbose
kernel-doc annotations on top. The rest are internal...



OK, but no one here was able to deduce that, probably because there is not
enough consistency throughout the kernel to be able to assume such things--even
though your seqlock project is internally consistent. It's just not *quite*
enough communication.

I think if we added the following it would be very nice:

a) Short comments to the "unofficial and internal" routines, identifying them as
such (sketched just below), and

b) Short comments to the "official API for general use", also identifying
those as such.

c) A comment about what makes "raw" actually raw, for seqlock.
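
For (a), even a single sentence would do it. A sketch:

/*
 * seqlock.h internal helper: not for general use. Callers should use
 * read_seqcount_retry() instead; this variant only accepts plain
 * seqcount_t, not the seqcount_LOCKNAME_t family.
 */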


Since I'm proposing new work, I'll also offer to help, perhaps by putting 
together
a small patch to get it kicked off, if you approve of the idea.

thanks,
--
John Hubbard
NVIDIA


Re: The ext3 way of journalling

2008-01-14 Thread John Hubbard

Tuomo Valkonen wrote:

On 2008-01-13 18:11 -0500, Theodore Tso wrote:

It's much more likely that this early in your boot cycle, your clock is
sometimes incorrect.


I doubt it. I get this nearly _always_ when the system crashes, which
accounts for the vast majority of the times I boot it. (I wish swsusp
didn't suck so much..)


Is the "9192" number roughly constant, or is it always changing?


No. That's the number I got last time, but typically I've got 
something in the 3 range. 


If your machine is on the network, then the "ntpdate"
program could be setting your time so that it looks correct, but
that's after e2fsck is run.  


ntpdate isn't run by any of the init scripts. ntpd is, but like I 
already mentioned, I doubt it would correct vastly incorrect time,

not even being able to track and correct when it advances fast.



ntpd will allow an initial correction that is arbitrarily large, if run 
with the -g option. This is a commonly used option. I see it is running 
on my stock Fedora Core 8, for example. So there is often no need to run 
ntpdate.


Also, ntpd keeps track of how fast your local clock tends to drift, and 
attempts to compensate. So, even if the local clock runs quite fast or 
slow, you'll normally get good results. The exception would be if you 
clock's drift rate jumps around; for example: fast today, slow tomorrow.


On most systems, ntpd will also copy the current time back to the CMOS, 
periodically, and during an orderly shutdown.


Hope that adds some clarity.

thanks,
John Hubbard


Re: Bitops source problem

2008-01-16 Thread John Hubbard

Pravin Nanaware wrote:

Hi,

I was just going through the include file in the /usr/include/asm/bitops.h

The function description describes it as non-atomic but it seems it is not. 


static __inline__ void __change_bit(int nr, volatile void * addr)
{
__asm__ __volatile__(
"btcl %1,%0"
:"=m" (ADDR)
:"Ir" (nr));
}

The kernel version I am using is 2.6.9-42. Is it right or am I missing something ?  


Thanks,
Pravin



The bitops.h comments are correct: the btc IA-32 instruction is only 
atomic if used with the lock prefix. The function above does not use the 
lock prefix, so it is not atomic.
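
For comparison, the atomic variant in that same header wraps the instruction
with the lock prefix. Quoting roughly from memory of the 2.6-era
asm-i386/bitops.h, so treat this as a sketch:

static __inline__ void change_bit(int nr, volatile void * addr)
{
	__asm__ __volatile__( LOCK_PREFIX
		"btcl %1,%0"
		:"=m" (ADDR)
		:"Ir" (nr));
}

LOCK_PREFIX expands to "lock ; " on SMP builds, and that prefix is what makes
the read-modify-write sequence atomic with respect to other processors.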


thanks,
John Hubbard



Re: [PATCH 1/4] mm/gup: add compound page list iterator

2021-02-03 Thread John Hubbard

On 2/3/21 2:00 PM, Joao Martins wrote:

Add an helper that iterates over head pages in a list of pages. It
essentially counts the tails until the next page to process has a
different head that the current. This is going to be used by
unpin_user_pages() family of functions, to batch the head page refcount
updates once for all passed consecutive tail pages.

Suggested-by: Jason Gunthorpe 
Signed-off-by: Joao Martins 
---
  mm/gup.c | 29 +
  1 file changed, 29 insertions(+)

diff --git a/mm/gup.c b/mm/gup.c
index d68bcb482b11..4f88dcef39f2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -215,6 +215,35 @@ void unpin_user_page(struct page *page)
  }
  EXPORT_SYMBOL(unpin_user_page);
  
+static inline unsigned int count_ntails(struct page **pages, unsigned long npages)


Silly naming nit: could we please name this function count_pagetails()? 
count_ntails
is a bit redundant, plus slightly less clear.


+{
+   struct page *head = compound_head(pages[0]);
+   unsigned int ntails;
+
+   for (ntails = 1; ntails < npages; ntails++) {
+   if (compound_head(pages[ntails]) != head)
+   break;
+   }
+
+   return ntails;
+}
+
+static inline void compound_next(unsigned long i, unsigned long npages,
+struct page **list, struct page **head,
+unsigned int *ntails)
+{
+   if (i >= npages)
+   return;
+
+   *ntails = count_ntails(list + i, npages - i);
+   *head = compound_head(list[i]);
+}
+
+#define for_each_compound_head(i, list, npages, head, ntails) \


When using macros, which are dangerous in general, you have to worry about
things like name collisions. I really dislike that C has forced this unsafe
pattern upon us, but of course we are stuck with it, for iterator helpers.

Given that we're stuck, you should probably use names such as __i, __list, etc,
in the above #define. Otherwise you could stomp on existing variables.
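
For example (untested, just to illustrate the naming):

#define for_each_compound_head(__i, __list, __npages, __head, __ntails) \
	for (__i = 0, compound_next(__i, __npages, __list, &(__head), &(__ntails)); \
	     __i < __npages; __i += __ntails, \
	     compound_next(__i, __npages, __list, &(__head), &(__ntails)))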


+   for (i = 0, compound_next(i, npages, list, &head, &ntails); \
+i < npages; i += ntails, \
+compound_next(i, npages, list, &head, &ntails))
+
  /**
   * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
pages
   * @pages:  array of pages to be maybe marked dirty, and definitely released.



thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 4/4] RDMA/umem: batch page unpin in __ib_mem_release()

2021-02-03 Thread John Hubbard

On 2/3/21 2:00 PM, Joao Martins wrote:

Use the newly added unpin_user_page_range_dirty_lock()
for more quickly unpinning a consecutive range of pages
represented as compound pages. This will also calculate
number of pages to unpin (for the tail pages which matching
head page) and thus batch the refcount update.

Running a test program which calls mr reg/unreg on a 1G in size
and measures cost of both operations together (in a guest using rxe)
with THP and hugetlbfs:


In the patch subject line:

   s/__ib_mem_release/__ib_umem_release/



Before:
590 rounds in 5.003 sec: 8480.335 usec / round
6898 rounds in 60.001 sec: 8698.367 usec / round

After:
2631 rounds in 5.001 sec: 1900.618 usec / round
31625 rounds in 60.001 sec: 1897.267 usec / round

Signed-off-by: Joao Martins 
---
  drivers/infiniband/core/umem.c | 12 ++--
  1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 2dde99a9ba07..ea4ebb3261d9 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -47,17 +47,17 @@
  
  static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)

  {
-   struct sg_page_iter sg_iter;
-   struct page *page;
+   bool make_dirty = umem->writable && dirty;
+   struct scatterlist *sg;
+   int i;


Maybe unsigned int is better, so as to perfectly match the scatterlist.length.

  
  	if (umem->nmap > 0)

ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
DMA_BIDIRECTIONAL);
  
-	for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {

-   page = sg_page_iter_page(&sg_iter);
-   unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
-   }
+   for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)


The change from umem->sg_nents to umem->nmap looks OK, although we should get
IB people to verify that there is not some odd bug or reason to leave it as is.


+   unpin_user_page_range_dirty_lock(sg_page(sg),
+   DIV_ROUND_UP(sg->length, PAGE_SIZE), make_dirty);


Is it really OK to refer directly to sg->length? The scatterlist library goes
to some effort to avoid having callers directly access the struct member 
variables.

Actually, the for_each_sg() code and its behavior with sg->length and 
sg_page(sg)
confuses me because I'm new to it, and I don't quite understand how this works.
Especially with SG_CHAIN. I'm assuming that you've monitored /proc/vmstat for
nr_foll_pin* ?

  
  	sg_free_table(&umem->sg_head);

  }



thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 3/4] mm/gup: add a range variant of unpin_user_pages_dirty_lock()

2021-02-03 Thread John Hubbard

On 2/3/21 2:00 PM, Joao Martins wrote:

Add a unpin_user_page_range() API which takes a starting page
and how many consecutive pages we want to dirty.

Given that we won't be iterating on a list of changes, change
compound_next() to receive a bool, whether to calculate from the starting
page, or walk the page array. Finally add a separate iterator,


A bool arg is sometimes, but not always, a hint that you really just want
a separate set of routines. Below...


for_each_compound_range() that just operate in page ranges as opposed
to page array.

For users (like RDMA mr_dereg) where each sg represents a
contiguous set of pages, we're able to more efficiently unpin
pages without having to supply an array of pages much of what
happens today with unpin_user_pages().

Suggested-by: Jason Gunthorpe 
Signed-off-by: Joao Martins 
---
  include/linux/mm.h |  2 ++
  mm/gup.c   | 48 ++
  2 files changed, 42 insertions(+), 8 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index a608feb0d42e..b76063f7f18a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1265,6 +1265,8 @@ static inline void put_page(struct page *page)
  void unpin_user_page(struct page *page);
  void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
 bool make_dirty);
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty);
  void unpin_user_pages(struct page **pages, unsigned long npages);
  
  /**

diff --git a/mm/gup.c b/mm/gup.c
index 971a24b4b73f..1b57355d5033 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -215,11 +215,16 @@ void unpin_user_page(struct page *page)
  }
  EXPORT_SYMBOL(unpin_user_page);
  
-static inline unsigned int count_ntails(struct page **pages, unsigned long npages)

+static inline unsigned int count_ntails(struct page **pages,
+   unsigned long npages, bool range)
  {
-   struct page *head = compound_head(pages[0]);
+   struct page *page = pages[0], *head = compound_head(page);
unsigned int ntails;
  
+	if (range)

+   return (!PageCompound(head) || compound_order(head) <= 1) ? 1 :
+  min_t(unsigned int, (head + compound_nr(head) - page), 
npages);


Here, you clearly should use a separate set of _range routines. Because you're 
basically
creating two different routines here! Keep it simple.

Once you're in a separate routine, you might feel more comfortable expanding 
that to
a more readable form, too:

if (!PageCompound(head) || compound_order(head) <= 1)
return 1;

return min_t(unsigned int, (head + compound_nr(head) - page), npages);


thanks,
--
John Hubbard
NVIDIA


+
for (ntails = 1; ntails < npages; ntails++) {
if (compound_head(pages[ntails]) != head)
break;
@@ -229,20 +234,32 @@ static inline unsigned int count_ntails(struct page 
**pages, unsigned long npage
  }
  
  static inline void compound_next(unsigned long i, unsigned long npages,

-struct page **list, struct page **head,
-unsigned int *ntails)
+struct page **list, bool range,
+struct page **head, unsigned int *ntails)
  {
+   struct page *p, **next = &p;
+
if (i >= npages)
return;
  
-	*ntails = count_ntails(list + i, npages - i);

-   *head = compound_head(list[i]);
+   if (range)
+   *next = *list + i;
+   else
+   next = list + i;
+
+   *ntails = count_ntails(next, npages - i, range);
+   *head = compound_head(*next);
  }
  
+#define for_each_compound_range(i, list, npages, head, ntails) \

+   for (i = 0, compound_next(i, npages, list, true, &head, &ntails); \
+i < npages; i += ntails, \
+compound_next(i, npages, list, true, &head, &ntails))
+
  #define for_each_compound_head(i, list, npages, head, ntails) \
-   for (i = 0, compound_next(i, npages, list, &head, &ntails); \
+   for (i = 0, compound_next(i, npages, list, false, &head, &ntails); \
 i < npages; i += ntails, \
-compound_next(i, npages, list, &head, &ntails))
+compound_next(i, npages, list, false, &head, &ntails))
  
  /**

   * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
pages
@@ -306,6 +323,21 @@ void unpin_user_pages_dirty_lock(struct page **pages, 
unsigned long npages,
  }
  EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
  
+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,

+ bool make_dirty)
+{
+   unsigned long index;
+   struct page *head;
+   unsigned int ntails;
+
+   for_each_compound_range(index, &page, npages, head, ntails) {
+   if (make_dirty && !PageDirty(head))
+

Re: [PATCH 3/4] mm/gup: add a range variant of unpin_user_pages_dirty_lock()

2021-02-03 Thread John Hubbard

On 2/3/21 2:00 PM, Joao Martins wrote:
...

+void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
+ bool make_dirty)
+{
+   unsigned long index;
+   struct page *head;
+   unsigned int ntails;
+
+   for_each_compound_range(index, &page, npages, head, ntails) {
+   if (make_dirty && !PageDirty(head))
+   set_page_dirty_lock(head);
+   put_compound_head(head, ntails, FOLL_PIN);
+   }
+}
+EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
+


Also, looking at this again while trying to verify the sg diffs in patch #4, I 
noticed
that the name is tricky. Usually a "range" would not have a single struct page* 
as the
argument. So in this case, perhaps a comment above the function would help, 
something
approximately like this:

/*
 * The "range" in the function name refers to the fact that the page may be a
 * compound page. If so, then that compound page will be treated as one or more
 * ranges of contiguous tail pages.
 */

...I guess. Or maybe the name is just wrong (a comment block explaining a name 
is
always a bad sign). Perhaps we should rename it to something like:

unpin_user_compound_page_dirty_lock()

?

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH 2/4] mm/gup: decrement head page once for group of subpages

2021-02-03 Thread John Hubbard

On 2/3/21 2:00 PM, Joao Martins wrote:

Rather than decrementing the head page refcount one by one, we
walk the page array and checking which belong to the same
compound_head. Later on we decrement the calculated amount
of references in a single write to the head page. To that
end switch to for_each_compound_head() does most of the work.

set_page_dirty() needs no adjustment as it's a nop for
non-dirty head pages and it doesn't operate on tail pages.

This considerably improves unpinning of pages with THP and
hugetlbfs:

- THP
gup_test -t -m 16384 -r 10 [-L|-a] -S -n 512 -w
PIN_LONGTERM_BENCHMARK (put values): ~87.6k us -> ~23.2k us

- 16G with 1G huge page size
gup_test -f /mnt/huge/file -m 16384 -r 10 [-L|-a] -S -n 512 -w
PIN_LONGTERM_BENCHMARK: (put values): ~87.6k us -> ~27.5k us

Signed-off-by: Joao Martins 
---
  mm/gup.c | 29 +++--
  1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 4f88dcef39f2..971a24b4b73f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -270,20 +270,15 @@ void unpin_user_pages_dirty_lock(struct page **pages, 
unsigned long npages,
 bool make_dirty)
  {
unsigned long index;
-
-   /*
-* TODO: this can be optimized for huge pages: if a series of pages is
-* physically contiguous and part of the same compound page, then a
-* single operation to the head page should suffice.
-*/


Great to see this TODO (and the related one below) finally done!

Everything looks correct here.

Reviewed-by: John Hubbard 


thanks,
--
John Hubbard
NVIDIA


+   struct page *head;
+   unsigned int ntails;
  
  	if (!make_dirty) {

unpin_user_pages(pages, npages);
return;
}
  
-	for (index = 0; index < npages; index++) {

-   struct page *page = compound_head(pages[index]);
+   for_each_compound_head(index, pages, npages, head, ntails) {
/*
 * Checking PageDirty at this point may race with
 * clear_page_dirty_for_io(), but that's OK. Two key
@@ -304,9 +299,9 @@ void unpin_user_pages_dirty_lock(struct page **pages, 
unsigned long npages,
 * written back, so it gets written back again in the
 * next writeback cycle. This is harmless.
 */
-   if (!PageDirty(page))
-   set_page_dirty_lock(page);
-   unpin_user_page(page);
+   if (!PageDirty(head))
+   set_page_dirty_lock(head);
+   put_compound_head(head, ntails, FOLL_PIN);
}
  }
  EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
@@ -323,6 +318,8 @@ EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
  void unpin_user_pages(struct page **pages, unsigned long npages)
  {
unsigned long index;
+   struct page *head;
+   unsigned int ntails;
  
  	/*

 * If this WARN_ON() fires, then the system *might* be leaking pages (by
@@ -331,13 +328,9 @@ void unpin_user_pages(struct page **pages, unsigned long 
npages)
 */
if (WARN_ON(IS_ERR_VALUE(npages)))
return;
-   /*
-* TODO: this can be optimized for huge pages: if a series of pages is
-* physically contiguous and part of the same compound page, then a
-* single operation to the head page should suffice.
-*/
-   for (index = 0; index < npages; index++)
-   unpin_user_page(pages[index]);
+
+   for_each_compound_head(index, pages, npages, head, ntails)
+   put_compound_head(head, ntails, FOLL_PIN);
  }
  EXPORT_SYMBOL(unpin_user_pages);
  





Re: [PATCH] mm: cma: support sysfs

2021-02-04 Thread John Hubbard

On 2/3/21 7:50 AM, Minchan Kim wrote:

Since CMA is getting used more widely, it's more important to
keep monitoring CMA statistics for system health since it's
directly related to user experience.

This patch introduces sysfs for CMA and exposes the stats below,
to keep monitoring them for telemetry in the system.

  * the number of CMA allocation attempts
  * the number of CMA allocation failures
  * the number of CMA page allocation attempts
  * the number of CMA page allocation failures


The desire to report CMA data is understandable, but there are a few
odd things here:

1) First of all, this has significant overlap with /sys/kernel/debug/cma
items. I suspect that all of these items could instead go into
/sys/kernel/debug/cma, right?

2) The overall CMA allocation attempts/failures (first two items above) seem
an odd pair of things to track. Maybe that is what was easy to track, but I'd
vote for just omitting them.


Signed-off-by: Minchan Kim 
---
  Documentation/ABI/testing/sysfs-kernel-mm-cma |  39 +
  include/linux/cma.h   |   1 +
  mm/Makefile   |   1 +
  mm/cma.c  |   6 +-
  mm/cma.h  |  20 +++
  mm/cma_sysfs.c| 143 ++
  6 files changed, 209 insertions(+), 1 deletion(-)
  create mode 100644 Documentation/ABI/testing/sysfs-kernel-mm-cma
  create mode 100644 mm/cma_sysfs.c

diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-cma 
b/Documentation/ABI/testing/sysfs-kernel-mm-cma
new file mode 100644
index ..2a43c0aacc39
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-cma
@@ -0,0 +1,39 @@
+What:  /sys/kernel/mm/cma/
+Date:  Feb 2021
+Contact:   Minchan Kim 
+Description:
+   /sys/kernel/mm/cma/ contains a number of subdirectories by
+   cma-heap name. The subdirectory contains a number of files
+   to represent cma allocation statistics.


Somewhere, maybe here, there should be a mention of the closely related
/sys/kernel/debug/cma files.


+
+   There are number of files under
+   /sys/kernel/mm/cma/<cma-heap-name> directory
+
+   - cma_alloc_attempt
+   - cma_alloc_fail


Are these really useful? They are a summary of the alloc_pages items, really.


+   - alloc_pages_attempt
+   - alloc_pages_fail


This should also have "cma" in the name, really: cma_alloc_pages_*.


+
+What:  /sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_attempt
+Date:  Feb 2021
+Contact:   Minchan Kim 
+Description:
+   the number of cma_alloc API attempted
+
+What:  /sys/kernel/mm/cma/<cma-heap-name>/cma_alloc_fail
+Date:  Feb 2021
+Contact:   Minchan Kim 
+Description:
+   the number of CMA_alloc API failed
+
+What:  /sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_attempt
+Date:  Feb 2021
+Contact:   Minchan Kim 
+Description:
+   the number of pages CMA API tried to allocate
+
+What:  /sys/kernel/mm/cma/<cma-heap-name>/alloc_pages_fail
+Date:  Feb 2021
+Contact:   Minchan Kim 
+Description:
+   the number of pages CMA API failed to allocate
diff --git a/include/linux/cma.h b/include/linux/cma.h
index 217999c8a762..71a28a5bb54e 100644
--- a/include/linux/cma.h
+++ b/include/linux/cma.h
@@ -49,4 +49,5 @@ extern struct page *cma_alloc(struct cma *cma, size_t count, 
unsigned int align,
  extern bool cma_release(struct cma *cma, const struct page *pages, unsigned 
int count);
  
  extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data);

+


A single additional blank line seems to be the only change to this file. :)

thanks,
--
John Hubbard
NVIDIA


Re: [PATCH v2 net-next 3/4] net: introduce common dev_page_is_reserved()

2021-01-30 Thread John Hubbard



On 1/30/21 11:45 AM, Alexander Lobakin wrote:

From: Jakub Kicinski 
Date: Sat, 30 Jan 2021 11:07:07 -0800


On Sat, 30 Jan 2021 15:42:29 + Alexander Lobakin wrote:

On Wed, 27 Jan 2021 20:11:23 + Alexander Lobakin wrote:

+ * dev_page_is_reserved - check whether a page can be reused for network Rx
+ * @page: the page to test
+ *
+ * A page shouldn't be considered for reusing/recycling if it was allocated
+ * under memory pressure or at a distant memory node.
+ *
+ * Returns true if this page should be returned to page allocator, false
+ * otherwise.
+ */
+static inline bool dev_page_is_reserved(const struct page *page)


Am I the only one who feels like "reusable" is a better term than
"reserved".


I thought about it, but this will need to inverse the conditions in
most of the drivers. I decided to keep it as it is.
I can redo if "reusable" is preferred.


Naming is hard. As long as the condition is not a double negative it
reads fine to me, but that's probably personal preference.
The thing that doesn't sit well is the fact that there is nothing
"reserved" about a page from another NUMA node.. But again, if nobody
+1s this it's whatever...


Agree on NUMA and naming. I'm a bit surprised that 95% of drivers
have this helper called "reserved" (one of the reasons why I finished
with this variant).
Let's say, if anybody else will vote for "reusable", I'll pick it for
v3.


Definitely "reusable" seems better to me, and especially anything *other*
than "reserved" is a good idea, IMHO.


thanks,
--
John Hubbard
NVIDIA


Re: [PATCH] staging: kpc2000: Convert put_page() to put_user_page*()

2019-07-15 Thread John Hubbard
On 7/15/19 12:52 PM, Bharath Vedartham wrote:
> There have been issues with get_user_pages and filesystem writeback.
> The issues are better described in [1].
> 
> The solution being proposed wants to keep track of gup_pinned pages which 
> will allow to take furthur steps to coordinate between subsystems using gup.
> 
> put_user_page() simply calls put_page inside for now. But the implementation 
> will change once all call sites of put_page() are converted.
> 
> I currently do not have the driver to test. Could I have some suggestions to 
> test this code? The solution is currently implemented in [2] and
> it would be great if we could apply the patch on top of [2] and run some 
> tests to check if any regressions occur.

Hi Bharath,

Process point: the above paragraph, and other meta-questions (about the patch, 
rather than part of the patch) should be placed either after the "---", or in a 
cover letter (git-send-email --cover-letter). That way, the patch itself is in 
a commit-able state.

One more below:

> 
> [1] https://lwn.net/Articles/753027/
> [2] https://github.com/johnhubbard/linux/tree/gup_dma_core
> 
> Cc: Matt Sickler 
> Cc: Greg Kroah-Hartman 
> Cc: Jérôme Glisse 
> Cc: Ira Weiny 
> Cc: John Hubbard 
> Cc: linux...@kvack.org
> Cc: de...@driverdev.osuosl.org
> 
> Signed-off-by: Bharath Vedartham 
> ---
>  drivers/staging/kpc2000/kpc_dma/fileops.c | 8 ++--
>  1 file changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/staging/kpc2000/kpc_dma/fileops.c 
> b/drivers/staging/kpc2000/kpc_dma/fileops.c
> index 6166587..82c70e6 100644
> --- a/drivers/staging/kpc2000/kpc_dma/fileops.c
> +++ b/drivers/staging/kpc2000/kpc_dma/fileops.c
> @@ -198,9 +198,7 @@ int  kpc_dma_transfer(struct dev_private_data *priv, 
> struct kiocb *kcb, unsigned
>   sg_free_table(&acd->sgt);
>   err_dma_map_sg:
>   err_alloc_sg_table:
> - for (i = 0 ; i < acd->page_count ; i++){
> - put_page(acd->user_pages[i]);
> - }
> + put_user_pages(acd->user_pages, acd->page_count);
>   err_get_user_pages:
>   kfree(acd->user_pages);
>   err_alloc_userpages:
> @@ -229,9 +227,7 @@ void  transfer_complete_cb(struct aio_cb_data *acd, 
> size_t xfr_count, u32 flags)
>   
>   dma_unmap_sg(&acd->ldev->pldev->dev, acd->sgt.sgl, acd->sgt.nents, 
> acd->ldev->dir);
>   
> - for (i = 0 ; i < acd->page_count ; i++){
> - put_page(acd->user_pages[i]);
> - }
> + put_user_pages(acd->user_pages, acd->page_count);
>   
>   sg_free_table(>sgt);
>   
> 

Because this is a common pattern, and because the code here doesn't likely need 
to set page dirty before the dma_unmap_sg call, I think the following would be 
better (it's untested), instead of the above diff hunk:

diff --git a/drivers/staging/kpc2000/kpc_dma/fileops.c 
b/drivers/staging/kpc2000/kpc_dma/fileops.c
index 48ca88bc6b0b..d486f9866449 100644
--- a/drivers/staging/kpc2000/kpc_dma/fileops.c
+++ b/drivers/staging/kpc2000/kpc_dma/fileops.c
@@ -211,16 +211,13 @@ void  transfer_complete_cb(struct aio_cb_data *acd, 
size_t xfr_count, u32 flags)
BUG_ON(acd->ldev == NULL);
BUG_ON(acd->ldev->pldev == NULL);
 
-   for (i = 0 ; i < acd->page_count ; i++) {
-   if (!PageReserved(acd->user_pages[i])) {
-   set_page_dirty(acd->user_pages[i]);
-   }
-   }
-
	dma_unmap_sg(&acd->ldev->pldev->dev, acd->sgt.sgl, acd->sgt.nents, 
acd->ldev->dir);
 
for (i = 0 ; i < acd->page_count ; i++) {
-   put_page(acd->user_pages[i]);
+   if (!PageReserved(acd->user_pages[i]))
+   put_user_pages_dirty(&acd->user_pages[i], 1);
+   else
+   put_user_page(acd->user_pages[i]);
}
 
	sg_free_table(&acd->sgt);

Assuming that you make those two changes, you can add:

Reviewed-by: John Hubbard 


thanks,
-- 
John Hubbard
NVIDIA


[PATCH] staging: kpc2000: whitespace and line length cleanup

2019-07-15 Thread john . hubbard
From: John Hubbard 

This commit was created by running indent(1):
`indent -linux`

...and then applying some manual corrections and
cleanup afterward, to keep it sane. No functional changes
were made.

In addition to whitespace changes, some strings were split,
but not strings that were likely to be a grep problem
(in other words, if a user is likely to grep for a string
within the driver, that should still work in most cases).

A few "void * foo" cases were fixed to be "void *foo".

That was enough to make checkpatch.pl run without errors,
although note that there are lots of serious warnings
remaining--but those require functional, not just whitespace
changes. So those are left for a separate patch.

Cc: Greg Kroah-Hartman 
Cc: Simon Sandström 
Cc: Geordan Neukum 
Cc: Jeremy Sowden 
Cc: Dan Carpenter 
Cc: Vandana BN 
Cc: de...@driverdev.osuosl.org
Cc: Bharath Vedartham 
Signed-off-by: John Hubbard 
---
 drivers/staging/kpc2000/kpc2000_i2c.c | 189 +++--
 drivers/staging/kpc2000/kpc2000_spi.c | 116 +-
 drivers/staging/kpc2000/kpc_dma/dma.c | 109 ++
 drivers/staging/kpc2000/kpc_dma/fileops.c | 199 +++---
 .../staging/kpc2000/kpc_dma/kpc_dma_driver.c  | 113 +-
 .../staging/kpc2000/kpc_dma/kpc_dma_driver.h  | 156 +++---
 6 files changed, 509 insertions(+), 373 deletions(-)

diff --git a/drivers/staging/kpc2000/kpc2000_i2c.c 
b/drivers/staging/kpc2000/kpc2000_i2c.c
index b108da4ac633..93fa1858f6b5 100644
--- a/drivers/staging/kpc2000/kpc2000_i2c.c
+++ b/drivers/staging/kpc2000/kpc2000_i2c.c
@@ -33,9 +33,9 @@ MODULE_LICENSE("GPL");
 MODULE_AUTHOR("matt.sick...@daktronics.com");
 
 struct i2c_device {
-   unsigned long   smba;
-   struct i2c_adapter  adapter;
-   unsigned intfeatures;
+   unsigned long   smba;
+   struct i2c_adapter  adapter;
+   unsigned intfeatures;
 };
 
 /*
@@ -52,9 +52,9 @@ struct i2c_device {
 #define SMBHSTDAT0(p)   ((5  * REG_SIZE) + (p)->smba)
 #define SMBHSTDAT1(p)   ((6  * REG_SIZE) + (p)->smba)
 #define SMBBLKDAT(p)((7  * REG_SIZE) + (p)->smba)
-#define SMBPEC(p)   ((8  * REG_SIZE) + (p)->smba)   /* ICH3 and later */
-#define SMBAUXSTS(p)((12 * REG_SIZE) + (p)->smba)   /* ICH4 and later */
-#define SMBAUXCTL(p)((13 * REG_SIZE) + (p)->smba)   /* ICH4 and later */
+#define SMBPEC(p)   ((8  * REG_SIZE) + (p)->smba)  /* ICH3 and later */
+#define SMBAUXSTS(p)((12 * REG_SIZE) + (p)->smba)  /* ICH4 and later */
+#define SMBAUXCTL(p)((13 * REG_SIZE) + (p)->smba)  /* ICH4 and later */
 
 /* PCI Address Constants */
 #define SMBBAR  4
@@ -74,20 +74,20 @@ struct i2c_device {
 
 /* Other settings */
 #define MAX_RETRIES 400
-#define ENABLE_INT9 0   /* set to 0x01 to enable - untested */
+#define ENABLE_INT9 0  /* set to 0x01 to enable - untested */
 
 /* I801 command constants */
 #define I801_QUICK  0x00
 #define I801_BYTE   0x04
 #define I801_BYTE_DATA  0x08
 #define I801_WORD_DATA  0x0C
-#define I801_PROC_CALL  0x10/* unimplemented */
+#define I801_PROC_CALL  0x10   /* unimplemented */
 #define I801_BLOCK_DATA 0x14
-#define I801_I2C_BLOCK_DATA 0x18/* ICH5 and later */
+#define I801_I2C_BLOCK_DATA 0x18   /* ICH5 and later */
 #define I801_BLOCK_LAST 0x34
-#define I801_I2C_BLOCK_LAST 0x38/* ICH5 and later */
+#define I801_I2C_BLOCK_LAST 0x38   /* ICH5 and later */
 #define I801_START  0x40
-#define I801_PEC_EN 0x80/* ICH3 and later */
+#define I801_PEC_EN 0x80   /* ICH3 and later */
 
 /* I801 Hosts Status register bits */
 #define SMBHSTSTS_BYTE_DONE 0x80
@@ -99,7 +99,9 @@ struct i2c_device {
 #define SMBHSTSTS_INTR  0x02
 #define SMBHSTSTS_HOST_BUSY 0x01
 
-#define STATUS_FLAGS(SMBHSTSTS_BYTE_DONE | SMBHSTSTS_FAILED | 
SMBHSTSTS_BUS_ERR | SMBHSTSTS_DEV_ERR | SMBHSTSTS_INTR)
+#define STATUS_FLAGS \
+   (SMBHSTSTS_BYTE_DONE | SMBHSTSTS_FAILED | SMBHSTSTS_BUS_ERR | \
+SMBHSTSTS_DEV_ERR | SMBHSTSTS_INTR)
 
 /* Older devices have their ID defined in  */
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS   0x1c22
@@ -136,17 +138,21 @@ static int i801_check_pre(struct i2c_device *priv)
 
status = inb_p(SMBHSTSTS(priv));
if (status & SMBHSTSTS_HOST_BUSY) {
-   dev_err(&priv->adapter.dev, "SMBus is busy, can't use it! 
(status=%x)\n", status);
+   dev_err(&priv->adapter.dev,
+   "SMBus is busy, can't use it! (status=%x)\n", status);
return -EBUSY;
}
 
status &= STATUS_FLAGS;
if (status) {
-   //dev_dbg(&priv->adapter.dev, "Clearing status flags (%02x)\n", 
status);
+

[PATCH 0/1] staging: kpc2000: whitespace and line length cleanup

2019-07-15 Thread john . hubbard
From: John Hubbard 

Hi everyone,

This is an easy, drive-by cleanup that I did while reviewing Bharath's
changes to convert over to put_user_page(). It should make the code less
obviously non-conforming, and therefore help future reviews and cleanups.

This is on top of latest linux.git, commit fec88ab0af97 ("Merge tag
'for-linus-hmm' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma"),
and it does NOT have Bharath's patch applied, so it conflicts (but should
be trivial to resolve, regardless of which is applied first, as it's just
whitespace).

Cc: Greg Kroah-Hartman 
Cc: Simon Sandström 
Cc: Geordan Neukum 
Cc: Jeremy Sowden 
Cc: Dan Carpenter 
Cc: Vandana BN 
Cc: de...@driverdev.osuosl.org
Cc: Bharath Vedartham 

John Hubbard (1):
  staging: kpc2000: whitespace and line length cleanup

 drivers/staging/kpc2000/kpc2000_i2c.c | 189 +++--
 drivers/staging/kpc2000/kpc2000_spi.c | 116 +-
 drivers/staging/kpc2000/kpc_dma/dma.c | 109 ++
 drivers/staging/kpc2000/kpc_dma/fileops.c | 199 +++---
 .../staging/kpc2000/kpc_dma/kpc_dma_driver.c  | 113 +-
 .../staging/kpc2000/kpc_dma/kpc_dma_driver.h  | 154 +++---
 6 files changed, 507 insertions(+), 373 deletions(-)

-- 
2.22.0


