Re: [PATCH v5 2/4] mm/nvdimm: Add page size and struct page size to pfn superblock

2019-08-09 Thread Aneesh Kumar K.V
"Aneesh Kumar K.V"  writes:

case PFN_MODE_PMEM:
> @@ -475,6 +484,20 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char 
> *sig)
>   align = 1UL << ilog2(offset);
>   mode = le32_to_cpu(pfn_sb->mode);
>  
> + if (le32_to_cpu(pfn_sb->page_size) != PAGE_SIZE) {
> + dev_err(&nd_pfn->dev,
> + "init failed, page size mismatch %d\n",
> + le32_to_cpu(pfn_sb->page_size));
> + return -EOPNOTSUPP;
> + }
> +
> + if (le16_to_cpu(pfn_sb->page_struct_size) < sizeof(struct page)) {
> + dev_err(&nd_pfn->dev,
> + "init failed, struct page size mismatch %d\n",
> + le16_to_cpu(pfn_sb->page_struct_size));
> + return -EOPNOTSUPP;
> + }
> +

Do we need this check here? How about the below instead?

From 9885b2f9ed81a2438fc81507cfcdbdb1aeab756c Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" 
Date: Fri, 9 Aug 2019 22:10:08 +0530
Subject: [PATCH] nvdimm: check struct page size only if pfn node is PMEM

We should do the check only with PFN_MODE_PMEM. If we use
memory for backing the vmemmap, we should be able to enable
the namespace even if the struct page size changes.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/pfn_devs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index f43d1baa6f33..f3e9a4b826da 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -509,7 +509,8 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
return -EOPNOTSUPP;
}
 
-   if (le16_to_cpu(pfn_sb->page_struct_size) < sizeof(struct page)) {
+   if ((le16_to_cpu(pfn_sb->page_struct_size) < sizeof(struct page)) &&
+   (mode == PFN_MODE_PMEM)) {
dev_err(&nd_pfn->dev,
"init failed, struct page size mismatch %d\n",
le16_to_cpu(pfn_sb->page_struct_size));
-- 
2.21.0



Re: [RFC PATCH v2 12/19] mm/gup: Prep put_user_pages() to take an vaddr_pin struct

2019-08-09 Thread John Hubbard
On 8/9/19 3:58 PM, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> Once callers start to use vaddr_pin the put_user_pages calls will need
> to have access to this data coming in.  Prep put_user_pages() for this
> data.
> 
> Signed-off-by: Ira Weiny 
> ---
>  include/linux/mm.h |  20 +---
>  mm/gup.c   | 122 -
>  2 files changed, 88 insertions(+), 54 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index befe150d17be..9d37cafbef9a 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1064,25 +1064,7 @@ static inline void put_page(struct page *page)
>   __put_page(page);
>  }
>  
> -/**
> - * put_user_page() - release a gup-pinned page
> - * @page:pointer to page to be released
> - *
> - * Pages that were pinned via get_user_pages*() must be released via
> - * either put_user_page(), or one of the put_user_pages*() routines
> - * below. This is so that eventually, pages that are pinned via
> - * get_user_pages*() can be separately tracked and uniquely handled. In
> - * particular, interactions with RDMA and filesystems need special
> - * handling.
> - *
> - * put_user_page() and put_page() are not interchangeable, despite this early
> - * implementation that makes them look the same. put_user_page() calls must
> - * be perfectly matched up with get_user_page() calls.
> - */
> -static inline void put_user_page(struct page *page)
> -{
> - put_page(page);
> -}
> +void put_user_page(struct page *page);
>  
>  void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
>  bool make_dirty);
> diff --git a/mm/gup.c b/mm/gup.c
> index a7a9d2f5278c..10cfd30ff668 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -24,30 +24,41 @@
>  
>  #include "internal.h"
>  
> -/**
> - * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
> pages
> - * @pages:  array of pages to be maybe marked dirty, and definitely released.

A couple of comments from our circular review chain: some fellow with the same
last name as you recommended wording it like this:

  @pages:  array of pages to be put

> - * @npages: number of pages in the @pages array.
> - * @make_dirty: whether to mark the pages dirty
> - *
> - * "gup-pinned page" refers to a page that has had one of the 
> get_user_pages()
> - * variants called on that page.
> - *
> - * For each page in the @pages array, make that page (or its head page, if a
> - * compound page) dirty, if @make_dirty is true, and if the page was 
> previously
> - * listed as clean. In any case, releases all pages using put_user_page(),
> - * possibly via put_user_pages(), for the non-dirty case.
> - *
> - * Please see the put_user_page() documentation for details.
> - *
> - * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
> - * required, then the caller should a) verify that this is really correct,
> - * because _lock() is usually required, and b) hand code it:
> - * set_page_dirty_lock(), put_user_page().
> - *
> - */
> -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
> -bool make_dirty)
> +static void __put_user_page(struct vaddr_pin *vaddr_pin, struct page *page)
> +{
> + page = compound_head(page);
> +
> + /*
> +  * For devmap managed pages we need to catch refcount transition from
> +  * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
> +  * page is free and we need to inform the device driver through
> +  * callback. See include/linux/memremap.h and HMM for details.
> +  */
> + if (put_devmap_managed_page(page))
> + return;
> +
> + if (put_page_testzero(page))
> + __put_page(page);
> +}
> +
> +static void __put_user_pages(struct vaddr_pin *vaddr_pin, struct page 
> **pages,
> +  unsigned long npages)
> +{
> + unsigned long index;
> +
> + /*
> +  * TODO: this can be optimized for huge pages: if a series of pages is
> +  * physically contiguous and part of the same compound page, then a
> +  * single operation to the head page should suffice.
> +  */

As discussed in the other review thread (""), let's just delete that comment,
as long as you're moving things around.


> + for (index = 0; index < npages; index++)
> + __put_user_page(vaddr_pin, pages[index]);
> +}
> +
> +static void __put_user_pages_dirty_lock(struct vaddr_pin *vaddr_pin,
> + struct page **pages,
> + unsigned long npages,
> + bool make_dirty)

Elsewhere in this series, vaddr_pin is passed at the end of the arg list.
Here it is passed at the beginning, which was slightly jarring to read.
Obviously just bike shedding at this point, though. Either way. :)
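
(Purely to illustrate the ordering nit -- this prototype is an assumption,
not the patch's final form -- keeping vaddr_pin as the trailing argument
would match the rest of the series:)

/* Illustrative only: vaddr_pin stays last, as elsewhere in the series. */
static void __put_user_pages_dirty_lock(struct page **pages,
					unsigned long npages,
					bool make_dirty,
					struct vaddr_pin *vaddr_pin);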

>  {
>   unsigned long index;
>  
> @@ -58,7 +69,7 @@ void 

Re: [RFC PATCH v2 11/19] mm/gup: Pass follow_page_context further down the call stack

2019-08-09 Thread John Hubbard
On 8/9/19 3:58 PM, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> In preparation for passing more information (vaddr_pin) into
> follow_page_pte(), follow_devmap_pud(), and follow_devmap_pmd().
> 
> Signed-off-by: Ira Weiny 
> ---
>  include/linux/huge_mm.h | 17 -
>  mm/gup.c| 31 +++
>  mm/huge_memory.c|  6 --
>  mm/internal.h   | 28 
>  4 files changed, 47 insertions(+), 35 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 45ede62aa85b..b01a20ce0bb9 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -233,11 +233,6 @@ static inline int hpage_nr_pages(struct page *page)
>   return 1;
>  }
>  
> -struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long 
> addr,
> - pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
> -struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long 
> addr,
> - pud_t *pud, int flags, struct dev_pagemap **pgmap);
> -
>  extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t 
> orig_pmd);
>  
>  extern struct page *huge_zero_page;
> @@ -375,18 +370,6 @@ static inline void mm_put_huge_zero_page(struct 
> mm_struct *mm)
>   return;
>  }
>  
> -static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
> - unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
> -{
> - return NULL;
> -}
> -
> -static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
> - unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
> -{
> - return NULL;
> -}
> -
>  static inline bool thp_migration_supported(void)
>  {
>   return false;
> diff --git a/mm/gup.c b/mm/gup.c
> index 504af3e9a942..a7a9d2f5278c 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -24,11 +24,6 @@
>  
>  #include "internal.h"
>  
> -struct follow_page_context {
> - struct dev_pagemap *pgmap;
> - unsigned int page_mask;
> -};
> -
>  /**
>   * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
> pages
>   * @pages:  array of pages to be maybe marked dirty, and definitely released.
> @@ -172,8 +167,9 @@ static inline bool can_follow_write_pte(pte_t pte, 
> unsigned int flags)
>  
>  static struct page *follow_page_pte(struct vm_area_struct *vma,
>   unsigned long address, pmd_t *pmd, unsigned int flags,
> - struct dev_pagemap **pgmap)
> + struct follow_page_context *ctx)
>  {
> + struct dev_pagemap **pgmap = &ctx->pgmap;
>   struct mm_struct *mm = vma->vm_mm;
>   struct page *page;
>   spinlock_t *ptl;
> @@ -363,13 +359,13 @@ static struct page *follow_pmd_mask(struct 
> vm_area_struct *vma,
>   }
>   if (pmd_devmap(pmdval)) {
>   ptl = pmd_lock(mm, pmd);
> - page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
> + page = follow_devmap_pmd(vma, address, pmd, flags, ctx);
>   spin_unlock(ptl);
>   if (page)
>   return page;
>   }
>   if (likely(!pmd_trans_huge(pmdval)))
> - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> + return follow_page_pte(vma, address, pmd, flags, ctx);
>  
>   if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
>   return no_page_table(vma, flags);
> @@ -389,7 +385,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
> *vma,
>   }
>   if (unlikely(!pmd_trans_huge(*pmd))) {
>   spin_unlock(ptl);
> - return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> + return follow_page_pte(vma, address, pmd, flags, ctx);
>   }
>   if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
>   int ret;
> @@ -419,7 +415,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
> *vma,
>   }
>  
>   return ret ? ERR_PTR(ret) :
> - follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
> + follow_page_pte(vma, address, pmd, flags, ctx);
>   }
>   page = follow_trans_huge_pmd(vma, address, pmd, flags);
>   spin_unlock(ptl);
> @@ -456,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct 
> *vma,
>   }
>   if (pud_devmap(*pud)) {
>   ptl = pud_lock(mm, pud);
> - page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
> + page = follow_devmap_pud(vma, address, pud, flags, ctx);
>   spin_unlock(ptl);
>   if (page)
>   return page;
> @@ -786,7 +782,8 @@ static int check_vma_flags(struct vm_area_struct *vma, 
> unsigned long gup_flags)
>  static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
>   unsigned long start, unsigned long nr_pages,
>   unsigned int gup_flags, struct page **pages,
> -   

Re: [RFC PATCH v2 15/19] mm/gup: Introduce vaddr_pin_pages()

2019-08-09 Thread John Hubbard
On 8/9/19 3:58 PM, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> The addition of FOLL_LONGTERM has taken on additional meaning for CMA
> pages.
> 
> In addition subsystems such as RDMA require new information to be passed
> to the GUP interface to track file owning information.  As such a simple
> FOLL_LONGTERM flag is no longer sufficient for these users to pin pages.
> 
> Introduce a new GUP like call which takes the newly introduced vaddr_pin
> information.  Failure to pass the vaddr_pin object back to a vaddr_put*
> call will result in a failure if pins were created on files during the
> pin operation.
> 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes from list:
>   Change to vaddr_put_pages_dirty_lock
>   Change to vaddr_unpin_pages_dirty_lock
> 
>  include/linux/mm.h |  5 
>  mm/gup.c   | 59 ++
>  2 files changed, 64 insertions(+)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 657c947bda49..90c5802866df 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1603,6 +1603,11 @@ int account_locked_vm(struct mm_struct *mm, unsigned 
> long pages, bool inc);
>  int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
>   struct task_struct *task, bool bypass_rlim);
>  
> +long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
> +  unsigned int gup_flags, struct page **pages,
> +  struct vaddr_pin *vaddr_pin);
> +void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long 
> nr_pages,
> +   struct vaddr_pin *vaddr_pin, bool make_dirty);

Hi Ira,

OK, the API seems fine to me, anyway. :)

A bit more below...

>  bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page 
> *page);
>  
>  /* Container for pinned pfns / pages */
> diff --git a/mm/gup.c b/mm/gup.c
> index eeaa0ddd08a6..6d23f70d7847 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2536,3 +2536,62 @@ int get_user_pages_fast(unsigned long start, int 
> nr_pages,
>   return ret;
>  }
>  EXPORT_SYMBOL_GPL(get_user_pages_fast);
> +
> +/**
> + * vaddr_pin_pages pin pages by virtual address and return the pages to the
> + * user.
> + *
> + * @addr, start address

What's with the commas? I thought kernel-doc wants colons, like this, right?

@addr: start address
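
(For reference, the whole block in conventional kernel-doc colon style would
look roughly like this -- illustrative only, not the final wording:)

/**
 * vaddr_pin_pages() - pin pages by virtual address and return them to the caller
 * @addr:      start address
 * @nr_pages:  number of pages to pin
 * @gup_flags: flags to use for the pin
 * @pages:     array of pages returned
 * @vaddr_pin: initialized meta information this pin is to be associated with
 */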


> + * @nr_pages, number of pages to pin
> + * @gup_flags, flags to use for the pin
> + * @pages, array of pages returned
> + * @vaddr_pin, initialized meta information this pin is to be associated
> + * with.
> + *
> + * NOTE regarding vaddr_pin:
> + *
> + * Some callers can share pins via file descriptors to other processes.
> + * Callers such as this should use the f_owner field of vaddr_pin to indicate
> + * the file the fd points to.  All other callers should use the mm this pin 
> is
> + * being made against.  Usually "current->mm".
> + *
> + * Expects mmap_sem to be read locked.
> + */
> +long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
> +  unsigned int gup_flags, struct page **pages,
> +  struct vaddr_pin *vaddr_pin)
> +{
> + long ret;
> +
> + gup_flags |= FOLL_LONGTERM;


Is now the right time to introduce and use FOLL_PIN? If not, then I can always
add it on top of this later, as part of gup-tracking patches. But you did point
out that FOLL_LONGTERM is taking on additional meaning, and so maybe it's better
to split that meaning up right from the start.
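
(A rough sketch of that split -- FOLL_PIN is not part of this series, so the
flag name and where it gets set are assumptions:)

/* Hypothetical: FOLL_PIN marks a tracked pin, while FOLL_LONGTERM keeps
 * its existing CMA/long-lived meaning. */
gup_flags |= FOLL_PIN | FOLL_LONGTERM;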


> +
> + if (!vaddr_pin || (!vaddr_pin->mm && !vaddr_pin->f_owner))
> + return -EINVAL;
> +
> + ret = __gup_longterm_locked(current,
> + vaddr_pin->mm,
> + addr, nr_pages,
> + pages, NULL, gup_flags,
> + vaddr_pin);
> + return ret;
> +}
> +EXPORT_SYMBOL(vaddr_pin_pages);
> +
> +/**
> + * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
> + *
> + * @pages, array of pages returned
> + * @nr_pages, number of pages in pages
> + * @vaddr_pin, same information passed to vaddr_pin_pages
> + * @make_dirty: whether to mark the pages dirty
> + *
> + * The semantics are similar to put_user_pages_dirty_lock but a vaddr_pin 
> used
> + * in vaddr_pin_pages should be passed back into this call for propper

Typo:
  proper

> + * tracking.
> + */
> +void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long 
> nr_pages,
> +   struct vaddr_pin *vaddr_pin, bool make_dirty)
> +{
> + __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
> +}
> +EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
> 

OK, whew, I'm glad to see the updated _dirty_lock() API used here. :)

thanks,
-- 
John Hubbard
NVIDIA

Re: [RFC PATCH v2 10/19] mm/gup: Pass a NULL vaddr_pin through GUP fast

2019-08-09 Thread John Hubbard
On 8/9/19 3:58 PM, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> Internally GUP fast needs to know that fast users will not support file
> pins.  Pass NULL for vaddr_pin through the fast call stack so that the
> pin code can return an error if it encounters file backed memory within
> the address range.
> 

Reviewed-by: John Hubbard 

thanks,
-- 
John Hubbard
NVIDIA

> Signed-off-by: Ira Weiny 
> ---
>  mm/gup.c | 65 ++--
>  1 file changed, 40 insertions(+), 25 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 7a449500f0a6..504af3e9a942 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1813,7 +1813,8 @@ static inline struct page *try_get_compound_head(struct 
> page *page, int refs)
>  
>  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
>  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
> -  unsigned int flags, struct page **pages, int *nr)
> +  unsigned int flags, struct page **pages, int *nr,
> +  struct vaddr_pin *vaddr_pin)
>  {
>   struct dev_pagemap *pgmap = NULL;
>   int nr_start = *nr, ret = 0;
> @@ -1894,7 +1895,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
> unsigned long end,
>   * useful to have gup_huge_pmd even if we can't operate on ptes.
>   */
>  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
> -  unsigned int flags, struct page **pages, int *nr)
> +  unsigned int flags, struct page **pages, int *nr,
> +  struct vaddr_pin *vaddr_pin)
>  {
>   return 0;
>  }
> @@ -1903,7 +1905,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
> unsigned long end,
>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && 
> defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
>   unsigned long end, struct page **pages, int *nr,
> - unsigned int flags)
> + unsigned int flags, struct vaddr_pin *vaddr_pin)
>  {
>   int nr_start = *nr;
>   struct dev_pagemap *pgmap = NULL;
> @@ -1938,13 +1940,14 @@ static int __gup_device_huge(unsigned long pfn, 
> unsigned long addr,
>  
>  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
>   unsigned long end, struct page **pages, int *nr,
> - unsigned int flags)
> + unsigned int flags, struct vaddr_pin *vaddr_pin)
>  {
>   unsigned long fault_pfn;
>   int nr_start = *nr;
>  
>   fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags))
> + if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags,
> +vaddr_pin))
>   return 0;
>  
>   if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
> @@ -1957,13 +1960,14 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t 
> *pmdp, unsigned long addr,
>  
>  static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
>   unsigned long end, struct page **pages, int *nr,
> - unsigned int flags)
> + unsigned int flags, struct vaddr_pin *vaddr_pin)
>  {
>   unsigned long fault_pfn;
>   int nr_start = *nr;
>  
>   fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags))
> + if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags,
> +vaddr_pin))
>   return 0;
>  
>   if (unlikely(pud_val(orig) != pud_val(*pudp))) {
> @@ -1975,7 +1979,7 @@ static int __gup_device_huge_pud(pud_t orig, pud_t 
> *pudp, unsigned long addr,
>  #else
>  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
>   unsigned long end, struct page **pages, int *nr,
> - unsigned int flags)
> + unsigned int flags, struct vaddr_pin *vaddr_pin)
>  {
>   BUILD_BUG();
>   return 0;
> @@ -1983,7 +1987,7 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t 
> *pmdp, unsigned long addr,
>  
>  static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
>   unsigned long end, struct page **pages, int *nr,
> - unsigned int flags)
> + unsigned int flags, struct vaddr_pin *vaddr_pin)
>  {
>   BUILD_BUG();
>   return 0;
> @@ -2075,7 +2079,8 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned 
> long addr,
>  #endif /* CONFIG_ARCH_HAS_HUGEPD */
>  
>  static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
> - unsigned long end, unsigned int flags, struct page **pages, int 
> *nr)
> + unsigned long end, unsigned int flags, struct page **pages,
> + int *nr, struct vaddr_pin *vaddr_pin)
>  {
>   struct page *head, *page;
>   int refs;
> @@ -2087,7 +2092,7 @@ 

Re: [RFC PATCH v2 09/19] mm/gup: Introduce vaddr_pin structure

2019-08-09 Thread John Hubbard
On 8/9/19 3:58 PM, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> Some subsystems need to pass owning file information to GUP calls to
> allow for GUP to associate the "owning file" to any files being pinned
> within the GUP call.
> 
> Introduce an object to specify this information and pass it down through
> some of the GUP call stack.
> 
> Signed-off-by: Ira Weiny 
> ---
>  include/linux/mm.h |  9 +
>  mm/gup.c   | 36 ++--
>  2 files changed, 31 insertions(+), 14 deletions(-)
> 

Looks good, although you may want to combine it with the next patch. 
Otherwise it feels like a "to be continued" when you're reading them.

Either way, though:

Reviewed-by: John Hubbard 


thanks,
-- 
John Hubbard
NVIDIA

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 04f22722b374..befe150d17be 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -971,6 +971,15 @@ static inline bool is_zone_device_page(const struct page 
> *page)
>  }
>  #endif
>  
> +/**
> + * @f_owner The file who "owns this GUP"
> + * @mm The mm who "owns this GUP"
> + */
> +struct vaddr_pin {
> + struct file *f_owner;
> + struct mm_struct *mm;
> +};
> +
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
>  void __put_devmap_managed_page(struct page *page);
>  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
> diff --git a/mm/gup.c b/mm/gup.c
> index 0b05e22ac05f..7a449500f0a6 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1005,7 +1005,8 @@ static __always_inline long 
> __get_user_pages_locked(struct task_struct *tsk,
>   struct page **pages,
>   struct vm_area_struct **vmas,
>   int *locked,
> - unsigned int flags)
> + unsigned int flags,
> + struct vaddr_pin *vaddr_pin)
>  {
>   long ret, pages_done;
>   bool lock_dropped;
> @@ -1165,7 +1166,8 @@ long get_user_pages_remote(struct task_struct *tsk, 
> struct mm_struct *mm,
>  
>   return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
>  locked,
> -gup_flags | FOLL_TOUCH | FOLL_REMOTE);
> +gup_flags | FOLL_TOUCH | FOLL_REMOTE,
> +NULL);
>  }
>  EXPORT_SYMBOL(get_user_pages_remote);
>  
> @@ -1320,7 +1322,8 @@ static long __get_user_pages_locked(struct task_struct 
> *tsk,
>   struct mm_struct *mm, unsigned long start,
>   unsigned long nr_pages, struct page **pages,
>   struct vm_area_struct **vmas, int *locked,
> - unsigned int foll_flags)
> + unsigned int foll_flags,
> + struct vaddr_pin *vaddr_pin)
>  {
>   struct vm_area_struct *vma;
>   unsigned long vm_flags;
> @@ -1504,7 +1507,7 @@ static long check_and_migrate_cma_pages(struct 
> task_struct *tsk,
>*/
>   nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
>  pages, vmas, NULL,
> -gup_flags);
> +gup_flags, NULL);
>  
>   if ((nr_pages > 0) && migrate_allow) {
>   drain_allow = true;
> @@ -1537,7 +1540,8 @@ static long __gup_longterm_locked(struct task_struct 
> *tsk,
> unsigned long nr_pages,
> struct page **pages,
> struct vm_area_struct **vmas,
> -   unsigned int gup_flags)
> +   unsigned int gup_flags,
> +   struct vaddr_pin *vaddr_pin)
>  {
>   struct vm_area_struct **vmas_tmp = vmas;
>   unsigned long flags = 0;
> @@ -1558,7 +1562,7 @@ static long __gup_longterm_locked(struct task_struct 
> *tsk,
>   }
>  
>   rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
> -  vmas_tmp, NULL, gup_flags);
> +  vmas_tmp, NULL, gup_flags, vaddr_pin);
>  
>   if (gup_flags & FOLL_LONGTERM) {
>   memalloc_nocma_restore(flags);
> @@ -1588,10 +1592,11 @@ static __always_inline long 
> __gup_longterm_locked(struct task_struct *tsk,
> unsigned long nr_pages,
> struct page **pages,
> struct vm_area_struct **vmas,
> -   unsigned int flags)
> +   unsigned int flags,
> +   struct vaddr_pin *vaddr_pin)
>  {
>   return 

Re: [RFC PATCH v2 01/19] fs/locks: Export F_LAYOUT lease to user space

2019-08-09 Thread Dave Chinner
On Fri, Aug 09, 2019 at 03:58:15PM -0700, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> In order to support an opt-in policy for users to allow long term pins
> of FS DAX pages we need to export the LAYOUT lease to user space.
> 
> This is the first of 2 new lease flags which must be used to allow a
> long term pin to be made on a file.
> 
> After the complete series:
> 
> 0) Registrations to Device DAX char devs are not affected
> 
> 1) The user has to opt in to allowing page pins on a file with an exclusive
>layout lease.  Both exclusive and layout lease flags are user visible now.
> 
> 2) page pins will fail if the lease is not active when the file back page is
>encountered.
> 
> 3) Any truncate or hole punch operation on a pinned DAX page will fail.
> 
> 4) The user has the option of holding the lease or releasing it.  If they
>release it no other pin calls will work on the file.
> 
> 5) Closing the file is ok.
> 
> 6) Unmapping the file is ok
> 
> 7) Pins against the files are tracked back to an owning file or an owning mm
>depending on the internal subsystem needs.  With RDMA there is an owning
>file which is related to the pined file.
> 
> 8) Only RDMA is currently supported
> 
> 9) Truncation of pages which are not actively pinned nor covered by a lease
>will succeed.

This has nothing to do with layout leases or what they provide
access arbitration over. Layout leases have _nothing_ to do with
page pinning or RDMA - they arbitrate access to the file offset ->
physical block device mapping within the filesystem and the
behaviour that will occur when a specific lease is held.

The commit description needs to describe what F_LAYOUT actually
protects, when the leases will get broken, etc., not how RDMA is going to use
it.

> @@ -2022,8 +2030,26 @@ static int do_fcntl_add_lease(unsigned int fd, struct 
> file *filp, long arg)
>   struct file_lock *fl;
>   struct fasync_struct *new;
>   int error;
> + unsigned int flags = 0;
> +
> + /*
> +  * NOTE on F_LAYOUT lease
> +  *
> +  * LAYOUT lease types are taken on files which the user knows that
> +  * they will be pinning in memory for some indeterminate amount of
> +  * time.

Indeed, layout leases have nothing to do with pinning of memory.
That's something an application that uses layout leases might do,
but it is largely irrelevant to the functionality layout leases
provide. What needs to be done here is explain what the layout lease
API actually guarantees w.r.t. the physical file layout, not what
some application is going to do with a lease. e.g.

The layout lease F_RDLCK guarantees that the holder will be
notified that the physical file layout is about to be
changed, and that it needs to release any resources it has
over the range of this lease, drop the lease and then
request it again to wait for the kernel to finish whatever
it is doing on that range.

The layout lease F_RDLCK also allows the holder to modify
the physical layout of the file. If an operation from the
lease holder occurs that would modify the layout, that lease
holder does not get notification that a change will occur,
but it will block until all other F_RDLCK leases have been
released by their holders before going ahead.

If there is a F_WRLCK lease held on the file, then a F_RDLCK
holder will fail any operation that may modify the physical
layout of the file. F_WRLCK provides exclusive physical
modification access to the holder, guaranteeing nothing else
will change the layout of the file while it holds the lease.

The F_WRLCK holder can change the physical layout of the
file if it so desires, this will block while F_RDLCK holders
are notified and release their leases before the
modification will take place.

We need to define the semantics we expose to userspace first.
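
(To make that concrete, a hypothetical userspace sequence -- assuming, as this
series proposes, that the lease is requested via fcntl() with the new F_LAYOUT
flag; the exact flag combination and break signalling are assumptions, not
settled API:)

#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/pmem/foo", O_RDWR);

	if (fd < 0)
		err(1, "open");

	/* Hypothetical: take a read layout lease before pinning any pages. */
	if (fcntl(fd, F_SETLEASE, F_RDLCK | F_LAYOUT) < 0)
		err(1, "layout lease denied");

	/* ... pin pages / register an RDMA MR against fd here ... */

	/* On a lease break notification: release the pins, then drop the lease. */
	fcntl(fd, F_SETLEASE, F_UNLCK);
	close(fd);
	return 0;
}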

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [RFC PATCH v2 08/19] fs/xfs: Fail truncate if page lease can't be broken

2019-08-09 Thread Dave Chinner
On Fri, Aug 09, 2019 at 03:58:22PM -0700, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> If pages are under a lease, fail the truncate operation.  We change the order
> of lease breaks to directly fail the operation if the lease exists.
> 
> Select EXPORTFS_BLOCK_OPS for FS_DAX to ensure that xfs_break_lease_layouts() is
> defined for FS_DAX as well as pNFS.
> 
> Signed-off-by: Ira Weiny 
> ---
>  fs/Kconfig| 1 +
>  fs/xfs/xfs_file.c | 5 +++--
>  2 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/Kconfig b/fs/Kconfig
> index 14cd4abdc143..c10b91f92528 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -48,6 +48,7 @@ config FS_DAX
>   select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
>   select FS_IOMAP
>   select DAX
> + select EXPORTFS_BLOCK_OPS
>   help
> Direct Access (DAX) can be used on memory-backed block devices.
> If the block device supports DAX and the filesystem supports DAX,

That looks wrong. If you require xfs_break_lease_layouts() outside
of pnfs context, then move the function in the XFS code base to a
file that is built in. Its only external dependency is on the
break_layout() function, and XFS already has other unconditional
direct calls to break_layout()...

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


[RFC PATCH v2 14/19] fs/locks: Associate file pins while performing GUP

2019-08-09 Thread ira . weiny
From: Ira Weiny 

When a file-backed area is being pinned, add the appropriate file pin
information to the appropriate file or mm owner.  This information can
then be used by admins to determine who is causing a failure to change
the layout of a file.

Signed-off-by: Ira Weiny 
---
 fs/locks.c | 195 -
 include/linux/mm.h |  35 +++-
 mm/gup.c   |   8 +-
 mm/huge_memory.c   |   4 +-
 4 files changed, 230 insertions(+), 12 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 14892c84844b..02c525446d25 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -168,6 +168,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define CREATE_TRACE_POINTS
 #include 
@@ -2972,9 +2973,194 @@ static int __init filelock_init(void)
 }
 core_initcall(filelock_init);
 
+static struct file_file_pin *alloc_file_file_pin(struct inode *inode,
+struct file *file)
+{
+   struct file_file_pin *fp = kzalloc(sizeof(*fp), GFP_ATOMIC);
+
+   if (!fp)
+   return ERR_PTR(-ENOMEM);
+
+   INIT_LIST_HEAD(&fp->list);
+   kref_init(&fp->ref);
+   return fp;
+}
+
+static int add_file_pin_to_f_owner(struct vaddr_pin *vaddr_pin,
+  struct inode *inode,
+  struct file *file)
+{
+   struct file_file_pin *fp;
+
+   list_for_each_entry(fp, &vaddr_pin->f_owner->file_pins, list) {
+   if (fp->file == file) {
+   kref_get(&fp->ref);
+   return 0;
+   }
+   }
+
+   fp = alloc_file_file_pin(inode, file);
+   if (IS_ERR(fp))
+   return PTR_ERR(fp);
+
+   fp->file = get_file(file);
+   /* NOTE no reference needed here.
+* It is expected that the caller holds a reference to the owner file
+* for the duration of this pin.
+*/
+   fp->f_owner = vaddr_pin->f_owner;
+
+   spin_lock(&fp->f_owner->fp_lock);
+   list_add(&fp->list, &fp->f_owner->file_pins);
+   spin_unlock(&fp->f_owner->fp_lock);
+
+   return 0;
+}
+
+static void release_file_file_pin(struct kref *ref)
+{
+   struct file_file_pin *fp = container_of(ref, struct file_file_pin, ref);
+
+   spin_lock(&fp->f_owner->fp_lock);
+   list_del(&fp->list);
+   spin_unlock(&fp->f_owner->fp_lock);
+   fput(fp->file);
+   kfree(fp);
+}
+
+static struct mm_file_pin *alloc_mm_file_pin(struct inode *inode,
+struct file *file)
+{
+   struct mm_file_pin *fp = kzalloc(sizeof(*fp), GFP_ATOMIC);
+
+   if (!fp)
+   return ERR_PTR(-ENOMEM);
+
+   INIT_LIST_HEAD(&fp->list);
+   kref_init(&fp->ref);
+   return fp;
+}
+
+/**
+ * This object bridges files and the mm struct for the purpose of tracking
+ * which files have GUP pins on them.
+ */
+static int add_file_pin_to_mm(struct vaddr_pin *vaddr_pin, struct inode *inode,
+ struct file *file)
+{
+   struct mm_file_pin *fp;
+
+   list_for_each_entry(fp, &vaddr_pin->mm->file_pins, list) {
+   if (fp->inode == inode) {
+   kref_get(&fp->ref);
+   return 0;
+   }
+   }
+
+   fp = alloc_mm_file_pin(inode, file);
+   if (IS_ERR(fp))
+   return PTR_ERR(fp);
+
+   fp->inode = igrab(inode);
+   if (!fp->inode) {
+   kfree(fp);
+   return -EFAULT;
+   }
+
+   fp->file = get_file(file);
+   fp->mm = vaddr_pin->mm;
+   mmgrab(fp->mm);
+
+   spin_lock(&fp->mm->fp_lock);
+   list_add(&fp->list, &fp->mm->file_pins);
+   spin_unlock(&fp->mm->fp_lock);
+
+   return 0;
+}
+
+static void release_mm_file_pin(struct kref *ref)
+{
+   struct mm_file_pin *fp = container_of(ref, struct mm_file_pin, ref);
+
+   spin_lock(&fp->mm->fp_lock);
+   list_del(&fp->list);
+   spin_unlock(&fp->mm->fp_lock);
+
+   mmdrop(fp->mm);
+   fput(fp->file);
+   iput(fp->inode);
+   kfree(fp);
+}
+
+static void remove_file_file_pin(struct vaddr_pin *vaddr_pin)
+{
+   struct file_file_pin *fp;
+   struct file_file_pin *tmp;
+
+   list_for_each_entry_safe(fp, tmp, &vaddr_pin->f_owner->file_pins,
+list) {
+   kref_put(&fp->ref, release_file_file_pin);
+   }
+}
+
+static void remove_mm_file_pin(struct vaddr_pin *vaddr_pin,
+  struct inode *inode)
+{
+   struct mm_file_pin *fp;
+   struct mm_file_pin *tmp;
+
+   list_for_each_entry_safe(fp, tmp, &vaddr_pin->mm->file_pins, list) {
+   if (fp->inode == inode)
+   kref_put(&fp->ref, release_mm_file_pin);
+   }
+}
+
+static bool add_file_pin(struct vaddr_pin *vaddr_pin, struct inode *inode,
+struct file *file)
+{
+   bool ret = true;
+
+   if (!vaddr_pin || (!vaddr_pin->f_owner && !vaddr_pin->mm))
+   return false;
+
+   if (vaddr_pin->f_owner) {
+  

[RFC PATCH v2 15/19] mm/gup: Introduce vaddr_pin_pages()

2019-08-09 Thread ira . weiny
From: Ira Weiny 

The addition of FOLL_LONGTERM has taken on additional meaning for CMA
pages.

In addition, subsystems such as RDMA require new information to be passed
to the GUP interface to track file owning information.  As such, a simple
FOLL_LONGTERM flag is no longer sufficient for these users to pin pages.

Introduce a new GUP-like call which takes the newly introduced vaddr_pin
information.  Failure to pass the vaddr_pin object back to a vaddr_put*
call will result in a failure if pins were created on files during the
pin operation.

Signed-off-by: Ira Weiny 

---
Changes from list:
Change to vaddr_put_pages_dirty_lock
Change to vaddr_unpin_pages_dirty_lock

 include/linux/mm.h |  5 
 mm/gup.c   | 59 ++
 2 files changed, 64 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 657c947bda49..90c5802866df 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1603,6 +1603,11 @@ int account_locked_vm(struct mm_struct *mm, unsigned 
long pages, bool inc);
 int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
 
+long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
+unsigned int gup_flags, struct page **pages,
+struct vaddr_pin *vaddr_pin);
+void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
+ struct vaddr_pin *vaddr_pin, bool make_dirty);
 bool mapping_inode_has_layout(struct vaddr_pin *vaddr_pin, struct page *page);
 
 /* Container for pinned pfns / pages */
diff --git a/mm/gup.c b/mm/gup.c
index eeaa0ddd08a6..6d23f70d7847 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2536,3 +2536,62 @@ int get_user_pages_fast(unsigned long start, int 
nr_pages,
return ret;
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/**
+ * vaddr_pin_pages pin pages by virtual address and return the pages to the
+ * user.
+ *
+ * @addr, start address
+ * @nr_pages, number of pages to pin
+ * @gup_flags, flags to use for the pin
+ * @pages, array of pages returned
+ * @vaddr_pin, initialized meta information this pin is to be associated
+ * with.
+ *
+ * NOTE regarding vaddr_pin:
+ *
+ * Some callers can share pins via file descriptors to other processes.
+ * Callers such as this should use the f_owner field of vaddr_pin to indicate
+ * the file the fd points to.  All other callers should use the mm this pin is
+ * being made against.  Usually "current->mm".
+ *
+ * Expects mmap_sem to be read locked.
+ */
+long vaddr_pin_pages(unsigned long addr, unsigned long nr_pages,
+unsigned int gup_flags, struct page **pages,
+struct vaddr_pin *vaddr_pin)
+{
+   long ret;
+
+   gup_flags |= FOLL_LONGTERM;
+
+   if (!vaddr_pin || (!vaddr_pin->mm && !vaddr_pin->f_owner))
+   return -EINVAL;
+
+   ret = __gup_longterm_locked(current,
+   vaddr_pin->mm,
+   addr, nr_pages,
+   pages, NULL, gup_flags,
+   vaddr_pin);
+   return ret;
+}
+EXPORT_SYMBOL(vaddr_pin_pages);
+
+/**
+ * vaddr_unpin_pages_dirty_lock - counterpart to vaddr_pin_pages
+ *
+ * @pages, array of pages returned
+ * @nr_pages, number of pages in pages
+ * @vaddr_pin, same information passed to vaddr_pin_pages
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * The semantics are similar to put_user_pages_dirty_lock but a vaddr_pin used
+ * in vaddr_pin_pages should be passed back into this call for propper
+ * tracking.
+ */
+void vaddr_unpin_pages_dirty_lock(struct page **pages, unsigned long nr_pages,
+ struct vaddr_pin *vaddr_pin, bool make_dirty)
+{
+   __put_user_pages_dirty_lock(vaddr_pin, pages, nr_pages, make_dirty);
+}
+EXPORT_SYMBOL(vaddr_unpin_pages_dirty_lock);
-- 
2.20.1



[RFC PATCH v2 17/19] RDMA/umem: Convert to vaddr_[pin|unpin]* operations.

2019-08-09 Thread ira . weiny
From: Ira Weiny 

In order to properly track the pinning information we need to keep a
vaddr_pin object around.  Store that within the umem object directly.

The vaddr_pin object allows the GUP code to associate any files it pins
with the RDMA file descriptor associated with this GUP.

Furthermore, use the vaddr_pin object to store the owning mm while we
are at it.

No references need to be taken on the owning file as the lifetime of that
object is tied to all the umems being destroyed first.

Signed-off-by: Ira Weiny 
---
 drivers/infiniband/core/umem.c | 26 +-
 drivers/infiniband/core/umem_odp.c | 16 
 include/rdma/ib_umem.h |  2 +-
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 965cf9dea71a..a9ce3e3816ef 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -54,7 +54,8 @@ static void __ib_umem_release(struct ib_device *dev, struct 
ib_umem *umem, int d
 
for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
page = sg_page_iter_page(&sg_iter);
-   put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
+   vaddr_unpin_pages_dirty_lock(&page, 1, &umem->vaddr_pin,
+umem->writable && dirty);
}
 
sg_free_table(&umem->sg_head);
@@ -243,8 +244,15 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
unsigned long addr,
umem->length = size;
umem->address= addr;
umem->writable   = ib_access_writable(access);
-   umem->owning_mm = mm = current->mm;
-   mmgrab(mm);
+   umem->vaddr_pin.mm = mm = current->mm;
+   mmgrab(umem->vaddr_pin.mm);
+
+   /* No need to get a reference to the core file object here.  The key is
+* that sys_file reference is held by the ufile.  Any duplication of
+* sys_file by the core will keep references active until all those
+* contexts are closed out.  No matter which process hold them open.
+*/
+   umem->vaddr_pin.f_owner = context->ufile->sys_file;
 
if (access & IB_ACCESS_ON_DEMAND) {
if (WARN_ON_ONCE(!context->invalidate_range)) {
@@ -292,11 +300,11 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
unsigned long addr,
 
while (npages) {
down_read(&mm->mmap_sem);
-   ret = get_user_pages(cur_base,
+   ret = vaddr_pin_pages(cur_base,
 min_t(unsigned long, npages,
   PAGE_SIZE / sizeof (struct page *)),
-gup_flags | FOLL_LONGTERM,
-page_list, NULL);
+gup_flags,
+page_list, &umem->vaddr_pin);
if (ret < 0) {
up_read(&mm->mmap_sem);
goto umem_release;
@@ -336,7 +344,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
unsigned long addr,
free_page((unsigned long) page_list);
 umem_kfree:
if (ret) {
-   mmdrop(umem->owning_mm);
+   mmdrop(umem->vaddr_pin.mm);
kfree(umem);
}
return ret ? ERR_PTR(ret) : umem;
@@ -345,7 +353,7 @@ EXPORT_SYMBOL(ib_umem_get);
 
 static void __ib_umem_release_tail(struct ib_umem *umem)
 {
-   mmdrop(umem->owning_mm);
+   mmdrop(umem->vaddr_pin.mm);
if (umem->is_odp)
kfree(to_ib_umem_odp(umem));
else
@@ -369,7 +377,7 @@ void ib_umem_release(struct ib_umem *umem)
 
__ib_umem_release(umem->context->device, umem, 1);
 
-   atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
+   atomic64_sub(ib_umem_num_pages(umem), &umem->vaddr_pin.mm->pinned_vm);
__ib_umem_release_tail(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 2a75c6f8d827..53085896d718 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -278,11 +278,11 @@ static int get_per_mm(struct ib_umem_odp *umem_odp)
 */
mutex_lock(&ctx->per_mm_list_lock);
list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) {
-   if (per_mm->mm == umem_odp->umem.owning_mm)
+   if (per_mm->mm == umem_odp->umem.vaddr_pin.mm)
goto found;
}
 
-   per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm);
+   per_mm = alloc_per_mm(ctx, umem_odp->umem.vaddr_pin.mm);
if (IS_ERR(per_mm)) {
mutex_unlock(&ctx->per_mm_list_lock);
return PTR_ERR(per_mm);
@@ -355,8 +355,8 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_umem_odp 
*root,
umem->writable   = root->umem.writable;
umem->is_odp = 1;
odp_data->per_mm = per_mm;
-   umem->owning_mm  

[RFC PATCH v2 10/19] mm/gup: Pass a NULL vaddr_pin through GUP fast

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Internally GUP fast needs to know that fast users will not support file
pins.  Pass NULL for vaddr_pin through the fast call stack so that the
pin code can return an error if it encounters file backed memory within
the address range.

Signed-off-by: Ira Weiny 
---
 mm/gup.c | 65 ++--
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 7a449500f0a6..504af3e9a942 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1813,7 +1813,8 @@ static inline struct page *try_get_compound_head(struct 
page *page, int refs)
 
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-unsigned int flags, struct page **pages, int *nr)
+unsigned int flags, struct page **pages, int *nr,
+struct vaddr_pin *vaddr_pin)
 {
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1894,7 +1895,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
  * useful to have gup_huge_pmd even if we can't operate on ptes.
  */
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
-unsigned int flags, struct page **pages, int *nr)
+unsigned int flags, struct page **pages, int *nr,
+struct vaddr_pin *vaddr_pin)
 {
return 0;
 }
@@ -1903,7 +1905,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr,
-   unsigned int flags)
+   unsigned int flags, struct vaddr_pin *vaddr_pin)
 {
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
@@ -1938,13 +1940,14 @@ static int __gup_device_huge(unsigned long pfn, 
unsigned long addr,
 
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, struct page **pages, int *nr,
-   unsigned int flags)
+   unsigned int flags, struct vaddr_pin *vaddr_pin)
 {
unsigned long fault_pfn;
int nr_start = *nr;
 
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags))
+   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags,
+  vaddr_pin))
return 0;
 
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
@@ -1957,13 +1960,14 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t 
*pmdp, unsigned long addr,
 
 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
unsigned long end, struct page **pages, int *nr,
-   unsigned int flags)
+   unsigned int flags, struct vaddr_pin *vaddr_pin)
 {
unsigned long fault_pfn;
int nr_start = *nr;
 
fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags))
+   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags,
+  vaddr_pin))
return 0;
 
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
@@ -1975,7 +1979,7 @@ static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, 
unsigned long addr,
 #else
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
unsigned long end, struct page **pages, int *nr,
-   unsigned int flags)
+   unsigned int flags, struct vaddr_pin *vaddr_pin)
 {
BUILD_BUG();
return 0;
@@ -1983,7 +1987,7 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, 
unsigned long addr,
 
 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
unsigned long end, struct page **pages, int *nr,
-   unsigned int flags)
+   unsigned int flags, struct vaddr_pin *vaddr_pin)
 {
BUILD_BUG();
return 0;
@@ -2075,7 +2079,8 @@ static inline int gup_huge_pd(hugepd_t hugepd, unsigned 
long addr,
 #endif /* CONFIG_ARCH_HAS_HUGEPD */
 
 static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-   unsigned long end, unsigned int flags, struct page **pages, int 
*nr)
+   unsigned long end, unsigned int flags, struct page **pages,
+   int *nr, struct vaddr_pin *vaddr_pin)
 {
struct page *head, *page;
int refs;
@@ -2087,7 +2092,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned 
long addr,
if (unlikely(flags & FOLL_LONGTERM))
return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr,
-   

[RFC PATCH v2 07/19] fs/xfs: Teach xfs to use new dax_layout_busy_page()

2019-08-09 Thread ira . weiny
From: Ira Weiny 

dax_layout_busy_page() can now operate on a sub-range of the
address_space provided.

Have xfs specify the sub-range to dax_layout_busy_page().

Signed-off-by: Ira Weiny 
---
 fs/xfs/xfs_file.c  | 19 +--
 fs/xfs/xfs_inode.h |  5 +++--
 fs/xfs/xfs_ioctl.c | 15 ---
 fs/xfs/xfs_iops.c  | 14 ++
 4 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 8f8d478f9ec6..447571e3cb02 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -295,7 +295,11 @@ xfs_file_aio_write_checks(
if (error <= 0)
return error;
 
-   error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
+   /*
+* BREAK_WRITE ignores the offset/len tuple, so just specify the whole
+* file (0 - ULONG_MAX) to be safe.
+*/
+   error = xfs_break_layouts(inode, iolock, 0, ULONG_MAX, BREAK_WRITE);
if (error)
return error;
 
@@ -734,14 +738,15 @@ xfs_wait_dax_page(
 static int
 xfs_break_dax_layouts(
struct inode*inode,
-   bool*retry)
+   bool*retry,
+   loff_t   off,
+   loff_t   len)
 {
struct page *page;
 
ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
 
-   /* We default to the "whole file" */
-   page = dax_layout_busy_page(inode->i_mapping, 0, ULONG_MAX);
+   page = dax_layout_busy_page(inode->i_mapping, off, len);
if (!page)
return 0;
 
@@ -755,6 +760,8 @@ int
 xfs_break_layouts(
struct inode*inode,
uint*iolock,
+   loff_t   off,
+   loff_t   len,
enum layout_break_reason reason)
 {
boolretry;
@@ -766,7 +773,7 @@ xfs_break_layouts(
retry = false;
switch (reason) {
case BREAK_UNMAP:
-   error = xfs_break_dax_layouts(inode, &retry);
+   error = xfs_break_dax_layouts(inode, &retry, off, len);
if (error || retry)
break;
/* fall through */
@@ -808,7 +815,7 @@ xfs_file_fallocate(
return -EOPNOTSUPP;
 
xfs_ilock(ip, iolock);
-   error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
+   error = xfs_break_layouts(inode, &iolock, offset, len, BREAK_UNMAP);
if (error)
goto out_unlock;
 
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 558173f95a03..1b0948f5267c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -475,8 +475,9 @@ enum xfs_prealloc_flags {
 
 intxfs_update_prealloc_flags(struct xfs_inode *ip,
  enum xfs_prealloc_flags flags);
-intxfs_break_layouts(struct inode *inode, uint *iolock,
-   enum layout_break_reason reason);
+int xfs_break_layouts(struct inode *inode, uint *iolock,
+ loff_t off, loff_t len,
+ enum layout_break_reason reason);
 
 /* from xfs_iops.c */
 extern void xfs_setup_inode(struct xfs_inode *ip);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6f7848cd5527..3897b88080bd 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -597,6 +597,7 @@ xfs_ioc_space(
enum xfs_prealloc_flags flags = 0;
uintiolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
+   loff_t  break_length;
 
if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
return -EPERM;
@@ -617,9 +618,6 @@ xfs_ioc_space(
return error;
 
xfs_ilock(ip, iolock);
-   error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
-   if (error)
-   goto out_unlock;
 
switch (bf->l_whence) {
case 0: /*SEEK_SET*/
@@ -665,6 +663,17 @@ xfs_ioc_space(
goto out_unlock;
}
 
+   /* break layout for the whole file if len ends up 0 */
+   if (bf->l_len == 0)
+   break_length = ULONG_MAX;
+   else
+   break_length = bf->l_len;
+
+   error = xfs_break_layouts(inode, &iolock, bf->l_start, break_length,
+ BREAK_UNMAP);
+   if (error)
+   goto out_unlock;
+
switch (cmd) {
case XFS_IOC_ZERO_RANGE:
flags |= XFS_PREALLOC_SET;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ff3c1fae5357..f0de5486f6c1 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1042,10 +1042,16 @@ xfs_vn_setattr(
xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
 
-   error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
-   if (error) {
-   xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
-   return error;
+

[RFC PATCH v2 08/19] fs/xfs: Fail truncate if page lease can't be broken

2019-08-09 Thread ira . weiny
From: Ira Weiny 

If pages are under a lease, fail the truncate operation.  We change the order of
lease breaks to directly fail the operation if the lease exists.

Select EXPORTFS_BLOCK_OPS for FS_DAX to ensure that xfs_break_lease_layouts() is
defined for FS_DAX as well as pNFS.

Signed-off-by: Ira Weiny 
---
 fs/Kconfig| 1 +
 fs/xfs/xfs_file.c | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/Kconfig b/fs/Kconfig
index 14cd4abdc143..c10b91f92528 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,6 +48,7 @@ config FS_DAX
select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED)
select FS_IOMAP
select DAX
+   select EXPORTFS_BLOCK_OPS
help
  Direct Access (DAX) can be used on memory-backed block devices.
  If the block device supports DAX and the filesystem supports DAX,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 447571e3cb02..850d0a0953a2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -773,10 +773,11 @@ xfs_break_layouts(
retry = false;
switch (reason) {
case BREAK_UNMAP:
-   error = xfs_break_dax_layouts(inode, &retry, off, len);
+   error = xfs_break_leased_layouts(inode, iolock, &retry);
if (error || retry)
break;
-   /* fall through */
+   error = xfs_break_dax_layouts(inode, &retry, off, len);
+   break;
case BREAK_WRITE:
error = xfs_break_leased_layouts(inode, iolock, &retry);
break;
-- 
2.20.1



[RFC PATCH v2 00/19] RDMA/FS DAX truncate proposal V1,000,002 ;-)

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Pre-requisites
==
Based on mmotm tree.

Based on the feedback from LSFmm, the LWN article, the RFC series since
then, and a ton of scenarios I've worked in my mind and/or tested...[1]

Solution summary


The real issue is that there is no use case for a user to have RDMA pinned
memory which is then truncated.  So really any solution we present which:

A) Prevents file system corruption or data leaks
...and...
B) Informs the user that they did something wrong

Should be an acceptable solution.

Because this is slightly new behavior, and because it is going to be
specific to DAX (because of the lack of a page cache), we have made the user
"opt in" to this behavior.

The following patches implement the following solution.

0) Registrations to Device DAX char devs are not affected

1) The user has to opt in to allowing page pins on a file with an exclusive
   layout lease.  Both exclusive and layout lease flags are user visible now.

2) page pins will fail if the lease is not active when the file back page is
   encountered.

3) Any truncate or hole punch operation on a pinned DAX page will fail.

4) The user has the option of holding the lease or releasing it.  If they
   release it no other pin calls will work on the file.

5) Closing the file is ok.

6) Unmapping the file is ok

7) Pins against the files are tracked back to an owning file or an owning mm
   depending on the internal subsystem needs.  With RDMA there is an owning
   file which is related to the pined file.

8) Only RDMA is currently supported

9) Truncation of pages which are not actively pinned nor covered by a lease
   will succeed.


Reporting of pinned files in procfs
===

A number of alternatives were explored for how to report the file pins within
procfs.  The following incorporates ideas from Jan Kara, Jason Gunthorpe, Dave
Chinner, Dan Williams and myself.

A new entry is added to procfs

/proc//file_pins

For processes which have pinned DAX file memory, file_pins references come in 2
flavors: those which are attached to another open file descriptor (for example,
what is done in the RDMA subsystem) and those which are attached to a process
mm.

For those which are attached to another open file descriptor (such as RDMA)
the file pin references go through the 'struct file' associated with that pin.
In RDMA this is the RDMA context struct file.

The resulting output from proc fs is something like.

$ cat /proc//file_pins
3: /dev/infiniband/uverbs0
/mnt/pmem/foo

Where '3' is the file descriptor (and file path) of the RDMA context within the
process.  The paths of the files pinned using that context are then listed.

RDMA contexts may have multiple MR each of which may have multiple files pinned
within them.  So an output like the following is possible.

$ cat /proc//file_pins
4: /dev/infiniband/uverbs0
/mnt/pmem/foo
/mnt/pmem/bar
/mnt/pmem/another
/mnt/pmem/one

The actual memory regions associated with the file pins are not reported.

For processes which are pinning memory which is not associated with a specific
file descriptor memory pins are reported directly as paths to the file.

$ cat /proc//file_pins
/mnt/pmem/foo

Putting the above together, if a process was using RDMA and another subsystem,
the output could be something like:


$ cat /proc//file_pins
4: /dev/infiniband/uverbs0
/mnt/pmem/foo
/mnt/pmem/bar
/mnt/pmem/another
/mnt/pmem/one
/mnt/pmem/foo
/mnt/pmem/another
/mnt/pmem/mm_mapped_file


[1] https://lkml.org/lkml/2019/6/5/1046


Background
==

It should be noted that one solution for this problem is to use RDMA's On
Demand Paging (ODP).  There are 2 big reasons this may not work.

1) The hardware being used for RDMA may not support ODP
2) ODP may be detrimental to the overall network (cluster or cloud)
   performance

Therefore, in order to support RDMA to File system pages without On Demand
Paging (ODP) a number of things need to be done.

1) "longterm" GUP users need to inform other subsystems that they have taken a
   pin on a page which may remain pinned for a very "long time".  The
   definition of long time is debatable but it has been established that RDMA's
   use of pages for minutes, hours, or even days after the pin is the extreme
   case which makes this problem most severe.

2) Any page which is "controlled" by a file system needs to have special
   handling.  The details of the handling depends on if the page is page cache
   fronted or not.

   2a) A page cache fronted page which has been pinned by GUP long term can use a
   bounce buffer to allow the file system to write back snapshots of the page.
   This is handled by the FS recognizing the GUP long term pin and making a copy
   of the page to be written back.
NOTE: this patch set does not address this path.

   2b) A FS "controlled" page which is not page 

[RFC PATCH v2 13/19] {mm,file}: Add file_pins objects

2019-08-09 Thread ira . weiny
From: Ira Weiny 

User page pins (aka GUP) need to track file information of files being
pinned by those calls.  Depending on the needs of the caller this
information is stored in 1 of 2 ways.

1) Some subsystems like RDMA associate GUP pins with file descriptors
   which can be passed around to other processes.  In this case a file
   being pinned must be associated with an owning file object (which can
   then be resolved back to any of the processes which have a file
   descriptor 'pointing' to that file object).

2) Other subsystems do not have an owning file and can therefore
   associate the file pin directly to the mm of the process which
   created them.

This patch introduces the new file pin structures and ensures struct
file and struct mm_struct are prepared to store them.

In subsequent patches the required information will be passed into new
pin page calls and procfs is enhanced to show this information to the user.
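
As an aside, a rough sketch of the expected life cycle of a file_file_pin
(not part of this patch; the helper name is made up for illustration, the
fields are the ones introduced below):

static struct file_file_pin *example_get_file_pin(struct file *f_owner,
                                                  struct file *pinned)
{
        struct file_file_pin *fp;

        spin_lock(&f_owner->fp_lock);
        list_for_each_entry(fp, &f_owner->file_pins, list) {
                if (fp->file == pinned) {
                        kref_get(&fp->ref);     /* another page pinned */
                        goto out;
                }
        }

        fp = kzalloc(sizeof(*fp), GFP_ATOMIC);
        if (fp) {
                fp->file = get_file(pinned);
                fp->f_owner = f_owner;
                kref_init(&fp->ref);
                list_add(&fp->list, &f_owner->file_pins);
        }
out:
        spin_unlock(&f_owner->fp_lock);
        return fp;
}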

Signed-off-by: Ira Weiny 
---
 fs/file_table.c  |  4 
 include/linux/file.h | 49 
 include/linux/fs.h   |  2 ++
 include/linux/mm_types.h |  2 ++
 kernel/fork.c|  3 +++
 5 files changed, 60 insertions(+)

diff --git a/fs/file_table.c b/fs/file_table.c
index b07b53f24ff5..38947b9a4769 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -46,6 +46,7 @@ static void file_free_rcu(struct rcu_head *head)
 {
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
 
+   WARN_ON(!list_empty(&f->file_pins));
put_cred(f->f_cred);
kmem_cache_free(filp_cachep, f);
 }
@@ -118,6 +119,9 @@ static struct file *__alloc_file(int flags, const struct 
cred *cred)
f->f_mode = OPEN_FMODE(flags);
/* f->f_version: 0 */
 
+   INIT_LIST_HEAD(&f->file_pins);
+   spin_lock_init(&f->fp_lock);
+
return f;
 }
 
diff --git a/include/linux/file.h b/include/linux/file.h
index 3fcddff56bc4..cd79adad5b23 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct file;
 
@@ -91,4 +92,52 @@ extern void fd_install(unsigned int fd, struct file *file);
 extern void flush_delayed_fput(void);
 extern void __fput_sync(struct file *);
 
+/**
+ * struct file_file_pin
+ *
+ * Associate a pin'ed file with another file owner.
+ *
+ * Subsystems such as RDMA have the ability to pin memory which is associated
+ * with a file descriptor which can be passed to other processes without
+ * necessarily having that memory accessed in the remote processes address
+ * space.
+ *
+ * @file file backing memory which was pinned by a GUP caller
+ * @f_owner the file representing the GUP owner
+ * @list of all file pins this owner has
+ *   (struct file *)->file_pins
+ * @ref number of times this pin was taken (roughly the number of pages pinned
+ *  in the file)
+ */
+struct file_file_pin {
+   struct file *file;
+   struct file *f_owner;
+   struct list_head list;
+   struct kref ref;
+};
+
+/*
+ * struct mm_file_pin
+ *
+ * Some GUP callers do not have an "owning" file.  Those pins are accounted for
+ * in the mm of the process that called GUP.
+ *
+ * The tuple {file, inode} is used to track this as a unique file pin and to
+ * track when this pin has been removed.
+ *
+ * @file file backing memory which was pinned by a GUP caller
+ * @mm back point to owning mm
+ * @inode backing the file
+ * @list of all file pins this owner has
+ *   (struct mm_struct *)->file_pins
+ * @ref number of times this pin was taken
+ */
+struct mm_file_pin {
+   struct file *file;
+   struct mm_struct *mm;
+   struct inode *inode;
+   struct list_head list;
+   struct kref ref;
+};
+
 #endif /* __LINUX_FILE_H */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2e41ce547913..d2e08feb9737 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -963,6 +963,8 @@ struct file {
 #endif /* #ifdef CONFIG_EPOLL */
struct address_space*f_mapping;
errseq_tf_wb_err;
+   struct list_headfile_pins;
+   spinlock_t  fp_lock;
 } __randomize_layout
   __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 6a7a1083b6fb..4f6ea4acddbd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -516,6 +516,8 @@ struct mm_struct {
/* HMM needs to track a few things per mm */
struct hmm *hmm;
 #endif
+   struct list_head file_pins;
+   spinlock_t fp_lock; /* lock file_pins */
} __randomize_layout;
 
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 0e2f9a2c132c..093f2f2fce1a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -675,6 +675,7 @@ void __mmdrop(struct mm_struct *mm)
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
WARN_ON_ONCE(mm == 

[RFC PATCH v2 01/19] fs/locks: Export F_LAYOUT lease to user space

2019-08-09 Thread ira . weiny
From: Ira Weiny 

In order to support an opt-in policy for users to allow long term pins
of FS DAX pages we need to export the LAYOUT lease to user space.

This is the first of 2 new lease flags which must be used to allow a
long term pin to be made on a file.

After the complete series:

0) Registrations to Device DAX char devs are not affected

1) The user has to opt in to allowing page pins on a file with an exclusive
   layout lease.  Both exclusive and layout lease flags are user visible now.

2) page pins will fail if the lease is not active when the file back page is
   encountered.

3) Any truncate or hole punch operation on a pinned DAX page will fail.

4) The user has the option of holding the lease or releasing it.  If they
   release it no other pin calls will work on the file.

5) Closing the file is ok.

6) Unmapping the file is ok

7) Pins against the files are tracked back to an owning file or an owning mm
   depending on the internal subsystem needs.  With RDMA there is an owning
   file which is related to the pinned file.

8) Only RDMA is currently supported

9) Truncation of pages which are not actively pinned nor covered by a lease
   will succeed.
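
For reference, a minimal sketch of the user space opt-in (illustrative only;
it assumes the F_LAYOUT value exported by this patch and a file on an FS DAX
mount; patch 2 additionally allows OR-ing in F_EXCLUSIVE):

#include <err.h>
#include <fcntl.h>      /* F_LAYOUT comes from the updated uapi header */
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/pmem/foo", O_RDWR);

        if (fd < 0)
                err(1, "open");

        /* Opt in: take the layout lease before any long term pin is made. */
        if (fcntl(fd, F_SETLEASE, F_LAYOUT) < 0)
                err(1, "F_SETLEASE (F_LAYOUT)");

        /* ... register RDMA MRs / take long term GUP pins here ... */

        /* Releasing the lease means further pin attempts on the file fail. */
        fcntl(fd, F_SETLEASE, F_UNLCK);
        close(fd);
        return 0;
}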

Signed-off-by: Ira Weiny 
---
 fs/locks.c   | 36 +++-
 include/linux/fs.h   |  2 +-
 include/uapi/asm-generic/fcntl.h |  3 +++
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 24d1db632f6c..ad17c6ffca06 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -191,6 +191,8 @@ static int target_leasetype(struct file_lock *fl)
return F_UNLCK;
if (fl->fl_flags & FL_DOWNGRADE_PENDING)
return F_RDLCK;
+   if (fl->fl_flags & FL_LAYOUT)
+   return F_LAYOUT;
return fl->fl_type;
 }
 
@@ -611,7 +613,8 @@ static const struct lock_manager_operations 
lease_manager_ops = {
 /*
  * Initialize a lease, use the default lock manager operations
  */
-static int lease_init(struct file *filp, long type, struct file_lock *fl)
+static int lease_init(struct file *filp, long type, unsigned int flags,
+ struct file_lock *fl)
 {
if (assign_type(fl, type) != 0)
return -EINVAL;
@@ -621,6 +624,8 @@ static int lease_init(struct file *filp, long type, struct 
file_lock *fl)
 
fl->fl_file = filp;
fl->fl_flags = FL_LEASE;
+   if (flags & FL_LAYOUT)
+   fl->fl_flags |= FL_LAYOUT;
fl->fl_start = 0;
fl->fl_end = OFFSET_MAX;
fl->fl_ops = NULL;
@@ -629,7 +634,8 @@ static int lease_init(struct file *filp, long type, struct 
file_lock *fl)
 }
 
 /* Allocate a file_lock initialised to this type of lease */
-static struct file_lock *lease_alloc(struct file *filp, long type)
+static struct file_lock *lease_alloc(struct file *filp, long type,
+unsigned int flags)
 {
struct file_lock *fl = locks_alloc_lock();
int error = -ENOMEM;
@@ -637,7 +643,7 @@ static struct file_lock *lease_alloc(struct file *filp, 
long type)
if (fl == NULL)
return ERR_PTR(error);
 
-   error = lease_init(filp, type, fl);
+   error = lease_init(filp, type, flags, fl);
if (error) {
locks_free_lock(fl);
return ERR_PTR(error);
@@ -1583,7 +1589,7 @@ int __break_lease(struct inode *inode, unsigned int mode, 
unsigned int type)
int want_write = (mode & O_ACCMODE) != O_RDONLY;
LIST_HEAD(dispose);
 
-   new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
+   new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK, 0);
if (IS_ERR(new_fl))
return PTR_ERR(new_fl);
new_fl->fl_flags = type;
@@ -1720,6 +1726,8 @@ EXPORT_SYMBOL(lease_get_mtime);
  *
  * %F_UNLCK to indicate no lease is held.
  *
+ * %F_LAYOUT to indicate a layout lease is held.
+ *
  * (if a lease break is pending):
  *
  * %F_RDLCK to indicate an exclusive lease needs to be
@@ -2022,8 +2030,26 @@ static int do_fcntl_add_lease(unsigned int fd, struct 
file *filp, long arg)
struct file_lock *fl;
struct fasync_struct *new;
int error;
+   unsigned int flags = 0;
+
+   /*
+* NOTE on F_LAYOUT lease
+*
+* LAYOUT lease types are taken on files which the user knows that
+* they will be pinning in memory for some indeterminate amount of
+* time.  Such as for use with RDMA.  While we don't know what user
+* space is going to do with the file we still use a F_RDLOCK level of
+* lease.  This ensures that there are no conflicts between
+* 2 users.  The conflict should only come from the File system wanting
+* to revoke the lease in break_layout()  And this is done by using
+* F_WRLCK in the break code.
+*/
+   if (arg == F_LAYOUT) {
+   arg = F_RDLCK;
+   flags = 

[RFC PATCH v2 19/19] mm/gup: Remove FOLL_LONGTERM DAX exclusion

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Now that there is a mechanism for users to safely take LONGTERM pins on
FS DAX pages, remove the FS DAX exclusion from the GUP implementation.

Special processing remains in effect for CONFIG_CMA

NOTE: Some callers still fail because the vaddr_pin information has not
been passed into the new interface.  As new users appear they can start
to use the new interface to support FS DAX.

Signed-off-by: Ira Weiny 
---
 mm/gup.c | 78 ++--
 1 file changed, 8 insertions(+), 70 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index 6d23f70d7847..58f008a3c153 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1415,26 +1415,6 @@ static long __get_user_pages_locked(struct task_struct 
*tsk,
 }
 #endif /* !CONFIG_MMU */
 
-#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
-   long i;
-   struct vm_area_struct *vma_prev = NULL;
-
-   for (i = 0; i < nr_pages; i++) {
-   struct vm_area_struct *vma = vmas[i];
-
-   if (vma == vma_prev)
-   continue;
-
-   vma_prev = vma;
-
-   if (vma_is_fsdax(vma))
-   return true;
-   }
-   return false;
-}
-
 #ifdef CONFIG_CMA
 static struct page *new_non_cma_page(struct page *page, unsigned long private)
 {
@@ -1568,18 +1548,6 @@ static long check_and_migrate_cma_pages(struct 
task_struct *tsk,
 
return nr_pages;
 }
-#else
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
-   struct mm_struct *mm,
-   unsigned long start,
-   unsigned long nr_pages,
-   struct page **pages,
-   struct vm_area_struct **vmas,
-   unsigned int gup_flags)
-{
-   return nr_pages;
-}
-#endif /* CONFIG_CMA */
 
 /*
  * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
@@ -1594,49 +1562,28 @@ static long __gup_longterm_locked(struct task_struct 
*tsk,
  unsigned int gup_flags,
  struct vaddr_pin *vaddr_pin)
 {
-   struct vm_area_struct **vmas_tmp = vmas;
unsigned long flags = 0;
-   long rc, i;
+   long rc;
 
-   if (gup_flags & FOLL_LONGTERM) {
-   if (!pages)
-   return -EINVAL;
-
-   if (!vmas_tmp) {
-   vmas_tmp = kcalloc(nr_pages,
-  sizeof(struct vm_area_struct *),
-  GFP_KERNEL);
-   if (!vmas_tmp)
-   return -ENOMEM;
-   }
+   if (flags & FOLL_LONGTERM)
flags = memalloc_nocma_save();
-   }
 
rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
-vmas_tmp, NULL, gup_flags, vaddr_pin);
+vmas, NULL, gup_flags, vaddr_pin);
 
if (gup_flags & FOLL_LONGTERM) {
memalloc_nocma_restore(flags);
if (rc < 0)
goto out;
 
-   if (check_dax_vmas(vmas_tmp, rc)) {
-   for (i = 0; i < rc; i++)
-   put_page(pages[i]);
-   rc = -EOPNOTSUPP;
-   goto out;
-   }
-
rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
-vmas_tmp, gup_flags);
+vmas, gup_flags);
}
 
 out:
-   if (vmas_tmp != vmas)
-   kfree(vmas_tmp);
return rc;
 }
-#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+#else /* !CONFIG_CMA */
 static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
  struct mm_struct *mm,
  unsigned long start,
@@ -1649,7 +1596,7 @@ static __always_inline long __gup_longterm_locked(struct 
task_struct *tsk,
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
   NULL, flags, vaddr_pin);
 }
-#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+#endif /* CONFIG_CMA */
 
 /*
  * This is the same as get_user_pages_remote(), just with a
@@ -1887,9 +1834,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
goto pte_unmap;
 
if (pte_devmap(pte)) {
-   if (unlikely(flags & FOLL_LONGTERM))
-   goto pte_unmap;
-
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {

[RFC PATCH v2 16/19] RDMA/uverbs: Add back pointer to system file object

2019-08-09 Thread ira . weiny
From: Ira Weiny 

In order for MRs to be tracked against the open verbs context the ufile
needs to have a pointer to hand to the GUP code.

No references need to be taken as this should be valid for the lifetime
of the context.

Signed-off-by: Ira Weiny 
---
 drivers/infiniband/core/uverbs.h  | 1 +
 drivers/infiniband/core/uverbs_main.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
index 1e5aeb39f774..e802ba8c67d6 100644
--- a/drivers/infiniband/core/uverbs.h
+++ b/drivers/infiniband/core/uverbs.h
@@ -163,6 +163,7 @@ struct ib_uverbs_file {
struct page *disassociate_page;
 
struct xarray   idr;
+   struct file *sys_file; /* backpointer to system file object 
*/
 };
 
 struct ib_uverbs_event {
diff --git a/drivers/infiniband/core/uverbs_main.c 
b/drivers/infiniband/core/uverbs_main.c
index 11c13c1381cf..002c24e0d4db 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1092,6 +1092,7 @@ static int ib_uverbs_open(struct inode *inode, struct 
file *filp)
INIT_LIST_HEAD(&file->umaps);
 
filp->private_data = file;
+   file->sys_file = filp;
list_add_tail(&file->list, &dev->uverbs_file_list);
mutex_unlock(>lists_mutex);
srcu_read_unlock(>disassociate_srcu, srcu_key);
-- 
2.20.1

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[RFC PATCH v2 18/19] {mm,procfs}: Add display file_pins proc

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Now that we have the file pin information stored, add a new procfs entry
to display it to the user.

NOTE: output will be dependent on what the file pin is tied to.  Some
processes may have the pin associated with a file descriptor, in which
case that file is reported as well.

Others are associated directly with the process mm and are reported as
such.

For example, for a file pinned to an RDMA open context (fd 4) and a file
pinned to the mm of that process:

4: /dev/infiniband/uverbs0
   /mnt/pmem/foo
/mnt/pmem/bar

Signed-off-by: Ira Weiny 
---
 fs/proc/base.c | 214 +
 1 file changed, 214 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ebea9501afb8..f4d219172235 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2995,6 +2995,7 @@ static int proc_stack_depth(struct seq_file *m, struct 
pid_namespace *ns,
  */
 static const struct file_operations proc_task_operations;
 static const struct inode_operations proc_task_inode_operations;
+static const struct file_operations proc_pid_file_pins_operations;
 
 static const struct pid_entry tgid_base_stuff[] = {
DIR("task",   S_IRUGO|S_IXUGO, proc_task_inode_operations, 
proc_task_operations),
@@ -3024,6 +3025,7 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("stat",   S_IRUGO, proc_tgid_stat),
ONE("statm",  S_IRUGO, proc_pid_statm),
REG("maps",   S_IRUGO, proc_pid_maps_operations),
+   REG("file_pins",  S_IRUGO, proc_pid_file_pins_operations),
 #ifdef CONFIG_NUMA
REG("numa_maps",  S_IRUGO, proc_pid_numa_maps_operations),
 #endif
@@ -3422,6 +3424,7 @@ static const struct pid_entry tid_base_stuff[] = {
ONE("stat",  S_IRUGO, proc_tid_stat),
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps",  S_IRUGO, proc_pid_maps_operations),
+   REG("file_pins", S_IRUGO, proc_pid_file_pins_operations),
 #ifdef CONFIG_PROC_CHILDREN
REG("children",  S_IRUGO, proc_tid_children_operations),
 #endif
@@ -3718,3 +3721,214 @@ void __init set_proc_pid_nlink(void)
nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
nlink_tgid = pid_entry_nlink(tgid_base_stuff, 
ARRAY_SIZE(tgid_base_stuff));
 }
+
+/**
+ * file_pin information below.
+ */
+
+struct proc_file_pins_private {
+   struct inode *inode;
+   struct task_struct *task;
+   struct mm_struct *mm;
+   struct files_struct *files;
+   unsigned int nr_pins;
+   struct xarray fps;
+} __randomize_layout;
+
+static void release_fp(struct proc_file_pins_private *priv)
+{
+   up_read(&priv->mm->mmap_sem);
+   mmput(priv->mm);
+}
+
+static void print_fd_file_pin(struct seq_file *m, struct file *file,
+   unsigned long i)
+{
+   struct file_file_pin *fp;
+   struct file_file_pin *tmp;
+
+   if (list_empty_careful(&file->file_pins))
+   return;
+
+   seq_printf(m, "%lu: ", i);
+   seq_file_path(m, file, "\n");
+   seq_putc(m, '\n');
+
+   list_for_each_entry_safe(fp, tmp, &file->file_pins, list) {
+   seq_puts(m, "   ");
+   seq_file_path(m, fp->file, "\n");
+   seq_putc(m, '\n');
+   }
+}
+
+/* We are storing the index's within the FD table for later retrieval */
+static int store_fd(const void *priv , struct file *file, unsigned i)
+{
+   struct proc_file_pins_private *fp_priv;
+
+   /* cast away const... */
+   fp_priv = (struct proc_file_pins_private *)priv;
+
+   if (list_empty_careful(&file->file_pins))
+   return 0;
+
+   /* can't sleep in the iterate of the fd table */
+   xa_store(&fp_priv->fps, fp_priv->nr_pins, xa_mk_value(i), GFP_ATOMIC);
+   fp_priv->nr_pins++;
+
+   return 0;
+}
+
+static void store_mm_pins(struct proc_file_pins_private *priv)
+{
+   struct mm_file_pin *fp;
+   struct mm_file_pin *tmp;
+
+   list_for_each_entry_safe(fp, tmp, &priv->mm->file_pins, list) {
+   xa_store(&priv->fps, priv->nr_pins, fp, GFP_KERNEL);
+   priv->nr_pins++;
+   }
+}
+
+
+static void *fp_start(struct seq_file *m, loff_t *ppos)
+{
+   struct proc_file_pins_private *priv = m->private;
+   unsigned int pos = *ppos;
+
+   priv->task = get_proc_task(priv->inode);
+   if (!priv->task)
+   return ERR_PTR(-ESRCH);
+
+   if (!priv->mm || !mmget_not_zero(priv->mm))
+   return NULL;
+
+   priv->files = get_files_struct(priv->task);
+   down_read(&priv->mm->mmap_sem);
+
+   xa_destroy(&priv->fps);
+   priv->nr_pins = 0;
+
+   /* grab fds of "files" which have pins and store as xa values */
+   if (priv->files)
+   iterate_fd(priv->files, 0, store_fd, priv);
+
+   /* store mm_file_pins as xa entries */
+   store_mm_pins(priv);
+
+   if (pos >= priv->nr_pins) {
+   release_fp(priv);
+   return NULL;
+   }
+
+   return xa_load(&priv->fps, 

[RFC PATCH v2 05/19] fs/ext4: Teach ext4 to break layout leases

2019-08-09 Thread ira . weiny
From: Ira Weiny 

ext4 must attempt to break a layout lease if it is held to know if the
layout can be modified.

Split out the logic to determine if a mapping is DAX, export it, and then
break layout leases if a mapping is DAX.

Signed-off-by: Ira Weiny 

---
Changes from RFC v1:

Based on feedback from Dave Chinner, add support to fail all
other layout breaks when a lease is held.

 fs/dax.c| 23 ---
 fs/ext4/inode.c |  7 +++
 include/linux/dax.h |  6 ++
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index b64964ef44f6..a14ec32255d8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -557,6 +557,21 @@ static void *grab_mapping_entry(struct xa_state *xas,
return xa_mk_internal(VM_FAULT_FALLBACK);
 }
 
+bool dax_mapping_is_dax(struct address_space *mapping)
+{
+   /*
+* In the 'limited' case get_user_pages() for dax is disabled.
+*/
+   if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+   return false;
+
+   if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+   return false;
+
+   return true;
+}
+EXPORT_SYMBOL_GPL(dax_mapping_is_dax);
+
 /**
  * dax_layout_busy_page - find first pinned page in @mapping
  * @mapping: address space to scan for a page with ref count > 1
@@ -579,13 +594,7 @@ struct page *dax_layout_busy_page(struct address_space 
*mapping)
unsigned int scanned = 0;
struct page *page = NULL;
 
-   /*
-* In the 'limited' case get_user_pages() for dax is disabled.
-*/
-   if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
-   return NULL;
-
-   if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+   if (!dax_mapping_is_dax(mapping))
return NULL;
 
/*
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index b2c8d09acf65..f08f48de52c5 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4271,6 +4271,13 @@ int ext4_break_layouts(struct inode *inode)
if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
return -EINVAL;
 
+   /* Break layout leases if active */
+   if (dax_mapping_is_dax(inode->i_mapping)) {
+   error = break_layout(inode, true);
+   if (error)
+   return error;
+   }
+
do {
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 9bd8528bd305..da0768b34b48 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -143,6 +143,7 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device 
*bdev);
 int dax_writeback_mapping_range(struct address_space *mapping,
struct block_device *bdev, struct writeback_control *wbc);
 
+bool dax_mapping_is_dax(struct address_space *mapping);
 struct page *dax_layout_busy_page(struct address_space *mapping);
 dax_entry_t dax_lock_page(struct page *page);
 void dax_unlock_page(struct page *page, dax_entry_t cookie);
@@ -174,6 +175,11 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct 
block_device *bdev)
return NULL;
 }
 
+static inline bool dax_mapping_is_dax(struct address_space *mapping)
+{
+   return false;
+}
+
 static inline struct page *dax_layout_busy_page(struct address_space *mapping)
 {
return NULL;
-- 
2.20.1

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[RFC PATCH v2 06/19] fs/ext4: Teach dax_layout_busy_page() to operate on a sub-range

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Callers of dax_layout_busy_page() are only rarely operating on the
entire file of concern.

Teach dax_layout_busy_page() to operate on a sub-range of the
address_space provided.  Specifying 0 - ULONG_MAX, however, will continue
to operate on the "entire file", which allows the XFS conversion to be
split out to a separate patch.

This could potentially speed up dax_layout_busy_page() as well.
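
For illustration, a hole punch path can then wait only for pinned pages in
the punched range, along the lines of the (sketched) loop below, which
mirrors the ext4_break_layouts() conversion further down:

        do {
                page = dax_layout_busy_page(inode->i_mapping, offset, len);
                if (!page)
                        return 0;

                error = ___wait_var_event(&page->_refcount,
                                atomic_read(&page->_refcount) == 1,
                                TASK_INTERRUPTIBLE, 0, 0,
                                ext4_wait_dax_page(ei));
        } while (error == 0);

        return error;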

Signed-off-by: Ira Weiny 

---
Changes from RFC v1
Fix 0-day build errors

 fs/dax.c| 15 +++
 fs/ext4/ext4.h  |  2 +-
 fs/ext4/extents.c   |  6 +++---
 fs/ext4/inode.c | 19 ---
 fs/xfs/xfs_file.c   |  3 ++-
 include/linux/dax.h |  6 --
 6 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a14ec32255d8..3ad19c384454 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -573,8 +573,11 @@ bool dax_mapping_is_dax(struct address_space *mapping)
 EXPORT_SYMBOL_GPL(dax_mapping_is_dax);
 
 /**
- * dax_layout_busy_page - find first pinned page in @mapping
+ * dax_layout_busy_page - find first pinned page in @mapping within
+ *the range @off - @off + @len
  * @mapping: address space to scan for a page with ref count > 1
+ * @off: offset to start at
+ * @len: length to scan through
  *
  * DAX requires ZONE_DEVICE mapped pages. These pages are never
  * 'onlined' to the page allocator so they are considered idle when
@@ -587,9 +590,13 @@ EXPORT_SYMBOL_GPL(dax_mapping_is_dax);
  * to be able to run unmap_mapping_range() and subsequently not race
  * mapping_mapped() becoming true.
  */
-struct page *dax_layout_busy_page(struct address_space *mapping)
+struct page *dax_layout_busy_page(struct address_space *mapping,
+ loff_t off, loff_t len)
 {
-   XA_STATE(xas, &mapping->i_pages, 0);
+   unsigned long start_idx = off >> PAGE_SHIFT;
+   unsigned long end_idx = (len == ULONG_MAX) ? ULONG_MAX
+   : start_idx + (len >> PAGE_SHIFT);
+   XA_STATE(xas, &mapping->i_pages, start_idx);
void *entry;
unsigned int scanned = 0;
struct page *page = NULL;
@@ -612,7 +619,7 @@ struct page *dax_layout_busy_page(struct address_space 
*mapping)
unmap_mapping_range(mapping, 0, 0, 1);
 
xas_lock_irq(&xas);
-   xas_for_each(&xas, entry, ULONG_MAX) {
+   xas_for_each(&xas, entry, end_idx) {
if (WARN_ON_ONCE(!xa_is_value(entry)))
continue;
if (unlikely(dax_is_locked(entry)))
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 9c7f4036021b..32738ccdac1d 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2578,7 +2578,7 @@ extern int ext4_get_inode_loc(struct inode *, struct 
ext4_iloc *);
 extern int ext4_inode_attach_jinode(struct inode *inode);
 extern int ext4_can_truncate(struct inode *inode);
 extern int ext4_truncate(struct inode *);
-extern int ext4_break_layouts(struct inode *);
+extern int ext4_break_layouts(struct inode *inode, loff_t offset, loff_t len);
 extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int 
nblocks);
 extern void ext4_set_inode_flags(struct inode *);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2da7d6..ded4b1d92299 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4736,7 +4736,7 @@ static long ext4_zero_range(struct file *file, loff_t 
offset,
 */
down_write(&EXT4_I(inode)->i_mmap_sem);
 
-   ret = ext4_break_layouts(inode);
+   ret = ext4_break_layouts(inode, offset, len);
if (ret) {
up_write(_I(inode)->i_mmap_sem);
goto out_mutex;
@@ -5419,7 +5419,7 @@ int ext4_collapse_range(struct inode *inode, loff_t 
offset, loff_t len)
 */
down_write(&EXT4_I(inode)->i_mmap_sem);
 
-   ret = ext4_break_layouts(inode);
+   ret = ext4_break_layouts(inode, offset, len);
if (ret)
goto out_mmap;
 
@@ -5572,7 +5572,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, 
loff_t len)
 */
down_write(&EXT4_I(inode)->i_mmap_sem);
 
-   ret = ext4_break_layouts(inode);
+   ret = ext4_break_layouts(inode, offset, len);
if (ret)
goto out_mmap;
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f08f48de52c5..d3fc6035428c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4262,7 +4262,7 @@ static void ext4_wait_dax_page(struct ext4_inode_info *ei)
down_write(&ei->i_mmap_sem);
 }
 
-int ext4_break_layouts(struct inode *inode)
+int ext4_break_layouts(struct inode *inode, loff_t offset, loff_t len)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
struct page *page;
@@ -4279,7 +4279,7 @@ int ext4_break_layouts(struct inode *inode)
}
 
do {
-   page = dax_layout_busy_page(inode->i_mapping);
+   page = 

[RFC PATCH v2 09/19] mm/gup: Introduce vaddr_pin structure

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Some subsystems need to pass owning file information to GUP calls to
allow for GUP to associate the "owning file" to any files being pinned
within the GUP call.

Introduce an object to specify this information and pass it down through
some of the GUP call stack.
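
For orientation, a GUP caller is expected to describe itself roughly like
this (sketch only, not part of this patch; "context_file" stands in for
whatever struct file the subsystem hands out to user space, e.g. the RDMA
uverbs context, and is NULL when there is no owning file):

static void example_fill_vaddr_pin(struct vaddr_pin *pin,
                                   struct file *context_file)
{
        pin->f_owner = context_file;    /* owning file, if any */
        pin->mm = current->mm;          /* otherwise pins are charged here */
}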

Signed-off-by: Ira Weiny 
---
 include/linux/mm.h |  9 +
 mm/gup.c   | 36 ++--
 2 files changed, 31 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 04f22722b374..befe150d17be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -971,6 +971,15 @@ static inline bool is_zone_device_page(const struct page 
*page)
 }
 #endif
 
+/**
+ * @f_owner The file who "owns this GUP"
+ * @mm The mm who "owns this GUP"
+ */
+struct vaddr_pin {
+   struct file *f_owner;
+   struct mm_struct *mm;
+};
+
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 void __put_devmap_managed_page(struct page *page);
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
diff --git a/mm/gup.c b/mm/gup.c
index 0b05e22ac05f..7a449500f0a6 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1005,7 +1005,8 @@ static __always_inline long 
__get_user_pages_locked(struct task_struct *tsk,
struct page **pages,
struct vm_area_struct **vmas,
int *locked,
-   unsigned int flags)
+   unsigned int flags,
+   struct vaddr_pin *vaddr_pin)
 {
long ret, pages_done;
bool lock_dropped;
@@ -1165,7 +1166,8 @@ long get_user_pages_remote(struct task_struct *tsk, 
struct mm_struct *mm,
 
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
   locked,
-  gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+  gup_flags | FOLL_TOUCH | FOLL_REMOTE,
+  NULL);
 }
 EXPORT_SYMBOL(get_user_pages_remote);
 
@@ -1320,7 +1322,8 @@ static long __get_user_pages_locked(struct task_struct 
*tsk,
struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
-   unsigned int foll_flags)
+   unsigned int foll_flags,
+   struct vaddr_pin *vaddr_pin)
 {
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -1504,7 +1507,7 @@ static long check_and_migrate_cma_pages(struct 
task_struct *tsk,
 */
nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
   pages, vmas, NULL,
-  gup_flags);
+  gup_flags, NULL);
 
if ((nr_pages > 0) && migrate_allow) {
drain_allow = true;
@@ -1537,7 +1540,8 @@ static long __gup_longterm_locked(struct task_struct *tsk,
  unsigned long nr_pages,
  struct page **pages,
  struct vm_area_struct **vmas,
- unsigned int gup_flags)
+ unsigned int gup_flags,
+ struct vaddr_pin *vaddr_pin)
 {
struct vm_area_struct **vmas_tmp = vmas;
unsigned long flags = 0;
@@ -1558,7 +1562,7 @@ static long __gup_longterm_locked(struct task_struct *tsk,
}
 
rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
-vmas_tmp, NULL, gup_flags);
+vmas_tmp, NULL, gup_flags, vaddr_pin);
 
if (gup_flags & FOLL_LONGTERM) {
memalloc_nocma_restore(flags);
@@ -1588,10 +1592,11 @@ static __always_inline long 
__gup_longterm_locked(struct task_struct *tsk,
  unsigned long nr_pages,
  struct page **pages,
  struct vm_area_struct **vmas,
- unsigned int flags)
+ unsigned int flags,
+ struct vaddr_pin *vaddr_pin)
 {
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
-  NULL, flags);
+  NULL, flags, vaddr_pin);
 }
 #endif /* CONFIG_FS_DAX || CONFIG_CMA */
 
@@ -1607,7 +1612,8 @@ long get_user_pages(unsigned long start, unsigned long 
nr_pages,
struct vm_area_struct **vmas)
 {
return __gup_longterm_locked(current, 

[RFC PATCH v2 03/19] mm/gup: Pass flags down to __gup_device_huge* calls

2019-08-09 Thread ira . weiny
From: Ira Weiny 

In order to support checking for a layout lease on a FS DAX inode these
calls need to know if FOLL_LONGTERM was specified.

Signed-off-by: Ira Weiny 
---
 mm/gup.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index b6a293bf1267..80423779a50a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1881,7 +1881,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
 
 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
 static int __gup_device_huge(unsigned long pfn, unsigned long addr,
-   unsigned long end, struct page **pages, int *nr)
+   unsigned long end, struct page **pages, int *nr,
+   unsigned int flags)
 {
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
@@ -1907,30 +1908,33 @@ static int __gup_device_huge(unsigned long pfn, 
unsigned long addr,
 }
 
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-   unsigned long end, struct page **pages, int *nr)
+   unsigned long end, struct page **pages, int *nr,
+   unsigned int flags)
 {
unsigned long fault_pfn;
int nr_start = *nr;
 
fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags))
return 0;
 
if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
+
return 1;
 }
 
 static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
-   unsigned long end, struct page **pages, int *nr)
+   unsigned long end, struct page **pages, int *nr,
+   unsigned int flags)
 {
unsigned long fault_pfn;
int nr_start = *nr;
 
fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
+   if (!__gup_device_huge(fault_pfn, addr, end, pages, nr, flags))
return 0;
 
if (unlikely(pud_val(orig) != pud_val(*pudp))) {
@@ -1941,14 +1945,16 @@ static int __gup_device_huge_pud(pud_t orig, pud_t 
*pudp, unsigned long addr,
 }
 #else
 static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
-   unsigned long end, struct page **pages, int *nr)
+   unsigned long end, struct page **pages, int *nr,
+   unsigned int flags)
 {
BUILD_BUG();
return 0;
 }
 
 static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
-   unsigned long end, struct page **pages, int *nr)
+   unsigned long end, struct page **pages, int *nr,
+   unsigned int flags)
 {
BUILD_BUG();
return 0;
@@ -2051,7 +2057,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned 
long addr,
if (pmd_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
-   return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+   return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr,
+flags);
}
 
refs = 0;
@@ -2092,7 +2099,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned 
long addr,
if (pud_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
-   return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+   return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr,
+flags);
}
 
refs = 0;
-- 
2.20.1

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[RFC PATCH v2 02/19] fs/locks: Add Exclusive flag to user Layout lease

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Add an exclusive lease flag which indicates that the layout mechanism
can not be broken.

Exclusive layout leases allow the file system to know that pages may be
GUP pinned and that attempts to change the layout, i.e. truncate, should be
failed.

A process which attempts to break its own exclusive lease gets an
EDEADLOCK return to help determine that this is likely a programming bug
vs someone else holding a resource.
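
With the rest of the series applied the intended user visible behaviour is
roughly the following (sketch only; it assumes the file is DAX mmap()ed and
long term pinned, and uses the F_LAYOUT/F_EXCLUSIVE values from patches 1-2):

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/pmem/foo", O_RDWR);

        if (fd < 0)
                err(1, "open");

        if (fcntl(fd, F_SETLEASE, F_LAYOUT | F_EXCLUSIVE) < 0)
                err(1, "F_SETLEASE");

        /* ... mmap the file and take a long term pin on it ... */

        /*
         * Truncating the file we hold the exclusive layout lease on breaks
         * our own lease: expect EDEADLK here, while another process
         * attempting the same truncate would see ETXTBSY instead.
         */
        if (ftruncate(fd, 0) < 0 && errno == EDEADLK)
                printf("own layout break refused with EDEADLK\n");

        return 0;
}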

Signed-off-by: Ira Weiny 
---
 fs/locks.c   | 23 +--
 include/linux/fs.h   |  1 +
 include/uapi/asm-generic/fcntl.h |  2 ++
 3 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index ad17c6ffca06..0c7359cdab92 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -626,6 +626,8 @@ static int lease_init(struct file *filp, long type, 
unsigned int flags,
fl->fl_flags = FL_LEASE;
if (flags & FL_LAYOUT)
fl->fl_flags |= FL_LAYOUT;
+   if (flags & FL_EXCLUSIVE)
+   fl->fl_flags |= FL_EXCLUSIVE;
fl->fl_start = 0;
fl->fl_end = OFFSET_MAX;
fl->fl_ops = NULL;
@@ -1619,6 +1621,14 @@ int __break_lease(struct inode *inode, unsigned int 
mode, unsigned int type)
list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
if (!leases_conflict(fl, new_fl))
continue;
+   if (fl->fl_flags & FL_EXCLUSIVE) {
+   error = -ETXTBSY;
+   if (new_fl->fl_pid == fl->fl_pid) {
+   error = -EDEADLOCK;
+   goto out;
+   }
+   continue;
+   }
if (want_write) {
if (fl->fl_flags & FL_UNLOCK_PENDING)
continue;
@@ -1634,6 +1644,13 @@ int __break_lease(struct inode *inode, unsigned int 
mode, unsigned int type)
locks_delete_lock_ctx(fl, &dispose);
}
 
+   /* We differentiate between -EDEADLOCK and -ETXTBSY so the above loop
+* continues with -ETXTBSY looking for a potential deadlock instead.
+* If deadlock is not found go ahead and return -ETXTBSY.
+*/
+   if (error == -ETXTBSY)
+   goto out;
+
if (list_empty(&ctx->flc_lease))
goto out;
 
@@ -2044,9 +2061,11 @@ static int do_fcntl_add_lease(unsigned int fd, struct 
file *filp, long arg)
 * to revoke the lease in break_layout()  And this is done by using
 * F_WRLCK in the break code.
 */
-   if (arg == F_LAYOUT) {
+   if ((arg & F_LAYOUT) == F_LAYOUT) {
+   if ((arg & F_EXCLUSIVE) == F_EXCLUSIVE)
+   flags |= FL_EXCLUSIVE;
arg = F_RDLCK;
-   flags = FL_LAYOUT;
+   flags |= FL_LAYOUT;
}
 
fl = lease_alloc(filp, arg, flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd60d5be9886..2e41ce547913 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1005,6 +1005,7 @@ static inline struct file *get_file(struct file *f)
 #define FL_UNLOCK_PENDING  512 /* Lease is being broken */
 #define FL_OFDLCK  1024/* lock is "owned" by struct file */
 #define FL_LAYOUT  2048/* outstanding pNFS layout or user held pin */
+#define FL_EXCLUSIVE   4096/* Layout lease is exclusive */
 
 #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
 
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index baddd54f3031..88b175ceccbc 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -176,6 +176,8 @@ struct f_owner_ex {
 
 #define F_LAYOUT   16  /* layout lease to allow longterm pins such as
   RDMA */
+#define F_EXCLUSIVE32  /* layout lease is exclusive */
+   /* FIXME or shoudl this be F_EXLCK??? */
 
 /* operations for bsd flock(), also used by the kernel implementation */
 #define LOCK_SH1   /* shared lock */
-- 
2.20.1

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[RFC PATCH v2 04/19] mm/gup: Ensure F_LAYOUT lease is held prior to GUP'ing pages

2019-08-09 Thread ira . weiny
From: Ira Weiny 

On FS DAX files users must inform the file system they intend to take
long term GUP pins on the file pages.  Failure to do so should result in
an error.

Ensure that a F_LAYOUT lease exists at the time the GUP call is made.
If not return EPERM.
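
From user space the visible effect is roughly the following (sketch only;
it assumes an already opened verbs device with an allocated pd, and the
F_LAYOUT/F_EXCLUSIVE lease from patches 1 and 2):

        void *addr = mmap(NULL, length, PROT_READ | PROT_WRITE,
                          MAP_SHARED, fd, 0);   /* fd is a file on FS DAX */

        /* No layout lease taken: the long term pin is refused. */
        struct ibv_mr *mr = ibv_reg_mr(pd, addr, length,
                                       IBV_ACCESS_LOCAL_WRITE);
        /* expected to fail with errno == EPERM */

        /* After opting in, the same registration succeeds. */
        fcntl(fd, F_SETLEASE, F_LAYOUT | F_EXCLUSIVE);
        mr = ibv_reg_mr(pd, addr, length, IBV_ACCESS_LOCAL_WRITE);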

Signed-off-by: Ira Weiny 

---
Changes from RFC v1:

The old version had remnants of when GUP was going to take the lease
for the user.  Remove this prototype code.
Fix issue in gup_device_huge which was setting page reference prior
to checking for Layout Lease
Re-base to 5.3+
Clean up htmldoc comments

 fs/locks.c | 47 ++
 include/linux/mm.h |  2 ++
 mm/gup.c   | 23 +++
 mm/huge_memory.c   | 12 
 4 files changed, 84 insertions(+)

diff --git a/fs/locks.c b/fs/locks.c
index 0c7359cdab92..14892c84844b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2971,3 +2971,50 @@ static int __init filelock_init(void)
return 0;
 }
 core_initcall(filelock_init);
+
+/**
+ * mapping_inode_has_layout - ensure a file mapped page has a layout lease
+ * taken
+ * @page: page we are trying to GUP
+ *
+ * This should only be called on DAX pages.  DAX pages which are mapped through
+ * FS DAX do not use the page cache.  As a result they require the user to take
+ * a LAYOUT lease on them prior to being able to pin them for long term use.
+ * This allows the user to opt-into the fact that truncation operations will
+ * fail for the duration of the pin.
+ *
+ * Return true if the page has a LAYOUT lease associated with its file.
+ */
+bool mapping_inode_has_layout(struct page *page)
+{
+   bool ret = false;
+   struct inode *inode;
+   struct file_lock *fl;
+
+   if (WARN_ON(PageAnon(page)) ||
+   WARN_ON(!page) ||
+   WARN_ON(!page->mapping) ||
+   WARN_ON(!page->mapping->host))
+   return false;
+
+   inode = page->mapping->host;
+
+   smp_mb();
+   if (inode->i_flctx &&
+   !list_empty_careful(&inode->i_flctx->flc_lease)) {
+   spin_lock(&inode->i_flctx->flc_lock);
+   ret = false;
+   list_for_each_entry(fl, &inode->i_flctx->flc_lease, fl_list) {
+   if (fl->fl_pid == current->tgid &&
+   (fl->fl_flags & FL_LAYOUT) &&
+   (fl->fl_flags & FL_EXCLUSIVE)) {
+   ret = true;
+   break;
+   }
+   }
+   spin_unlock(&inode->i_flctx->flc_lock);
+   }
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(mapping_inode_has_layout);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ad6766a08f9b..04f22722b374 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1583,6 +1583,8 @@ int account_locked_vm(struct mm_struct *mm, unsigned long 
pages, bool inc);
 int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
 
+bool mapping_inode_has_layout(struct page *page);
+
 /* Container for pinned pfns / pages */
 struct frame_vector {
unsigned int nr_allocated;  /* Number of frames we have space for */
diff --git a/mm/gup.c b/mm/gup.c
index 80423779a50a..0b05e22ac05f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -221,6 +221,13 @@ static struct page *follow_page_pte(struct vm_area_struct 
*vma,
page = pte_page(pte);
else
goto no_page;
+
+   if (unlikely(flags & FOLL_LONGTERM) &&
+   (*pgmap)->type == MEMORY_DEVICE_FS_DAX &&
+   !mapping_inode_has_layout(page)) {
+   page = ERR_PTR(-EPERM);
+   goto out;
+   }
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
@@ -1847,6 +1854,14 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
unsigned long end,
 
VM_BUG_ON_PAGE(compound_head(page) != head, page);
 
+   if (pte_devmap(pte) &&
+   unlikely(flags & FOLL_LONGTERM) &&
+   pgmap->type == MEMORY_DEVICE_FS_DAX &&
+   !mapping_inode_has_layout(head)) {
+   put_user_page(head);
+   goto pte_unmap;
+   }
+
SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -1895,6 +1910,14 @@ static int __gup_device_huge(unsigned long pfn, unsigned 
long addr,
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
+
+   if (unlikely(flags & FOLL_LONGTERM) &&
+   pgmap->type == MEMORY_DEVICE_FS_DAX &&
+   !mapping_inode_has_layout(page)) {
+   undo_dev_pagemap(nr, nr_start, pages);
+   

[RFC PATCH v2 12/19] mm/gup: Prep put_user_pages() to take an vaddr_pin struct

2019-08-09 Thread ira . weiny
From: Ira Weiny 

Once callers start to use vaddr_pin the put_user_pages calls will need
to have access to this data coming in.  Prep put_user_pages() for this
data.

Signed-off-by: Ira Weiny 
---
 include/linux/mm.h |  20 +---
 mm/gup.c   | 122 -
 2 files changed, 88 insertions(+), 54 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index befe150d17be..9d37cafbef9a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1064,25 +1064,7 @@ static inline void put_page(struct page *page)
__put_page(page);
 }
 
-/**
- * put_user_page() - release a gup-pinned page
- * @page:pointer to page to be released
- *
- * Pages that were pinned via get_user_pages*() must be released via
- * either put_user_page(), or one of the put_user_pages*() routines
- * below. This is so that eventually, pages that are pinned via
- * get_user_pages*() can be separately tracked and uniquely handled. In
- * particular, interactions with RDMA and filesystems need special
- * handling.
- *
- * put_user_page() and put_page() are not interchangeable, despite this early
- * implementation that makes them look the same. put_user_page() calls must
- * be perfectly matched up with get_user_page() calls.
- */
-static inline void put_user_page(struct page *page)
-{
-   put_page(page);
-}
+void put_user_page(struct page *page);
 
 void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
   bool make_dirty);
diff --git a/mm/gup.c b/mm/gup.c
index a7a9d2f5278c..10cfd30ff668 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -24,30 +24,41 @@
 
 #include "internal.h"
 
-/**
- * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
- * @pages:  array of pages to be maybe marked dirty, and definitely released.
- * @npages: number of pages in the @pages array.
- * @make_dirty: whether to mark the pages dirty
- *
- * "gup-pinned page" refers to a page that has had one of the get_user_pages()
- * variants called on that page.
- *
- * For each page in the @pages array, make that page (or its head page, if a
- * compound page) dirty, if @make_dirty is true, and if the page was previously
- * listed as clean. In any case, releases all pages using put_user_page(),
- * possibly via put_user_pages(), for the non-dirty case.
- *
- * Please see the put_user_page() documentation for details.
- *
- * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
- * required, then the caller should a) verify that this is really correct,
- * because _lock() is usually required, and b) hand code it:
- * set_page_dirty_lock(), put_user_page().
- *
- */
-void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
-  bool make_dirty)
+static void __put_user_page(struct vaddr_pin *vaddr_pin, struct page *page)
+{
+   page = compound_head(page);
+
+   /*
+* For devmap managed pages we need to catch refcount transition from
+* GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
+* page is free and we need to inform the device driver through
+* callback. See include/linux/memremap.h and HMM for details.
+*/
+   if (put_devmap_managed_page(page))
+   return;
+
+   if (put_page_testzero(page))
+   __put_page(page);
+}
+
+static void __put_user_pages(struct vaddr_pin *vaddr_pin, struct page **pages,
+unsigned long npages)
+{
+   unsigned long index;
+
+   /*
+* TODO: this can be optimized for huge pages: if a series of pages is
+* physically contiguous and part of the same compound page, then a
+* single operation to the head page should suffice.
+*/
+   for (index = 0; index < npages; index++)
+   __put_user_page(vaddr_pin, pages[index]);
+}
+
+static void __put_user_pages_dirty_lock(struct vaddr_pin *vaddr_pin,
+   struct page **pages,
+   unsigned long npages,
+   bool make_dirty)
 {
unsigned long index;
 
@@ -58,7 +69,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned 
long npages,
 */
 
if (!make_dirty) {
-   put_user_pages(pages, npages);
+   __put_user_pages(vaddr_pin, pages, npages);
return;
}
 
@@ -86,9 +97,58 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned 
long npages,
 */
if (!PageDirty(page))
set_page_dirty_lock(page);
-   put_user_page(page);
+   __put_user_page(vaddr_pin, page);
}
 }
+
+/**
+ * put_user_page() - release a gup-pinned page
+ * @page:pointer to page to be released
+ *
+ * Pages that were pinned via get_user_pages*() must be released via
+ * either 

[RFC PATCH v2 11/19] mm/gup: Pass follow_page_context further down the call stack

2019-08-09 Thread ira . weiny
From: Ira Weiny 

In preparation for passing more information (vaddr_pin) into
follow_page_pte(), follow_devmap_pud(), and follow_devmap_pmd().

Signed-off-by: Ira Weiny 
---
 include/linux/huge_mm.h | 17 -
 mm/gup.c| 31 +++
 mm/huge_memory.c|  6 --
 mm/internal.h   | 28 
 4 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 45ede62aa85b..b01a20ce0bb9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -233,11 +233,6 @@ static inline int hpage_nr_pages(struct page *page)
return 1;
 }
 
-struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
-   pmd_t *pmd, int flags, struct dev_pagemap **pgmap);
-struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
-   pud_t *pud, int flags, struct dev_pagemap **pgmap);
-
 extern vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd);
 
 extern struct page *huge_zero_page;
@@ -375,18 +370,6 @@ static inline void mm_put_huge_zero_page(struct mm_struct 
*mm)
return;
 }
 
-static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma,
-   unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
-{
-   return NULL;
-}
-
-static inline struct page *follow_devmap_pud(struct vm_area_struct *vma,
-   unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap)
-{
-   return NULL;
-}
-
 static inline bool thp_migration_supported(void)
 {
return false;
diff --git a/mm/gup.c b/mm/gup.c
index 504af3e9a942..a7a9d2f5278c 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -24,11 +24,6 @@
 
 #include "internal.h"
 
-struct follow_page_context {
-   struct dev_pagemap *pgmap;
-   unsigned int page_mask;
-};
-
 /**
  * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
  * @pages:  array of pages to be maybe marked dirty, and definitely released.
@@ -172,8 +167,9 @@ static inline bool can_follow_write_pte(pte_t pte, unsigned 
int flags)
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags,
-   struct dev_pagemap **pgmap)
+   struct follow_page_context *ctx)
 {
+   struct dev_pagemap **pgmap = &ctx->pgmap;
struct mm_struct *mm = vma->vm_mm;
struct page *page;
spinlock_t *ptl;
@@ -363,13 +359,13 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
-   page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
+   page = follow_devmap_pmd(vma, address, pmd, flags, ctx);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(pmdval)))
-   return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+   return follow_page_pte(vma, address, pmd, flags, ctx);
 
if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
@@ -389,7 +385,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
-   return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+   return follow_page_pte(vma, address, pmd, flags, ctx);
}
if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
@@ -419,7 +415,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct 
*vma,
}
 
return ret ? ERR_PTR(ret) :
-   follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+   follow_page_pte(vma, address, pmd, flags, ctx);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
@@ -456,7 +452,7 @@ static struct page *follow_pud_mask(struct vm_area_struct 
*vma,
}
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
-   page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
+   page = follow_devmap_pud(vma, address, pud, flags, ctx);
spin_unlock(ptl);
if (page)
return page;
@@ -786,7 +782,8 @@ static int check_vma_flags(struct vm_area_struct *vma, 
unsigned long gup_flags)
 static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
-   struct vm_area_struct **vmas, int *nonblocking)
+   struct vm_area_struct **vmas, int *nonblocking,
+   struct vaddr_pin *vaddr_pin)
 {
long ret = 0, i = 0;

Re: [PATCH] mm/memremap: Fix reuse of pgmap instances with internal references

2019-08-09 Thread Christoph Hellwig
Looks good:

Reviewed-by: Christoph Hellwig 
___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH] nvdimm: Initialize bad block for volatile namespaces

2019-08-09 Thread Aneesh Kumar K.V
We do check for bad blocks during namespace init and that uses the
region bad block list. We need to initialize the bad block list
for volatile regions for this to work. We also observe a lockdep
warning as below because the lock is not initialized correctly
since we skip bad block init for volatile regions.

 INFO: trying to register non-static key.
 the code is fine but needs lockdep annotation.
 turning off the locking correctness validator.
 CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.3.0-rc1-15699-g3dee241c937e #149
 Call Trace:
 [c000f95cb250] [c147dd84] dump_stack+0xe8/0x164 (unreliable)
 [c000f95cb2a0] [c022ccd8] register_lock_class+0x308/0xa60
 [c000f95cb3a0] [c0229cc0] __lock_acquire+0x170/0x1ff0
 [c000f95cb4c0] [c022c740] lock_acquire+0x220/0x270
 [c000f95cb580] [c0a93230] badblocks_check+0xc0/0x290
 [c000f95cb5f0] [c0d97540] nd_pfn_validate+0x5c0/0x7f0
 [c000f95cb6d0] [c0d98300] nd_dax_probe+0xd0/0x1f0
 [c000f95cb760] [c0d9b66c] nd_pmem_probe+0x10c/0x160
 [c000f95cb790] [c0d7f5ec] nvdimm_bus_probe+0x10c/0x240
 [c000f95cb820] [c0d0f844] really_probe+0x254/0x4e0
 [c000f95cb8b0] [c0d0fdfc] driver_probe_device+0x16c/0x1e0
 [c000f95cb930] [c0d10238] device_driver_attach+0x68/0xa0
 [c000f95cb970] [c0d1040c] __driver_attach+0x19c/0x1c0
 [c000f95cb9f0] [c0d0c4c4] bus_for_each_dev+0x94/0x130
 [c000f95cba50] [c0d0f014] driver_attach+0x34/0x50
 [c000f95cba70] [c0d0e208] bus_add_driver+0x178/0x2f0
 [c000f95cbb00] [c0d117c8] driver_register+0x108/0x170
 [c000f95cbb70] [c0d7edb0] __nd_driver_register+0xe0/0x100
 [c000f95cbbd0] [c1a6baa4] nd_pmem_driver_init+0x34/0x48
 [c000f95cbbf0] [c00106f4] do_one_initcall+0x1d4/0x4b0
 [c000f95cbcd0] [c19f499c] kernel_init_freeable+0x544/0x65c
 [c000f95cbdb0] [c0010d6c] kernel_init+0x2c/0x180
 [c000f95cbe20] [c000b954] ret_from_kernel_thread+0x5c/0x68

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/region.c  | 4 ++--
 drivers/nvdimm/region_devs.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvdimm/region.c b/drivers/nvdimm/region.c
index 37bf8719a2a4..0f6978e72e7c 100644
--- a/drivers/nvdimm/region.c
+++ b/drivers/nvdimm/region.c
@@ -34,7 +34,7 @@ static int nd_region_probe(struct device *dev)
if (rc)
return rc;
 
-   if (is_nd_pmem(&nd_region->dev)) {
+   if (is_memory(&nd_region->dev)) {
struct resource ndr_res;
 
if (devm_init_badblocks(dev, &nd_region->bb))
@@ -123,7 +123,7 @@ static void nd_region_notify(struct device *dev, enum 
nvdimm_event event)
struct nd_region *nd_region = to_nd_region(dev);
struct resource res;
 
-   if (is_nd_pmem(&nd_region->dev)) {
+   if (is_memory(&nd_region->dev)) {
res.start = nd_region->ndr_start;
res.end = nd_region->ndr_start +
nd_region->ndr_size - 1;
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 20e265a534f8..8ce275e88492 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -632,7 +632,7 @@ static umode_t region_visible(struct kobject *kobj, struct 
attribute *a, int n)
if (!is_memory(dev) && a == &dev_attr_dax_seed.attr)
return 0;
 
-   if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
+   if (!is_memory(dev) && a == &dev_attr_badblocks.attr)
return 0;
 
if (a == &dev_attr_resource.attr) {
-- 
2.21.0

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH v5] mm/nvdimm: Use correct alignment when looking at first pfn from a region

2019-08-09 Thread Aneesh Kumar K.V
vmem_altmap_offset() adjusts the section aligned base_pfn offset.
So we need to make sure we account for the same when computing base_pfn.

ie, for altmap_valid case, our pfn_first should be:

pfn_first = altmap->base_pfn + vmem_altmap_offset(altmap);

Signed-off-by: Aneesh Kumar K.V 
---
Changes from v4:
* rebase to latest kernel

 mm/memremap.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/mm/memremap.c b/mm/memremap.c
index 6ee03a816d67..6b8cd10e5e35 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -54,8 +54,16 @@ static void pgmap_array_delete(struct resource *res)
 
 static unsigned long pfn_first(struct dev_pagemap *pgmap)
 {
-   return PHYS_PFN(pgmap->res.start) +
-   vmem_altmap_offset(pgmap_altmap(pgmap));
+   const struct resource *res = &pgmap->res;
+   struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+   unsigned long pfn;
+
+   if (altmap) {
+   pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+   } else
+   pfn = PHYS_PFN(res->start);
+
+   return pfn;
 }
 
 static unsigned long pfn_end(struct dev_pagemap *pgmap)
-- 
2.21.0

___
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm


[PATCH v5] mm/nvdimm: Fix endian conversion issues 

2019-08-09 Thread Aneesh Kumar K.V
The nd_label->dpa issue was observed when trying to enable a namespace created
with a little-endian kernel on a big-endian kernel. That made me run
`sparse` on the rest of the code and the other changes are the result of that.
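
For reference, one way to run sparse over just this driver:

        make C=2 drivers/nvdimm/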

Fixes: d9b83c756953 ("libnvdimm, btt: rework error clearing")
Fixes: 9dedc73a4658 ("libnvdimm/btt: Fix LBA masking during 'free list' 
population")

Reviewed-by: Vishal Verma 
Signed-off-by: Aneesh Kumar K.V 
---
Changes from V4:
* Rebase to latest kernel

 drivers/nvdimm/btt.c| 8 
 drivers/nvdimm/namespace_devs.c | 7 ---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index a8d56887ec88..3e9f45aec8d1 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -392,9 +392,9 @@ static int btt_flog_write(struct arena_info *arena, u32 
lane, u32 sub,
arena->freelist[lane].sub = 1 - arena->freelist[lane].sub;
if (++(arena->freelist[lane].seq) == 4)
arena->freelist[lane].seq = 1;
-   if (ent_e_flag(ent->old_map))
+   if (ent_e_flag(le32_to_cpu(ent->old_map)))
arena->freelist[lane].has_err = 1;
-   arena->freelist[lane].block = le32_to_cpu(ent_lba(ent->old_map));
+   arena->freelist[lane].block = ent_lba(le32_to_cpu(ent->old_map));
 
return ret;
 }
@@ -560,8 +560,8 @@ static int btt_freelist_init(struct arena_info *arena)
 * FIXME: if error clearing fails during init, we want to make
 * the BTT read-only
 */
-   if (ent_e_flag(log_new.old_map) &&
-   !ent_normal(log_new.old_map)) {
+   if (ent_e_flag(le32_to_cpu(log_new.old_map)) &&
+   !ent_normal(le32_to_cpu(log_new.old_map))) {
arena->freelist[i].has_err = 1;
ret = arena_clear_freelist_error(arena, i);
if (ret)
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index a9c76df12cb9..f779cb2b0c69 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1987,7 +1987,7 @@ static struct device *create_namespace_pmem(struct nd_region *nd_region,
nd_mapping = &nd_region->mapping[i];
label_ent = list_first_entry_or_null(&nd_mapping->labels,
typeof(*label_ent), list);
-   label0 = label_ent ? label_ent->label : 0;
+   label0 = label_ent ? label_ent->label : NULL;
 
if (!label0) {
WARN_ON(1);
@@ -2322,8 +2322,9 @@ static struct device **scan_labels(struct nd_region *nd_region)
continue;
 
/* skip labels that describe extents outside of the region */
-   if (nd_label->dpa < nd_mapping->start || nd_label->dpa > map_end)
-   continue;
+   if (__le64_to_cpu(nd_label->dpa) < nd_mapping->start ||
+   __le64_to_cpu(nd_label->dpa) > map_end)
+   continue;
 
i = add_namespace_resource(nd_region, nd_label, devs, count);
if (i < 0)
-- 
2.21.0



[PATCH v5 1/4] nvdimm: Consider probe return -EOPNOTSUPP as success

2019-08-09 Thread Aneesh Kumar K.V
This patch adds -EOPNOTSUPP as a return value from the probe callback to
indicate that we were not able to initialize a namespace due to a pfn
superblock feature/version mismatch. We want to consider this a probe success
so that we can create a new namespace seed and thereby avoid marking the
failed namespace as the seed namespace.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/bus.c  |  2 +-
 drivers/nvdimm/pmem.c | 26 ++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 798c5c4aea9c..16c35e6446a7 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -95,7 +95,7 @@ static int nvdimm_bus_probe(struct device *dev)
rc = nd_drv->probe(dev);
debug_nvdimm_unlock(dev);
 
-   if (rc == 0)
+   if (rc == 0 || rc == -EOPNOTSUPP)
nd_region_probe_success(nvdimm_bus, dev);
else
nd_region_disable(nvdimm_bus, dev);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 4c121dd03dd9..3f498881dd28 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -490,6 +490,7 @@ static int pmem_attach_disk(struct device *dev,
 
 static int nd_pmem_probe(struct device *dev)
 {
+   int ret;
struct nd_namespace_common *ndns;
 
ndns = nvdimm_namespace_common_probe(dev);
@@ -505,12 +506,29 @@ static int nd_pmem_probe(struct device *dev)
if (is_nd_pfn(dev))
return pmem_attach_disk(dev, ndns);
 
-   /* if we find a valid info-block we'll come back as that personality */
-   if (nd_btt_probe(dev, ndns) == 0 || nd_pfn_probe(dev, ndns) == 0
-   || nd_dax_probe(dev, ndns) == 0)
+   ret = nd_btt_probe(dev, ndns);
+   if (ret == 0)
return -ENXIO;
+   else if (ret == -EOPNOTSUPP)
+   return ret;
 
-   /* ...otherwise we're just a raw pmem device */
+   ret = nd_pfn_probe(dev, ndns);
+   if (ret == 0)
+   return -ENXIO;
+   else if (ret == -EOPNOTSUPP)
+   return ret;
+
+   ret = nd_dax_probe(dev, ndns);
+   if (ret == 0)
+   return -ENXIO;
+   else if (ret == -EOPNOTSUPP)
+   return ret;
+   /*
+* We have two failure conditions here: there is no
+* info reserve block, or we found a valid info reserve block
+* but failed to initialize the pfn superblock.
+* Don't create a raw pmem disk for the second case.
+*/
return pmem_attach_disk(dev, ndns);
 }
 
-- 
2.21.0



[PATCH v5 2/4] mm/nvdimm: Add page size and struct page size to pfn superblock

2019-08-09 Thread Aneesh Kumar K.V
This is needed so that we don't wrongly initialize a namespace
which doesn't have enough space reserved for holding struct pages
with the current kernel.
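
For illustration, a back-of-the-envelope calculation of the space involved
(the numbers below are assumptions for the example, not values taken from this
patch): the reserved area has to hold one struct page per namespace pfn, so a
namespace created on a kernel with a smaller struct page no longer has enough
room once struct page grows.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t ns_size      = 16ULL << 30;	/* 16 GiB namespace (example)          */
	uint64_t page_size    = 4096;		/* PAGE_SIZE at creation time          */
	uint64_t npfns        = ns_size / page_size;
	uint64_t created_with = 56;		/* sizeof(struct page) then (example)  */
	uint64_t running_with = 64;		/* sizeof(struct page) now (example)   */

	/* space reserved at creation vs. space the running kernel needs */
	printf("reserved %llu MiB, need %llu MiB\n",
	       (unsigned long long)((npfns * created_with) >> 20),
	       (unsigned long long)((npfns * running_with) >> 20));
	return 0;
}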

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/pfn.h  |  5 -
 drivers/nvdimm/pfn_devs.c | 27 ++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/drivers/nvdimm/pfn.h b/drivers/nvdimm/pfn.h
index 7381673b7b70..acb19517f678 100644
--- a/drivers/nvdimm/pfn.h
+++ b/drivers/nvdimm/pfn.h
@@ -29,7 +29,10 @@ struct nd_pfn_sb {
/* minor-version-2 record the base alignment of the mapping */
__le32 align;
/* minor-version-3 guarantee the padding and flags are zero */
-   u8 padding[4000];
+   /* minor-version-4 record the page size and struct page size */
+   __le32 page_size;
+   __le16 page_struct_size;
+   u8 padding[3994];
__le64 checksum;
 };
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 3e7b11cf1aae..37e96811c2fc 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -460,6 +460,15 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
if (__le16_to_cpu(pfn_sb->version_minor) < 2)
pfn_sb->align = 0;
 
+   if (__le16_to_cpu(pfn_sb->version_minor) < 4) {
+   /*
+* For a large part we use PAGE_SIZE. But we
+* do have some accounting code using SZ_4K.
+*/
+   pfn_sb->page_struct_size = cpu_to_le16(64);
+   pfn_sb->page_size = cpu_to_le32(PAGE_SIZE);
+   }
+
switch (le32_to_cpu(pfn_sb->mode)) {
case PFN_MODE_RAM:
case PFN_MODE_PMEM:
@@ -475,6 +484,20 @@ int nd_pfn_validate(struct nd_pfn *nd_pfn, const char *sig)
align = 1UL << ilog2(offset);
mode = le32_to_cpu(pfn_sb->mode);
 
+   if (le32_to_cpu(pfn_sb->page_size) != PAGE_SIZE) {
+   dev_err(&nd_pfn->dev,
+   "init failed, page size mismatch %d\n",
+   le32_to_cpu(pfn_sb->page_size));
+   return -EOPNOTSUPP;
+   }
+
+   if (le16_to_cpu(pfn_sb->page_struct_size) < sizeof(struct page)) {
+   dev_err(&nd_pfn->dev,
+   "init failed, struct page size mismatch %d\n",
+   le16_to_cpu(pfn_sb->page_struct_size));
+   return -EOPNOTSUPP;
+   }
+
if (!nd_pfn->uuid) {
/*
 * When probing a namepace via nd_pfn_probe() the uuid
@@ -722,8 +745,10 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
memcpy(pfn_sb->uuid, nd_pfn->uuid, 16);
memcpy(pfn_sb->parent_uuid, nd_dev_to_uuid(&ndns->dev), 16);
pfn_sb->version_major = cpu_to_le16(1);
-   pfn_sb->version_minor = cpu_to_le16(3);
+   pfn_sb->version_minor = cpu_to_le16(4);
pfn_sb->align = cpu_to_le32(nd_pfn->align);
+   pfn_sb->page_struct_size = cpu_to_le16(sizeof(struct page));
+   pfn_sb->page_size = cpu_to_le32(PAGE_SIZE);
checksum = nd_sb_checksum((struct nd_gen_sb *) pfn_sb);
pfn_sb->checksum = cpu_to_le64(checksum);
 
-- 
2.21.0



[PATCH v5 0/4] Mark the namespace disabled on pfn superblock mismatch

2019-08-09 Thread Aneesh Kumar K.V
This series adds new members to the pfn superblock (PAGE_SIZE and struct page
size). These are now checked while initializing the namespace; if we find a
mismatch, we mark the namespace disabled.

The series also handles configs where hugepage support is not enabled by
default. This can result in different alignment restrictions for a dax
namespace. We mark the dax namespace disabled if we find the alignment is not
supported.

Aneesh Kumar K.V (4):
  nvdimm: Consider probe return -EOPNOTSUPP as success
  mm/nvdimm: Add page size and struct page size to pfn superblock
  mm/nvdimm: Use correct #defines instead of open coding
  mm/nvdimm: Pick the right alignment default when creating dax devices

 arch/powerpc/include/asm/libnvdimm.h |  9 
 arch/powerpc/mm/Makefile |  1 +
 arch/powerpc/mm/nvdimm.c | 34 +++
 arch/x86/include/asm/libnvdimm.h | 19 +
 drivers/nvdimm/bus.c |  2 +-
 drivers/nvdimm/label.c   |  2 +-
 drivers/nvdimm/namespace_devs.c  |  6 +--
 drivers/nvdimm/nd.h  |  6 ---
 drivers/nvdimm/pfn.h |  5 ++-
 drivers/nvdimm/pfn_devs.c| 62 ++--
 drivers/nvdimm/pmem.c| 26 ++--
 drivers/nvdimm/region_devs.c |  8 ++--
 include/linux/huge_mm.h  |  7 +++-
 13 files changed, 163 insertions(+), 24 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

-- 
2.21.0



[PATCH v5 3/4] mm/nvdimm: Use correct #defines instead of open coding

2019-08-09 Thread Aneesh Kumar K.V
Use PAGE_SIZE instead of SZ_4K and sizeof(struct page) instead of 64.
If the kernel is built with a different struct page size, the previous
patch handles marking the namespace disabled.

Signed-off-by: Aneesh Kumar K.V 
---
 drivers/nvdimm/label.c  | 2 +-
 drivers/nvdimm/namespace_devs.c | 6 +++---
 drivers/nvdimm/pfn_devs.c   | 3 ++-
 drivers/nvdimm/region_devs.c| 8 
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c
index 73e197babc2f..7ee037063be7 100644
--- a/drivers/nvdimm/label.c
+++ b/drivers/nvdimm/label.c
@@ -355,7 +355,7 @@ static bool slot_valid(struct nvdimm_drvdata *ndd,
 
/* check that DPA allocations are page aligned */
if ((__le64_to_cpu(nd_label->dpa)
-   | __le64_to_cpu(nd_label->rawsize)) % SZ_4K)
+   | __le64_to_cpu(nd_label->rawsize)) % PAGE_SIZE)
return false;
 
/* check checksum */
diff --git a/drivers/nvdimm/namespace_devs.c b/drivers/nvdimm/namespace_devs.c
index a16e52251a30..a9c76df12cb9 100644
--- a/drivers/nvdimm/namespace_devs.c
+++ b/drivers/nvdimm/namespace_devs.c
@@ -1006,10 +1006,10 @@ static ssize_t __size_store(struct device *dev, unsigned long long val)
return -ENXIO;
}
 
-   div_u64_rem(val, SZ_4K * nd_region->ndr_mappings, &remainder);
+   div_u64_rem(val, PAGE_SIZE * nd_region->ndr_mappings, &remainder);
if (remainder) {
-   dev_dbg(dev, "%llu is not %dK aligned\n", val,
-   (SZ_4K * nd_region->ndr_mappings) / SZ_1K);
+   dev_dbg(dev, "%llu is not %ldK aligned\n", val,
+   (PAGE_SIZE * nd_region->ndr_mappings) / SZ_1K);
return -EINVAL;
}
 
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c
index 37e96811c2fc..c1d9be609322 100644
--- a/drivers/nvdimm/pfn_devs.c
+++ b/drivers/nvdimm/pfn_devs.c
@@ -725,7 +725,8 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn)
 * when populating the vmemmap. This *should* be equal to
 * PMD_SIZE for most architectures.
 */
-   offset = ALIGN(start + SZ_8K + 64 * npfns, align) - start;
+   offset = ALIGN(start + SZ_8K + sizeof(struct page) * npfns,
+  align) - start;
} else if (nd_pfn->mode == PFN_MODE_RAM)
offset = ALIGN(start + SZ_8K, align) - start;
else
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index af30cbe7a8ea..20e265a534f8 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -992,10 +992,10 @@ static struct nd_region *nd_region_create(struct nvdimm_bus *nvdimm_bus,
struct nd_mapping_desc *mapping = &ndr_desc->mapping[i];
struct nvdimm *nvdimm = mapping->nvdimm;
 
-   if ((mapping->start | mapping->size) % SZ_4K) {
-   dev_err(&nvdimm_bus->dev, "%s: %s mapping%d is not 4K aligned\n",
-   caller, dev_name(&nvdimm->dev), i);
-
+   if ((mapping->start | mapping->size) % PAGE_SIZE) {
+   dev_err(&nvdimm_bus->dev,
+   "%s: %s mapping%d is not %ld aligned\n",
+   caller, dev_name(&nvdimm->dev), i, PAGE_SIZE);
return NULL;
}
 
-- 
2.21.0



[PATCH v5 4/4] mm/nvdimm: Pick the right alignment default when creating dax devices

2019-08-09 Thread Aneesh Kumar K.V
Allow the arch to provide the supported alignments and use a hugepage
alignment only if hugepages are supported. Right now we depend on compile-time
configs; this patch switches to runtime discovery.

Architectures like ppc64 can have THP enabled in the kernel but the hugepage
size disabled by the hypervisor. This allows us to create dax devices with
PAGE_SIZE alignment in that case.

An existing dax namespace with an alignment larger than PAGE_SIZE will fail to
initialize in this specific case. We still allow fsdax namespace
initialization.

With respect to deciding whether to enable hugepage faults for a dax device:
if THP is enabled at compile time, we default to taking hugepage faults, and
if the dax fault handler finds the fault size is greater than the alignment,
we retry with a PAGE_SIZE fault size.
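
The runtime-discovery idea can be illustrated with a small user-space sketch:
the arch reports a zero-terminated list of supported alignments (as
nd_pfn_supported_alignments() does in this patch) and a requested alignment is
accepted only if it appears in that list. The helper name and the values below
are assumptions for the illustration, not the driver's actual interface.

#include <stdbool.h>
#include <stdio.h>

#define SZ_64K	(64UL * 1024)
#define SZ_16M	(16UL * 1024 * 1024)

/* walk a zero-terminated list of alignments and look for a match */
static bool alignment_supported(unsigned long align, const unsigned long *list)
{
	for (int i = 0; list[i]; i++)
		if (list[i] == align)
			return true;
	return false;
}

int main(void)
{
	/* what a ppc64 guest without hypervisor hugepage support might report */
	const unsigned long supported[] = { SZ_64K, 0 };

	printf("16M alignment supported: %d\n",
	       alignment_supported(SZ_16M, supported));
	printf("64K alignment supported: %d\n",
	       alignment_supported(SZ_64K, supported));
	return 0;
}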

This also addresses the failure scenario below on ppc64:

ndctl create-namespace --mode=devdax  | grep align
 "align":16777216,
 "align":16777216

cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments
 65536 16777216

daxio.static-debug  -z -o /dev/dax0.0
  Bus error (core dumped)

  $ dmesg | tail
   lpar: Failed hash pte insert with error -4
    hash-mmu: mm: Hashing failure ! EA=0x7fff1700 access=0x8006 current=daxio
    hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc00501002b86
    daxio[3860]: bus error (7) at 7fff1700 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b+2]
    daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120
    daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128  e93f0088 39290008 f93f0110

The failure was due to guest kernel using wrong page size.

The namespaces created with 16M alignment will appear as below on a config with 16M page size disabled.

$ ndctl list -Ni
[
  {
"dev":"namespace0.1",
"mode":"fsdax",
"map":"dev",
"size":5351931904,
"uuid":"fc6e9667-461a-4718-82b4-69b24570bddb",
"align":16777216,
"blockdev":"pmem0.1",
"supported_alignments":[
  65536
]
  },
  {
"dev":"namespace0.0",
"mode":"fsdax",< devdax 16M alignment marked disabled.
"map":"mem",
"size":5368709120,
"uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484",
"state":"disabled"
  }
]

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/libnvdimm.h |  9 
 arch/powerpc/mm/Makefile |  1 +
 arch/powerpc/mm/nvdimm.c | 34 
 arch/x86/include/asm/libnvdimm.h | 19 
 drivers/nvdimm/nd.h  |  6 -
 drivers/nvdimm/pfn_devs.c| 32 +-
 include/linux/huge_mm.h  |  7 +-
 7 files changed, 100 insertions(+), 8 deletions(-)
 create mode 100644 arch/powerpc/include/asm/libnvdimm.h
 create mode 100644 arch/powerpc/mm/nvdimm.c
 create mode 100644 arch/x86/include/asm/libnvdimm.h

diff --git a/arch/powerpc/include/asm/libnvdimm.h b/arch/powerpc/include/asm/libnvdimm.h
new file mode 100644
index ..d35fd7f48603
--- /dev/null
+++ b/arch/powerpc/include/asm/libnvdimm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_LIBNVDIMM_H
+#define _ASM_POWERPC_LIBNVDIMM_H
+
+#define nd_pfn_supported_alignments nd_pfn_supported_alignments
+extern unsigned long *nd_pfn_supported_alignments(void);
+extern unsigned long nd_pfn_default_alignment(void);
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 0f499db315d6..42e4a399ba5d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM) += highmem.o
 obj-$(CONFIG_PPC_COPRO_BASE)   += copro_fault.o
 obj-$(CONFIG_PPC_PTDUMP)   += ptdump/
 obj-$(CONFIG_KASAN)+= kasan/
+obj-$(CONFIG_NVDIMM_PFN)   += nvdimm.o
diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c
new file mode 100644
index ..a29a4510715e
--- /dev/null
+++ b/arch/powerpc/mm/nvdimm.c
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+
+#include 
+/*
+ * We support only pte and pmd mappings for now.
+ */
+const unsigned long *nd_pfn_supported_alignments(void)
+{
+   static unsigned long supported_alignments[3];
+
+   supported_alignments[0] = PAGE_SIZE;
+
+   if (has_transparent_hugepage())
+   supported_alignments[1] = HPAGE_PMD_SIZE;
+   else
+   supported_alignments[1] = 0;
+
+   supported_alignments[2] = 0;
+   return supported_alignments;
+}
+
+/*
+ * Use pmd mapping if supported as default alignment
+ */
+unsigned long nd_pfn_default_alignment(void)
+{
+
+   if (has_transparent_hugepage())
+   return HPAGE_PMD_SIZE;
+   return PAGE_SIZE;
+}
diff --git a/arch/x86/include/asm/libnvdimm.h b/arch/x86/include/asm/libnvdimm.h
new file mode 100644
index ..3d5361db9164
--- /dev/null
+++ b/arch/x86/include/asm/libnvdimm.h
@@ -0,0