[PATCH] mm/migrate: fix indexing bug (off by one) and avoid out of bound access

2017-10-02 Thread Jérôme Glisse
From: Mark Hairgrove <mhairgr...@nvidia.com>

The index was incremented before its last use, so the store into the
second array could dereference an invalid address (not to mention that
it did not clear the entry we actually intended to clear).
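
To make the hazard concrete, here is a minimal standalone C sketch of the
same pattern (the struct and names are illustrative, not the kernel's
migrate_vma machinery): the post-increment in the first store advances the
index before the second store, so dst[] is written one slot too far.

#include <stdio.h>

#define NSLOTS 4

struct migrate_like {
	unsigned long src[NSLOTS];
	unsigned long dst[NSLOTS];
	unsigned long npages;
};

/* Buggy: npages is bumped between the two stores, so dst[] is written
 * one slot past the entry we meant to clear (and past the end of the
 * array once npages == NSLOTS - 1). */
void collect_one_buggy(struct migrate_like *m)
{
	m->src[m->npages++] = 1;
	m->dst[m->npages] = 0;
}

/* Fixed: use the same index for both stores, then advance it. */
void collect_one_fixed(struct migrate_like *m)
{
	m->src[m->npages] = 1;
	m->dst[m->npages] = 0;
	m->npages++;
}

int main(void)
{
	struct migrate_like m = { .npages = 0 };

	collect_one_fixed(&m);
	printf("npages=%lu src[0]=%lu dst[0]=%lu\n",
	       m.npages, m.src[0], m.dst[0]);
	return 0;
}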

Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 mm/migrate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 6954c14..e00814c 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2146,8 +2146,9 @@ static int migrate_vma_collect_hole(unsigned long start,
unsigned long addr;
 
for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-   migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
+   migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
+   migrate->npages++;
migrate->cpages++;
}
 
-- 
2.4.11




[PATCH] mm/hmm: constify hmm_devmem_page_get_drvdata() parameter

2017-10-02 Thread Jérôme Glisse
From: Ralph Campbell <rcampb...@nvidia.com>

Constify the pointer parameter so that the helper can be used from code
that only has a const struct page pointer in the first place.
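
As an illustration, a small userspace analogue (hypothetical fake_page
type, not the kernel's struct page) of why the accessor must take a const
pointer: a caller that only holds a const pointer cannot pass it to a
parameter that is not const-qualified, even though the accessor never
writes through it.

struct fake_page {
	unsigned long pgmap;
	unsigned long drvdata;
};

/* Same shape as hmm_devmem_page_get_drvdata(): read the word that
 * follows pgmap, through a const pointer. */
static inline unsigned long fake_page_get_drvdata(const struct fake_page *page)
{
	const unsigned long *drvdata = (const unsigned long *)&page->pgmap;

	return drvdata[1];
}

/* This caller only has a const pointer; it compiles only because the
 * accessor above is const-qualified. */
unsigned long read_drvdata(const struct fake_page *page)
{
	return fake_page_get_drvdata(page);
}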

Signed-off-by: Ralph Campbell <rcampb...@nvidia.com>
Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 include/linux/hmm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 96e6997..325017a 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page 
*page,
  * @page: pointer to struct page
  * Return: driver data value
  */
-static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page)
+static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page)
 {
-   unsigned long *drvdata = (unsigned long *)&page->pgmap;
+   const unsigned long *drvdata = (const unsigned long *)&page->pgmap;
 
return drvdata[1];
 }
-- 
2.4.11




[PATCH 02/13] mm/rmap: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Replace all mmu_notifier_invalidate_page() calls with
mmu_notifier_invalidate_range() and make sure they are bracketed by calls
to mmu_notifier_invalidate_range_start()/end().

Note that because we cannot presume the pmd or pte value, we have to
assume the worst and unconditionally report an invalidation as happening.
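
The conversion follows the shape sketched below, a standalone
approximation in which stub functions stand in for the mmu_notifier calls,
and the mkclean_one() helper and the PAGE_SIZE/PMD_SIZE values are
illustrative: a conservative pmd-sized range is computed up front and
bracketed with start/end, while a precise invalidate_range() is emitted
for whatever was actually written.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PMD_SIZE	(512 * PAGE_SIZE)
#define PMD_MASK	(~(PMD_SIZE - 1))

static void invalidate_range_start(unsigned long s, unsigned long e)
{ printf("start  [%#lx, %#lx)\n", s, e); }
static void invalidate_range(unsigned long s, unsigned long e)
{ printf("range  [%#lx, %#lx)\n", s, e); }
static void invalidate_range_end(unsigned long s, unsigned long e)
{ printf("end    [%#lx, %#lx)\n", s, e); }

/* One mapping of the page was cleaned at 'address'; 'is_pmd' says whether
 * it was pte- or pmd-mapped.  vm_end stands in for vma->vm_end. */
static void mkclean_one(unsigned long address, unsigned long vm_end, int is_pmd)
{
	unsigned long start = address;
	unsigned long end = (start & PMD_MASK) + PMD_SIZE;
	unsigned long cstart, cend;

	if (end > vm_end)
		end = vm_end;			/* min(vma->vm_end, ...) */

	invalidate_range_start(start, end);	/* worst case, once */

	cstart = address;
	if (is_pmd) {
		cstart &= PMD_MASK;
		cend = cstart + PMD_SIZE;
	} else {
		cend = cstart + PAGE_SIZE;
	}
	invalidate_range(cstart, cend);		/* what actually changed */

	invalidate_range_end(start, end);	/* once, at the bottom */
}

int main(void)
{
	mkclean_one(0x201000, 0x400000, 0);	/* pte-mapped page */
	mkclean_one(0x200000, 0x400000, 1);	/* pmd-mapped THP */
	return 0;
}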

Signed-off-by: Jérôme Glisse 
Cc: Dan Williams 
Cc: Ross Zwisler 
Cc: Linus Torvalds 
Cc: Bernhard Held 
Cc: Adam Borowski 
Cc: Andrea Arcangeli 
Cc: Radim Krčmář 
Cc: Wanpeng Li 
Cc: Paolo Bonzini 
Cc: Takashi Iwai 
Cc: Nadav Amit 
Cc: Mike Galbraith 
Cc: Kirill A. Shutemov 
Cc: axie 
Cc: Andrew Morton 
---
 mm/rmap.c | 44 +---
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index c8993c63eb25..da97ed525088 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -887,11 +887,21 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
+   unsigned long start = address, end;
int *cleaned = arg;
 
+   /*
+* We have to assume the worse case ie pmd for invalidation. Note that
+* the page can not be free from this function.
+*/
+   end = min(vma->vm_end, (start & PMD_MASK) + PMD_SIZE);
+   mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
+   unsigned long cstart, cend;
int ret = 0;
-   address = pvmw.address;
+
+   cstart = address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -904,6 +914,7 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
entry = pte_wrprotect(entry);
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
+   cend = cstart + PAGE_SIZE;
ret = 1;
} else {
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
@@ -918,6 +929,8 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
+   cstart &= PMD_MASK;
+   cend = cstart + PMD_SIZE;
ret = 1;
 #else
/* unexpected pmd-mapped page? */
@@ -926,11 +939,13 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
}
 
if (ret) {
-   mmu_notifier_invalidate_page(vma->vm_mm, address);
+   mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
(*cleaned)++;
}
}
 
+   mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+
return true;
 }
 
@@ -1324,6 +1339,7 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
pte_t pteval;
struct page *subpage;
bool ret = true;
+   unsigned long start = address, end;
enum ttu_flags flags = (enum ttu_flags)arg;
 
/* munlock has nothing to gain from examining un-locked vmas */
@@ -1335,6 +1351,14 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
flags & TTU_MIGRATION, page);
}
 
+   /*
+* We have to assume the worse case ie pmd for invalidation. Note that
+* the page can not be free in this function as call of try_to_unmap()
+* must hold a reference on the page.
+*/
+   end = min(vma->vm_end, (start & PMD_MASK) + PMD_SIZE);
+   mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+
while (page_vma_mapped_walk(&pvmw)) {
/*
 * If the page is mlock()d, we cannot swap it out.
@@ -1408,6 +1432,8 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
set_huge_swap_pte_at(mm, address,
 pvmw.pte, pteval,
 vma_mmu_pagesize(vma));
+   mmu_notifier_invalidate_range(mm, address,
+   address + vma_mmu_pagesize(vma));
} else {
dec_mm_counter(mm, mm_counter(page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1435,6 +1461,8 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
 

[PATCH 05/13] IB/umem: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.
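
The same conversion pattern repeats across the driver patches in this
series. A reduced sketch (hypothetical demo_* names, not the umem_odp
code) of why the per-page hook can simply go away: it was only ever the
range callback restricted to a single page.

#define DEMO_PAGE_SIZE 4096UL

struct demo_notifier_ops {
	void (*invalidate_range_start)(unsigned long start, unsigned long end);
	/* .invalidate_page used to sit here; it no longer exists */
};

void demo_invalidate_range_start(unsigned long start, unsigned long end)
{
	/* the driver drops/flushes its own mappings for [start, end) */
	(void)start;
	(void)end;
}

/* What the removed invalidate_page() callback amounted to: */
void demo_invalidate_page(unsigned long address)
{
	demo_invalidate_range_start(address, address + DEMO_PAGE_SIZE);
}

const struct demo_notifier_ops demo_ops = {
	.invalidate_range_start = demo_invalidate_range_start,
};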

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Leon Romanovsky <leo...@mellanox.com>
Cc: linux-r...@vger.kernel.org
Cc: Artemy Kovalyov <artem...@mellanox.com>
Cc: Doug Ledford <dledf...@redhat.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/infiniband/core/umem_odp.c | 19 ---
 1 file changed, 19 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 8c4ec564e495..55e8f5ed8b3c 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -166,24 +166,6 @@ static int invalidate_page_trampoline(struct ib_umem 
*item, u64 start,
return 0;
 }
 
-static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
-struct mm_struct *mm,
-unsigned long address)
-{
-   struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
-
-   if (!context->invalidate_range)
-   return;
-
-   ib_ucontext_notifier_start_account(context);
-   down_read(&context->umem_rwsem);
-   rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
- address + PAGE_SIZE,
- invalidate_page_trampoline, NULL);
-   up_read(&context->umem_rwsem);
-   ib_ucontext_notifier_end_account(context);
-}
-
 static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
 u64 end, void *cookie)
 {
@@ -237,7 +219,6 @@ static void ib_umem_notifier_invalidate_range_end(struct 
mmu_notifier *mn,
 
 static const struct mmu_notifier_ops ib_umem_notifiers = {
.release= ib_umem_notifier_release,
-   .invalidate_page= ib_umem_notifier_invalidate_page,
.invalidate_range_start = ib_umem_notifier_invalidate_range_start,
.invalidate_range_end   = ib_umem_notifier_invalidate_range_end,
 };
-- 
2.13.5




[PATCH 06/13] IB/hfi1: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: linux-r...@vger.kernel.org
Cc: Dean Luick <dean.lu...@intel.com>
Cc: Ira Weiny <ira.we...@intel.com>
Cc: Doug Ledford <dledf...@redhat.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/infiniband/hw/hfi1/mmu_rb.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c 
b/drivers/infiniband/hw/hfi1/mmu_rb.c
index ccbf52c8ff6f..e4b56a0dd6d0 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -67,8 +67,6 @@ struct mmu_rb_handler {
 
 static unsigned long mmu_node_start(struct mmu_rb_node *);
 static unsigned long mmu_node_last(struct mmu_rb_node *);
-static inline void mmu_notifier_page(struct mmu_notifier *, struct mm_struct *,
-unsigned long);
 static inline void mmu_notifier_range_start(struct mmu_notifier *,
struct mm_struct *,
unsigned long, unsigned long);
@@ -82,7 +80,6 @@ static void do_remove(struct mmu_rb_handler *handler,
 static void handle_remove(struct work_struct *work);
 
 static const struct mmu_notifier_ops mn_opts = {
-   .invalidate_page = mmu_notifier_page,
.invalidate_range_start = mmu_notifier_range_start,
 };
 
@@ -285,12 +282,6 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler,
handler->ops->remove(handler->ops_arg, node);
 }
 
-static inline void mmu_notifier_page(struct mmu_notifier *mn,
-struct mm_struct *mm, unsigned long addr)
-{
-   mmu_notifier_mem_invalidate(mn, mm, addr, addr + PAGE_SIZE);
-}
-
 static inline void mmu_notifier_range_start(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start,
-- 
2.13.5



[PATCH 04/13] drm/amdgpu: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: amd-...@lists.freedesktop.org
Cc: Felix Kuehling <felix.kuehl...@amd.com>
Cc: Christian König <christian.koe...@amd.com>
Cc: Alex Deucher <alexander.deuc...@amd.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 31 ---
 1 file changed, 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 6558a3ed57a7..e1cde6b80027 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -147,36 +147,6 @@ static void amdgpu_mn_invalidate_node(struct 
amdgpu_mn_node *node,
 }
 
 /**
- * amdgpu_mn_invalidate_page - callback to notify about mm change
- *
- * @mn: our notifier
- * @mn: the mm this callback is about
- * @address: address of invalidate page
- *
- * Invalidation of a single page. Blocks for all BOs mapping it
- * and unmap them by move them into system domain again.
- */
-static void amdgpu_mn_invalidate_page(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address)
-{
-   struct amdgpu_mn *rmn = container_of(mn, struct amdgpu_mn, mn);
-   struct interval_tree_node *it;
-
-   mutex_lock(&rmn->lock);
-
-   it = interval_tree_iter_first(&rmn->objects, address, address);
-   if (it) {
-   struct amdgpu_mn_node *node;
-
-   node = container_of(it, struct amdgpu_mn_node, it);
-   amdgpu_mn_invalidate_node(node, address, address);
-   }
-
-   mutex_unlock(&rmn->lock);
-}
-
-/**
  * amdgpu_mn_invalidate_range_start - callback to notify about mm change
  *
  * @mn: our notifier
@@ -215,7 +185,6 @@ static void amdgpu_mn_invalidate_range_start(struct 
mmu_notifier *mn,
 
 static const struct mmu_notifier_ops amdgpu_mn_ops = {
.release = amdgpu_mn_release,
-   .invalidate_page = amdgpu_mn_invalidate_page,
.invalidate_range_start = amdgpu_mn_invalidate_range_start,
 };
 
-- 
2.13.5




[PATCH 03/13] powerpc/powernv: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: linuxppc-...@lists.ozlabs.org
Cc: Alistair Popple <alist...@popple.id.au>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 arch/powerpc/platforms/powernv/npu-dma.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index b5d960d6db3d..4c7b8591f737 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
mmio_invalidate(npu_context, 1, address, true);
 }
 
-static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
-   struct mm_struct *mm,
-   unsigned long address)
-{
-   struct npu_context *npu_context = mn_to_npu_context(mn);
-
-   mmio_invalidate(npu_context, 1, address, true);
-}
-
 static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct 
mmu_notifier *mn,
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
.release = pnv_npu2_mn_release,
.change_pte = pnv_npu2_mn_change_pte,
-   .invalidate_page = pnv_npu2_mn_invalidate_page,
.invalidate_range = pnv_npu2_mn_invalidate_range,
 };
 
-- 
2.13.5



[PATCH 07/13] iommu/amd: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Suravee Suthikulpanit <suravee.suthikulpa...@amd.com>
Cc: io...@lists.linux-foundation.org
Cc: Joerg Roedel <jroe...@suse.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/iommu/amd_iommu_v2.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 6629c472eafd..dccf5b76eff2 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -391,13 +391,6 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
return 0;
 }
 
-static void mn_invalidate_page(struct mmu_notifier *mn,
-  struct mm_struct *mm,
-  unsigned long address)
-{
-   __mn_flush_page(mn, address);
-}
-
 static void mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -436,7 +429,6 @@ static void mn_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
 static const struct mmu_notifier_ops iommu_mn = {
.release= mn_release,
.clear_flush_young  = mn_clear_flush_young,
-   .invalidate_page= mn_invalidate_page,
.invalidate_range   = mn_invalidate_range,
 };
 
-- 
2.13.5




[PATCH 08/13] iommu/intel: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: David Woodhouse <dw...@infradead.org>
Cc: io...@lists.linux-foundation.org
Cc: Joerg Roedel <jroe...@suse.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/iommu/intel-svm.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index f167c0d84ebf..f620dccec8ee 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -223,14 +223,6 @@ static void intel_change_pte(struct mmu_notifier *mn, 
struct mm_struct *mm,
intel_flush_svm_range(svm, address, 1, 1, 0);
 }
 
-static void intel_invalidate_page(struct mmu_notifier *mn, struct mm_struct 
*mm,
- unsigned long address)
-{
-   struct intel_svm *svm = container_of(mn, struct intel_svm, notifier);
-
-   intel_flush_svm_range(svm, address, 1, 1, 0);
-}
-
 /* Pages have been freed at this point */
 static void intel_invalidate_range(struct mmu_notifier *mn,
   struct mm_struct *mm,
@@ -285,7 +277,6 @@ static void intel_mm_release(struct mmu_notifier *mn, 
struct mm_struct *mm)
 static const struct mmu_notifier_ops intel_mmuops = {
.release = intel_mm_release,
.change_pte = intel_change_pte,
-   .invalidate_page = intel_invalidate_page,
.invalidate_range = intel_invalidate_range,
 };
 
-- 
2.13.5




[PATCH 11/13] xen/gntdev: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
Cc: Roger Pau Monné <roger@citrix.com>
Cc: Boris Ostrovsky <boris.ostrov...@oracle.com>
Cc: xen-de...@lists.xenproject.org
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/xen/gntdev.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c
index f3bf8f4e2d6c..82360594fa8e 100644
--- a/drivers/xen/gntdev.c
+++ b/drivers/xen/gntdev.c
@@ -484,13 +484,6 @@ static void mn_invl_range_start(struct mmu_notifier *mn,
mutex_unlock(&priv->lock);
 }
 
-static void mn_invl_page(struct mmu_notifier *mn,
-struct mm_struct *mm,
-unsigned long address)
-{
-   mn_invl_range_start(mn, mm, address, address + PAGE_SIZE);
-}
-
 static void mn_release(struct mmu_notifier *mn,
   struct mm_struct *mm)
 {
@@ -522,7 +515,6 @@ static void mn_release(struct mmu_notifier *mn,
 
 static const struct mmu_notifier_ops gntdev_mmu_ops = {
.release= mn_release,
-   .invalidate_page= mn_invl_page,
.invalidate_range_start = mn_invl_range_start,
 };
 
-- 
2.13.5




[PATCH 10/13] sgi-gru: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Dimitri Sivanich <sivan...@sgi.com>
Cc: Jack Steiner <stei...@sgi.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/misc/sgi-gru/grutlbpurge.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/drivers/misc/sgi-gru/grutlbpurge.c 
b/drivers/misc/sgi-gru/grutlbpurge.c
index e936d43895d2..9918eda0e05f 100644
--- a/drivers/misc/sgi-gru/grutlbpurge.c
+++ b/drivers/misc/sgi-gru/grutlbpurge.c
@@ -247,17 +247,6 @@ static void gru_invalidate_range_end(struct mmu_notifier 
*mn,
gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx\n", gms, start, end);
 }
 
-static void gru_invalidate_page(struct mmu_notifier *mn, struct mm_struct *mm,
-   unsigned long address)
-{
-   struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
-ms_notifier);
-
-   STAT(mmu_invalidate_page);
-   gru_flush_tlb_range(gms, address, PAGE_SIZE);
-   gru_dbg(grudev, "gms %p, address 0x%lx\n", gms, address);
-}
-
 static void gru_release(struct mmu_notifier *mn, struct mm_struct *mm)
 {
struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct,
@@ -269,7 +258,6 @@ static void gru_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
 
 
 static const struct mmu_notifier_ops gru_mmuops = {
-   .invalidate_page= gru_invalidate_page,
.invalidate_range_start = gru_invalidate_range_start,
.invalidate_range_end   = gru_invalidate_range_end,
.release= gru_release,
-- 
2.13.5



[PATCH 09/13] misc/mic/scif: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Sudeep Dutt <sudeep.d...@intel.com>
Cc: Ashutosh Dixit <ashutosh.di...@intel.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 drivers/misc/mic/scif/scif_dma.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c
index 64d5760d069a..63d6246d6dff 100644
--- a/drivers/misc/mic/scif/scif_dma.c
+++ b/drivers/misc/mic/scif/scif_dma.c
@@ -200,16 +200,6 @@ static void scif_mmu_notifier_release(struct mmu_notifier 
*mn,
schedule_work(&scif_info.misc_work);
 }
 
-static void scif_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long address)
-{
-   struct scif_mmu_notif   *mmn;
-
-   mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier);
-   scif_rma_destroy_tcw(mmn, address, PAGE_SIZE);
-}
-
 static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 struct mm_struct *mm,
 unsigned long start,
@@ -235,7 +225,6 @@ static void scif_mmu_notifier_invalidate_range_end(struct 
mmu_notifier *mn,
 static const struct mmu_notifier_ops scif_mmu_notifier_ops = {
.release = scif_mmu_notifier_release,
.clear_flush_young = NULL,
-   .invalidate_page = scif_mmu_notifier_invalidate_page,
.invalidate_range_start = scif_mmu_notifier_invalidate_range_start,
.invalidate_range_end = scif_mmu_notifier_invalidate_range_end};
 
-- 
2.13.5




[PATCH 12/13] KVM: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: k...@vger.kernel.org
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Andrea Arcangeli <aarca...@redhat.com>
---
 virt/kvm/kvm_main.c | 42 --
 1 file changed, 42 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 15252d723b54..4d81f6ded88e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -322,47 +322,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct 
mmu_notifier *mn)
return container_of(mn, struct kvm, mmu_notifier);
 }
 
-static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
-struct mm_struct *mm,
-unsigned long address)
-{
-   struct kvm *kvm = mmu_notifier_to_kvm(mn);
-   int need_tlb_flush, idx;
-
-   /*
-* When ->invalidate_page runs, the linux pte has been zapped
-* already but the page is still allocated until
-* ->invalidate_page returns. So if we increase the sequence
-* here the kvm page fault will notice if the spte can't be
-* established because the page is going to be freed. If
-* instead the kvm page fault establishes the spte before
-* ->invalidate_page runs, kvm_unmap_hva will release it
-* before returning.
-*
-* The sequence increase only need to be seen at spin_unlock
-* time, and not at spin_lock time.
-*
-* Increasing the sequence after the spin_unlock would be
-* unsafe because the kvm page fault could then establish the
-* pte after kvm_unmap_hva returned, without noticing the page
-* is going to be freed.
-*/
-   idx = srcu_read_lock(&kvm->srcu);
-   spin_lock(&kvm->mmu_lock);
-
-   kvm->mmu_notifier_seq++;
-   need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
-   /* we've to flush the tlb before the pages can be freed */
-   if (need_tlb_flush)
-   kvm_flush_remote_tlbs(kvm);
-
-   spin_unlock(&kvm->mmu_lock);
-
-   kvm_arch_mmu_notifier_invalidate_page(kvm, address);
-
-   srcu_read_unlock(&kvm->srcu, idx);
-}
-
 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -510,7 +469,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier 
*mn,
 }
 
 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
-   .invalidate_page= kvm_mmu_notifier_invalidate_page,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end   = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young  = kvm_mmu_notifier_clear_flush_young,
-- 
2.13.5



[PATCH 13/13] mm/mmu_notifier: kill invalidate_page

2017-08-29 Thread Jérôme Glisse
The invalidate_page callback suffered from two pitfalls. First, it used to
happen after the page table lock was released, and thus a new page might
have been set up for the virtual address before the call to
invalidate_page() happened.

This was fixed in a somewhat odd way by c7ab0d2fdc840266b39db94538f74207ec2afbf6,
which moved the callback under the page table lock, but this also broke
several existing users of the mmu_notifier API that assumed they could
sleep inside this callback.

The second pitfall was that invalidate_page was the only callback not
taking an address range for invalidation; it was given a single address
and a page. Many of the callback implementers assumed this could never be
THP and thus failed to invalidate the appropriate range for THP.

By killing this callback we unify the mmu_notifier callback API to always
take a virtual address range as input.

Finally, this also simplifies life for end users, as there are now two
clear choices (sketched right after this list):
  - invalidate_range_start()/end() callbacks (which allow you to sleep)
  - invalidate_range(), where you cannot sleep but which happens right
    after the page table update, under the page table lock
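
Below is a reduced, self-contained sketch of what a user looks like after
this series; the ops structure is trimmed and renamed, the kernel types
are only forward-declared, and a real user registers its notifier through
mmu_notifier_register():

struct mmu_notifier;
struct mm_struct;

struct sketch_notifier_ops {
	/* sleepable pair, called around the page table change */
	void (*invalidate_range_start)(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long start, unsigned long end);
	void (*invalidate_range_end)(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long start, unsigned long end);
	/* non-sleeping, called right after the page table update,
	 * under the page table lock */
	void (*invalidate_range)(struct mmu_notifier *mn,
				 struct mm_struct *mm,
				 unsigned long start, unsigned long end);
	/* note: no per-page invalidate_page callback anymore */
};

static void mydrv_range_start(struct mmu_notifier *mn, struct mm_struct *mm,
			      unsigned long start, unsigned long end)
{
	/* may sleep here: take mutexes, wait for the device to flush */
}

static void mydrv_range_end(struct mmu_notifier *mn, struct mm_struct *mm,
			    unsigned long start, unsigned long end)
{
	/* re-allow device access to [start, end) */
}

const struct sketch_notifier_ops mydrv_ops = {
	.invalidate_range_start = mydrv_range_start,
	.invalidate_range_end   = mydrv_range_end,
};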

Signed-off-by: Jérôme Glisse 
Cc: Linus Torvalds 
Cc: Bernhard Held 
Cc: Adam Borowski 
Cc: Andrea Arcangeli 
Cc: Radim Krčmář 
Cc: Wanpeng Li 
Cc: Paolo Bonzini 
Cc: Takashi Iwai 
Cc: Nadav Amit 
Cc: Mike Galbraith 
Cc: Kirill A. Shutemov 
Cc: axie 
Cc: Andrew Morton 
---
 include/linux/mmu_notifier.h | 25 -
 mm/mmu_notifier.c| 14 --
 2 files changed, 39 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index c91b3bcd158f..7b2e31b1745a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -95,17 +95,6 @@ struct mmu_notifier_ops {
   pte_t pte);
 
/*
-* Before this is invoked any secondary MMU is still ok to
-* read/write to the page previously pointed to by the Linux
-* pte because the page hasn't been freed yet and it won't be
-* freed until this returns. If required set_page_dirty has to
-* be called internally to this method.
-*/
-   void (*invalidate_page)(struct mmu_notifier *mn,
-   struct mm_struct *mm,
-   unsigned long address);
-
-   /*
 * invalidate_range_start() and invalidate_range_end() must be
 * paired and are called only when the mmap_sem and/or the
 * locks protecting the reverse maps are held. If the subsystem
@@ -220,8 +209,6 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm,
 unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
  unsigned long address, pte_t pte);
-extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
- unsigned long address);
 extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
  unsigned long start, unsigned long end);
 extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
@@ -268,13 +255,6 @@ static inline void mmu_notifier_change_pte(struct 
mm_struct *mm,
__mmu_notifier_change_pte(mm, address, pte);
 }
 
-static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
- unsigned long address)
-{
-   if (mm_has_notifiers(mm))
-   __mmu_notifier_invalidate_page(mm, address);
-}
-
 static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
  unsigned long start, unsigned long end)
 {
@@ -442,11 +422,6 @@ static inline void mmu_notifier_change_pte(struct 
mm_struct *mm,
 {
 }
 
-static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
- unsigned long address)
-{
-}
-
 static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
  unsigned long start, unsigned long end)
 {
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 54ca54562928..314285284e6e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -174,20 +174,6 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, 
unsigned long address,
 srcu_read_unlock(&srcu, id);
 }
 
-void __mmu_notifier_invalidate_page(struct mm_struct *mm,
- unsigned long address)
-{
-   struct mmu_notifier *mn;
-   int id;
-
-   id = srcu_read_lock(&srcu);
-   hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
-   if (mn->ops->invalidate_page)
-   mn->ops->invalidate_page(mn, mm, address);
-   }
-   srcu_read_unlock(&srcu, id);
-}
-
 void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
  unsigned long start, unsigned long end)
 {
-- 
2.13.5



[PATCH 00/13] mmu_notifier kill invalidate_page callback

2017-08-29 Thread Jérôme Glisse
(Sorry for so much list cross-posting and the big cc)

Please help with testing!

The invalidate_page callback suffered from two pitfalls. First, it used to
happen after the page table lock was released, and thus a new page might have
been set up for the virtual address before the call to invalidate_page().

This was fixed, in a roundabout way, by c7ab0d2fdc840266b39db94538f74207ec2afbf6,
which moved the callback under the page table lock. That change also broke
several existing users of the mmu_notifier API that assumed they could
sleep inside this callback.

The second pitfall was that invalidate_page was the only callback that did
not take an address range to invalidate; instead it was given a single
address and a page. Many of the callback implementers assumed the page could
never be THP and thus failed to invalidate the appropriate range for THP
pages.

By killing this callback we unify the mmu_notifier callback API to always
take a virtual address range as input.

There are now two clear APIs (I am not mentioning the *_young API, which is
seldom used):
  - the invalidate_range_start()/end() callbacks (which allow you to sleep)
  - invalidate_range(), where you cannot sleep but which happens right after
    the page table update, under the page table lock


Note that a lot of existing users look broken with respect to range_start()/
range_end(). Many users only have a range_start() callback, but there is
nothing preventing them from undoing what was invalidated in their
range_start() callback after it returns but before any CPU page table update
takes place.

The code pattern used in kvm or umem odp is an example of how to properly
avoid such a race. In a nutshell, use some kind of sequence number and an
active range invalidation counter to block anything that might undo what the
range_start() callback did.
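
For illustration only, a stripped down sketch of that pattern could look like
the following; "example_dev" and all of its fields are made-up names for a
hypothetical driver, not code taken from kvm or umem odp, and registration and
error handling are omitted:

    struct example_dev {
            struct mmu_notifier mn;
            spinlock_t lock;
            unsigned long invalidate_seq;   /* bumped when an invalidation ends */
            int invalidate_count;           /* number of in-flight invalidations */
    };

    static void example_range_start(struct mmu_notifier *mn,
                                    struct mm_struct *mm,
                                    unsigned long start, unsigned long end)
    {
            struct example_dev *dev = container_of(mn, struct example_dev, mn);

            spin_lock(&dev->lock);
            dev->invalidate_count++;
            spin_unlock(&dev->lock);
            /* Tear down the device mappings for [start, end) here. */
    }

    static void example_range_end(struct mmu_notifier *mn,
                                  struct mm_struct *mm,
                                  unsigned long start, unsigned long end)
    {
            struct example_dev *dev = container_of(mn, struct example_dev, mn);

            spin_lock(&dev->lock);
            dev->invalidate_seq++;
            dev->invalidate_count--;
            spin_unlock(&dev->lock);
    }

    /*
     * Device fault path: sample the sequence number before walking the CPU
     * page table, and only install a device mapping if no invalidation is
     * in flight and the sequence number did not change in the meantime.
     */
    static bool example_can_install(struct example_dev *dev, unsigned long seq)
    {
            bool ok;

            spin_lock(&dev->lock);
            ok = !dev->invalidate_count && dev->invalidate_seq == seq;
            spin_unlock(&dev->lock);
            return ok;
    }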

If you do not care about staying fully in sync with the CPU page table (ie
you can live with the CPU page table pointing to a new, different page for a
given virtual address), then you can take a reference on the pages inside
the range_start() callback and drop it in range_end(), or when your driver
is done with those pages.
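
Again purely as a sketch, reusing the made-up example_dev structure from the
sketch above (dev->start, dev->npages and dev->pages[] are invented fields,
and locking is omitted), that could look like:

    static void example_pin_range_start(struct mmu_notifier *mn,
                                        struct mm_struct *mm,
                                        unsigned long start, unsigned long end)
    {
            struct example_dev *dev = container_of(mn, struct example_dev, mn);
            unsigned long i;

            for (i = 0; i < dev->npages; i++) {
                    unsigned long addr = dev->start + i * PAGE_SIZE;

                    /* Keep the old page alive while the device still uses it. */
                    if (addr >= start && addr < end && dev->pages[i])
                            get_page(dev->pages[i]);
            }
    }

    static void example_pin_range_end(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
    {
            struct example_dev *dev = container_of(mn, struct example_dev, mn);
            unsigned long i;

            for (i = 0; i < dev->npages; i++) {
                    unsigned long addr = dev->start + i * PAGE_SIZE;

                    if (addr >= start && addr < end && dev->pages[i])
                            put_page(dev->pages[i]);
            }
    }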

The last alternative is to use invalidate_range(), if you can do the
invalidation without sleeping, as the invalidate_range() callback happens
under the CPU page table spinlock right after the page table is updated.


Note this is barely tested. I intend to do more testing in the next few days,
but I do not have access to all the hardware that makes use of the
mmu_notifier API.


The first 2 patches convert existing calls of mmu_notifier_invalidate_page()
to mmu_notifier_invalidate_range() and bracket those calls with calls to
mmu_notifier_invalidate_range_start()/end().

The next 10 patches remove the existing invalidate_page() callbacks, as they
can no longer be called.

Finally, the last patch removes it completely so it can RIP.

Jérôme Glisse (13):
  dax: update to new mmu_notifier semantic
  mm/rmap: update to new mmu_notifier semantic
  powerpc/powernv: update to new mmu_notifier semantic
  drm/amdgpu: update to new mmu_notifier semantic
  IB/umem: update to new mmu_notifier semantic
  IB/hfi1: update to new mmu_notifier semantic
  iommu/amd: update to new mmu_notifier semantic
  iommu/intel: update to new mmu_notifier semantic
  misc/mic/scif: update to new mmu_notifier semantic
  sgi-gru: update to new mmu_notifier semantic
  xen/gntdev: update to new mmu_notifier semantic
  KVM: update to new mmu_notifier semantic
  mm/mmu_notifier: kill invalidate_page

Cc: Kirill A. Shutemov 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Andrea Arcangeli 
Cc: Joerg Roedel 
Cc: Dan Williams 
Cc: Sudeep Dutt 
Cc: Ashutosh Dixit 
Cc: Dimitri Sivanich 
Cc: Jack Steiner 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 

Cc: linuxppc-...@lists.ozlabs.org
Cc: dri-de...@lists.freedesktop.org
Cc: amd-...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: io...@lists.linux-foundation.org
Cc: xen-de...@lists.xenproject.org
Cc: k...@vger.kernel.org


 arch/powerpc/platforms/powernv/npu-dma.c | 10 
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c   | 31 --
 drivers/infiniband/core/umem_odp.c   | 19 --
 drivers/infiniband/hw/hfi1/mmu_rb.c  |  9 ---
 drivers/iommu/amd_iommu_v2.c |  8 --
 drivers/iommu/intel-svm.c|  9 ---
 drivers/misc/mic/scif/scif_dma.c | 11 
 drivers/misc/sgi-gru/grutlbpurge.c   | 12 -
 drivers/xen/gntdev.c |  8 --
 fs/dax.c | 19 --
 include/linux/mm.h   |  1 +
 include/linux/mmu_notifier.h | 25 --
 mm/memory.c  | 26 +++
 mm/mmu_notifier.c| 14 --
 mm/rmap.c| 44 +---
 virt/kvm/kvm_main.c  | 42 --
 16 files changed, 74 insertions(+), 214 deletions(-)

-- 
2.13.5



[PATCH 01/13] dax: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Replace all mmu_notifier_invalidate_page() calls by mmu_notifier_invalidate_range()
and make sure they are bracketed by calls to
mmu_notifier_invalidate_range_start()/end().

Note that because we cannot presume the pmd value or pte value, we have to
assume the worst and unconditionally report an invalidation as happening.
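
As an illustration of the resulting pattern, here is a schematic sketch only,
not the exact fs/dax.c code below ("example_mkclean_one" and its argument list
are made up for this sketch):

    static void example_mkclean_one(struct vm_area_struct *vma,
                                    unsigned long address,
                                    pte_t *ptep, spinlock_t *ptl)
    {
            struct mm_struct *mm = vma->vm_mm;
            unsigned long start = address & PAGE_MASK;
            unsigned long end = start + PAGE_SIZE;
            pte_t pte;

            /* Sleepable listeners are told before any lock is taken. */
            mmu_notifier_invalidate_range_start(mm, start, end);

            spin_lock(ptl);
            pte = ptep_clear_flush(vma, address, ptep);
            pte = pte_wrprotect(pte);
            pte = pte_mkclean(pte);
            set_pte_at(mm, address, ptep, pte);
            /* Non-sleeping listeners see the update while the lock is held. */
            mmu_notifier_invalidate_range(mm, start, end);
            spin_unlock(ptl);

            mmu_notifier_invalidate_range_end(mm, start, end);
    }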

Signed-off-by: Jérôme Glisse 
Cc: Dan Williams 
Cc: Ross Zwisler 
Cc: Linus Torvalds 
Cc: Bernhard Held 
Cc: Adam Borowski 
Cc: Andrea Arcangeli 
Cc: Radim Krčmář 
Cc: Wanpeng Li 
Cc: Paolo Bonzini 
Cc: Takashi Iwai 
Cc: Nadav Amit 
Cc: Mike Galbraith 
Cc: Kirill A. Shutemov 
Cc: axie 
Cc: Andrew Morton 
---
 fs/dax.c   | 19 +++
 include/linux/mm.h |  1 +
 mm/memory.c| 26 +-
 3 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 865d42c63e23..ab925dc6647a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -646,11 +646,10 @@ static void dax_mapping_entry_mkclean(struct 
address_space *mapping,
pte_t pte, *ptep = NULL;
pmd_t *pmdp = NULL;
spinlock_t *ptl;
-   bool changed;
 
i_mmap_lock_read(mapping);
 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   unsigned long address;
+   unsigned long address, start, end;
 
cond_resched();
 
@@ -658,8 +657,13 @@ static void dax_mapping_entry_mkclean(struct address_space 
*mapping,
continue;
 
address = pgoff_address(index, vma);
-   changed = false;
-   if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl))
+
+   /*
+* Note because we provide start/end to follow_pte_pmd it will
+* call mmu_notifier_invalidate_range_start() on our behalf
+* before taking any lock.
+*/
+   if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
continue;
 
if (pmdp) {
@@ -676,7 +680,7 @@ static void dax_mapping_entry_mkclean(struct address_space 
*mapping,
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-   changed = true;
+   mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pmd:
spin_unlock(ptl);
 #endif
@@ -691,13 +695,12 @@ static void dax_mapping_entry_mkclean(struct 
address_space *mapping,
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
-   changed = true;
+   mmu_notifier_invalidate_range(vma->vm_mm, start, end);
 unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
 
-   if (changed)
-   mmu_notifier_invalidate_page(vma->vm_mm, address);
+   mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
}
i_mmap_unlock_read(mapping);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 46b9ac5e8569..c1f6c95f3496 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1260,6 +1260,7 @@ int copy_page_range(struct mm_struct *dst, struct 
mm_struct *src,
 void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
 int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
+unsigned long *start, unsigned long *end,
 pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn);
diff --git a/mm/memory.c b/mm/memory.c
index fe2fba27ded2..56e48e4593cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4008,7 +4008,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
-   pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+   unsigned long *start, unsigned long *end,
+   pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4035,17 +4036,29 @@ static int __follow_pte_pmd(struct mm_struct *mm, 
unsigned long address,
if (!pmdpp)
goto out;
 
+   if (start && end) {
+   *start = address & PMD_MASK;
+   *end = *start + PMD_SIZE;
+   mmu_notifier_invalidate_range_start(mm, *start, *end);
+   }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*

[PATCH 0/4] mmu_notifier semantic update

2017-08-29 Thread Jérôme Glisse
So we do not want to allow sleeping during a call to mmu_notifier_invalidate_page(),
but some code does not have a surrounding mmu_notifier_invalidate_range_start()/
mmu_notifier_invalidate_range_end() pair or a mmu_notifier_invalidate_range() call.

This patch series just makes sure that there is at least one call (outside the
spinlock section) to mmu_notifier_invalidate_range() after
mmu_notifier_invalidate_page().
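
For illustration, the shape all four patches follow is roughly as below; this
is a sketch only, not code from any one of these patches, and the function
name and arguments are made up:

    static void example_clear_one_pte(struct vm_area_struct *vma,
                                      unsigned long address,
                                      pte_t *ptep, spinlock_t *ptl)
    {
            struct mm_struct *mm = vma->vm_mm;

            spin_lock(ptl);
            ptep_clear_flush(vma, address, ptep);
            /* Still under the page table lock, so no sleeping here. */
            mmu_notifier_invalidate_page(mm, address);
            spin_unlock(ptl);

            /*
             * At least one range call after invalidate_page(), outside the
             * spinlock section; the range end is exclusive.
             */
            mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE);
    }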

This fixes the issue with AMD IOMMU v2 while avoiding introducing issues for
other users of the mmu_notifier API. For relevant threads see:

https://lkml.kernel.org/r/20170809204333.27485-1-jgli...@redhat.com
https://lkml.kernel.org/r/20170804134928.l4klfcnqatni7...@black.fi.intel.com
https://marc.info/?l=kvm=150327081325160=2

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Bernhard Held <berny...@gmx.de>
Cc: Adam Borowski <kilob...@angband.pl>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Wanpeng Li <kernel...@gmail.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Takashi Iwai <ti...@suse.de>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: axie <a...@amd.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>

Jérôme Glisse (4):
  mm/mmu_notifier: document new behavior for
mmu_notifier_invalidate_page()
  dax/mmu_notifier: update to new mmu_notifier semantic
  mm/rmap: update to new mmu_notifier_invalidate_page() semantic
  iommu/amd: update to new mmu_notifier_invalidate_page() semantic

 drivers/iommu/amd_iommu_v2.c |  8 
 fs/dax.c |  8 ++--
 include/linux/mmu_notifier.h |  6 ++
 mm/rmap.c| 18 +-
 4 files changed, 29 insertions(+), 11 deletions(-)

-- 
2.13.5



[PATCH 4/4] iommu/amd: update to new mmu_notifier_invalidate_page() semantic

2017-08-29 Thread Jérôme Glisse
mmu_notifier_invalidate_page() is now called from under the spinlock,
but we can now rely on invalidate_range() being called afterward.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Joerg Roedel <jroe...@suse.de>
Cc: Suravee Suthikulpanit <suravee.suthikulpa...@amd.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Bernhard Held <berny...@gmx.de>
Cc: Adam Borowski <kilob...@angband.pl>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Wanpeng Li <kernel...@gmail.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Takashi Iwai <ti...@suse.de>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: axie <a...@amd.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 drivers/iommu/amd_iommu_v2.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c
index 6629c472eafd..dccf5b76eff2 100644
--- a/drivers/iommu/amd_iommu_v2.c
+++ b/drivers/iommu/amd_iommu_v2.c
@@ -391,13 +391,6 @@ static int mn_clear_flush_young(struct mmu_notifier *mn,
return 0;
 }
 
-static void mn_invalidate_page(struct mmu_notifier *mn,
-  struct mm_struct *mm,
-  unsigned long address)
-{
-   __mn_flush_page(mn, address);
-}
-
 static void mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -436,7 +429,6 @@ static void mn_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
 static const struct mmu_notifier_ops iommu_mn = {
.release= mn_release,
.clear_flush_young  = mn_clear_flush_young,
-   .invalidate_page= mn_invalidate_page,
.invalidate_range   = mn_invalidate_range,
 };
 
-- 
2.13.5



[PATCH 2/4] dax/mmu_notifier: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
mmu_notifier_invalidate_page() can now be called from under the spinlock.
Move it appropriately and add a call to mmu_notifier_invalidate_range()
for users that need to be able to sleep.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Bernhard Held <berny...@gmx.de>
Cc: Adam Borowski <kilob...@angband.pl>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Wanpeng Li <kernel...@gmail.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Takashi Iwai <ti...@suse.de>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: axie <a...@amd.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 fs/dax.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 865d42c63e23..23cfb055e92e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -650,7 +650,7 @@ static void dax_mapping_entry_mkclean(struct address_space 
*mapping,
 
i_mmap_lock_read(mapping);
 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   unsigned long address;
+   unsigned long start, address, end;
 
cond_resched();
 
@@ -676,6 +676,9 @@ static void dax_mapping_entry_mkclean(struct address_space 
*mapping,
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+   start = address & PMD_MASK;
+   end = start + PMD_SIZE;
+   mmu_notifier_invalidate_page(vma->vm_mm, address);
changed = true;
 unlock_pmd:
spin_unlock(ptl);
@@ -691,13 +694,16 @@ static void dax_mapping_entry_mkclean(struct 
address_space *mapping,
pte = pte_wrprotect(pte);
pte = pte_mkclean(pte);
set_pte_at(vma->vm_mm, address, ptep, pte);
+   mmu_notifier_invalidate_page(vma->vm_mm, address);
changed = true;
+   start = address;
+   end = start + PAGE_SIZE;
 unlock_pte:
pte_unmap_unlock(ptep, ptl);
}
 
if (changed)
-   mmu_notifier_invalidate_page(vma->vm_mm, address);
+   mmu_notifier_invalidate_range(vma->vm_mm, start, end);
}
i_mmap_unlock_read(mapping);
 }
-- 
2.13.5



[PATCH 3/4] mm/rmap: update to new mmu_notifier_invalidate_page() semantic

2017-08-29 Thread Jérôme Glisse
mmu_notifier_invalidate_page() is now called from under the spinlock.
Add a call to mmu_notifier_invalidate_range() for users that need to be
able to sleep.

Relevant threads:
https://lkml.kernel.org/r/20170809204333.27485-1-jgli...@redhat.com
https://lkml.kernel.org/r/20170804134928.l4klfcnqatni7...@black.fi.intel.com
https://marc.info/?l=kvm=150327081325160=2

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Bernhard Held <berny...@gmx.de>
Cc: Adam Borowski <kilob...@angband.pl>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Wanpeng Li <kernel...@gmail.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Takashi Iwai <ti...@suse.de>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: axie <a...@amd.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 mm/rmap.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index c8993c63eb25..06792e28093c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -887,6 +887,8 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
.address = address,
.flags = PVMW_SYNC,
};
+   unsigned long start = address, end = address;
+   bool invalidate = false;
int *cleaned = arg;
 
 while (page_vma_mapped_walk(&pvmw)) {
@@ -927,10 +929,16 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
 
if (ret) {
mmu_notifier_invalidate_page(vma->vm_mm, address);
+   /* range is exclusive */
+   end = address + PAGE_SIZE;
+   invalidate = true;
(*cleaned)++;
}
}
 
+   if (invalidate)
+   mmu_notifier_invalidate_range(vma->vm_mm, start, end);
+
return true;
 }
 
@@ -1323,8 +1331,9 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
};
pte_t pteval;
struct page *subpage;
-   bool ret = true;
+   bool ret = true, invalidate = false;
enum ttu_flags flags = (enum ttu_flags)arg;
+   unsigned long start = address, end = address;
 
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1491,7 +1500,14 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
mmu_notifier_invalidate_page(mm, address);
+   /* range is exclusive */
+   end = address + PAGE_SIZE;
+   invalidate = true;
}
+
+   if (invalidate)
+   mmu_notifier_invalidate_range(vma->vm_mm, start, end);
+
return ret;
 }
 
-- 
2.13.5



[PATCH 1/4] mm/mmu_notifier: document new behavior for mmu_notifier_invalidate_page()

2017-08-29 Thread Jérôme Glisse
The invalidate_page callback used to happen outside the page table spinlock,
and thus the callback used to be allowed to sleep. This is no longer the case.
However, now all calls to mmu_notifier_invalidate_page() are bracketed by
calls to mmu_notifier_invalidate_range_start()/mmu_notifier_invalidate_range_end().

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Bernhard Held <berny...@gmx.de>
Cc: Adam Borowski <kilob...@angband.pl>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Wanpeng Li <kernel...@gmail.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Takashi Iwai <ti...@suse.de>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: axie <a...@amd.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 include/linux/mmu_notifier.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index c91b3bcd158f..acc72167b9cb 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -100,6 +100,12 @@ struct mmu_notifier_ops {
 * pte because the page hasn't been freed yet and it won't be
 * freed until this returns. If required set_page_dirty has to
 * be called internally to this method.
+*
+* Note that previously this callback wasn't called from under
+* a spinlock and thus you were able to sleep inside it. This
+* is no longer the case. However, now every call to this callback
+* is either bracketed by calls to range_start()/range_end() or
+* followed by a call to invalidate_range().
 */
void (*invalidate_page)(struct mmu_notifier *mn,
struct mm_struct *mm,
-- 
2.13.5



[RFC PATCH] mm/rmap: do not call mmu_notifier_invalidate_page() v3

2017-08-29 Thread Jérôme Glisse
Some MMU notifiers need to be able to sleep during their callbacks. This was
broken by c7ab0d2fdc84 ("mm: convert try_to_unmap_one() to use
page_vma_mapped_walk()").

This patch restores the ability to sleep and properly captures the range of
addresses that needs to be invalidated.

Relevant threads:
https://lkml.kernel.org/r/20170809204333.27485-1-jgli...@redhat.com
https://lkml.kernel.org/r/20170804134928.l4klfcnqatni7...@black.fi.intel.com
https://marc.info/?l=kvm=150327081325160=2

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Bernhard Held <berny...@gmx.de>
Cc: Adam Borowski <kilob...@angband.pl>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Radim Krčmář <rkrc...@redhat.com>
Cc: Wanpeng Li <kernel...@gmail.com>
Cc: Paolo Bonzini <pbonz...@redhat.com>
Cc: Takashi Iwai <ti...@suse.de>
Cc: Nadav Amit <nadav.a...@gmail.com>
Cc: Mike Galbraith <efa...@gmx.de>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: axie <a...@amd.com>
Cc: Andrew Morton <a...@linux-foundation.org>
---
 mm/rmap.c | 26 +-
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index c8993c63eb25..0b25b720f494 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -888,6 +888,8 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
.flags = PVMW_SYNC,
};
int *cleaned = arg;
+   bool invalidate = false;
+   unsigned long start = address, end = address;
 
 while (page_vma_mapped_walk(&pvmw)) {
int ret = 0;
@@ -905,6 +907,9 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
entry = pte_mkclean(entry);
set_pte_at(vma->vm_mm, address, pte, entry);
ret = 1;
+   invalidate = true;
+   /* range is exclusive */
+   end = pvmw.address + PAGE_SIZE;
} else {
 #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
pmd_t *pmd = pvmw.pmd;
@@ -919,18 +924,22 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
ret = 1;
+   invalidate = true;
+   /* range is exclusive */
+   end = pvmw.address + PAGE_SIZE;
 #else
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
 #endif
}
 
-   if (ret) {
-   mmu_notifier_invalidate_page(vma->vm_mm, address);
+   if (ret)
(*cleaned)++;
-   }
}
 
+   if (invalidate)
+   mmu_notifier_invalidate_range(vma->vm_mm, start, end);
+
return true;
 }
 
@@ -1323,8 +1332,9 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
};
pte_t pteval;
struct page *subpage;
-   bool ret = true;
+   bool ret = true, invalidate = false;
enum ttu_flags flags = (enum ttu_flags)arg;
+   unsigned long start = address, end = address;
 
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
@@ -1490,8 +1500,14 @@ static bool try_to_unmap_one(struct page *page, struct 
vm_area_struct *vma,
 discard:
page_remove_rmap(subpage, PageHuge(page));
put_page(page);
-   mmu_notifier_invalidate_page(mm, address);
+   invalidate = true;
+   /* range is exclusive */
+   end = address + PAGE_SIZE;
}
+
+   if (invalidate)
+   mmu_notifier_invalidate_range(mm, start, end);
+
return ret;
 }
 
-- 
2.13.5



[HMM-v25 01/19] hmm: heterogeneous memory management documentation v3

2017-08-16 Thread Jérôme Glisse
This adds documentation for HMM (Heterogeneous Memory Management). It
presents the motivation behind it, the features necessary for it to
be useful, and gives an overview of how it is implemented.

Changed since v2:
  - add section about memory accounting cgroup (memcg)
Changed since v1:
  - removed outdated section

Signed-off-by: Jérôme Glisse 
---
 Documentation/vm/hmm.txt | 384 +++
 MAINTAINERS  |   7 +
 2 files changed, 391 insertions(+)
 create mode 100644 Documentation/vm/hmm.txt

diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
new file mode 100644
index ..4d3aac9f4a5d
--- /dev/null
+++ b/Documentation/vm/hmm.txt
@@ -0,0 +1,384 @@
+Heterogeneous Memory Management (HMM)
+
+Transparently allow any component of a program to use any memory region of
+said program with a device, without using a device specific memory allocator.
+This is becoming a requirement to simplify the use of advanced heterogeneous
+computing, where GPU, DSP or FPGA are used to perform various computations.
+
+This document is divided as follows: in the first section I expose the
+problems related to the use of a device specific allocator. The second section
+exposes the hardware limitations that are inherent to many platforms. The
+third section gives an overview of the HMM design. The fourth section explains
+how CPU page-table mirroring works and what HMM's purpose is in this context.
+The fifth section deals with how device memory is represented inside the
+kernel. Finally, the last section presents the new migration helper that
+allows leveraging the device DMA engine.
+
+
+1) Problems of using device specific memory allocator:
+2) System bus, device memory characteristics
+3) Share address space and migration
+4) Address space mirroring implementation and API
+5) Represent and manage device memory from core kernel point of view
+6) Migrate to and from device memory
+7) Memory cgroup (memcg) and rss accounting
+
+
+---
+
+1) Problems of using device specific memory allocator:
+
+Devices with a large amount of on-board memory (several gigabytes), like GPUs,
+have historically managed their memory through a dedicated, driver specific
+API. This creates a disconnect between memory allocated and managed by the
+device driver and regular application memory (private anonymous memory, shared
+memory or regular file backed memory). From here on I will refer to this
+aspect as split address space. I use shared address space to refer to the
+opposite situation, ie one in which any memory region can be used by the
+device transparently.
+
+The address space is split because the device can only access memory allocated
+through the device specific API. This implies that all memory objects in a
+program are not equal from the device point of view, which complicates large
+programs that rely on a wide set of libraries.
+
+Concretely, this means that code that wants to leverage a device like a GPU
+needs to copy objects between generically allocated memory (malloc, mmap
+private/shared) and memory allocated through the device driver API (this still
+ends up with an mmap, but of the device file).
+
+For flat data sets (array, grid, image, ...) this isn't too hard to achieve,
+but complex data sets (list, tree, ...) are hard to get right. Duplicating a
+complex data set requires re-mapping all the pointer relations between each of
+its elements. This is error prone and programs get harder to debug because of
+the duplicated data set.
+
+Split address space also means that a library cannot transparently use data it
+gets from the core program or from another library, and thus each library
+might have to duplicate its input data set using a specific memory allocator.
+Large projects suffer from this and waste resources because of the various
+memory copies.
+
+Duplicating each library API to accept as input or output memory allocated by
+each device specific allocator is not a viable option. It would lead to a
+combinatorial explosion in the library entry points.
+
+Finally, with the advance of high level language constructs (in C++ but in
+other languages too) it is now possible for the compiler to leverage GPUs or
+other devices without the programmer even knowing about it. Some compiler
+identified patterns are only doable with a shared address space. It is also
+more reasonable to use a shared address space for all the other patterns.
+
+
+---
+
+2) System bus, device memory characteristics
+
+System buses cripple shared address spaces due to a few limitations. Most
+system buses only allow basic memory access from the device to main memory;
+even cache coherency is often optional. Access to device memory from the CPU
+is even more limited; more often than not, it is not cache coherent.
+
+If we only consider the PCIE bus, then a device can access main memory (often
+through an IOMMU) and be cache coherent with the CPUs. However, it only allows

[HMM-v25 03/19] mm/hmm/mirror: mirror process address space on device with HMM helpers v3

2017-08-16 Thread Jérôme Glisse
This is heterogeneous memory management (HMM) process address space
mirroring. In a nutshell, this provides an API to mirror a process address
space on a device. This boils down to keeping the CPU and device page tables
synchronized (we assume that both device and CPU are cache coherent, as PCIe
devices can be).

This patch provides a simple API for device drivers to achieve address
space mirroring, thus avoiding each device driver growing its own CPU
page table walker and its own CPU page table synchronization mechanism.

This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more
hardware in the future.
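
For illustration, a minimal sketch of a mirror user, based on the
hmm_mirror_ops definition and the usage pattern documented in this patch;
"my_dev", my_sync() and the exact hmm_mirror_register() argument list are
assumptions taken from the comment in the patch, not a verified API:

    static void my_sync(struct hmm_mirror *mirror,
                        enum hmm_update_type update_type,
                        unsigned long start, unsigned long end)
    {
            /*
             * Invalidate the device page table for [start, end) and flush
             * the device TLB before returning; this callback is synchronous.
             */
    }

    static const struct hmm_mirror_ops my_mirror_ops = {
            .sync_cpu_device_pagetables = my_sync,
    };

    static int my_dev_bind(struct my_dev *dev, struct mm_struct *mm)
    {
            /* dev embeds a struct hmm_mirror, as in the documented pattern. */
            return hmm_mirror_register(&dev->mirror, mm, &my_mirror_ops);
    }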

Changed since v2:
  - s/device unaddressable/device private/
Changed since v1:
  - Kconfig logic (depend on x86-64 and use ARCH_HAS pattern)

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 include/linux/hmm.h | 110 ++
 mm/Kconfig  |  12 
 mm/hmm.c| 168 +++-
 3 files changed, 275 insertions(+), 15 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ca60595ce784..61c707970aa6 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -72,6 +72,7 @@
 
 #if IS_ENABLED(CONFIG_HMM)
 
+struct hmm;
 
 /*
  * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
@@ -134,6 +135,115 @@ static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long 
pfn)
 }
 
 
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+/*
+ * Mirroring: how to synchronize device page table with CPU page table.
+ *
+ * A device driver that is participating in HMM mirroring must always
+ * synchronize with CPU page table updates. For this, device drivers can either
+ * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
+ * drivers can decide to register one mirror per device per process, or just
+ * one mirror per process for a group of devices. The pattern is:
+ *
+ *  int device_bind_address_space(..., struct mm_struct *mm, ...)
+ *  {
+ *  struct device_address_space *das;
+ *
+ *  // Device driver specific initialization, and allocation of das
+ *  // which contains an hmm_mirror struct as one of its fields.
+ *  ...
+ *
+ *  ret = hmm_mirror_register(>mirror, mm, _mirror_ops);
+ *  if (ret) {
+ *  // Cleanup on error
+ *  return ret;
+ *  }
+ *
+ *  // Other device driver specific initialization
+ *  ...
+ *  }
+ *
+ * Once an hmm_mirror is registered for an address space, the device driver
+ * will get callbacks through sync_cpu_device_pagetables() operation (see
+ * hmm_mirror_ops struct).
+ *
+ * Device driver must not free the struct containing the hmm_mirror struct
+ * before calling hmm_mirror_unregister(). The expected usage is to do that when
+ * the device driver is unbinding from an address space.
+ *
+ *
+ *  void device_unbind_address_space(struct device_address_space *das)
+ *  {
+ *  // Device driver specific cleanup
+ *  ...
+ *
+ *  hmm_mirror_unregister(>mirror);
+ *
+ *  // Other device driver specific cleanup, and now das can be freed
+ *  ...
+ *  }
+ */
+
+struct hmm_mirror;
+
+/*
+ * enum hmm_update_type - type of update
+ * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
+ */
+enum hmm_update_type {
+   HMM_UPDATE_INVALIDATE,
+};
+
+/*
+ * struct hmm_mirror_ops - HMM mirror device operations callback
+ *
+ * @update: callback to update range on a device
+ */
+struct hmm_mirror_ops {
+   /* sync_cpu_device_pagetables() - synchronize page tables
+*
+* @mirror: pointer to struct hmm_mirror
+* @update_type: type of update that occurred to the CPU page table
+* @start: virtual start address of the range to update
+* @end: virtual end address of the range to update
+*
+* This callback ultimately originates from mmu_notifiers when the CPU
+* page table is updated. The device driver must update its page table
+* in response to this callback. The update argument tells what action
+* to perform.
+*
+* The device driver must not return from this callback until the device
+* page tables are completely updated (TLBs flushed, etc); this is a
+* synchronous call.
+*/
+   void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+  enum hmm_update_type update_type,
+  unsigned long start,
+  unsigned long end);
+};
+
+/*
+ * struct hmm_mirror - mirror 
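
To make the callback contract above concrete, a driver side implementation
could look roughly like the sketch below. It is not part of this series:
struct dummy_device, its pt_lock and the dummy_*() helpers are hypothetical
driver internals; only the hmm_mirror_ops callback signature comes from the
patch.

static void dummy_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
                                             enum hmm_update_type update_type,
                                             unsigned long start,
                                             unsigned long end)
{
        struct dummy_device *ddev;

        ddev = container_of(mirror, struct dummy_device, mirror);

        /* Only HMM_UPDATE_INVALIDATE exists so far: unmap the range. */
        mutex_lock(&ddev->pt_lock);
        dummy_clear_device_ptes(ddev, start, end);
        dummy_flush_device_tlb(ddev, start, end);
        mutex_unlock(&ddev->pt_lock);
        /* Only return once the device can no longer access [start, end). */
}

static const struct hmm_mirror_ops dummy_mirror_ops = {
        .sync_cpu_device_pagetables = dummy_sync_cpu_device_pagetables,
};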

[HMM-v25 04/19] mm/hmm/mirror: helper to snapshot CPU page table v4

2017-08-16 Thread Jérôme Glisse
This does not use the existing page table walker because we want to share
the same code with our page fault handler.

Changed since v3:
  - fix THP handling
Changed since v2:
  - s/device unaddressable/device private/
Changes since v1:
  - Use spinlock instead of rcu synchronized list traversal

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 include/linux/hmm.h |  55 +-
 mm/hmm.c| 285 
 2 files changed, 338 insertions(+), 2 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 61c707970aa6..4cb6f9706c3c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -79,13 +79,26 @@ struct hmm;
  *
  * Flags:
  * HMM_PFN_VALID: pfn is valid
+ * HMM_PFN_READ:  CPU page table has read permission set
  * HMM_PFN_WRITE: CPU page table has write permission set
+ * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
+ * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
+ * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
+ *  result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ *  be mirrored by a device, because the entry will never have HMM_PFN_VALID
+ *  set and the pfn value is undefined.
+ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
  */
 typedef unsigned long hmm_pfn_t;
 
 #define HMM_PFN_VALID (1 << 0)
-#define HMM_PFN_WRITE (1 << 1)
-#define HMM_PFN_SHIFT 2
+#define HMM_PFN_READ (1 << 1)
+#define HMM_PFN_WRITE (1 << 2)
+#define HMM_PFN_ERROR (1 << 3)
+#define HMM_PFN_EMPTY (1 << 4)
+#define HMM_PFN_SPECIAL (1 << 5)
+#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
+#define HMM_PFN_SHIFT 7
 
 /*
  * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
@@ -241,6 +254,44 @@ struct hmm_mirror {
 
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+
+
+/*
+ * struct hmm_range - track invalidation lock on virtual address range
+ *
+ * @list: all range locks are on a list
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of pfns (big enough for the range)
+ * @valid: pfns array did not change since it has been filled by an HMM function
+ */
+struct hmm_range {
+   struct list_headlist;
+   unsigned long   start;
+   unsigned long   end;
+   hmm_pfn_t   *pfns;
+   boolvalid;
+};
+
+/*
+ * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
+ * driver lock that serializes device page table updates, then call
+ * hmm_vma_range_done(), to check if the snapshot is still valid. The same
+ * device driver page table update lock must also be used in the
+ * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
+ * table invalidation serializes on it.
+ *
+ * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
+ * hmm_vma_get_pfns() WITHOUT ERROR !
+ *
+ * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+struct hmm_range *range,
+unsigned long start,
+unsigned long end,
+hmm_pfn_t *pfns);
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index ff78c92cd34c..b8a24b80463e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -19,8 +19,12 @@
  */
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 
@@ -31,14 +35,18 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  * struct hmm - HMM per mm struct
  *
  * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
  * @sequence: we track updates to the CPU page table with a sequence number
+ * @ranges: list of range being snapshotted
  * @mirrors: list of mirrors for this mm
  * @mmu_notifier: mmu notifier to track updates to CPU page table
  * @mirrors_sem: read/write semaphore protecting the mirrors list
  */
 struct hmm {
struct mm_struct*mm;
+   spinlock_t  lock;
atomic_tsequence;
+   struct list_headranges;
struct list_headmirrors;
struct mmu_notifier mmu_notifier;
struct rw_semaphore mirrors_sem;
@@ -72,6 +80,8 @@ static struct hmm *hmm_register(struct mm_struct *mm)
init_
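
A rough sketch of the snapshot protocol described in the header comment above,
seen from the driver side (not part of this series; dummy_*() and pt_lock are
hypothetical, and the caller is assumed to hold mmap_sem in read mode):

static int dummy_mirror_range(struct dummy_device *ddev,
                              struct vm_area_struct *vma,
                              unsigned long start, unsigned long end)
{
        unsigned long npages = (end - start) >> PAGE_SHIFT;
        struct hmm_range range;
        hmm_pfn_t *pfns;
        int ret;

        pfns = kcalloc(npages, sizeof(*pfns), GFP_KERNEL);
        if (!pfns)
                return -ENOMEM;

again:
        ret = hmm_vma_get_pfns(vma, &range, start, end, pfns);
        if (ret)
                goto out;

        /*
         * Same lock as the one used in sync_cpu_device_pagetables(), so a
         * concurrent CPU page table invalidation serializes with the device
         * page table update below.
         */
        mutex_lock(&ddev->pt_lock);
        if (!hmm_vma_range_done(vma, &range)) {
                /* Snapshot got invalidated, take a new one. */
                mutex_unlock(&ddev->pt_lock);
                goto again;
        }
        ret = dummy_populate_device_ptes(ddev, start, end, pfns);
        mutex_unlock(&ddev->pt_lock);
out:
        kfree(pfns);
        return ret;
}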

[HMM-v25 02/19] mm/hmm: heterogeneous memory management (HMM for short) v5

2017-08-16 Thread Jérôme Glisse
HMM provides 3 separate types of functionality:
- Mirroring: synchronize CPU page table and device page table
- Device memory: allocating struct page for device memory
- Migration: migrating regular memory to device memory

This patch introduces some common helpers and definitions shared by all
three types of functionality.

Changed since v4:
  - added hmm_mm_init() call to init mm's HMM fields
Changed since v3:
  - Unconditionaly build hmm.c for static keys
Changed since v2:
  - s/device unaddressable/device private
Changed since v1:
  - Kconfig logic (depend on x86-64 and use ARCH_HAS pattern)

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 include/linux/hmm.h  | 152 +++
 include/linux/mm_types.h |   6 ++
 kernel/fork.c|   3 +
 mm/Kconfig   |  13 
 mm/Makefile  |   2 +-
 mm/hmm.c |  74 +++
 6 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index ..ca60595ce784
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse <jgli...@redhat.com>
+ */
+/*
+ * Heterogeneous Memory Management (HMM)
+ *
+ * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it
+ * is for. Here we focus on the HMM API description, with some explanation of
+ * the underlying implementation.
+ *
+ * Short description: HMM provides a set of helpers to share a virtual address
+ * space between CPU and a device, so that the device can access any valid
+ * address of the process (while still obeying memory protection). HMM also
+ * provides helpers to migrate process memory to device memory, and back. Each
+ * set of functionality (address space mirroring, and migration to and from
+ * device memory) can be used independently of the other.
+ *
+ *
+ * HMM address space mirroring API:
+ *
+ * Use HMM address space mirroring if you want to mirror range of the CPU page
+ * table of a process into a device page table. Here, "mirror" means "keep
+ * synchronized". Prerequisites: the device must provide the ability to write-
+ * protect its page tables (at PAGE_SIZE granularity), and must be able to
+ * recover from the resulting potential page faults.
+ *
+ * HMM guarantees that at any point in time, a given virtual address points to
+ * either the same memory in both CPU and device page tables (that is: CPU and
+ * device page tables each point to the same pages), or that one page table (CPU
+ * or device) points to no entry, while the other still points to the old page
+ * for the address. The latter case happens when the CPU page table update
+ * happens first, and then the update is mirrored over to the device page table.
+ * This does not cause any issue, because the CPU page table cannot start
+ * pointing to a new page until the device page table is invalidated.
+ *
+ * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
+ * updates to each device driver that has registered a mirror. It also provides
+ * some API calls to help with taking a snapshot of the CPU page table, and to
+ * synchronize with any updates that might happen concurrently.
+ *
+ *
+ * HMM migration to and from device memory:
+ *
+ * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
+ * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
+ * of the device memory, and allows the device driver to manage its memory
+ * using those struct pages. Having struct pages for device memory makes
+ * migration easier. Because that memory is not addressable by the CPU it must
+ * never be pinned to the device; in other words, any CPU page fault can always
+ * cause the device memory to be migrated (copied/moved) back to regular memory.
+ *
+ * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
+ * allows use of a device DMA engine to perform the copy operation between
+ * regular syste
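
As a small illustration of the hmm_pfn_t encoding introduced here
(HMM_PFN_VALID, HMM_PFN_WRITE and hmm_pfn_t_to_page()), a consumer of a
filled pfns array could decode one entry along these lines (sketch only;
struct dummy_device and dummy_map_page() are hypothetical):

static int dummy_handle_entry(struct dummy_device *ddev,
                              unsigned long addr, hmm_pfn_t entry)
{
        struct page *page;

        if (!(entry & HMM_PFN_VALID))
                return -ENOENT; /* nothing to mirror at this address */

        page = hmm_pfn_t_to_page(entry);
        /* Map read-only unless the CPU page table allows writes. */
        return dummy_map_page(ddev, addr, page,
                              !!(entry & HMM_PFN_WRITE));
}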

[HMM-v25 07/19] mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory v5

2017-08-16 Thread Jérôme Glisse
HMM (heterogeneous memory management) needs struct page to support migration
from system main memory to device memory.  The reasons for HMM and migration
to device memory are explained in the HMM core patch.

This patch deals with device memory that is un-addressable (ie the CPU can
not access it). Hence we do not want those struct pages to be managed like
regular memory. That is why we extend ZONE_DEVICE to support different types
of memory.

A persistent memory type is defined for the existing users of ZONE_DEVICE and
a new device un-addressable type is added for the un-addressable memory type.
There is a clear separation between what is expected from each memory type,
and existing users of ZONE_DEVICE are un-affected by the new requirements and
the new use of the un-addressable type. All type specific code paths are
protected by a test against the memory type.

Because the memory is un-addressable we use a new special swap type for when
a page is migrated to device memory (this reduces the maximum number of
swap files).

The two main additions to ZONE_DEVICE, beside the memory type, are two
callbacks. The first one, page_free(), is called whenever the page refcount
reaches 1 (which means the page is free, as a ZONE_DEVICE page never reaches
a refcount of 0). This allows the device driver to manage its memory and the
associated struct pages.

The second callback, page_fault(), happens when there is a CPU access to an
address that is backed by a device page (which is un-addressable by the CPU).
This callback is responsible for migrating the page back to system main
memory. The device driver can not block that migration; HMM makes sure that
such pages can not be pinned in device memory.

If the device is in some error condition and can not migrate the memory back,
then a CPU page fault to device memory should end with SIGBUS.

Changed since v4:
  - s/DEVICE_PUBLIC/DEVICE_HOST (to free DEVICE_PUBLIC for HMM-CDM)
Changed since v3:
  - fix comments that was still using UNADDRESSABLE as keyword
  - kernel configuration simplification
Changed since v2:
  - s/DEVICE_UNADDRESSABLE/DEVICE_PRIVATE
Changed since v1:
  - rename to device private memory (from device unaddressable)

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Acked-by: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 fs/proc/task_mmu.c   |  7 +
 include/linux/ioport.h   |  1 +
 include/linux/memremap.h | 73 
 include/linux/mm.h   | 12 
 include/linux/swap.h | 24 ++--
 include/linux/swapops.h  | 68 
 kernel/memremap.c| 34 ++
 mm/Kconfig   | 11 +++-
 mm/memory.c  | 61 
 mm/memory_hotplug.c  | 10 +--
 mm/mprotect.c| 14 ++
 11 files changed, 309 insertions(+), 6 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 186e907187ab..e3ba93b35925 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -535,6 +535,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+   else if (is_device_private_entry(swpent))
+   page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = find_get_entry(vma->vm_file->f_mapping,
@@ -698,6 +700,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long 
hmask,
 
if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+   else if (is_device_private_entry(swpent))
+   page = device_private_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -1198,6 +1202,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
pagemapread *pm,
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
+
+   if (is_device_private_entry(entry))
+   page = device_private_entry_to_page(entry);
}
 
if (page && !PageAnon(page))
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6230064d7f95..3a4f69137bc2 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -130,6 +130,7 @@ enum {
IORES_DESC_ACPI_NV_STORAGE  = 3,
IORES_DESC_PERSISTENT_MEMORY= 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
+   IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 93
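
The pattern applied by the task_mmu.c hunks above, ie resolving a device
private swap entry back to its struct page, can be shown in isolation with a
small sketch (dummy_page_from_pte() is hypothetical; the helpers it calls are
either existing kernel functions or introduced by this patch):

static struct page *dummy_page_from_pte(pte_t pte)
{
        swp_entry_t entry;

        if (pte_present(pte))
                return pte_page(pte);   /* ordinary, CPU addressable */
        if (pte_none(pte))
                return NULL;            /* nothing mapped here */

        entry = pte_to_swp_entry(pte);
        if (is_device_private_entry(entry))
                /* ZONE_DEVICE page of MEMORY_DEVICE_PRIVATE memory */
                return device_private_entry_to_page(entry);

        return NULL;                    /* real swap, migration, ... */
}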

[HMM-v25 03/19] mm/hmm/mirror: mirror process address space on device with HMM helpers v3

2017-08-16 Thread Jérôme Glisse
This is heterogeneous memory management (HMM) process address space
mirroring. In a nutshell, this provides an API to mirror a process address
space on a device. This boils down to keeping the CPU and device page tables
synchronized (we assume that both the device and the CPU are cache coherent,
as PCIe devices can be).

This patch provides a simple API for a device driver to achieve address
space mirroring, thus avoiding the need for each device driver to grow its
own CPU page table walker and its own CPU page table synchronization
mechanism.

This is useful for NVidia GPU >= Pascal, Mellanox IB >= mlx5 and more
hardware in the future.

Changed since v2:
  - s/device unaddressable/device private/
Changed since v1:
  - Kconfig logic (depend on x86-64 and use ARCH_HAS pattern)

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 include/linux/hmm.h | 110 ++
 mm/Kconfig  |  12 
 mm/hmm.c| 168 +++-
 3 files changed, 275 insertions(+), 15 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index ca60595ce784..61c707970aa6 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -72,6 +72,7 @@
 
 #if IS_ENABLED(CONFIG_HMM)
 
+struct hmm;
 
 /*
  * hmm_pfn_t - HMM uses its own pfn type to keep several flags per page
@@ -134,6 +135,115 @@ static inline hmm_pfn_t hmm_pfn_t_from_pfn(unsigned long 
pfn)
 }
 
 
+#if IS_ENABLED(CONFIG_HMM_MIRROR)
+/*
+ * Mirroring: how to synchronize device page table with CPU page table.
+ *
+ * A device driver that is participating in HMM mirroring must always
+ * synchronize with CPU page table updates. For this, device drivers can either
+ * directly use mmu_notifier APIs or they can use the hmm_mirror API. Device
+ * drivers can decide to register one mirror per device per process, or just
+ * one mirror per process for a group of devices. The pattern is:
+ *
+ *  int device_bind_address_space(..., struct mm_struct *mm, ...)
+ *  {
+ *  struct device_address_space *das;
+ *
+ *  // Device driver specific initialization, and allocation of das
+ *  // which contains an hmm_mirror struct as one of its fields.
+ *  ...
+ *
+ *  ret = hmm_mirror_register(>mirror, mm, _mirror_ops);
+ *  if (ret) {
+ *  // Cleanup on error
+ *  return ret;
+ *  }
+ *
+ *  // Other device driver specific initialization
+ *  ...
+ *  }
+ *
+ * Once an hmm_mirror is registered for an address space, the device driver
+ * will get callbacks through sync_cpu_device_pagetables() operation (see
+ * hmm_mirror_ops struct).
+ *
+ * Device driver must not free the struct containing the hmm_mirror struct
+ * before calling hmm_mirror_unregister(). The expected usage is to do that when
+ * the device driver is unbinding from an address space.
+ *
+ *
+ *  void device_unbind_address_space(struct device_address_space *das)
+ *  {
+ *  // Device driver specific cleanup
+ *  ...
+ *
+ *  hmm_mirror_unregister(>mirror);
+ *
+ *  // Other device driver specific cleanup, and now das can be freed
+ *  ...
+ *  }
+ */
+
+struct hmm_mirror;
+
+/*
+ * enum hmm_update_type - type of update
+ * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
+ */
+enum hmm_update_type {
+   HMM_UPDATE_INVALIDATE,
+};
+
+/*
+ * struct hmm_mirror_ops - HMM mirror device operations callback
+ *
+ * @update: callback to update range on a device
+ */
+struct hmm_mirror_ops {
+   /* sync_cpu_device_pagetables() - synchronize page tables
+*
+* @mirror: pointer to struct hmm_mirror
+* @update_type: type of update that occurred to the CPU page table
+* @start: virtual start address of the range to update
+* @end: virtual end address of the range to update
+*
+* This callback ultimately originates from mmu_notifiers when the CPU
+* page table is updated. The device driver must update its page table
+* in response to this callback. The update argument tells what action
+* to perform.
+*
+* The device driver must not return from this callback until the device
+* page tables are completely updated (TLBs flushed, etc); this is a
+* synchronous call.
+*/
+   void (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
+  enum hmm_update_type update_type,
+  unsigned long start,
+  unsigned long end);
+};
+
+/*
+ * struct hmm_mirror - mirror struct for a device driver
+ *
+ * @hmm: pointer to struct hmm (which is unique per mm_struct)
+ * @ops: device driver callback for HMM mirror operations
+ * @li

[HMM-v25 04/19] mm/hmm/mirror: helper to snapshot CPU page table v4

2017-08-16 Thread Jérôme Glisse
This does not use the existing page table walker because we want to share
the same code with our page fault handler.

Changed since v3:
  - fix THP handling
Changed since v2:
  - s/device unaddressable/device private/
Changes since v1:
  - Use spinlock instead of rcu synchronized list traversal

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 include/linux/hmm.h |  55 +-
 mm/hmm.c| 285 
 2 files changed, 338 insertions(+), 2 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 61c707970aa6..4cb6f9706c3c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -79,13 +79,26 @@ struct hmm;
  *
  * Flags:
  * HMM_PFN_VALID: pfn is valid
+ * HMM_PFN_READ:  CPU page table has read permission set
  * HMM_PFN_WRITE: CPU page table has write permission set
+ * HMM_PFN_ERROR: corresponding CPU page table entry points to poisoned memory
+ * HMM_PFN_EMPTY: corresponding CPU page table entry is pte_none()
+ * HMM_PFN_SPECIAL: corresponding CPU page table entry is special; i.e., the
+ *  result of vm_insert_pfn() or vm_insert_page(). Therefore, it should not
+ *  be mirrored by a device, because the entry will never have HMM_PFN_VALID
+ *  set and the pfn value is undefined.
+ * HMM_PFN_DEVICE_UNADDRESSABLE: unaddressable device memory (ZONE_DEVICE)
  */
 typedef unsigned long hmm_pfn_t;
 
 #define HMM_PFN_VALID (1 << 0)
-#define HMM_PFN_WRITE (1 << 1)
-#define HMM_PFN_SHIFT 2
+#define HMM_PFN_READ (1 << 1)
+#define HMM_PFN_WRITE (1 << 2)
+#define HMM_PFN_ERROR (1 << 3)
+#define HMM_PFN_EMPTY (1 << 4)
+#define HMM_PFN_SPECIAL (1 << 5)
+#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6)
+#define HMM_PFN_SHIFT 7
 
 /*
  * hmm_pfn_t_to_page() - return struct page pointed to by a valid hmm_pfn_t
@@ -241,6 +254,44 @@ struct hmm_mirror {
 
 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
 void hmm_mirror_unregister(struct hmm_mirror *mirror);
+
+
+/*
+ * struct hmm_range - track invalidation lock on virtual address range
+ *
+ * @list: all range locks are on a list
+ * @start: range virtual start address (inclusive)
+ * @end: range virtual end address (exclusive)
+ * @pfns: array of pfns (big enough for the range)
+ * @valid: pfns array did not change since it has been filled by an HMM function
+ */
+struct hmm_range {
+   struct list_headlist;
+   unsigned long   start;
+   unsigned long   end;
+   hmm_pfn_t   *pfns;
+   boolvalid;
+};
+
+/*
+ * To snapshot the CPU page table, call hmm_vma_get_pfns(), then take a device
+ * driver lock that serializes device page table updates, then call
+ * hmm_vma_range_done(), to check if the snapshot is still valid. The same
+ * device driver page table update lock must also be used in the
+ * hmm_mirror_ops.sync_cpu_device_pagetables() callback, so that CPU page
+ * table invalidation serializes on it.
+ *
+ * YOU MUST CALL hmm_vma_range_done() ONCE AND ONLY ONCE EACH TIME YOU CALL
+ * hmm_vma_get_pfns() WITHOUT ERROR !
+ *
+ * IF YOU DO NOT FOLLOW THE ABOVE RULE THE SNAPSHOT CONTENT MIGHT BE INVALID !
+ */
+int hmm_vma_get_pfns(struct vm_area_struct *vma,
+struct hmm_range *range,
+unsigned long start,
+unsigned long end,
+hmm_pfn_t *pfns);
+bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index ff78c92cd34c..b8a24b80463e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -19,8 +19,12 @@
  */
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 
@@ -31,14 +35,18 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  * struct hmm - HMM per mm struct
  *
  * @mm: mm struct this HMM struct is bound to
+ * @lock: lock protecting ranges list
  * @sequence: we track updates to the CPU page table with a sequence number
+ * @ranges: list of range being snapshotted
  * @mirrors: list of mirrors for this mm
  * @mmu_notifier: mmu notifier to track updates to CPU page table
  * @mirrors_sem: read/write semaphore protecting the mirrors list
  */
 struct hmm {
struct mm_struct*mm;
+   spinlock_t  lock;
atomic_tsequence;
+   struct list_headranges;
struct list_headmirrors;
struct mmu_notifier mmu_notifier;
struct rw_semaphore mirrors_sem;
@@ -72,6 +80,8 @@ static struct hmm *hmm_register(struct mm_struct *mm)
init_rwsem(>mirrors_sem);
atomic_set(>sequence, 0);
hmm->mmu_notifier.ops = NULL;
+   INIT_LIST_HEAD(>ranges);
+   spin_lock_init(

[HMM-v25 02/19] mm/hmm: heterogeneous memory management (HMM for short) v5

2017-08-16 Thread Jérôme Glisse
HMM provides 3 separate types of functionality:
- Mirroring: synchronize CPU page table and device page table
- Device memory: allocating struct page for device memory
- Migration: migrating regular memory to device memory

This patch introduces some common helpers and definitions shared by all
three types of functionality.

Changed since v4:
  - added hmm_mm_init() call to init mm's HMM fields
Changed since v3:
  - Unconditionaly build hmm.c for static keys
Changed since v2:
  - s/device unaddressable/device private
Changed since v1:
  - Kconfig logic (depend on x86-64 and use ARCH_HAS pattern)

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 include/linux/hmm.h  | 152 +++
 include/linux/mm_types.h |   6 ++
 kernel/fork.c|   3 +
 mm/Kconfig   |  13 
 mm/Makefile  |   2 +-
 mm/hmm.c |  74 +++
 6 files changed, 249 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
new file mode 100644
index ..ca60595ce784
--- /dev/null
+++ b/include/linux/hmm.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * Heterogeneous Memory Management (HMM)
+ *
+ * See Documentation/vm/hmm.txt for reasons and overview of what HMM is and it
+ * is for. Here we focus on the HMM API description, with some explanation of
+ * the underlying implementation.
+ *
+ * Short description: HMM provides a set of helpers to share a virtual address
+ * space between CPU and a device, so that the device can access any valid
+ * address of the process (while still obeying memory protection). HMM also
+ * provides helpers to migrate process memory to device memory, and back. Each
+ * set of functionality (address space mirroring, and migration to and from
+ * device memory) can be used independently of the other.
+ *
+ *
+ * HMM address space mirroring API:
+ *
+ * Use HMM address space mirroring if you want to mirror range of the CPU page
+ * table of a process into a device page table. Here, "mirror" means "keep
+ * synchronized". Prerequisites: the device must provide the ability to write-
+ * protect its page tables (at PAGE_SIZE granularity), and must be able to
+ * recover from the resulting potential page faults.
+ *
+ * HMM guarantees that at any point in time, a given virtual address points to
+ * either the same memory in both CPU and device page tables (that is: CPU and
+ * device page tables each point to the same pages), or that one page table (CPU
+ * or device) points to no entry, while the other still points to the old page
+ * for the address. The latter case happens when the CPU page table update
+ * happens first, and then the update is mirrored over to the device page table.
+ * This does not cause any issue, because the CPU page table cannot start
+ * pointing to a new page until the device page table is invalidated.
+ *
+ * HMM uses mmu_notifiers to monitor the CPU page tables, and forwards any
+ * updates to each device driver that has registered a mirror. It also provides
+ * some API calls to help with taking a snapshot of the CPU page table, and to
+ * synchronize with any updates that might happen concurrently.
+ *
+ *
+ * HMM migration to and from device memory:
+ *
+ * HMM provides a set of helpers to hotplug device memory as ZONE_DEVICE, with
+ * a new MEMORY_DEVICE_PRIVATE type. This provides a struct page for each page
+ * of the device memory, and allows the device driver to manage its memory
+ * using those struct pages. Having struct pages for device memory makes
+ * migration easier. Because that memory is not addressable by the CPU it must
+ * never be pinned to the device; in other words, any CPU page fault can always
+ * cause the device memory to be migrated (copied/moved) back to regular memory.
+ *
+ * A new migrate helper (migrate_vma()) has been added (see mm/migrate.c) that
+ * allows use of a device DMA engine to perform the copy operation between
+ * regular system memory and device memory.
+ */
+#ifndef LINUX_HMM_H
+#define LINUX_HMM_H
+
+#include 
+
+#if IS_ENABLED(CONFIG_HMM)
+
+
+/*
+ * hmm_pfn_t - HMM uses its own pfn type to keep several flags pe

[HMM-v25 07/19] mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory v5

2017-08-16 Thread Jérôme Glisse
HMM (heterogeneous memory management) needs struct page to support migration
from system main memory to device memory.  The reasons for HMM and migration
to device memory are explained in the HMM core patch.

This patch deals with device memory that is un-addressable (ie the CPU can
not access it). Hence we do not want those struct pages to be managed like
regular memory. That is why we extend ZONE_DEVICE to support different types
of memory.

A persistent memory type is defined for the existing users of ZONE_DEVICE and
a new device un-addressable type is added for the un-addressable memory type.
There is a clear separation between what is expected from each memory type,
and existing users of ZONE_DEVICE are un-affected by the new requirements and
the new use of the un-addressable type. All type specific code paths are
protected by a test against the memory type.

Because the memory is un-addressable we use a new special swap type for when
a page is migrated to device memory (this reduces the maximum number of
swap files).

The two main additions to ZONE_DEVICE, beside the memory type, are two
callbacks. The first one, page_free(), is called whenever the page refcount
reaches 1 (which means the page is free, as a ZONE_DEVICE page never reaches
a refcount of 0). This allows the device driver to manage its memory and the
associated struct pages.

The second callback, page_fault(), happens when there is a CPU access to an
address that is backed by a device page (which is un-addressable by the CPU).
This callback is responsible for migrating the page back to system main
memory. The device driver can not block that migration; HMM makes sure that
such pages can not be pinned in device memory.

If the device is in some error condition and can not migrate the memory back,
then a CPU page fault to device memory should end with SIGBUS.

Changed since v4:
  - s/DEVICE_PUBLIC/DEVICE_HOST (to free DEVICE_PUBLIC for HMM-CDM)
Changed since v3:
  - fix comments that was still using UNADDRESSABLE as keyword
  - kernel configuration simplification
Changed since v2:
  - s/DEVICE_UNADDRESSABLE/DEVICE_PRIVATE
Changed since v1:
  - rename to device private memory (from device unaddressable)

Signed-off-by: Jérôme Glisse 
Acked-by: Dan Williams 
Cc: Ross Zwisler 
---
 fs/proc/task_mmu.c   |  7 +
 include/linux/ioport.h   |  1 +
 include/linux/memremap.h | 73 
 include/linux/mm.h   | 12 
 include/linux/swap.h | 24 ++--
 include/linux/swapops.h  | 68 
 kernel/memremap.c| 34 ++
 mm/Kconfig   | 11 +++-
 mm/memory.c  | 61 
 mm/memory_hotplug.c  | 10 +--
 mm/mprotect.c| 14 ++
 11 files changed, 309 insertions(+), 6 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 186e907187ab..e3ba93b35925 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -535,6 +535,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
}
} else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+   else if (is_device_private_entry(swpent))
+   page = device_private_entry_to_page(swpent);
} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
&& pte_none(*pte))) {
page = find_get_entry(vma->vm_file->f_mapping,
@@ -698,6 +700,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long 
hmask,
 
if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
+   else if (is_device_private_entry(swpent))
+   page = device_private_entry_to_page(swpent);
}
if (page) {
int mapcount = page_mapcount(page);
@@ -1198,6 +1202,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
pagemapread *pm,
flags |= PM_SWAP;
if (is_migration_entry(entry))
page = migration_entry_to_page(entry);
+
+   if (is_device_private_entry(entry))
+   page = device_private_entry_to_page(entry);
}
 
if (page && !PageAnon(page))
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 6230064d7f95..3a4f69137bc2 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -130,6 +130,7 @@ enum {
IORES_DESC_ACPI_NV_STORAGE  = 3,
IORES_DESC_PERSISTENT_MEMORY= 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
+   IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 93416196ba64..8e164ec9eed0 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ 

[HMM-v25 09/19] mm/memcontrol: allow to uncharge page without using page->lru field

2017-08-16 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and thus
you can not use the page->lru field of those pages. This patch re-arranges
the uncharge path to allow a single page to be uncharged without modifying
the lru field of its struct page.

There is no change to the memcontrol logic; it is the same as it was
before this patch.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Vladimir Davydov <vdavydov@gmail.com>
Cc: cgro...@vger.kernel.org
---
 mm/memcontrol.c | 168 +++-
 1 file changed, 92 insertions(+), 76 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index df6f63ee95d6..604fb3ca8028 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5533,48 +5533,102 @@ void mem_cgroup_cancel_charge(struct page *page, 
struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-  unsigned long nr_anon, unsigned long nr_file,
-  unsigned long nr_kmem, unsigned long nr_huge,
-  unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+   struct mem_cgroup *memcg;
+   unsigned long pgpgout;
+   unsigned long nr_anon;
+   unsigned long nr_file;
+   unsigned long nr_kmem;
+   unsigned long nr_huge;
+   unsigned long nr_shmem;
+   struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 {
-   unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+   memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+   unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
 
-   if (!mem_cgroup_is_root(memcg)) {
-   page_counter_uncharge(>memory, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg)) {
+   page_counter_uncharge(>memcg->memory, nr_pages);
if (do_memsw_account())
-   page_counter_uncharge(>memsw, nr_pages);
-   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-   page_counter_uncharge(>kmem, nr_kmem);
-   memcg_oom_recover(memcg);
+   page_counter_uncharge(>memcg->memsw, nr_pages);
+   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+   page_counter_uncharge(>memcg->kmem, ug->nr_kmem);
+   memcg_oom_recover(ug->memcg);
}
 
local_irq_save(flags);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-   __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-   __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-   __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-   __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-   memcg_check_events(memcg, dummy_page);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+   __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+   __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+   __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+   memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
 
-   if (!mem_cgroup_is_root(memcg))
-   css_put_many(>css, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg))
+   css_put_many(>memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+   VM_BUG_ON_PAGE(PageLRU(page), page);
+   VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+   if (!page->mem_cgroup)
+   return;
+
+   /*
+* Nobody should be changing or seriously looking at
+* page->mem_cgroup at this point, we have fully
+* exclusive access to the page.
+*/
+
+   if (ug->memcg != page->mem_cgroup) {
+   if (ug->memcg) {
+   uncharge_batch(ug);
+   uncharge_gather_clear(ug);
+   }
+   ug->memcg = page->mem_cgroup;
+   }
+
+   if (!PageKmemcg(page)) {
+   unsigned int nr_pages = 1;
+
+   if (PageTransHuge(page)) {
+   nr_pages <<= compound_order(page);
+ 
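
The gather helpers above are meant to be driven from a loop over pages; a
condensed sketch of such a loop (not the exact remainder of the patch, which
is truncated above, and with a hypothetical dummy_ caller) looks like this:

static void dummy_uncharge_pages(struct page **pages, unsigned int npages)
{
        struct uncharge_gather ug;
        unsigned int i;

        uncharge_gather_clear(&ug);
        for (i = 0; i < npages; i++)
                /* uncharge_page() flushes the batch itself when memcg changes */
                uncharge_page(pages[i], &ug);

        if (ug.memcg)
                uncharge_batch(&ug);
}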

[HMM-v25 05/19] mm/hmm/mirror: device page fault handler

2017-08-16 Thread Jérôme Glisse
This handles page faults on behalf of a device driver; unlike
handle_mm_fault() it does not trigger migration of device memory back to
system memory.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 include/linux/hmm.h |  27 ++
 mm/hmm.c| 256 +---
 2 files changed, 271 insertions(+), 12 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4cb6f9706c3c..79f5d4fa2d4f 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -292,6 +292,33 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
 unsigned long end,
 hmm_pfn_t *pfns);
 bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
+
+
+/*
+ * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
+ * not migrate any device memory back to system memory. The hmm_pfn_t array will
+ * be updated with the fault result and current snapshot of the CPU page table
+ * for the range.
+ *
+ * The mmap_sem must be taken in read mode before entering and it might be
+ * dropped by the function if the block argument is false. In that case, the
+ * function returns -EAGAIN.
+ *
+ * Return value does not reflect if the fault was successful for every single
+ * address or not. Therefore, the caller must inspect the hmm_pfn_t array to
+ * determine fault status for each address.
+ *
+ * Trying to fault inside an invalid vma will result in -EINVAL.
+ *
+ * See the function description in mm/hmm.c for further documentation.
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index b8a24b80463e..91592dae364e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -236,6 +236,36 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
 }
 EXPORT_SYMBOL(hmm_mirror_unregister);
 
+struct hmm_vma_walk {
+   struct hmm_range*range;
+   unsigned long   last;
+   boolfault;
+   boolblock;
+   boolwrite;
+};
+
+static int hmm_vma_do_fault(struct mm_walk *walk,
+   unsigned long addr,
+   hmm_pfn_t *pfn)
+{
+   unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct vm_area_struct *vma = walk->vma;
+   int r;
+
+   flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
+   flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+   r = handle_mm_fault(vma, addr, flags);
+   if (r & VM_FAULT_RETRY)
+   return -EBUSY;
+   if (r & VM_FAULT_ERROR) {
+   *pfn = HMM_PFN_ERROR;
+   return -EFAULT;
+   }
+
+   return -EAGAIN;
+}
+
 static void hmm_pfns_special(hmm_pfn_t *pfns,
 unsigned long addr,
 unsigned long end)
@@ -259,34 +289,62 @@ static int hmm_pfns_bad(unsigned long addr,
return 0;
 }
 
+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+  unsigned long addr,
+  unsigned long end)
+{
+   for (; addr < end; addr += PAGE_SIZE, pfns++)
+   *pfns = 0;
+}
+
 static int hmm_vma_walk_hole(unsigned long addr,
 unsigned long end,
 struct mm_walk *walk)
 {
-   struct hmm_range *range = walk->private;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
hmm_pfn_t *pfns = range->pfns;
unsigned long i;
 
+   hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
-   for (; addr < end; addr += PAGE_SIZE, i++)
+   for (; addr < end; addr += PAGE_SIZE, i++) {
pfns[i] = HMM_PFN_EMPTY;
+   if (hmm_vma_walk->fault) {
+   int ret;
 
-   return 0;
+   ret = hmm_vma_do_fault(walk, addr, [i]);
+   if (ret != -EAGAIN)
+   return ret;
+   }
+   }
+
+   return hmm_vma_walk->fault ? -EAGAIN : 0;
 }
 
 static int hmm_vma_walk_clear(unsigned long addr,
  unsigned long end,
  struct mm_walk *walk)
 {
-   struct hmm_range *range = wal
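
A driver side fault path built on this helper could look roughly like the
sketch below (dummy_*() and pt_lock are hypothetical; the -EAGAIN handling
follows the header comment above, and re-checking the snapshot with
hmm_vma_range_done() is assumed to apply here just as it does for
hmm_vma_get_pfns() in the previous patch):

static int dummy_fault_range(struct dummy_device *ddev, struct mm_struct *mm,
                             unsigned long start, unsigned long end,
                             hmm_pfn_t *pfns, bool write)
{
        struct vm_area_struct *vma;
        struct hmm_range range;
        int ret;

again:
        down_read(&mm->mmap_sem);
        vma = find_vma(mm, start);
        if (!vma || start < vma->vm_start) {
                up_read(&mm->mmap_sem);
                return -EFAULT;
        }

        /* block == false: mmap_sem may be dropped, signalled by -EAGAIN */
        ret = hmm_vma_fault(vma, &range, start, end, pfns, write, false);
        if (ret == -EAGAIN)
                goto again;     /* mmap_sem was already dropped for us */
        if (ret) {
                up_read(&mm->mmap_sem);
                return ret;
        }

        mutex_lock(&ddev->pt_lock);
        if (!hmm_vma_range_done(vma, &range)) {
                mutex_unlock(&ddev->pt_lock);
                up_read(&mm->mmap_sem);
                goto again;
        }
        /* Inspect pfns[] per address: some entries may be HMM_PFN_ERROR. */
        ret = dummy_populate_device_ptes(ddev, start, end, pfns);
        mutex_unlock(&ddev->pt_lock);
        up_read(&mm->mmap_sem);
        return ret;
}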

[HMM-v25 09/19] mm/memcontrol: allow to uncharge page without using page->lru field

2017-08-16 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and thus
you can not use the page->lru field of those pages. This patch re-arranges
the uncharge path to allow a single page to be uncharged without modifying
the lru field of its struct page.

There is no change to the memcontrol logic; it is the same as it was
before this patch.

Signed-off-by: Jérôme Glisse 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Vladimir Davydov 
Cc: cgro...@vger.kernel.org
---
 mm/memcontrol.c | 168 +++-
 1 file changed, 92 insertions(+), 76 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index df6f63ee95d6..604fb3ca8028 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5533,48 +5533,102 @@ void mem_cgroup_cancel_charge(struct page *page, 
struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-  unsigned long nr_anon, unsigned long nr_file,
-  unsigned long nr_kmem, unsigned long nr_huge,
-  unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+   struct mem_cgroup *memcg;
+   unsigned long pgpgout;
+   unsigned long nr_anon;
+   unsigned long nr_file;
+   unsigned long nr_kmem;
+   unsigned long nr_huge;
+   unsigned long nr_shmem;
+   struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 {
-   unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+   memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+   unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
 
-   if (!mem_cgroup_is_root(memcg)) {
-   page_counter_uncharge(>memory, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg)) {
+   page_counter_uncharge(>memcg->memory, nr_pages);
if (do_memsw_account())
-   page_counter_uncharge(>memsw, nr_pages);
-   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-   page_counter_uncharge(>kmem, nr_kmem);
-   memcg_oom_recover(memcg);
+   page_counter_uncharge(>memcg->memsw, nr_pages);
+   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+   page_counter_uncharge(>memcg->kmem, ug->nr_kmem);
+   memcg_oom_recover(ug->memcg);
}
 
local_irq_save(flags);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-   __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-   __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-   __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-   __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-   memcg_check_events(memcg, dummy_page);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+   __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+   __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+   __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+   memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
 
-   if (!mem_cgroup_is_root(memcg))
-   css_put_many(>css, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg))
+   css_put_many(>memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+   VM_BUG_ON_PAGE(PageLRU(page), page);
+   VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+   if (!page->mem_cgroup)
+   return;
+
+   /*
+* Nobody should be changing or seriously looking at
+* page->mem_cgroup at this point, we have fully
+* exclusive access to the page.
+*/
+
+   if (ug->memcg != page->mem_cgroup) {
+   if (ug->memcg) {
+   uncharge_batch(ug);
+   uncharge_gather_clear(ug);
+   }
+   ug->memcg = page->mem_cgroup;
+   }
+
+   if (!PageKmemcg(page)) {
+   unsigned int nr_pages = 1;
+
+   if (PageTransHuge(page)) {
+   nr_pages <<= compound_order(page);
+   ug->nr_huge += nr_pages;
+   }
+   if (PageAnon(page))
+   ug->nr_anon += nr_p

[HMM-v25 05/19] mm/hmm/mirror: device page fault handler

2017-08-16 Thread Jérôme Glisse
This handles page faults on behalf of a device driver; unlike
handle_mm_fault() it does not trigger migration of device memory back to
system memory.

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 include/linux/hmm.h |  27 ++
 mm/hmm.c| 256 +---
 2 files changed, 271 insertions(+), 12 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4cb6f9706c3c..79f5d4fa2d4f 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -292,6 +292,33 @@ int hmm_vma_get_pfns(struct vm_area_struct *vma,
 unsigned long end,
 hmm_pfn_t *pfns);
 bool hmm_vma_range_done(struct vm_area_struct *vma, struct hmm_range *range);
+
+
+/*
+ * Fault memory on behalf of device driver. Unlike handle_mm_fault(), this will
+ * not migrate any device memory back to system memory. The hmm_pfn_t array will
+ * be updated with the fault result and current snapshot of the CPU page table
+ * for the range.
+ *
+ * The mmap_sem must be taken in read mode before entering and it might be
+ * dropped by the function if the block argument is false. In that case, the
+ * function returns -EAGAIN.
+ *
+ * Return value does not reflect if the fault was successful for every single
+ * address or not. Therefore, the caller must inspect the hmm_pfn_t array to
+ * determine fault status for each address.
+ *
+ * Trying to fault inside an invalid vma will result in -EINVAL.
+ *
+ * See the function description in mm/hmm.c for further documentation.
+ */
+int hmm_vma_fault(struct vm_area_struct *vma,
+ struct hmm_range *range,
+ unsigned long start,
+ unsigned long end,
+ hmm_pfn_t *pfns,
+ bool write,
+ bool block);
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index b8a24b80463e..91592dae364e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -236,6 +236,36 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
 }
 EXPORT_SYMBOL(hmm_mirror_unregister);
 
+struct hmm_vma_walk {
+   struct hmm_range*range;
+   unsigned long   last;
+   boolfault;
+   boolblock;
+   boolwrite;
+};
+
+static int hmm_vma_do_fault(struct mm_walk *walk,
+   unsigned long addr,
+   hmm_pfn_t *pfn)
+{
+   unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct vm_area_struct *vma = walk->vma;
+   int r;
+
+   flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
+   flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0;
+   r = handle_mm_fault(vma, addr, flags);
+   if (r & VM_FAULT_RETRY)
+   return -EBUSY;
+   if (r & VM_FAULT_ERROR) {
+   *pfn = HMM_PFN_ERROR;
+   return -EFAULT;
+   }
+
+   return -EAGAIN;
+}
+
 static void hmm_pfns_special(hmm_pfn_t *pfns,
 unsigned long addr,
 unsigned long end)
@@ -259,34 +289,62 @@ static int hmm_pfns_bad(unsigned long addr,
return 0;
 }
 
+static void hmm_pfns_clear(hmm_pfn_t *pfns,
+  unsigned long addr,
+  unsigned long end)
+{
+   for (; addr < end; addr += PAGE_SIZE, pfns++)
+   *pfns = 0;
+}
+
 static int hmm_vma_walk_hole(unsigned long addr,
 unsigned long end,
 struct mm_walk *walk)
 {
-   struct hmm_range *range = walk->private;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
hmm_pfn_t *pfns = range->pfns;
unsigned long i;
 
+   hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
-   for (; addr < end; addr += PAGE_SIZE, i++)
+   for (; addr < end; addr += PAGE_SIZE, i++) {
pfns[i] = HMM_PFN_EMPTY;
+   if (hmm_vma_walk->fault) {
+   int ret;
 
-   return 0;
+   ret = hmm_vma_do_fault(walk, addr, [i]);
+   if (ret != -EAGAIN)
+   return ret;
+   }
+   }
+
+   return hmm_vma_walk->fault ? -EAGAIN : 0;
 }
 
 static int hmm_vma_walk_clear(unsigned long addr,
  unsigned long end,
  struct mm_walk *walk)
 {
-   struct hmm_range *range = walk->private;
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
hmm_pfn_t *pfns 

[HMM-v25 08/19] mm/ZONE_DEVICE: special case put_page() for device private pages v4

2017-08-16 Thread Jérôme Glisse
A ZONE_DEVICE page that reaches a refcount of 1 is free, ie it no longer
has any user. For device private pages this is important to catch, and
thus we need to special case put_page() for them.

Changed since v3:
  - clear page mapping field
Changed since v2:
  - clear page active and waiters
Changed since v1:
  - use static key to disable special code path in put_page() by
default
  - uninline put_zone_device_private_page()
  - fix build issues with some kernel config related to header
inter-dependency

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 include/linux/memremap.h | 13 +
 include/linux/mm.h   | 31 ++-
 kernel/memremap.c| 25 -
 mm/hmm.c |  8 
 4 files changed, 67 insertions(+), 10 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8e164ec9eed0..8aa6b82679e2 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -126,6 +126,14 @@ struct dev_pagemap {
 void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap);
 struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+
+static inline bool is_zone_device_page(const struct page *page);
+
+static inline bool is_device_private_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+}
 #else
 static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
@@ -144,6 +152,11 @@ static inline struct dev_pagemap 
*find_dev_pagemap(resource_size_t phys)
 {
return NULL;
 }
+
+static inline bool is_device_private_page(const struct page *page)
+{
+   return false;
+}
 #endif
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a59e149b958a..515d4ae611b2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mempolicy;
 struct anon_vma;
@@ -788,25 +789,25 @@ static inline bool is_zone_device_page(const struct page 
*page)
 {
return page_zonenum(page) == ZONE_DEVICE;
 }
-
-static inline bool is_device_private_page(const struct page *page)
-{
-   /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */
-   return ((page_zonenum(page) == ZONE_DEVICE) &&
-   (page->pgmap->type == MEMORY_DEVICE_PRIVATE));
-}
 #else
 static inline bool is_zone_device_page(const struct page *page)
 {
return false;
 }
+#endif
 
-static inline bool is_device_private_page(const struct page *page)
+#ifdef CONFIG_DEVICE_PRIVATE
+void put_zone_device_private_page(struct page *page);
+#else
+static inline void put_zone_device_private_page(struct page *page)
 {
-   return false;
 }
 #endif
 
+static inline bool is_device_private_page(const struct page *page);
+
+DECLARE_STATIC_KEY_FALSE(device_private_key);
+
 static inline void get_page(struct page *page)
 {
page = compound_head(page);
@@ -822,6 +823,18 @@ static inline void put_page(struct page *page)
 {
page = compound_head(page);
 
+   /*
+* For private device pages we need to catch refcount transition from
+* 2 to 1, when refcount reach one it means the private device page is
+* free and we need to inform the device driver through callback. See
+* include/linux/memremap.h and HMM for details.
+*/
+   if (static_branch_unlikely(_private_key) &&
+   unlikely(is_device_private_page(page))) {
+   put_zone_device_private_page(page);
+   return;
+   }
+
if (put_page_testzero(page))
__put_page(page);
 }
diff --git a/kernel/memremap.c b/kernel/memremap.c
index d3241f51c1f0..398630c1fba3 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,7 +11,6 @@
  * General Public License for more details.
  */
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -476,3 +475,27 @@ struct vmem_altmap *to_vmem_altmap(unsigned long 
memmap_start)
return pgmap ? pgmap->altmap : NULL;
 }
 #endif /* CONFIG_ZONE_DEVICE */
+
+
+#ifdef CONFIG_DEVICE_PRIVATE
+void put_zone_device_private_page(struct page *page)
+{
+   int count = page_ref_dec_return(page);
+
+   /*
+* If refcount is 1 then page is freed and refcount is stable as nobody
+* holds a reference on the page.
+*/
+   if (count == 1) {
+   /* Clear Active bit in case of parallel mark_page_accessed */
+   __ClearPageActive(page);
+   __ClearPageWaiters(page);
+
+   page->mapping = NULL;
+
+   page->pgmap->page_free(page, page-&
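
On the driver side, the page_free() callback that
put_zone_device_private_page() ends up invoking (the truncated call above
suggests it is passed the page plus the pgmap private data, which is an
assumption here) could be sketched as follows; struct dummy_device and
dummy_mark_pfn_free() are hypothetical:

static void dummy_devmem_page_free(struct page *page, void *data)
{
        struct dummy_device *ddev = data;

        /*
         * Do not use page->lru here: for ZONE_DEVICE pages it overlaps other
         * fields (see the memcontrol patch in this series), so track the
         * freed page by pfn in driver private bookkeeping instead.
         */
        spin_lock(&ddev->free_lock);
        dummy_mark_pfn_free(ddev, page_to_pfn(page));
        spin_unlock(&ddev->free_lock);
}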

[HMM-v25 08/19] mm/ZONE_DEVICE: special case put_page() for device private pages v4

2017-08-16 Thread Jérôme Glisse
A ZONE_DEVICE page that reaches a refcount of 1 is free, ie it no longer
has any user. For device private pages this is important to catch, and
thus we need to special case put_page() for them.

Changed since v3:
  - clear page mapping field
Changed since v2:
  - clear page active and waiters
Changed since v1:
  - use static key to disable special code path in put_page() by
default
  - uninline put_zone_device_private_page()
  - fix build issues with some kernel configs related to header
inter-dependencies

Signed-off-by: Jérôme Glisse 
Cc: Kirill A. Shutemov 
Cc: Dan Williams 
Cc: Ross Zwisler 
---
 include/linux/memremap.h | 13 +
 include/linux/mm.h   | 31 ++-
 kernel/memremap.c| 25 -
 mm/hmm.c |  8 
 4 files changed, 67 insertions(+), 10 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8e164ec9eed0..8aa6b82679e2 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -126,6 +126,14 @@ struct dev_pagemap {
 void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap);
 struct dev_pagemap *find_dev_pagemap(resource_size_t phys);
+
+static inline bool is_zone_device_page(const struct page *page);
+
+static inline bool is_device_private_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_PRIVATE;
+}
 #else
 static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *ref,
@@ -144,6 +152,11 @@ static inline struct dev_pagemap 
*find_dev_pagemap(resource_size_t phys)
 {
return NULL;
 }
+
+static inline bool is_device_private_page(const struct page *page)
+{
+   return false;
+}
 #endif
 
 /**
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a59e149b958a..515d4ae611b2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct mempolicy;
 struct anon_vma;
@@ -788,25 +789,25 @@ static inline bool is_zone_device_page(const struct page 
*page)
 {
return page_zonenum(page) == ZONE_DEVICE;
 }
-
-static inline bool is_device_private_page(const struct page *page)
-{
-   /* See MEMORY_DEVICE_PRIVATE in include/linux/memory_hotplug.h */
-   return ((page_zonenum(page) == ZONE_DEVICE) &&
-   (page->pgmap->type == MEMORY_DEVICE_PRIVATE));
-}
 #else
 static inline bool is_zone_device_page(const struct page *page)
 {
return false;
 }
+#endif
 
-static inline bool is_device_private_page(const struct page *page)
+#ifdef CONFIG_DEVICE_PRIVATE
+void put_zone_device_private_page(struct page *page);
+#else
+static inline void put_zone_device_private_page(struct page *page)
 {
-   return false;
 }
 #endif
 
+static inline bool is_device_private_page(const struct page *page);
+
+DECLARE_STATIC_KEY_FALSE(device_private_key);
+
 static inline void get_page(struct page *page)
 {
page = compound_head(page);
@@ -822,6 +823,18 @@ static inline void put_page(struct page *page)
 {
page = compound_head(page);
 
+   /*
+* For private device pages we need to catch refcount transition from
+* 2 to 1, when refcount reach one it means the private device page is
+* free and we need to inform the device driver through callback. See
+* include/linux/memremap.h and HMM for details.
+*/
+   if (static_branch_unlikely(_private_key) &&
+   unlikely(is_device_private_page(page))) {
+   put_zone_device_private_page(page);
+   return;
+   }
+
if (put_page_testzero(page))
__put_page(page);
 }
diff --git a/kernel/memremap.c b/kernel/memremap.c
index d3241f51c1f0..398630c1fba3 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -11,7 +11,6 @@
  * General Public License for more details.
  */
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -476,3 +475,27 @@ struct vmem_altmap *to_vmem_altmap(unsigned long 
memmap_start)
return pgmap ? pgmap->altmap : NULL;
 }
 #endif /* CONFIG_ZONE_DEVICE */
+
+
+#ifdef CONFIG_DEVICE_PRIVATE
+void put_zone_device_private_page(struct page *page)
+{
+   int count = page_ref_dec_return(page);
+
+   /*
+* If refcount is 1 then page is freed and refcount is stable as nobody
+* holds a reference on the page.
+*/
+   if (count == 1) {
+   /* Clear Active bit in case of parallel mark_page_accessed */
+   __ClearPageActive(page);
+   __ClearPageWaiters(page);
+
+   page->mapping = NULL;
+
+   page->pgmap->page_free(page, page->pgmap->data);
+   } else if (!count)
+   __put_page(page);
+}
+EXPORT_SYMBOL(put_zone_device_private_page);
+#endif 
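For illustration, the refcount convention above (device private pages keep
one extra reference, so they are considered free at refcount 1 rather than
0) can be modeled with a small stand-alone sketch. Everything below is
illustrative userspace C with made-up names, not kernel API:

#include <stdio.h>

struct toy_page {
	int refcount;		/* 1 (device extra ref) + number of users */
	int device_private;	/* models a ZONE_DEVICE private page */
};

static void toy_driver_free(struct toy_page *p)
{
	/* the device driver gets the page back, like page_free() above */
	printf("page handed back to the device driver\n");
}

static void toy_put_page(struct toy_page *p)
{
	int count = --p->refcount;

	if (p->device_private && count == 1) {
		/* 2 -> 1 transition: page is free from the CPU's view */
		toy_driver_free(p);
		return;
	}
	if (count == 0)
		printf("page returned to the page allocator\n");
}

int main(void)
{
	struct toy_page page = { .refcount = 2, .device_private = 1 };

	toy_put_page(&page);	/* triggers the driver free callback */
	return 0;
}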

[HMM-v25 12/19] mm/hmm/devmem: dummy HMM device for ZONE_DEVICE memory v3

2017-08-16 Thread Jérôme Glisse
This introduces a dummy HMM device class so a device driver can use it
to create an hmm_device for the sole purpose of registering device
memory. It is useful to device drivers that want to manage multiple
physical device memories under the same struct device umbrella.

Changed since v2:
  - use device_initcall() and drop everything that is module specific
Changed since v1:
  - Improve commit message
  - Add drvdata parameter to set on struct device

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 include/linux/hmm.h | 22 ++-
 mm/hmm.c| 81 +
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 0a8e5e3688e8..aeba696a4385 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -72,11 +72,11 @@
 
 #if IS_ENABLED(CONFIG_HMM)
 
+#include 
 #include 
 #include 
 #include 
 
-
 struct hmm;
 
 /*
@@ -474,6 +474,26 @@ static inline unsigned long 
hmm_devmem_page_get_drvdata(struct page *page)
 
return drvdata[1];
 }
+
+
+/*
+ * struct hmm_device - fake device to hang device memory onto
+ *
+ * @device: device struct
+ * @minor: device minor number
+ */
+struct hmm_device {
+   struct device   device;
+   unsigned intminor;
+};
+
+/*
+ * A device driver that wants to handle multiple devices memory through a
+ * single fake device can use hmm_device to do so. This is purely a helper and
+ * it is not strictly needed, in order to make use of any HMM functionality.
+ */
+struct hmm_device *hmm_device_new(void *drvdata);
+void hmm_device_put(struct hmm_device *hmm_device);
 #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index afc9ab8c..8c5ce5f76b2e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -19,6 +19,7 @@
  */
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -,4 +1112,84 @@ void hmm_devmem_remove(struct hmm_devmem *devmem)
devm_release_mem_region(device, start, size);
 }
 EXPORT_SYMBOL(hmm_devmem_remove);
+
+/*
+ * A device driver that wants to handle multiple devices memory through a
+ * single fake device can use hmm_device to do so. This is purely a helper
+ * and it is not needed to make use of any HMM functionality.
+ */
+#define HMM_DEVICE_MAX 256
+
+static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
+static DEFINE_SPINLOCK(hmm_device_lock);
+static struct class *hmm_device_class;
+static dev_t hmm_device_devt;
+
+static void hmm_device_release(struct device *device)
+{
+   struct hmm_device *hmm_device;
+
+   hmm_device = container_of(device, struct hmm_device, device);
+   spin_lock(_device_lock);
+   clear_bit(hmm_device->minor, hmm_device_mask);
+   spin_unlock(_device_lock);
+
+   kfree(hmm_device);
+}
+
+struct hmm_device *hmm_device_new(void *drvdata)
+{
+   struct hmm_device *hmm_device;
+
+   hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
+   if (!hmm_device)
+   return ERR_PTR(-ENOMEM);
+
+   spin_lock(_device_lock);
+   hmm_device->minor = find_first_zero_bit(hmm_device_mask, 
HMM_DEVICE_MAX);
+   if (hmm_device->minor >= HMM_DEVICE_MAX) {
+   spin_unlock(_device_lock);
+   kfree(hmm_device);
+   return ERR_PTR(-EBUSY);
+   }
+   set_bit(hmm_device->minor, hmm_device_mask);
+   spin_unlock(_device_lock);
+
+   dev_set_name(_device->device, "hmm_device%d", hmm_device->minor);
+   hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
+   hmm_device->minor);
+   hmm_device->device.release = hmm_device_release;
+   dev_set_drvdata(_device->device, drvdata);
+   hmm_device->device.class = hmm_device_class;
+   device_initialize(_device->device);
+
+   return hmm_device;
+}
+EXPORT_SYMBOL(hmm_device_new);
+
+void hmm_device_put(struct hmm_device *hmm_device)
+{
+   put_device(_device->device);
+}
+EXPORT_SYMBOL(hmm_device_put);
+
+static int __init hmm_init(void)
+{
+   int ret;
+
+   ret = alloc_chrdev_region(_device_devt, 0,
+ HMM_DEVICE_MAX,
+ "hmm_device");
+   if (ret)
+   return ret;
+
+   hmm_device_class = class_create(THIS_MODULE, "hmm_device");
+   if (IS_ERR(hmm_device_class)) {
+   unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
+   return PTR_ERR(hmm_device_class);
+   }
+   return 0;
+}
+
+device_initcall(hmm_init);
 #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
-- 
2.13.4



[HMM-v25 12/19] mm/hmm/devmem: dummy HMM device for ZONE_DEVICE memory v3

2017-08-16 Thread Jérôme Glisse
This introduces a dummy HMM device class so a device driver can use it
to create an hmm_device for the sole purpose of registering device
memory. It is useful to device drivers that want to manage multiple
physical device memories under the same struct device umbrella.

Changed since v2:
  - use device_initcall() and drop everything that is module specific
Changed since v1:
  - Improve commit message
  - Add drvdata parameter to set on struct device

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 include/linux/hmm.h | 22 ++-
 mm/hmm.c| 81 +
 2 files changed, 102 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 0a8e5e3688e8..aeba696a4385 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -72,11 +72,11 @@
 
 #if IS_ENABLED(CONFIG_HMM)
 
+#include 
 #include 
 #include 
 #include 
 
-
 struct hmm;
 
 /*
@@ -474,6 +474,26 @@ static inline unsigned long 
hmm_devmem_page_get_drvdata(struct page *page)
 
return drvdata[1];
 }
+
+
+/*
+ * struct hmm_device - fake device to hang device memory onto
+ *
+ * @device: device struct
+ * @minor: device minor number
+ */
+struct hmm_device {
+   struct device   device;
+   unsigned intminor;
+};
+
+/*
+ * A device driver that wants to handle multiple devices memory through a
+ * single fake device can use hmm_device to do so. This is purely a helper and
+ * it is not strictly needed, in order to make use of any HMM functionality.
+ */
+struct hmm_device *hmm_device_new(void *drvdata);
+void hmm_device_put(struct hmm_device *hmm_device);
 #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
 
 
diff --git a/mm/hmm.c b/mm/hmm.c
index afc9ab8c..8c5ce5f76b2e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -19,6 +19,7 @@
  */
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -,4 +1112,84 @@ void hmm_devmem_remove(struct hmm_devmem *devmem)
devm_release_mem_region(device, start, size);
 }
 EXPORT_SYMBOL(hmm_devmem_remove);
+
+/*
+ * A device driver that wants to handle multiple devices memory through a
+ * single fake device can use hmm_device to do so. This is purely a helper
+ * and it is not needed to make use of any HMM functionality.
+ */
+#define HMM_DEVICE_MAX 256
+
+static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
+static DEFINE_SPINLOCK(hmm_device_lock);
+static struct class *hmm_device_class;
+static dev_t hmm_device_devt;
+
+static void hmm_device_release(struct device *device)
+{
+   struct hmm_device *hmm_device;
+
+   hmm_device = container_of(device, struct hmm_device, device);
+   spin_lock(_device_lock);
+   clear_bit(hmm_device->minor, hmm_device_mask);
+   spin_unlock(_device_lock);
+
+   kfree(hmm_device);
+}
+
+struct hmm_device *hmm_device_new(void *drvdata)
+{
+   struct hmm_device *hmm_device;
+
+   hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
+   if (!hmm_device)
+   return ERR_PTR(-ENOMEM);
+
+   spin_lock(_device_lock);
+   hmm_device->minor = find_first_zero_bit(hmm_device_mask, 
HMM_DEVICE_MAX);
+   if (hmm_device->minor >= HMM_DEVICE_MAX) {
+   spin_unlock(_device_lock);
+   kfree(hmm_device);
+   return ERR_PTR(-EBUSY);
+   }
+   set_bit(hmm_device->minor, hmm_device_mask);
+   spin_unlock(_device_lock);
+
+   dev_set_name(_device->device, "hmm_device%d", hmm_device->minor);
+   hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
+   hmm_device->minor);
+   hmm_device->device.release = hmm_device_release;
+   dev_set_drvdata(_device->device, drvdata);
+   hmm_device->device.class = hmm_device_class;
+   device_initialize(_device->device);
+
+   return hmm_device;
+}
+EXPORT_SYMBOL(hmm_device_new);
+
+void hmm_device_put(struct hmm_device *hmm_device)
+{
+   put_device(_device->device);
+}
+EXPORT_SYMBOL(hmm_device_put);
+
+static int __init hmm_init(void)
+{
+   int ret;
+
+   ret = alloc_chrdev_region(_device_devt, 0,
+ HMM_DEVICE_MAX,
+ "hmm_device");
+   if (ret)
+   return ret;
+
+   hmm_device_class = class_create(THIS_MODULE, "hmm_device");
+   if (IS_ERR(hmm_device_class)) {
+   unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
+   return PTR_ERR(hmm_device_class);
+   }
+   return 0;
+}
+
+device_initcall(hmm_init);
 #endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
-- 
2.13.4
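As a usage sketch, a driver that only needs a struct device to hang its
device memory on could use the two helpers introduced here roughly as
follows; only hmm_device_new()/hmm_device_put() come from this patch, the
dummy_* driver functions are hypothetical:

#include <linux/err.h>
#include <linux/hmm.h>

static struct hmm_device *dummy_hmm_device;

static int dummy_driver_probe(void)
{
	dummy_hmm_device = hmm_device_new(NULL);
	if (IS_ERR(dummy_hmm_device))
		return PTR_ERR(dummy_hmm_device);

	/* register device memory against &dummy_hmm_device->device here */
	return 0;
}

static void dummy_driver_remove(void)
{
	/* drops the last reference; hmm_device_release() frees the struct */
	hmm_device_put(dummy_hmm_device);
}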



[HMM-v25 11/19] mm/hmm/devmem: device memory hotplug using ZONE_DEVICE v7

2017-08-16 Thread Jérôme Glisse
This introduces a simple struct and associated helpers for device
drivers to use when hotplugging un-addressable device memory as
ZONE_DEVICE. It will find an unused physical address range and trigger
memory hotplug for it, which allocates and initializes struct pages for
the device memory.

Device drivers should use these helpers during device initialization to
hotplug the device memory. They should only need to remove the memory
once the device is going offline (shutdown or hotremove). There should
not be any userspace API to hotplug memory, except maybe for a host
device driver to allow adding more memory to a guest device driver.

The device's memory is managed by the device driver and HMM only
provides helpers to that effect.

Changed since v6:
  - fix start address calculation (Balbir Singh)
  - more comments and updated commit message.
Changed since v5:
  - kernel configuration simplification
  - remove now unused device driver helper
Changed since v4:
  - enable device_private_key static key when adding device memory
Changed since v3:
  - s/device unaddressable/device private/
Changed since v2:
  - s/SECTION_SIZE/PA_SECTION_SIZE
Changed since v1:
  - change to adapt to new add_pages() helper
  - make this x86-64 only for now

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
Signed-off-by: Balbir Singh <bsinghar...@gmail.com>
---
 include/linux/hmm.h | 155 +
 mm/hmm.c| 379 +++-
 2 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 79f5d4fa2d4f..0a8e5e3688e8 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -72,6 +72,11 @@
 
 #if IS_ENABLED(CONFIG_HMM)
 
+#include 
+#include 
+#include 
+
+
 struct hmm;
 
 /*
@@ -322,6 +327,156 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+struct hmm_devmem;
+
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+  unsigned long addr);
+
+/*
+ * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
+ *
+ * @free: call when refcount on page reach 1 and thus is no longer use
+ * @fault: call when there is a page fault to unaddressable memory
+ *
+ * Both callback happens from page_free() and page_fault() callback of struct
+ * dev_pagemap respectively. See include/linux/memremap.h for more details on
+ * those.
+ *
+ * The hmm_devmem_ops callback are just here to provide a coherent and
+ * uniq API to device driver and device driver should not register their
+ * own page_free() or page_fault() but rely on the hmm_devmem_ops call-
+ * back.
+ */
+struct hmm_devmem_ops {
+   /*
+* free() - free a device page
+* @devmem: device memory structure (see struct hmm_devmem)
+* @page: pointer to struct page being freed
+*
+* Call back occurs whenever a device page refcount reach 1 which
+* means that no one is holding any reference on the page anymore
+* (ZONE_DEVICE page have an elevated refcount of 1 as default so
+* that they are not release to the general page allocator).
+*
+* Note that callback has exclusive ownership of the page (as no
+* one is holding any reference).
+*/
+   void (*free)(struct hmm_devmem *devmem, struct page *page);
+   /*
+* fault() - CPU page fault or get user page (GUP)
+* @devmem: device memory structure (see struct hmm_devmem)
+* @vma: virtual memory area containing the virtual address
+* @addr: virtual address that faulted or for which there is a GUP
+* @page: pointer to struct page backing virtual address (unreliable)
+* @flags: FAULT_FLAG_* (see include/linux/mm.h)
+* @pmdp: page middle directory
+* Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
+*   on error
+*
+* The callback occurs whenever there is a CPU page fault or GUP on a
+* virtual address. This means that the device driver must migrate the
+* page back to regular memory (CPU accessible).
+*
+* The device driver is free to migrate more than one page from the
+* fault() callback as an optimization. However if device decide to
+* migrate more than one page it must always priotirize the faulting
+* address over the others.
+*
+* The struct page pointer is only given as an hint to allow quick
+* lookup of internal device driver data. A concurrent migration
+* might have already free that page and the virtual address mig

[HMM-v25 11/19] mm/hmm/devmem: device memory hotplug using ZONE_DEVICE v7

2017-08-16 Thread Jérôme Glisse
This introduces a simple struct and associated helpers for device
drivers to use when hotplugging un-addressable device memory as
ZONE_DEVICE. It will find an unused physical address range and trigger
memory hotplug for it, which allocates and initializes struct pages for
the device memory.

Device drivers should use these helpers during device initialization to
hotplug the device memory. They should only need to remove the memory
once the device is going offline (shutdown or hotremove). There should
not be any userspace API to hotplug memory, except maybe for a host
device driver to allow adding more memory to a guest device driver.

The device's memory is managed by the device driver and HMM only
provides helpers to that effect.

Changed since v6:
  - fix start address calculation (Balbir Singh)
  - more comments and updated commit message.
Changed since v5:
  - kernel configuration simplification
  - remove now unused device driver helper
Changed since v4:
  - enable device_private_key static key when adding device memory
Changed since v3:
  - s/device unaddressable/device private/
Changed since v2:
  - s/SECTION_SIZE/PA_SECTION_SIZE
Changed since v1:
  - change to adapt to new add_pages() helper
  - make this x86-64 only for now

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
Signed-off-by: Balbir Singh 
---
 include/linux/hmm.h | 155 +
 mm/hmm.c| 379 +++-
 2 files changed, 533 insertions(+), 1 deletion(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 79f5d4fa2d4f..0a8e5e3688e8 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -72,6 +72,11 @@
 
 #if IS_ENABLED(CONFIG_HMM)
 
+#include 
+#include 
+#include 
+
+
 struct hmm;
 
 /*
@@ -322,6 +327,156 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+struct hmm_devmem;
+
+struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
+  unsigned long addr);
+
+/*
+ * struct hmm_devmem_ops - callback for ZONE_DEVICE memory events
+ *
+ * @free: call when refcount on page reach 1 and thus is no longer use
+ * @fault: call when there is a page fault to unaddressable memory
+ *
+ * Both callback happens from page_free() and page_fault() callback of struct
+ * dev_pagemap respectively. See include/linux/memremap.h for more details on
+ * those.
+ *
+ * The hmm_devmem_ops callback are just here to provide a coherent and
+ * uniq API to device driver and device driver should not register their
+ * own page_free() or page_fault() but rely on the hmm_devmem_ops call-
+ * back.
+ */
+struct hmm_devmem_ops {
+   /*
+* free() - free a device page
+* @devmem: device memory structure (see struct hmm_devmem)
+* @page: pointer to struct page being freed
+*
+* Call back occurs whenever a device page refcount reach 1 which
+* means that no one is holding any reference on the page anymore
+* (ZONE_DEVICE page have an elevated refcount of 1 as default so
+* that they are not release to the general page allocator).
+*
+* Note that callback has exclusive ownership of the page (as no
+* one is holding any reference).
+*/
+   void (*free)(struct hmm_devmem *devmem, struct page *page);
+   /*
+* fault() - CPU page fault or get user page (GUP)
+* @devmem: device memory structure (see struct hmm_devmem)
+* @vma: virtual memory area containing the virtual address
+* @addr: virtual address that faulted or for which there is a GUP
+* @page: pointer to struct page backing virtual address (unreliable)
+* @flags: FAULT_FLAG_* (see include/linux/mm.h)
+* @pmdp: page middle directory
+* Returns: VM_FAULT_MINOR/MAJOR on success or one of VM_FAULT_ERROR
+*   on error
+*
+* The callback occurs whenever there is a CPU page fault or GUP on a
+* virtual address. This means that the device driver must migrate the
+* page back to regular memory (CPU accessible).
+*
+* The device driver is free to migrate more than one page from the
+* fault() callback as an optimization. However if device decide to
+* migrate more than one page it must always priotirize the faulting
+* address over the others.
+*
+* The struct page pointer is only given as an hint to allow quick
+* lookup of internal device driver data. A concurrent migration
+* might have already free that page and the virtual address might
+* not longer be back by it. So it should not be modified by the
+* callback.
+*
+* Note that mmap semaphore is held in read mode at least when
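To make the callback contract above concrete, a driver-side skeleton might
look roughly like this; the prototypes follow the documentation comment in
this patch, the dummy_* bodies are placeholders and not working code:

#include <linux/hmm.h>
#include <linux/mm.h>

static void dummy_devmem_free(struct hmm_devmem *devmem, struct page *page)
{
	/* refcount reached 1: hand the page back to the driver allocator */
}

static int dummy_devmem_fault(struct hmm_devmem *devmem,
			      struct vm_area_struct *vma,
			      unsigned long addr,
			      const struct page *page,
			      unsigned int flags,
			      pmd_t *pmdp)
{
	/*
	 * Migrate the faulting page back to system memory, then return
	 * VM_FAULT_MAJOR on success or a VM_FAULT_ERROR code on failure.
	 */
	return VM_FAULT_SIGBUS;
}

static const struct hmm_devmem_ops dummy_devmem_ops = {
	.free	= dummy_devmem_free,
	.fault	= dummy_devmem_fault,
};

/* during device initialization:
 *	devmem = hmm_devmem_add(&dummy_devmem_ops, &pdev->dev, size);
 */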

[HMM-v25 16/19] mm/migrate: support un-addressable ZONE_DEVICE page in migration v3

2017-08-16 Thread Jérôme Glisse
Allow unmapping and restoring the special swap entry of un-addressable
ZONE_DEVICE memory.

Changed since v2:
  - unconditionally allow device private memory to be migrated (it
cannot be pinned, so it is pointless to check the reference count).
Changed since v1:
  - s/device unaddressable/device private/

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
---
 include/linux/migrate.h |  10 +++-
 mm/migrate.c| 149 +++-
 mm/page_vma_mapped.c|  10 
 mm/rmap.c   |  25 
 4 files changed, 164 insertions(+), 30 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 8f73cebfc3f5..8dc8f0a3f1af 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -159,12 +159,18 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
 
 #ifdef CONFIG_MIGRATION
 
+/*
+ * Watch out for PAE architecture, which has an unsigned long, and might not
+ * have enough bits to store all physical address and flags. So far we have
+ * enough room for all our flags.
+ */
 #define MIGRATE_PFN_VALID  (1UL << 0)
 #define MIGRATE_PFN_MIGRATE(1UL << 1)
 #define MIGRATE_PFN_LOCKED (1UL << 2)
 #define MIGRATE_PFN_WRITE  (1UL << 3)
-#define MIGRATE_PFN_ERROR  (1UL << 4)
-#define MIGRATE_PFN_SHIFT  5
+#define MIGRATE_PFN_DEVICE (1UL << 4)
+#define MIGRATE_PFN_ERROR  (1UL << 5)
+#define MIGRATE_PFN_SHIFT  6
 
 static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
 {
diff --git a/mm/migrate.c b/mm/migrate.c
index 57d1fa7a8e62..6c8a9826da32 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -236,7 +237,13 @@ static bool remove_migration_pte(struct page *page, struct 
vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
 
-   flush_dcache_page(new);
+   if (unlikely(is_zone_device_page(new)) &&
+   is_device_private_page(new)) {
+   entry = make_device_private_entry(new, pte_write(pte));
+   pte = swp_entry_to_pte(entry);
+   } else
+   flush_dcache_page(new);
+
 #ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -2216,17 +2223,40 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pte = *ptep;
pfn = pte_pfn(pte);
 
-   if (!pte_present(pte)) {
+   if (pte_none(pte)) {
mpfn = pfn = 0;
goto next;
}
 
+   if (!pte_present(pte)) {
+   mpfn = pfn = 0;
+
+   /*
+* Only care about unaddressable device page special
+* page table entry. Other special swap entries are not
+* migratable, and we ignore regular swapped page.
+*/
+   entry = pte_to_swp_entry(pte);
+   if (!is_device_private_entry(entry))
+   goto next;
+
+   page = device_private_entry_to_page(entry);
+   mpfn = migrate_pfn(page_to_pfn(page))|
+   MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+   if (is_write_device_private_entry(entry))
+   mpfn |= MIGRATE_PFN_WRITE;
+   } else {
+   page = vm_normal_page(migrate->vma, addr, pte);
+   mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+   mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+   }
+
/* FIXME support THP */
-   page = vm_normal_page(migrate->vma, addr, pte);
if (!page || !page->mapping || PageTransCompound(page)) {
mpfn = pfn = 0;
goto next;
}
+   pfn = page_to_pfn(page);
 
/*
 * By getting a reference on the page we pin it and that blocks
@@ -2239,8 +2269,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 */
get_page(page);
migrate->cpages++;
-   mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
-   mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 
/*
 * Optimize for the common case where page is only mapped once
@@ -2267,10 +2295,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 */
page_remove_rmap(page, false);
put_page(page);
-  

[HMM-v25 16/19] mm/migrate: support un-addressable ZONE_DEVICE page in migration v3

2017-08-16 Thread Jérôme Glisse
Allow unmapping and restoring the special swap entry of un-addressable
ZONE_DEVICE memory.

Changed since v2:
  - unconditionally allow device private memory to be migrated (it
cannot be pinned, so it is pointless to check the reference count).
Changed since v1:
  - s/device unaddressable/device private/

Signed-off-by: Jérôme Glisse 
Cc: Kirill A. Shutemov 
---
 include/linux/migrate.h |  10 +++-
 mm/migrate.c| 149 +++-
 mm/page_vma_mapped.c|  10 
 mm/rmap.c   |  25 
 4 files changed, 164 insertions(+), 30 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 8f73cebfc3f5..8dc8f0a3f1af 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -159,12 +159,18 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
 
 #ifdef CONFIG_MIGRATION
 
+/*
+ * Watch out for PAE architecture, which has an unsigned long, and might not
+ * have enough bits to store all physical address and flags. So far we have
+ * enough room for all our flags.
+ */
 #define MIGRATE_PFN_VALID  (1UL << 0)
 #define MIGRATE_PFN_MIGRATE(1UL << 1)
 #define MIGRATE_PFN_LOCKED (1UL << 2)
 #define MIGRATE_PFN_WRITE  (1UL << 3)
-#define MIGRATE_PFN_ERROR  (1UL << 4)
-#define MIGRATE_PFN_SHIFT  5
+#define MIGRATE_PFN_DEVICE (1UL << 4)
+#define MIGRATE_PFN_ERROR  (1UL << 5)
+#define MIGRATE_PFN_SHIFT  6
 
 static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
 {
diff --git a/mm/migrate.c b/mm/migrate.c
index 57d1fa7a8e62..6c8a9826da32 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -36,6 +36,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -236,7 +237,13 @@ static bool remove_migration_pte(struct page *page, struct 
vm_area_struct *vma,
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
 
-   flush_dcache_page(new);
+   if (unlikely(is_zone_device_page(new)) &&
+   is_device_private_page(new)) {
+   entry = make_device_private_entry(new, pte_write(pte));
+   pte = swp_entry_to_pte(entry);
+   } else
+   flush_dcache_page(new);
+
 #ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
pte = pte_mkhuge(pte);
@@ -2216,17 +2223,40 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pte = *ptep;
pfn = pte_pfn(pte);
 
-   if (!pte_present(pte)) {
+   if (pte_none(pte)) {
mpfn = pfn = 0;
goto next;
}
 
+   if (!pte_present(pte)) {
+   mpfn = pfn = 0;
+
+   /*
+* Only care about unaddressable device page special
+* page table entry. Other special swap entries are not
+* migratable, and we ignore regular swapped page.
+*/
+   entry = pte_to_swp_entry(pte);
+   if (!is_device_private_entry(entry))
+   goto next;
+
+   page = device_private_entry_to_page(entry);
+   mpfn = migrate_pfn(page_to_pfn(page))|
+   MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+   if (is_write_device_private_entry(entry))
+   mpfn |= MIGRATE_PFN_WRITE;
+   } else {
+   page = vm_normal_page(migrate->vma, addr, pte);
+   mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
+   mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+   }
+
/* FIXME support THP */
-   page = vm_normal_page(migrate->vma, addr, pte);
if (!page || !page->mapping || PageTransCompound(page)) {
mpfn = pfn = 0;
goto next;
}
+   pfn = page_to_pfn(page);
 
/*
 * By getting a reference on the page we pin it and that blocks
@@ -2239,8 +2269,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 */
get_page(page);
migrate->cpages++;
-   mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
-   mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 
/*
 * Optimize for the common case where page is only mapped once
@@ -2267,10 +2295,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 */
page_remove_rmap(page, false);
put_page(page);
-   unmapped++;
+
+   if (pte_present(pte))
+  
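The new MIGRATE_PFN_DEVICE flag lets the alloc_and_copy() side tell device
private sources apart from ordinary system pages. A hedged sketch of that
branch; the dummy_* helper and the copy strategy are hypothetical driver
logic, only the flags come from this series:

#include <linux/migrate.h>

static void dummy_copy_sources(const unsigned long *src, unsigned long npages)
{
	unsigned long i;

	for (i = 0; i < npages; i++) {
		struct page *spage = migrate_pfn_to_page(src[i]);

		if (!spage || !(src[i] & MIGRATE_PFN_MIGRATE))
			continue;

		if (src[i] & MIGRATE_PFN_DEVICE) {
			/* source is device private memory: copy it back to
			 * system memory with the device's DMA engine */
		} else {
			/* ordinary system memory page */
		}
	}
}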

[HMM-v25 15/19] mm/migrate: migrate_vma() unmap page from vma while collecting pages

2017-08-16 Thread Jérôme Glisse
The common case for migration of a virtual address range is that pages
are mapped only once, inside the vma in which migration is taking
place. Because we already walk the CPU page table for that range, we
can directly do the unmap there and set up the special migration swap
entry.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 mm/migrate.c | 141 +++
 1 file changed, 112 insertions(+), 29 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 60e2f8369cd7..57d1fa7a8e62 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2160,7 +2160,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct migrate_vma *migrate = walk->private;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
-   unsigned long addr = start;
+   unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
pte_t *ptep;
 
@@ -2205,9 +2205,12 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
return migrate_vma_collect_hole(start, end, walk);
 
ptep = pte_offset_map_lock(mm, pmdp, addr, );
+   arch_enter_lazy_mmu_mode();
+
for (; addr < end; addr += PAGE_SIZE, ptep++) {
unsigned long mpfn, pfn;
struct page *page;
+   swp_entry_t entry;
pte_t pte;
 
pte = *ptep;
@@ -2239,11 +2242,44 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 
+   /*
+* Optimize for the common case where page is only mapped once
+* in one process. If we can lock the page, then we can safely
+* set up a special migration page table entry now.
+*/
+   if (trylock_page(page)) {
+   pte_t swp_pte;
+
+   mpfn |= MIGRATE_PFN_LOCKED;
+   ptep_get_and_clear(mm, addr, ptep);
+
+   /* Setup special migration page table entry */
+   entry = make_migration_entry(page, pte_write(pte));
+   swp_pte = swp_entry_to_pte(entry);
+   if (pte_soft_dirty(pte))
+   swp_pte = pte_swp_mksoft_dirty(swp_pte);
+   set_pte_at(mm, addr, ptep, swp_pte);
+
+   /*
+* This is like regular unmap: we remove the rmap and
+* drop page refcount. Page won't be freed, as we took
+* a reference just above.
+*/
+   page_remove_rmap(page, false);
+   put_page(page);
+   unmapped++;
+   }
+
 next:
migrate->src[migrate->npages++] = mpfn;
}
+   arch_leave_lazy_mmu_mode();
pte_unmap_unlock(ptep - 1, ptl);
 
+   /* Only flush the TLB if we actually modified any entries */
+   if (unmapped)
+   flush_tlb_range(walk->vma, start, end);
+
return 0;
 }
 
@@ -2268,7 +2304,13 @@ static void migrate_vma_collect(struct migrate_vma 
*migrate)
mm_walk.mm = migrate->vma->vm_mm;
mm_walk.private = migrate;
 
+   mmu_notifier_invalidate_range_start(mm_walk.mm,
+   migrate->start,
+   migrate->end);
walk_page_range(migrate->start, migrate->end, _walk);
+   mmu_notifier_invalidate_range_end(mm_walk.mm,
+ migrate->start,
+ migrate->end);
 
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
 }
@@ -2316,32 +2358,37 @@ static bool migrate_vma_check_page(struct page *page)
 static void migrate_vma_prepare(struct migrate_vma *migrate)
 {
const unsigned long npages = migrate->npages;
+   const unsigned long start = migrate->start;
+   unsigned long addr, i, restore = 0;
bool allow_drain = true;
-   unsigned long i;
 
lru_add_drain();
 
for (i = 0; (i < npages) && migrate->cpages; i++) {
struct page *page = migrate_pfn_to_page(migrate->src[i]);
+   bool remap = true;
 
if (!page)
continue;
 
-   /*
-* Because we are migrating several pages there can be
-* a deadlock between 2 concurrent migration where each
-* are wa

[HMM-v25 15/19] mm/migrate: migrate_vma() unmap page from vma while collecting pages

2017-08-16 Thread Jérôme Glisse
The common case for migration of a virtual address range is that pages
are mapped only once, inside the vma in which migration is taking
place. Because we already walk the CPU page table for that range, we
can directly do the unmap there and set up the special migration swap
entry.

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 mm/migrate.c | 141 +++
 1 file changed, 112 insertions(+), 29 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 60e2f8369cd7..57d1fa7a8e62 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2160,7 +2160,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
struct migrate_vma *migrate = walk->private;
struct vm_area_struct *vma = walk->vma;
struct mm_struct *mm = vma->vm_mm;
-   unsigned long addr = start;
+   unsigned long addr = start, unmapped = 0;
spinlock_t *ptl;
pte_t *ptep;
 
@@ -2205,9 +2205,12 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
return migrate_vma_collect_hole(start, end, walk);
 
ptep = pte_offset_map_lock(mm, pmdp, addr, );
+   arch_enter_lazy_mmu_mode();
+
for (; addr < end; addr += PAGE_SIZE, ptep++) {
unsigned long mpfn, pfn;
struct page *page;
+   swp_entry_t entry;
pte_t pte;
 
pte = *ptep;
@@ -2239,11 +2242,44 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
 
+   /*
+* Optimize for the common case where page is only mapped once
+* in one process. If we can lock the page, then we can safely
+* set up a special migration page table entry now.
+*/
+   if (trylock_page(page)) {
+   pte_t swp_pte;
+
+   mpfn |= MIGRATE_PFN_LOCKED;
+   ptep_get_and_clear(mm, addr, ptep);
+
+   /* Setup special migration page table entry */
+   entry = make_migration_entry(page, pte_write(pte));
+   swp_pte = swp_entry_to_pte(entry);
+   if (pte_soft_dirty(pte))
+   swp_pte = pte_swp_mksoft_dirty(swp_pte);
+   set_pte_at(mm, addr, ptep, swp_pte);
+
+   /*
+* This is like regular unmap: we remove the rmap and
+* drop page refcount. Page won't be freed, as we took
+* a reference just above.
+*/
+   page_remove_rmap(page, false);
+   put_page(page);
+   unmapped++;
+   }
+
 next:
migrate->src[migrate->npages++] = mpfn;
}
+   arch_leave_lazy_mmu_mode();
pte_unmap_unlock(ptep - 1, ptl);
 
+   /* Only flush the TLB if we actually modified any entries */
+   if (unmapped)
+   flush_tlb_range(walk->vma, start, end);
+
return 0;
 }
 
@@ -2268,7 +2304,13 @@ static void migrate_vma_collect(struct migrate_vma 
*migrate)
mm_walk.mm = migrate->vma->vm_mm;
mm_walk.private = migrate;
 
+   mmu_notifier_invalidate_range_start(mm_walk.mm,
+   migrate->start,
+   migrate->end);
walk_page_range(migrate->start, migrate->end, _walk);
+   mmu_notifier_invalidate_range_end(mm_walk.mm,
+ migrate->start,
+ migrate->end);
 
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
 }
@@ -2316,32 +2358,37 @@ static bool migrate_vma_check_page(struct page *page)
 static void migrate_vma_prepare(struct migrate_vma *migrate)
 {
const unsigned long npages = migrate->npages;
+   const unsigned long start = migrate->start;
+   unsigned long addr, i, restore = 0;
bool allow_drain = true;
-   unsigned long i;
 
lru_add_drain();
 
for (i = 0; (i < npages) && migrate->cpages; i++) {
struct page *page = migrate_pfn_to_page(migrate->src[i]);
+   bool remap = true;
 
if (!page)
continue;
 
-   /*
-* Because we are migrating several pages there can be
-* a deadlock between 2 concurrent migration where each
-* are waiting on each other page lock.
-*
-* Make migrate_vma() a best effort thing and backoff
-* for any page we 
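Later stages can tell from the src array whether this collect step already
locked and unmapped a page, because the MIGRATE_PFN_LOCKED flag set above
travels with the entry. A small hypothetical helper, for illustration only:

#include <linux/types.h>
#include <linux/migrate.h>

static bool dummy_collected_locked(unsigned long src_mpfn)
{
	/* set in migrate_vma_collect_pmd() when trylock_page() succeeded */
	return (src_mpfn & MIGRATE_PFN_VALID) &&
	       (src_mpfn & MIGRATE_PFN_LOCKED);
}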

[HMM-v25 14/19] mm/migrate: new memory migration helper for use with device memory v5

2017-08-16 Thread Jérôme Glisse
This patch adds new memory migration helpers, which migrate the memory
backing a range of virtual addresses of a process to different memory
(which can be allocated through a special allocator). It differs from
NUMA migration by working on a range of virtual addresses and thus by
doing the migration in chunks that can be large enough to use a DMA
engine or a special copy offloading engine.

Expected users are anyone with heterogeneous memory where different
memories have different characteristics (latency, bandwidth, ...). As
an example, IBM platforms with a CAPI bus can make use of this feature
to migrate between regular memory and CAPI device memory. New CPU
architectures with a pool of high performance memory not managed as
cache but presented as regular memory (while being faster and with
lower latency than DDR) will also be prime users of this patch.

Migration to private device memory will be useful for devices that have
a large pool of such memory, like GPUs; NVIDIA plans to use HMM for
that.

Changed since v4:
  - split THP instead of skipping them
Changes since v3:
  - Rebase
Changes since v2:
  - dropped HMM prefix and HMM-specific code
Changes since v1:
  - typo fixes
  - split early unmap optimization for pages with a single mapping

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Signed-off-by: Evgeny Baskakov <ebaska...@nvidia.com>
Signed-off-by: John Hubbard <jhubb...@nvidia.com>
Signed-off-by: Mark Hairgrove <mhairgr...@nvidia.com>
Signed-off-by: Sherry Cheung <sche...@nvidia.com>
Signed-off-by: Subhash Gutti <sgu...@nvidia.com>
---
 include/linux/migrate.h | 104 ++
 mm/migrate.c| 492 
 2 files changed, 596 insertions(+)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7db4c812a2a6..8f73cebfc3f5 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -156,4 +156,108 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
 }
 #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
 
+
+#ifdef CONFIG_MIGRATION
+
+#define MIGRATE_PFN_VALID  (1UL << 0)
+#define MIGRATE_PFN_MIGRATE(1UL << 1)
+#define MIGRATE_PFN_LOCKED (1UL << 2)
+#define MIGRATE_PFN_WRITE  (1UL << 3)
+#define MIGRATE_PFN_ERROR  (1UL << 4)
+#define MIGRATE_PFN_SHIFT  5
+
+static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
+{
+   if (!(mpfn & MIGRATE_PFN_VALID))
+   return NULL;
+   return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT);
+}
+
+static inline unsigned long migrate_pfn(unsigned long pfn)
+{
+   return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
+}
+
+/*
+ * struct migrate_vma_ops - migrate operation callback
+ *
+ * @alloc_and_copy: alloc destination memory and copy source memory to it
+ * @finalize_and_map: allow caller to map the successfully migrated pages
+ *
+ *
+ * The alloc_and_copy() callback happens once all source pages have been 
locked,
+ * unmapped and checked (checked whether pinned or not). All pages that can be
+ * migrated will have an entry in the src array set with the pfn value of the
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other
+ * flags might be set but should be ignored by the callback).
+ *
+ * The alloc_and_copy() callback can then allocate destination memory and copy
+ * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the
+ * callback must update each corresponding entry in the dst array with the pfn
+ * value of the destination page and with the MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
+ * locked, via lock_page()).
+ *
+ * At this point the alloc_and_copy() callback is done and returns.
+ *
+ * Note that the callback does not have to migrate all the pages that are
+ * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration
+ * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also
+ * set in the src array entry). If the device driver cannot migrate a device
+ * page back to system memory, then it must set the corresponding dst array
+ * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to
+ * access any of the virtual addresses originally backed by this page. Because
+ * a SIGBUS is such a severe result for the userspace process, the device
+ * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
+ * unrecoverable state.
+ *
+ * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
+ * OR BAD THINGS WILL HAPPEN !
+ *
+ *
+ * The finalize_and_map() callback happens after struct page migration from
+ * source to destination (destination struct pages are the struct pages for the
+ * memory allocated by the alloc_and_copy() callback).  Migration can fail, and
+ * 

[HMM-v25 14/19] mm/migrate: new memory migration helper for use with device memory v5

2017-08-16 Thread Jérôme Glisse
This patch adds new memory migration helpers, which migrate the memory
backing a range of virtual addresses of a process to different memory
(which can be allocated through a special allocator). It differs from
NUMA migration by working on a range of virtual addresses and thus by
doing the migration in chunks that can be large enough to use a DMA
engine or a special copy offloading engine.

Expected users are anyone with heterogeneous memory where different
memories have different characteristics (latency, bandwidth, ...). As
an example, IBM platforms with a CAPI bus can make use of this feature
to migrate between regular memory and CAPI device memory. New CPU
architectures with a pool of high performance memory not managed as
cache but presented as regular memory (while being faster and with
lower latency than DDR) will also be prime users of this patch.

Migration to private device memory will be useful for devices that have
a large pool of such memory, like GPUs; NVIDIA plans to use HMM for
that.

Changed since v4:
  - split THP instead of skipping them
Changes since v3:
  - Rebase
Changes since v2:
  - dropped HMM prefix and HMM-specific code
Changes since v1:
  - typo fixes
  - split early unmap optimization for pages with a single mapping

Signed-off-by: Jérôme Glisse 
Signed-off-by: Evgeny Baskakov 
Signed-off-by: John Hubbard 
Signed-off-by: Mark Hairgrove 
Signed-off-by: Sherry Cheung 
Signed-off-by: Subhash Gutti 
---
 include/linux/migrate.h | 104 ++
 mm/migrate.c| 492 
 2 files changed, 596 insertions(+)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 7db4c812a2a6..8f73cebfc3f5 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -156,4 +156,108 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
 }
 #endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
 
+
+#ifdef CONFIG_MIGRATION
+
+#define MIGRATE_PFN_VALID  (1UL << 0)
+#define MIGRATE_PFN_MIGRATE(1UL << 1)
+#define MIGRATE_PFN_LOCKED (1UL << 2)
+#define MIGRATE_PFN_WRITE  (1UL << 3)
+#define MIGRATE_PFN_ERROR  (1UL << 4)
+#define MIGRATE_PFN_SHIFT  5
+
+static inline struct page *migrate_pfn_to_page(unsigned long mpfn)
+{
+   if (!(mpfn & MIGRATE_PFN_VALID))
+   return NULL;
+   return pfn_to_page(mpfn >> MIGRATE_PFN_SHIFT);
+}
+
+static inline unsigned long migrate_pfn(unsigned long pfn)
+{
+   return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
+}
+
+/*
+ * struct migrate_vma_ops - migrate operation callback
+ *
+ * @alloc_and_copy: alloc destination memory and copy source memory to it
+ * @finalize_and_map: allow caller to map the successfully migrated pages
+ *
+ *
+ * The alloc_and_copy() callback happens once all source pages have been 
locked,
+ * unmapped and checked (checked whether pinned or not). All pages that can be
+ * migrated will have an entry in the src array set with the pfn value of the
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set (other
+ * flags might be set but should be ignored by the callback).
+ *
+ * The alloc_and_copy() callback can then allocate destination memory and copy
+ * source memory to it for all those entries (ie with MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_MIGRATE flag set). Once these are allocated and copied, the
+ * callback must update each corresponding entry in the dst array with the pfn
+ * value of the destination page and with the MIGRATE_PFN_VALID and
+ * MIGRATE_PFN_LOCKED flags set (destination pages must have their struct pages
+ * locked, via lock_page()).
+ *
+ * At this point the alloc_and_copy() callback is done and returns.
+ *
+ * Note that the callback does not have to migrate all the pages that are
+ * marked with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration
+ * from device memory to system memory (ie the MIGRATE_PFN_DEVICE flag is also
+ * set in the src array entry). If the device driver cannot migrate a device
+ * page back to system memory, then it must set the corresponding dst array
+ * entry to MIGRATE_PFN_ERROR. This will trigger a SIGBUS if CPU tries to
+ * access any of the virtual addresses originally backed by this page. Because
+ * a SIGBUS is such a severe result for the userspace process, the device
+ * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
+ * unrecoverable state.
+ *
+ * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
+ * OR BAD THINGS WILL HAPPEN !
+ *
+ *
+ * The finalize_and_map() callback happens after struct page migration from
+ * source to destination (destination struct pages are the struct pages for the
+ * memory allocated by the alloc_and_copy() callback).  Migration can fail, and
+ * thus the finalize_and_map() allows the driver to inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * hav
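The mpfn encoding defined above packs the pfn and a handful of flags into
one unsigned long. The round trip can be checked with a stand-alone demo;
the constants are copied from the patch, the demo itself is illustrative
userspace C rather than kernel code:

#include <stdio.h>

#define MIGRATE_PFN_VALID	(1UL << 0)
#define MIGRATE_PFN_MIGRATE	(1UL << 1)
#define MIGRATE_PFN_SHIFT	5

static unsigned long migrate_pfn(unsigned long pfn)
{
	/* pfn goes in the high bits, flags live below MIGRATE_PFN_SHIFT */
	return (pfn << MIGRATE_PFN_SHIFT) | MIGRATE_PFN_VALID;
}

int main(void)
{
	unsigned long mpfn = migrate_pfn(0x1234) | MIGRATE_PFN_MIGRATE;

	printf("pfn     = 0x%lx\n", mpfn >> MIGRATE_PFN_SHIFT);
	printf("valid   = %d\n", (mpfn & MIGRATE_PFN_VALID) != 0);
	printf("migrate = %d\n", (mpfn & MIGRATE_PFN_MIGRATE) != 0);
	return 0;
}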

[HMM-v25 19/19] mm/hmm: add new helper to hotplug CDM memory region v3

2017-08-16 Thread Jérôme Glisse
Unlike unaddressable memory, coherent device memory has a real
resource associated with it on the system (as the CPU can address
it). Add a new helper to hotplug such memory within the HMM
framework.

Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Reviewed-by: Balbir Singh <bsinghar...@gmail.com>
---
 include/linux/hmm.h |  3 ++
 mm/hmm.c| 88 ++---
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 79e63178fd87..5866f3194c26 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -443,6 +443,9 @@ struct hmm_devmem {
 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
  struct device *device,
  unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res);
 void hmm_devmem_remove(struct hmm_devmem *devmem);
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 1a1e79d390c1..3faa4d40295e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -854,7 +854,11 @@ static void hmm_devmem_release(struct device *dev, void 
*data)
zone = page_zone(page);
 
mem_hotplug_begin();
-   __remove_pages(zone, start_pfn, npages);
+   if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+   __remove_pages(zone, start_pfn, npages);
+   else
+   arch_remove_memory(start_pfn << PAGE_SHIFT,
+  npages << PAGE_SHIFT);
mem_hotplug_done();
 
hmm_devmem_radix_release(resource);
@@ -890,7 +894,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
if (is_ram == REGION_INTERSECTS)
return -ENXIO;
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+   else
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
devmem->pagemap.res = devmem->resource;
devmem->pagemap.page_fault = hmm_devmem_fault;
devmem->pagemap.page_free = hmm_devmem_free;
@@ -935,9 +943,15 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
 * over the device memory is un-accessible thus we do not want to
 * create a linear mapping for the memory like arch_add_memory()
 * would do.
+*
+* For device public memory, which is accesible by the CPU, we do
+* want the linear mapping and thus use arch_add_memory().
 */
-   ret = add_pages(nid, align_start >> PAGE_SHIFT,
-   align_size >> PAGE_SHIFT, false);
+   if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+   ret = arch_add_memory(nid, align_start, align_size, false);
+   else
+   ret = add_pages(nid, align_start >> PAGE_SHIFT,
+   align_size >> PAGE_SHIFT, false);
if (ret) {
mem_hotplug_done();
goto error_add_memory;
@@ -1084,6 +1098,67 @@ struct hmm_devmem *hmm_devmem_add(const struct 
hmm_devmem_ops *ops,
 }
 EXPORT_SYMBOL(hmm_devmem_add);
 
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res)
+{
+   struct hmm_devmem *devmem;
+   int ret;
+
+   if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   return ERR_PTR(-EINVAL);
+
+   static_branch_enable(_private_key);
+
+   devmem = devres_alloc_node(_devmem_release, sizeof(*devmem),
+  GFP_KERNEL, dev_to_node(device));
+   if (!devmem)
+   return ERR_PTR(-ENOMEM);
+
+   init_completion(>completion);
+   devmem->pfn_first = -1UL;
+   devmem->pfn_last = -1UL;
+   devmem->resource = res;
+   devmem->device = device;
+   devmem->ops = ops;
+
+   ret = percpu_ref_init(>ref, _devmem_ref_release,
+ 0, GFP_KERNEL);
+   if (ret)
+   goto error_percpu_ref;
+
+   ret = devm_add_action(device, hmm_devmem_ref_exit, >ref);
+   if (ret)
+   goto error_devm_add_action;
+
+
+   devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+   devmem->pfn_last = devmem->pfn_first +
+  (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+   ret = hmm_devmem_pages_create(devmem);
+   if (ret)
+   goto error_devm_add_action;
+
+   devres_add(device

[HMM-v25 06/19] mm/memory_hotplug: introduce add_pages

2017-08-16 Thread Jérôme Glisse
From: Michal Hocko <mho...@suse.com>

There are new users of memory hotplug emerging. Some of them require a
different subset of arch_add_memory. There are some which only require
allocation of struct pages without mapping those pages to the kernel
address space. We currently have __add_pages for that purpose. But this
is rather low-level and not very suitable for code outside of memory
hotplug. E.g. x86_64 wants to update max_pfn, which should be done by
the caller. Introduce add_pages() which should take care of those
details if they are needed. Each architecture should define its own
implementation and select CONFIG_ARCH_HAS_ADD_PAGES. All others use
the currently existing __add_pages.

Signed-off-by: Michal Hocko <mho...@suse.com>
Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Acked-by: Balbir Singh <bsinghar...@gmail.com>
---
 arch/x86/Kconfig   |  4 
 arch/x86/mm/init_64.c  | 22 +++---
 include/linux/memory_hotplug.h | 11 +++
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cf3012220d10..78492ac02779 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2319,6 +2319,10 @@ source "kernel/livepatch/Kconfig"
 
 endmenu
 
+config ARCH_HAS_ADD_PAGES
+   def_bool y
+   depends on X86_64 && ARCH_ENABLE_MEMORY_HOTPLUG
+
 config ARCH_ENABLE_MEMORY_HOTPLUG
def_bool y
depends on X86_64 || (X86_32 && HIGHMEM)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 136422d7d539..048fbe8fc274 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -761,7 +761,7 @@ void __init paging_init(void)
  * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
  * updating.
  */
-static void  update_end_of_memory_vars(u64 start, u64 size)
+static void update_end_of_memory_vars(u64 start, u64 size)
 {
unsigned long end_pfn = PFN_UP(start + size);
 
@@ -772,22 +772,30 @@ static void  update_end_of_memory_vars(u64 start, u64 
size)
}
 }
 
-int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock)
 {
-   unsigned long start_pfn = start >> PAGE_SHIFT;
-   unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
-   init_memory_mapping(start, start + size);
-
ret = __add_pages(nid, start_pfn, nr_pages, want_memblock);
WARN_ON_ONCE(ret);
 
/* update max_pfn, max_low_pfn and high_memory */
-   update_end_of_memory_vars(start, size);
+   update_end_of_memory_vars(start_pfn << PAGE_SHIFT,
+ nr_pages << PAGE_SHIFT);
 
return ret;
 }
+
+int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock)
+{
+   unsigned long start_pfn = start >> PAGE_SHIFT;
+   unsigned long nr_pages = size >> PAGE_SHIFT;
+
+   init_memory_mapping(start, start + size);
+
+   return add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
 EXPORT_SYMBOL_GPL(arch_add_memory);
 
 #define PAGE_INUSE 0xFD
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 5e6e4cc36ff4..0995e1a2b458 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -133,6 +133,17 @@ extern int __remove_pages(struct zone *zone, unsigned long 
start_pfn,
 extern int __add_pages(int nid, unsigned long start_pfn,
unsigned long nr_pages, bool want_memblock);
 
+#ifndef CONFIG_ARCH_HAS_ADD_PAGES
+static inline int add_pages(int nid, unsigned long start_pfn,
+   unsigned long nr_pages, bool want_memblock)
+{
+   return __add_pages(nid, start_pfn, nr_pages, want_memblock);
+}
+#else /* ARCH_HAS_ADD_PAGES */
+int add_pages(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, bool want_memblock);
+#endif /* ARCH_HAS_ADD_PAGES */
+
 #ifdef CONFIG_NUMA
 extern int memory_add_physaddr_to_nid(u64 start);
 #else
-- 
2.13.4
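A caller that wants struct pages for a range without mapping it into the
kernel linear mapping (the ZONE_DEVICE case later in this series) would use
the new helper roughly like this; the wrapper and its resource handling are
an illustrative sketch, only add_pages() itself comes from this patch:

#include <linux/memory_hotplug.h>
#include <linux/ioport.h>

static int dummy_hotplug_struct_pages(int nid, struct resource *res)
{
	/* allocates and initializes struct pages only; no kernel mapping */
	return add_pages(nid, res->start >> PAGE_SHIFT,
			 resource_size(res) >> PAGE_SHIFT, false);
}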



[HMM-v25 19/19] mm/hmm: add new helper to hotplug CDM memory region v3

2017-08-16 Thread Jérôme Glisse
Unlike unaddressable memory, coherent device memory has a real
resource associated with it on the system (as the CPU can address
it). Add a new helper to hotplug such memory within the HMM
framework.

Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host

Signed-off-by: Jérôme Glisse 
Reviewed-by: Balbir Singh 
---
 include/linux/hmm.h |  3 ++
 mm/hmm.c| 88 ++---
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 79e63178fd87..5866f3194c26 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -443,6 +443,9 @@ struct hmm_devmem {
 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
  struct device *device,
  unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res);
 void hmm_devmem_remove(struct hmm_devmem *devmem);
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 1a1e79d390c1..3faa4d40295e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -854,7 +854,11 @@ static void hmm_devmem_release(struct device *dev, void 
*data)
zone = page_zone(page);
 
mem_hotplug_begin();
-   __remove_pages(zone, start_pfn, npages);
+   if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+   __remove_pages(zone, start_pfn, npages);
+   else
+   arch_remove_memory(start_pfn << PAGE_SHIFT,
+  npages << PAGE_SHIFT);
mem_hotplug_done();
 
hmm_devmem_radix_release(resource);
@@ -890,7 +894,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
if (is_ram == REGION_INTERSECTS)
return -ENXIO;
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+   else
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
devmem->pagemap.res = devmem->resource;
devmem->pagemap.page_fault = hmm_devmem_fault;
devmem->pagemap.page_free = hmm_devmem_free;
@@ -935,9 +943,15 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
 * over the device memory is un-accessible thus we do not want to
 * create a linear mapping for the memory like arch_add_memory()
 * would do.
+*
+* For device public memory, which is accesible by the CPU, we do
+* want the linear mapping and thus use arch_add_memory().
 */
-   ret = add_pages(nid, align_start >> PAGE_SHIFT,
-   align_size >> PAGE_SHIFT, false);
+   if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+   ret = arch_add_memory(nid, align_start, align_size, false);
+   else
+   ret = add_pages(nid, align_start >> PAGE_SHIFT,
+   align_size >> PAGE_SHIFT, false);
if (ret) {
mem_hotplug_done();
goto error_add_memory;
@@ -1084,6 +1098,67 @@ struct hmm_devmem *hmm_devmem_add(const struct 
hmm_devmem_ops *ops,
 }
 EXPORT_SYMBOL(hmm_devmem_add);
 
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res)
+{
+   struct hmm_devmem *devmem;
+   int ret;
+
+   if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   return ERR_PTR(-EINVAL);
+
+   static_branch_enable(&device_private_key);
+
+   devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+  GFP_KERNEL, dev_to_node(device));
+   if (!devmem)
+   return ERR_PTR(-ENOMEM);
+
+   init_completion(&devmem->completion);
+   devmem->pfn_first = -1UL;
+   devmem->pfn_last = -1UL;
+   devmem->resource = res;
+   devmem->device = device;
+   devmem->ops = ops;
+
+   ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+   if (ret)
+   goto error_percpu_ref;
+
+   ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+   if (ret)
+   goto error_devm_add_action;
+
+
+   devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+   devmem->pfn_last = devmem->pfn_first +
+  (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+   ret = hmm_devmem_pages_create(devmem);
+   if (ret)
+   goto error_devm_add_action;
+
+   devres_add(device, devmem);
+
+   ret = devm_add_action(device, hmm_devmem_ref_kill, >re

[HMM-v25 17/19] mm/migrate: allow migrate_vma() to alloc new page on empty entry v4

2017-08-16 Thread Jérôme Glisse
This allows callers of migrate_vma() to allocate a new page for an
empty CPU page table entry (pte_none or backed by the zero page).
This is only for anonymous memory and no new page will be instantiated
if userfaultfd is armed.

This is useful for device drivers that want to migrate a range of
virtual addresses and would rather allocate new memory than have to
fault later on.

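As an illustrative sketch only (not part of this patch, and assuming
the migrate_vma() callback signatures of this series), a driver's
alloc_and_copy() callback can recognize such an empty entry by a source
entry that has MIGRATE_PFN_MIGRATE set but no source page, and simply
provide a freshly allocated destination page; my_alloc_device_page()
and my_copy_page() are hypothetical driver helpers:

	static void my_alloc_and_copy(struct vm_area_struct *vma,
				      const unsigned long *src,
				      unsigned long *dst,
				      unsigned long start,
				      unsigned long end,
				      void *private)
	{
		unsigned long i, npages = (end - start) >> PAGE_SHIFT;

		for (i = 0; i < npages; i++) {
			struct page *spage = migrate_pfn_to_page(src[i]);
			struct page *dpage;

			if (!(src[i] & MIGRATE_PFN_MIGRATE))
				continue;

			dpage = my_alloc_device_page(private);
			if (!dpage)
				continue;

			lock_page(dpage);
			if (spage)
				my_copy_page(dpage, spage, private);
			/* else: empty CPU entry, nothing to copy */

			dst[i] = migrate_pfn(page_to_pfn(dpage)) |
				 MIGRATE_PFN_LOCKED;
		}
	}

Whether the migration actually happened must still be checked in the
finalize_and_map() callback, as for regular migration.
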
Changed since v3:
  - support zero pfn entry
  - improve commit message
Changed since v2:
  - differentiate between empty CPU page table entry and non empty
  - improve code comments explaining how this works
Changed since v1:
  - 5 level page table fix

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
---
 include/linux/migrate.h |   9 +++
 mm/migrate.c| 205 +---
 2 files changed, 205 insertions(+), 9 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 8dc8f0a3f1af..d4e6d12a0b40 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -218,6 +218,15 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
  * driver should avoid setting MIGRATE_PFN_ERROR unless it is really in an
  * unrecoverable state.
  *
+ * For empty entry inside CPU page table (pte_none() or pmd_none() is true) we
+ * do set MIGRATE_PFN_MIGRATE flag inside the corresponding source array thus
+ * allowing device driver to allocate device memory for those unback virtual
+ * address. For this the device driver simply have to allocate device memory
+ * and properly set the destination entry like for regular migration. Note that
+ * this can still fails and thus inside the device driver must check if the
+ * migration was successful for those entry inside the finalize_and_map()
+ * callback just like for regular migration.
+ *
  * THE alloc_and_copy() CALLBACK MUST NOT CHANGE ANY OF THE SRC ARRAY ENTRIES
  * OR BAD THINGS WILL HAPPEN !
  *
diff --git a/mm/migrate.c b/mm/migrate.c
index 6c8a9826da32..5500e3738354 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2152,6 +2153,22 @@ static int migrate_vma_collect_hole(unsigned long start,
unsigned long addr;
 
for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+   migrate->src[migrate->npages++] = MIGRATE_PFN_MIGRATE;
+   migrate->dst[migrate->npages] = 0;
+   migrate->cpages++;
+   }
+
+   return 0;
+}
+
+static int migrate_vma_collect_skip(unsigned long start,
+   unsigned long end,
+   struct mm_walk *walk)
+{
+   struct migrate_vma *migrate = walk->private;
+   unsigned long addr;
+
+   for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2189,7 +2206,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
spin_unlock(ptl);
split_huge_pmd(vma, pmdp, addr);
if (pmd_trans_unstable(pmdp))
-   return migrate_vma_collect_hole(start, end,
+   return migrate_vma_collect_skip(start, end,
walk);
} else {
int ret;
@@ -2197,19 +2214,22 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
get_page(page);
spin_unlock(ptl);
if (unlikely(!trylock_page(page)))
-   return migrate_vma_collect_hole(start, end,
+   return migrate_vma_collect_skip(start, end,
walk);
ret = split_huge_page(page);
unlock_page(page);
put_page(page);
-   if (ret || pmd_none(*pmdp))
+   if (ret)
+   return migrate_vma_collect_skip(start, end,
+   walk);
+   if (pmd_none(*pmdp))
return migrate_vma_collect_hole(start, end,
walk);
}
}
 
if (unlikely(pmd_bad(*pmdp)))
-   return migrate_vma_collect_hole(start, end, walk);
+   return migrate_vma_collect_skip(start, end, walk);
 
	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -2224,7 +2244,9 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
pfn = pte_pfn(pte);
 
if (pte_none(pte)) {
-   mpfn = pfn = 0;

[HMM-v25 18/19] mm/device-public-memory: device memory cache coherent with CPU v5

2017-08-16 Thread Jérôme Glisse
Platforms with an advanced system bus (like CAPI or CCIX) allow device
memory to be accessed by the CPU in a cache coherent fashion. Add a new
type of ZONE_DEVICE to represent such memory. The use cases are the
same as for un-addressable device memory, but without all the corner
cases.

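For illustration only (not part of this patch), callers can tell the
two ZONE_DEVICE flavours apart with the helpers this series provides;
the wrapper below is a hypothetical example:

	/* hypothetical helper: true for any HMM managed ZONE_DEVICE page */
	static inline bool my_is_hmm_page(const struct page *page)
	{
		return is_device_private_page(page) ||	/* not CPU addressable */
		       is_device_public_page(page);	/* CPU cache coherent */
	}

Unlike private pages, public pages are mapped by the CPU but stay
hidden from vm_normal_page() by default; they are only returned when a
caller explicitly asks via _vm_normal_page(..., true), hence the new
call sites below.
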
Changed since v4:
  - added memory cgroup change to this patch
Changed since v3:
  - s/public/public (going back)
Changed since v2:
  - s/public/public
  - add proper include in migrate.c and drop useless #if/#endif
Changed since v1:
  - Kconfig and #if/#else cleanup

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Aneesh Kumar <aneesh.ku...@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 fs/proc/task_mmu.c   |  2 +-
 include/linux/hmm.h  |  4 ++--
 include/linux/ioport.h   |  1 +
 include/linux/memremap.h | 21 ++
 include/linux/mm.h   | 20 ++---
 kernel/memremap.c|  8 +++
 mm/Kconfig   | 11 ++
 mm/gup.c |  7 ++
 mm/hmm.c |  4 ++--
 mm/madvise.c |  2 +-
 mm/memcontrol.c  | 12 +-
 mm/memory.c  | 46 +-
 mm/migrate.c | 57 
 mm/swap.c| 11 ++
 14 files changed, 159 insertions(+), 47 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e3ba93b35925..d3a57b43f9f1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1189,7 +1189,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
-   page = vm_normal_page(vma, addr, pte);
+   page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index aeba696a4385..79e63178fd87 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -494,7 +494,7 @@ struct hmm_device {
  */
 struct hmm_device *hmm_device_new(void *drvdata);
 void hmm_device_put(struct hmm_device *hmm_device);
-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 3a4f69137bc2..f5cf32e80041 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -131,6 +131,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY= 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
+   IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 8aa6b82679e2..f8ee1c73ad2d 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  *
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_PUBLIC:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is use on platform that have an advance system bus (like CAPI or CCIX). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allow to pin such memory so that it can always be evicted.
  */
 enum memory_type {
MEMORY_DEVICE_HOST = 0,
MEMORY_DEVICE_PRIVATE,
+   MEMORY_DEVICE_PUBLIC,
 };
 
 /*
@@ -92,6 +100,8 @@ enum memory_type {
  * The page_free() callback is called once the page refcount reaches 1
  * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
  * This allows the device driver to implement its own memory management.)
+ *
+ * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter.
  */
 typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
unsigned long addr,
@@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct 
page *page)
return is_zone_device_page(page) &&
page->pgmap->ty

[HMM-v25 00/19] HMM (Heterogeneous Memory Management) v25

2017-08-16 Thread Jérôme Glisse
/2017/3/16/596
v19 https://lkml.org/lkml/2017/4/5/831
v20 https://lwn.net/Articles/720715/
v21 https://lkml.org/lkml/2017/4/24/747
v22 http://lkml.iu.edu/hypermail/linux/kernel/1705.2/05176.html
v23 https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1404788.html
v24 https://lwn.net/Articles/726691/

Jérôme Glisse (18):
  hmm: heterogeneous memory management documentation v3
  mm/hmm: heterogeneous memory management (HMM for short) v5
  mm/hmm/mirror: mirror process address space on device with HMM helpers
v3
  mm/hmm/mirror: helper to snapshot CPU page table v4
  mm/hmm/mirror: device page fault handler
  mm/ZONE_DEVICE: new type of ZONE_DEVICE for unaddressable memory v5
  mm/ZONE_DEVICE: special case put_page() for device private pages v4
  mm/memcontrol: allow to uncharge page without using page->lru field
  mm/memcontrol: support MEMORY_DEVICE_PRIVATE v4
  mm/hmm/devmem: device memory hotplug using ZONE_DEVICE v7
  mm/hmm/devmem: dummy HMM device for ZONE_DEVICE memory v3
  mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY
  mm/migrate: new memory migration helper for use with device memory v5
  mm/migrate: migrate_vma() unmap page from vma while collecting pages
  mm/migrate: support un-addressable ZONE_DEVICE page in migration v3
  mm/migrate: allow migrate_vma() to alloc new page on empty entry v4
  mm/device-public-memory: device memory cache coherent with CPU v5
  mm/hmm: add new helper to hotplug CDM memory region v3

Michal Hocko (1):
  mm/memory_hotplug: introduce add_pages

 Documentation/vm/hmm.txt   |  384 
 MAINTAINERS|7 +
 arch/x86/Kconfig   |4 +
 arch/x86/mm/init_64.c  |   22 +-
 fs/aio.c   |8 +
 fs/f2fs/data.c |5 +-
 fs/hugetlbfs/inode.c   |5 +-
 fs/proc/task_mmu.c |9 +-
 fs/ubifs/file.c|5 +-
 include/linux/hmm.h|  518 
 include/linux/ioport.h |2 +
 include/linux/memory_hotplug.h |   11 +
 include/linux/memremap.h   |  107 
 include/linux/migrate.h|  124 
 include/linux/migrate_mode.h   |5 +
 include/linux/mm.h |   33 +-
 include/linux/mm_types.h   |6 +
 include/linux/swap.h   |   24 +-
 include/linux/swapops.h|   68 +++
 kernel/fork.c  |3 +
 kernel/memremap.c  |   60 +-
 mm/Kconfig |   47 +-
 mm/Makefile|2 +-
 mm/balloon_compaction.c|8 +
 mm/gup.c   |7 +
 mm/hmm.c   | 1273 
 mm/madvise.c   |2 +-
 mm/memcontrol.c|  222 ---
 mm/memory.c|  107 +++-
 mm/memory_hotplug.c|   10 +-
 mm/migrate.c   |  928 -
 mm/mprotect.c  |   14 +
 mm/page_vma_mapped.c   |   10 +
 mm/rmap.c  |   25 +
 mm/swap.c  |   11 +
 mm/zsmalloc.c  |8 +
 36 files changed, 3964 insertions(+), 120 deletions(-)
 create mode 100644 Documentation/vm/hmm.txt
 create mode 100644 include/linux/hmm.h
 create mode 100644 mm/hmm.c

-- 
2.13.4



[HMM-v25 13/19] mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY

2017-08-16 Thread Jérôme Glisse
Introduce a new migration mode that allows offloading the copy to a
device DMA engine. This changes the migration workflow, and not every
address_space migratepage callback can support it, so the mode needs
to be tested for in those callbacks.

This is intended to be used by migrate_vma(), which itself is used for
things like HMM (see include/linux/hmm.h).

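As a sketch of the resulting pattern (illustrative only, and assuming
the migrate_page_move_mapping() signature of this kernel), a
migratepage callback that supports the new mode moves the mapping and
the page state but leaves the actual data copy to the caller:

	static int my_migratepage(struct address_space *mapping,
				  struct page *newpage, struct page *page,
				  enum migrate_mode mode)
	{
		int rc;

		rc = migrate_page_move_mapping(mapping, newpage, page,
					       NULL, mode, 0);
		if (rc != MIGRATEPAGE_SUCCESS)
			return rc;

		if (mode != MIGRATE_SYNC_NO_COPY)
			migrate_page_copy(newpage, page);	/* copies data + state */
		else
			migrate_page_states(newpage, page);	/* state only */

		return MIGRATEPAGE_SUCCESS;
	}

Callbacks that cannot follow this split (like aio below) simply return
-EINVAL for MIGRATE_SYNC_NO_COPY.
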
Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
---
 fs/aio.c |  8 +++
 fs/f2fs/data.c   |  5 -
 fs/hugetlbfs/inode.c |  5 -
 fs/ubifs/file.c  |  5 -
 include/linux/migrate.h  |  5 +
 include/linux/migrate_mode.h |  5 +
 mm/balloon_compaction.c  |  8 +++
 mm/migrate.c | 52 ++--
 mm/zsmalloc.c|  8 +++
 9 files changed, 86 insertions(+), 15 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 8f0127526299..b5d69f28d8b1 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, 
struct page *new,
pgoff_t idx;
int rc;
 
+   /*
+* We cannot support the _NO_COPY case here, because copy needs to
+* happen under the ctx->completion_lock. That does not work with the
+* migration workflow of MIGRATE_SYNC_NO_COPY.
+*/
+   if (mode == MIGRATE_SYNC_NO_COPY)
+   return -EINVAL;
+
rc = 0;
 
/* mapping->private_lock here protects against the kioctx teardown.  */
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index aefc2a5745d3..cf7930daa10e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2257,7 +2257,10 @@ int f2fs_migrate_page(struct address_space *mapping,
SetPagePrivate(newpage);
set_page_private(newpage, page_private(page));
 
-   migrate_page_copy(newpage, page);
+   if (mode != MIGRATE_SYNC_NO_COPY)
+   migrate_page_copy(newpage, page);
+   else
+   migrate_page_states(newpage, page);
 
return MIGRATEPAGE_SUCCESS;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 33961b35007b..59073e9f01a4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -830,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space 
*mapping,
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
-   migrate_page_copy(newpage, page);
+   if (mode != MIGRATE_SYNC_NO_COPY)
+   migrate_page_copy(newpage, page);
+   else
+   migrate_page_states(newpage, page);
 
return MIGRATEPAGE_SUCCESS;
 }
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f90a466ea5db..a02aa59d1e24 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space 
*mapping,
SetPagePrivate(newpage);
}
 
-   migrate_page_copy(newpage, page);
+   if (mode != MIGRATE_SYNC_NO_COPY)
+   migrate_page_copy(newpage, page);
+   else
+   migrate_page_states(newpage, page);
return MIGRATEPAGE_SUCCESS;
 }
 #endif
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index ce15989521a1..7db4c812a2a6 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -72,6 +72,7 @@ extern void putback_movable_page(struct page *page);
 
 extern int migrate_prep(void);
 extern int migrate_prep_local(void);
+extern void migrate_page_states(struct page *newpage, struct page *page);
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
  struct page *newpage, struct page *page);
@@ -92,6 +93,10 @@ static inline int isolate_movable_page(struct page *page, 
isolate_mode_t mode)
 static inline int migrate_prep(void) { return -ENOSYS; }
 static inline int migrate_prep_local(void) { return -ENOSYS; }
 
+static inline void migrate_page_states(struct page *newpage, struct page *page)
+{
+}
+
 static inline void migrate_page_copy(struct page *newpage,
 struct page *page) {}
 
diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h
index ebf3d89a3919..bdf66af9b937 100644
--- a/include/linux/migrate_mode.h
+++ b/include/linux/migrate_mode.h
@@ -6,11 +6,16 @@
  * on most operations but not ->writepage as the potential stall time
  * is too significant
  * MIGRATE_SYNC will block when migrating pages
+ * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages
+ * with the CPU. Instead, page copy happens outside the migratepage()
+ * callback and is likely using a DMA engine. See migrate_vma() and HMM
+ * (mm/hmm.c) for users of this mode.
  */
 enum migrate_mode {
MIGRATE_ASYNC,
MIGRATE_SYNC_LIGHT,
MIGRATE_SYNC,
+   MIGRATE_SYNC_NO_COPY

[HMM-v25 10/19] mm/memcontrol: support MEMORY_DEVICE_PRIVATE v4

2017-08-16 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus need special handling when it comes to the lru or the refcount.
This patch makes sure that memcontrol properly handles such pages when
it encounters them. Those pages are used like regular pages in a
process address space, either as anonymous pages or as file backed
pages, so from the memcg point of view we want to handle them like
regular pages, at least for now.

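The refcount part is the subtle one: ZONE_DEVICE pages are considered
free once their refcount drops to 1, not 0. A minimal sketch of what
"take a reference only if the page is still in use" means for such a
page (hypothetical helper, mirroring the mc_handle_swap_pte() change
below):

	static struct page *my_try_get_device_page(struct page *page)
	{
		/* refcount 1 means the page is free and owned by the driver */
		if (!page_ref_add_unless(page, 1, 1))
			return NULL;
		return page;
	}
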
Changed since v3:
  - remove public support and move those chunk to separate patch
Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host
  - add comments explaining how device memory behave and why

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Acked-by: Balbir Singh <bsinghar...@gmail.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Vladimir Davydov <vdavydov@gmail.com>
Cc: cgro...@vger.kernel.org
---
 kernel/memremap.c |  1 +
 mm/memcontrol.c   | 52 
 2 files changed, 49 insertions(+), 4 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 398630c1fba3..f42d7483e886 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -492,6 +492,7 @@ void put_zone_device_private_page(struct page *page)
__ClearPageWaiters(page);
 
page->mapping = NULL;
+   mem_cgroup_uncharge(page);
 
page->pgmap->page_free(page, page->pgmap->data);
} else if (!count)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 604fb3ca8028..977d1cf3493a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4407,12 +4407,13 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+   MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
 {
-   struct page *page = vm_normal_page(vma, addr, ptent);
+   struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
if (!page || !page_mapped(page))
return NULL;
@@ -4429,7 +4430,7 @@ static struct page *mc_handle_present_pte(struct 
vm_area_struct *vma,
return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
pte_t ptent, swp_entry_t *entry)
 {
@@ -4438,6 +4439,23 @@ static struct page *mc_handle_swap_pte(struct 
vm_area_struct *vma,
 
if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
+
+   /*
+* Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
+* a device and because they are not accessible by CPU they are store
+* as special swap entry in the CPU page table.
+*/
+   if (is_device_private_entry(ent)) {
+   page = device_private_entry_to_page(ent);
+   /*
+* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
+* a refcount of 1 when free (unlike normal page)
+*/
+   if (!page_ref_add_unless(page, 1, 1))
+   return NULL;
+   return page;
+   }
+
/*
 * Because lookup_swap_cache() updates some statistics counter,
 * we call find_get_page() with swapper_space directly.
@@ -4598,6 +4616,12 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is 
MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru). For now we such page is
+ * charge like a regular page would be as for all intent and purposes it is
+ * just special memory taking the place of a regular page.
+ *
+ * See Documentations/vm/hmm.txt and include/linux/hmm.h
  *
  * Called with pte lock held.
  */
@@ -4626,6 +4650,8 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+   if (is_device_private_page(page))
+   ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
}
@@ -4693,6 +4719,11 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t 
*pmd,
 
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
+   /*
+* Note their can not be MC_TARGET_DEVICE for now as we do not
+* support transparent huge page with MEMORY_DEVICE_PUBLIC or
+* MEMORY_DEVICE_PRIVATE but this might change.
+*/
  

[PATCH 2/6] mm/device-public-memory: device memory cache coherent with CPU v4

2017-07-13 Thread Jérôme Glisse
Platforms with an advanced system bus (like CAPI or CCIX) allow device
memory to be accessed by the CPU in a cache coherent fashion. Add a new
type of ZONE_DEVICE to represent such memory. The use cases are the
same as for un-addressable device memory, but without all the corner
cases.

Changed since v3:
  - s/public/public (going back)
Changed since v2:
  - s/public/public
  - add proper include in migrate.c and drop useless #if/#endif
Changed since v1:
  - Kconfig and #if/#else cleanup

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Balbir Singh <balb...@au1.ibm.com>
Cc: Aneesh Kumar <aneesh.ku...@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 fs/proc/task_mmu.c   |  2 +-
 include/linux/hmm.h  |  4 ++--
 include/linux/ioport.h   |  1 +
 include/linux/memremap.h | 21 ++
 include/linux/mm.h   | 20 ++---
 kernel/memremap.c| 15 -
 mm/Kconfig   | 11 ++
 mm/gup.c |  7 ++
 mm/hmm.c |  4 ++--
 mm/madvise.c |  2 +-
 mm/memory.c  | 46 +-
 mm/migrate.c | 57 ++--
 mm/swap.c| 11 ++
 13 files changed, 156 insertions(+), 45 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 957b6ea80d5f..1f38f2c7cc34 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1182,7 +1182,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
-   page = vm_normal_page(vma, addr, pte);
+   page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 458d0d6d82f3..a40288309fd2 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_PUBLIC)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -443,7 +443,7 @@ struct hmm_device {
  */
 struct hmm_device *hmm_device_new(void *drvdata);
 void hmm_device_put(struct hmm_device *hmm_device);
-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
 
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 3a4f69137bc2..f5cf32e80041 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -131,6 +131,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY= 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
+   IORES_DESC_DEVICE_PUBLIC_MEMORY = 7,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index ae5ff92f72b4..c7b4c75ae3f8 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  *
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_PUBLIC:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is use on platform that have an advance system bus (like CAPI or CCIX). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allow to pin such memory so that it can always be evicted.
  */
 enum memory_type {
MEMORY_DEVICE_HOST = 0,
MEMORY_DEVICE_PRIVATE,
+   MEMORY_DEVICE_PUBLIC,
 };
 
 /*
@@ -92,6 +100,8 @@ enum memory_type {
  * The page_free() callback is called once the page refcount reaches 1
  * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
  * This allows the device driver to implement its own memory management.)
+ *
+ * For MEMORY_DEVICE_PUBLIC only the page_free() callback matter.
  */
 typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
unsigned long addr,
@@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct 
page *page)
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
+
+static inline bool is_de

[PATCH 1/6] mm/zone-device: rename DEVICE_PUBLIC to DEVICE_HOST

2017-07-13 Thread Jérôme Glisse
Existing users of ZONE_DEVICE in its DEVICE_PUBLIC variant are not tied
to a specific device and behave more like host memory. This patch
renames DEVICE_PUBLIC to DEVICE_HOST and frees the name DEVICE_PUBLIC
to be used for cache coherent device memory that has a strong tie with
the device on which the memory is (for instance on-board GPU memory).

There is no functional change here.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 include/linux/memremap.h | 4 ++--
 kernel/memremap.c| 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 57546a07a558..ae5ff92f72b4 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,7 +41,7 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  * Specialize ZONE_DEVICE memory into multiple types each having differents
  * usage.
  *
- * MEMORY_DEVICE_PUBLIC:
+ * MEMORY_DEVICE_HOST:
  * Persistent device memory (pmem): struct page might be allocated in different
  * memory and architecture might want to perform special actions. It is similar
  * to regular memory, in that the CPU can access it transparently. However,
@@ -59,7 +59,7 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
  */
 enum memory_type {
-   MEMORY_DEVICE_PUBLIC = 0,
+   MEMORY_DEVICE_HOST = 0,
MEMORY_DEVICE_PRIVATE,
 };
 
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b9baa6c07918..4e07525aa273 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct 
resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
-   pgmap->type = MEMORY_DEVICE_PUBLIC;
+   pgmap->type = MEMORY_DEVICE_HOST;
pgmap->page_fault = NULL;
pgmap->page_free = NULL;
pgmap->data = NULL;
-- 
2.13.0
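
After this rename, code that needs to identify host-like ZONE_DEVICE memory
(pmem and friends) simply spells the new name. A sketch of such a check,
assuming only the renamed enum above; my_is_host_devmem() is a hypothetical
helper mirroring the is_device_private_page() check that already exists:

/* Sketch: identify ZONE_DEVICE pages of the renamed MEMORY_DEVICE_HOST type. */
static bool my_is_host_devmem(const struct page *page)
{
        return is_zone_device_page(page) &&
               page->pgmap->type == MEMORY_DEVICE_HOST;
}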



[PATCH 1/6] mm/zone-device: rename DEVICE_PUBLIC to DEVICE_HOST

2017-07-13 Thread Jérôme Glisse
Existing users of ZONE_DEVICE in its DEVICE_PUBLIC variant are not tied
to a specific device and behave more like host memory. This patch renames
DEVICE_PUBLIC to DEVICE_HOST and frees the name DEVICE_PUBLIC to be used
for cache coherent device memory that has a strong tie with the device
on which the memory is (for instance on-board GPU memory).

There is no functional change here.

Signed-off-by: Jérôme Glisse 
Cc: Dan Williams 
Cc: Ross Zwisler 
---
 include/linux/memremap.h | 4 ++--
 kernel/memremap.c| 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 57546a07a558..ae5ff92f72b4 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -41,7 +41,7 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  * Specialize ZONE_DEVICE memory into multiple types each having differents
  * usage.
  *
- * MEMORY_DEVICE_PUBLIC:
+ * MEMORY_DEVICE_HOST:
  * Persistent device memory (pmem): struct page might be allocated in different
  * memory and architecture might want to perform special actions. It is similar
  * to regular memory, in that the CPU can access it transparently. However,
@@ -59,7 +59,7 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
  */
 enum memory_type {
-   MEMORY_DEVICE_PUBLIC = 0,
+   MEMORY_DEVICE_HOST = 0,
MEMORY_DEVICE_PRIVATE,
 };
 
diff --git a/kernel/memremap.c b/kernel/memremap.c
index b9baa6c07918..4e07525aa273 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct 
resource *res,
}
pgmap->ref = ref;
pgmap->res = &page_map->res;
-   pgmap->type = MEMORY_DEVICE_PUBLIC;
+   pgmap->type = MEMORY_DEVICE_HOST;
pgmap->page_fault = NULL;
pgmap->page_free = NULL;
pgmap->data = NULL;
-- 
2.13.0



[PATCH 3/6] mm/hmm: add new helper to hotplug CDM memory region v3

2017-07-13 Thread Jérôme Glisse
Unlike unaddressable memory, coherent device memory has a real
resource associated with it on the system (as the CPU can address
it). Add a new helper to hotplug such memory within the HMM
framework.

Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Reviewed-by: Balbir Singh <bsinghar...@gmail.com>
---
 include/linux/hmm.h |  3 ++
 mm/hmm.c| 85 +
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index a40288309fd2..e44cb8edb137 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -392,6 +392,9 @@ struct hmm_devmem {
 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
  struct device *device,
  unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res);
 void hmm_devmem_remove(struct hmm_devmem *devmem);
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index eadf70829c34..28e54e3b4e1d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -849,7 +849,11 @@ static void hmm_devmem_release(struct device *dev, void 
*data)
zone = page_zone(page);
 
mem_hotplug_begin();
-   __remove_pages(zone, start_pfn, npages);
+   if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+   __remove_pages(zone, start_pfn, npages);
+   else
+   arch_remove_memory(start_pfn << PAGE_SHIFT,
+  npages << PAGE_SHIFT);
mem_hotplug_done();
 
hmm_devmem_radix_release(resource);
@@ -885,7 +889,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
if (is_ram == REGION_INTERSECTS)
return -ENXIO;
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+   else
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
devmem->pagemap.res = devmem->resource;
devmem->pagemap.page_fault = hmm_devmem_fault;
devmem->pagemap.page_free = hmm_devmem_free;
@@ -924,8 +932,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
nid = numa_mem_id();
 
mem_hotplug_begin();
-   ret = add_pages(nid, align_start >> PAGE_SHIFT,
-   align_size >> PAGE_SHIFT, false);
+   if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+   ret = arch_add_memory(nid, align_start, align_size, false);
+   else
+   ret = add_pages(nid, align_start >> PAGE_SHIFT,
+   align_size >> PAGE_SHIFT, false);
if (ret) {
mem_hotplug_done();
goto error_add_memory;
@@ -1075,6 +1086,67 @@ struct hmm_devmem *hmm_devmem_add(const struct 
hmm_devmem_ops *ops,
 }
 EXPORT_SYMBOL(hmm_devmem_add);
 
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res)
+{
+   struct hmm_devmem *devmem;
+   int ret;
+
+   if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   return ERR_PTR(-EINVAL);
+
+   static_branch_enable(&device_private_key);
+
+   devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+  GFP_KERNEL, dev_to_node(device));
+   if (!devmem)
+   return ERR_PTR(-ENOMEM);
+
+   init_completion(&devmem->completion);
+   devmem->pfn_first = -1UL;
+   devmem->pfn_last = -1UL;
+   devmem->resource = res;
+   devmem->device = device;
+   devmem->ops = ops;
+
+   ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+   if (ret)
+   goto error_percpu_ref;
+
+   ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+   if (ret)
+   goto error_devm_add_action;
+
+
+   devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+   devmem->pfn_last = devmem->pfn_first +
+  (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+   ret = hmm_devmem_pages_create(devmem);
+   if (ret)
+   goto error_devm_add_action;
+
+   devres_add(device, devmem);
+
+   ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+   if (ret) {
+   hmm_devmem_remove(devmem);
+   return ERR_PTR(ret);
+   }
+
+   return devmem;
+
+error_devm_add_action:
+   hmm_devmem_ref_

[PATCH 3/6] mm/hmm: add new helper to hotplug CDM memory region v3

2017-07-13 Thread Jérôme Glisse
Unlike unaddressable memory, coherent device memory has a real
resource associated with it on the system (as the CPU can address
it). Add a new helper to hotplug such memory within the HMM
framework.

Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host

Signed-off-by: Jérôme Glisse 
Reviewed-by: Balbir Singh 
---
 include/linux/hmm.h |  3 ++
 mm/hmm.c| 85 +
 2 files changed, 83 insertions(+), 5 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index a40288309fd2..e44cb8edb137 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -392,6 +392,9 @@ struct hmm_devmem {
 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
  struct device *device,
  unsigned long size);
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res);
 void hmm_devmem_remove(struct hmm_devmem *devmem);
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index eadf70829c34..28e54e3b4e1d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -849,7 +849,11 @@ static void hmm_devmem_release(struct device *dev, void 
*data)
zone = page_zone(page);
 
mem_hotplug_begin();
-   __remove_pages(zone, start_pfn, npages);
+   if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
+   __remove_pages(zone, start_pfn, npages);
+   else
+   arch_remove_memory(start_pfn << PAGE_SHIFT,
+  npages << PAGE_SHIFT);
mem_hotplug_done();
 
hmm_devmem_radix_release(resource);
@@ -885,7 +889,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
if (is_ram == REGION_INTERSECTS)
return -ENXIO;
 
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
+   else
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+
devmem->pagemap.res = devmem->resource;
devmem->pagemap.page_fault = hmm_devmem_fault;
devmem->pagemap.page_free = hmm_devmem_free;
@@ -924,8 +932,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem 
*devmem)
nid = numa_mem_id();
 
mem_hotplug_begin();
-   ret = add_pages(nid, align_start >> PAGE_SHIFT,
-   align_size >> PAGE_SHIFT, false);
+   if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
+   ret = arch_add_memory(nid, align_start, align_size, false);
+   else
+   ret = add_pages(nid, align_start >> PAGE_SHIFT,
+   align_size >> PAGE_SHIFT, false);
if (ret) {
mem_hotplug_done();
goto error_add_memory;
@@ -1075,6 +1086,67 @@ struct hmm_devmem *hmm_devmem_add(const struct 
hmm_devmem_ops *ops,
 }
 EXPORT_SYMBOL(hmm_devmem_add);
 
+struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
+  struct device *device,
+  struct resource *res)
+{
+   struct hmm_devmem *devmem;
+   int ret;
+
+   if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
+   return ERR_PTR(-EINVAL);
+
+   static_branch_enable(&device_private_key);
+
+   devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
+  GFP_KERNEL, dev_to_node(device));
+   if (!devmem)
+   return ERR_PTR(-ENOMEM);
+
+   init_completion(&devmem->completion);
+   devmem->pfn_first = -1UL;
+   devmem->pfn_last = -1UL;
+   devmem->resource = res;
+   devmem->device = device;
+   devmem->ops = ops;
+
+   ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
+ 0, GFP_KERNEL);
+   if (ret)
+   goto error_percpu_ref;
+
+   ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
+   if (ret)
+   goto error_devm_add_action;
+
+
+   devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
+   devmem->pfn_last = devmem->pfn_first +
+  (resource_size(devmem->resource) >> PAGE_SHIFT);
+
+   ret = hmm_devmem_pages_create(devmem);
+   if (ret)
+   goto error_devm_add_action;
+
+   devres_add(device, devmem);
+
+   ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
+   if (ret) {
+   hmm_devmem_remove(devmem);
+   return ERR_PTR(ret);
+   }
+
+   return devmem;
+
+error_devm_add_action:
+   hmm_devmem_ref_kill(&devmem->ref);
+   hmm_devmem_ref_exit(&devmem->ref);
+error_percpu_ref:
+
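
A driver with cache coherent (CDM) memory would consume the new helper roughly
as follows. This is a sketch only: it assumes the driver already owns a struct
resource whose desc is IORES_DESC_DEVICE_PUBLIC_MEMORY and has filled in a
struct hmm_devmem_ops with its callbacks; my_cdm_register() and its parameters
are hypothetical names, not part of the patch.

/* Sketch: hotplug a CDM range through the new HMM helper. */
static struct hmm_devmem *my_cdm_register(struct device *dev,
                                          struct resource *res,
                                          const struct hmm_devmem_ops *ops)
{
        struct hmm_devmem *devmem;

        /* hmm_devmem_add_resource() rejects any other resource descriptor. */
        if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
                return ERR_PTR(-EINVAL);

        devmem = hmm_devmem_add_resource(ops, dev, res);
        if (IS_ERR(devmem))
                dev_err(dev, "CDM hotplug failed: %ld\n", PTR_ERR(devmem));

        return devmem;
}

On success the pages covering the resource are ZONE_DEVICE pages of type
MEMORY_DEVICE_PUBLIC and can be the target of migration like any other memory.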

[PATCH 0/6] Cache coherent device memory (CDM) with HMM v5

2017-07-13 Thread Jérôme Glisse
Sorry, I made a horrible mistake on the names in v4; I completely
misunderstood the suggestion. So here I repost with proper naming.
This is the only change since v3. Again, sorry about the noise
with v4.

Changes since v4:
  - s/DEVICE_HOST/DEVICE_PUBLIC

Git tree:
https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-cdm-v5


Cache coherent device memory applies to architectures with a system bus
like CAPI or CCIX. Devices connected to such a system bus can expose
their memory to the system and allow cache coherent access to it
from the CPU.

Even if for all intents and purposes device memory behaves like regular
memory, we still want to manage it in isolation from regular memory.
There are several reasons for that. First and foremost, this memory is
less reliable than regular memory: if the device hangs because of invalid
commands, we can lose access to device memory. Second, CPU access to this
memory is expected to be slower than to regular memory. Third, having
random memory on the device means that some of the bus bandwidth would
not be available to the device but would be used by CPU accesses.

This is why we want to manage such memory in isolation from regular
memory. The kernel should not try to use this memory even as a last
resort when running out of memory, at least for now.

This patchset adds a new type of ZONE_DEVICE memory (DEVICE_PUBLIC)
that is used to represent CDM memory. It builds on top of the HMM
patchset, which already introduces a new type of ZONE_DEVICE memory
for private device memory (see the HMM patchset).

The end result is that with this patchset, if a device is in use in
a process, you might have private anonymous memory or file backed
page memory using ZONE_DEVICE (DEVICE_PUBLIC). Thus care must be
taken not to overwrite the lru fields of such pages.

Hence all core mm changes address the assumption that any process
memory is backed by a regular struct page that is part of the lru.
ZONE_DEVICE pages are not on the lru, and the lru pointers of struct
page are used to store device specific information.

Thus this patchset updates all code paths that would make assumptions
about the lruness of a process page.

patch 01 - rename DEVICE_PUBLIC to DEVICE_HOST to free DEVICE_PUBLIC name
patch 02 - add DEVICE_PUBLIC type to ZONE_DEVICE (all core mm changes)
patch 03 - add a helper to HMM for hotplug of CDM memory
patch 04 - preparatory patch for memory controller changes (memch)
patch 05 - update memory controller to properly handle
   ZONE_DEVICE pages when uncharging
patch 06 - documentation patch

Previous posting:
v1 https://lkml.org/lkml/2017/4/7/638
v2 https://lwn.net/Articles/725412/
v3 https://lwn.net/Articles/727114/
v4 https://lwn.net/Articles/727692/

Jérôme Glisse (6):
  mm/zone-device: rename DEVICE_PUBLIC to DEVICE_HOST
  mm/device-public-memory: device memory cache coherent with CPU v4
  mm/hmm: add new helper to hotplug CDM memory region v3
  mm/memcontrol: allow to uncharge page without using page->lru field
  mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC
v3
  mm/hmm: documents how device memory is accounted in rss and memcg

 Documentation/vm/hmm.txt |  40 
 fs/proc/task_mmu.c   |   2 +-
 include/linux/hmm.h  |   7 +-
 include/linux/ioport.h   |   1 +
 include/linux/memremap.h |  25 -
 include/linux/mm.h   |  20 ++--
 kernel/memremap.c|  19 ++--
 mm/Kconfig   |  11 +++
 mm/gup.c |   7 ++
 mm/hmm.c |  89 --
 mm/madvise.c |   2 +-
 mm/memcontrol.c  | 231 ++-
 mm/memory.c  |  46 +-
 mm/migrate.c |  57 +++-
 mm/swap.c|  11 +++
 15 files changed, 434 insertions(+), 134 deletions(-)

-- 
2.13.0
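
The core mm rule described in the cover letter (never touch page->lru of a
ZONE_DEVICE page, because those fields carry device specific data) boils down
to a guard like the sketch below; my_put_on_lru() is a hypothetical
illustration of the pattern, not one of the call sites changed by the series.

/* Sketch: ZONE_DEVICE pages must never be put on an lru list. */
static void my_put_on_lru(struct page *page, struct list_head *lru)
{
        if (is_zone_device_page(page))
                return;         /* page->lru holds device specific data */
        list_add_tail(&page->lru, lru);
}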



[PATCH 5/6] mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC v3

2017-07-13 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus need special handling when it comes to lru or refcount. This
patch makes sure that memcontrol handles such pages properly when it
encounters them. Those pages are used like regular pages in a process
address space, either as anonymous pages or as file backed pages. So
from the memcg point of view we want to handle them like regular
pages, for now at least.

Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host
  - add comments explaining how device memory behave and why

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Vladimir Davydov <vdavydov@gmail.com>
Cc: cgro...@vger.kernel.org
---
 kernel/memremap.c |  2 ++
 mm/memcontrol.c   | 63 ++-
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 25c098151ed2..4d74b4a4f8f5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -479,6 +479,8 @@ void put_zone_device_private_or_public_page(struct page 
*page)
__ClearPageActive(page);
__ClearPageWaiters(page);
 
+   mem_cgroup_uncharge(page);
+
page->pgmap->page_free(page, page->pgmap->data);
}
else if (!count)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c709fdceac13..858842a741bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4391,12 +4391,13 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+   MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
 {
-   struct page *page = vm_normal_page(vma, addr, ptent);
+   struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
if (!page || !page_mapped(page))
return NULL;
@@ -4407,13 +4408,20 @@ static struct page *mc_handle_present_pte(struct 
vm_area_struct *vma,
if (!(mc.flags & MOVE_FILE))
return NULL;
}
-   if (!get_page_unless_zero(page))
+   if (is_device_public_page(page)) {
+   /*
+* MEMORY_DEVICE_PUBLIC means ZONE_DEVICE page and which have a
+* refcount of 1 when free (unlike normal page)
+*/
+   if (!page_ref_add_unless(page, 1, 1))
+   return NULL;
+   } else if (!get_page_unless_zero(page))
return NULL;
 
return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
pte_t ptent, swp_entry_t *entry)
 {
@@ -4422,6 +4430,23 @@ static struct page *mc_handle_swap_pte(struct 
vm_area_struct *vma,
 
if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
+
+   /*
+* Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
+* a device and because they are not accessible by CPU they are store
+* as special swap entry in the CPU page table.
+*/
+   if (is_device_private_entry(ent)) {
+   page = device_private_entry_to_page(ent);
+   /*
+* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
+* a refcount of 1 when free (unlike normal page)
+*/
+   if (!page_ref_add_unless(page, 1, 1))
+   return NULL;
+   return page;
+   }
+
/*
 * Because lookup_swap_cache() updates some statistics counter,
 * we call find_get_page() with swapper_space directly.
@@ -4582,6 +4607,13 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PUBLIC
+ * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * For now we such page is charge like a regular page would be as for all
+ * intent and purposes it is just special memory taking the place of a
+ * regular page. See Documentations/vm/hmm.txt and include/linux/hmm.h for
+ * more informations on this type of memory how it is use and why it is
+ * charge like this.
  *
  * Called with pte lock held.
  */
@@ -4610,6 +4642,9 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+   if (is_device_private_page(page) ||
+   
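
The refcount handling in the hunks above follows a single pattern: ZONE_DEVICE
pages are considered free at refcount 1, not 0, so a plain
get_page_unless_zero() is not enough. A sketch of the "try to take a
reference" idiom used for device pages; my_try_get_device_page() is a
hypothetical wrapper around the call used in the patch:

/* Sketch: take a reference on a device page only if it is still in use. */
static struct page *my_try_get_device_page(struct page *page)
{
        /* Refuse if the refcount is 1, i.e. the page is already free. */
        if (!page_ref_add_unless(page, 1, 1))
                return NULL;
        return page;
}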

[PATCH 4/6] mm/memcontrol: allow to uncharge page without using page->lru field

2017-07-13 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus you can not use the page->lru fields of those pages. This patch
re-arranges the uncharge path to allow a single page to be uncharged
without modifying the lru field of its struct page.

There is no change to the memcontrol logic; it is the same as it was
before this patch.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Vladimir Davydov <vdavydov@gmail.com>
Cc: cgro...@vger.kernel.org
---
 mm/memcontrol.c | 168 +++-
 1 file changed, 92 insertions(+), 76 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..c709fdceac13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5509,48 +5509,102 @@ void mem_cgroup_cancel_charge(struct page *page, 
struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-  unsigned long nr_anon, unsigned long nr_file,
-  unsigned long nr_kmem, unsigned long nr_huge,
-  unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+   struct mem_cgroup *memcg;
+   unsigned long pgpgout;
+   unsigned long nr_anon;
+   unsigned long nr_file;
+   unsigned long nr_kmem;
+   unsigned long nr_huge;
+   unsigned long nr_shmem;
+   struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 {
-   unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+   memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+   unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
 
-   if (!mem_cgroup_is_root(memcg)) {
-   page_counter_uncharge(&memcg->memory, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg)) {
+   page_counter_uncharge(&ug->memcg->memory, nr_pages);
 if (do_memsw_account())
-   page_counter_uncharge(&memcg->memsw, nr_pages);
-   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-   page_counter_uncharge(&memcg->kmem, nr_kmem);
-   memcg_oom_recover(memcg);
+   page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+   page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+   memcg_oom_recover(ug->memcg);
}
 
local_irq_save(flags);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-   __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-   __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-   __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-   __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-   memcg_check_events(memcg, dummy_page);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+   __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+   __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+   __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+   memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
 
-   if (!mem_cgroup_is_root(memcg))
-   css_put_many(&memcg->css, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg))
+   css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+   VM_BUG_ON_PAGE(PageLRU(page), page);
+   VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+   if (!page->mem_cgroup)
+   return;
+
+   /*
+* Nobody should be changing or seriously looking at
+* page->mem_cgroup at this point, we have fully
+* exclusive access to the page.
+*/
+
+   if (ug->memcg != page->mem_cgroup) {
+   if (ug->memcg) {
+   uncharge_batch(ug);
+   uncharge_gather_clear(ug);
+   }
+   ug->memcg = page->mem_cgroup;
+   }
+
+   if (!PageKmemcg(page)) {
+   unsigned int nr_pages = 1;
+
+   if (PageTransHuge(page)) {
+   nr_pages <<= compound_order(page);
+ 
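
The reorganisation above turns uncharging into a small gather/flush state
machine. The diff hunk defines the helpers; callers compose them roughly as
sketched below (my_uncharge_one() is hypothetical and assumes the caller
already holds a reference on the page):

/* Sketch: uncharge a single page without ever touching page->lru. */
static void my_uncharge_one(struct page *page)
{
        struct uncharge_gather ug;

        uncharge_gather_clear(&ug);
        uncharge_page(page, &ug);
        if (ug.memcg)
                uncharge_batch(&ug);
}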

[PATCH 6/6] mm/hmm: documents how device memory is accounted in rss and memcg

2017-07-13 Thread Jérôme Glisse
For now we account device memory exactly like a regular page with
respect to rss counters and memory cgroups. We do this so that any
existing application that starts using device memory without knowing
about it will keep running unimpacted. This also simplifies the
migration code.

We will likely revisit this choice once we gain more experience with
how device memory is used and how it impacts overall memory resource
management. For now we believe this is a good enough choice.

Note that device memory can not be pinned, neither by a device driver
nor by GUP, thus device memory can always be freed and unaccounted
when a process exits.

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Michal Hocko <mho...@kernel.org>
---
 Documentation/vm/hmm.txt | 40 
 1 file changed, 40 insertions(+)

diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
index 192dcdb38bd1..4d3aac9f4a5d 100644
--- a/Documentation/vm/hmm.txt
+++ b/Documentation/vm/hmm.txt
@@ -15,6 +15,15 @@ section present the new migration helper that allow to 
leverage the device DMA
 engine.
 
 
+1) Problems of using device specific memory allocator:
+2) System bus, device memory characteristics
+3) Share address space and migration
+4) Address space mirroring implementation and API
+5) Represent and manage device memory from core kernel point of view
+6) Migrate to and from device memory
+7) Memory cgroup (memcg) and rss accounting
+
+
 ---
 
 1) Problems of using device specific memory allocator:
@@ -342,3 +351,34 @@ that happens then the finalize_and_map() can catch any 
pages that was not
 migrated. Note those page were still copied to new page and thus we wasted
 bandwidth but this is considered as a rare event and a price that we are
 willing to pay to keep all the code simpler.
+
+
+---
+
+7) Memory cgroup (memcg) and rss accounting
+
+For now device memory is accounted as any regular page in rss counters (either
+anonymous if device page is use for anonymous, file if device page is use for
+file back page or shmem if device page is use for share memory). This is a
+deliberate choice to keep existing application that might start using device
+memory without knowing about it to keep runing unimpacted.
+
+Drawbacks is that OOM killer might kill an application using a lot of device
+memory and not a lot of regular system memory and thus not freeing much system
+memory. We want to gather more real world experience on how application and
+system react under memory pressure in the presence of device memory before
+deciding to account device memory differently.
+
+
+Same decision was made for memory cgroup. Device memory page are accounted
+against same memory cgroup a regular page would be accounted to. This does
+simplify migration to and from device memory. This also means that migration
+back from device memory to regular memory can not fail because it would
+go above memory cgroup limit. We might revisit this choice latter on once we
+get more experience in how device memory is use and its impact on memory
+resource control.
+
+
+Note that device memory can never be pin nor by device driver nor through GUP
+and thus such memory is always free upon process exit. Or when last reference
+is drop in case of share memory or file back memory.
-- 
2.13.0



[PATCH 0/6] Cache coherent device memory (CDM) with HMM v5

2017-07-13 Thread Jérôme Glisse
Sorry, I made a horrible mistake on the names in v4; I completely
misunderstood the suggestion. So here I repost with proper naming.
This is the only change since v3. Again, sorry about the noise
with v4.

Changes since v4:
  - s/DEVICE_HOST/DEVICE_PUBLIC

Git tree:
https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-cdm-v5


Cache coherent device memory applies to architectures with a system bus
like CAPI or CCIX. Devices connected to such a system bus can expose
their memory to the system and allow cache coherent access to it
from the CPU.

Even if for all intents and purposes device memory behaves like regular
memory, we still want to manage it in isolation from regular memory.
There are several reasons for that. First and foremost, this memory is
less reliable than regular memory: if the device hangs because of invalid
commands, we can lose access to device memory. Second, CPU access to this
memory is expected to be slower than to regular memory. Third, having
random memory on the device means that some of the bus bandwidth would
not be available to the device but would be used by CPU accesses.

This is why we want to manage such memory in isolation from regular
memory. The kernel should not try to use this memory even as a last
resort when running out of memory, at least for now.

This patchset adds a new type of ZONE_DEVICE memory (DEVICE_PUBLIC)
that is used to represent CDM memory. It builds on top of the HMM
patchset, which already introduces a new type of ZONE_DEVICE memory
for private device memory (see the HMM patchset).

The end result is that with this patchset, if a device is in use in
a process, you might have private anonymous memory or file backed
page memory using ZONE_DEVICE (DEVICE_PUBLIC). Thus care must be
taken not to overwrite the lru fields of such pages.

Hence all core mm changes address the assumption that any process
memory is backed by a regular struct page that is part of the lru.
ZONE_DEVICE pages are not on the lru, and the lru pointers of struct
page are used to store device specific information.

Thus this patchset updates all code paths that would make assumptions
about the lruness of a process page.

patch 01 - rename DEVICE_PUBLIC to DEVICE_HOST to free DEVICE_PUBLIC name
patch 02 - add DEVICE_PUBLIC type to ZONE_DEVICE (all core mm changes)
patch 03 - add a helper to HMM for hotplug of CDM memory
patch 04 - preparatory patch for memory controller changes (memch)
patch 05 - update memory controller to properly handle
   ZONE_DEVICE pages when uncharging
patch 06 - documentation patch

Previous posting:
v1 https://lkml.org/lkml/2017/4/7/638
v2 https://lwn.net/Articles/725412/
v3 https://lwn.net/Articles/727114/
v4 https://lwn.net/Articles/727692/

Jérôme Glisse (6):
  mm/zone-device: rename DEVICE_PUBLIC to DEVICE_HOST
  mm/device-public-memory: device memory cache coherent with CPU v4
  mm/hmm: add new helper to hotplug CDM memory region v3
  mm/memcontrol: allow to uncharge page without using page->lru field
  mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC
v3
  mm/hmm: documents how device memory is accounted in rss and memcg

 Documentation/vm/hmm.txt |  40 
 fs/proc/task_mmu.c   |   2 +-
 include/linux/hmm.h  |   7 +-
 include/linux/ioport.h   |   1 +
 include/linux/memremap.h |  25 -
 include/linux/mm.h   |  20 ++--
 kernel/memremap.c|  19 ++--
 mm/Kconfig   |  11 +++
 mm/gup.c |   7 ++
 mm/hmm.c |  89 --
 mm/madvise.c |   2 +-
 mm/memcontrol.c  | 231 ++-
 mm/memory.c  |  46 +-
 mm/migrate.c |  57 +++-
 mm/swap.c|  11 +++
 15 files changed, 434 insertions(+), 134 deletions(-)

-- 
2.13.0



[PATCH 5/6] mm/memcontrol: support MEMORY_DEVICE_PRIVATE and MEMORY_DEVICE_PUBLIC v3

2017-07-13 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus need special handling when it comes to lru or refcount. This
patch makes sure that memcontrol handles such pages properly when it
encounters them. Those pages are used like regular pages in a process
address space, either as anonymous pages or as file backed pages. So
from the memcg point of view we want to handle them like regular
pages, for now at least.

Changed since v2:
  - s/host/public
Changed since v1:
  - s/public/host
  - add comments explaining how device memory behave and why

Signed-off-by: Jérôme Glisse 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Vladimir Davydov 
Cc: cgro...@vger.kernel.org
---
 kernel/memremap.c |  2 ++
 mm/memcontrol.c   | 63 ++-
 2 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/kernel/memremap.c b/kernel/memremap.c
index 25c098151ed2..4d74b4a4f8f5 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -479,6 +479,8 @@ void put_zone_device_private_or_public_page(struct page 
*page)
__ClearPageActive(page);
__ClearPageWaiters(page);
 
+   mem_cgroup_uncharge(page);
+
page->pgmap->page_free(page, page->pgmap->data);
}
else if (!count)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c709fdceac13..858842a741bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4391,12 +4391,13 @@ enum mc_target_type {
MC_TARGET_NONE = 0,
MC_TARGET_PAGE,
MC_TARGET_SWAP,
+   MC_TARGET_DEVICE,
 };
 
 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
 {
-   struct page *page = vm_normal_page(vma, addr, ptent);
+   struct page *page = _vm_normal_page(vma, addr, ptent, true);
 
if (!page || !page_mapped(page))
return NULL;
@@ -4407,13 +4408,20 @@ static struct page *mc_handle_present_pte(struct 
vm_area_struct *vma,
if (!(mc.flags & MOVE_FILE))
return NULL;
}
-   if (!get_page_unless_zero(page))
+   if (is_device_public_page(page)) {
+   /*
+* MEMORY_DEVICE_PUBLIC means ZONE_DEVICE page and which have a
+* refcount of 1 when free (unlike normal page)
+*/
+   if (!page_ref_add_unless(page, 1, 1))
+   return NULL;
+   } else if (!get_page_unless_zero(page))
return NULL;
 
return page;
 }
 
-#ifdef CONFIG_SWAP
+#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
pte_t ptent, swp_entry_t *entry)
 {
@@ -4422,6 +4430,23 @@ static struct page *mc_handle_swap_pte(struct 
vm_area_struct *vma,
 
if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
+
+   /*
+* Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
+* a device and because they are not accessible by CPU they are store
+* as special swap entry in the CPU page table.
+*/
+   if (is_device_private_entry(ent)) {
+   page = device_private_entry_to_page(ent);
+   /*
+* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
+* a refcount of 1 when free (unlike normal page)
+*/
+   if (!page_ref_add_unless(page, 1, 1))
+   return NULL;
+   return page;
+   }
+
/*
 * Because lookup_swap_cache() updates some statistics counter,
 * we call find_get_page() with swapper_space directly.
@@ -4582,6 +4607,13 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PUBLIC
+ * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * For now we such page is charge like a regular page would be as for all
+ * intent and purposes it is just special memory taking the place of a
+ * regular page. See Documentations/vm/hmm.txt and include/linux/hmm.h for
+ * more informations on this type of memory how it is use and why it is
+ * charge like this.
  *
  * Called with pte lock held.
  */
@@ -4610,6 +4642,9 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
+   if (is_device_private_page(page) ||
+   is_device_public_page(page))
+   ret = MC_TARGET_DEVICE;

[PATCH 4/6] mm/memcontrol: allow to uncharge page without using page->lru field

2017-07-13 Thread Jérôme Glisse
HMM pages (private or public device pages) are ZONE_DEVICE pages and
thus you can not use the page->lru fields of those pages. This patch
re-arranges the uncharge path to allow a single page to be uncharged
without modifying the lru field of its struct page.

There is no change to the memcontrol logic; it is the same as it was
before this patch.

Signed-off-by: Jérôme Glisse 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Vladimir Davydov 
Cc: cgro...@vger.kernel.org
---
 mm/memcontrol.c | 168 +++-
 1 file changed, 92 insertions(+), 76 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3df3c04d73ab..c709fdceac13 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5509,48 +5509,102 @@ void mem_cgroup_cancel_charge(struct page *page, 
struct mem_cgroup *memcg,
cancel_charge(memcg, nr_pages);
 }
 
-static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
-  unsigned long nr_anon, unsigned long nr_file,
-  unsigned long nr_kmem, unsigned long nr_huge,
-  unsigned long nr_shmem, struct page *dummy_page)
+struct uncharge_gather {
+   struct mem_cgroup *memcg;
+   unsigned long pgpgout;
+   unsigned long nr_anon;
+   unsigned long nr_file;
+   unsigned long nr_kmem;
+   unsigned long nr_huge;
+   unsigned long nr_shmem;
+   struct page *dummy_page;
+};
+
+static inline void uncharge_gather_clear(struct uncharge_gather *ug)
 {
-   unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
+   memset(ug, 0, sizeof(*ug));
+}
+
+static void uncharge_batch(const struct uncharge_gather *ug)
+{
+   unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
 
-   if (!mem_cgroup_is_root(memcg)) {
-   page_counter_uncharge(&memcg->memory, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg)) {
+   page_counter_uncharge(&ug->memcg->memory, nr_pages);
 if (do_memsw_account())
-   page_counter_uncharge(&memcg->memsw, nr_pages);
-   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
-   page_counter_uncharge(&memcg->kmem, nr_kmem);
-   memcg_oom_recover(memcg);
+   page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+   if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
+   page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
+   memcg_oom_recover(ug->memcg);
}
 
local_irq_save(flags);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
-   __this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
-   __this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
-   __this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
-   __this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
-   __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
-   memcg_check_events(memcg, dummy_page);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file);
+   __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge);
+   __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem);
+   __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout);
+   __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages);
+   memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
 
-   if (!mem_cgroup_is_root(memcg))
-   css_put_many(&memcg->css, nr_pages);
+   if (!mem_cgroup_is_root(ug->memcg))
+   css_put_many(&ug->memcg->css, nr_pages);
+}
+
+static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+{
+   VM_BUG_ON_PAGE(PageLRU(page), page);
+   VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
+
+   if (!page->mem_cgroup)
+   return;
+
+   /*
+* Nobody should be changing or seriously looking at
+* page->mem_cgroup at this point, we have fully
+* exclusive access to the page.
+*/
+
+   if (ug->memcg != page->mem_cgroup) {
+   if (ug->memcg) {
+   uncharge_batch(ug);
+   uncharge_gather_clear(ug);
+   }
+   ug->memcg = page->mem_cgroup;
+   }
+
+   if (!PageKmemcg(page)) {
+   unsigned int nr_pages = 1;
+
+   if (PageTransHuge(page)) {
+   nr_pages <<= compound_order(page);
+   ug->nr_huge += nr_pages;
+   }
+   if (PageAnon(page))
+   ug->nr_anon += nr_p

[PATCH 6/6] mm/hmm: documents how device memory is accounted in rss and memcg

2017-07-13 Thread Jérôme Glisse
For now we account device memory exactly like a regular page with
respect to rss counters and memory cgroups. We do this so that any
existing application that starts using device memory without knowing
about it will keep running unimpacted. This also simplifies the
migration code.

We will likely revisit this choice once we gain more experience with
how device memory is used and how it impacts overall memory resource
management. For now we believe this is a good enough choice.

Note that device memory can not be pinned, neither by a device driver
nor by GUP, thus device memory can always be freed and unaccounted
when a process exits.

Signed-off-by: Jérôme Glisse 
Cc: Michal Hocko 
---
 Documentation/vm/hmm.txt | 40 
 1 file changed, 40 insertions(+)

diff --git a/Documentation/vm/hmm.txt b/Documentation/vm/hmm.txt
index 192dcdb38bd1..4d3aac9f4a5d 100644
--- a/Documentation/vm/hmm.txt
+++ b/Documentation/vm/hmm.txt
@@ -15,6 +15,15 @@ section present the new migration helper that allow to 
leverage the device DMA
 engine.
 
 
+1) Problems of using device specific memory allocator:
+2) System bus, device memory characteristics
+3) Share address space and migration
+4) Address space mirroring implementation and API
+5) Represent and manage device memory from core kernel point of view
+6) Migrate to and from device memory
+7) Memory cgroup (memcg) and rss accounting
+
+
 ---
 
 1) Problems of using device specific memory allocator:
@@ -342,3 +351,34 @@ that happens then the finalize_and_map() can catch any 
pages that was not
 migrated. Note those page were still copied to new page and thus we wasted
 bandwidth but this is considered as a rare event and a price that we are
 willing to pay to keep all the code simpler.
+
+
+---
+
+7) Memory cgroup (memcg) and rss accounting
+
+For now device memory is accounted as any regular page in rss counters (either
+anonymous if device page is use for anonymous, file if device page is use for
+file back page or shmem if device page is use for share memory). This is a
+deliberate choice to keep existing application that might start using device
+memory without knowing about it to keep runing unimpacted.
+
+Drawbacks is that OOM killer might kill an application using a lot of device
+memory and not a lot of regular system memory and thus not freeing much system
+memory. We want to gather more real world experience on how application and
+system react under memory pressure in the presence of device memory before
+deciding to account device memory differently.
+
+
+Same decision was made for memory cgroup. Device memory page are accounted
+against same memory cgroup a regular page would be accounted to. This does
+simplify migration to and from device memory. This also means that migration
+back from device memory to regular memory can not fail because it would
+go above memory cgroup limit. We might revisit this choice latter on once we
+get more experience in how device memory is use and its impact on memory
+resource control.
+
+
+Note that device memory can never be pin nor by device driver nor through GUP
+and thus such memory is always free upon process exit. Or when last reference
+is drop in case of share memory or file back memory.
-- 
2.13.0



[PATCH 1/5] mm/device-host-memory: device memory cache coherent with CPU v3

2017-07-12 Thread Jérôme Glisse
Platforms with an advanced system bus (like CAPI or CCIX) allow device
memory to be accessible from the CPU in a cache coherent fashion. Add
a new type of ZONE_DEVICE to represent such memory. The use cases
are the same as for the un-addressable device memory but without
all the corner cases.

Changed since v2:
  - s/public/host
  - add proper include in migrate.c and drop useless #if/#endif
Changed since v1:
  - Kconfig and #if/#else cleanup

Signed-off-by: Jérôme Glisse <jgli...@redhat.com>
Cc: Balbir Singh <balb...@au1.ibm.com>
Cc: Aneesh Kumar <aneesh.ku...@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paul...@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Dan Williams <dan.j.willi...@intel.com>
Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
---
 fs/proc/task_mmu.c   |  2 +-
 include/linux/hmm.h  |  4 ++--
 include/linux/ioport.h   |  1 +
 include/linux/memremap.h | 21 ++
 include/linux/mm.h   | 20 ++---
 kernel/memremap.c| 15 -
 mm/Kconfig   | 11 ++
 mm/gup.c |  7 ++
 mm/hmm.c |  4 ++--
 mm/madvise.c |  2 +-
 mm/memory.c  | 46 +-
 mm/migrate.c | 57 ++--
 mm/swap.c| 11 ++
 13 files changed, 156 insertions(+), 45 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 957b6ea80d5f..1f38f2c7cc34 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1182,7 +1182,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
-   page = vm_normal_page(vma, addr, pte);
+   page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 458d0d6d82f3..e4d6471dbca7 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_HOST)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -443,7 +443,7 @@ struct hmm_device {
  */
 struct hmm_device *hmm_device_new(void *drvdata);
 void hmm_device_put(struct hmm_device *hmm_device);
-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_HOST */
 
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 3a4f69137bc2..76d4720aba5e 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -131,6 +131,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY= 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
+   IORES_DESC_DEVICE_HOST_MEMORY   = 7,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 57546a07a558..4b7f28273f37 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  *
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_HOST:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is use on platform that have an advance system bus (like CAPI or CCIX). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allow to pin such memory so that it can always be evicted.
  */
 enum memory_type {
MEMORY_DEVICE_PUBLIC = 0,
MEMORY_DEVICE_PRIVATE,
+   MEMORY_DEVICE_HOST,
 };
 
 /*
@@ -92,6 +100,8 @@ enum memory_type {
  * The page_free() callback is called once the page refcount reaches 1
  * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
  * This allows the device driver to implement its own memory management.)
+ *
+ * For MEMORY_DEVICE_HOST only the page_free() callback matter.
  */
 typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
unsigned long addr,
@@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct 
page *page)
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
+
+static inline bool is_device_host_page(const struct page *page)
+{
+   return is_

[PATCH 1/5] mm/device-host-memory: device memory cache coherent with CPU v3

2017-07-12 Thread Jérôme Glisse
Platforms with an advanced system bus (like CAPI or CCIX) allow device
memory to be accessible from the CPU in a cache coherent fashion. Add
a new type of ZONE_DEVICE to represent such memory. The use cases
are the same as for the un-addressable device memory but without
all the corner cases.

Changed since v2:
  - s/public/host
  - add proper include in migrate.c and drop useless #if/#endif
Changed since v1:
  - Kconfig and #if/#else cleanup

Signed-off-by: Jérôme Glisse 
Cc: Balbir Singh 
Cc: Aneesh Kumar 
Cc: Paul E. McKenney 
Cc: Benjamin Herrenschmidt 
Cc: Dan Williams 
Cc: Ross Zwisler 
---
 fs/proc/task_mmu.c   |  2 +-
 include/linux/hmm.h  |  4 ++--
 include/linux/ioport.h   |  1 +
 include/linux/memremap.h | 21 ++
 include/linux/mm.h   | 20 ++---
 kernel/memremap.c| 15 -
 mm/Kconfig   | 11 ++
 mm/gup.c |  7 ++
 mm/hmm.c |  4 ++--
 mm/madvise.c |  2 +-
 mm/memory.c  | 46 +-
 mm/migrate.c | 57 ++--
 mm/swap.c| 11 ++
 13 files changed, 156 insertions(+), 45 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 957b6ea80d5f..1f38f2c7cc34 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1182,7 +1182,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct 
pagemapread *pm,
if (pm->show_pfn)
frame = pte_pfn(pte);
flags |= PM_PRESENT;
-   page = vm_normal_page(vma, addr, pte);
+   page = _vm_normal_page(vma, addr, pte, true);
if (pte_soft_dirty(pte))
flags |= PM_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 458d0d6d82f3..e4d6471dbca7 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -327,7 +327,7 @@ int hmm_vma_fault(struct vm_area_struct *vma,
 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
 
 
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
+#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) ||  IS_ENABLED(CONFIG_DEVICE_HOST)
 struct hmm_devmem;
 
 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
@@ -443,7 +443,7 @@ struct hmm_device {
  */
 struct hmm_device *hmm_device_new(void *drvdata);
 void hmm_device_put(struct hmm_device *hmm_device);
-#endif /* IS_ENABLED(CONFIG_DEVICE_PRIVATE) */
+#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_HOST */
 
 
 /* Below are for HMM internal use only! Not to be used by device driver! */
diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 3a4f69137bc2..76d4720aba5e 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -131,6 +131,7 @@ enum {
IORES_DESC_PERSISTENT_MEMORY= 4,
IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5,
IORES_DESC_DEVICE_PRIVATE_MEMORY= 6,
+   IORES_DESC_DEVICE_HOST_MEMORY   = 7,
 };
 
 /* helpers to define resources */
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 57546a07a558..4b7f28273f37 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -57,10 +57,18 @@ static inline struct vmem_altmap *to_vmem_altmap(unsigned 
long memmap_start)
  *
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.txt.
+ *
+ * MEMORY_DEVICE_HOST:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is use on platform that have an advance system bus (like CAPI or CCIX). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allow to pin such memory so that it can always be evicted.
  */
 enum memory_type {
MEMORY_DEVICE_PUBLIC = 0,
MEMORY_DEVICE_PRIVATE,
+   MEMORY_DEVICE_HOST,
 };
 
 /*
@@ -92,6 +100,8 @@ enum memory_type {
  * The page_free() callback is called once the page refcount reaches 1
  * (ZONE_DEVICE pages never reach 0 refcount unless there is a refcount bug.
  * This allows the device driver to implement its own memory management.)
+ *
+ * For MEMORY_DEVICE_HOST only the page_free() callback matter.
  */
 typedef int (*dev_page_fault_t)(struct vm_area_struct *vma,
unsigned long addr,
@@ -134,6 +144,12 @@ static inline bool is_device_private_page(const struct 
page *page)
return is_zone_device_page(page) &&
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
+
+static inline bool is_device_host_page(const struct page *page)
+{
+   return is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_HOST;
+}
 #else
 static inline void *devm_memremap_pages(struct device *dev,
struct resource *res, struct percpu_ref *r

  1   2   3   4   5   6   7   8   9   >