from:"Ralph Campbell"

Re: [PATCH v5 1/9] mm/mmu_notifier: helper to test if a range invalidation is blockable

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

Simple helpers to test if range invalidation is blockable. Later
patches use coccinelle to convert all direct dereferences of range->
blockable to use this function instead so that we can convert the
blockable field to an unsigned for more flags.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: linux-fsde...@vger.kernel.org
Cc: Arnd Bergmann 
---
  include/linux/mmu_notifier.h | 11 +++
  1 file changed, 11 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 4050ec1c3b45..e630def131ce 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct 
mmu_notifier_range *r,
  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end);
  
+static inline bool

+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+   return range->blockable;
+}
+
  static inline void mmu_notifier_release(struct mm_struct *mm)
  {
if (mm_has_notifiers(mm))
@@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct 
mmu_notifier_range *range,
  #define mmu_notifier_range_init(range, mm, start, end) \
_mmu_notifier_range_init(range, start, end)
  
+static inline bool

+mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
+{
+   return true;
+}
  
  static inline int mm_has_notifiers(struct mm_struct *mm)

  {


Reviewed-by: Ralph Campbell 

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v5 6/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

This updates each existing invalidation to use the correct mmu notifier
event that represents what is happening to the CPU page table. See the
patch which introduced the events to see the rationale behind this.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  fs/proc/task_mmu.c  |  4 ++--
  kernel/events/uprobes.c |  2 +-
  mm/huge_memory.c| 14 ++
  mm/hugetlb.c|  8 
  mm/khugepaged.c |  2 +-
  mm/ksm.c|  4 ++--
  mm/madvise.c|  2 +-
  mm/memory.c | 14 +++---
  mm/migrate.c|  4 ++--
  mm/mprotect.c   |  5 +++--
  mm/rmap.c   |  6 +++---
  11 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fcbd0e574917..3b93ce496dd4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1151,8 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const 
char __user *buf,
break;
}
  
-			mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,

-   NULL, mm, 0, -1UL);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+   0, NULL, mm, 0, -1UL);
mmu_notifier_invalidate_range_start(&range);
}
walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 46f546bdba00..8e8342080013 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, 
unsigned long addr,
struct mmu_notifier_range range;
struct mem_cgroup *memcg;
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, addr,

+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
addr + PAGE_SIZE);
  
  	VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page);

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c9d638f1b34e..1da6ca0f0f6d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1184,9 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct 
vm_fault *vmf,
cond_resched();
}
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,

-   haddr,
-   haddr + HPAGE_PMD_SIZE);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+   haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
  
  	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);

@@ -1349,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, 
pmd_t orig_pmd)
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,

-   haddr,
-   haddr + HPAGE_PMD_SIZE);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+   haddr, haddr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
  
  	spin_lock(vmf->ptl);

@@ -2028,7 +2026,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t 
*pud,
spinlock_t *ptl;
struct mmu_notifier_range range;
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,

+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PUD_MASK,
(address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
mmu_notifier_invalidate_range_start(&range);
@@ -2247,7 +2245,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
spinlock_t *ptl;
struct mmu_notifier_range range;
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,

+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
(address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d9e5c5a4c004..a58115c6b0a3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3250,7 +3250,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct 
mm_struct *src,
cow = (vm

Re: [PATCH v5 7/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of mmu notifier API track changes to the CPU page table and take
specific action for them. However, the current API only provides the range
of virtual addresses affected by the change, not why the change is happening.

This patch just passes down the new information by adding it to the
mmu_notifier_range structure.

Changes since v1:
 - Initialize flags field from mmu_notifier_range_init() arguments

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  include/linux/mmu_notifier.h | 6 +-
  1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 62f94cd85455..0379956fff23 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -58,10 +58,12 @@ struct mmu_notifier_mm {
  #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
  
  struct mmu_notifier_range {

+   struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long start;
unsigned long end;
unsigned flags;
+   enum mmu_notifier_event event;
  };
  
  struct mmu_notifier_ops {

@@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct 
mmu_notifier_range *range,
   unsigned long start,
   unsigned long end)
  {
+   range->vma = vma;
+   range->event = event;
range->mm = mm;
range->start = start;
range->end = end;
-   range->flags = 0;
+   range->flags = flags;
  }
  
  #define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\




Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v5 8/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

Helper to test if a range is updated to read only (it is still valid
to read from the range). This is useful for device drivers or anyone
who wishes to optimize out an update when they know that they already
have the range mapped read only.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  include/linux/mmu_notifier.h |  4 
  mm/mmu_notifier.c| 10 ++
  2 files changed, 14 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 0379956fff23..b6c004bd9f6a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct 
mmu_notifier_range *r,
  bool only_end);
  extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
  unsigned long start, unsigned long end);
+extern bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range);
  
  static inline bool

  mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
@@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct 
*mm)
  {
  }
  
+#define mmu_notifier_range_update_to_read_only(r) false

+
  #define ptep_clear_flush_young_notify ptep_clear_flush_young
  #define pmdp_clear_flush_young_notify pmdp_clear_flush_young
  #define ptep_clear_young_notify ptep_test_and_clear_young
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index abd88c466eb2..ee36068077b6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct 
mmu_notifier *mn,
mmdrop(mm);
  }
  EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+   if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+   return false;
+   /* Return true if the vma still have the read flag set. */
+   return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);



Don't you have to check for !WRITE & READ?
mprotect() can change the permissions from R/O to RW and
end up calling mmu_notifier_range_init() and 
mmu_notifier_invalidate_range_start()/end().


I'm not sure how useful this is since it only applies to the
MMU_NOTIFY_PROTECTION_VMA case.
Anyway, you can add

Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v5 5/9] mm/mmu_notifier: contextual information for event triggering invalidation v2

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

CPU page table updates can happen for many reasons, not only as a result
of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

Users of mmu notifier API track changes to the CPU page table and take
specific action for them. However, the current API only provides the range
of virtual addresses affected by the change, not why the change is happening.

This patchset does the initial mechanical conversion of all the places that
call mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP
event as well as the vma if it is known (most invalidations happen against
a given vma). Passing down the vma allows the users of mmu notifier to
inspect the new vma page protection.

The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier
should assume that everything for the range is going away when that event
happens. A later patch converts the mm call paths to use a more appropriate
event for each call.

Changes since v1:
 - add the flags parameter to init range flags

This is done as 2 patches so that no call site is forgotten, especially
as it uses the following coccinelle patch:

%<--
@@
identifier I1, I2, I3, I4;
@@
static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1,
+enum mmu_notifier_event event,
+unsigned flags,
+struct vm_area_struct *vma,
struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... }

@@
@@
-#define mmu_notifier_range_init(range, mm, start, end)
+#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end)

@@
expression E1, E3, E4;
identifier I1;
@@
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, I1,
I1->vm_mm, E3, E4)
...>

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(..., struct vm_area_struct *VMA, ...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...> }

@@
expression E1, E2, E3, E4;
identifier FN, VMA;
@@
FN(...) {
struct vm_area_struct *VMA;
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, VMA,
E2, E3, E4)
...> }

@@
expression E1, E2, E3, E4;
identifier FN;
@@
FN(...) {
<...
mmu_notifier_range_init(E1,
+MMU_NOTIFY_UNMAP, 0, NULL,
E2, E3, E4)
...> }
-->%

Applied with:
spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c 
--in-place
spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place
spatch --sp-file mmu-notifier.spatch --dir mm --in-place

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---


Reviewed-by: Ralph Campbell 

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v5 3/9] mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

Use an unsigned field for flags other than blockable and convert
the blockable field to be one of those flags.

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Andrew Morton 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  include/linux/mmu_notifier.h | 11 +++
  1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index e630def131ce..c8672c366f67 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -25,11 +25,13 @@ struct mmu_notifier_mm {
spinlock_t lock;
  };
  
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

+
  struct mmu_notifier_range {
struct mm_struct *mm;
unsigned long start;
unsigned long end;
-   bool blockable;
+   unsigned flags;
  };
  
  struct mmu_notifier_ops {

@@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct 
mm_struct *mm,
  static inline bool
  mmu_notifier_range_blockable(const struct mmu_notifier_range *range)
  {
-   return range->blockable;
+   return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
  }
  
  static inline void mmu_notifier_release(struct mm_struct *mm)

@@ -275,7 +277,7 @@ static inline void
  mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
  {
if (mm_has_notifiers(range->mm)) {
-   range->blockable = true;
+   range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE;
__mmu_notifier_invalidate_range_start(range);
}
  }
@@ -284,7 +286,7 @@ static inline int
  mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
  {
if (mm_has_notifiers(range->mm)) {
-   range->blockable = false;
+   range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE;
return __mmu_notifier_invalidate_range_start(range);
}
return 0;
@@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct 
mmu_notifier_range *range,
range->mm = mm;
range->start = start;
range->end = end;
+   range->flags = 0;
  }
  
  #define ptep_clear_flush_young_notify(__vma, __address, __ptep)		\




Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v5 9/9] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

When notifying change for a range use MMU_NOTIFIER_USE_CHANGE_PTE flag
for page table updates that use set_pte_at_notify() and where we are
going either from read and write to read only with same pfn or read only
to read and write with new pfn.

Note that set_pte_at_notify() itself should only be used in rare cases,
i.e. we do not want to use it when we are updating a significant range of
virtual addresses and thus a significant number of ptes. Instead for
those cases the event provided to the mmu notifier invalidate_range_start()
callback should be used for optimization.

Changes since v1:
 - Use the new unsigned flags field in struct mmu_notifier_range
 - Use the new flags parameter to mmu_notifier_range_init()
 - Explicitly list all the patterns where we can use change_pte()

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  include/linux/mmu_notifier.h | 34 --
  mm/ksm.c | 11 ++-
  mm/memory.c  |  5 +++--
  3 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b6c004bd9f6a..0230a4b06b46 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -40,6 +40,26 @@ enum mmu_notifier_event {
MMU_NOTIFY_SOFT_DIRTY,
  };
  
+/*

+ * @MMU_NOTIFIER_RANGE_BLOCKABLE: can the mmu notifier range_start/range_end
+ * callback block or not ? If set then the callback can block.
+ *
+ * @MMU_NOTIFIER_USE_CHANGE_PTE: only set when the page table it updated with
+ * the set_pte_at_notify() the valid patterns for this are:
+ *  - pte read and write to read only same pfn
+ *  - pte read only to read and write (pfn can change or stay the same)
+ *  - pte read only to read only with different pfn
+ * It is illegal to set in any other circumstances.
+ *
+ * Note that set_pte_at_notify() should not be use outside of the above cases.
+ * When updating a range in batch (like write protecting a range) it is better
+ * to rely on invalidate_range_start() and struct mmu_notifier_range to infer
+ * the kind of update that is happening (as an example you can look at the
+ * mmu_notifier_range_update_to_read_only() function).
+ */
+#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)
+#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << 1)
+
  #ifdef CONFIG_MMU_NOTIFIER
  
  /*

@@ -55,8 +75,6 @@ struct mmu_notifier_mm {
spinlock_t lock;
  };
  
-#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0)

-
  struct mmu_notifier_range {
struct vm_area_struct *vma;
struct mm_struct *mm;
@@ -268,6 +286,12 @@ mmu_notifier_range_blockable(const struct 
mmu_notifier_range *range)
return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE);
  }
  
+static inline bool

+mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range)
+{
+   return (range->flags & MMU_NOTIFIER_USE_CHANGE_PTE);
+}
+
  static inline void mmu_notifier_release(struct mm_struct *mm)
  {
if (mm_has_notifiers(mm))
@@ -509,6 +533,12 @@ mmu_notifier_range_blockable(const struct 
mmu_notifier_range *range)
return true;
  }
  
+static inline bool

+mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range)
+{
+   return false;
+}
+
  static inline int mm_has_notifiers(struct mm_struct *mm)
  {
return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index b782fadade8f..41e51882f999 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1066,9 +1066,9 @@ static int write_protect_page(struct vm_area_struct *vma, 
struct page *page,
  
  	BUG_ON(PageTransCompound(page));
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,

-   pvmw.address,
-   pvmw.address + PAGE_SIZE);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR,
+   MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm,
+   pvmw.address, pvmw.address + PAGE_SIZE);
mmu_notifier_invalidate_range_start(&range);
  
  	if (!page_vma_mapped_walk(&pvmw))

@@ -1155,8 +1155,9 @@ static int replace_page(struct vm_area_struct *vma, 
struct page *page,
if (!pmd)
goto out;
  
-	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,

-   addr + PAGE_SIZE);
+   mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR,
+   MMU_NOTIFIER_USE_CHANGE_PTE,
+   vma, mm, addr, addr + PAGE_SIZE);
mmu

Re: [PATCH v5 2/9] mm/mmu_notifier: convert user range->blockable to helper function

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

Use the mmu_notifier_range_blockable() helper function instead of
directly dereferencing the range->blockable field. This is done to
make it easier to change the mmu_notifier range field.

This patch is the outcome of the following coccinelle patch:

%<---
@@
identifier I1, FN;
@@
FN(..., struct mmu_notifier_range *I1, ...) {
<...
-I1->blockable
+mmu_notifier_range_blockable(I1)
...>
}
--->%

spatch --in-place --sp-file blockable.spatch --dir .

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  | 8 
  drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +-
  drivers/gpu/drm/radeon/radeon_mn.c  | 4 ++--
  drivers/infiniband/core/umem_odp.c  | 5 +++--
  drivers/xen/gntdev.c| 6 +++---
  mm/hmm.c| 6 +++---
  mm/mmu_notifier.c   | 2 +-
  virt/kvm/kvm_main.c | 3 ++-
  8 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 3e6823fdd939..58ed401c5996 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct 
mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
 * amdgpu_mn_invalidate_node
 */
-   if (amdgpu_mn_read_lock(amn, range->blockable))
+   if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
  
  	it = interval_tree_iter_first(&amn->objects, range->start, end);

while (it) {
struct amdgpu_mn_node *node;
  
-		if (!range->blockable) {

+   if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
@@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
/* notification is exclusive, but interval is inclusive */
end = range->end - 1;
  
-	if (amdgpu_mn_read_lock(amn, range->blockable))

+   if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range)))
return -EAGAIN;
  
  	it = interval_tree_iter_first(&amn->objects, range->start, end);

@@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct 
mmu_notifier *mn,
struct amdgpu_mn_node *node;
struct amdgpu_bo *bo;
  
-		if (!range->blockable) {

+   if (!mmu_notifier_range_blockable(range)) {
amdgpu_mn_read_unlock(amn);
return -EAGAIN;
}
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 1d3f9a31ad61..777b3f8727e7 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
while (it) {
struct drm_i915_gem_object *obj;
  
-		if (!range->blockable) {

+   if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
break;
}
diff --git a/drivers/gpu/drm/radeon/radeon_mn.c 
b/drivers/gpu/drm/radeon/radeon_mn.c
index b3019505065a..c9bd1278f573 100644
--- a/drivers/gpu/drm/radeon/radeon_mn.c
+++ b/drivers/gpu/drm/radeon/radeon_mn.c
@@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
/* TODO we should be able to split locking for interval tree and
 * the tear down.
 */
-   if (range->blockable)
+   if (mmu_notifier_range_blockable(range))
mutex_lock(&rmn->lock);
else if (!mutex_trylock(&rmn->lock))
return -EAGAIN;
@@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct 
mmu_notifier *mn,
struct radeon_bo *bo;
long r;
  
-		if (!range->blockable) {

+   if (!mmu_notifier_range_blockable(range)) {
ret = -EAGAIN;
goto out_unlock;
}
diff --git a/drivers/infiniband/core/umem_odp.c 
b/drivers/infiniband/core/umem_odp.c
index 012044f16d1c..3a3f1538d295 1

Re: [PATCH v5 4/9] mm/mmu_notifier: contextual information for event enums

2019-02-25 Thread Ralph Campbell



On 2/19/19 12:04 PM, jgli...@redhat.com wrote:

From: Jérôme Glisse 

CPU page table update can happens for many reasons, not only as a result


s/update/updates
s/happens/happen


of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also
as a result of kernel activities (memory compression, reclaim, migration,
...).

This patch introduce a set of enums that can be associated with each of


s/introduce/introduces


the events triggering a mmu notifier. Latter patches take advantages of
those enum values.


s/advantages/advantage



 - UNMAP: munmap() or mremap()
 - CLEAR: page table is cleared (migration, compaction, reclaim, ...)
 - PROTECTION_VMA: change in access protections for the range
 - PROTECTION_PAGE: change in access protections for page in the range
 - SOFT_DIRTY: soft dirtyness tracking



s/dirtyness/dirtiness


Being able to identify munmap() and mremap() from other reasons why the
page table is cleared is important to allow users of mmu notifier to
update their own internal tracking structure accordingly (on munmap or
mremap it is no longer needed to track the range of virtual addresses as it
becomes invalid).

Signed-off-by: Jérôme Glisse 
Cc: Christian König 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Rodrigo Vivi 
Cc: Jan Kara 
Cc: Andrea Arcangeli 
Cc: Peter Xu 
Cc: Felix Kuehling 
Cc: Jason Gunthorpe 
Cc: Ross Zwisler 
Cc: Dan Williams 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 
Cc: Michal Hocko 
Cc: Christian Koenig 
Cc: Ralph Campbell 
Cc: John Hubbard 
Cc: k...@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: Arnd Bergmann 
---
  include/linux/mmu_notifier.h | 30 ++
  1 file changed, 30 insertions(+)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index c8672c366f67..2386e71ac1b8 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -10,6 +10,36 @@
  struct mmu_notifier;
  struct mmu_notifier_ops;
  
+/**

+ * enum mmu_notifier_event - reason for the mmu notifier callback
+ * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that
+ * move the range


I would say something about the VMA for the notifier range
being deleted.
MMU notifier clients can then use this case to remove any policy or
access counts associated with the range.
Just changing the PTE to "no access" as in the CLEAR case
doesn't mean a policy which prefers device private memory
over system memory should be cleared.


+ *
+ * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like
+ * madvise() or replacing a page by another one, ...).
+ *
+ * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range
+ * ie using the vma access permission (vm_page_prot) to update the whole range
+ * is enough no need to inspect changes to the CPU page table (mprotect()
+ * syscall)
+ *
+ * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for
+ * pages in the range so to mirror those changes the user must inspect the CPU
+ * page table (from the end callback).
+ *
+ * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
+ * access flags). User should soft dirty the page in the end callback to make
+ * sure that anyone relying on soft dirtyness catch pages that might be written
+ * through non CPU mappings.
+ */
+enum mmu_notifier_event {
+   MMU_NOTIFY_UNMAP = 0,
+   MMU_NOTIFY_CLEAR,
+   MMU_NOTIFY_PROTECTION_VMA,
+   MMU_NOTIFY_PROTECTION_PAGE,
+   MMU_NOTIFY_SOFT_DIRTY,
+};
+
  #ifdef CONFIG_MMU_NOTIFIER
  
  /*



___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 3/9] mm: Add write-protect and clean utilities for address space ranges

2019-04-15 Thread Ralph Campbell



On 4/12/19 9:04 AM, Thomas Hellstrom wrote:

Add two utilities to a) write-protect and b) clean all ptes pointing into
a range of an address space


A period at the end, please.


The utilities are intended to aid in tracking dirty pages (either
driver-allocated system memory or pci device memory).
The write-protect utility should be used in conjunction with
page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page
accesses. Typically one would want to use this on sparse accesses into
large memory regions. The clean utility should be used to utilize
hardware dirtying functionality and avoid the overhead of page-faults,
typically on large accesses into small memory regions.

The added file "apply_as_range.c" is initially listed as maintained by
VMware under our DRM driver. If somebody would like it elsewhere,
that's of course no problem.

Notable changes since RFC:
- Added comments to help avoid the usage of these functions for VMAs
   they are not intended for. We also do advisory checks on the vm_flags and
   warn on illegal usage.
- Perform the pte modifications the same way softdirty does.
- Add mmu_notifier range invalidation calls.
- Add a config option so that this code is not unconditionally included.
- Tell the mmu_gather code about pending tlb flushes.

Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Minchan Kim 
Cc: Michal Hocko 
Cc: Huang Ying 
Cc: Souptick Joarder 
Cc: "Jérôme Glisse" 
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Thomas Hellstrom 


Reviewed-by: Ralph Campbell 


---
  MAINTAINERS |   1 +
  include/linux/mm.h  |   9 +-
  mm/Kconfig  |   3 +
  mm/Makefile |   3 +-
  mm/apply_as_range.c | 295 
  5 files changed, 309 insertions(+), 2 deletions(-)
  create mode 100644 mm/apply_as_range.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 35e6357f9d30..bc243ffcb840 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4971,6 +4971,7 @@ T:git git://people.freedesktop.org/~thomash/linux
  S:Supported
  F:drivers/gpu/drm/vmwgfx/
  F:include/uapi/drm/vmwgfx_drm.h
+F: mm/apply_as_range.c
  
  DRM DRIVERS

  M:David Airlie 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b7dd4ddd6efb..62f24dd0bfa0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2642,7 +2642,14 @@ struct pfn_range_apply {
  };
  extern int apply_to_pfn_range(struct pfn_range_apply *closure,
  unsigned long address, unsigned long size);
-
+unsigned long apply_as_wrprotect(struct address_space *mapping,
+pgoff_t first_index, pgoff_t nr);
+unsigned long apply_as_clean(struct address_space *mapping,
+pgoff_t first_index, pgoff_t nr,
+pgoff_t bitmap_pgoff,
+unsigned long *bitmap,
+pgoff_t *start,
+pgoff_t *end);
  #ifdef CONFIG_PAGE_POISONING
  extern bool page_poisoning_enabled(void);
  extern void kernel_poison_pages(struct page *page, int numpages, int enable);
diff --git a/mm/Kconfig b/mm/Kconfig
index 25c71eb8a7db..80e41cdbb4ae 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -758,4 +758,7 @@ config GUP_BENCHMARK
  config ARCH_HAS_PTE_SPECIAL
bool
  
+config AS_DIRTY_HELPERS

+bool
+
  endmenu
diff --git a/mm/Makefile b/mm/Makefile
index d210cc9d6f80..b295717be856 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o 
fadvise.o \
   mm_init.o mmu_context.o percpu.o slab_common.o \
   compaction.o vmacache.o \
   interval_tree.o list_lru.o workingset.o \
-  debug.o $(mmu-y)
+  debug.o apply_as_range.o $(mmu-y)
  
  obj-y += init-mm.o

  obj-y += memblock.o
@@ -99,3 +99,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
  obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
  obj-$(CONFIG_HMM) += hmm.o
  obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AS_DIRTY_HELPERS) += apply_as_range.o
diff --git a/mm/apply_as_range.c b/mm/apply_as_range.c
new file mode 100644
index ..32d28619aec5
--- /dev/null
+++ b/mm/apply_as_range.c
@@ -0,0 +1,295 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/**
+ * struct apply_as - Closure structure for apply_as_range
+ * @base: struct pfn_range_apply we derive from
+ * @start: Address of first modified pte
+ * @end: Address of last modified pte + 1
+ * @total: Total number of modified ptes
+ * @vma: Pointer to the struct vm_area_struct we're currently operating on
+ */
+struct apply_as {
+   struct pfn_range_apply base;
+   unsigned long start, end;


One variable defined per line, please.


+

Re: [PATCH 2/9] mm: Add an apply_to_pfn_range interface

2019-04-15 Thread Ralph Campbell



On 4/12/19 9:04 AM, Thomas Hellstrom wrote:

This is basically apply_to_page_range with added functionality:
Allocating missing parts of the page table becomes optional, which
means that the function can be guaranteed not to error if allocation
is disabled. Also passing of the closure struct and callback function
becomes different and more in line with how things are done elsewhere.

Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range

The reason for not using the page-walk code is that we want to perform
the page-walk on vmas pointing to an address space without requiring the
mmap_sem to be held rather thand on vmas belonging to a process with the


s/thand/than/


mmap_sem held.

Notable changes since RFC:
Don't export apply_to_pfn_range.

Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Minchan Kim 
Cc: Michal Hocko 
Cc: Huang Ying 
Cc: Souptick Joarder 
Cc: "Jérôme Glisse" 
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org
Signed-off-by: Thomas Hellstrom 


Reviewed-by: Ralph Campbell 


---
  include/linux/mm.h |  10 
  mm/memory.c| 130 ++---
  2 files changed, 108 insertions(+), 32 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..b7dd4ddd6efb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, 
unsigned long addr,
  extern int apply_to_page_range(struct mm_struct *mm, unsigned long address,
   unsigned long size, pte_fn_t fn, void *data);
  
+struct pfn_range_apply;

+typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
+struct pfn_range_apply *closure);
+struct pfn_range_apply {
+   struct mm_struct *mm;
+   pter_fn_t ptefn;
+   unsigned int alloc;
+};
+extern int apply_to_pfn_range(struct pfn_range_apply *closure,
+ unsigned long address, unsigned long size);
  
  #ifdef CONFIG_PAGE_POISONING

  extern bool page_poisoning_enabled(void);
diff --git a/mm/memory.c b/mm/memory.c
index a95b4a3b1ae2..60d67158964f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, 
phys_addr_t start, unsigned long
  }
  EXPORT_SYMBOL(vm_iomap_memory);
  
-static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

-unsigned long addr, unsigned long end,
-pte_fn_t fn, void *data)
+static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd,
+ unsigned long addr, unsigned long end)
  {
pte_t *pte;
int err;
pgtable_t token;
spinlock_t *uninitialized_var(ptl);
  
-	pte = (mm == _mm) ?

+   pte = (closure->mm == _mm) ?
pte_alloc_kernel(pmd, addr) :
-   pte_alloc_map_lock(mm, pmd, addr, );
+   pte_alloc_map_lock(closure->mm, pmd, addr, );
if (!pte)
return -ENOMEM;
  
@@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,

token = pmd_pgtable(*pmd);
  
  	do {

-   err = fn(pte++, token, addr, data);
+   err = closure->ptefn(pte++, token, addr, closure);
if (err)
break;
} while (addr += PAGE_SIZE, addr != end);
  
  	arch_leave_lazy_mmu_mode();
  
-	if (mm != _mm)

+   if (closure->mm != _mm)
pte_unmap_unlock(pte-1, ptl);
return err;
  }
  
-static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,

-unsigned long addr, unsigned long end,
-pte_fn_t fn, void *data)
+static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud,
+ unsigned long addr, unsigned long end)
  {
pmd_t *pmd;
unsigned long next;
-   int err;
+   int err = 0;
  
  	BUG_ON(pud_huge(*pud));
  
-	pmd = pmd_alloc(mm, pud, addr);

+   pmd = pmd_alloc(closure->mm, pud, addr);
if (!pmd)
return -ENOMEM;
+
do {
next = pmd_addr_end(addr, end);
-   err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+   if (!closure->alloc && pmd_none_or_clear_bad(pmd))
+   continue;
+   err = apply_to_pte_range(closure, pmd, addr, next);
if (err)
break;
} while (pmd++, addr = next, addr != end);
return err;
  }
  
-static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,

-unsigned long addr, unsigned long end,
-pte_fn_t fn, void *data)
+static int apply_to_pud_range(struct

Re: [PATCH 1/9] mm: Allow the [page|pfn]_mkwrite callbacks to drop the mmap_sem

2019-04-15 Thread Ralph Campbell



On 4/12/19 9:04 AM, Thomas Hellstrom wrote:

Driver fault callbacks are allowed to drop the mmap_sem when expecting
long hardware waits to avoid blocking other mm users. Allow the mkwrite
callbacks to do the same by returning early on VM_FAULT_RETRY.

In particular we want to be able to drop the mmap_sem when waiting for
a reservation object lock on a GPU buffer object. These locks may be
held while waiting for the GPU.

Cc: Andrew Morton 
Cc: Matthew Wilcox 
Cc: Will Deacon 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Minchan Kim 
Cc: Michal Hocko 
Cc: Huang Ying 
Cc: Souptick Joarder 
Cc: "Jérôme Glisse" 
Cc: linux...@kvack.org
Cc: linux-ker...@vger.kernel.org

Signed-off-by: Thomas Hellstrom 


Reviewed-by: Ralph Campbell 


---
  mm/memory.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index e11ca9dd823f..a95b4a3b1ae2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf)
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
-   if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
+   if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE)))


A very minor nit, for consistency elsewhere in mm/memory.c,
could you make this be:
(VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)


return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED))) {
lock_page(page);
@@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
vmf->flags |= FAULT_FLAG_MKWRITE;
ret = vma->vm_ops->pfn_mkwrite(vmf);
-   if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))
+   if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE))
return ret;
return finish_mkwrite_fault(vmf);
}
@@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
pte_unmap_unlock(vmf->pte, vmf->ptl);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp || (tmp &
- (VM_FAULT_ERROR | VM_FAULT_NOPAGE {
+ (VM_FAULT_ERROR | VM_FAULT_RETRY |
+  VM_FAULT_NOPAGE {
put_page(vmf->page);
return tmp;
}
@@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
unlock_page(vmf->page);
tmp = do_page_mkwrite(vmf);
if (unlikely(!tmp ||
-   (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE {
+   (tmp & (VM_FAULT_ERROR | VM_FAULT_RETRY |
+   VM_FAULT_NOPAGE {
put_page(vmf->page);
return tmp;
}


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 11/11] mm/hmm: Remove confusing comment and logic from hmm_release

2019-06-10 Thread Ralph Campbell



On 6/10/19 9:02 AM, Jason Gunthorpe wrote:

On Fri, Jun 07, 2019 at 02:37:07PM -0700, Ralph Campbell wrote:


On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

hmm_release() is called exactly once per hmm. ops->release() cannot
accidentally trigger any action that would recurse back onto
hmm->mirrors_sem.

This fixes a use-after-free race of the form:

 CPU0   CPU1
 hmm_release()
   up_write(&hmm->mirrors_sem);
   hmm_mirror_unregister(mirror)
down_write(&hmm->mirrors_sem);
up_write(&hmm->mirrors_sem);
kfree(mirror)
   mirror->ops->release(mirror)

The only user we have today for ops->release is an empty function, so this
is unambiguously safe.

As a consequence of plugging this race drivers are not allowed to
register/unregister mirrors from within a release op.

Signed-off-by: Jason Gunthorpe 


I agree with the analysis above but I'm not sure that release() will
always be an empty function. It might be more efficient to write back
all data migrated to a device "in one pass" instead of relying
on unmap_vmas() calling hmm_start_range_invalidate() per VMA.


I think we have to focus on the *current* kernel - and we have two
users of release, nouveau_svm.c is empty and amdgpu_mn.c does
schedule_work() - so I believe we should go ahead with this simple
solution to the actual race today that both of those will suffer from.

If we find a need for a more complex version then it can be debated
and justified with proper context...

Ok?

Jason


OK.
I guess we have enough on the plate already :-)
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 01/11] mm/hmm: fix use after free with struct hmm in the mmu notifiers

2019-06-09 Thread Ralph Campbell




On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier
system will continue to reference hmm->mn until the srcu grace period
expires.

Resulting in use after free races like this:

  CPU0 CPU1

__mmu_notifier_invalidate_range_start()
  srcu_read_lock
  hlist_for_each ()
// mn == hmm->mn
hmm_mirror_unregister()
   hmm_put()
 hmm_free()
   mmu_notifier_unregister_no_release()
  hlist_del_init_rcu(hmm-mn->list)
   
mn->ops->invalidate_range_start(mn, range);
 mm_get_hmm()
   mm->hmm = NULL;
   kfree(hmm)
  mutex_lock(&hmm->lock);

Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm
existing. Get the now-safe hmm struct through container_of and directly
check kref_get_unless_zero to lock it against free.

Signed-off-by: Jason Gunthorpe 


You can add
Reviewed-by: Ralph Campbell 


---
v2:
- Spell 'free' properly (Jerome/Ralph)
---
  include/linux/hmm.h |  1 +
  mm/hmm.c| 25 +++--
  2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 092f0234bfe917..688c5ca7068795 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -102,6 +102,7 @@ struct hmm {
struct mmu_notifier mmu_notifier;
struct rw_semaphore mirrors_sem;
wait_queue_head_t   wq;
+   struct rcu_head rcu;
longnotifiers;
booldead;
  };
diff --git a/mm/hmm.c b/mm/hmm.c
index 8e7403f081f44a..547002f56a163d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -113,6 +113,11 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
return NULL;
  }
  
+static void hmm_free_rcu(struct rcu_head *rcu)

+{
+   kfree(container_of(rcu, struct hmm, rcu));
+}
+
  static void hmm_free(struct kref *kref)
  {
struct hmm *hmm = container_of(kref, struct hmm, kref);
@@ -125,7 +130,7 @@ static void hmm_free(struct kref *kref)
mm->hmm = NULL;
spin_unlock(>page_table_lock);
  
-	kfree(hmm);

+   mmu_notifier_call_srcu(>rcu, hmm_free_rcu);
  }
  
  static inline void hmm_put(struct hmm *hmm)

@@ -153,10 +158,14 @@ void hmm_mm_destroy(struct mm_struct *mm)
  
  static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)

  {
-   struct hmm *hmm = mm_get_hmm(mm);
+   struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
struct hmm_range *range;
  
+	/* hmm is in progress to free */

+   if (!kref_get_unless_zero(>kref))
+   return;
+
/* Report this HMM as dying. */
hmm->dead = true;
  
@@ -194,13 +203,15 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)

  static int hmm_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *nrange)
  {
-   struct hmm *hmm = mm_get_hmm(nrange->mm);
+   struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
struct hmm_update update;
struct hmm_range *range;
int ret = 0;
  
-	VM_BUG_ON(!hmm);

+   /* hmm is in progress to free */
+   if (!kref_get_unless_zero(>kref))
+   return 0;
  
  	update.start = nrange->start;

update.end = nrange->end;
@@ -245,9 +256,11 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
  static void hmm_invalidate_range_end(struct mmu_notifier *mn,
const struct mmu_notifier_range *nrange)
  {
-   struct hmm *hmm = mm_get_hmm(nrange->mm);
+   struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
  
-	VM_BUG_ON(!hmm);

+   /* hmm is in progress to free */
+   if (!kref_get_unless_zero(>kref))
+   return;
  
  	mutex_lock(>lock);

hmm->notifiers--;


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 07/11] mm/hmm: Use lockdep instead of comments

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

So we can check locking at runtime.

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 


Reviewed-by: Ralph Campbell 


---
v2
- Fix missing & in lockdeps (Jason)
---
  mm/hmm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index f67ba32983d9f1..c702cd72651b53 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -254,11 +254,11 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops 
= {
   *
   * To start mirroring a process address space, the device driver must register
   * an HMM mirror struct.
- *
- * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
   */
  int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
  {
+   lockdep_assert_held_exclusive(>mmap_sem);
+
/* Sanity check */
if (!mm || !mirror || !mirror->ops)
return -EINVAL;


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 05/11] mm/hmm: Remove duplicate condition test before wait_event_timeout

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

The wait_event_timeout macro already tests the condition as its first
action, so there is no reason to open code another version of this, all
that does is skip the might_sleep() debugging in common cases, which is
not helpful.

Further, based on prior patches, we can now simplify the required condition
test:
  - If range is valid memory then so is range->hmm
  - If hmm_release() has run then range->valid is set to false
at the same time as dead, so no reason to check both.
  - A valid hmm has a valid hmm->mm.

Also, add the READ_ONCE for range->valid as there is no lock held here.

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 
---
  include/linux/hmm.h | 12 ++--
  1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4ee3acabe5ed22..2ab35b40992b24 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -218,17 +218,9 @@ static inline unsigned long hmm_range_page_size(const 
struct hmm_range *range)
  static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
  unsigned long timeout)
  {
-   /* Check if mm is dead ? */
-   if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
-   range->valid = false;
-   return false;
-   }
-   if (range->valid)
-   return true;
-   wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+   wait_event_timeout(range->hmm->wq, range->valid,
   msecs_to_jiffies(timeout));
-   /* Return current valid status just in case we get lucky */
-   return range->valid;
+   return READ_ONCE(range->valid);
  }
  
  /*




Since we are simplifying things, perhaps we should consider merging
hmm_range_wait_until_valid() into hmm_range_register() and
removing hmm_range_wait_until_valid() since the pattern
is to always call the two together.

In any case, this looks OK to me so you can add
Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 10/11] mm/hmm: Do not use list*_rcu() for hmm->ranges

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

This list is always read and written while holding hmm->lock so there is
no need for the confusing _rcu annotations.

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 


Reviewed-by: Ralph Campbell 


---
  mm/hmm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index c2fecb3ecb11e1..709d138dd49027 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -911,7 +911,7 @@ int hmm_range_register(struct hmm_range *range,
mutex_lock(>lock);
  
  	range->hmm = hmm;

-   list_add_rcu(>list, >ranges);
+   list_add(>list, >ranges);
  
  	/*

 * If there are any concurrent notifiers we have to wait for them for
@@ -941,7 +941,7 @@ void hmm_range_unregister(struct hmm_range *range)
return;
  
  	mutex_lock(>lock);

-   list_del_rcu(>list);
+   list_del(>list);
mutex_unlock(>lock);
  
  	/* Drop reference taken by hmm_range_register() */



___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 02/11] mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

Ralph observes that hmm_range_register() can only be called by a driver
while a mirror is registered. Make this clear in the API by passing in the
mirror structure as a parameter.

This also simplifies understanding the lifetime model for struct hmm, as
the hmm pointer must be valid as part of a registered mirror so all we
need in hmm_register_range() is a simple kref_get.

Suggested-by: Ralph Campbell 
Signed-off-by: Jason Gunthorpe 


You might CC Ben for the nouveau part.
CC: Ben Skeggs 

Reviewed-by: Ralph Campbell 



---
v2
- Include the oneline patch to nouveau_svm.c
---
  drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
  include/linux/hmm.h   |  7 ---
  mm/hmm.c  | 15 ++-
  3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 93ed43c413f0bb..8c92374afcf227 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
range.values = nouveau_svm_pfn_values;
range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
  again:
-   ret = hmm_vma_fault(, true);
+   ret = hmm_vma_fault(>mirror, , true);
if (ret == 0) {
mutex_lock(>mutex);
if (!hmm_vma_range_done()) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 688c5ca7068795..2d519797cb134a 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror 
*mirror)
   * Please see Documentation/vm/hmm.rst for how to use the range API.
   */
  int hmm_range_register(struct hmm_range *range,
-  struct mm_struct *mm,
+  struct hmm_mirror *mirror,
   unsigned long start,
   unsigned long end,
   unsigned page_shift);
@@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range 
*range)
  }
  
  /* This is a temporary helper to avoid merge conflict between trees. */

-static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+static inline int hmm_vma_fault(struct hmm_mirror *mirror,
+   struct hmm_range *range, bool block)
  {
long ret;
  
@@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block)

range->default_flags = 0;
range->pfn_flags_mask = -1UL;
  
-	ret = hmm_range_register(range, range->vma->vm_mm,

+   ret = hmm_range_register(range, mirror,
 range->start, range->end,
 PAGE_SHIFT);
if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index 547002f56a163d..8796447299023c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range,
   * Track updates to the CPU page table see include/linux/hmm.h
   */
  int hmm_range_register(struct hmm_range *range,
-  struct mm_struct *mm,
+  struct hmm_mirror *mirror,
   unsigned long start,
   unsigned long end,
   unsigned page_shift)
  {
unsigned long mask = ((1UL << page_shift) - 1UL);
-   struct hmm *hmm;
+   struct hmm *hmm = mirror->hmm;
  
  	range->valid = false;

range->hmm = NULL;
@@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range,
range->start = start;
range->end = end;
  
-	hmm = hmm_get_or_create(mm);

-   if (!hmm)
-   return -EFAULT;
-
/* Check if hmm_mm_destroy() was call. */
-   if (hmm->mm == NULL || hmm->dead) {
-   hmm_put(hmm);
+   if (hmm->mm == NULL || hmm->dead)
return -EFAULT;
-   }
+
+   range->hmm = hmm;
+   kref_get(>kref);
  
  	/* Initialize range to track CPU page table updates. */

mutex_lock(>lock);


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 05/11] mm/hmm: Remove duplicate condition test before wait_event_timeout

2019-06-09 Thread Ralph Campbell



On 6/7/19 1:44 PM, Jason Gunthorpe wrote:

On Fri, Jun 07, 2019 at 01:21:12PM -0700, Ralph Campbell wrote:


What I want to get to is a pattern like this:

pagefault():

 hmm_range_register();
again:
 /* On the slow path, if we appear to be live locked then we get
the write side of mmap_sem which will break the live lock,
otherwise this gets the read lock */
 if (hmm_range_start_and_lock())
   goto err;

 lockdep_assert_held(range->mm->mmap_sem);

 // Optional: Avoid useless expensive work
 if (hmm_range_needs_retry())
goto again;
 hmm_range_(touch vmas)

 take_lock(driver->update);
 if (hmm_range_end() {
 release_lock(driver->update);
 goto again;
 }
 // Finish driver updates
 release_lock(driver->update);

 // Releases mmap_sem
 hmm_range_unregister_and_unlock();

What do you think?

Is it clear?

Jason



Are you talking about acquiring mmap_sem in hmm_range_start_and_lock()?
Usually, the fault code has to lock mmap_sem for read in order to
call find_vma() so it can set range.vma.



If HMM drops mmap_sem - which I don't think it should, just return an
error to tell the caller to drop mmap_sem and retry - the find_vma()
will need to be repeated as well.


Overall I don't think it makes a lot of sense to sleep for retry in
hmm_range_start_and_lock() while holding mmap_sem. It would be better
to drop that lock, sleep, then re-acquire it as part of the hmm logic.

The find_vma should be done inside the critical section created by
hmm_range_start_and_lock(), not before it. If we are retrying then we
already slept and the additional CPU cost to repeat the find_vma is
immaterial, IMHO?

Do you see a reason why the find_vma() ever needs to be before the
'again' in my above example? range.vma does not need to be set for
range_register.


Yes, for the GPU case, there can be many faults in an event queue
and the goal is to try to handle more than one page at a time.
The vma is needed to limit the amount of coalescing and checking
for pages that could be speculatively migrated or mapped.


I'm also not sure about acquiring the mmap_sem for write as a way to
mitigate thrashing. It seems to me that if a device and a CPU are
both faulting on the same page,


One of the reasons to prefer this approach is that it means we don't
need to keep track of which ranges we are faulting, and if there is a
lot of *unrelated* fault activity (unlikely?) we can resolve it using
mmap sem instead of this elaborate ranges scheme and related
locking.

This would reduce the overall work in the page fault and
invalidate_start/end paths for the common uncontended cases.


some sort of backoff delay is needed to let one side or the other
make some progress.


What the write side of the mmap_sem would do is force the CPU and
device to cleanly take turns. Once the device pages are registered
under the write side the CPU will have to wait in invalidate_start for
the driver to complete a shootdown, then the whole thing starts all
over again.

It is certainly imaginable something could have a 'min life' timer for
a device mapping and hold mm invalidate_start, and device pagefault
for that min time to promote better sharing.

But, if we don't use the mmap_sem then we can livelock and the device
will see an unrecoverable error from the timeout which means we have
risk that under load the system will simply obscurely fail. This seems
unacceptable to me..

Particularly since for the ODP use case the issue is not thrashing
migration as a GPU might have, but simple system stability under swap
load. We do not want the ODP pagefault to permanently fail due to
timeout if the VMA is still valid..

Jason



OK, I understand.
If you come up with a set of changes, I can try testing them.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 05/11] mm/hmm: Remove duplicate condition test before wait_event_timeout

2019-06-09 Thread Ralph Campbell



On 6/7/19 12:13 PM, Jason Gunthorpe wrote:

On Fri, Jun 07, 2019 at 12:01:45PM -0700, Ralph Campbell wrote:


On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

The wait_event_timeout macro already tests the condition as its first
action, so there is no reason to open code another version of this, all
that does is skip the might_sleep() debugging in common cases, which is
not helpful.

Further, based on prior patches, we can now simplify the required condition
test:
   - If range is valid memory then so is range->hmm
   - If hmm_release() has run then range->valid is set to false
 at the same time as dead, so no reason to check both.
   - A valid hmm has a valid hmm->mm.

Also, add the READ_ONCE for range->valid as there is no lock held here.

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 
   include/linux/hmm.h | 12 ++--
   1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4ee3acabe5ed22..2ab35b40992b24 100644
+++ b/include/linux/hmm.h
@@ -218,17 +218,9 @@ static inline unsigned long hmm_range_page_size(const 
struct hmm_range *range)
   static inline bool hmm_range_wait_until_valid(struct hmm_range *range,
  unsigned long timeout)
   {
-   /* Check if mm is dead ? */
-   if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) {
-   range->valid = false;
-   return false;
-   }
-   if (range->valid)
-   return true;
-   wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead,
+   wait_event_timeout(range->hmm->wq, range->valid,
   msecs_to_jiffies(timeout));
-   /* Return current valid status just in case we get lucky */
-   return range->valid;
+   return READ_ONCE(range->valid);
   }
   /*



Since we are simplifying things, perhaps we should consider merging
hmm_range_wait_until_valid() into hmm_range_register() and
removing hmm_range_wait_until_valid() since the pattern
is to always call the two together.


? the hmm.rst shows the hmm_range_wait_until_valid being called in the
(ret == -EAGAIN) path. It is confusing because it should really just
have the again label moved up above hmm_range_wait_until_valid() as
even if we get the driver lock it could still be a long wait for the
colliding invalidation to clear.

What I want to get to is a pattern like this:

pagefault():

hmm_range_register();
again:
/* On the slow path, if we appear to be live locked then we get
   the write side of mmap_sem which will break the live lock,
   otherwise this gets the read lock */
if (hmm_range_start_and_lock())
  goto err;

lockdep_assert_held(range->mm->mmap_sem);

// Optional: Avoid useless expensive work
if (hmm_range_needs_retry())
   goto again;
hmm_range_(touch vmas)

take_lock(driver->update);
if (hmm_range_end() {
release_lock(driver->update);
goto again;
}
// Finish driver updates
release_lock(driver->update);

// Releases mmap_sem
hmm_range_unregister_and_unlock();

What do you think?

Is it clear?

Jason



Are you talking about acquiring mmap_sem in hmm_range_start_and_lock()?
Usually, the fault code has to lock mmap_sem for read in order to
call find_vma() so it can set range.vma.
If HMM drops mmap_sem - which I don't think it should, just return an
error to tell the caller to drop mmap_sem and retry - the find_vma()
will need to be repeated as well.
I'm also not sure about acquiring the mmap_sem for write as a way to
mitigate thrashing. It seems to me that if a device and a CPU are
both faulting on the same page, some sort of backoff delay is needed
to let one side or the other make some progress.

Thrashing mitigation and how migrate_vma() plays in this is a
deep topic for thought.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 06/11] mm/hmm: Hold on to the mmget for the lifetime of the range

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

Range functions like hmm_range_snapshot() and hmm_range_fault() call
find_vma, which requires holding the mmget() and the mmap_sem for the mm.

Make this simpler for the callers by holding the mmget() inside the range
for the lifetime of the range. Other functions that accept a range should
only be called if the range is registered.

This has the side effect of directly preventing hmm_release() from
happening while a range is registered. That means hmm->dead cannot be
true during the lifetime of the range, so remove dead and
hmm_mirror_mm_is_alive() entirely.

Signed-off-by: Jason Gunthorpe 


Looks good to me.
Reviewed-by: Ralph Campbell 


---
v2:
  - Use Jerome's idea of just holding the mmget() for the range lifetime,
rework the patch to use that as a simplification to remove dead in
one step
---
  include/linux/hmm.h | 26 --
  mm/hmm.c| 28 ++--
  2 files changed, 10 insertions(+), 44 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 2ab35b40992b24..0e20566802967a 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -91,7 +91,6 @@
   * @mirrors_sem: read/write semaphore protecting the mirrors list
   * @wq: wait queue for user waiting on a range invalidation
   * @notifiers: count of active mmu notifiers
- * @dead: is the mm dead ?
   */
  struct hmm {
struct mm_struct*mm;
@@ -104,7 +103,6 @@ struct hmm {
wait_queue_head_t   wq;
struct rcu_head rcu;
longnotifiers;
-   booldead;
  };
  
  /*

@@ -469,30 +467,6 @@ struct hmm_mirror {
  int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm);
  void hmm_mirror_unregister(struct hmm_mirror *mirror);
  
-/*

- * hmm_mirror_mm_is_alive() - test if mm is still alive
- * @mirror: the HMM mm mirror for which we want to lock the mmap_sem
- * Return: false if the mm is dead, true otherwise
- *
- * This is an optimization, it will not always accurately return false if the
- * mm is dead; i.e., there can be false negatives (process is being killed but
- * HMM is not yet informed of that). It is only intended to be used to optimize
- * out cases where the driver is about to do something time consuming and it
- * would be better to skip it if the mm is dead.
- */
-static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror)
-{
-   struct mm_struct *mm;
-
-   if (!mirror || !mirror->hmm)
-   return false;
-   mm = READ_ONCE(mirror->hmm->mm);
-   if (mirror->hmm->dead || !mm)
-   return false;
-
-   return true;
-}
-
  /*
   * Please see Documentation/vm/hmm.rst for how to use the range API.
   */
diff --git a/mm/hmm.c b/mm/hmm.c
index dc30edad9a8a02..f67ba32983d9f1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -80,7 +80,6 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
mutex_init(>lock);
kref_init(>kref);
hmm->notifiers = 0;
-   hmm->dead = false;
hmm->mm = mm;
  
  	hmm->mmu_notifier.ops = _mmu_notifier_ops;

@@ -124,20 +123,17 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
  {
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
-   struct hmm_range *range;
  
  	/* hmm is in progress to free */

if (!kref_get_unless_zero(>kref))
return;
  
-	/* Report this HMM as dying. */

-   hmm->dead = true;
-
-   /* Wake-up everyone waiting on any range. */
mutex_lock(>lock);
-   list_for_each_entry(range, >ranges, list)
-   range->valid = false;
-   wake_up_all(>wq);
+   /*
+* Since hmm_range_register() holds the mmget() lock hmm_release() is
+* prevented as long as a range exists.
+*/
+   WARN_ON(!list_empty(>ranges));
mutex_unlock(>lock);
  
  	down_write(>mirrors_sem);

@@ -909,8 +905,8 @@ int hmm_range_register(struct hmm_range *range,
range->start = start;
range->end = end;
  
-	/* Check if hmm_mm_destroy() was call. */

-   if (hmm->mm == NULL || hmm->dead)
+   /* Prevent hmm_release() from running while the range is valid */
+   if (!mmget_not_zero(hmm->mm))
return -EFAULT;
  
  	range->hmm = hmm;

@@ -955,6 +951,7 @@ void hmm_range_unregister(struct hmm_range *range)
  
  	/* Drop reference taken by hmm_range_register() */

range->valid = false;
+   mmput(hmm->mm);
hmm_put(hmm);
range->hmm = NULL;
  }
@@ -982,10 +979,7 @@ long hmm_range_snapshot(struct hmm_range *range)
struct vm_area_struct *vma;
struct mm_walk mm_walk;
  
-	/* Check if hmm_mm_destroy() was call. */

-   if (hmm->mm == NULL || hmm-&g

Re: [PATCH v2 12/11] mm/hmm: Fix error flows in hmm_invalidate_range_start

2019-06-09 Thread Ralph Campbell



On 6/7/19 9:05 AM, Jason Gunthorpe wrote:

If the trylock on the hmm->mirrors_sem fails the function will return
without decrementing the notifiers that were previously incremented. Since
the caller will not call invalidate_range_end() on EAGAIN this will result
in notifiers becoming permanently incremented and deadlock.

If the sync_cpu_device_pagetables() required blocking the function will
not return EAGAIN even though the device continues to touch the
pages. This is a violation of the mmu notifier contract.

Switch, and rename, the ranges_lock to a spin lock so we can reliably
obtain it without blocking during error unwind.

The error unwind is necessary since the notifiers count must be held
incremented across the call to sync_cpu_device_pagetables() as we cannot
allow the range to become marked valid by a parallel
invalidate_start/end() pair while doing sync_cpu_device_pagetables().

Signed-off-by: Jason Gunthorpe 


Reviewed-by: Ralph Campbell 


---
  include/linux/hmm.h |  2 +-
  mm/hmm.c| 77 +++--
  2 files changed, 48 insertions(+), 31 deletions(-)

I almost lost this patch - it is part of the series, hasn't been
posted before, and wasn't sent with the rest, sorry.

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index bf013e96525771..0fa8ea34ccef6d 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -86,7 +86,7 @@
  struct hmm {
struct mm_struct*mm;
struct kref kref;
-   struct mutexlock;
+   spinlock_t  ranges_lock;
struct list_headranges;
struct list_headmirrors;
struct mmu_notifier mmu_notifier;
diff --git a/mm/hmm.c b/mm/hmm.c
index 4215edf737ef5b..10103a24e9b7b3 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -68,7 +68,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
init_rwsem(>mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(>ranges);
-   mutex_init(>lock);
+   spin_lock_init(>ranges_lock);
kref_init(>kref);
hmm->notifiers = 0;
hmm->mm = mm;
@@ -114,18 +114,19 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
  {
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
+   unsigned long flags;
  
  	/* Bail out if hmm is in the process of being freed */

if (!kref_get_unless_zero(>kref))
return;
  
-	mutex_lock(>lock);

+   spin_lock_irqsave(>ranges_lock, flags);
/*
 * Since hmm_range_register() holds the mmget() lock hmm_release() is
 * prevented as long as a range exists.
 */
WARN_ON(!list_empty(>ranges));
-   mutex_unlock(>lock);
+   spin_unlock_irqrestore(>ranges_lock, flags);
  
  	down_read(>mirrors_sem);

list_for_each_entry(mirror, >mirrors, list) {
@@ -141,6 +142,23 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
hmm_put(hmm);
  }
  
+static void notifiers_decrement(struct hmm *hmm)

+{
+   lockdep_assert_held(>ranges_lock);
+
+   hmm->notifiers--;
+   if (!hmm->notifiers) {
+   struct hmm_range *range;
+
+   list_for_each_entry(range, >ranges, list) {
+   if (range->valid)
+   continue;
+   range->valid = true;
+   }


This just effectively sets all ranges to valid.
I'm not sure that is best.
Shouldn't hmm_range_register() start with range.valid = true and
then hmm_invalidate_range_start() set affected ranges to false?
Then this becomes just wake_up_all() if --notifiers == 0 and
hmm_range_wait_until_valid() should wait for notifiers == 0.
Otherwise, range.valid doesn't really mean it's valid.


+   wake_up_all(>wq);
+   }
+}
+
  static int hmm_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *nrange)
  {
@@ -148,6 +166,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
struct hmm_mirror *mirror;
struct hmm_update update;
struct hmm_range *range;
+   unsigned long flags;
int ret = 0;
  
  	if (!kref_get_unless_zero(>kref))

@@ -158,12 +177,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
update.event = HMM_UPDATE_INVALIDATE;
update.blockable = mmu_notifier_range_blockable(nrange);
  
-	if (mmu_notifier_range_blockable(nrange))

-   mutex_lock(>lock);
-   else if (!mutex_trylock(>lock)) {
-   ret = -EAGAIN;
-   goto out;
-   }
+   spin_lock_irqsave(>ranges_lock, flags);
hmm->notifiers++;
list_for_each_entry(range, >ranges, list) {
if (update.end < range->start || update.start >= range->

Re: [PATCH v2 hmm 02/11] mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register

2019-06-09 Thread Ralph Campbell



On 6/7/19 11:24 AM, Ralph Campbell wrote:


On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

Ralph observes that hmm_range_register() can only be called by a driver
while a mirror is registered. Make this clear in the API by passing in the
mirror structure as a parameter.

This also simplifies understanding the lifetime model for struct hmm, as
the hmm pointer must be valid as part of a registered mirror so all we
need in hmm_range_register() is a simple kref_get.

Suggested-by: Ralph Campbell 
Signed-off-by: Jason Gunthorpe 


You might CC Ben for the nouveau part.
CC: Ben Skeggs 

Reviewed-by: Ralph Campbell 



---
v2
- Include the oneline patch to nouveau_svm.c
---
  drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
  include/linux/hmm.h   |  7 ---
  mm/hmm.c  | 15 ++-
  3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c

index 93ed43c413f0bb..8c92374afcf227 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
  range.values = nouveau_svm_pfn_values;
  range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
  again:
-    ret = hmm_vma_fault(, true);
+    ret = hmm_vma_fault(>mirror, , true);
  if (ret == 0) {
  mutex_lock(>mutex);
  if (!hmm_vma_range_done()) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 688c5ca7068795..2d519797cb134a 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct 
hmm_mirror *mirror)

   * Please see Documentation/vm/hmm.rst for how to use the range API.
   */
  int hmm_range_register(struct hmm_range *range,
-   struct mm_struct *mm,
+   struct hmm_mirror *mirror,
 unsigned long start,
 unsigned long end,
 unsigned page_shift);
@@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct 
hmm_range *range)

  }
  /* This is a temporary helper to avoid merge conflict between trees. */
-static inline int hmm_vma_fault(struct hmm_range *range, bool block)
+static inline int hmm_vma_fault(struct hmm_mirror *mirror,
+    struct hmm_range *range, bool block)
  {
  long ret;
@@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range 
*range, bool block)

  range->default_flags = 0;
  range->pfn_flags_mask = -1UL;
-    ret = hmm_range_register(range, range->vma->vm_mm,
+    ret = hmm_range_register(range, mirror,
   range->start, range->end,
   PAGE_SHIFT);
  if (ret)
diff --git a/mm/hmm.c b/mm/hmm.c
index 547002f56a163d..8796447299023c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range,
   * Track updates to the CPU page table see include/linux/hmm.h
   */
  int hmm_range_register(struct hmm_range *range,
-   struct mm_struct *mm,
+   struct hmm_mirror *mirror,
 unsigned long start,
 unsigned long end,
 unsigned page_shift)
  {
  unsigned long mask = ((1UL << page_shift) - 1UL);
-    struct hmm *hmm;
+    struct hmm *hmm = mirror->hmm;
  range->valid = false;
  range->hmm = NULL;
@@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range,
  range->start = start;
  range->end = end;
-    hmm = hmm_get_or_create(mm);
-    if (!hmm)
-    return -EFAULT;
-
  /* Check if hmm_mm_destroy() was call. */
-    if (hmm->mm == NULL || hmm->dead) {
-    hmm_put(hmm);
+    if (hmm->mm == NULL || hmm->dead)
  return -EFAULT;
-    }
+
+    range->hmm = hmm;
+    kref_get(>kref);
  /* Initialize range to track CPU page table updates. */
  mutex_lock(>lock);



I forgot to add that I think you can delete the duplicate
"range->hmm = hmm;"
here between the mutex_lock/unlock.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 04/11] mm/hmm: Simplify hmm_get_or_create and make it reliable

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

As coded this function can false-fail in various racy situations. Make it
reliable by running only under the write side of the mmap_sem and avoiding
the false-failing compare/exchange pattern.

Also make the locking very easy to understand by only ever reading or
writing mm->hmm while holding the write side of the mmap_sem.

Signed-off-by: Jason Gunthorpe 


Reviewed-by: Ralph Campbell 


---
v2:
- Fix error unwind of mmgrab (Jerome)
- Use hmm local instead of 2nd container_of (Jerome)
---
  mm/hmm.c | 80 
  1 file changed, 29 insertions(+), 51 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index cc7c26fda3300e..dc30edad9a8a02 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -40,16 +40,6 @@
  #if IS_ENABLED(CONFIG_HMM_MIRROR)
  static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  
-static inline struct hmm *mm_get_hmm(struct mm_struct *mm)

-{
-   struct hmm *hmm = READ_ONCE(mm->hmm);
-
-   if (hmm && kref_get_unless_zero(>kref))
-   return hmm;
-
-   return NULL;
-}
-
  /**
   * hmm_get_or_create - register HMM against an mm (HMM internal)
   *
@@ -64,11 +54,20 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm)
   */
  static struct hmm *hmm_get_or_create(struct mm_struct *mm)
  {
-   struct hmm *hmm = mm_get_hmm(mm);
-   bool cleanup = false;
+   struct hmm *hmm;
  
-	if (hmm)

-   return hmm;
+   lockdep_assert_held_exclusive(>mmap_sem);
+
+   if (mm->hmm) {
+   if (kref_get_unless_zero(>hmm->kref))
+   return mm->hmm;
+   /*
+* The hmm is being freed by some other CPU and is pending a
+* RCU grace period, but this CPU can NULL now it since we
+* have the mmap_sem.
+*/
+   mm->hmm = NULL;
+   }
  
  	hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);

if (!hmm)
@@ -83,57 +82,36 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
hmm->notifiers = 0;
hmm->dead = false;
hmm->mm = mm;
-   mmgrab(hmm->mm);
-
-   spin_lock(>page_table_lock);
-   if (!mm->hmm)
-   mm->hmm = hmm;
-   else
-   cleanup = true;
-   spin_unlock(>page_table_lock);
  
-	if (cleanup)

-   goto error;
-
-   /*
-* We should only get here if hold the mmap_sem in write mode ie on
-* registration of first mirror through hmm_mirror_register()
-*/
hmm->mmu_notifier.ops = _mmu_notifier_ops;
-   if (__mmu_notifier_register(>mmu_notifier, mm))
-   goto error_mm;
+   if (__mmu_notifier_register(>mmu_notifier, mm)) {
+   kfree(hmm);
+   return NULL;
+   }
  
+	mmgrab(hmm->mm);

+   mm->hmm = hmm;
return hmm;
-
-error_mm:
-   spin_lock(>page_table_lock);
-   if (mm->hmm == hmm)
-   mm->hmm = NULL;
-   spin_unlock(>page_table_lock);
-error:
-   mmdrop(hmm->mm);
-   kfree(hmm);
-   return NULL;
  }
  
  static void hmm_free_rcu(struct rcu_head *rcu)

  {
-   kfree(container_of(rcu, struct hmm, rcu));
+   struct hmm *hmm = container_of(rcu, struct hmm, rcu);
+
+   down_write(>mm->mmap_sem);
+   if (hmm->mm->hmm == hmm)
+   hmm->mm->hmm = NULL;
+   up_write(>mm->mmap_sem);
+   mmdrop(hmm->mm);
+
+   kfree(hmm);
  }
  
  static void hmm_free(struct kref *kref)

  {
struct hmm *hmm = container_of(kref, struct hmm, kref);
-   struct mm_struct *mm = hmm->mm;
-
-   mmu_notifier_unregister_no_release(>mmu_notifier, mm);
  
-	spin_lock(>page_table_lock);

-   if (mm->hmm == hmm)
-   mm->hmm = NULL;
-   spin_unlock(>page_table_lock);
-
-   mmdrop(hmm->mm);
+   mmu_notifier_unregister_no_release(>mmu_notifier, hmm->mm);
mmu_notifier_call_srcu(>rcu, hmm_free_rcu);
  }
  


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[RFC] mm/hmm: pass mmu_notifier_range to sync_cpu_device_pagetables

2019-06-09 Thread Ralph Campbell

HMM defines its own struct hmm_update which is passed to the
sync_cpu_device_pagetables() callback function. This is
sufficient when the only action is to invalidate. However,
a device may want to know the reason for the invalidation and
be able to see the new permissions on a range, update device access
rights or range statistics. Since sync_cpu_device_pagetables()
can be called from try_to_unmap(), the mmap_sem may not be held
and find_vma() is not safe to be called.
Pass the struct mmu_notifier_range to sync_cpu_device_pagetables()
to allow the full invalidation information to be used.

Signed-off-by: Ralph Campbell 
---

I'm sending this out now since we are updating many of the HMM APIs
and I think it will be useful.


 drivers/gpu/drm/nouveau/nouveau_svm.c |  4 ++--
 include/linux/hmm.h   | 27 ++-
 mm/hmm.c  | 13 -
 3 files changed, 8 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 8c92374afcf2..c34b98fafe2f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 
start, u64 limit)
 
 static int
 nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
-   const struct hmm_update *update)
+   const struct mmu_notifier_range *update)
 {
struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
unsigned long start = update->start;
unsigned long limit = update->end;
 
-   if (!update->blockable)
+   if (!mmu_notifier_range_blockable(update))
return -EAGAIN;
 
SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 0fa8ea34ccef..07a2d38fde34 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -377,29 +377,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct 
hmm_range *range,
 
 struct hmm_mirror;
 
-/*
- * enum hmm_update_event - type of update
- * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
- */
-enum hmm_update_event {
-   HMM_UPDATE_INVALIDATE,
-};
-
-/*
- * struct hmm_update - HMM update information for callback
- *
- * @start: virtual start address of the range to update
- * @end: virtual end address of the range to update
- * @event: event triggering the update (what is happening)
- * @blockable: can the callback block/sleep ?
- */
-struct hmm_update {
-   unsigned long start;
-   unsigned long end;
-   enum hmm_update_event event;
-   bool blockable;
-};
-
 /*
  * struct hmm_mirror_ops - HMM mirror device operations callback
  *
@@ -420,7 +397,7 @@ struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
 *
 * @mirror: pointer to struct hmm_mirror
-* @update: update information (see struct hmm_update)
+* @update: update information (see struct mmu_notifier_range)
 * Return: -EAGAIN if update.blockable false and callback need to
 *  block, 0 otherwise.
 *
@@ -434,7 +411,7 @@ struct hmm_mirror_ops {
 * synchronous call.
 */
int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
- const struct hmm_update *update);
+   const struct mmu_notifier_range *update);
 };
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 9aad3550f2bb..b49a43712554 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -164,7 +164,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
 {
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
-   struct hmm_update update;
struct hmm_range *range;
unsigned long flags;
int ret = 0;
@@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
if (!kref_get_unless_zero(>kref))
return 0;
 
-   update.start = nrange->start;
-   update.end = nrange->end;
-   update.event = HMM_UPDATE_INVALIDATE;
-   update.blockable = mmu_notifier_range_blockable(nrange);
-
spin_lock_irqsave(>ranges_lock, flags);
hmm->notifiers++;
list_for_each_entry(range, >ranges, list) {
-   if (update.end < range->start || update.start >= range->end)
+   if (nrange->end < range->start || nrange->start >= range->end)
continue;
 
range->valid = false;
@@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
list_for_each_entry(mirror, >mirrors, list) {
int rc;
 
-   rc = mirror->ops->sync_cpu_device_pagetables(mirror,

Re: [PATCH v2 hmm 03/11] mm/hmm: Hold a mmgrab from hmm to mm

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

So long a a struct hmm pointer exists, so should the struct mm it is


s/a a/as a/


linked to. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it
once the hmm refcount goes to zero.

Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL
mm->hmm delete the hmm_mm_destroy().

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 


Reviewed-by: Ralph Campbell 


---
v2:
  - Fix error unwind paths in hmm_get_or_create (Jerome/Jason)
---
  include/linux/hmm.h |  3 ---
  kernel/fork.c   |  1 -
  mm/hmm.c| 22 --
  3 files changed, 4 insertions(+), 22 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 2d519797cb134a..4ee3acabe5ed22 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -586,14 +586,11 @@ static inline int hmm_vma_fault(struct hmm_mirror *mirror,
  }
  
  /* Below are for HMM internal use only! Not to be used by device driver! */

-void hmm_mm_destroy(struct mm_struct *mm);
-
  static inline void hmm_mm_init(struct mm_struct *mm)
  {
mm->hmm = NULL;
  }
  #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-static inline void hmm_mm_destroy(struct mm_struct *mm) {}
  static inline void hmm_mm_init(struct mm_struct *mm) {}
  #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
  
diff --git a/kernel/fork.c b/kernel/fork.c

index b2b87d450b80b5..588c768ae72451 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm)
WARN_ON_ONCE(mm == current->active_mm);
mm_free_pgd(mm);
destroy_context(mm);
-   hmm_mm_destroy(mm);
mmu_notifier_mm_destroy(mm);
check_mm(mm);
put_user_ns(mm->user_ns);
diff --git a/mm/hmm.c b/mm/hmm.c
index 8796447299023c..cc7c26fda3300e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -29,6 +29,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include 
@@ -82,6 +83,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
hmm->notifiers = 0;
hmm->dead = false;
hmm->mm = mm;
+   mmgrab(hmm->mm);
  
  	spin_lock(>page_table_lock);

if (!mm->hmm)
@@ -109,6 +111,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
mm->hmm = NULL;
spin_unlock(>page_table_lock);
  error:
+   mmdrop(hmm->mm);
kfree(hmm);
return NULL;
  }
@@ -130,6 +133,7 @@ static void hmm_free(struct kref *kref)
mm->hmm = NULL;
spin_unlock(>page_table_lock);
  
+	mmdrop(hmm->mm);

mmu_notifier_call_srcu(>rcu, hmm_free_rcu);
  }
  
@@ -138,24 +142,6 @@ static inline void hmm_put(struct hmm *hmm)

kref_put(>kref, hmm_free);
  }
  
-void hmm_mm_destroy(struct mm_struct *mm)

-{
-   struct hmm *hmm;
-
-   spin_lock(>page_table_lock);
-   hmm = mm_get_hmm(mm);
-   mm->hmm = NULL;
-   if (hmm) {
-   hmm->mm = NULL;
-   hmm->dead = true;
-   spin_unlock(>page_table_lock);
-   hmm_put(hmm);
-   return;
-   }
-
-   spin_unlock(>page_table_lock);
-}
-
  static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
  {
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 08/11] mm/hmm: Remove racy protection against double-unregistration

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

No other register/unregister kernel API attempts to provide this kind of
protection as it is inherently racy, so just drop it.

Callers should provide their own protection, it appears nouveau already
does, but just in case drop a debugging POISON.

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 


Reviewed-by: Ralph Campbell 


---
  mm/hmm.c | 9 ++---
  1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index c702cd72651b53..6802de7080d172 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -284,18 +284,13 @@ EXPORT_SYMBOL(hmm_mirror_register);
   */
  void hmm_mirror_unregister(struct hmm_mirror *mirror)
  {
-   struct hmm *hmm = READ_ONCE(mirror->hmm);
-
-   if (hmm == NULL)
-   return;
+   struct hmm *hmm = mirror->hmm;
  
  	down_write(>mirrors_sem);

list_del_init(>list);
-   /* To protect us against double unregister ... */
-   mirror->hmm = NULL;
up_write(>mirrors_sem);
-
hmm_put(hmm);
+   memset(>hmm, POISON_INUSE, sizeof(mirror->hmm));
  }
  EXPORT_SYMBOL(hmm_mirror_unregister);
  


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 11/11] mm/hmm: Remove confusing comment and logic from hmm_release

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

hmm_release() is called exactly once per hmm. ops->release() cannot
accidentally trigger any action that would recurse back onto
hmm->mirrors_sem.

This fixes a use-after-free race of the form:

       CPU0                                   CPU1
                                           hmm_release()
                                             up_write(&hmm->mirrors_sem);
 hmm_mirror_unregister(mirror)
  down_write(&hmm->mirrors_sem);
  up_write(&hmm->mirrors_sem);
  kfree(mirror)
                                             mirror->ops->release(mirror)

The only user we have today for ops->release is an empty function, so this
is unambiguously safe.

As a consequence of plugging this race drivers are not allowed to
register/unregister mirrors from within a release op.

Signed-off-by: Jason Gunthorpe 


I agree with the analysis above but I'm not sure that release() will
always be an empty function. It might be more efficient to write back
all data migrated to a device "in one pass" instead of relying
on unmap_vmas() calling hmm_invalidate_range_start() per VMA.

I think the bigger issue is potential deadlocks while calling
sync_cpu_device_pagetables() and tasks calling hmm_mirror_unregister():

Say you have three threads:
- Thread A is in try_to_unmap(), either without holding mmap_sem or with
mmap_sem held for read.
- Thread B has some unrelated driver calling hmm_mirror_unregister().
This doesn't require mmap_sem.
- Thread C is about to call migrate_vma().

Thread A                    Thread B                  Thread C
try_to_unmap                hmm_mirror_unregister     migrate_vma
--------------------------  ------------------------  --------------------------
hmm_invalidate_range_start
down_read(mirrors_sem)
                            down_write(mirrors_sem)
                            // Blocked on A
                                                      device_lock
device_lock
// Blocked on C
                                                      migrate_vma()
                                                      hmm_invalidate_range_start
                                                      down_read(mirrors_sem)
                                                      // Blocked on B
                                                      // Deadlock

Perhaps we should consider using SRCU for walking the mirror->list?


---
  mm/hmm.c | 28 +---
  1 file changed, 9 insertions(+), 19 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 709d138dd49027..3a45dd3d778248 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -136,26 +136,16 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
WARN_ON(!list_empty(>ranges));
mutex_unlock(>lock);
  
-	down_write(>mirrors_sem);

-   mirror = list_first_entry_or_null(>mirrors, struct hmm_mirror,
- list);
-   while (mirror) {
-   list_del_init(>list);
-   if (mirror->ops->release) {
-   /*
-* Drop mirrors_sem so the release callback can wait
-* on any pending work that might itself trigger a
-* mmu_notifier callback and thus would deadlock with
-* us.
-*/
-   up_write(>mirrors_sem);
+   down_read(>mirrors_sem);
+   list_for_each_entry(mirror, >mirrors, list) {
+   /*
+* Note: The driver is not allowed to trigger
+* hmm_mirror_unregister() from this thread.
+*/
+   if (mirror->ops->release)
mirror->ops->release(mirror);
-   down_write(>mirrors_sem);
-   }
-   mirror = list_first_entry_or_null(>mirrors,
- struct hmm_mirror, list);
}
-   up_write(>mirrors_sem);
+   up_read(>mirrors_sem);
  
  	hmm_put(hmm);

  }
@@ -287,7 +277,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror)
struct hmm *hmm = mirror->hmm;
  
  	down_write(>mirrors_sem);

-   list_del_init(>list);
+   list_del(>list);
up_write(>mirrors_sem);
hmm_put(hmm);
memset(>hmm, POISON_INUSE, sizeof(mirror->hmm));


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 hmm 09/11] mm/hmm: Poison hmm_range during unregister

2019-06-09 Thread Ralph Campbell



On 6/6/19 11:44 AM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

Trying to misuse a range outside its lifetime is a kernel bug. Use WARN_ON
and poison bytes to detect this condition.

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Jérôme Glisse 


Reviewed-by: Ralph Campbell 


---
v2
- Keep range start/end valid after unregistration (Jerome)
---
  mm/hmm.c | 7 +--
  1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 6802de7080d172..c2fecb3ecb11e1 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -937,7 +937,7 @@ void hmm_range_unregister(struct hmm_range *range)
struct hmm *hmm = range->hmm;
  
  	/* Sanity check this really should not happen. */

-   if (hmm == NULL || range->end <= range->start)
+   if (WARN_ON(range->end <= range->start))
return;


WARN_ON() is definitely better than silent return but I wonder how
useful it is since the caller shouldn't be modifying the hmm_range
once it is registered. Other fields could be changed too...


mutex_lock(>lock);
@@ -948,7 +948,10 @@ void hmm_range_unregister(struct hmm_range *range)
range->valid = false;
mmput(hmm->mm);
hmm_put(hmm);
-   range->hmm = NULL;
+
+   /* The range is now invalid, leave it poisoned. */
+   range->valid = false;
+   memset(>hmm, POISON_INUSE, sizeof(range->hmm));
  }
  EXPORT_SYMBOL(hmm_range_unregister);
  


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [RFC] mm/hmm: pass mmu_notifier_range to sync_cpu_device_pagetables

2019-07-03 Thread Ralph Campbell



On 7/2/19 12:53 PM, Jason Gunthorpe wrote:

On Fri, Jun 07, 2019 at 05:14:52PM -0700, Ralph Campbell wrote:

HMM defines its own struct hmm_update which is passed to the
sync_cpu_device_pagetables() callback function. This is
sufficient when the only action is to invalidate. However,
a device may want to know the reason for the invalidation and
be able to see the new permissions on a range, update device access
rights or range statistics. Since sync_cpu_device_pagetables()
can be called from try_to_unmap(), the mmap_sem may not be held
and find_vma() is not safe to be called.
Pass the struct mmu_notifier_range to sync_cpu_device_pagetables()
to allow the full invalidation information to be used.

Signed-off-by: Ralph Campbell 
---

I'm sending this out now since we are updating many of the HMM APIs
and I think it will be useful.


This makes so much sense, I'd like to apply this in hmm.git, is there
any objection?

Jason


Not from me. :-)

Thanks!
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: hmm_range_fault related fixes and legacy API removal v2

2019-07-09 Thread Ralph Campbell



On 7/4/19 9:42 AM, Jason Gunthorpe wrote:

On Wed, Jul 03, 2019 at 03:02:08PM -0700, Christoph Hellwig wrote:

Hi Jérôme, Ben and Jason,

below is a series against the hmm tree which fixes up the mmap_sem
locking in nouveau and while at it also removes leftover legacy HMM APIs
only used by nouveau.

Changes since v1:
  - don't return the valid state from hmm_range_unregister
  - additional nouveau cleanups


Ralph, since most of this is nouveau could you contribute a
Tested-by? Thanks

Jason



I can test things fairly easily but with all the different patches,
conflicts, and personal git trees, can you specify the git tree
and branch with everything applied that you want me to test?
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 18/22] mm: return valid info from hmm_range_unregister

2019-07-04 Thread Ralph Campbell



On 6/30/19 11:20 PM, Christoph Hellwig wrote:

Checking range->valid is trivial and has no meaningful cost, but
nicely simplifies the fastpath in typical callers.  Also remove the
hmm_vma_range_done function, which now is a trivial wrapper around
hmm_range_unregister.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_svm.c |  2 +-
  include/linux/hmm.h   | 11 +--
  mm/hmm.c  |  6 +-
  3 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 8c92374afcf2..9d40114d7949 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -652,7 +652,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
ret = hmm_vma_fault(&svmm->mirror, &range, true);
if (ret == 0) {
mutex_lock(&svmm->mutex);
-   if (!hmm_vma_range_done(&range)) {
+   if (!hmm_range_unregister(&range)) {
mutex_unlock(&svmm->mutex);
goto again;
}
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 0fa8ea34ccef..4b185d286c3b 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -465,7 +465,7 @@ int hmm_range_register(struct hmm_range *range,
   unsigned long start,
   unsigned long end,
   unsigned page_shift);
-void hmm_range_unregister(struct hmm_range *range);
+bool hmm_range_unregister(struct hmm_range *range);
  long hmm_range_snapshot(struct hmm_range *range);
  long hmm_range_fault(struct hmm_range *range, bool block);
  long hmm_range_dma_map(struct hmm_range *range,
@@ -487,15 +487,6 @@ long hmm_range_dma_unmap(struct hmm_range *range,
   */
  #define HMM_RANGE_DEFAULT_TIMEOUT 1000
  
-/* This is a temporary helper to avoid merge conflict between trees. */

-static inline bool hmm_vma_range_done(struct hmm_range *range)
-{
-   bool ret = hmm_range_valid(range);
-
-   hmm_range_unregister(range);
-   return ret;
-}
-
  /* This is a temporary helper to avoid merge conflict between trees. */
  static inline int hmm_vma_fault(struct hmm_mirror *mirror,
struct hmm_range *range, bool block)
diff --git a/mm/hmm.c b/mm/hmm.c
index de35289df20d..c85ed7d4e2ce 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -920,11 +920,14 @@ EXPORT_SYMBOL(hmm_range_register);
   *
   * Range struct is used to track updates to the CPU page table after a call to
   * hmm_range_register(). See include/linux/hmm.h for how to use it.
+ *
+ * Returns if the range was still valid at the time of unregistering.


Since this is an exported function, we should have kernel-doc comments.
That is probably a separate patch but at least this line could be:
Return: True if the range was still valid at the time of unregistering.


   */
-void hmm_range_unregister(struct hmm_range *range)
+bool hmm_range_unregister(struct hmm_range *range)
  {
struct hmm *hmm = range->hmm;
unsigned long flags;
+   bool ret = range->valid;
  
  	spin_lock_irqsave(&hmm->ranges_lock, flags);

list_del_init(&range->list);
@@ -941,6 +944,7 @@ void hmm_range_unregister(struct hmm_range *range)
 */
range->valid = false;
memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
+   return ret;
  }
  EXPORT_SYMBOL(hmm_range_unregister);
  


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 20/22] mm: move hmm_vma_fault to nouveau

2019-07-04 Thread Ralph Campbell



On 6/30/19 11:20 PM, Christoph Hellwig wrote:

hmm_vma_fault is marked as a legacy API to get rid of, but quite suites
the current nouvea flow.  Move it to the only user in preparation for


I didn't quite parse the phrase "quite suites the current nouvea flow."
s/nouvea/nouveau/


fixing a locking bug involving caller and callee.

Signed-off-by: Christoph Hellwig 


I see where you are going with this and it
looks like straightforward code movement so,

Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_svm.c | 54 ++-
  include/linux/hmm.h   | 54 ---
  2 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 9d40114d7949..e831f4184a17 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -36,6 +36,13 @@
  #include 
  #include 
  
+/*

+ * When waiting for mmu notifiers we need some kind of time out otherwise we
+ * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to
+ * wait already.
+ */
+#define NOUVEAU_RANGE_FAULT_TIMEOUT 1000
+
  struct nouveau_svm {
struct nouveau_drm *drm;
struct mutex mutex;
@@ -475,6 +482,51 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm,
fault->inst, fault->addr, fault->access);
  }
  
+static int

+nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range,
+   bool block)
+{
+   long ret;
+
+   /*
+* With the old API the driver must set each individual entries with
+* the requested flags (valid, write, ...). So here we set the mask to
+* keep intact the entries provided by the driver and zero out the
+* default_flags.
+*/
+   range->default_flags = 0;
+   range->pfn_flags_mask = -1UL;
+
+   ret = hmm_range_register(range, mirror,
+range->start, range->end,
+PAGE_SHIFT);
+   if (ret)
+   return (int)ret;
+
+   if (!hmm_range_wait_until_valid(range, NOUVEAU_RANGE_FAULT_TIMEOUT)) {
+   /*
+* The mmap_sem was taken by driver we release it here and
+* returns -EAGAIN which correspond to mmap_sem have been
+* drop in the old API.
+*/
+   up_read(&range->vma->vm_mm->mmap_sem);
+   return -EAGAIN;
+   }
+
+   ret = hmm_range_fault(range, block);
+   if (ret <= 0) {
+   if (ret == -EBUSY || !ret) {
+   /* Same as above, drop mmap_sem to match old API. */
+   up_read(&range->vma->vm_mm->mmap_sem);
+   ret = -EBUSY;
+   } else if (ret == -EAGAIN)
+   ret = -EBUSY;
+   hmm_range_unregister(range);
+   return ret;
+   }
+   return 0;
+}
+
  static int
  nouveau_svm_fault(struct nvif_notify *notify)
  {
@@ -649,7 +701,7 @@ nouveau_svm_fault(struct nvif_notify *notify)
range.values = nouveau_svm_pfn_values;
range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT;
  again:
-   ret = hmm_vma_fault(&svmm->mirror, &range, true);
+   ret = nouveau_range_fault(&svmm->mirror, &range, true);
if (ret == 0) {
mutex_lock(&svmm->mutex);
if (!hmm_range_unregister(&range)) {
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 4b185d286c3b..3457cf9182e5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -478,60 +478,6 @@ long hmm_range_dma_unmap(struct hmm_range *range,
 dma_addr_t *daddrs,
 bool dirty);
  
-/*

- * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range
- *
- * When waiting for mmu notifiers we need some kind of time out otherwise we
- * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to
- * wait already.
- */
-#define HMM_RANGE_DEFAULT_TIMEOUT 1000
-
-/* This is a temporary helper to avoid merge conflict between trees. */
-static inline int hmm_vma_fault(struct hmm_mirror *mirror,
-   struct hmm_range *range, bool block)
-{
-   long ret;
-
-   /*
-* With the old API the driver must set each individual entries with
-* the requested flags (valid, write, ...). So here we set the mask to
-* keep intact the entries provided by the driver and zero out the
-* default_flags.
-*/
-   range->default_flags = 0;
-   range->pfn_flags_mask = -1UL;
-
-   ret = hmm_range_register(range, mirror,
-range->start, range->end,
-PAGE_SHIFT);
-   if (ret)
-   return (int)ret;
-

Re: [PATCH 4/5] nouveau: unlock mmap_sem on all errors from nouveau_range_fault

2019-07-04 Thread Ralph Campbell



On 7/3/19 11:45 AM, Christoph Hellwig wrote:

Currently nouveau_svm_fault expects nouveau_range_fault to never unlock
mmap_sem, but the latter unlocks it for a random selection of error
codes. Fix this up by always unlocking mmap_sem for non-zero return
values in nouveau_range_fault, and only unlocking it in the caller
for successful returns.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_svm.c | 15 ---
  1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index e831f4184a17..c0cf7aeaefb3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -500,8 +500,10 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct 
hmm_range *range,


You can delete the comment "With the old API the driver must ..."
(not visible in the patch here).
I suggest moving the two assignments:
range->default_flags = 0;
range->pfn_flags_mask = -1UL;
to just above the "again:" where the other range.xxx fields are
initialized in nouveau_svm_fault().


ret = hmm_range_register(range, mirror,
 range->start, range->end,
 PAGE_SHIFT);
-   if (ret)
+   if (ret) {
+   up_read(&range->vma->vm_mm->mmap_sem);
    return (int)ret;
+   }
  
  	if (!hmm_range_wait_until_valid(range, NOUVEAU_RANGE_FAULT_TIMEOUT)) {

/*


You can delete this comment (only the first line is visible here)
since it is about the "old API".
Also, it should return -EBUSY not -EAGAIN since it means there was a
range invalidation collision (similar to hmm_range_fault() if
!range->valid).


@@ -515,15 +517,14 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct 
hmm_range *range,
  
  	ret = hmm_range_fault(range, block);


nouveau_range_fault() is only called with "block = true" so
could eliminate the block parameter and pass true here.


if (ret <= 0) {
-   if (ret == -EBUSY || !ret) {
-   /* Same as above, drop mmap_sem to match old API. */
-   up_read(&range->vma->vm_mm->mmap_sem);
-   ret = -EBUSY;
-   } else if (ret == -EAGAIN)
+   if (ret == 0)
ret = -EBUSY;
+   if (ret != -EAGAIN)
+   up_read(&range->vma->vm_mm->mmap_sem);


Can ret == -EAGAIN happen if "block = true"?
Generally, I prefer the down_read()/up_read() in the same function
(i.e., nouveau_svm_fault()) but I can see why it should be here
if hmm_range_fault() can return with mmap_sem unlocked.


hmm_range_unregister(range);
return ret;
}
+
return 0;
  }
  
@@ -718,8 +719,8 @@ nouveau_svm_fault(struct nvif_notify *notify)

NULL);
svmm->vmm->vmm.object.client->super = false;
mutex_unlock(&svmm->mutex);
+   up_read(&svmm->mm->mmap_sem);
}
-   up_read(&svmm->mm->mmap_sem);
  


The "else" case should check for -EBUSY and goto again.


/* Cancel any faults in the window whose pages didn't manage
 * to keep their valid bit, or stay writeable when required.


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 5/5] mm: remove the legacy hmm_pfn_* APIs

2019-07-04 Thread Ralph Campbell



On 7/3/19 11:45 AM, Christoph Hellwig wrote:

Switch the one remaining user in nouveau over to its replacement,
and remove all the wrappers.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Jason Gunthorpe 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
  include/linux/hmm.h| 34 --
  2 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 42c026010938..b9ced2e61667 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -844,7 +844,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
struct page *page;
uint64_t addr;
  
-		page = hmm_pfn_to_page(range, range->pfns[i]);

+   page = hmm_device_entry_to_page(range, range->pfns[i]);
if (page == NULL)
continue;
  
diff --git a/include/linux/hmm.h b/include/linux/hmm.h

index 657606f48796..cdcd78627393 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -290,40 +290,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const 
struct hmm_range *range,
range->flags[HMM_PFN_VALID];
  }
  
-/*

- * Old API:
- * hmm_pfn_to_page()
- * hmm_pfn_to_pfn()
- * hmm_pfn_from_page()
- * hmm_pfn_from_pfn()
- *
- * This are the OLD API please use new API, it is here to avoid cross-tree
- * merge painfullness ie we convert things to new API in stages.
- */
-static inline struct page *hmm_pfn_to_page(const struct hmm_range *range,
-  uint64_t pfn)
-{
-   return hmm_device_entry_to_page(range, pfn);
-}
-
-static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range,
-  uint64_t pfn)
-{
-   return hmm_device_entry_to_pfn(range, pfn);
-}
-
-static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range,
-struct page *page)
-{
-   return hmm_device_entry_from_page(range, page);
-}
-
-static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range,
-   unsigned long pfn)
-{
-   return hmm_device_entry_from_pfn(range, pfn);
-}
-
  /*
   * Mirroring: how to synchronize device page table with CPU page table.
   *


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 19/22] mm: always return EBUSY for invalid ranges in hmm_range_{fault,snapshot}

2019-07-04 Thread Ralph Campbell



On 6/30/19 11:20 PM, Christoph Hellwig wrote:

We should not have two different error codes for the same condition.  In
addition this really complicates the code due to the special handling of
EAGAIN that drops the mmap_sem due to the FAULT_FLAG_ALLOW_RETRY logic
in the core vm.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 

Probably should update the "Return:" comment above
hmm_range_snapshot() too.


---
  mm/hmm.c | 8 +++-
  1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index c85ed7d4e2ce..d125df698e2b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -974,7 +974,7 @@ long hmm_range_snapshot(struct hmm_range *range)
do {
/* If range is no longer valid force retry. */
if (!range->valid)
-   return -EAGAIN;
+   return -EBUSY;
  
  		vma = find_vma(hmm->mm, start);

if (vma == NULL || (vma->vm_flags & device_vma))
@@ -1069,10 +1069,8 @@ long hmm_range_fault(struct hmm_range *range, bool block)
  
  	do {

/* If range is no longer valid force retry. */
-   if (!range->valid) {
-   up_read(&hmm->mm->mmap_sem);
-   return -EAGAIN;
-   }
+   if (!range->valid)
+   return -EBUSY;
  
  		vma = find_vma(hmm->mm, start);

if (vma == NULL || (vma->vm_flags & device_vma))


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 3/9] nouveau: factor out device memory address calculation

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

Factor out the repeated device memory address calculation into
a helper.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 42 +++---
  1 file changed, 17 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index e696157f771e..d469bc334438 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -102,6 +102,14 @@ struct nouveau_migrate {
unsigned long dma_nr;
  };
  
+static unsigned long nouveau_dmem_page_addr(struct page *page)

+{
+   struct nouveau_dmem_chunk *chunk = page->zone_device_data;
+   unsigned long idx = page_to_pfn(page) - chunk->pfn_first;
+
+   return (idx << PAGE_SHIFT) + chunk->bo->bo.offset;
+}
+
  static void nouveau_dmem_page_free(struct page *page)
  {
struct nouveau_dmem_chunk *chunk = page->zone_device_data;
@@ -169,9 +177,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct 
*vma,
/* Copy things over */
copy = drm->dmem->migrate.copy_func;
for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct nouveau_dmem_chunk *chunk;
struct page *spage, *dpage;
-   u64 src_addr, dst_addr;
  
  		dpage = migrate_pfn_to_page(dst_pfns[i]);

if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
@@ -194,14 +200,10 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct 
*vma,
continue;
}
  
-		dst_addr = fault->dma[fault->npages++];

-
-   chunk = spage->zone_device_data;
-   src_addr = page_to_pfn(spage) - chunk->pfn_first;
-   src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
-
-   ret = copy(drm, 1, NOUVEAU_APER_HOST, dst_addr,
-  NOUVEAU_APER_VRAM, src_addr);
+   ret = copy(drm, 1, NOUVEAU_APER_HOST,
+   fault->dma[fault->npages++],
+   NOUVEAU_APER_VRAM,
+   nouveau_dmem_page_addr(spage));
if (ret) {
dst_pfns[i] = MIGRATE_PFN_ERROR;
__free_page(dpage);
@@ -687,18 +689,12 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct 
*vma,
/* Copy things over */
copy = drm->dmem->migrate.copy_func;
for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct nouveau_dmem_chunk *chunk;
struct page *spage, *dpage;
-   u64 src_addr, dst_addr;
  
  		dpage = migrate_pfn_to_page(dst_pfns[i]);

if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
continue;
  
-		chunk = dpage->zone_device_data;

-   dst_addr = page_to_pfn(dpage) - chunk->pfn_first;
-   dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset;
-
spage = migrate_pfn_to_page(src_pfns[i]);
if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) {
nouveau_dmem_page_free_locked(drm, dpage);
@@ -716,10 +712,10 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct 
*vma,
continue;
}
  
-		src_addr = migrate->dma[migrate->dma_nr++];

-
-   ret = copy(drm, 1, NOUVEAU_APER_VRAM, dst_addr,
-  NOUVEAU_APER_HOST, src_addr);
+   ret = copy(drm, 1, NOUVEAU_APER_VRAM,
+   nouveau_dmem_page_addr(dpage),
+   NOUVEAU_APER_HOST,
+   migrate->dma[migrate->dma_nr++]);
if (ret) {
nouveau_dmem_page_free_locked(drm, dpage);
dst_pfns[i] = 0;
@@ -846,7 +842,6 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,
  
  	npages = (range->end - range->start) >> PAGE_SHIFT;

for (i = 0; i < npages; ++i) {
-   struct nouveau_dmem_chunk *chunk;
struct page *page;
uint64_t addr;
  
@@ -864,10 +859,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm,

continue;
}
  
-		chunk = page->zone_device_data;

-   addr = page_to_pfn(page) - chunk->pfn_first;
-   addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT;
-
+   addr = nouveau_dmem_page_addr(page);
range->pfns[i] &= ((1UL << range->pfn_shift) - 1);
range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift;
}


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 8/9] mm: remove the unused MIGRATE_PFN_DEVICE flag

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

No one ever checks this flag, and we could easily get that information
from the page if needed.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 +--
  include/linux/migrate.h| 1 -
  mm/migrate.c   | 4 ++--
  3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 6cb930755970..f04686a2c21f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -582,8 +582,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct 
nouveau_drm *drm,
*dma_addr))
goto out_dma_unmap;
  
-	return migrate_pfn(page_to_pfn(dpage)) |

-   MIGRATE_PFN_LOCKED | MIGRATE_PFN_DEVICE;
+   return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
  
  out_dma_unmap:

dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL);
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 229153c2c496..8b46cfdb1a0e 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -166,7 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
  #define MIGRATE_PFN_MIGRATE   (1UL << 1)
  #define MIGRATE_PFN_LOCKED(1UL << 2)
  #define MIGRATE_PFN_WRITE (1UL << 3)
-#define MIGRATE_PFN_DEVICE (1UL << 4)
  #define MIGRATE_PFN_SHIFT 6
  
  static inline struct page *migrate_pfn_to_page(unsigned long mpfn)

diff --git a/mm/migrate.c b/mm/migrate.c
index dc4e60a496f2..74735256e260 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2237,8 +2237,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
  
  			page = device_private_entry_to_page(entry);

-   mpfn = migrate_pfn(page_to_pfn(page))|
-   MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+   mpfn = migrate_pfn(page_to_pfn(page)) |
+   MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 7/9] mm: remove the unused MIGRATE_PFN_ERROR flag

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

We don't use this flag anymore, so remove it.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  include/linux/migrate.h | 1 -
  1 file changed, 1 deletion(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 093d67fcf6dd..229153c2c496 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -167,7 +167,6 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
  #define MIGRATE_PFN_LOCKED(1UL << 2)
  #define MIGRATE_PFN_WRITE (1UL << 3)
  #define MIGRATE_PFN_DEVICE(1UL << 4)
-#define MIGRATE_PFN_ERROR  (1UL << 5)
  #define MIGRATE_PFN_SHIFT 6


The MIGRATE_PFN_SHIFT could be reduced to 5 since it is only used
to make room for the flags.


  static inline struct page *migrate_pfn_to_page(unsigned long mpfn)


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 6/9] nouveau: simplify nouveau_dmem_migrate_vma

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

Factor the main copy page to vram routine out into a helper that acts
on a single page and which doesn't require the nouveau_dmem_migrate
structure for argument passing.  As an added benefit the new version
only allocates the dma address array once and reuses it for each
subsequent chunk of work.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 185 -
  1 file changed, 56 insertions(+), 129 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 036e6c07d489..6cb930755970 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -44,8 +44,6 @@
  #define DMEM_CHUNK_SIZE (2UL << 20)
  #define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT)
  
-struct nouveau_migrate;

-
  enum nouveau_aper {
NOUVEAU_APER_VIRT,
NOUVEAU_APER_VRAM,
@@ -86,15 +84,6 @@ static inline struct nouveau_dmem *page_to_dmem(struct page 
*page)
return container_of(page->pgmap, struct nouveau_dmem, pagemap);
  }
  
-struct nouveau_migrate {

-   struct vm_area_struct *vma;
-   struct nouveau_drm *drm;
-   struct nouveau_fence *fence;
-   unsigned long npages;
-   dma_addr_t *dma;
-   unsigned long dma_nr;
-};
-
  static unsigned long nouveau_dmem_page_addr(struct page *page)
  {
struct nouveau_dmem_chunk *chunk = page->zone_device_data;
@@ -569,131 +558,67 @@ nouveau_dmem_init(struct nouveau_drm *drm)
drm->dmem = NULL;
  }
  
-static void

-nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma,
-   const unsigned long *src_pfns,
-   unsigned long *dst_pfns,
-   unsigned long start,
-   unsigned long end,
-   struct nouveau_migrate *migrate)
+static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
+   struct vm_area_struct *vma, unsigned long addr,
+   unsigned long src, dma_addr_t *dma_addr)
  {
-   struct nouveau_drm *drm = migrate->drm;
struct device *dev = drm->dev->dev;
-   unsigned long addr, i, npages = 0;
-   nouveau_migrate_copy_t copy;
-   int ret;
-
-   /* First allocate new memory */
-   for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct page *dpage, *spage;
-
-   dst_pfns[i] = 0;
-   spage = migrate_pfn_to_page(src_pfns[i]);
-   if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
-   continue;
-
-   dpage = nouveau_dmem_page_alloc_locked(drm);
-   if (!dpage)
-   continue;
-
-   dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
- MIGRATE_PFN_LOCKED |
- MIGRATE_PFN_DEVICE;
-   npages++;
-   }
-
-   if (!npages)
-   return;
-
-   /* Allocate storage for DMA addresses, so we can unmap later. */
-   migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL);
-   if (!migrate->dma)
-   goto error;
-   migrate->dma_nr = 0;
-
-   /* Copy things over */
-   copy = drm->dmem->migrate.copy_func;
-   for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct page *spage, *dpage;
-
-   dpage = migrate_pfn_to_page(dst_pfns[i]);
-   if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
-   continue;
-
-   spage = migrate_pfn_to_page(src_pfns[i]);
-   if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) {
-   nouveau_dmem_page_free_locked(drm, dpage);
-   dst_pfns[i] = 0;
-   continue;
-   }
-
-   migrate->dma[migrate->dma_nr] =
-   dma_map_page_attrs(dev, spage, 0, PAGE_SIZE,
-  PCI_DMA_BIDIRECTIONAL,
-  DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(dev, migrate->dma[migrate->dma_nr])) {
-   nouveau_dmem_page_free_locked(drm, dpage);
-   dst_pfns[i] = 0;
-   continue;
-   }
-
-   ret = copy(drm, 1, NOUVEAU_APER_VRAM,
-   nouveau_dmem_page_addr(dpage),
-   NOUVEAU_APER_HOST,
-   migrate->dma[migrate->dma_nr++]);
-   if (ret) {
-   nouveau_dmem_page_free_locked(drm, dpage);
-   dst_pfns[i] = 0;
-   continue;
-

Re: [PATCH 5/9] nouveau: simplify nouveau_dmem_migrate_to_ram

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

Factor the main copy page to ram routine out into a helper that acts on
a single page and which doesn't require the nouveau_dmem_fault
structure for argument passing.  Also remove the loop over multiple
pages as we only handle one at the moment, although the structure of
the main worker function makes it relatively easy to add multi page
support back if needed in the future.  But at least for now this avoids
the need to dynamically allocate memory for the dma addresses in
what is essentially the page fault path.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 158 ++---
  1 file changed, 39 insertions(+), 119 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 21052a4aaf69..036e6c07d489 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -86,13 +86,6 @@ static inline struct nouveau_dmem *page_to_dmem(struct page 
*page)
return container_of(page->pgmap, struct nouveau_dmem, pagemap);
  }
  
-struct nouveau_dmem_fault {

-   struct nouveau_drm *drm;
-   struct nouveau_fence *fence;
-   dma_addr_t *dma;
-   unsigned long npages;
-};
-
  struct nouveau_migrate {
struct vm_area_struct *vma;
struct nouveau_drm *drm;
@@ -146,130 +139,55 @@ static void nouveau_dmem_fence_done(struct nouveau_fence 
**fence)
}
  }
  
-static void

-nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
- const unsigned long *src_pfns,
- unsigned long *dst_pfns,
- unsigned long start,
- unsigned long end,
- struct nouveau_dmem_fault *fault)
+static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
+   struct vm_area_struct *vma, unsigned long addr,
+   unsigned long src, unsigned long *dst, dma_addr_t *dma_addr)
  {
-   struct nouveau_drm *drm = fault->drm;
struct device *dev = drm->dev->dev;
-   unsigned long addr, i, npages = 0;
-   nouveau_migrate_copy_t copy;
-   int ret;
-
-
-   /* First allocate new memory */
-   for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct page *dpage, *spage;
-
-   dst_pfns[i] = 0;
-   spage = migrate_pfn_to_page(src_pfns[i]);
-   if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
-   continue;
+   struct page *dpage, *spage;
  
-		dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);

-   if (!dpage) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-   continue;
-   }
-   lock_page(dpage);
-
-   dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
- MIGRATE_PFN_LOCKED;
-   npages++;
-   }
+   spage = migrate_pfn_to_page(src);
+   if (!spage || !(src & MIGRATE_PFN_MIGRATE))
+   return 0;
  
-	/* Allocate storage for DMA addresses, so we can unmap later. */

-   fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL);
-   if (!fault->dma)
+   dpage = alloc_page_vma(GFP_HIGHUSER, args->vma, addr);
+   if (!dpage)
goto error;
+   lock_page(dpage);
  
-	/* Copy things over */

-   copy = drm->dmem->migrate.copy_func;
-   for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct page *spage, *dpage;
-
-   dpage = migrate_pfn_to_page(dst_pfns[i]);
-   if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
-   continue;
-
-   spage = migrate_pfn_to_page(src_pfns[i]);
-   if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-   __free_page(dpage);
-   continue;
-   }
-
-   fault->dma[fault->npages] =
-   dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE,
-  PCI_DMA_BIDIRECTIONAL,
-  DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(dev, fault->dma[fault->npages])) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-   __free_page(dpage);
-   continue;
-   }
-
-   ret = copy(drm, 1, NOUVEAU_APER_HOST,
-   fault->dma[fault->npages++],
-   NOUVEAU_APER_VRAM,
-   nouveau_dmem_page_addr(spage));
-   if (ret) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-

Re: [PATCH 9/9] mm: remove the MIGRATE_PFN_WRITE flag

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

The MIGRATE_PFN_WRITE is only used locally in migrate_vma_collect_pmd,
where it can be replaced with a simple boolean local variable.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  include/linux/migrate.h | 1 -
  mm/migrate.c| 9 +
  2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 8b46cfdb1a0e..ba74ef5a7702 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -165,7 +165,6 @@ static inline int migrate_misplaced_transhuge_page(struct 
mm_struct *mm,
  #define MIGRATE_PFN_VALID (1UL << 0)
  #define MIGRATE_PFN_MIGRATE   (1UL << 1)
  #define MIGRATE_PFN_LOCKED(1UL << 2)
-#define MIGRATE_PFN_WRITE  (1UL << 3)
  #define MIGRATE_PFN_SHIFT 6
  
  static inline struct page *migrate_pfn_to_page(unsigned long mpfn)

diff --git a/mm/migrate.c b/mm/migrate.c
index 74735256e260..724f92dcc31b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2212,6 +2212,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
unsigned long mpfn, pfn;
struct page *page;
swp_entry_t entry;
+   bool writable = false;
pte_t pte;
  
  		pte = *ptep;

@@ -2240,7 +2241,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
-   mpfn |= MIGRATE_PFN_WRITE;
+   writable = true;
} else {
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
@@ -2250,7 +2251,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
}
page = vm_normal_page(migrate->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
-   mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
+   if (pte_write(pte))
+   writable = true;
}
  
  		/* FIXME support THP */

@@ -2284,8 +2286,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
ptep_get_and_clear(mm, addr, ptep);
  
  			/* Setup special migration page table entry */

-   entry = make_migration_entry(page, mpfn &
-MIGRATE_PFN_WRITE);
+   entry = make_migration_entry(page, writable);
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pte))
swp_pte = pte_swp_mksoft_dirty(swp_pte);



MIGRATE_PFN_WRITE may not be used, but that seems like a bug to me.
If a page is migrated to device memory, it could be mapped at the same
time to avoid a device page fault but it would need the flag to know
whether to map it RW or RO. But I suppose that could be inferred from
the vma->vm_flags.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 2/9] nouveau: reset dma_nr in nouveau_dmem_migrate_alloc_and_copy

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

When we start a new batch of dma_map operations we need to reset dma_nr,
as we start filling a newly allocated array.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 38416798abd4..e696157f771e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -682,6 +682,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct 
*vma,
migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL);
if (!migrate->dma)
goto error;
+   migrate->dma_nr = 0;
  
  	/* Copy things over */

copy = drm->dmem->migrate.copy_func;


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 1/9] mm: turn migrate_vma upside down

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

There isn't any good reason to pass callbacks to migrate_vma.  Instead
we can just export the three steps done by this function to drivers and
let them sequence the operation without callbacks.  This removes a lot
of boilerplate code as-is, and will allow the drivers to drastically
improve code flow and error handling further on.

Signed-off-by: Christoph Hellwig 


Except for a few white space errors (<space><tab> and <space>$),
looks OK.

Reviewed-by: Ralph Campbell 


---
  Documentation/vm/hmm.rst   |  55 +-
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 122 +++--
  include/linux/migrate.h| 118 ++--
  mm/migrate.c   | 242 +++--
  4 files changed, 193 insertions(+), 344 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index ddcb5ca8b296..ad880e3996b1 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -339,58 +339,9 @@ Migration to and from device memory
  ===
  
  Because the CPU cannot access device memory, migration must use the device DMA

-engine to perform copy from and to device memory. For this we need a new
-migration helper::
-
- int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long mentries,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private);
-
-Unlike other migration functions it works on a range of virtual address, there
-are two reasons for that. First, device DMA copy has a high setup overhead cost
-and thus batching multiple pages is needed as otherwise the migration overhead
-makes the whole exercise pointless. The second reason is because the
-migration might be for a range of addresses the device is actively accessing.
-
-The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
-controls destination memory allocation and copy operation. Second one is there
-to allow the device driver to perform cleanup operations after migration::
-
- struct migrate_vma_ops {
- void (*alloc_and_copy)(struct vm_area_struct *vma,
-const unsigned long *src,
-unsigned long *dst,
-unsigned long start,
-unsigned long end,
-void *private);
- void (*finalize_and_map)(struct vm_area_struct *vma,
-  const unsigned long *src,
-  const unsigned long *dst,
-  unsigned long start,
-  unsigned long end,
-  void *private);
- };
-
-It is important to stress that these migration helpers allow for holes in the
-virtual address range. Some pages in the range might not be migrated for all
-the usual reasons (page is pinned, page is locked, ...). This helper does not
-fail but just skips over those pages.
-
-The alloc_and_copy() might decide to not migrate all pages in the
-range (for reasons under the callback control). For those, the callback just
-has to leave the corresponding dst entry empty.
-
-Finally, the migration of the struct page might fail (for file backed page) for
-various reasons (failure to freeze reference, or update page cache, ...). If
-that happens, then the finalize_and_map() can catch any pages that were not
-migrated. Note those pages were still copied to a new page and thus we wasted
-bandwidth but this is considered as a rare event and a price that we are
-willing to pay to keep all the code simpler.
+engine to perform copy from and to device memory. For this we need a new to
+use migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
+helpers.
  
  
  Memory cgroup (memcg) and rss accounting

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 345c63cb752a..38416798abd4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -131,9 +131,8 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct 
*vma,
  unsigned long *dst_pfns,
  unsigned long start,
  unsigned long end,
- void *private)
+ struct nouveau_dmem_fault *fault)
  {
-   struct nouveau_dmem_fault *fault = private;
struct nouveau_drm *drm = fault->drm;
struct device *dev = drm->dev->dev;
unsigned long addr, i, npages = 0;
@@ -230,14 +229,9 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct 
*vma,
}
  }
  
-void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma,

-

Re: [PATCH 4/9] nouveau: factor out dmem fence completion

2019-07-29 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

Factor out the end of fencing logic from the two migration routines.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 33 --
  1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index d469bc334438..21052a4aaf69 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -133,6 +133,19 @@ static void nouveau_dmem_page_free(struct page *page)
spin_unlock(&chunk->lock);
  }
  
+static void nouveau_dmem_fence_done(struct nouveau_fence **fence)

+{
+   if (fence) {
+   nouveau_fence_wait(*fence, true, false);
+   nouveau_fence_unref(fence);
+   } else {
+   /*
+* FIXME wait for channel to be IDLE before calling finalizing
+* the hmem object.
+*/
+   }
+}
+
  static void
  nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
  const unsigned long *src_pfns,
@@ -236,15 +249,7 @@ nouveau_dmem_fault_finalize_and_map(struct 
nouveau_dmem_fault *fault)
  {
struct nouveau_drm *drm = fault->drm;
  
-	if (fault->fence) {

-   nouveau_fence_wait(fault->fence, true, false);
-   nouveau_fence_unref(&fault->fence);
-   } else {
-   /*
-* FIXME wait for channel to be IDLE before calling finalizing
-* the hmem object below (nouveau_migrate_hmem_fini()).
-*/
-   }
+   nouveau_dmem_fence_done(&fault->fence);
  
  	while (fault->npages--) {

dma_unmap_page(drm->dev->dev, fault->dma[fault->npages],
@@ -748,15 +753,7 @@ nouveau_dmem_migrate_finalize_and_map(struct 
nouveau_migrate *migrate)
  {
struct nouveau_drm *drm = migrate->drm;
  
-	if (migrate->fence) {

-   nouveau_fence_wait(migrate->fence, true, false);
-   nouveau_fence_unref(&migrate->fence);
-   } else {
-   /*
-* FIXME wait for channel to be IDLE before finalizing
-* the hmem object below (nouveau_migrate_hmem_fini()) ?
-*/
-   }
+   nouveau_dmem_fence_done(&migrate->fence);
  
  	while (migrate->dma_nr--) {

dma_unmap_page(drm->dev->dev, migrate->dma[migrate->dma_nr],


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v3 hmm 11/11] mm/mmu_notifiers: remove unregister_no_release

2019-08-14 Thread Ralph Campbell



On 8/6/19 4:15 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

mmu_notifier_unregister_no_release() and mmu_notifier_call_srcu() no
longer have any users, they have all been converted to use
mmu_notifier_put().

So delete this difficult to use interface.

Signed-off-by: Jason Gunthorpe 


Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 1/5] mm: Check if mmu notifier callbacks are allowed to fail

2019-08-14 Thread Ralph Campbell



On 8/14/19 3:14 PM, Andrew Morton wrote:

On Wed, 14 Aug 2019 22:20:23 +0200 Daniel Vetter  wrote:


Just a bit of paranoia, since if we start pushing this deep into
callchains it's hard to spot all places where an mmu notifier
implementation might fail when it's not allowed to.

Inspired by some confusion we had discussing i915 mmu notifiers and
whether we could use the newly-introduced return value to handle some
corner cases. Until we realized that these are only for when a task
has been killed by the oom reaper.

An alternative approach would be to split the callback into two
versions, one with the int return value, and the other with void
return value like in older kernels. But that's a lot more churn for
fairly little gain I think.

Summary from the m-l discussion on why we want something at warning
level: This allows automated tooling in CI to catch bugs without
humans having to look at everything. If we just upgrade the existing
pr_info to a pr_warn, then we'll have false positives. And as-is, no
one will ever spot the problem since it's lost in the massive amounts
of overall dmesg noise.

...

--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -179,6 +179,8 @@ int __mmu_notifier_invalidate_range_start(struct 
mmu_notifier_range *range)
pr_info("%pS callback failed with %d in %sblockable 
context.\n",
mn->ops->invalidate_range_start, _ret,
!mmu_notifier_range_blockable(range) ? "non-" : 
"");
+   WARN_ON(mmu_notifier_range_blockable(range) ||
+   ret != -EAGAIN);
ret = _ret;
}
}


A problem with WARN_ON(a || b) is that if it triggers, we don't know
whether it was because of a or because of b.  Or both.  So I'd suggest

WARN_ON(a);
WARN_ON(b);



This won't quite work. It is OK to have 
mmu_notifier_range_blockable(range) be true or false.

sync_cpu_device_pagetables() shouldn't return
-EAGAIN unless blockable is true.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v3 hmm 00/11] Add mmu_notifier_get/put for managing mmu notifier registrations

2019-08-14 Thread Ralph Campbell



On 8/6/19 4:15 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

This series introduces a new registration flow for mmu_notifiers based on
the idea that the user would like to get a single refcounted piece of
memory for a mm, keyed to its use.

For instance many users of mmu_notifiers use an interval tree or similar
to dispatch notifications to some object. There are many objects but only
one notifier subscription per mm holding the tree.

Of the 12 places that call mmu_notifier_register:
  - 7 are maintaining some kind of obvious mapping of mm_struct to
mmu_notifier registration, ie in some linked list or hash table. Of
the 7 this series converts 4 (gru, hmm, RDMA, radeon)

  - 3 (hfi1, gntdev, vhost) are registering multiple notifiers, but each
one immediately does some VA range filtering, ie with an interval tree.
These would be better with a global subsystem-wide range filter and
could convert to this API.

  - 2 (kvm, amd_iommu) are deliberately using a single mm at a time, and
really can't use this API. One of the intel-svm's modes is also in this
list

The 3/7 unconverted drivers are:
  - intel-svm
This driver tracks mm's in a global linked list 'global_svm_list'
and would benefit from this API.

Its flow is a bit complex, since it also wants a set of non-shared
notifiers.

  - i915_gem_usrptr
This driver tracks mm's in a per-device hash
table (dev_priv->mm_structs), but only has an optional use of
mmu_notifiers.  Since it still seems to need the hash table it is
difficult to convert.

  - amdkfd/kfd_process
This driver is using a global SRCU hash table to track mm's

The control flow here is very complicated and the driver is relying on
this hash table to be fast on the ioctl syscall path.

It would definitely benefit, but only if the ioctl path didn't need to
do the search so often.

This series is already entangled with patches in the hmm & RDMA tree and
will require some git topic branches for the RDMA ODP stuff. I intend for
it to go through the hmm tree.

There is a git version here:

https://github.com/jgunthorpe/linux/commits/mmu_notifier

Which has the required pre-patches for the RDMA ODP conversion that are
still being reviewed.

Jason Gunthorpe (11):
   mm/mmu_notifiers: hoist do_mmu_notifier_register down_write to the
 caller
   mm/mmu_notifiers: do not speculatively allocate a mmu_notifier_mm
   mm/mmu_notifiers: add a get/put scheme for the registration
   misc/sgi-gru: use mmu_notifier_get/put for struct gru_mm_struct
   hmm: use mmu_notifier_get/put for 'struct hmm'
   RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'
   RDMA/odp: remove ib_ucontext from ib_umem
   drm/radeon: use mmu_notifier_get/put for struct radeon_mn
   drm/amdkfd: fix a use after free race with mmu_notifer unregister
   drm/amdkfd: use mmu_notifier_put
   mm/mmu_notifiers: remove unregister_no_release

  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|   3 -
  drivers/gpu/drm/amd/amdkfd/kfd_process.c |  88 -
  drivers/gpu/drm/nouveau/nouveau_drm.c|   3 +
  drivers/gpu/drm/radeon/radeon.h  |   3 -
  drivers/gpu/drm/radeon/radeon_device.c   |   2 -
  drivers/gpu/drm/radeon/radeon_drv.c  |   2 +
  drivers/gpu/drm/radeon/radeon_mn.c   | 157 
  drivers/infiniband/core/umem.c   |   4 +-
  drivers/infiniband/core/umem_odp.c   | 183 ++
  drivers/infiniband/core/uverbs_cmd.c |   3 -
  drivers/infiniband/core/uverbs_main.c|   1 +
  drivers/infiniband/hw/mlx5/main.c|   5 -
  drivers/misc/sgi-gru/grufile.c   |   1 +
  drivers/misc/sgi-gru/grutables.h |   2 -
  drivers/misc/sgi-gru/grutlbpurge.c   |  84 +++--
  include/linux/hmm.h  |  12 +-
  include/linux/mm_types.h |   6 -
  include/linux/mmu_notifier.h |  40 +++-
  include/rdma/ib_umem.h   |   2 +-
  include/rdma/ib_umem_odp.h   |  10 +-
  include/rdma/ib_verbs.h  |   3 -
  kernel/fork.c|   1 -
  mm/hmm.c | 121 +++-
  mm/mmu_notifier.c| 230 +--
  25 files changed, 408 insertions(+), 559 deletions(-)


For the core MM, HMM, and nouveau changes you can add:
Tested-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v3 hmm 03/11] mm/mmu_notifiers: add a get/put scheme for the registration

2019-08-14 Thread Ralph Campbell



On 8/6/19 4:15 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

Many places in the kernel have a flow where userspace will create some
object and that object will need to connect to the subsystem's
mmu_notifier subscription for the duration of its lifetime.

In this case the subsystem is usually tracking multiple mm_structs and it
is difficult to keep track of what struct mmu_notifier's have been
allocated for what mm's.

Since this has been open coded in a variety of exciting ways, provide core
functionality to do this safely.

This approach uses the strct mmu_notifier_ops * as a key to determine if


s/strct/struct


the subsystem has a notifier registered on the mm or not. If there is a
registration then the existing notifier struct is returned, otherwise the
ops->alloc_notifiers() is used to create a new per-subsystem notifier for
the mm.

The destroy side incorporates an async call_srcu based destruction which
will avoid bugs in the callers such as commit 6d7c3cde93c1 ("mm/hmm: fix
use after free with struct hmm in the mmu notifiers").

Since we are inside the mmu notifier core locking is fairly simple, the
allocation uses the same approach as for mmu_notifier_mm, the write side
of the mmap_sem makes everything deterministic and we only need to do
hlist_add_head_rcu() under the mm_take_all_locks(). The new users count
and the discoverability in the hlist is fully serialized by the
mmu_notifier_mm->lock.

Co-developed-by: Christoph Hellwig 
Signed-off-by: Christoph Hellwig 
Signed-off-by: Jason Gunthorpe 


Reviewed-by: Ralph Campbell 


---
  include/linux/mmu_notifier.h |  35 
  mm/mmu_notifier.c| 156 +--
  2 files changed, 185 insertions(+), 6 deletions(-)

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index b6c004bd9f6ad9..31aa971315a142 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -211,6 +211,19 @@ struct mmu_notifier_ops {
 */
void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
 unsigned long start, unsigned long end);
+
+   /*
+* These callbacks are used with the get/put interface to manage the
+* lifetime of the mmu_notifier memory. alloc_notifier() returns a new
+* notifier for use with the mm.
+*
+* free_notifier() is only called after the mmu_notifier has been
+* fully put, calls to any ops callback are prevented and no ops
+* callbacks are currently running. It is called from a SRCU callback
+* and cannot sleep.
+*/
+   struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm);
+   void (*free_notifier)(struct mmu_notifier *mn);
  };
  
  /*

@@ -227,6 +240,9 @@ struct mmu_notifier_ops {
  struct mmu_notifier {
struct hlist_node hlist;
const struct mmu_notifier_ops *ops;
+   struct mm_struct *mm;
+   struct rcu_head rcu;
+   unsigned int users;
  };
  
  static inline int mm_has_notifiers(struct mm_struct *mm)

@@ -234,6 +250,21 @@ static inline int mm_has_notifiers(struct mm_struct *mm)
return unlikely(mm->mmu_notifier_mm);
  }
  
+struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,

+struct mm_struct *mm);
+static inline struct mmu_notifier *
+mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm)
+{
+   struct mmu_notifier *ret;
+
+   down_write(&mm->mmap_sem);
+   ret = mmu_notifier_get_locked(ops, mm);
+   up_write(&mm->mmap_sem);
+   return ret;
+}
+void mmu_notifier_put(struct mmu_notifier *mn);
+void mmu_notifier_synchronize(void);
+
  extern int mmu_notifier_register(struct mmu_notifier *mn,
 struct mm_struct *mm);
  extern int __mmu_notifier_register(struct mmu_notifier *mn,
@@ -581,6 +612,10 @@ static inline void mmu_notifier_mm_destroy(struct 
mm_struct *mm)
  #define pudp_huge_clear_flush_notify pudp_huge_clear_flush
  #define set_pte_at_notify set_pte_at
  
+static inline void mmu_notifier_synchronize(void)

+{
+}
+
  #endif /* CONFIG_MMU_NOTIFIER */
  
  #endif /* _LINUX_MMU_NOTIFIER_H */

diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 696810f632ade1..4a770b5211b71d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -248,6 +248,9 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct 
mm_struct *mm)
lockdep_assert_held_write(&mm->mmap_sem);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
  
+	mn->mm = mm;

+   mn->users = 1;
+
if (!mm->mmu_notifier_mm) {
/*
 * kmalloc cannot be called under mm_take_all_locks(), but we
@@ -295,18 +298,24 @@ int __mmu_notifier_register(struct mmu_notifier *mn, 
struct mm_struct *mm)
  }
  EXPORT_SYMBOL_GPL(__mmu_notifier_register);
  
-/*

+/**
+ * mmu_notifier_register - Register a notif

Re: [PATCH v3 hmm 02/11] mm/mmu_notifiers: do not speculatively allocate a mmu_notifier_mm

2019-08-14 Thread Ralph Campbell



On 8/6/19 4:15 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

A prior commit e0f3c3f78da2 ("mm/mmu_notifier: init notifier if necessary")
made an attempt at doing this, but had to be reverted as calling
the GFP_KERNEL allocator under the i_mmap_mutex causes deadlock, see
commit 35cfa2b0b491 ("mm/mmu_notifier: allocate mmu_notifier in advance").

However, we can avoid that problem by doing the allocation only under
the mmap_sem, which is already happening.

Since all writers to mm->mmu_notifier_mm hold the write side of the
mmap_sem reading it under that sem is deterministic and we can use that to
decide if the allocation path is required, without speculation.

The actual update to mmu_notifier_mm must still be done under the
mm_take_all_locks() to ensure read-side coherency.

Signed-off-by: Jason Gunthorpe 


Looks good to me.
Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v3 hmm 05/11] hmm: use mmu_notifier_get/put for 'struct hmm'

2019-08-14 Thread Ralph Campbell



On 8/6/19 4:15 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

This is a significant simplification, it eliminates all the remaining
'hmm' stuff in mm_struct, eliminates krefing along the critical notifier
paths, and takes away all the ugly locking and abuse of page_table_lock.

mmu_notifier_get() provides the single struct hmm per struct mm which
eliminates mm->hmm.

It also directly guarantees that no mmu_notifier op callback is callable
while concurrent free is possible, this eliminates all the krefs inside
the mmu_notifier callbacks.

The remaining krefs in the range code were overly cautious, drivers are
already not permitted to free the mirror while a range exists.

Signed-off-by: Jason Gunthorpe 


Looks good.
Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: turn hmm migrate_vma upside down v3

2019-08-14 Thread Ralph Campbell



On 8/14/19 12:59 AM, Christoph Hellwig wrote:

Hi Jérôme, Ben and Jason,

below is a series against the hmm tree which starts revamping the
migrate_vma functionality.  The prime idea is to export three slightly
lower level functions and thus avoid the need for migrate_vma_ops
callbacks.

Diffstat:

 7 files changed, 282 insertions(+), 614 deletions(-)

A git tree is also available at:

 git://git.infradead.org/users/hch/misc.git migrate_vma-cleanup.3

Gitweb:

 
http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/migrate_vma-cleanup.3


Changes since v2:
  - don't unmap pages when returning 0 from nouveau_dmem_migrate_to_ram
  - minor style fixes
  - add a new patch to remove CONFIG_MIGRATE_VMA_HELPER

Changes since v1:
  - fix a few whitespace issues
  - drop the patch to remove MIGRATE_PFN_WRITE for now
  - various spelling fixes
  - clear cpages and npages in migrate_vma_setup
  - fix the nouveau_dmem_fault_copy_one return value
  - minor improvements to some nouveau internal calling conventions



Some of the patches seem to have been mangled in the mail.
I was able to edit them and apply to Jason's tree
https://github.com/jgunthorpe/linux.git mmu_notifier branch.
So for the series you can add:

Tested-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v3 hmm 01/11] mm/mmu_notifiers: hoist do_mmu_notifier_register down_write to the caller

2019-08-14 Thread Ralph Campbell



On 8/6/19 4:15 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

This simplifies the code to not have so many one line functions and extra
logic. __mmu_notifier_register() simply becomes the entry point to
register the notifier, and the other one calls it under lock.

Also add a lockdep_assert to check that the callers are holding the lock
as expected.

Suggested-by: Christoph Hellwig 
Signed-off-by: Jason Gunthorpe 


Nice clean up.
Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH] nouveau/hmm: map pages after migration

2019-08-12 Thread Ralph Campbell



On 8/10/19 4:13 AM, Christoph Hellwig wrote:

On something vaguely related to this patch:

You use the NVIF_VMM_PFNMAP_V0_V* defines from nvif/if000c.h, which are
a little odd as we only ever set these bits, but they also don't seem
to appear to be in values that are directly fed to the hardware.

On the other hand mmu/vmm.h defines a set of NVIF_VMM_PFNMAP_V0_*


Yes, I see NVKM_VMM_PFN_*


constants with similar names and identical values, and those are used
in mmu/vmmgp100.c and what appears to finally do the low-level dma
mapping and talking to the hardware.  Are these two sets of constants
supposed to be the same?  Are the actual hardware values or just a
driver internal interface?


It looks a bit odd to me too.
I don't really know the structure/history of nouveau.
Perhaps Ben Skeggs can shed more light on your question.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk

2019-08-16 Thread Ralph Campbell



On 8/16/19 10:28 AM, Jason Gunthorpe wrote:

On Fri, Aug 16, 2019 at 10:21:41AM -0700, Dan Williams wrote:


We can do a get_dev_pagemap inside the page_walk and touch the pgmap,
or we can do the 'device mutex && retry' pattern and touch the pgmap
in the driver, under that lock.

However in all cases the current get_dev_pagemap()'s in the page walk
are not necessary, and we can delete them.


Yes, as long as 'struct page' instances resulting from that lookup are
not passed outside of that lock.


Indeed.

Also, I was reflecting over lunch that the hmm_range_fault should only
return DEVICE_PRIVATE pages for the caller's device (see other thread
with HCH), and in this case, the caller should also be responsible to
ensure that the driver is not calling hmm_range_fault at the same time
it is deleting its own DEVICE_PRIVATE mapping - ie by fencing its
page fault handler.


Yes, that would make it a one step process to access another
device's migrated memory pages.
Right now, it has to be a two step process where the caller calls
hmm_range_fault, checks the struct page to see if it is device
private and not owned, then call hmm_range_fault again with
range->pfns[i] |= range->flags[HMM_PFN_DEVICE_PRIVATE] to cause
the other device to migrate the page back to system memory.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 1/2] mm/hmm: hmm_range_fault() NULL pointer bug

2019-08-26 Thread Ralph Campbell



On 8/26/19 11:09 AM, Jason Gunthorpe wrote:

On Mon, Aug 26, 2019 at 11:02:12AM -0700, Ralph Campbell wrote:


On 8/24/19 3:37 PM, Christoph Hellwig wrote:

On Fri, Aug 23, 2019 at 03:17:52PM -0700, Ralph Campbell wrote:

Although hmm_range_fault() calls find_vma() to make sure that a vma exists
before calling walk_page_range(), hmm_vma_walk_hole() can still be called
with walk->vma == NULL if the start and end address are not contained
within the vma range.


Should we convert to walk_vma_range instead?  Or keep walk_page_range
but drop searching the vma ourselves?

Except for that the patch looks good to me:

Reviewed-by: Christoph Hellwig 



I think keeping the call to walk_page_range() makes sense.
Jason is hoping to be able to snapshot a range with & without vmas
and have the pfns[] filled with empty/valid entries as appropriate.

I plan to repost my patch changing hmm_range_fault() to use
walk.test_walk which will remove the call to find_vma().
Jason had some concerns about testing it so that's why I have
been working on some HMM self tests before resending it.


I'm really excited to see tests for hmm_range_fault()!

Did you find this bug with the tests??

Jason



Yes, I found both bugs with the tests.
I started with Jerome's hmm_dummy driver and user level test code.
Hopefully I can send it out this week.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 1/2] mm/hmm: hmm_range_fault() NULL pointer bug

2019-08-26 Thread Ralph Campbell



On 8/24/19 3:37 PM, Christoph Hellwig wrote:

On Fri, Aug 23, 2019 at 03:17:52PM -0700, Ralph Campbell wrote:

Although hmm_range_fault() calls find_vma() to make sure that a vma exists
before calling walk_page_range(), hmm_vma_walk_hole() can still be called
with walk->vma == NULL if the start and end address are not contained
within the vma range.


Should we convert to walk_vma_range instead?  Or keep walk_page_range
but drop searching the vma ourselves?

Except for that the patch looks good to me:

Reviewed-by: Christoph Hellwig 



I think keeping the call to walk_page_range() makes sense.
Jason is hoping to be able to snapshot a range with & without vmas
and have the pfns[] filled with empty/valid entries as appropriate.

I plan to repost my patch changing hmm_range_fault() to use
walk.test_walk which will remove the call to find_vma().
Jason had some concerns about testing it so that's why I have
been working on some HMM self tests before resending it.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 2/2] mm/hmm: hmm_range_fault() infinite loop

2019-08-27 Thread Ralph Campbell



On 8/27/19 11:41 AM, Jason Gunthorpe wrote:

On Fri, Aug 23, 2019 at 03:17:53PM -0700, Ralph Campbell wrote:


Signed-off-by: Ralph Campbell 
  mm/hmm.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index 29371485fe94..4882b83aeccb 100644
+++ b/mm/hmm.c
@@ -292,6 +292,9 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned 
long end,
hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
  
+	if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))

+   return -EPERM;


Can walk->vma be NULL here? hmm_vma_do_fault() touches it
unconditionally.

Jason


walk->vma can be NULL. hmm_vma_do_fault() no longer touches it
unconditionally, that is what the preceding patch fixes.
I suppose I could change hmm_vma_walk_hole_() to check for NULL
and fill in the pfns[] array, I just chose to handle it in
hmm_vma_do_fault().
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 1/2] mm/hmm: hmm_range_fault() NULL pointer bug

2019-08-23 Thread Ralph Campbell

Although hmm_range_fault() calls find_vma() to make sure that a vma exists
before calling walk_page_range(), hmm_vma_walk_hole() can still be called
with walk->vma == NULL if the start and end address are not contained
within the vma range.

 hmm_range_fault() /* calls find_vma() but no range check */
  walk_page_range() /* calls find_vma(), sets walk->vma = NULL */
   __walk_page_range()
walk_pgd_range()
 walk_p4d_range()
  walk_pud_range()
   hmm_vma_walk_hole()
hmm_vma_walk_hole_()
 hmm_vma_do_fault()
  handle_mm_fault(vma=0)

Signed-off-by: Ralph Campbell 
---
 mm/hmm.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index fc05c8fe78b4..29371485fe94 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -229,6 +229,9 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned 
long addr,
struct vm_area_struct *vma = walk->vma;
vm_fault_t ret;
 
+   if (!vma)
+   goto err;
+
if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
flags |= FAULT_FLAG_ALLOW_RETRY;
if (write_fault)
@@ -239,12 +242,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, 
unsigned long addr,
/* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
return -EAGAIN;
}
-   if (ret & VM_FAULT_ERROR) {
-   *pfn = range->values[HMM_PFN_ERROR];
-   return -EFAULT;
-   }
+   if (ret & VM_FAULT_ERROR)
+   goto err;
 
return -EBUSY;
+
+err:
+   *pfn = range->values[HMM_PFN_ERROR];
+   return -EFAULT;
 }
 
 static int hmm_pfns_bad(unsigned long addr,
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 2/2] mm/hmm: hmm_range_fault() infinite loop

2019-08-23 Thread Ralph Campbell

Normally, callers to handle_mm_fault() are supposed to check the
vma->vm_flags first. hmm_range_fault() checks for VM_READ but doesn't
check for VM_WRITE if the caller requests a page to be faulted in
with write permission (via the hmm_range.pfns[] value).
If the vma is write protected, this can result in an infinite loop:
  hmm_range_fault()
walk_page_range()
  ...
  hmm_vma_walk_hole()
hmm_vma_walk_hole_()
  hmm_vma_do_fault()
handle_mm_fault(FAULT_FLAG_WRITE)
/* returns VM_FAULT_WRITE */
  /* returns -EBUSY */
/* returns -EBUSY */
  /* returns -EBUSY */
/* loops on -EBUSY and range->valid */
Prevent this by checking for vma->vm_flags & VM_WRITE before calling
handle_mm_fault().

Signed-off-by: Ralph Campbell 
---
 mm/hmm.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/hmm.c b/mm/hmm.c
index 29371485fe94..4882b83aeccb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -292,6 +292,9 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned 
long end,
hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
 
+   if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
+   return -EPERM;
+
for (; addr < end; addr += PAGE_SIZE, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 0/2] mm/hmm: two bug fixes for hmm_range_fault()

2019-08-23 Thread Ralph Campbell

I have been working on converting Jerome's hmm_dummy driver and self
tests into a stand-alone set of tests to be included in
tools/testing/selftests/vm and came across these two bug fixes in the
process. The tests aren't quite ready to be posted as a patch.
I'm posting the fixes now since I thought they shouldn't wait.
They should probably have a fixes line but with all the HMM changes,
I wasn't sure exactly which commit to use.

These are based on top of Jason's latest hmm branch.

Ralph Campbell (2):
  mm/hmm: hmm_range_fault() NULL pointer bug
  mm/hmm: hmm_range_fault() infinite loop

 mm/hmm.c | 16 
 1 file changed, 12 insertions(+), 4 deletions(-)

-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 2/4] mm/hmm: allow snapshot of the special zero page

2019-09-11 Thread Ralph Campbell

Allow hmm_range_fault() to return success (0) when the CPU pagetable
entry points to the special shared zero page.
The caller can then handle the zero page by possibly clearing device
private memory instead of DMAing a zero page.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 06041d4399ff..7217912bef13 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -532,7 +532,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
return -EBUSY;
} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) 
{
*pfn = range->values[HMM_PFN_SPECIAL];
-   return -EFAULT;
+   return is_zero_pfn(pte_pfn(pte)) ? 0 : -EFAULT;
}
 
*pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 1/4] mm/hmm: make full use of walk_page_range()

2019-09-11 Thread Ralph Campbell

hmm_range_fault() calls find_vma() and walk_page_range() in a loop.
This is unnecessary duplication since walk_page_range() calls find_vma()
in a loop already.
Simplify hmm_range_fault() by defining a walk_test() callback function
to filter unhandled vmas.
This also fixes a bug where hmm_range_fault() was not checking
start >= vma->vm_start before checking vma->vm_flags so hmm_range_fault()
could return an error based on the wrong vma for the requested range.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 113 ---
 1 file changed, 50 insertions(+), 63 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 902f5fa6bf93..06041d4399ff 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -252,18 +252,17 @@ static int hmm_vma_do_fault(struct mm_walk *walk, 
unsigned long addr,
return -EFAULT;
 }
 
-static int hmm_pfns_bad(unsigned long addr,
-   unsigned long end,
-   struct mm_walk *walk)
+static int hmm_pfns_fill(unsigned long addr,
+unsigned long end,
+struct hmm_range *range,
+enum hmm_pfn_value_e value)
 {
-   struct hmm_vma_walk *hmm_vma_walk = walk->private;
-   struct hmm_range *range = hmm_vma_walk->range;
uint64_t *pfns = range->pfns;
unsigned long i;
 
i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
-   pfns[i] = range->values[HMM_PFN_ERROR];
+   pfns[i] = range->values[value];
 
return 0;
 }
@@ -584,7 +583,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
}
return 0;
} else if (!pmd_present(pmd))
-   return hmm_pfns_bad(start, end, walk);
+   return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 
if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
@@ -612,7 +611,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 * recover.
 */
if (pmd_bad(pmd))
-   return hmm_pfns_bad(start, end, walk);
+   return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 
ptep = pte_offset_map(pmdp, addr);
i = (addr - range->start) >> PAGE_SHIFT;
@@ -770,13 +769,36 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, 
unsigned long hmask,
 #define hmm_vma_walk_hugetlb_entry NULL
 #endif /* CONFIG_HUGETLB_PAGE */
 
-static void hmm_pfns_clear(struct hmm_range *range,
-  uint64_t *pfns,
-  unsigned long addr,
-  unsigned long end)
+static int hmm_vma_walk_test(unsigned long start,
+unsigned long end,
+struct mm_walk *walk)
 {
-   for (; addr < end; addr += PAGE_SIZE, pfns++)
-   *pfns = range->values[HMM_PFN_NONE];
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+
+   /* If range is no longer valid, force retry. */
+   if (!range->valid)
+   return -EBUSY;
+
+   /*
+* Skip vma ranges that don't have struct page backing them or
+* map I/O devices directly.
+*/
+   if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
+   return -EFAULT;
+
+   /*
+* If the vma does not allow read access, then assume that it does not
+* allow write access either. HMM does not support architectures
+* that allow write without read.
+*/
+   if (!(vma->vm_flags & VM_READ)) {
+   (void) hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+   return -EPERM;
+   }
+
+   return 0;
 }
 
 /*
@@ -857,6 +879,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
.pmd_entry  = hmm_vma_walk_pmd,
.pte_hole   = hmm_vma_walk_hole,
.hugetlb_entry  = hmm_vma_walk_hugetlb_entry,
+   .test_walk  = hmm_vma_walk_test,
 };
 
 /**
@@ -889,63 +912,27 @@ static const struct mm_walk_ops hmm_walk_ops = {
  */
 long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 {
-   const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
-   unsigned long start = range->start, end;
-   struct hmm_vma_walk hmm_vma_walk;
+   unsigned long start = range->start;
+   struct hmm_vma_walk hmm_vma_walk = {
+   .range = range,
+   .last = start,
+   .flags = flags,
+   };
struct hmm *hmm = range->hmm;
-   struct vm_area_struct *vma;
int ret;
 
lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
 
do {
-   /* If range is no longer valid force retry. */
-   if (!rang

[PATCH 0/4] HMM tests and minor fixes

2019-09-11 Thread Ralph Campbell

These changes are based on Jason's latest hmm branch.
Patch 1 was previously posted here [1] but was dropped from the original
series. Hopefully, the tests will reduce concerns about edge conditions.
I'm sure more tests could be usefully added but I thought this was a good
starting point.

[1] https://lore.kernel.org/linux-mm/20190726005650.2566-6-rcampb...@nvidia.com/

Ralph Campbell (4):
  mm/hmm: make full use of walk_page_range()
  mm/hmm: allow snapshot of the special zero page
  mm/hmm: allow hmm_range_fault() of mmap(PROT_NONE)
  mm/hmm/test: add self tests for HMM

 MAINTAINERS|3 +
 drivers/char/Kconfig   |   11 +
 drivers/char/Makefile  |1 +
 drivers/char/hmm_dmirror.c | 1504 
 include/Kbuild |1 +
 include/uapi/linux/hmm_dmirror.h   |   74 ++
 mm/hmm.c   |  117 +-
 tools/testing/selftests/vm/.gitignore  |1 +
 tools/testing/selftests/vm/Makefile|3 +
 tools/testing/selftests/vm/config  |3 +
 tools/testing/selftests/vm/hmm-tests.c | 1304 
 tools/testing/selftests/vm/run_vmtests |   16 +
 tools/testing/selftests/vm/test_hmm.sh |  105 ++
 13 files changed, 3079 insertions(+), 64 deletions(-)
 create mode 100644 drivers/char/hmm_dmirror.c
 create mode 100644 include/uapi/linux/hmm_dmirror.h
 create mode 100644 tools/testing/selftests/vm/hmm-tests.c
 create mode 100755 tools/testing/selftests/vm/test_hmm.sh

-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 3/4] mm/hmm: allow hmm_range_fault() of mmap(PROT_NONE)

2019-09-11 Thread Ralph Campbell

Allow hmm_range_fault() to return success (0) when the range has no access
(!(vma->vm_flags & VM_READ)). The range->pfns[] array will be filled with
range->values[HMM_PFN_NONE] in this case.
This allows the caller to get a snapshot of a range without having to
lookup the vma before calling hmm_range_fault().
If the call to hmm_range_fault() is not a snapshot, the caller can still
check that pfns have the desired access permissions.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 7217912bef13..16c834e5d1c0 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -795,7 +795,9 @@ static int hmm_vma_walk_test(unsigned long start,
 */
if (!(vma->vm_flags & VM_READ)) {
(void) hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
-   return -EPERM;
+
+   /* Skip this vma and continue processing the next vma. */
+   return 1;
}
 
return 0;
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH 4/4] mm/hmm/test: add self tests for HMM

2019-09-11 Thread Ralph Campbell

Add self tests for HMM.

Signed-off-by: Ralph Campbell 
---
 MAINTAINERS|3 +
 drivers/char/Kconfig   |   11 +
 drivers/char/Makefile  |1 +
 drivers/char/hmm_dmirror.c | 1504 
 include/Kbuild |1 +
 include/uapi/linux/hmm_dmirror.h   |   74 ++
 tools/testing/selftests/vm/.gitignore  |1 +
 tools/testing/selftests/vm/Makefile|3 +
 tools/testing/selftests/vm/config  |3 +
 tools/testing/selftests/vm/hmm-tests.c | 1304 
 tools/testing/selftests/vm/run_vmtests |   16 +
 tools/testing/selftests/vm/test_hmm.sh |  105 ++
 12 files changed, 3026 insertions(+)
 create mode 100644 drivers/char/hmm_dmirror.c
 create mode 100644 include/uapi/linux/hmm_dmirror.h
 create mode 100644 tools/testing/selftests/vm/hmm-tests.c
 create mode 100755 tools/testing/selftests/vm/test_hmm.sh

diff --git a/MAINTAINERS b/MAINTAINERS
index 43604d6ab96c..8ab242d91876 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7318,8 +7318,11 @@ M:   Jérôme Glisse 
 L: linux...@kvack.org
 S: Maintained
 F: mm/hmm*
+F: drivers/char/hmm*
 F: include/linux/hmm*
+F: include/uapi/linux/hmm*
 F: Documentation/vm/hmm.rst
+F: tools/testing/selftests/vm/*hmm*
 
 HOST AP DRIVER
 M: Jouni Malinen 
diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 3e866885a405..b4ad868ead63 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -557,6 +557,17 @@ config ADI
  and SSM (Silicon Secured Memory).  Intended consumers of this
  driver include crash and makedumpfile.
 
+config HMM_DMIRROR
+   tristate "HMM driver for testing Heterogeneous Memory Management"
+   depends on HMM_MIRROR
+   depends on DEVICE_PRIVATE
+   help
+ This is a pseudo device driver solely for testing HMM.
+ Say Y here if you want to build the HMM test driver.
+ Doing so will allow you to run tools/testing/selftest/vm/hmm-tests.
+
+ If in doubt, say "N".
+
 endmenu
 
 config RANDOM_TRUST_CPU
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index fbea7dd12932..c9ddd8e550c5 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -54,3 +54,4 @@ js-rtc-y = rtc.o
 obj-$(CONFIG_XILLYBUS) += xillybus/
 obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o
 obj-$(CONFIG_ADI)  += adi.o
+obj-$(CONFIG_HMM_DMIRROR)  += hmm_dmirror.o
diff --git a/drivers/char/hmm_dmirror.c b/drivers/char/hmm_dmirror.c
new file mode 100644
index ..ccb4e03a03b5
--- /dev/null
+++ b/drivers/char/hmm_dmirror.c
@@ -0,0 +1,1504 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2013 Red Hat Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Jérôme Glisse 
+ */
+/*
+ * This is a driver to exercise the HMM (heterogeneous memory management)
+ * mirror and zone device private memory migration APIs of the kernel.
+ * Userspace programs can register with the driver to mirror their own address
+ * space and can use the device to read/write any valid virtual address.
+ *
+ * In some ways it can also serve as an example driver for people wanting to 
use
+ * HMM inside their own device driver.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define DMIRROR_NDEVICES   2
+#define DMIRROR_RANGE_FAULT_TIMEOUT1000
+#define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
+#define DEVMEM_CHUNKS_RESERVE  16
+
+static const struct dev_pagemap_ops dmirror_devmem_ops;
+static dev_t dmirror_dev;
+static struct platform_device *dmirror_platform_devices[DMIRROR_NDEVICES];
+static struct page *dmirror_zero_page;
+
+struct dmirror_device;
+
+struct dmirror_bounce {
+   void*ptr;
+   unsigned long   size;
+   unsigned long   addr;
+   unsigned long   cpages;
+};
+
+#define DPT_SHIFT PAGE_SHIFT
+#define DPT_VALID (1UL << 0)
+#define DPT_WRITE (1UL << 1)
+#define DPT_DPAGE (1UL << 2)
+#define DPT_ZPAGE 0x20UL
+
+const uint64_t dmirror_hmm_flags[HMM_PFN_FLAG_MAX] = {
+   [HMM_PFN_VALID] = DPT_VALID,
+   [HMM_PFN_WRITE] = DPT_WRITE,
+   [HMM_PFN_DEVICE_PRIVATE] = DPT_DPAGE,
+};
+
+static const uint64_

Re: [PATCH 2/4] mm/hmm: allow snapshot of the special zero page

2019-09-12 Thread Ralph Campbell



On 9/12/19 1:26 AM, Christoph Hellwig wrote:

On Wed, Sep 11, 2019 at 03:28:27PM -0700, Ralph Campbell wrote:

Allow hmm_range_fault() to return success (0) when the CPU pagetable
entry points to the special shared zero page.
The caller can then handle the zero page by possibly clearing device
private memory instead of DMAing a zero page.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
  mm/hmm.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 06041d4399ff..7217912bef13 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -532,7 +532,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
return -EBUSY;
} else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) 
{
*pfn = range->values[HMM_PFN_SPECIAL];
-   return -EFAULT;
+   return is_zero_pfn(pte_pfn(pte)) ? 0 : -EFAULT;


Any chance to just use a normal if here:

if (!is_zero_pfn(pte_pfn(pte)))
return -EFAULT;
return 0;



Sure, no problem.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 1/4] mm/hmm: make full use of walk_page_range()

2019-09-12 Thread Ralph Campbell



On 9/12/19 1:26 AM, Christoph Hellwig wrote:

+static int hmm_pfns_fill(unsigned long addr,
+unsigned long end,
+struct hmm_range *range,
+enum hmm_pfn_value_e value)


Nit: can we use the space a little more efficiently, e.g.:

static int hmm_pfns_fill(unsigned long addr, unsigned long end,
struct hmm_range *range, enum hmm_pfn_value_e value)


+static int hmm_vma_walk_test(unsigned long start,
+unsigned long end,
+struct mm_walk *walk)


Same here.


+   if (!(vma->vm_flags & VM_READ)) {
+   (void) hmm_pfns_fill(start, end, range, HMM_PFN_NONE);


There should be no need for the void cast here.



OK. I'll post a v2 with these changes.
Thanks for the reviews.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 08/13] mm: remove the mask variable in hmm_vma_walk_hugetlb_entry

2019-07-30 Thread Ralph Campbell



On 7/29/19 10:51 PM, Christoph Hellwig wrote:

The pagewalk code already passes the value as the hmask parameter.

Signed-off-by: Christoph Hellwig 
---
  mm/hmm.c | 7 ++-
  1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index f26d6abc4ed2..88b77a4a6a1e 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -771,19 +771,16 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, 
unsigned long hmask,
  struct mm_walk *walk)
  {
  #ifdef CONFIG_HUGETLB_PAGE
-   unsigned long addr = start, i, pfn, mask;
+   unsigned long addr = start, i, pfn;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
-   struct hstate *h = hstate_vma(vma);
uint64_t orig_pfn, cpu_flags;
bool fault, write_fault;
spinlock_t *ptl;
pte_t entry;
int ret = 0;
  
-	mask = huge_page_size(h) - 1;

-
ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
entry = huge_ptep_get(pte);
  
@@ -799,7 +796,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,

goto unlock;
}
  
-	pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT);

+   pfn = pte_pfn(entry) + ((start & hmask) >> PAGE_SHIFT);


This needs to be "~hmask" so that the upper bits of the start address
are not added to the pfn. It's the middle bits of the address that
offset into the huge page that are needed.


for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
 cpu_flags;


___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 1/9] mm: turn migrate_vma upside down

2019-07-30 Thread Ralph Campbell



On 7/29/19 7:28 AM, Christoph Hellwig wrote:

There isn't any good reason to pass callbacks to migrate_vma.  Instead
we can just export the three steps done by this function to drivers and
let them sequence the operation without callbacks.  This removes a lot
of boilerplate code as-is, and will allow the drivers to drastically
improve code flow and error handling further on.

Signed-off-by: Christoph Hellwig 
---
  Documentation/vm/hmm.rst   |  55 +-
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 122 +++--
  include/linux/migrate.h| 118 ++--
  mm/migrate.c   | 242 +++--
  4 files changed, 193 insertions(+), 344 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index ddcb5ca8b296..ad880e3996b1 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -339,58 +339,9 @@ Migration to and from device memory
  ===
  
  Because the CPU cannot access device memory, migration must use the device DMA

-engine to perform copy from and to device memory. For this we need a new
-migration helper::
-
- int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long mentries,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private);
-
-Unlike other migration functions it works on a range of virtual address, there
-are two reasons for that. First, device DMA copy has a high setup overhead cost
-and thus batching multiple pages is needed as otherwise the migration overhead
-makes the whole exercise pointless. The second reason is because the
-migration might be for a range of addresses the device is actively accessing.
-
-The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy())
-controls destination memory allocation and copy operation. Second one is there
-to allow the device driver to perform cleanup operations after migration::
-
- struct migrate_vma_ops {
- void (*alloc_and_copy)(struct vm_area_struct *vma,
-const unsigned long *src,
-unsigned long *dst,
-unsigned long start,
-unsigned long end,
-void *private);
- void (*finalize_and_map)(struct vm_area_struct *vma,
-  const unsigned long *src,
-  const unsigned long *dst,
-  unsigned long start,
-  unsigned long end,
-  void *private);
- };
-
-It is important to stress that these migration helpers allow for holes in the
-virtual address range. Some pages in the range might not be migrated for all
-the usual reasons (page is pinned, page is locked, ...). This helper does not
-fail but just skips over those pages.
-
-The alloc_and_copy() might decide to not migrate all pages in the
-range (for reasons under the callback control). For those, the callback just
-has to leave the corresponding dst entry empty.
-
-Finally, the migration of the struct page might fail (for file backed page) for
-various reasons (failure to freeze reference, or update page cache, ...). If
-that happens, then the finalize_and_map() can catch any pages that were not
-migrated. Note those pages were still copied to a new page and thus we wasted
-bandwidth but this is considered as a rare event and a price that we are
-willing to pay to keep all the code simpler.
+engine to perform copy from and to device memory. For this we need to
+use migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize()
+helpers.
  
  
  Memory cgroup (memcg) and rss accounting

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 345c63cb752a..38416798abd4 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -131,9 +131,8 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct 
*vma,
  unsigned long *dst_pfns,
  unsigned long start,
  unsigned long end,
- void *private)
+ struct nouveau_dmem_fault *fault)
  {
-   struct nouveau_dmem_fault *fault = private;
struct nouveau_drm *drm = fault->drm;
struct device *dev = drm->dev->dev;
unsigned long addr, i, npages = 0;
@@ -230,14 +229,9 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct 
*vma,
}
  }
  
-void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma,

-const unsigned long *src_pfns,
-const unsigned long

[PATCH] nouveau/hmm: map pages after migration

2019-08-07 Thread Ralph Campbell

When memory is migrated to the GPU it is likely to be accessed by GPU
code soon afterwards. Instead of waiting for a GPU fault, map the
migrated memory into the GPU page tables with the same access permissions
as the source CPU page table entries. This preserves copy on write
semantics.

Signed-off-by: Ralph Campbell 
Cc: Christoph Hellwig 
Cc: Jason Gunthorpe 
Cc: "Jérôme Glisse" 
Cc: Ben Skeggs 
---

This patch is based on top of Christoph Hellwig's 9 patch series
https://lore.kernel.org/linux-mm/20190729234611.gc7...@redhat.com/T/#u
"turn the hmm migrate_vma upside down" but without patch 9
"mm: remove the unused MIGRATE_PFN_WRITE" and adds a use for the flag.


 drivers/gpu/drm/nouveau/nouveau_dmem.c | 45 +-
 drivers/gpu/drm/nouveau/nouveau_svm.c  | 86 ++
 drivers/gpu/drm/nouveau/nouveau_svm.h  | 19 ++
 3 files changed, 133 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index ef9de82b0744..c83e6f118817 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -25,11 +25,13 @@
 #include "nouveau_dma.h"
 #include "nouveau_mem.h"
 #include "nouveau_bo.h"
+#include "nouveau_svm.h"
 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -560,11 +562,12 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 }
 
 static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
-   struct vm_area_struct *vma, unsigned long addr,
-   unsigned long src, dma_addr_t *dma_addr)
+   struct vm_area_struct *vma, unsigned long src,
+   dma_addr_t *dma_addr, u64 *pfn)
 {
struct device *dev = drm->dev->dev;
struct page *dpage, *spage;
+   unsigned long paddr;
 
spage = migrate_pfn_to_page(src);
if (!spage || !(src & MIGRATE_PFN_MIGRATE))
@@ -572,17 +575,21 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct 
nouveau_drm *drm,
 
dpage = nouveau_dmem_page_alloc_locked(drm);
if (!dpage)
-   return 0;
+   goto out;
 
*dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, *dma_addr))
goto out_free_page;
 
+   paddr = nouveau_dmem_page_addr(dpage);
if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM,
-   nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST,
-   *dma_addr))
+   paddr, NOUVEAU_APER_HOST, *dma_addr))
goto out_dma_unmap;
 
+   *pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM |
+   ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
+   if (src & MIGRATE_PFN_WRITE)
+   *pfn |= NVIF_VMM_PFNMAP_V0_W;
return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 
 out_dma_unmap:
@@ -590,18 +597,19 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct 
nouveau_drm *drm,
 out_free_page:
nouveau_dmem_page_free_locked(drm, dpage);
 out:
+   *pfn = NVIF_VMM_PFNMAP_V0_NONE;
return 0;
 }
 
 static void nouveau_dmem_migrate_chunk(struct migrate_vma *args,
-   struct nouveau_drm *drm, dma_addr_t *dma_addrs)
+   struct nouveau_drm *drm, dma_addr_t *dma_addrs, u64 *pfns)
 {
struct nouveau_fence *fence;
unsigned long addr = args->start, nr_dma = 0, i;
 
for (i = 0; addr < args->end; i++) {
args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->vma,
-   addr, args->src[i], &dma_addrs[nr_dma]);
+   args->src[i], &dma_addrs[nr_dma], &pfns[i]);
if (args->dst[i])
nr_dma++;
addr += PAGE_SIZE;
@@ -615,10 +623,6 @@ static void nouveau_dmem_migrate_chunk(struct migrate_vma 
*args,
dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE,
DMA_BIDIRECTIONAL);
}
-   /*
-* FIXME optimization: update GPU page table to point to newly migrated
-* memory.
-*/
migrate_vma_finalize(args);
 }
 
@@ -631,11 +635,12 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
unsigned long npages = (end - start) >> PAGE_SHIFT;
unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages);
dma_addr_t *dma_addrs;
+   u64 *pfns;
struct migrate_vma args = {
.vma= vma,
.start  = start,
};
-   unsigned long c, i;
+   unsigned long i;
int ret = -ENOMEM;
 
args.src = kcalloc(max, sizeof(args.src), GFP_KERNEL);
@@ -649,19 +654,25 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
if (!dma

Re: [PATCH 6/9] nouveau: simplify nouveau_dmem_migrate_to_ram

2019-08-08 Thread Ralph Campbell



On 8/8/19 8:33 AM, Christoph Hellwig wrote:

Factor the main copy page to ram routine out into a helper that acts on
a single page and which doesn't require the nouveau_dmem_fault
structure for argument passing.  Also remove the loop over multiple
pages as we only handle one at the moment, although the structure of
the main worker function makes it relatively easy to add multi page
support back if needed in the future.  But at least for now this avoids
the need to dynamically allocate memory for the dma addresses in
what is essentially the page fault path.

Signed-off-by: Christoph Hellwig 


Reviewed-by: Ralph Campbell 


---
  drivers/gpu/drm/nouveau/nouveau_dmem.c | 159 +++--
  1 file changed, 40 insertions(+), 119 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 21052a4aaf69..473195762974 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -86,13 +86,6 @@ static inline struct nouveau_dmem *page_to_dmem(struct page 
*page)
return container_of(page->pgmap, struct nouveau_dmem, pagemap);
  }
  
-struct nouveau_dmem_fault {

-   struct nouveau_drm *drm;
-   struct nouveau_fence *fence;
-   dma_addr_t *dma;
-   unsigned long npages;
-};
-
  struct nouveau_migrate {
struct vm_area_struct *vma;
struct nouveau_drm *drm;
@@ -146,130 +139,57 @@ static void nouveau_dmem_fence_done(struct nouveau_fence 
**fence)
}
  }
  
-static void

-nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma,
- const unsigned long *src_pfns,
- unsigned long *dst_pfns,
- unsigned long start,
- unsigned long end,
- struct nouveau_dmem_fault *fault)
+static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm,
+   struct vm_fault *vmf, struct migrate_vma *args,
+   dma_addr_t *dma_addr)
  {
-   struct nouveau_drm *drm = fault->drm;
struct device *dev = drm->dev->dev;
-   unsigned long addr, i, npages = 0;
-   nouveau_migrate_copy_t copy;
-   int ret;
-
+   struct page *dpage, *spage;
+   vm_fault_t ret = VM_FAULT_SIGBUS;


You can remove this line and return VM_FAULT_SIGBUS in the error path below.

  
-	/* First allocate new memory */

-   for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct page *dpage, *spage;
-
-   dst_pfns[i] = 0;
-   spage = migrate_pfn_to_page(src_pfns[i]);
-   if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE))
-   continue;
-
-   dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr);
-   if (!dpage) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-   continue;
-   }
-   lock_page(dpage);
-
-   dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) |
- MIGRATE_PFN_LOCKED;
-   npages++;
-   }
+   spage = migrate_pfn_to_page(args->src[0]);
+   if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE))
+   return 0;
  
-	/* Allocate storage for DMA addresses, so we can unmap later. */

-   fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL);
-   if (!fault->dma)
+   dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address);
+   if (!dpage)
goto error;
+   lock_page(dpage);
  
-	/* Copy things over */

-   copy = drm->dmem->migrate.copy_func;
-   for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) {
-   struct page *spage, *dpage;
-
-   dpage = migrate_pfn_to_page(dst_pfns[i]);
-   if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR)
-   continue;
-
-   spage = migrate_pfn_to_page(src_pfns[i]);
-   if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-   __free_page(dpage);
-   continue;
-   }
-
-   fault->dma[fault->npages] =
-   dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE,
-  PCI_DMA_BIDIRECTIONAL,
-  DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(dev, fault->dma[fault->npages])) {
-   dst_pfns[i] = MIGRATE_PFN_ERROR;
-   __free_page(dpage);
-   continue;
-   }
+   *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
+   if (dma_mapping_error(dev, *dma_addr))
+   goto error_free_page;
  
-		ret = copy(drm, 1, NOUVEAU

Re: [PATCH] nouveau/hmm: map pages after migration

2019-08-08 Thread Ralph Campbell



On 8/8/19 12:07 AM, Christoph Hellwig wrote:

On Wed, Aug 07, 2019 at 08:02:14AM -0700, Ralph Campbell wrote:

When memory is migrated to the GPU it is likely to be accessed by GPU
code soon afterwards. Instead of waiting for a GPU fault, map the
migrated memory into the GPU page tables with the same access permissions
as the source CPU page table entries. This preserves copy on write
semantics.

Signed-off-by: Ralph Campbell 
Cc: Christoph Hellwig 
Cc: Jason Gunthorpe 
Cc: "Jérôme Glisse" 
Cc: Ben Skeggs 
---

This patch is based on top of Christoph Hellwig's 9 patch series
https://lore.kernel.org/linux-mm/20190729234611.gc7...@redhat.com/T/#u
"turn the hmm migrate_vma upside down" but without patch 9
"mm: remove the unused MIGRATE_PFN_WRITE" and adds a use for the flag.


This looks useful.  I've already dropped that patch for the pending
resend.


Thanks.




  static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
-   struct vm_area_struct *vma, unsigned long addr,
-   unsigned long src, dma_addr_t *dma_addr)
+   struct vm_area_struct *vma, unsigned long src,
+   dma_addr_t *dma_addr, u64 *pfn)


I'll pick up the removal of the not needed addr argument for the patch
introducing nouveau_dmem_migrate_copy_one, thanks,


  static void nouveau_dmem_migrate_chunk(struct migrate_vma *args,
-   struct nouveau_drm *drm, dma_addr_t *dma_addrs)
+   struct nouveau_drm *drm, dma_addr_t *dma_addrs, u64 *pfns)
  {
struct nouveau_fence *fence;
unsigned long addr = args->start, nr_dma = 0, i;
  
  	for (i = 0; addr < args->end; i++) {

args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->vma,
-   addr, args->src[i], _addrs[nr_dma]);
+   args->src[i], _addrs[nr_dma], [i]);


Nit: I find the &pfns[i] way to pass the argument a little weird to read.
Why not "pfns + i"?


OK, will do in v2.
Should I convert to "dma_addrs + nr_dma" too?


+u64 *
+nouveau_pfns_alloc(unsigned long npages)
+{
+   struct nouveau_pfnmap_args *args;
+
+   args = kzalloc(sizeof(*args) + npages * sizeof(args->p.phys[0]),


Can we use struct_size here?


Yes, good suggestion.




+   int ret;
+
+   if (!svm)
+   return;
+
+   mutex_lock(>mutex);
+   svmm = nouveau_find_svmm(svm, mm);
+   if (!svmm) {
+   mutex_unlock(>mutex);
+   return;
+   }
+   mutex_unlock(>mutex);


Given that nouveau_find_svmm doesn't take any kind of reference, what
guarantees svmm doesn't go away after dropping the lock?


I asked Ben and Jerome about this too.
I'm still looking into it.




@@ -44,5 +49,19 @@ static inline int nouveau_svmm_bind(struct drm_device 
*device, void *p,
  {
return -ENOSYS;
  }
+
+u64 *nouveau_pfns_alloc(unsigned long npages)
+{
+   return NULL;
+}
+
+void nouveau_pfns_free(u64 *pfns)
+{
+}
+
+void nouveau_pfns_map(struct nouveau_drm *drm, struct mm_struct *mm,
+ unsigned long addr, u64 *pfns, unsigned long npages)
+{
+}
  #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */


nouveau_dmem.c and nouveau_svm.c are both built conditional on
CONFIG_DRM_NOUVEAU_SVM, so there is no need for stubs here.



Good point. I'll remove them in v2.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: hmm_range_fault related fixes and legacy API removal v2

2019-07-23 Thread Ralph Campbell



On 7/22/19 2:44 AM, Christoph Hellwig wrote:

Hi Jérôme, Ben and Jason,

below is a series against the hmm tree which fixes up the mmap_sem
locking in nouveau and while at it also removes leftover legacy HMM APIs
only used by nouveau.

The first 4 patches are a bug fix for nouveau, which I suspect should
go into this merge window even if the code is marked as staging, just
to avoid people copying the breakage.

Changes since v1:
  - don't return the valid state from hmm_range_unregister
  - additional nouveau cleanups



I ran some OpenCL tests from Jerome with nouveau and this series,
5.3.0-rc1, and my two HMM fixes:
("mm/hmm: fix ZONE_DEVICE anon page mapping reuse")
("mm/hmm: Fix bad subpage pointer in try_to_unmap_one")

You can add for the series:
Tested-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH] mm/hmm: replace hmm_update with mmu_notifier_range

2019-07-24 Thread Ralph Campbell

The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes
a struct hmm_update which is a simplified version of struct
mmu_notifier_range. This is unnecessary so replace hmm_update with
mmu_notifier_range directly.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
Cc: Ben Skeggs 
---

This is based on 5.3.0-rc1 plus Christoph Hellwig's 6 patches
("hmm_range_fault related fixes and legacy API removal v2").
Jason, I believe this is the patch you were requesting.

 drivers/gpu/drm/nouveau/nouveau_svm.c |  4 ++--
 include/linux/hmm.h   | 31 ---
 mm/hmm.c  | 13 ---
 3 files changed, 10 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index a9c5c58d425b..6298d2dadb55 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 
start, u64 limit)
 
 static int
 nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
-   const struct hmm_update *update)
+   const struct mmu_notifier_range *update)
 {
struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
unsigned long start = update->start;
unsigned long limit = update->end;
 
-   if (!update->blockable)
+   if (!mmu_notifier_range_blockable(update))
return -EAGAIN;
 
SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 9f32586684c9..659e25a15700 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const 
struct hmm_range *range,
 
 struct hmm_mirror;
 
-/*
- * enum hmm_update_event - type of update
- * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
- */
-enum hmm_update_event {
-   HMM_UPDATE_INVALIDATE,
-};
-
-/*
- * struct hmm_update - HMM update information for callback
- *
- * @start: virtual start address of the range to update
- * @end: virtual end address of the range to update
- * @event: event triggering the update (what is happening)
- * @blockable: can the callback block/sleep ?
- */
-struct hmm_update {
-   unsigned long start;
-   unsigned long end;
-   enum hmm_update_event event;
-   bool blockable;
-};
-
 /*
  * struct hmm_mirror_ops - HMM mirror device operations callback
  *
@@ -383,9 +360,9 @@ struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
 *
 * @mirror: pointer to struct hmm_mirror
-* @update: update information (see struct hmm_update)
-* Return: -EAGAIN if update.blockable false and callback need to
-*  block, 0 otherwise.
+* @update: update information (see struct mmu_notifier_range)
+* Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
+* and callback needs to block, 0 otherwise.
 *
 * This callback ultimately originates from mmu_notifiers when the CPU
 * page table is updated. The device driver must update its page table
@@ -397,7 +374,7 @@ struct hmm_mirror_ops {
 * synchronous call.
 */
int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror,
- const struct hmm_update *update);
+   const struct mmu_notifier_range *update);
 };
 
 /*
diff --git a/mm/hmm.c b/mm/hmm.c
index 16b6731a34db..b810a4fa3de9 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -165,7 +165,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
 {
struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
-   struct hmm_update update;
struct hmm_range *range;
unsigned long flags;
int ret = 0;
@@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
if (!kref_get_unless_zero(>kref))
return 0;
 
-   update.start = nrange->start;
-   update.end = nrange->end;
-   update.event = HMM_UPDATE_INVALIDATE;
-   update.blockable = mmu_notifier_range_blockable(nrange);
-
spin_lock_irqsave(>ranges_lock, flags);
hmm->notifiers++;
list_for_each_entry(range, >ranges, list) {
-   if (update.end < range->start || update.start >= range->end)
+   if (nrange->end < range->start || nrange->start >= range->end)
continue;
 
range->valid = false;
@@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
list_for_each_entry(mirror, >mirrors, l

Re: [PATCH] mm/hmm: replace hmm_update with mmu_notifier_range

2019-07-26 Thread Ralph Campbell



On 7/24/19 6:14 PM, Jason Gunthorpe wrote:

On Tue, Jul 23, 2019 at 02:05:06PM -0700, Ralph Campbell wrote:

The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes
a struct hmm_update which is a simplified version of struct
mmu_notifier_range. This is unnecessary so replace hmm_update with
mmu_notifier_range directly.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
Cc: Ben Skeggs 

This is based on 5.3.0-rc1 plus Christoph Hellwig's 6 patches
("hmm_range_fault related fixes and legacy API removal v2").
Jason, I believe this is the patch you were requesting.


Doesn't this need revision to include amdgpu?

drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c: .sync_cpu_device_pagetables = 
amdgpu_mn_sync_pagetables_gfx,
drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c: .sync_cpu_device_pagetables = 
amdgpu_mn_sync_pagetables_hsa,

Thanks,
Jason



Yes. I have added this to v2 which I'll send out with Christoph's 2 
patches and the hmm_range.vma removal patch you suggested.

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v2 4/7] mm: merge hmm_range_snapshot into hmm_range_fault

2019-07-25 Thread Ralph Campbell

From: Christoph Hellwig 

Add a HMM_FAULT_SNAPSHOT flag so that hmm_range_snapshot can be merged
into the almost identical hmm_range_fault function.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
---
 Documentation/vm/hmm.rst | 17 
 include/linux/hmm.h  |  4 +-
 mm/hmm.c | 85 +---
 3 files changed, 13 insertions(+), 93 deletions(-)

diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst
index 710ce1c701bf..ddcb5ca8b296 100644
--- a/Documentation/vm/hmm.rst
+++ b/Documentation/vm/hmm.rst
@@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must 
complete the update before
 the driver callback returns.
 
 When the device driver wants to populate a range of virtual addresses, it can
-use either::
+use::
 
-  long hmm_range_snapshot(struct hmm_range *range);
-  long hmm_range_fault(struct hmm_range *range, bool block);
+  long hmm_range_fault(struct hmm_range *range, unsigned int flags);
 
-The first one (hmm_range_snapshot()) will only fetch present CPU page table
+With the HMM_FAULT_SNAPSHOT flag, it will only fetch present CPU page table
 entries and will not trigger a page fault on missing or non-present entries.
-The second one does trigger a page fault on missing or read-only entries if
-write access is requested (see below). Page faults use the generic mm page
+Without that flag, it does trigger a page fault on missing or read-only entries
+if write access is requested (see below). Page faults use the generic mm page
 fault code path just like a CPU page fault.
 
 Both functions copy CPU page table entries into their pfns array argument. Each
@@ -227,20 +226,20 @@ The usage pattern is::
 
   /*
* Just wait for range to be valid, safe to ignore return value as we
-   * will use the return value of hmm_range_snapshot() below under the
+   * will use the return value of hmm_range_fault() below under the
* mmap_sem to ascertain the validity of the range.
*/
   hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC);
 
  again:
   down_read(>mmap_sem);
-  ret = hmm_range_snapshot();
+  ret = hmm_range_fault(, HMM_RANGE_SNAPSHOT);
   if (ret) {
   up_read(>mmap_sem);
   if (ret == -EBUSY) {
 /*
  * No need to check hmm_range_wait_until_valid() return value
- * on retry we will get proper error with hmm_range_snapshot()
+ * on retry we will get proper error with hmm_range_fault()
  */
 hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC);
 goto again;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 15f1b113be3c..f3693dcc8b98 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -412,7 +412,9 @@ void hmm_range_unregister(struct hmm_range *range);
  */
 #define HMM_FAULT_ALLOW_RETRY  (1 << 0)
 
-long hmm_range_snapshot(struct hmm_range *range);
+/* Don't fault in missing PTEs, just snapshot the current state. */
+#define HMM_FAULT_SNAPSHOT (1 << 1)
+
 long hmm_range_fault(struct hmm_range *range, unsigned int flags);
 
 long hmm_range_dma_map(struct hmm_range *range,
diff --git a/mm/hmm.c b/mm/hmm.c
index 84f2791d3510..1bc014cddd78 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -280,7 +280,6 @@ struct hmm_vma_walk {
struct hmm_range*range;
struct dev_pagemap  *pgmap;
unsigned long   last;
-   boolfault;
unsigned intflags;
 };
 
@@ -373,7 +372,7 @@ static inline void hmm_pte_need_fault(const struct 
hmm_vma_walk *hmm_vma_walk,
 {
struct hmm_range *range = hmm_vma_walk->range;
 
-   if (!hmm_vma_walk->fault)
+   if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
return;
 
/*
@@ -418,7 +417,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk 
*hmm_vma_walk,
 {
unsigned long i;
 
-   if (!hmm_vma_walk->fault) {
+   if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
*fault = *write_fault = false;
return;
}
@@ -936,85 +935,6 @@ void hmm_range_unregister(struct hmm_range *range)
 }
 EXPORT_SYMBOL(hmm_range_unregister);
 
-/*
- * hmm_range_snapshot() - snapshot CPU page table for a range
- * @range: range
- * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- *  permission (for instance asking for write and range is read only),
- *  -EBUSY if you need to retry, -EFAULT invalid (ie either no valid
- *  vma or it is illegal to access that range), number of valid pages
- *  in range->pfns[] (from range start address).
- *
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See in include/linux/hmm.h for example
- * on how to use.

[PATCH v2 5/7] mm/hmm: make full use of walk_page_range()

2019-07-25 Thread Ralph Campbell

hmm_range_fault() calls find_vma() and walk_page_range() in a loop.
This is unnecessary duplication since walk_page_range() calls find_vma()
in a loop already.
Simplify hmm_range_fault() by defining a walk_test() callback function
to filter unhandled vmas.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 130 ---
 1 file changed, 57 insertions(+), 73 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 1bc014cddd78..838cd1d50497 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -840,13 +840,44 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, 
unsigned long hmask,
 #endif
 }
 
-static void hmm_pfns_clear(struct hmm_range *range,
-  uint64_t *pfns,
-  unsigned long addr,
-  unsigned long end)
+static int hmm_vma_walk_test(unsigned long start,
+unsigned long end,
+struct mm_walk *walk)
 {
-   for (; addr < end; addr += PAGE_SIZE, pfns++)
-   *pfns = range->values[HMM_PFN_NONE];
+   struct hmm_vma_walk *hmm_vma_walk = walk->private;
+   struct hmm_range *range = hmm_vma_walk->range;
+   struct vm_area_struct *vma = walk->vma;
+
+   /* If range is no longer valid, force retry. */
+   if (!range->valid)
+   return -EBUSY;
+
+   /*
+* Skip vma ranges that don't have struct page backing them or
+* map I/O devices directly.
+* TODO: handle peer-to-peer device mappings.
+*/
+   if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP))
+   return -EFAULT;
+
+   if (is_vm_hugetlb_page(vma)) {
+   if (huge_page_shift(hstate_vma(vma)) != range->page_shift &&
+   range->page_shift != PAGE_SHIFT)
+   return -EINVAL;
+   } else {
+   if (range->page_shift != PAGE_SHIFT)
+   return -EINVAL;
+   }
+
+   /*
+* If vma does not allow read access, then assume that it does not
+* allow write access, either. HMM does not support architectures
+* that allow write without read.
+*/
+   if (!(vma->vm_flags & VM_READ))
+   return -EPERM;
+
+   return 0;
 }
 
 /*
@@ -965,82 +996,35 @@ EXPORT_SYMBOL(hmm_range_unregister);
  */
 long hmm_range_fault(struct hmm_range *range, unsigned int flags)
 {
-   const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
-   unsigned long start = range->start, end;
-   struct hmm_vma_walk hmm_vma_walk;
+   unsigned long start = range->start;
+   struct hmm_vma_walk hmm_vma_walk = {};
struct hmm *hmm = range->hmm;
-   struct vm_area_struct *vma;
-   struct mm_walk mm_walk;
+   struct mm_walk mm_walk = {};
int ret;
 
lockdep_assert_held(>mm->mmap_sem);
 
-   do {
-   /* If range is no longer valid force retry. */
-   if (!range->valid)
-   return -EBUSY;
+   hmm_vma_walk.range = range;
+   hmm_vma_walk.last = start;
+   hmm_vma_walk.flags = flags;
+   mm_walk.private = _vma_walk;
 
-   vma = find_vma(hmm->mm, start);
-   if (vma == NULL || (vma->vm_flags & device_vma))
-   return -EFAULT;
-
-   if (is_vm_hugetlb_page(vma)) {
-   if (huge_page_shift(hstate_vma(vma)) !=
-   range->page_shift &&
-   range->page_shift != PAGE_SHIFT)
-   return -EINVAL;
-   } else {
-   if (range->page_shift != PAGE_SHIFT)
-   return -EINVAL;
-   }
+   mm_walk.mm = hmm->mm;
+   mm_walk.pud_entry = hmm_vma_walk_pud;
+   mm_walk.pmd_entry = hmm_vma_walk_pmd;
+   mm_walk.pte_hole = hmm_vma_walk_hole;
+   mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry;
+   mm_walk.test_walk = hmm_vma_walk_test;
 
-   if (!(vma->vm_flags & VM_READ)) {
-   /*
-* If vma do not allow read access, then assume that it
-* does not allow write access, either. HMM does not
-* support architecture that allow write without read.
-*/
-   hmm_pfns_clear(range, range->pfns,
-   range->start, range->end);
-   return -EPERM;
-   }
+   do {
+   ret = walk_page_range(start, range->end, _walk);
+   start = hmm_vma_walk.last;
 
-   range->vma = vma;
-   hmm_vma_walk.pgmap = NULL;
-   hmm_vma_walk.last = start;
-

[PATCH v2 2/7] mm/hmm: a few more C style and comment clean ups

2019-07-25 Thread Ralph Campbell

A few more comments and minor programming style clean ups.
There should be no functional changes.

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 39 +--
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 4040b4427635..362944b0fbca 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -32,7 +32,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
  * hmm_get_or_create - register HMM against an mm (HMM internal)
  *
  * @mm: mm struct to attach to
- * Returns: returns an HMM object, either by referencing the existing
+ * Return: an HMM object, either by referencing the existing
  *  (per-process) object, or by creating a new one.
  *
  * This is not intended to be used directly by device drivers. If mm already
@@ -325,8 +325,8 @@ static int hmm_pfns_bad(unsigned long addr,
 }
 
 /*
- * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
- * @start: range virtual start address (inclusive)
+ * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
+ * @addr: range virtual start address (inclusive)
  * @end: range virtual end address (exclusive)
  * @fault: should we fault or not ?
  * @write_fault: write fault ?
@@ -376,9 +376,9 @@ static inline void hmm_pte_need_fault(const struct 
hmm_vma_walk *hmm_vma_walk,
/*
 * So we not only consider the individual per page request we also
 * consider the default flags requested for the range. The API can
-* be use in 2 fashions. The first one where the HMM user coalesce
-* multiple page fault into one request and set flags per pfns for
-* of those faults. The second one where the HMM user want to pre-
+* be used 2 ways. The first one where the HMM user coalesces
+* multiple page faults into one request and sets flags per pfn for
+* those faults. The second one where the HMM user wants to pre-
 * fault a range with specific flags. For the latter one it is a
 * waste to have the user pre-fill the pfn arrays with a default
 * flags value.
@@ -388,7 +388,7 @@ static inline void hmm_pte_need_fault(const struct 
hmm_vma_walk *hmm_vma_walk,
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
-   /* If this is device memory than only fault if explicitly requested */
+   /* If this is device memory then only fault if explicitly requested */
if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
/* Do we fault on device memory ? */
if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
@@ -502,7 +502,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk,
hmm_vma_walk->last = end;
return 0;
 #else
-   /* If THP is not enabled then we should never reach that code ! */
+   /* If THP is not enabled then we should never reach this code ! */
return -EINVAL;
 #endif
 }
@@ -522,7 +522,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
 {
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
-   struct vm_area_struct *vma = walk->vma;
bool fault, write_fault;
uint64_t cpu_flags;
pte_t pte = *ptep;
@@ -571,8 +570,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, 
unsigned long addr,
if (fault || write_fault) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
-   migration_entry_wait(vma->vm_mm,
-pmdp, addr);
+   migration_entry_wait(walk->mm, pmdp, addr);
return -EBUSY;
}
return 0;
@@ -620,13 +618,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 {
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
-   struct vm_area_struct *vma = walk->vma;
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
pmd_t pmd;
 
-
 again:
pmd = READ_ONCE(*pmdp);
if (pmd_none(pmd))
@@ -648,7 +644,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 0, , _fault);
if (fault || write_fault) {
hmm_vma_walk->last = addr;
-   pmd_migration_entry_wait(vma->vm_mm, pmdp);
+   pmd_migration_entry_wait(walk->mm, pmdp);
return -EBUSY;
}
return 0;
@@ -657,11 +653,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 
if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {

[PATCH v2 1/7] mm/hmm: replace hmm_update with mmu_notifier_range

2019-07-25 Thread Ralph Campbell

The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes
a struct hmm_update which is a simplified version of struct
mmu_notifier_range. This is unnecessary so replace hmm_update with
mmu_notifier_range directly.

Signed-off-by: Ralph Campbell 
Reviewed-by: Christoph Hellwig
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Ben Skeggs 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c |  8 +++
 drivers/gpu/drm/nouveau/nouveau_svm.c  |  4 ++--
 include/linux/hmm.h| 31 --
 mm/hmm.c   | 13 ---
 4 files changed, 14 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 3971c201f320..cf945080dff3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -196,12 +196,12 @@ static void amdgpu_mn_invalidate_node(struct 
amdgpu_mn_node *node,
  * potentially dirty.
  */
 static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror,
-   const struct hmm_update *update)
+   const struct mmu_notifier_range *update)
 {
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
unsigned long start = update->start;
unsigned long end = update->end;
-   bool blockable = update->blockable;
+   bool blockable = mmu_notifier_range_blockable(update);
struct interval_tree_node *it;
 
/* notification is exclusive, but interval is inclusive */
@@ -244,12 +244,12 @@ static int amdgpu_mn_sync_pagetables_gfx(struct 
hmm_mirror *mirror,
  * are restorted in amdgpu_mn_invalidate_range_end_hsa.
  */
 static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror,
-   const struct hmm_update *update)
+   const struct mmu_notifier_range *update)
 {
struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror);
unsigned long start = update->start;
unsigned long end = update->end;
-   bool blockable = update->blockable;
+   bool blockable = mmu_notifier_range_blockable(update);
struct interval_tree_node *it;
 
/* notification is exclusive, but interval is inclusive */
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 545100f7c594..79b29c918717 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 
start, u64 limit)
 
 static int
 nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror,
-   const struct hmm_update *update)
+   const struct mmu_notifier_range *update)
 {
struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror);
unsigned long start = update->start;
unsigned long limit = update->end;
 
-   if (!update->blockable)
+   if (!mmu_notifier_range_blockable(update))
return -EAGAIN;
 
SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit);
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 9f32586684c9..659e25a15700 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const 
struct hmm_range *range,
 
 struct hmm_mirror;
 
-/*
- * enum hmm_update_event - type of update
- * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why)
- */
-enum hmm_update_event {
-   HMM_UPDATE_INVALIDATE,
-};
-
-/*
- * struct hmm_update - HMM update information for callback
- *
- * @start: virtual start address of the range to update
- * @end: virtual end address of the range to update
- * @event: event triggering the update (what is happening)
- * @blockable: can the callback block/sleep ?
- */
-struct hmm_update {
-   unsigned long start;
-   unsigned long end;
-   enum hmm_update_event event;
-   bool blockable;
-};
-
 /*
  * struct hmm_mirror_ops - HMM mirror device operations callback
  *
@@ -383,9 +360,9 @@ struct hmm_mirror_ops {
/* sync_cpu_device_pagetables() - synchronize page tables
 *
 * @mirror: pointer to struct hmm_mirror
-* @update: update information (see struct hmm_update)
-* Return: -EAGAIN if update.blockable false and callback need to
-*  block, 0 otherwise.
+* @update: update information (see struct mmu_notifier_range)
+* Return: -EAGAIN if mmu_notifier_range_blockable(update) is false
+* and callback needs to block, 0 otherwise.
 *
 * This callback ultimately originates from mmu_notifiers when the CPU
 * page table is updated. The device driver must update its page table
@@ -397,7 +374,7 @@ struct hmm_mirror_ops {
 * synchronous call.

[PATCH v2 6/7] mm/hmm: remove hugetlbfs check in hmm_vma_walk_pmd

2019-07-25 Thread Ralph Campbell

walk_page_range() will only call hmm_vma_walk_hugetlb_entry() for
hugetlbfs pages and doesn't call hmm_vma_walk_pmd() in this case.
Therefore, it is safe to remove the check for vma->vm_flags & VM_HUGETLB
in hmm_vma_walk_pmd().

Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
Cc: Christoph Hellwig 
---
 mm/hmm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 838cd1d50497..29f322ca5d58 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -630,9 +630,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, walk);
 
-   if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB))
-   return hmm_pfns_bad(start, end, walk);
-
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
bool fault, write_fault;
unsigned long npages;
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v2 3/7] mm/hmm: replace the block argument to hmm_range_fault with a flags value

2019-07-25 Thread Ralph Campbell

From: Christoph Hellwig 

This allows easier expansion to other flags, and also makes the
callers a little easier to read.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Jason Gunthorpe 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  2 +-
 drivers/gpu/drm/nouveau/nouveau_svm.c   |  2 +-
 include/linux/hmm.h | 11 +++-
 mm/hmm.c| 74 -
 4 files changed, 48 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index e51b48ac48eb..12a59ac83f72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -832,7 +832,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, 
struct page **pages)
 
down_read(>mmap_sem);
 
-   r = hmm_range_fault(range, true);
+   r = hmm_range_fault(range, 0);
if (unlikely(r < 0)) {
if (likely(r == -EAGAIN)) {
/*
diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 79b29c918717..49b520c60fc5 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -505,7 +505,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct 
hmm_range *range)
return -EBUSY;
}
 
-   ret = hmm_range_fault(range, true);
+   ret = hmm_range_fault(range, 0);
if (ret <= 0) {
if (ret == 0)
ret = -EBUSY;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 659e25a15700..15f1b113be3c 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -406,12 +406,19 @@ int hmm_range_register(struct hmm_range *range,
   unsigned long end,
   unsigned page_shift);
 void hmm_range_unregister(struct hmm_range *range);
+
+/*
+ * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case.
+ */
+#define HMM_FAULT_ALLOW_RETRY  (1 << 0)
+
 long hmm_range_snapshot(struct hmm_range *range);
-long hmm_range_fault(struct hmm_range *range, bool block);
+long hmm_range_fault(struct hmm_range *range, unsigned int flags);
+
 long hmm_range_dma_map(struct hmm_range *range,
   struct device *device,
   dma_addr_t *daddrs,
-  bool block);
+  unsigned int flags);
 long hmm_range_dma_unmap(struct hmm_range *range,
 struct vm_area_struct *vma,
 struct device *device,
diff --git a/mm/hmm.c b/mm/hmm.c
index 362944b0fbca..84f2791d3510 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -281,7 +281,7 @@ struct hmm_vma_walk {
struct dev_pagemap  *pgmap;
unsigned long   last;
boolfault;
-   boolblock;
+   unsigned intflags;
 };
 
 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
@@ -293,8 +293,11 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned 
long addr,
struct vm_area_struct *vma = walk->vma;
vm_fault_t ret;
 
-   flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
-   flags |= write_fault ? FAULT_FLAG_WRITE : 0;
+   if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
+   flags |= FAULT_FLAG_ALLOW_RETRY;
+   if (write_fault)
+   flags |= FAULT_FLAG_WRITE;
+
ret = handle_mm_fault(vma, addr, flags);
if (ret & VM_FAULT_RETRY) {
/* Note, handle_mm_fault did up_read(>mmap_sem)) */
@@ -1012,26 +1015,26 @@ long hmm_range_snapshot(struct hmm_range *range)
 }
 EXPORT_SYMBOL(hmm_range_snapshot);
 
-/*
- * hmm_range_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Return: number of valid pages in range->pfns[] (from range start
- *  address). This may be zero. If the return value is negative,
- *  then one of the following values may be returned:
+/**
+ * hmm_range_fault - try to fault some address in a virtual address range
+ * @range: range being faulted
+ * @flags: HMM_FAULT_* flags
  *
- *   -EINVAL  invalid arguments or mm or virtual address are in an
- *invalid vma (for instance device file vma).
- *   -ENOMEM: Out of memory.
- *   -EPERM:  Invalid permission (for instance asking for write and
- *range is read only).
- *   -EAGAIN: If you need to retry and mmap_sem was drop. This can only
- *happens if block argument is false.
- *   -EBUSY:  If the the range is being invalidated and you should wait
- *for invalidation to finish.
- *   -EFAULT: Invalid (ie

[PATCH v2 7/7] mm/hmm: remove hmm_range vma

2019-07-25 Thread Ralph Campbell

Since hmm_range_fault() doesn't use the struct hmm_range vma field,
remove it.

Suggested-by: Jason Gunthorpe 
Signed-off-by: Ralph Campbell 
Cc: "Jérôme Glisse" 
Cc: Christoph Hellwig 
---
 drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++
 include/linux/hmm.h   | 1 -
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 49b520c60fc5..a74530b5a523 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -496,12 +496,12 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct 
hmm_range *range)
 range->start, range->end,
 PAGE_SHIFT);
if (ret) {
-   up_read(>vma->vm_mm->mmap_sem);
+   up_read(>hmm->mm->mmap_sem);
return (int)ret;
}
 
if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) {
-   up_read(>vma->vm_mm->mmap_sem);
+   up_read(>hmm->mm->mmap_sem);
return -EBUSY;
}
 
@@ -509,7 +509,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct 
hmm_range *range)
if (ret <= 0) {
if (ret == 0)
ret = -EBUSY;
-   up_read(>vma->vm_mm->mmap_sem);
+   up_read(>hmm->mm->mmap_sem);
hmm_range_unregister(range);
return ret;
}
@@ -682,7 +682,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
 args.i.p.addr + args.i.p.size, fn - fi);
 
/* Have HMM fault pages within the fault window to the GPU. */
-   range.vma = vma;
range.start = args.i.p.addr;
range.end = args.i.p.addr + args.i.p.size;
range.pfns = args.phys;
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index f3693dcc8b98..68949cf815f9 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -164,7 +164,6 @@ enum hmm_pfn_value_e {
  */
 struct hmm_range {
struct hmm  *hmm;
-   struct vm_area_struct   *vma;
struct list_headlist;
unsigned long   start;
unsigned long   end;
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v2 0/7] mm/hmm: more HMM clean up

2019-07-25 Thread Ralph Campbell

Here are seven more patches for things I found to clean up.
This was based on top of Christoph's seven patches:
"hmm_range_fault related fixes and legacy API removal v3".
I assume this will go into Jason's tree since there will likely be
more HMM changes in this cycle.

Changes from v1 to v2:

Added AMD GPU to hmm_update removal.
Added 2 patches from Christoph.
Added 2 patches as a result of Jason's suggestions.

Christoph Hellwig (2):
  mm/hmm: replace the block argument to hmm_range_fault with a flags
value
  mm: merge hmm_range_snapshot into hmm_range_fault

Ralph Campbell (5):
  mm/hmm: replace hmm_update with mmu_notifier_range
  mm/hmm: a few more C style and comment clean ups
  mm/hmm: make full use of walk_page_range()
  mm/hmm: remove hugetlbfs check in hmm_vma_walk_pmd
  mm/hmm: remove hmm_range vma

 Documentation/vm/hmm.rst|  17 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c  |   8 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |   2 +-
 drivers/gpu/drm/nouveau/nouveau_svm.c   |  13 +-
 include/linux/hmm.h |  47 ++--
 mm/hmm.c| 340 
 6 files changed, 150 insertions(+), 277 deletions(-)

-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: hmm_range_fault related fixes and legacy API removal v3

2019-07-25 Thread Ralph Campbell



On 7/25/19 5:16 PM, Jason Gunthorpe wrote:

On Wed, Jul 24, 2019 at 08:52:51AM +0200, Christoph Hellwig wrote:

Hi Jérôme, Ben and Jason,

below is a series against the hmm tree which fixes up the mmap_sem
locking in nouveau and while at it also removes leftover legacy HMM APIs
only used by nouveau.

The first 4 patches are a bug fix for nouveau, which I suspect should
go into this merge window even if the code is marked as staging, just
to avoid people copying the breakage.

Changes since v2:
  - new patch from Jason to document FAULT_FLAG_ALLOW_RETRY semantics
better
  - remove -EAGAIN handling in nouveau earlier


I don't see Ralph's tested by, do you think it changed enough to
require testing again? If so, Ralph would you be so kind?

In any event, I'm sending this into linux-next and intend to forward
the first four next week.

Thanks,
Jason



I have been testing Christoph's v3 with my set of v2 changes so
feel free to add my tested-by.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 2/7] mm/hmm: a few more C style and comment clean ups

2019-07-26 Thread Ralph Campbell



On 7/25/19 11:23 PM, Christoph Hellwig wrote:

Note: it seems like you've only CCed me on patches 2-7, but not on the
cover letter and patch 1.  I'll try to find them later, but to make Ccs
useful they should normally cover the whole series.

Otherwise this looks fine to me:

Reviewed-by: Christoph Hellwig 



Thanks for the review and sorry about the oversight on CCs.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v4 hmm 12/12] mm/hmm: Fix error flows in hmm_invalidate_range_start

2019-06-27 Thread Ralph Campbell



On 6/24/19 2:01 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

If the trylock on the hmm->mirrors_sem fails the function will return
without decrementing the notifiers that were previously incremented. Since
the caller will not call invalidate_range_end() on EAGAIN this will result
in notifiers becoming permanently incremented and deadlock.

If the sync_cpu_device_pagetables() required blocking the function will
not return EAGAIN even though the device continues to touch the
pages. This is a violation of the mmu notifier contract.

Switch, and rename, the ranges_lock to a spin lock so we can reliably
obtain it without blocking during error unwind.

The error unwind is necessary since the notifiers count must be held
incremented across the call to sync_cpu_device_pagetables() as we cannot
allow the range to become marked valid by a parallel
invalidate_start/end() pair while doing sync_cpu_device_pagetables().

Signed-off-by: Jason Gunthorpe 
Reviewed-by: Ralph Campbell 
Reviewed-by: Christoph Hellwig 
Tested-by: Philip Yang 
---
  include/linux/hmm.h |  2 +-
  mm/hmm.c| 72 +++--
  2 files changed, 45 insertions(+), 29 deletions(-)

diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index bf013e96525771..0fa8ea34ccef6d 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -86,7 +86,7 @@
  struct hmm {
struct mm_struct*mm;
struct kref kref;
-   struct mutexlock;
+   spinlock_t  ranges_lock;
struct list_headranges;
struct list_headmirrors;
struct mmu_notifier mmu_notifier;
diff --git a/mm/hmm.c b/mm/hmm.c
index b224ea635a7716..89549eac03d506 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -64,7 +64,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm)
init_rwsem(>mirrors_sem);
hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(>ranges);
-   mutex_init(>lock);
+   spin_lock_init(>ranges_lock);
kref_init(>kref);
hmm->notifiers = 0;
hmm->mm = mm;
@@ -144,6 +144,23 @@ static void hmm_release(struct mmu_notifier *mn, struct 
mm_struct *mm)
hmm_put(hmm);
  }
  
+static void notifiers_decrement(struct hmm *hmm)

+{
+   lockdep_assert_held(>ranges_lock);
+


Why not acquire the lock here and release at the end instead
of asserting the lock is held?
It looks like everywhere notifiers_decrement() is called does
that.


+   hmm->notifiers--;
+   if (!hmm->notifiers) {
+   struct hmm_range *range;
+
+   list_for_each_entry(range, >ranges, list) {
+   if (range->valid)
+   continue;
+   range->valid = true;
+   }
+   wake_up_all(>wq);
+   }
+}
+
  static int hmm_invalidate_range_start(struct mmu_notifier *mn,
const struct mmu_notifier_range *nrange)
  {
@@ -151,6 +168,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
struct hmm_mirror *mirror;
struct hmm_update update;
struct hmm_range *range;
+   unsigned long flags;
int ret = 0;
  
  	if (!kref_get_unless_zero(>kref))

@@ -161,12 +179,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
update.event = HMM_UPDATE_INVALIDATE;
update.blockable = mmu_notifier_range_blockable(nrange);
  
-	if (mmu_notifier_range_blockable(nrange))

-   mutex_lock(>lock);
-   else if (!mutex_trylock(>lock)) {
-   ret = -EAGAIN;
-   goto out;
-   }
+   spin_lock_irqsave(>ranges_lock, flags);
hmm->notifiers++;
list_for_each_entry(range, >ranges, list) {
if (update.end < range->start || update.start >= range->end)
@@ -174,7 +187,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
  
  		range->valid = false;

}
-   mutex_unlock(>lock);
+   spin_unlock_irqrestore(>ranges_lock, flags);
  
  	if (mmu_notifier_range_blockable(nrange))

down_read(>mirrors_sem);
@@ -182,16 +195,26 @@ static int hmm_invalidate_range_start(struct mmu_notifier 
*mn,
ret = -EAGAIN;
goto out;
}
+
list_for_each_entry(mirror, >mirrors, list) {
-   int ret;
+   int rc;
  
-		ret = mirror->ops->sync_cpu_device_pagetables(mirror, );

-   if (!update.blockable && ret == -EAGAIN)
+   rc = mirror->ops->sync_cpu_device_pagetables(mirror, );
+   if (rc) {
+   if (WARN_ON(update.blockable || rc != -EAGAIN))
+   continue;
+   ret = -EAGAIN;
break;
+   }
}
up_read(>mirrors_sem);
  
  out:

Re: [PATCH] drm/nouveau/dmem: missing mutex_lock in error path

2019-06-16 Thread Ralph Campbell



On 6/13/19 5:49 PM, John Hubbard wrote:

On 6/13/19 5:11 PM, Ralph Campbell wrote:

In nouveau_dmem_pages_alloc(), the drm->dmem->mutex is unlocked before
calling nouveau_dmem_chunk_alloc().
Reacquire the lock before continuing to the next page.

Signed-off-by: Ralph Campbell 
---

I found this while testing Jason Gunthorpe's hmm tree but this is
independent of those changes. I guess it could go through
David Airlie's tree for nouveau or Jason's tree.



Hi Ralph,

btw, was this the fix for the crash you were seeing? It might be nice to
mention in the commit description, if you are seeing real symptoms.



  drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 27aa4e72abe9..00f7236af1b9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -379,9 +379,10 @@ nouveau_dmem_pages_alloc(struct nouveau_drm *drm,
ret = nouveau_dmem_chunk_alloc(drm);
if (ret) {
if (c)
-   break;


Actually, the pre-existing code is a little concerning. Your change preserves
the behavior, but it seems questionable to be doing a "return 0" (whether
via the above break, or your change) when it's in this partially allocated
state. It's reporting success when it only allocates part of what was requested,
and it doesn't fill in the pages array either.




+   return 0;
return ret;
}
+   mutex_lock(>dmem->mutex);
continue;
}
  



The above comment is about pre-existing potential problems, but your patch 
itself
looks correct, so:

Reviewed-by: John Hubbard 


thanks,


The crash was the NULL pointer bug in Christoph's patch #10.
I sent a separate reply for that.

Below is the console output I got, then I made the changes just based on
code inspection. Do you think I should include it in the change log?

As for the "return 0", if you follow the call chain,
nouveau_dmem_pages_alloc() is only ever called for one page so this
currently "works" but I agree it is a bit of a time bomb. There are a
number of other bugs that I can see that need fixing but I think those
should be separate patches.

[ 1294.871933] =
[ 1294.876656] WARNING: bad unlock balance detected!
[ 1294.881375] 5.2.0-rc3+ #5 Not tainted
[ 1294.885048] -
[ 1294.889773] test-malloc-vra/6299 is trying to release lock 
(>dmem->mutex) at:
[ 1294.897482] [] 
nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]

[ 1294.905782] but there are no more locks to release!
[ 1294.910690]
[ 1294.910690] other info that might help us debug this:
[ 1294.917249] 1 lock held by test-malloc-vra/6299:
[ 1294.921881]  #0: 16e10454 (>mmap_sem#2){}, at: 
nouveau_svmm_bind+0x142/0x210 [nouveau]

[ 1294.931313]
[ 1294.931313] stack backtrace:
[ 1294.935702] CPU: 4 PID: 6299 Comm: test-malloc-vra Not tainted 
5.2.0-rc3+ #5

[ 1294.942786] Hardware name: ASUS X299-A/PRIME X299-A, BIOS 1401 05/21/2018
[ 1294.949590] Call Trace:
[ 1294.952059]  dump_stack+0x7c/0xc0
[ 1294.955469]  ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1294.962213]  print_unlock_imbalance_bug.cold.52+0xca/0xcf
[ 1294.967641]  lock_release+0x306/0x380
[ 1294.971383]  ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1294.978089]  ? lock_downgrade+0x2d0/0x2d0
[ 1294.982121]  ? find_held_lock+0xac/0xd0
[ 1294.985979]  __mutex_unlock_slowpath+0x8f/0x3f0
[ 1294.990540]  ? wait_for_completion+0x230/0x230
[ 1294.995002]  ? rwlock_bug.part.2+0x60/0x60
[ 1294.999197]  nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1295.005751]  ? page_mapping+0x98/0x110
[ 1295.009511]  migrate_vma+0xa74/0x1090
[ 1295.013186]  ? move_to_new_page+0x480/0x480
[ 1295.017400]  ? __kmalloc+0x153/0x300
[ 1295.021052]  ? nouveau_dmem_migrate_vma+0xd8/0x1e0 [nouveau]
[ 1295.026796]  nouveau_dmem_migrate_vma+0x157/0x1e0 [nouveau]
[ 1295.032466]  ? nouveau_dmem_init+0x490/0x490 [nouveau]
[ 1295.037612]  ? vmacache_find+0xc2/0x110
[ 1295.041537]  nouveau_svmm_bind+0x1b4/0x210 [nouveau]
[ 1295.046583]  ? nouveau_svm_fault+0x13e0/0x13e0 [nouveau]
[ 1295.051912]  drm_ioctl_kernel+0x14d/0x1a0
[ 1295.055930]  ? drm_setversion+0x330/0x330
[ 1295.059971]  drm_ioctl+0x308/0x530
[ 1295.063384]  ? drm_version+0x150/0x150
[ 1295.067153]  ? find_held_lock+0xac/0xd0
[ 1295.070996]  ? __pm_runtime_resume+0x3f/0xa0
[ 1295.075285]  ? mark_held_locks+0x29/0xa0
[ 1295.079230]  ? _raw_spin_unlock_irqrestore+0x3c/0x50
[ 1295.084232]  ? lockdep_hardirqs_on+0x17d/0x250
[ 1295.088768]  nouveau_drm_ioctl+0x9a/0x100 [nouveau]
[ 1295.093661]  do_vfs_ioctl+0x137/0x

[PATCH v2] drm/nouveau/dmem: missing mutex_lock in error path

2019-06-16 Thread Ralph Campbell

In nouveau_dmem_pages_alloc(), the drm->dmem->mutex is unlocked before
calling nouveau_dmem_chunk_alloc() as shown when CONFIG_PROVE_LOCKING
is enabled:

[ 1294.871933] =
[ 1294.876656] WARNING: bad unlock balance detected!
[ 1294.881375] 5.2.0-rc3+ #5 Not tainted
[ 1294.885048] -
[ 1294.889773] test-malloc-vra/6299 is trying to release lock 
(>dmem->mutex) at:
[ 1294.897482] [] 
nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1294.905782] but there are no more locks to release!
[ 1294.910690] 
[ 1294.910690] other info that might help us debug this:
[ 1294.917249] 1 lock held by test-malloc-vra/6299:
[ 1294.921881]  #0: 16e10454 (>mmap_sem#2){}, at: 
nouveau_svmm_bind+0x142/0x210 [nouveau]
[ 1294.931313] 
[ 1294.931313] stack backtrace:
[ 1294.935702] CPU: 4 PID: 6299 Comm: test-malloc-vra Not tainted 5.2.0-rc3+ #5
[ 1294.942786] Hardware name: ASUS X299-A/PRIME X299-A, BIOS 1401 05/21/2018
[ 1294.949590] Call Trace:
[ 1294.952059]  dump_stack+0x7c/0xc0
[ 1294.955469]  ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1294.962213]  print_unlock_imbalance_bug.cold.52+0xca/0xcf
[ 1294.967641]  lock_release+0x306/0x380
[ 1294.971383]  ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1294.978089]  ? lock_downgrade+0x2d0/0x2d0
[ 1294.982121]  ? find_held_lock+0xac/0xd0
[ 1294.985979]  __mutex_unlock_slowpath+0x8f/0x3f0
[ 1294.990540]  ? wait_for_completion+0x230/0x230
[ 1294.995002]  ? rwlock_bug.part.2+0x60/0x60
[ 1294.999197]  nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau]
[ 1295.005751]  ? page_mapping+0x98/0x110
[ 1295.009511]  migrate_vma+0xa74/0x1090
[ 1295.013186]  ? move_to_new_page+0x480/0x480
[ 1295.017400]  ? __kmalloc+0x153/0x300
[ 1295.021052]  ? nouveau_dmem_migrate_vma+0xd8/0x1e0 [nouveau]
[ 1295.026796]  nouveau_dmem_migrate_vma+0x157/0x1e0 [nouveau]
[ 1295.032466]  ? nouveau_dmem_init+0x490/0x490 [nouveau]
[ 1295.037612]  ? vmacache_find+0xc2/0x110
[ 1295.041537]  nouveau_svmm_bind+0x1b4/0x210 [nouveau]
[ 1295.046583]  ? nouveau_svm_fault+0x13e0/0x13e0 [nouveau]
[ 1295.051912]  drm_ioctl_kernel+0x14d/0x1a0
[ 1295.055930]  ? drm_setversion+0x330/0x330
[ 1295.059971]  drm_ioctl+0x308/0x530
[ 1295.063384]  ? drm_version+0x150/0x150
[ 1295.067153]  ? find_held_lock+0xac/0xd0
[ 1295.070996]  ? __pm_runtime_resume+0x3f/0xa0
[ 1295.075285]  ? mark_held_locks+0x29/0xa0
[ 1295.079230]  ? _raw_spin_unlock_irqrestore+0x3c/0x50
[ 1295.084232]  ? lockdep_hardirqs_on+0x17d/0x250
[ 1295.088768]  nouveau_drm_ioctl+0x9a/0x100 [nouveau]
[ 1295.093661]  do_vfs_ioctl+0x137/0x9a0
[ 1295.097341]  ? ioctl_preallocate+0x140/0x140
[ 1295.101623]  ? match_held_lock+0x1b/0x230
[ 1295.105646]  ? match_held_lock+0x1b/0x230
[ 1295.109660]  ? find_held_lock+0xac/0xd0
[ 1295.113512]  ? __do_page_fault+0x324/0x630
[ 1295.117617]  ? lock_downgrade+0x2d0/0x2d0
[ 1295.121648]  ? mark_held_locks+0x79/0xa0
[ 1295.125583]  ? handle_mm_fault+0x352/0x430
[ 1295.129687]  ksys_ioctl+0x60/0x90
[ 1295.133020]  ? mark_held_locks+0x29/0xa0
[ 1295.136964]  __x64_sys_ioctl+0x3d/0x50
[ 1295.140726]  do_syscall_64+0x68/0x250
[ 1295.144400]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
[ 1295.149465] RIP: 0033:0x7f1a3495809b
[ 1295.153053] Code: 0f 1e fa 48 8b 05 ed bd 0c 00 64 c7 00 26 00 00 00 48 c7 
c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d bd bd 0c 00 f7 d8 64 89 01 48
[ 1295.171850] RSP: 002b:7ffef7ed1358 EFLAGS: 0246 ORIG_RAX: 
0010
[ 1295.179451] RAX: ffda RBX: 7ffef7ed1628 RCX: 7f1a3495809b
[ 1295.186601] RDX: 7ffef7ed13b0 RSI: 40406449 RDI: 0004
[ 1295.193759] RBP: 7ffef7ed13b0 R08:  R09: 0157e770
[ 1295.200917] R10: 0151c010 R11: 0246 R12: 40406449
[ 1295.208083] R13: 0004 R14:  R15: 

Reacquire the lock before continuing to the next page.

Signed-off-by: Ralph Campbell 
---

I found this while testing Jason Gunthorpe's hmm tree but this is
independent of those changes. Jason thinks it is best to go through
David Airlie's nouveau tree.

Changes for v2:
Updated change log to include console output.

 drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 27aa4e72abe9..00f7236af1b9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -379,9 +379,10 @@ nouveau_dmem_pages_alloc(struct nouveau_drm *drm,
ret = nouveau_dmem_chunk_alloc(drm);
if (ret) {
if (c)
-   break;
+   return 0;

Re: [PATCH 10/22] memremap: add a migrate callback to struct dev_pagemap_ops

2019-06-14 Thread Ralph Campbell

E */
-
  static void pgmap_array_delete(struct resource *res)
  {
xa_store_range(_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
diff --git a/mm/hmm.c b/mm/hmm.c
index 6dc769feb2e1..aab799677c7d 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1330,15 +1330,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap 
*pgmap)
percpu_ref_kill(pgmap->ref);
  }
  
-static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma,

-   unsigned long addr,
-   const struct page *page,
-   unsigned int flags,
-   pmd_t *pmdp)
+static vm_fault_t hmm_devmem_migrate(struct vm_fault *vmf)
  {
-   struct hmm_devmem *devmem = page->pgmap->data;
+   struct hmm_devmem *devmem = vmf->page->pgmap->data;
  
-	return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);

+   return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page,
+   vmf->flags, vmf->pmd);
  }
  
  static void hmm_devmem_free(struct page *page, void *data)

@@ -1351,6 +1348,7 @@ static void hmm_devmem_free(struct page *page, void *data)
  static const struct dev_pagemap_ops hmm_pagemap_ops = {
.page_free  = hmm_devmem_free,
.kill   = hmm_devmem_ref_kill,
+   .migrate= hmm_devmem_migrate,
  };
  
  /*

@@ -1405,7 +1403,6 @@ struct hmm_devmem *hmm_devmem_add(const struct 
hmm_devmem_ops *ops,
devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
devmem->pfn_last = devmem->pfn_first +
   (resource_size(devmem->resource) >> PAGE_SHIFT);
-   devmem->page_fault = hmm_devmem_fault;
  
  	devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;

devmem->pagemap.res = *devmem->resource;
diff --git a/mm/memory.c b/mm/memory.c
index ddf20bd0c317..cbf3cb598436 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2782,13 +2782,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
migration_entry_wait(vma->vm_mm, vmf->pmd,
 vmf->address);
} else if (is_device_private_entry(entry)) {
-   /*
-* For un-addressable device memory we call the pgmap
-* fault handler callback. The callback must migrate
-* the page back to some CPU accessible page.
-*/
-   ret = device_private_entry_fault(vma, vmf->address, 
entry,
-vmf->flags, vmf->pmd);
+   vmf->page = device_private_entry_to_page(entry);
+   ret = page->pgmap->ops->migrate(vmf);


This needs to either initialize "page" or be changed to "vmf->page".
Otherwise, it is a NULL pointer dereference.


    } else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {



You can add:
Reviewed-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH] drm/nouveau/dmem: missing mutex_lock in error path

2019-06-14 Thread Ralph Campbell

In nouveau_dmem_pages_alloc(), the drm->dmem->mutex is unlocked before
calling nouveau_dmem_chunk_alloc().
Reacquire the lock before continuing to the next page.

Signed-off-by: Ralph Campbell 
---

I found this while testing Jason Gunthorpe's hmm tree but this is
independent of those changes. I guess it could go through
David Airlie's tree for nouveau or Jason's tree.

 drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 27aa4e72abe9..00f7236af1b9 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -379,9 +379,10 @@ nouveau_dmem_pages_alloc(struct nouveau_drm *drm,
ret = nouveau_dmem_chunk_alloc(drm);
if (ret) {
if (c)
-   break;
+   return 0;
return ret;
}
+   mutex_lock(&drm->dmem->mutex);
continue;
}
 
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH 18/22] mm: mark DEVICE_PUBLIC as broken

2019-06-14 Thread Ralph Campbell



On 6/13/19 12:44 PM, Jason Gunthorpe wrote:

On Thu, Jun 13, 2019 at 11:43:21AM +0200, Christoph Hellwig wrote:

The code hasn't been used since it was added to the tree, and doesn't
appear to actually be usable.  Mark it as BROKEN until either a user
comes along or we finally give up on it.

Signed-off-by: Christoph Hellwig 
  mm/Kconfig | 1 +
  1 file changed, 1 insertion(+)

diff --git a/mm/Kconfig b/mm/Kconfig
index 0d2ba7e1f43e..406fa45e9ecc 100644
+++ b/mm/Kconfig
@@ -721,6 +721,7 @@ config DEVICE_PRIVATE
  config DEVICE_PUBLIC
bool "Addressable device memory (like GPU memory)"
depends on ARCH_HAS_HMM
+   depends on BROKEN
select HMM
select DEV_PAGEMAP_OPS


This seems a bit harsh, we do have another kconfig that selects this
one today:

config DRM_NOUVEAU_SVM
 bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support"
 depends on ARCH_HAS_HMM
 depends on DRM_NOUVEAU
 depends on STAGING
 select HMM_MIRROR
 select DEVICE_PRIVATE
 default n
 help
   Say Y here if you want to enable experimental support for
   Shared Virtual Memory (SVM).

Maybe it should be depends on STAGING not broken?

or maybe nouveau_svm doesn't actually need DEVICE_PRIVATE?

Jason


I think you are confusing DEVICE_PRIVATE for DEVICE_PUBLIC.
DRM_NOUVEAU_SVM does use DEVICE_PRIVATE but not DEVICE_PUBLIC.

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2 00/15] Consolidate the mmu notifier interval_tree and locking

2019-11-01 Thread Ralph Campbell



On 10/28/19 1:10 PM, Jason Gunthorpe wrote:

From: Jason Gunthorpe 

8 of the mmu_notifier using drivers (i915_gem, radeon_mn, umem_odp, hfi1,
scif_dma, vhost, gntdev, hmm) are using a common pattern where
they only use invalidate_range_start/end and immediately check the
invalidating range against some driver data structure to tell if the
driver is interested. Half of them use an interval_tree, the others are
simple linear search lists.

Of the ones I checked they largely seem to have various kinds of races,
bugs and poor implementation. This is a result of the complexity in how
the notifier interacts with get_user_pages(). It is extremely difficult to
use it correctly.

Consolidate all of this code together into the core mmu_notifier and
provide a locking scheme similar to hmm_mirror that allows the user to
safely use get_user_pages() and reliably know if the page list still
matches the mm.

This new arrangement plays nicely with the !blockable mode for
OOM. Scanning the interval tree is done such that the intersection test
will always succeed, and since there is no invalidate_range_end exposed to
drivers the scheme safely allows multiple drivers to be subscribed.

Four places are converted as an example of how the new API is used.
Four are left for future patches:
  - i915_gem has complex locking around destruction of a registration,
needs more study
  - hfi1 (2nd user) needs access to the rbtree
  - scif_dma has a complicated logic flow
  - vhost's mmu notifiers are already being rewritten

This series, and the other code it depends on is available on my github:

https://github.com/jgunthorpe/linux/commits/mmu_notifier

v2 changes:
- Add mmu_range_set_seq() to set the mrn sequence number under the driver
   lock and make the locking more understandable
- Add some additional comments around locking/READ_ONCe
- Make the WARN_ON flow in mn_itree_invalidate a bit easier to follow
- Fix wrong WARN_ON

Jason Gunthorpe (15):
   mm/mmu_notifier: define the header pre-processor parts even if
 disabled
   mm/mmu_notifier: add an interval tree notifier
   mm/hmm: allow hmm_range to be used with a mmu_range_notifier or
 hmm_mirror
   mm/hmm: define the pre-processor related parts of hmm.h even if
 disabled
   RDMA/odp: Use mmu_range_notifier_insert()
   RDMA/hfi1: Use mmu_range_notifier_inset for user_exp_rcv
   drm/radeon: use mmu_range_notifier_insert
   xen/gntdev: Use select for DMA_SHARED_BUFFER
   xen/gntdev: use mmu_range_notifier_insert
   nouveau: use mmu_notifier directly for invalidate_range_start
   nouveau: use mmu_range_notifier instead of hmm_mirror
   drm/amdgpu: Call find_vma under mmap_sem
   drm/amdgpu: Use mmu_range_insert instead of hmm_mirror
   drm/amdgpu: Use mmu_range_notifier instead of hmm_mirror
   mm/hmm: remove hmm_mirror and related

  Documentation/vm/hmm.rst  | 105 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |   2 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |   9 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c|  14 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c|   1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c| 457 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h|  53 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h|  13 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 111 ++--
  drivers/gpu/drm/nouveau/nouveau_svm.c | 231 +---
  drivers/gpu/drm/radeon/radeon.h   |   9 +-
  drivers/gpu/drm/radeon/radeon_mn.c| 219 ++-
  drivers/infiniband/core/device.c  |   1 -
  drivers/infiniband/core/umem_odp.c| 288 +
  drivers/infiniband/hw/hfi1/file_ops.c |   2 +-
  drivers/infiniband/hw/hfi1/hfi.h  |   2 +-
  drivers/infiniband/hw/hfi1/user_exp_rcv.c | 146 ++---
  drivers/infiniband/hw/hfi1/user_exp_rcv.h |   3 +-
  drivers/infiniband/hw/mlx5/mlx5_ib.h  |   7 +-
  drivers/infiniband/hw/mlx5/mr.c   |   3 +-
  drivers/infiniband/hw/mlx5/odp.c  |  50 +-
  drivers/xen/Kconfig   |   3 +-
  drivers/xen/gntdev-common.h   |   8 +-
  drivers/xen/gntdev.c  | 180 ++
  include/linux/hmm.h   | 195 +--
  include/linux/mmu_notifier.h  | 144 -
  include/rdma/ib_umem_odp.h|  65 +--
  include/rdma/ib_verbs.h   |   2 -
  kernel/fork.c |   1 -
  mm/Kconfig|   2 +-
  mm/hmm.c  | 275 +
  mm/mmu_notifier.c | 546 +-
  32 files changed, 1225 insertions(+), 1922 deletions(-)



You can add my Tested-by for the mm and nouveau changes.
IOW, patches 1-4, 10-11, and 15.

Tested-by: Ralph Campbell 
___
dri-devel mailing list
dri-devel

Re: [PATCH v3 02/14] mm/mmu_notifier: add an interval tree notifier

2019-11-22 Thread Ralph Campbell



On 11/13/19 8:46 AM, Jason Gunthorpe wrote:

On Wed, Nov 13, 2019 at 05:59:52AM -0800, Christoph Hellwig wrote:

+int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long length,
+ const struct mmu_interval_notifier_ops 
*ops);
+int mmu_interval_notifier_insert_locked(
+   struct mmu_interval_notifier *mni, struct mm_struct *mm,
+   unsigned long start, unsigned long length,
+   const struct mmu_interval_notifier_ops *ops);


Very inconsistent indentation between these two related functions.


clang-format.. The kernel config is set to prefer a line up under the
( if all the arguments will fit within the 80 cols otherwise it does a
1 tab continuation indent.


+   /*
+* The inv_end incorporates a deferred mechanism like
+* rtnl_unlock(). Adds and removes are queued until the final inv_end
+* happens then they are progressed. This arrangement for tree updates
+* is used to avoid using a blocking lock during
+* invalidate_range_start.


Nitpick:  That comment can be condensed into one less line:


The rtnl_unlock can move up a line too. My editor is failing me on
this.


+   /*
+* TODO: Since we already have a spinlock above, this would be faster
+* as wake_up_q
+*/
+   if (need_wake)
+   wake_up_all(_mm->wq);


So why is this important enough for a TODO comment, but not important
enough to do right away?


Let's drop the comment, I'm not sure wake_up_q is even a function this
layer should be calling.


Actually, I think you can remove the "need_wake" variable since it is
unconditionally set to "true".

Also, the comment in __mmu_interval_notifier_insert() says
"mni->mr_invalidate_seq" and I think that should be
"mni->invalidate_seq".
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

Re: [PATCH v2] nouveau/hmm: map pages after migration

2020-03-03 Thread Ralph Campbell



On 3/3/20 4:42 AM, Jason Gunthorpe wrote:

On Mon, Mar 02, 2020 at 05:00:23PM -0800, Ralph Campbell wrote:

When memory is migrated to the GPU, it is likely to be accessed by GPU
code soon afterwards. Instead of waiting for a GPU fault, map the
migrated memory into the GPU page tables with the same access permissions
as the source CPU page table entries. This preserves copy on write
semantics.

Signed-off-by: Ralph Campbell 
Cc: Christoph Hellwig 
Cc: Jason Gunthorpe 
Cc: "Jérôme Glisse" 
Cc: Ben Skeggs 
---

Originally this patch was targeted for Jason's rdma tree since other HMM
related changes were queued there. Now that those have been merged, this
patch just contains changes to nouveau so it could go through any tree.
I guess Ben Skeggs' tree would be appropriate.


Yep


+static inline struct nouveau_pfnmap_args *
+nouveau_pfns_to_args(void *pfns)


don't use static inline inside C files


OK.


+{
+   struct nvif_vmm_pfnmap_v0 *p =
+   container_of(pfns, struct nvif_vmm_pfnmap_v0, phys);
+
+   return container_of(p, struct nouveau_pfnmap_args, p);


And this should just be

return container_of(pfns, struct nouveau_pfnmap_args, p.phys);


Much simpler, thanks.


+static struct nouveau_svmm *
+nouveau_find_svmm(struct nouveau_svm *svm, struct mm_struct *mm)
+{
+   struct nouveau_ivmm *ivmm;
+
+   list_for_each_entry(ivmm, >inst, head) {
+   if (ivmm->svmm->notifier.mm == mm)
+   return ivmm->svmm;
+   }
+   return NULL;
+}


Is this re-implementing mmu_notifier_get() ?

Jason


Not quite. This is being called from an ioctl() call on the GPU device
file which calls nouveau_svmm_bind() which locks mmap_sem for reading,
walks the vmas for the address range given in the ioctl() data, and migrates
the pages to the GPU memory.
mmu_notifier_get() would try to lock mmap_sem for writing so that would 
deadlock.
But it is similar in that the GPU specific process context (nouveau_svmm) needs
to be found for the given ioctl caller.
If find_get_mmu_notifier() was exported, I think that could work.
Now that I look at this again, there is an easier way to find the svmm and I see
some other bugs that need fixing. I'll post a v3 as soon as I get those written
and tested.

Thanks for the review.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v3 1/4] nouveau/hmm: fix vma range check for migration

2020-03-03 Thread Ralph Campbell

find_vma_intersection(mm, start, end) only guarantees that end is greater
than or equal to vma->vm_start but doesn't guarantee that start is
greater than or equal to vma->vm_start. The calculation for the
intersecting range in nouveau_svmm_bind() isn't accounting for this and
can call migrate_vma_setup() with a starting address less than
vma->vm_start. This results in migrate_vma_setup() returning -EINVAL for
the range instead of nouveau skipping that part of the range and migrating
the rest.

Signed-off-by: Ralph Campbell 
---
 drivers/gpu/drm/nouveau/nouveau_svm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index df9bf1fd1bc0..169320409286 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -179,6 +179,7 @@ nouveau_svmm_bind(struct drm_device *dev, void *data,
if (!vma)
break;
 
+   addr = max(addr, vma->vm_start);
next = min(vma->vm_end, end);
/* This is a best effort so we ignore errors */
nouveau_dmem_migrate_vma(cli->drm, vma, addr, next);
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v3 3/4] nouveau: remove useless SVM range check

2020-03-03 Thread Ralph Campbell

When nouveau processes GPU faults, it checks to see if the fault address
falls within the "unmanaged" range which is reserved for fixed allocations
instead of addresses chosen by the core mm code. If start is greater than
or equal to svmm->unmanaged.limit, then limit will also be greater than
svmm->unmanaged.limit which is greater than svmm->unmanaged.start and the
start = max_t(u64, start, svmm->unmanaged.limit) will change nothing.
Just remove the useless lines of code.

Signed-off-by: Ralph Campbell 
---
 drivers/gpu/drm/nouveau/nouveau_svm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index c567526b75b8..8dfa5cb74826 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -663,9 +663,6 @@ nouveau_svm_fault(struct nvif_notify *notify)
limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT);
if (start < svmm->unmanaged.limit)
limit = min_t(u64, limit, svmm->unmanaged.start);
-   else
-   if (limit > svmm->unmanaged.start)
-   start = max_t(u64, start, svmm->unmanaged.limit);
SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit);
 
mm = svmm->notifier.mm;
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v3 2/4] nouveau/hmm: check for SVM initialized before migrating

2020-03-03 Thread Ralph Campbell

When migrating system memory to GPU memory, check that SVM has been
enabled. Even though most errors can be ignored since migration is
a performance optimization, return an error because this is a violation
of the API.

Signed-off-by: Ralph Campbell 
---
 drivers/gpu/drm/nouveau/nouveau_svm.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
b/drivers/gpu/drm/nouveau/nouveau_svm.c
index 169320409286..c567526b75b8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_svm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
@@ -171,6 +171,11 @@ nouveau_svmm_bind(struct drm_device *dev, void *data,
mm = get_task_mm(current);
down_read(&mm->mmap_sem);
 
+   if (!cli->svm.svmm) {
+   up_read(&mm->mmap_sem);
+   return -EINVAL;
+   }
+
for (addr = args->va_start, end = args->va_start + size; addr < end;) {
struct vm_area_struct *vma;
unsigned long next;
-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

[PATCH v3 4/4] nouveau/hmm: map pages after migration

2020-03-03 Thread Ralph Campbell

When memory is migrated to the GPU, it is likely to be accessed by GPU
code soon afterwards. Instead of waiting for a GPU fault, map the
migrated memory into the GPU page tables with the same access permissions
as the source CPU page table entries. This preserves copy on write
semantics.

Signed-off-by: Ralph Campbell 
Cc: Christoph Hellwig 
Cc: Jason Gunthorpe 
Cc: "Jérôme Glisse" 
Cc: Ben Skeggs 
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 46 +---
 drivers/gpu/drm/nouveau/nouveau_dmem.h |  2 +
 drivers/gpu/drm/nouveau/nouveau_svm.c  | 59 +-
 drivers/gpu/drm/nouveau/nouveau_svm.h  |  5 +++
 4 files changed, 95 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 0ad5d87b5a8e..981c05a2a6ca 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -25,11 +25,13 @@
 #include "nouveau_dma.h"
 #include "nouveau_mem.h"
 #include "nouveau_bo.h"
+#include "nouveau_svm.h"
 
 #include 
 #include 
 #include 
 #include 
+#include <nvif/if000c.h>
 
 #include 
 #include 
@@ -558,10 +560,11 @@ nouveau_dmem_init(struct nouveau_drm *drm)
 }
 
 static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm,
-   unsigned long src, dma_addr_t *dma_addr)
+   unsigned long src, dma_addr_t *dma_addr, u64 *pfn)
 {
struct device *dev = drm->dev->dev;
struct page *dpage, *spage;
+   unsigned long paddr;
 
spage = migrate_pfn_to_page(src);
if (!spage || !(src & MIGRATE_PFN_MIGRATE))
@@ -569,17 +572,21 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct 
nouveau_drm *drm,
 
dpage = nouveau_dmem_page_alloc_locked(drm);
if (!dpage)
-   return 0;
+   goto out;
 
*dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);
if (dma_mapping_error(dev, *dma_addr))
goto out_free_page;
 
+   paddr = nouveau_dmem_page_addr(dpage);
if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM,
-   nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST,
-   *dma_addr))
+   paddr, NOUVEAU_APER_HOST, *dma_addr))
goto out_dma_unmap;
 
+   *pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM |
+   ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT);
+   if (src & MIGRATE_PFN_WRITE)
+   *pfn |= NVIF_VMM_PFNMAP_V0_W;
return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED;
 
 out_dma_unmap:
@@ -587,18 +594,20 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct 
nouveau_drm *drm,
 out_free_page:
nouveau_dmem_page_free_locked(drm, dpage);
 out:
+   *pfn = NVIF_VMM_PFNMAP_V0_NONE;
return 0;
 }
 
 static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm,
-   struct migrate_vma *args, dma_addr_t *dma_addrs)
+   struct nouveau_svmm *svmm, struct migrate_vma *args,
+   dma_addr_t *dma_addrs, u64 *pfns)
 {
struct nouveau_fence *fence;
unsigned long addr = args->start, nr_dma = 0, i;
 
for (i = 0; addr < args->end; i++) {
args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i],
-   dma_addrs + nr_dma);
+   dma_addrs + nr_dma, pfns + i);
if (args->dst[i])
nr_dma++;
addr += PAGE_SIZE;
@@ -607,20 +616,18 @@ static void nouveau_dmem_migrate_chunk(struct nouveau_drm 
*drm,
nouveau_fence_new(drm->dmem->migrate.chan, false, &fence);
migrate_vma_pages(args);
nouveau_dmem_fence_done(&fence);
+   nouveau_pfns_map(svmm, args->vma->vm_mm, args->start, pfns, i);
 
while (nr_dma--) {
dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE,
DMA_BIDIRECTIONAL);
}
-   /*
-* FIXME optimization: update GPU page table to point to newly migrated
-* memory.
-*/
migrate_vma_finalize(args);
 }
 
 int
 nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
+struct nouveau_svmm *svmm,
 struct vm_area_struct *vma,
 unsigned long start,
 unsigned long end)
@@ -632,7 +639,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
.vma= vma,
.start  = start,
};
-   unsigned long c, i;
+   unsigned long i;
+   u64 *pfns;
int ret = -ENOMEM;
 
args.src = kcalloc(max, sizeof(*args.src), GFP_KERNEL);
@@ -646,19 +654,25 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm,
if (!dma_addrs)

[PATCH v3 0/4] nouveau/hmm: map pages after migration

2020-03-03 Thread Ralph Campbell

Originally patch 4 was targeted for Jason's rdma tree since other HMM
related changes were queued there. Now that those have been merged,
these patches just contain changes to nouveau so they could go through
any tree. I guess Ben Skeggs' tree would be appropriate.

Changes since v2:
 Added patches 1-3 to fix some minor issues.
 Eliminated nouveau_find_svmm() since it is easily found.
 Applied Jason Gunthorpe's suggestions for nouveau_pfns_to_args().

Changes since v1:
 Rebase to linux-5.6.0-rc4
 Address Christoph Hellwig's comments


Ralph Campbell (4):
  nouveau/hmm: fix vma range check for migration
  nouveau/hmm: check for SVM initialized before migrating
  nouveau: remove useless SVM range check
  nouveau/hmm: map pages after migration

 drivers/gpu/drm/nouveau/nouveau_dmem.c | 46 +++--
 drivers/gpu/drm/nouveau/nouveau_dmem.h |  2 +
 drivers/gpu/drm/nouveau/nouveau_svm.c  | 69 --
 drivers/gpu/drm/nouveau/nouveau_svm.h  |  5 ++
 4 files changed, 102 insertions(+), 20 deletions(-)

-- 
2.20.1

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel

1 2 >

1 - 100 of 153 matches

Mail list logo