Re: [PATCH v5 1/9] mm/mmu_notifier: helper to test if a range invalidation is blockable
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse Simple helpers to test if range invalidation is blockable. Latter patches use cocinnelle to convert all direct dereference of range-> blockable to use this function instead so that we can convert the blockable field to an unsigned for more flags. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 11 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..e630def131ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return range->blockable; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, #define mmu_notifier_range_init(range, mm, start, end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v5 6/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse This update each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 4 ++-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c| 14 ++ mm/hugetlb.c| 8 mm/khugepaged.c | 2 +- mm/ksm.c| 4 ++-- mm/madvise.c| 2 +- mm/memory.c | 14 +++--- mm/migrate.c| 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fcbd0e574917..3b93ce496dd4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,8 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, - NULL, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 46f546bdba00..8e8342080013 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c9d638f1b34e..1da6ca0f0f6d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1184,9 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1349,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); spin_lock(vmf->ptl); @@ -2028,7 +2026,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(); @@ -2247,7 +2245,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d9e5c5a4c004..a58115c6b0a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3250,7 +3250,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vm
Re: [PATCH v5 7/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening This patch is just passing down the new informations by adding it to the mmu_notifier_range structure. Changes since v1: - Initialize flags field from mmu_notifier_range_init() arguments Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 62f94cd85455..0379956fff23 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -58,10 +58,12 @@ struct mmu_notifier_mm { #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; - range->flags = 0; + range->flags = flags; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v5 8/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device driver or anyone who wish to optimize out update when they know that they already have the range map read only. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 mm/mmu_notifier.c| 10 ++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 0379956fff23..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) @@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index abd88c466eb2..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); Don't you have to check for !WRITE & READ? mprotect() can change the permissions from R/O to RW and end up calling mmu_notifier_range_init() and mmu_notifier_invalidate_range_start()/end(). I'm not sure how useful this is since only applies to the MMU_NOTIFY_PROTECTION_VMA case. Anyway, you can add Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v5 5/9] mm/mmu_notifier: contextual information for event triggering invalidation v2
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. Changes since v1: - add the flags parameter to init range flags This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<-- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } -->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v5 3/9] mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse Use an unsigned field for flags other than blockable and convert the blockable field to be one of those flags. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e630def131ce..c8672c366f67 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,11 +25,13 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; }; struct mmu_notifier_ops { @@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { - return range->blockable; + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) @@ -275,7 +277,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -284,7 +286,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; return __mmu_notifier_invalidate_range_start(range); } return 0; @@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, range->mm = mm; range->start = start; range->end = end; + range->flags = 0; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v5 9/9] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse When notifying change for a range use MMU_NOTIFIER_USE_CHANGE_PTE flag for page table update that use set_pte_at_notify() and where the we are going either from read and write to read only with same pfn or read only to read and write with new pfn. Note that set_pte_at_notify() itself should only be use in rare cases ie we do not want to use it when we are updating a significant range of virtual addresses and thus a significant number of pte. Instead for those cases the event provided to mmu notifer invalidate_range_start() callback should be use for optimization. Changes since v1: - Use the new unsigned flags field in struct mmu_notifier_range - Use the new flags parameter to mmu_notifier_range_init() - Explicitly list all the patterns where we can use change_pte() Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 34 -- mm/ksm.c | 11 ++- mm/memory.c | 5 +++-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6a..0230a4b06b46 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -40,6 +40,26 @@ enum mmu_notifier_event { MMU_NOTIFY_SOFT_DIRTY, }; +/* + * @MMU_NOTIFIER_RANGE_BLOCKABLE: can the mmu notifier range_start/range_end + * callback block or not ? If set then the callback can block. + * + * @MMU_NOTIFIER_USE_CHANGE_PTE: only set when the page table it updated with + * the set_pte_at_notify() the valid patterns for this are: + * - pte read and write to read only same pfn + * - pte read only to read and write (pfn can change or stay the same) + * - pte read only to read only with different pfn + * It is illegal to set in any other circumstances. + * + * Note that set_pte_at_notify() should not be use outside of the above cases. + * When updating a range in batch (like write protecting a range) it is better + * to rely on invalidate_range_start() and struct mmu_notifier_range to infer + * the kind of update that is happening (as an example you can look at the + * mmu_notifier_range_update_to_read_only() function). + */ +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) +#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << 1) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -55,8 +75,6 @@ struct mmu_notifier_mm { spinlock_t lock; }; -#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) - struct mmu_notifier_range { struct vm_area_struct *vma; struct mm_struct *mm; @@ -268,6 +286,12 @@ mmu_notifier_range_blockable(const struct mmu_notifier_range *range) return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } +static inline bool +mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range) +{ + return (range->flags & MMU_NOTIFIER_USE_CHANGE_PTE); +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -509,6 +533,12 @@ mmu_notifier_range_blockable(const struct mmu_notifier_range *range) return true; } +static inline bool +mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range) +{ + return false; +} + static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; diff --git a/mm/ksm.c b/mm/ksm.c index b782fadade8f..41e51882f999 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,9 +1066,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, - pvmw.address + PAGE_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, + MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(); if (!page_vma_mapped_walk()) @@ -1155,8 +1155,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, - addr + PAGE_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, + MMU_NOTIFIER_USE_CHANGE_PTE, + vma, mm, addr, addr + PAGE_SIZE); mmu
Re: [PATCH v5 2/9] mm/mmu_notifier: convert user range->blockable to helper function
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse Use the mmu_notifier_range_blockable() helper function instead of directly dereferencing the range->blockable field. This is done to make it easier to change the mmu_notifier range field. This patch is the outcome of the following coccinelle patch: %<--- @@ identifier I1, FN; @@ FN(..., struct mmu_notifier_range *I1, ...) { <... -I1->blockable +mmu_notifier_range_blockable(I1) ...> } --->% spatch --in-place --sp-file blockable.spatch --dir . Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 ++-- drivers/infiniband/core/umem_odp.c | 5 +++-- drivers/xen/gntdev.c| 6 +++--- mm/hmm.c| 6 +++--- mm/mmu_notifier.c | 2 +- virt/kvm/kvm_main.c | 3 ++- 8 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..58ed401c5996 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } @@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end = range->end - 1; - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); @@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 1d3f9a31ad61..777b3f8727e7 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, while (it) { struct drm_i915_gem_object *obj; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; break; } diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..c9bd1278f573 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * the tear down. */ - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(>lock); else if (!mutex_trylock(>lock)) return -EAGAIN; @@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; goto out_unlock; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 012044f16d1c..3a3f1538d295 1
Re: [PATCH v5 4/9] mm/mmu_notifier: contextual information for event enums
On 2/19/19 12:04 PM, jgli...@redhat.com wrote: From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result s/update/updates s/happens/happen of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of s/introduce/introduces the events triggering a mmu notifier. Latter patches take advantages of those enum values. s/advantages/advantage - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking s/dirtyness/dirtiness Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c8672c366f67..2386e71ac1b8 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range I would say something about the VMA for the notifier range is being deleted. MMU notifier clients can then use this case to remove any policy or access counts associated with the range. Just changing the PTE to "no access" as in the CLEAR case doesn't mean a policy which prefers device private memory over system memory should be cleared. + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 3/9] mm: Add write-protect and clean utilities for address space ranges
On 4/12/19 9:04 AM, Thomas Hellstrom wrote: Add two utilities to a) write-protect and b) clean all ptes pointing into a range of an address space A period at the end, please. The utilities are intended to aid in tracking dirty pages (either driver-allocated system memory or pci device memory). The write-protect utility should be used in conjunction with page_mkwrite() and pfn_mkwrite() to trigger write page-faults on page accesses. Typically one would want to use this on sparse accesses into large memory regions. The clean utility should be used to utilize hardware dirtying functionality and avoid the overhead of page-faults, typically on large accesses into small memory regions. The added file "apply_as_range.c" is initially listed as maintained by VMware under our DRM driver. If somebody would like it elsewhere, that's of course no problem. Notable changes since RFC: - Added comments to help avoid the usage of these function for VMAs it's not intended for. We also do advisory checks on the vm_flags and warn on illegal usage. - Perform the pte modifications the same way softdirty does. - Add mmu_notifier range invalidation calls. - Add a config option so that this code is not unconditionally included. - Tell the mmu_gather code about pending tlb flushes. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell --- MAINTAINERS | 1 + include/linux/mm.h | 9 +- mm/Kconfig | 3 + mm/Makefile | 3 +- mm/apply_as_range.c | 295 5 files changed, 309 insertions(+), 2 deletions(-) create mode 100644 mm/apply_as_range.c diff --git a/MAINTAINERS b/MAINTAINERS index 35e6357f9d30..bc243ffcb840 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4971,6 +4971,7 @@ T:git git://people.freedesktop.org/~thomash/linux S:Supported F:drivers/gpu/drm/vmwgfx/ F:include/uapi/drm/vmwgfx_drm.h +F: mm/apply_as_range.c DRM DRIVERS M:David Airlie diff --git a/include/linux/mm.h b/include/linux/mm.h index b7dd4ddd6efb..62f24dd0bfa0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2642,7 +2642,14 @@ struct pfn_range_apply { }; extern int apply_to_pfn_range(struct pfn_range_apply *closure, unsigned long address, unsigned long size); - +unsigned long apply_as_wrprotect(struct address_space *mapping, +pgoff_t first_index, pgoff_t nr); +unsigned long apply_as_clean(struct address_space *mapping, +pgoff_t first_index, pgoff_t nr, +pgoff_t bitmap_pgoff, +unsigned long *bitmap, +pgoff_t *start, +pgoff_t *end); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); diff --git a/mm/Kconfig b/mm/Kconfig index 25c71eb8a7db..80e41cdbb4ae 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -758,4 +758,7 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool +config AS_DIRTY_HELPERS +bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index d210cc9d6f80..b295717be856 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -39,7 +39,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o $(mmu-y) + debug.o apply_as_range.o $(mmu-y) obj-y += init-mm.o obj-y += memblock.o @@ -99,3 +99,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_AS_DIRTY_HELPERS) += apply_as_range.o diff --git a/mm/apply_as_range.c b/mm/apply_as_range.c new file mode 100644 index ..32d28619aec5 --- /dev/null +++ b/mm/apply_as_range.c @@ -0,0 +1,295 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include + +/** + * struct apply_as - Closure structure for apply_as_range + * @base: struct pfn_range_apply we derive from + * @start: Address of first modified pte + * @end: Address of last modified pte + 1 + * @total: Total number of modified ptes + * @vma: Pointer to the struct vm_area_struct we're currently operating on + */ +struct apply_as { + struct pfn_range_apply base; + unsigned long start, end; One variable defined per line, please. +
Re: [PATCH 2/9] mm: Add an apply_to_pfn_range interface
On 4/12/19 9:04 AM, Thomas Hellstrom wrote: This is basically apply_to_page_range with added functionality: Allocating missing parts of the page table becomes optional, which means that the function can be guaranteed not to error if allocation is disabled. Also passing of the closure struct and callback function becomes different and more in line with how things are done elsewhere. Finally we keep apply_to_page_range as a wrapper around apply_to_pfn_range The reason for not using the page-walk code is that we want to perform the page-walk on vmas pointing to an address space without requiring the mmap_sem to be held rather thand on vmas belonging to a process with the s/thand/than/ mmap_sem held. Notable changes since RFC: Don't export apply_to_pfn range. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell --- include/linux/mm.h | 10 mm/memory.c| 130 ++--- 2 files changed, 108 insertions(+), 32 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..b7dd4ddd6efb 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2632,6 +2632,16 @@ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +struct pfn_range_apply; +typedef int (*pter_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, +struct pfn_range_apply *closure); +struct pfn_range_apply { + struct mm_struct *mm; + pter_fn_t ptefn; + unsigned int alloc; +}; +extern int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long address, unsigned long size); #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); diff --git a/mm/memory.c b/mm/memory.c index a95b4a3b1ae2..60d67158964f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1938,18 +1938,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, -unsigned long addr, unsigned long end, -pte_fn_t fn, void *data) +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, + unsigned long addr, unsigned long end) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); - pte = (mm == _mm) ? + pte = (closure->mm == _mm) ? pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, ); + pte_alloc_map_lock(closure->mm, pmd, addr, ); if (!pte) return -ENOMEM; @@ -1960,86 +1959,107 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte++, token, addr, data); + err = closure->ptefn(pte++, token, addr, closure); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - if (mm != _mm) + if (closure->mm != _mm) pte_unmap_unlock(pte-1, ptl); return err; } -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, -unsigned long addr, unsigned long end, -pte_fn_t fn, void *data) +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, + unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; - int err; + int err = 0; BUG_ON(pud_huge(*pud)); - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc(closure->mm, pud, addr); if (!pmd) return -ENOMEM; + do { next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) + continue; + err = apply_to_pte_range(closure, pmd, addr, next); if (err) break; } while (pmd++, addr = next, addr != end); return err; } -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, -unsigned long addr, unsigned long end, -pte_fn_t fn, void *data) +static int apply_to_pud_range(struct
Re: [PATCH 1/9] mm: Allow the [page|pfn]_mkwrite callbacks to drop the mmap_sem
On 4/12/19 9:04 AM, Thomas Hellstrom wrote: Driver fault callbacks are allowed to drop the mmap_sem when expecting long hardware waits to avoid blocking other mm users. Allow the mkwrite callbacks to do the same by returning early on VM_FAULT_RETRY. In particular we want to be able to drop the mmap_sem when waiting for a reservation object lock on a GPU buffer object. These locks may be held while waiting for the GPU. Cc: Andrew Morton Cc: Matthew Wilcox Cc: Will Deacon Cc: Peter Zijlstra Cc: Rik van Riel Cc: Minchan Kim Cc: Michal Hocko Cc: Huang Ying Cc: Souptick Joarder Cc: "Jérôme Glisse" Cc: linux...@kvack.org Cc: linux-ker...@vger.kernel.org Signed-off-by: Thomas Hellstrom Reviewed-by: Ralph Campbell --- mm/memory.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e11ca9dd823f..a95b4a3b1ae2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2144,7 +2144,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE))) A very minor nit, for consistency elsewhere in mm/memory.c, could you make this be: (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { lock_page(page); @@ -2419,7 +2419,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); - if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) + if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); } @@ -2440,7 +2440,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE { + (VM_FAULT_ERROR | VM_FAULT_RETRY | + VM_FAULT_NOPAGE { put_page(vmf->page); return tmp; } @@ -3494,7 +3495,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) unlock_page(vmf->page); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || - (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE { + (tmp & (VM_FAULT_ERROR | VM_FAULT_RETRY | + VM_FAULT_NOPAGE { put_page(vmf->page); return tmp; } ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 11/11] mm/hmm: Remove confusing comment and logic from hmm_release
On 6/10/19 9:02 AM, Jason Gunthorpe wrote: On Fri, Jun 07, 2019 at 02:37:07PM -0700, Ralph Campbell wrote: On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe hmm_release() is called exactly once per hmm. ops->release() cannot accidentally trigger any action that would recurse back onto hmm->mirrors_sem. This fixes a use after-free race of the form: CPU0 CPU1 hmm_release() up_write(>mirrors_sem); hmm_mirror_unregister(mirror) down_write(>mirrors_sem); up_write(>mirrors_sem); kfree(mirror) mirror->ops->release(mirror) The only user we have today for ops->release is an empty function, so this is unambiguously safe. As a consequence of plugging this race drivers are not allowed to register/unregister mirrors from within a release op. Signed-off-by: Jason Gunthorpe I agree with the analysis above but I'm not sure that release() will always be an empty function. It might be more efficient to write back all data migrated to a device "in one pass" instead of relying on unmap_vmas() calling hmm_start_range_invalidate() per VMA. I think we have to focus on the *current* kernel - and we have two users of release, nouveau_svm.c is empty and amdgpu_mn.c does schedule_work() - so I believe we should go ahead with this simple solution to the actual race today that both of those will suffer from. If we find a need for a more complex version then it can be debated and justified with proper context... Ok? Jason OK. I guess we have enough on the plate already :-) ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 01/11] mm/hmm: fix use after free with struct hmm in the mmu notifiers
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe mmu_notifier_unregister_no_release() is not a fence and the mmu_notifier system will continue to reference hmm->mn until the srcu grace period expires. Resulting in use after free races like this: CPU0 CPU1 __mmu_notifier_invalidate_range_start() srcu_read_lock hlist_for_each () // mn == hmm->mn hmm_mirror_unregister() hmm_put() hmm_free() mmu_notifier_unregister_no_release() hlist_del_init_rcu(hmm-mn->list) mn->ops->invalidate_range_start(mn, range); mm_get_hmm() mm->hmm = NULL; kfree(hmm) mutex_lock(>lock); Use SRCU to kfree the hmm memory so that the notifiers can rely on hmm existing. Get the now-safe hmm struct through container_of and directly check kref_get_unless_zero to lock it against free. Signed-off-by: Jason Gunthorpe You can add Reviewed-by: Ralph Campbell --- v2: - Spell 'free' properly (Jerome/Ralph) --- include/linux/hmm.h | 1 + mm/hmm.c| 25 +++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 092f0234bfe917..688c5ca7068795 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -102,6 +102,7 @@ struct hmm { struct mmu_notifier mmu_notifier; struct rw_semaphore mirrors_sem; wait_queue_head_t wq; + struct rcu_head rcu; longnotifiers; booldead; }; diff --git a/mm/hmm.c b/mm/hmm.c index 8e7403f081f44a..547002f56a163d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -113,6 +113,11 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) return NULL; } +static void hmm_free_rcu(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct hmm, rcu)); +} + static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); @@ -125,7 +130,7 @@ static void hmm_free(struct kref *kref) mm->hmm = NULL; spin_unlock(>page_table_lock); - kfree(hmm); + mmu_notifier_call_srcu(>rcu, hmm_free_rcu); } static inline void hmm_put(struct hmm *hmm) @@ -153,10 +158,14 @@ void hmm_mm_destroy(struct mm_struct *mm) static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_range *range; + /* hmm is in progress to free */ + if (!kref_get_unless_zero(>kref)) + return; + /* Report this HMM as dying. */ hmm->dead = true; @@ -194,13 +203,15 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; int ret = 0; - VM_BUG_ON(!hmm); + /* hmm is in progress to free */ + if (!kref_get_unless_zero(>kref)) + return 0; update.start = nrange->start; update.end = nrange->end; @@ -245,9 +256,11 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - VM_BUG_ON(!hmm); + /* hmm is in progress to free */ + if (!kref_get_unless_zero(>kref)) + return; mutex_lock(>lock); hmm->notifiers--; ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 07/11] mm/hmm: Use lockdep instead of comments
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe So we can check locking at runtime. Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: Ralph Campbell --- v2 - Fix missing & in lockdeps (Jason) --- mm/hmm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index f67ba32983d9f1..c702cd72651b53 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -254,11 +254,11 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. - * - * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! */ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) { + lockdep_assert_held_exclusive(>mmap_sem); + /* Sanity check */ if (!mm || !mirror || !mirror->ops) return -EINVAL; ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 05/11] mm/hmm: Remove duplicate condition test before wait_event_timeout
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe The wait_event_timeout macro already tests the condition as its first action, so there is no reason to open code another version of this, all that does is skip the might_sleep() debugging in common cases, which is not helpful. Further, based on prior patches, we can no simplify the required condition test: - If range is valid memory then so is range->hmm - If hmm_release() has run then range->valid is set to false at the same time as dead, so no reason to check both. - A valid hmm has a valid hmm->mm. Also, add the READ_ONCE for range->valid as there is no lock held here. Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse --- include/linux/hmm.h | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4ee3acabe5ed22..2ab35b40992b24 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -218,17 +218,9 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) { - /* Check if mm is dead ? */ - if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { - range->valid = false; - return false; - } - if (range->valid) - return true; - wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + wait_event_timeout(range->hmm->wq, range->valid, msecs_to_jiffies(timeout)); - /* Return current valid status just in case we get lucky */ - return range->valid; + return READ_ONCE(range->valid); } /* Since we are simplifying things, perhaps we should consider merging hmm_range_wait_until_valid() info hmm_range_register() and removing hmm_range_wait_until_valid() since the pattern is to always call the two together. In any case, this looks OK to me so you can add Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 10/11] mm/hmm: Do not use list*_rcu() for hmm->ranges
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe This list is always read and written while holding hmm->lock so there is no need for the confusing _rcu annotations. Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: Ralph Campbell --- mm/hmm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index c2fecb3ecb11e1..709d138dd49027 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -911,7 +911,7 @@ int hmm_range_register(struct hmm_range *range, mutex_lock(>lock); range->hmm = hmm; - list_add_rcu(>list, >ranges); + list_add(>list, >ranges); /* * If there are any concurrent notifiers we have to wait for them for @@ -941,7 +941,7 @@ void hmm_range_unregister(struct hmm_range *range) return; mutex_lock(>lock); - list_del_rcu(>list); + list_del(>list); mutex_unlock(>lock); /* Drop reference taken by hmm_range_register() */ ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 02/11] mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe Ralph observes that hmm_range_register() can only be called by a driver while a mirror is registered. Make this clear in the API by passing in the mirror structure as a parameter. This also simplifies understanding the lifetime model for struct hmm, as the hmm pointer must be valid as part of a registered mirror so all we need in hmm_register_range() is a simple kref_get. Suggested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe You might CC Ben for the nouveau part. CC: Ben Skeggs Reviewed-by: Ralph Campbell --- v2 - Include the oneline patch to nouveau_svm.c --- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 7 --- mm/hmm.c | 15 ++- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 93ed43c413f0bb..8c92374afcf227 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) range.values = nouveau_svm_pfn_values; range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; again: - ret = hmm_vma_fault(, true); + ret = hmm_vma_fault(>mirror, , true); if (ret == 0) { mutex_lock(>mutex); if (!hmm_vma_range_done()) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 688c5ca7068795..2d519797cb134a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) * Please see Documentation/vm/hmm.rst for how to use the range API. */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift); @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) } /* This is a temporary helper to avoid merge conflict between trees. */ -static inline int hmm_vma_fault(struct hmm_range *range, bool block) +static inline int hmm_vma_fault(struct hmm_mirror *mirror, + struct hmm_range *range, bool block) { long ret; @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, range->vma->vm_mm, + ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 547002f56a163d..8796447299023c 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); - struct hmm *hmm; + struct hmm *hmm = mirror->hmm; range->valid = false; range->hmm = NULL; @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - hmm = hmm_get_or_create(mm); - if (!hmm) - return -EFAULT; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) { - hmm_put(hmm); + if (hmm->mm == NULL || hmm->dead) return -EFAULT; - } + + range->hmm = hmm; + kref_get(>kref); /* Initialize range to track CPU page table updates. */ mutex_lock(>lock); ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 05/11] mm/hmm: Remove duplicate condition test before wait_event_timeout
On 6/7/19 1:44 PM, Jason Gunthorpe wrote: On Fri, Jun 07, 2019 at 01:21:12PM -0700, Ralph Campbell wrote: What I want to get to is a pattern like this: pagefault(): hmm_range_register(); again: /* On the slow path, if we appear to be live locked then we get the write side of mmap_sem which will break the live lock, otherwise this gets the read lock */ if (hmm_range_start_and_lock()) goto err; lockdep_assert_held(range->mm->mmap_sem); // Optional: Avoid useless expensive work if (hmm_range_needs_retry()) goto again; hmm_range_(touch vmas) take_lock(driver->update); if (hmm_range_end() { release_lock(driver->update); goto again; } // Finish driver updates release_lock(driver->update); // Releases mmap_sem hmm_range_unregister_and_unlock(); What do you think? Is it clear? Jason Are you talking about acquiring mmap_sem in hmm_range_start_and_lock()? Usually, the fault code has to lock mmap_sem for read in order to call find_vma() so it can set range.vma. If HMM drops mmap_sem - which I don't think it should, just return an error to tell the caller to drop mmap_sem and retry - the find_vma() will need to be repeated as well. Overall I don't think it makes a lot of sense to sleep for retry in hmm_range_start_and_lock() while holding mmap_sem. It would be better to drop that lock, sleep, then re-acquire it as part of the hmm logic. The find_vma should be done inside the critical section created by hmm_range_start_and_lock(), not before it. If we are retrying then we already slept and the additional CPU cost to repeat the find_vma is immaterial, IMHO? Do you see a reason why the find_vma() ever needs to be before the 'again' in my above example? range.vma does not need to be set for range_register. Yes, for the GPU case, there can be many faults in an event queue and the goal is to try to handle more than one page at a time. The vma is needed to limit the amount of coalescing and checking for pages that could be speculatively migrated or mapped. I'm also not sure about acquiring the mmap_sem for write as way to mitigate thrashing. It seems to me that if a device and a CPU are both faulting on the same page, One of the reasons to prefer this approach is that it means we don't need to keep track of which ranges we are faulting, and if there is a lot of *unrelated* fault activity (unlikely?) we can resolve it using mmap sem instead of this elaborate ranges scheme and related locking. This would reduce the overall work in the page fault and invalidate_start/end paths for the common uncontended cases. some sort of backoff delay is needed to let one side or the other make some progress. What the write side of the mmap_sem would do is force the CPU and device to cleanly take turns. Once the device pages are registered under the write side the CPU will have to wait in invalidate_start for the driver to complete a shootdown, then the whole thing starts all over again. It is certainly imaginable something could have a 'min life' timer for a device mapping and hold mm invalidate_start, and device pagefault for that min time to promote better sharing. But, if we don't use the mmap_sem then we can livelock and the device will see an unrecoverable error from the timeout which means we have risk that under load the system will simply obscurely fail. This seems unacceptable to me.. Particularly since for the ODP use case the issue is not trashing migration as a GPU might have, but simple system stability under swap load. We do not want the ODP pagefault to permanently fail due to timeout if the VMA is still valid.. Jason OK, I understand. If you come up with a set of changes, I can try testing them. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 05/11] mm/hmm: Remove duplicate condition test before wait_event_timeout
On 6/7/19 12:13 PM, Jason Gunthorpe wrote: On Fri, Jun 07, 2019 at 12:01:45PM -0700, Ralph Campbell wrote: On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe The wait_event_timeout macro already tests the condition as its first action, so there is no reason to open code another version of this, all that does is skip the might_sleep() debugging in common cases, which is not helpful. Further, based on prior patches, we can no simplify the required condition test: - If range is valid memory then so is range->hmm - If hmm_release() has run then range->valid is set to false at the same time as dead, so no reason to check both. - A valid hmm has a valid hmm->mm. Also, add the READ_ONCE for range->valid as there is no lock held here. Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse include/linux/hmm.h | 12 ++-- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4ee3acabe5ed22..2ab35b40992b24 100644 +++ b/include/linux/hmm.h @@ -218,17 +218,9 @@ static inline unsigned long hmm_range_page_size(const struct hmm_range *range) static inline bool hmm_range_wait_until_valid(struct hmm_range *range, unsigned long timeout) { - /* Check if mm is dead ? */ - if (range->hmm == NULL || range->hmm->dead || range->hmm->mm == NULL) { - range->valid = false; - return false; - } - if (range->valid) - return true; - wait_event_timeout(range->hmm->wq, range->valid || range->hmm->dead, + wait_event_timeout(range->hmm->wq, range->valid, msecs_to_jiffies(timeout)); - /* Return current valid status just in case we get lucky */ - return range->valid; + return READ_ONCE(range->valid); } /* Since we are simplifying things, perhaps we should consider merging hmm_range_wait_until_valid() info hmm_range_register() and removing hmm_range_wait_until_valid() since the pattern is to always call the two together. ? the hmm.rst shows the hmm_range_wait_until_valid being called in the (ret == -EAGAIN) path. It is confusing because it should really just have the again label moved up above hmm_range_wait_until_valid() as even if we get the driver lock it could still be a long wait for the colliding invalidation to clear. What I want to get to is a pattern like this: pagefault(): hmm_range_register(); again: /* On the slow path, if we appear to be live locked then we get the write side of mmap_sem which will break the live lock, otherwise this gets the read lock */ if (hmm_range_start_and_lock()) goto err; lockdep_assert_held(range->mm->mmap_sem); // Optional: Avoid useless expensive work if (hmm_range_needs_retry()) goto again; hmm_range_(touch vmas) take_lock(driver->update); if (hmm_range_end() { release_lock(driver->update); goto again; } // Finish driver updates release_lock(driver->update); // Releases mmap_sem hmm_range_unregister_and_unlock(); What do you think? Is it clear? Jason Are you talking about acquiring mmap_sem in hmm_range_start_and_lock()? Usually, the fault code has to lock mmap_sem for read in order to call find_vma() so it can set range.vma. If HMM drops mmap_sem - which I don't think it should, just return an error to tell the caller to drop mmap_sem and retry - the find_vma() will need to be repeated as well. I'm also not sure about acquiring the mmap_sem for write as way to mitigate thrashing. It seems to me that if a device and a CPU are both faulting on the same page, some sort of backoff delay is needed to let one side or the other make some progress. Thrashing mitigation and how migrate_vma() plays in this is a deep topic for thought. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 06/11] mm/hmm: Hold on to the mmget for the lifetime of the range
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe Range functions like hmm_range_snapshot() and hmm_range_fault() call find_vma, which requires hodling the mmget() and the mmap_sem for the mm. Make this simpler for the callers by holding the mmget() inside the range for the lifetime of the range. Other functions that accept a range should only be called if the range is registered. This has the side effect of directly preventing hmm_release() from happening while a range is registered. That means range->dead cannot be false during the lifetime of the range, so remove dead and hmm_mirror_mm_is_alive() entirely. Signed-off-by: Jason Gunthorpe Looks good to me. Reviewed-by: Ralph Campbell --- v2: - Use Jerome's idea of just holding the mmget() for the range lifetime, rework the patch to use that as as simplification to remove dead in one step --- include/linux/hmm.h | 26 -- mm/hmm.c| 28 ++-- 2 files changed, 10 insertions(+), 44 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2ab35b40992b24..0e20566802967a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -91,7 +91,6 @@ * @mirrors_sem: read/write semaphore protecting the mirrors list * @wq: wait queue for user waiting on a range invalidation * @notifiers: count of active mmu notifiers - * @dead: is the mm dead ? */ struct hmm { struct mm_struct*mm; @@ -104,7 +103,6 @@ struct hmm { wait_queue_head_t wq; struct rcu_head rcu; longnotifiers; - booldead; }; /* @@ -469,30 +467,6 @@ struct hmm_mirror { int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm); void hmm_mirror_unregister(struct hmm_mirror *mirror); -/* - * hmm_mirror_mm_is_alive() - test if mm is still alive - * @mirror: the HMM mm mirror for which we want to lock the mmap_sem - * Return: false if the mm is dead, true otherwise - * - * This is an optimization, it will not always accurately return false if the - * mm is dead; i.e., there can be false negatives (process is being killed but - * HMM is not yet informed of that). It is only intended to be used to optimize - * out cases where the driver is about to do something time consuming and it - * would be better to skip it if the mm is dead. - */ -static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) -{ - struct mm_struct *mm; - - if (!mirror || !mirror->hmm) - return false; - mm = READ_ONCE(mirror->hmm->mm); - if (mirror->hmm->dead || !mm) - return false; - - return true; -} - /* * Please see Documentation/vm/hmm.rst for how to use the range API. */ diff --git a/mm/hmm.c b/mm/hmm.c index dc30edad9a8a02..f67ba32983d9f1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -80,7 +80,6 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) mutex_init(>lock); kref_init(>kref); hmm->notifiers = 0; - hmm->dead = false; hmm->mm = mm; hmm->mmu_notifier.ops = _mmu_notifier_ops; @@ -124,20 +123,17 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_range *range; /* hmm is in progress to free */ if (!kref_get_unless_zero(>kref)) return; - /* Report this HMM as dying. */ - hmm->dead = true; - - /* Wake-up everyone waiting on any range. */ mutex_lock(>lock); - list_for_each_entry(range, >ranges, list) - range->valid = false; - wake_up_all(>wq); + /* +* Since hmm_range_register() holds the mmget() lock hmm_release() is +* prevented as long as a range exists. +*/ + WARN_ON(!list_empty(>ranges)); mutex_unlock(>lock); down_write(>mirrors_sem); @@ -909,8 +905,8 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) + /* Prevent hmm_release() from running while the range is valid */ + if (!mmget_not_zero(hmm->mm)) return -EFAULT; range->hmm = hmm; @@ -955,6 +951,7 @@ void hmm_range_unregister(struct hmm_range *range) /* Drop reference taken by hmm_range_register() */ range->valid = false; + mmput(hmm->mm); hmm_put(hmm); range->hmm = NULL; } @@ -982,10 +979,7 @@ long hmm_range_snapshot(struct hmm_range *range) struct vm_area_struct *vma; struct mm_walk mm_walk; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm-&g
Re: [PATCH v2 12/11] mm/hmm: Fix error flows in hmm_invalidate_range_start
On 6/7/19 9:05 AM, Jason Gunthorpe wrote: If the trylock on the hmm->mirrors_sem fails the function will return without decrementing the notifiers that were previously incremented. Since the caller will not call invalidate_range_end() on EAGAIN this will result in notifiers becoming permanently incremented and deadlock. If the sync_cpu_device_pagetables() required blocking the function will not return EAGAIN even though the device continues to touch the pages. This is a violation of the mmu notifier contract. Switch, and rename, the ranges_lock to a spin lock so we can reliably obtain it without blocking during error unwind. The error unwind is necessary since the notifiers count must be held incremented across the call to sync_cpu_device_pagetables() as we cannot allow the range to become marked valid by a parallel invalidate_start/end() pair while doing sync_cpu_device_pagetables(). Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell --- include/linux/hmm.h | 2 +- mm/hmm.c| 77 +++-- 2 files changed, 48 insertions(+), 31 deletions(-) I almost lost this patch - it is part of the series, hasn't been posted before, and wasn't sent with the rest, sorry. diff --git a/include/linux/hmm.h b/include/linux/hmm.h index bf013e96525771..0fa8ea34ccef6d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -86,7 +86,7 @@ struct hmm { struct mm_struct*mm; struct kref kref; - struct mutexlock; + spinlock_t ranges_lock; struct list_headranges; struct list_headmirrors; struct mmu_notifier mmu_notifier; diff --git a/mm/hmm.c b/mm/hmm.c index 4215edf737ef5b..10103a24e9b7b3 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -68,7 +68,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) init_rwsem(>mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(>ranges); - mutex_init(>lock); + spin_lock_init(>ranges_lock); kref_init(>kref); hmm->notifiers = 0; hmm->mm = mm; @@ -114,18 +114,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; + unsigned long flags; /* Bail out if hmm is in the process of being freed */ if (!kref_get_unless_zero(>kref)) return; - mutex_lock(>lock); + spin_lock_irqsave(>ranges_lock, flags); /* * Since hmm_range_register() holds the mmget() lock hmm_release() is * prevented as long as a range exists. */ WARN_ON(!list_empty(>ranges)); - mutex_unlock(>lock); + spin_unlock_irqrestore(>ranges_lock, flags); down_read(>mirrors_sem); list_for_each_entry(mirror, >mirrors, list) { @@ -141,6 +142,23 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) hmm_put(hmm); } +static void notifiers_decrement(struct hmm *hmm) +{ + lockdep_assert_held(>ranges_lock); + + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; + + list_for_each_entry(range, >ranges, list) { + if (range->valid) + continue; + range->valid = true; + } This just effectively sets all ranges to valid. I'm not sure that is best. Shouldn't hmm_range_register() start with range.valid = true and then hmm_invalidate_range_start() set affected ranges to false? Then this becomes just wake_up_all() if --notifiers == 0 and hmm_range_wait_until_valid() should wait for notifiers == 0. Otherwise, range.valid doesn't really mean it's valid. + wake_up_all(>wq); + } +} + static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { @@ -148,6 +166,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; + unsigned long flags; int ret = 0; if (!kref_get_unless_zero(>kref)) @@ -158,12 +177,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, update.event = HMM_UPDATE_INVALIDATE; update.blockable = mmu_notifier_range_blockable(nrange); - if (mmu_notifier_range_blockable(nrange)) - mutex_lock(>lock); - else if (!mutex_trylock(>lock)) { - ret = -EAGAIN; - goto out; - } + spin_lock_irqsave(>ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, >ranges, list) { if (update.end < range->start || update.start >= range->
Re: [PATCH v2 hmm 02/11] mm/hmm: Use hmm_mirror not mm as an argument for hmm_range_register
On 6/7/19 11:24 AM, Ralph Campbell wrote: On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe Ralph observes that hmm_range_register() can only be called by a driver while a mirror is registered. Make this clear in the API by passing in the mirror structure as a parameter. This also simplifies understanding the lifetime model for struct hmm, as the hmm pointer must be valid as part of a registered mirror so all we need in hmm_register_range() is a simple kref_get. Suggested-by: Ralph Campbell Signed-off-by: Jason Gunthorpe You might CC Ben for the nouveau part. CC: Ben Skeggs Reviewed-by: Ralph Campbell --- v2 - Include the oneline patch to nouveau_svm.c --- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 7 --- mm/hmm.c | 15 ++- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 93ed43c413f0bb..8c92374afcf227 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -649,7 +649,7 @@ nouveau_svm_fault(struct nvif_notify *notify) range.values = nouveau_svm_pfn_values; range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; again: - ret = hmm_vma_fault(, true); + ret = hmm_vma_fault(>mirror, , true); if (ret == 0) { mutex_lock(>mutex); if (!hmm_vma_range_done()) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 688c5ca7068795..2d519797cb134a 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -505,7 +505,7 @@ static inline bool hmm_mirror_mm_is_alive(struct hmm_mirror *mirror) * Please see Documentation/vm/hmm.rst for how to use the range API. */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift); @@ -541,7 +541,8 @@ static inline bool hmm_vma_range_done(struct hmm_range *range) } /* This is a temporary helper to avoid merge conflict between trees. */ -static inline int hmm_vma_fault(struct hmm_range *range, bool block) +static inline int hmm_vma_fault(struct hmm_mirror *mirror, + struct hmm_range *range, bool block) { long ret; @@ -554,7 +555,7 @@ static inline int hmm_vma_fault(struct hmm_range *range, bool block) range->default_flags = 0; range->pfn_flags_mask = -1UL; - ret = hmm_range_register(range, range->vma->vm_mm, + ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); if (ret) diff --git a/mm/hmm.c b/mm/hmm.c index 547002f56a163d..8796447299023c 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -925,13 +925,13 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); - struct hmm *hmm; + struct hmm *hmm = mirror->hmm; range->valid = false; range->hmm = NULL; @@ -945,15 +945,12 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - hmm = hmm_get_or_create(mm); - if (!hmm) - return -EFAULT; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) { - hmm_put(hmm); + if (hmm->mm == NULL || hmm->dead) return -EFAULT; - } + + range->hmm = hmm; + kref_get(>kref); /* Initialize range to track CPU page table updates. */ mutex_lock(>lock); I forgot to add that I think you can delete the duplicate "range->hmm = hmm;" here between the mutex_lock/unlock. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 04/11] mm/hmm: Simplify hmm_get_or_create and make it reliable
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe As coded this function can false-fail in various racy situations. Make it reliable by running only under the write side of the mmap_sem and avoiding the false-failing compare/exchange pattern. Also make the locking very easy to understand by only ever reading or writing mm->hmm while holding the write side of the mmap_sem. Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell --- v2: - Fix error unwind of mmgrab (Jerome) - Use hmm local instead of 2nd container_of (Jerome) --- mm/hmm.c | 80 1 file changed, 29 insertions(+), 51 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index cc7c26fda3300e..dc30edad9a8a02 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -40,16 +40,6 @@ #if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -static inline struct hmm *mm_get_hmm(struct mm_struct *mm) -{ - struct hmm *hmm = READ_ONCE(mm->hmm); - - if (hmm && kref_get_unless_zero(>kref)) - return hmm; - - return NULL; -} - /** * hmm_get_or_create - register HMM against an mm (HMM internal) * @@ -64,11 +54,20 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm) */ static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); - bool cleanup = false; + struct hmm *hmm; - if (hmm) - return hmm; + lockdep_assert_held_exclusive(>mmap_sem); + + if (mm->hmm) { + if (kref_get_unless_zero(>hmm->kref)) + return mm->hmm; + /* +* The hmm is being freed by some other CPU and is pending a +* RCU grace period, but this CPU can NULL now it since we +* have the mmap_sem. +*/ + mm->hmm = NULL; + } hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) @@ -83,57 +82,36 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) hmm->notifiers = 0; hmm->dead = false; hmm->mm = mm; - mmgrab(hmm->mm); - - spin_lock(>page_table_lock); - if (!mm->hmm) - mm->hmm = hmm; - else - cleanup = true; - spin_unlock(>page_table_lock); - if (cleanup) - goto error; - - /* -* We should only get here if hold the mmap_sem in write mode ie on -* registration of first mirror through hmm_mirror_register() -*/ hmm->mmu_notifier.ops = _mmu_notifier_ops; - if (__mmu_notifier_register(>mmu_notifier, mm)) - goto error_mm; + if (__mmu_notifier_register(>mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } + mmgrab(hmm->mm); + mm->hmm = hmm; return hmm; - -error_mm: - spin_lock(>page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(>page_table_lock); -error: - mmdrop(hmm->mm); - kfree(hmm); - return NULL; } static void hmm_free_rcu(struct rcu_head *rcu) { - kfree(container_of(rcu, struct hmm, rcu)); + struct hmm *hmm = container_of(rcu, struct hmm, rcu); + + down_write(>mm->mmap_sem); + if (hmm->mm->hmm == hmm) + hmm->mm->hmm = NULL; + up_write(>mm->mmap_sem); + mmdrop(hmm->mm); + + kfree(hmm); } static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); - struct mm_struct *mm = hmm->mm; - - mmu_notifier_unregister_no_release(>mmu_notifier, mm); - spin_lock(>page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(>page_table_lock); - - mmdrop(hmm->mm); + mmu_notifier_unregister_no_release(>mmu_notifier, hmm->mm); mmu_notifier_call_srcu(>rcu, hmm_free_rcu); } ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC] mm/hmm: pass mmu_notifier_range to sync_cpu_device_pagetables
HMM defines its own struct hmm_update which is passed to the sync_cpu_device_pagetables() callback function. This is sufficient when the only action is to invalidate. However, a device may want to know the reason for the invalidation and be able to see the new permissions on a range, update device access rights or range statistics. Since sync_cpu_device_pagetables() can be called from try_to_unmap(), the mmap_sem may not be held and find_vma() is not safe to be called. Pass the struct mmu_notifier_range to sync_cpu_device_pagetables() to allow the full invalidation information to be used. Signed-off-by: Ralph Campbell --- I'm sending this out now since we are updating many of the HMM APIs and I think it will be useful. drivers/gpu/drm/nouveau/nouveau_svm.c | 4 ++-- include/linux/hmm.h | 27 ++- mm/hmm.c | 13 - 3 files changed, 8 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 8c92374afcf2..c34b98fafe2f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) static int nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); unsigned long start = update->start; unsigned long limit = update->end; - if (!update->blockable) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 0fa8ea34ccef..07a2d38fde34 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -377,29 +377,6 @@ static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, struct hmm_mirror; -/* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - /* * struct hmm_mirror_ops - HMM mirror device operations callback * @@ -420,7 +397,7 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror -* @update: update information (see struct hmm_update) +* @update: update information (see struct mmu_notifier_range) * Return: -EAGAIN if update.blockable false and callback need to * block, 0 otherwise. * @@ -434,7 +411,7 @@ struct hmm_mirror_ops { * synchronous call. */ int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, - const struct hmm_update *update); + const struct mmu_notifier_range *update); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 9aad3550f2bb..b49a43712554 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -164,7 +164,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_update update; struct hmm_range *range; unsigned long flags; int ret = 0; @@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, if (!kref_get_unless_zero(>kref)) return 0; - update.start = nrange->start; - update.end = nrange->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = mmu_notifier_range_blockable(nrange); - spin_lock_irqsave(>ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, >ranges, list) { - if (update.end < range->start || update.start >= range->end) + if (nrange->end < range->start || nrange->start >= range->end) continue; range->valid = false; @@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, list_for_each_entry(mirror, >mirrors, list) { int rc; - rc = mirror->ops->sync_cpu_device_pagetables(mirror,
Re: [PATCH v2 hmm 03/11] mm/hmm: Hold a mmgrab from hmm to mm
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe So long a a struct hmm pointer exists, so should the struct mm it is s/a a/as a/ linked too. Hold the mmgrab() as soon as a hmm is created, and mmdrop() it once the hmm refcount goes to zero. Since mmdrop() (ie a 0 kref on struct mm) is now impossible with a !NULL mm->hmm delete the hmm_hmm_destroy(). Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: Ralph Campbell --- v2: - Fix error unwind paths in hmm_get_or_create (Jerome/Jason) --- include/linux/hmm.h | 3 --- kernel/fork.c | 1 - mm/hmm.c| 22 -- 3 files changed, 4 insertions(+), 22 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2d519797cb134a..4ee3acabe5ed22 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -586,14 +586,11 @@ static inline int hmm_vma_fault(struct hmm_mirror *mirror, } /* Below are for HMM internal use only! Not to be used by device driver! */ -void hmm_mm_destroy(struct mm_struct *mm); - static inline void hmm_mm_init(struct mm_struct *mm) { mm->hmm = NULL; } #else /* IS_ENABLED(CONFIG_HMM_MIRROR) */ -static inline void hmm_mm_destroy(struct mm_struct *mm) {} static inline void hmm_mm_init(struct mm_struct *mm) {} #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff --git a/kernel/fork.c b/kernel/fork.c index b2b87d450b80b5..588c768ae72451 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -673,7 +673,6 @@ void __mmdrop(struct mm_struct *mm) WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); destroy_context(mm); - hmm_mm_destroy(mm); mmu_notifier_mm_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); diff --git a/mm/hmm.c b/mm/hmm.c index 8796447299023c..cc7c26fda3300e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -82,6 +83,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) hmm->notifiers = 0; hmm->dead = false; hmm->mm = mm; + mmgrab(hmm->mm); spin_lock(>page_table_lock); if (!mm->hmm) @@ -109,6 +111,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) mm->hmm = NULL; spin_unlock(>page_table_lock); error: + mmdrop(hmm->mm); kfree(hmm); return NULL; } @@ -130,6 +133,7 @@ static void hmm_free(struct kref *kref) mm->hmm = NULL; spin_unlock(>page_table_lock); + mmdrop(hmm->mm); mmu_notifier_call_srcu(>rcu, hmm_free_rcu); } @@ -138,24 +142,6 @@ static inline void hmm_put(struct hmm *hmm) kref_put(>kref, hmm_free); } -void hmm_mm_destroy(struct mm_struct *mm) -{ - struct hmm *hmm; - - spin_lock(>page_table_lock); - hmm = mm_get_hmm(mm); - mm->hmm = NULL; - if (hmm) { - hmm->mm = NULL; - hmm->dead = true; - spin_unlock(>page_table_lock); - hmm_put(hmm); - return; - } - - spin_unlock(>page_table_lock); -} - static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 08/11] mm/hmm: Remove racy protection against double-unregistration
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe No other register/unregister kernel API attempts to provide this kind of protection as it is inherently racy, so just drop it. Callers should provide their own protection, it appears nouveau already does, but just in case drop a debugging POISON. Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: Ralph Campbell --- mm/hmm.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index c702cd72651b53..6802de7080d172 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -284,18 +284,13 @@ EXPORT_SYMBOL(hmm_mirror_register); */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = READ_ONCE(mirror->hmm); - - if (hmm == NULL) - return; + struct hmm *hmm = mirror->hmm; down_write(>mirrors_sem); list_del_init(>list); - /* To protect us against double unregister ... */ - mirror->hmm = NULL; up_write(>mirrors_sem); - hmm_put(hmm); + memset(>hmm, POISON_INUSE, sizeof(mirror->hmm)); } EXPORT_SYMBOL(hmm_mirror_unregister); ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 11/11] mm/hmm: Remove confusing comment and logic from hmm_release
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe hmm_release() is called exactly once per hmm. ops->release() cannot accidentally trigger any action that would recurse back onto hmm->mirrors_sem. This fixes a use after-free race of the form: CPU0 CPU1 hmm_release() up_write(>mirrors_sem); hmm_mirror_unregister(mirror) down_write(>mirrors_sem); up_write(>mirrors_sem); kfree(mirror) mirror->ops->release(mirror) The only user we have today for ops->release is an empty function, so this is unambiguously safe. As a consequence of plugging this race drivers are not allowed to register/unregister mirrors from within a release op. Signed-off-by: Jason Gunthorpe I agree with the analysis above but I'm not sure that release() will always be an empty function. It might be more efficient to write back all data migrated to a device "in one pass" instead of relying on unmap_vmas() calling hmm_start_range_invalidate() per VMA. I think the bigger issue is potential deadlocks while calling sync_cpu_device_pagetables() and tasks calling hmm_mirror_unregister(): Say you have three threads: - Thread A is in try_to_unmap(), either without holding mmap_sem or with mmap_sem held for read. - Thread B has some unrelated driver calling hmm_mirror_unregister(). This doesn't require mmap_sem. - Thread C is about to call migrate_vma(). Thread AThread B Thread C try_to_unmaphmm_mirror_unregistermigrate_vma -- --- -- hmm_invalidate_range_start down_read(mirrors_sem) down_write(mirrors_sem) // Blocked on A device_lock device_lock // Blocked on C migrate_vma() hmm_invalidate_range_s down_read(mirrors_sem) // Blocked on B // Deadlock Perhaps we should consider using SRCU for walking the mirror->list? --- mm/hmm.c | 28 +--- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 709d138dd49027..3a45dd3d778248 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -136,26 +136,16 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) WARN_ON(!list_empty(>ranges)); mutex_unlock(>lock); - down_write(>mirrors_sem); - mirror = list_first_entry_or_null(>mirrors, struct hmm_mirror, - list); - while (mirror) { - list_del_init(>list); - if (mirror->ops->release) { - /* -* Drop mirrors_sem so the release callback can wait -* on any pending work that might itself trigger a -* mmu_notifier callback and thus would deadlock with -* us. -*/ - up_write(>mirrors_sem); + down_read(>mirrors_sem); + list_for_each_entry(mirror, >mirrors, list) { + /* +* Note: The driver is not allowed to trigger +* hmm_mirror_unregister() from this thread. +*/ + if (mirror->ops->release) mirror->ops->release(mirror); - down_write(>mirrors_sem); - } - mirror = list_first_entry_or_null(>mirrors, - struct hmm_mirror, list); } - up_write(>mirrors_sem); + up_read(>mirrors_sem); hmm_put(hmm); } @@ -287,7 +277,7 @@ void hmm_mirror_unregister(struct hmm_mirror *mirror) struct hmm *hmm = mirror->hmm; down_write(>mirrors_sem); - list_del_init(>list); + list_del(>list); up_write(>mirrors_sem); hmm_put(hmm); memset(>hmm, POISON_INUSE, sizeof(mirror->hmm)); ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 hmm 09/11] mm/hmm: Poison hmm_range during unregister
On 6/6/19 11:44 AM, Jason Gunthorpe wrote: From: Jason Gunthorpe Trying to misuse a range outside its lifetime is a kernel bug. Use WARN_ON and poison bytes to detect this condition. Signed-off-by: Jason Gunthorpe Reviewed-by: Jérôme Glisse Reviewed-by: Ralph Campbell --- v2 - Keep range start/end valid after unregistration (Jerome) --- mm/hmm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 6802de7080d172..c2fecb3ecb11e1 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -937,7 +937,7 @@ void hmm_range_unregister(struct hmm_range *range) struct hmm *hmm = range->hmm; /* Sanity check this really should not happen. */ - if (hmm == NULL || range->end <= range->start) + if (WARN_ON(range->end <= range->start)) return; WARN_ON() is definitely better than silent return but I wonder how useful it is since the caller shouldn't be modifying the hmm_range once it is registered. Other fields could be changed too... mutex_lock(>lock); @@ -948,7 +948,10 @@ void hmm_range_unregister(struct hmm_range *range) range->valid = false; mmput(hmm->mm); hmm_put(hmm); - range->hmm = NULL; + + /* The range is now invalid, leave it poisoned. */ + range->valid = false; + memset(>hmm, POISON_INUSE, sizeof(range->hmm)); } EXPORT_SYMBOL(hmm_range_unregister); ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [RFC] mm/hmm: pass mmu_notifier_range to sync_cpu_device_pagetables
On 7/2/19 12:53 PM, Jason Gunthorpe wrote: On Fri, Jun 07, 2019 at 05:14:52PM -0700, Ralph Campbell wrote: HMM defines its own struct hmm_update which is passed to the sync_cpu_device_pagetables() callback function. This is sufficient when the only action is to invalidate. However, a device may want to know the reason for the invalidation and be able to see the new permissions on a range, update device access rights or range statistics. Since sync_cpu_device_pagetables() can be called from try_to_unmap(), the mmap_sem may not be held and find_vma() is not safe to be called. Pass the struct mmu_notifier_range to sync_cpu_device_pagetables() to allow the full invalidation information to be used. Signed-off-by: Ralph Campbell --- I'm sending this out now since we are updating many of the HMM APIs and I think it will be useful. This make so much sense, I'd like to apply this in hmm.git, is there any objection? Jason Not from me. :-) Thanks! ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: hmm_range_fault related fixes and legacy API removal v2
On 7/4/19 9:42 AM, Jason Gunthorpe wrote: On Wed, Jul 03, 2019 at 03:02:08PM -0700, Christoph Hellwig wrote: Hi Jérôme, Ben and Jason, below is a series against the hmm tree which fixes up the mmap_sem locking in nouveau and while at it also removes leftover legacy HMM APIs only used by nouveau. Changes since v1: - don't return the valid state from hmm_range_unregister - additional nouveau cleanups Ralph, since most of this is nouveau could you contribute a Tested-by? Thanks Jason I can test things fairly easily but with all the different patches, conflicts, and personal git trees, can you specify the git tree and branch with everything applied that you want me to test? ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 18/22] mm: return valid info from hmm_range_unregister
On 6/30/19 11:20 PM, Christoph Hellwig wrote: Checking range->valid is trivial and has no meaningful cost, but nicely simplifies the fastpath in typical callers. Also remove the hmm_vma_range_done function, which now is a trivial wrapper around hmm_range_unregister. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 11 +-- mm/hmm.c | 6 +- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 8c92374afcf2..9d40114d7949 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -652,7 +652,7 @@ nouveau_svm_fault(struct nvif_notify *notify) ret = hmm_vma_fault(>mirror, , true); if (ret == 0) { mutex_lock(>mutex); - if (!hmm_vma_range_done()) { + if (!hmm_range_unregister()) { mutex_unlock(>mutex); goto again; } diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 0fa8ea34ccef..4b185d286c3b 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -465,7 +465,7 @@ int hmm_range_register(struct hmm_range *range, unsigned long start, unsigned long end, unsigned page_shift); -void hmm_range_unregister(struct hmm_range *range); +bool hmm_range_unregister(struct hmm_range *range); long hmm_range_snapshot(struct hmm_range *range); long hmm_range_fault(struct hmm_range *range, bool block); long hmm_range_dma_map(struct hmm_range *range, @@ -487,15 +487,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, */ #define HMM_RANGE_DEFAULT_TIMEOUT 1000 -/* This is a temporary helper to avoid merge conflict between trees. */ -static inline bool hmm_vma_range_done(struct hmm_range *range) -{ - bool ret = hmm_range_valid(range); - - hmm_range_unregister(range); - return ret; -} - /* This is a temporary helper to avoid merge conflict between trees. */ static inline int hmm_vma_fault(struct hmm_mirror *mirror, struct hmm_range *range, bool block) diff --git a/mm/hmm.c b/mm/hmm.c index de35289df20d..c85ed7d4e2ce 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -920,11 +920,14 @@ EXPORT_SYMBOL(hmm_range_register); * * Range struct is used to track updates to the CPU page table after a call to * hmm_range_register(). See include/linux/hmm.h for how to use it. + * + * Returns if the range was still valid at the time of unregistering. Since this is an exported function, we should have kernel-doc comments. That is probably a separate patch but at least this line could be: Return: True if the range was still valid at the time of unregistering. */ -void hmm_range_unregister(struct hmm_range *range) +bool hmm_range_unregister(struct hmm_range *range) { struct hmm *hmm = range->hmm; unsigned long flags; + bool ret = range->valid; spin_lock_irqsave(>ranges_lock, flags); list_del_init(>list); @@ -941,6 +944,7 @@ void hmm_range_unregister(struct hmm_range *range) */ range->valid = false; memset(>hmm, POISON_INUSE, sizeof(range->hmm)); + return ret; } EXPORT_SYMBOL(hmm_range_unregister); ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 20/22] mm: move hmm_vma_fault to nouveau
On 6/30/19 11:20 PM, Christoph Hellwig wrote: hmm_vma_fault is marked as a legacy API to get rid of, but quite suites the current nouvea flow. Move it to the only user in preparation for I didn't quite parse the phrase "quite suites the current nouvea flow." s/nouvea/nouveau/ fixing a locking bug involving caller and callee. Signed-off-by: Christoph Hellwig I see where you are going with this and it looks like straightforward code movement so, Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_svm.c | 54 ++- include/linux/hmm.h | 54 --- 2 files changed, 53 insertions(+), 55 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 9d40114d7949..e831f4184a17 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -36,6 +36,13 @@ #include #include +/* + * When waiting for mmu notifiers we need some kind of time out otherwise we + * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to + * wait already. + */ +#define NOUVEAU_RANGE_FAULT_TIMEOUT 1000 + struct nouveau_svm { struct nouveau_drm *drm; struct mutex mutex; @@ -475,6 +482,51 @@ nouveau_svm_fault_cache(struct nouveau_svm *svm, fault->inst, fault->addr, fault->access); } +static int +nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range, + bool block) +{ + long ret; + + /* +* With the old API the driver must set each individual entries with +* the requested flags (valid, write, ...). So here we set the mask to +* keep intact the entries provided by the driver and zero out the +* default_flags. +*/ + range->default_flags = 0; + range->pfn_flags_mask = -1UL; + + ret = hmm_range_register(range, mirror, +range->start, range->end, +PAGE_SHIFT); + if (ret) + return (int)ret; + + if (!hmm_range_wait_until_valid(range, NOUVEAU_RANGE_FAULT_TIMEOUT)) { + /* +* The mmap_sem was taken by driver we release it here and +* returns -EAGAIN which correspond to mmap_sem have been +* drop in the old API. +*/ + up_read(>vma->vm_mm->mmap_sem); + return -EAGAIN; + } + + ret = hmm_range_fault(range, block); + if (ret <= 0) { + if (ret == -EBUSY || !ret) { + /* Same as above, drop mmap_sem to match old API. */ + up_read(>vma->vm_mm->mmap_sem); + ret = -EBUSY; + } else if (ret == -EAGAIN) + ret = -EBUSY; + hmm_range_unregister(range); + return ret; + } + return 0; +} + static int nouveau_svm_fault(struct nvif_notify *notify) { @@ -649,7 +701,7 @@ nouveau_svm_fault(struct nvif_notify *notify) range.values = nouveau_svm_pfn_values; range.pfn_shift = NVIF_VMM_PFNMAP_V0_ADDR_SHIFT; again: - ret = hmm_vma_fault(>mirror, , true); + ret = nouveau_range_fault(>mirror, , true); if (ret == 0) { mutex_lock(>mutex); if (!hmm_range_unregister()) { diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4b185d286c3b..3457cf9182e5 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -478,60 +478,6 @@ long hmm_range_dma_unmap(struct hmm_range *range, dma_addr_t *daddrs, bool dirty); -/* - * HMM_RANGE_DEFAULT_TIMEOUT - default timeout (ms) when waiting for a range - * - * When waiting for mmu notifiers we need some kind of time out otherwise we - * could potentialy wait for ever, 1000ms ie 1s sounds like a long time to - * wait already. - */ -#define HMM_RANGE_DEFAULT_TIMEOUT 1000 - -/* This is a temporary helper to avoid merge conflict between trees. */ -static inline int hmm_vma_fault(struct hmm_mirror *mirror, - struct hmm_range *range, bool block) -{ - long ret; - - /* -* With the old API the driver must set each individual entries with -* the requested flags (valid, write, ...). So here we set the mask to -* keep intact the entries provided by the driver and zero out the -* default_flags. -*/ - range->default_flags = 0; - range->pfn_flags_mask = -1UL; - - ret = hmm_range_register(range, mirror, -range->start, range->end, -PAGE_SHIFT); - if (ret) - return (int)ret; -
Re: [PATCH 4/5] nouveau: unlock mmap_sem on all errors from nouveau_range_fault
On 7/3/19 11:45 AM, Christoph Hellwig wrote: Currently nouveau_svm_fault expects nouveau_range_fault to never unlock mmap_sem, but the latter unlocks it for a random selection of error codes. Fix this up by always unlocking mmap_sem for non-zero return values in nouveau_range_fault, and only unlocking it in the caller for successful returns. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_svm.c | 15 --- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index e831f4184a17..c0cf7aeaefb3 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -500,8 +500,10 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range, You can delete the comment "With the old API the driver must ..." (not visible in the patch here). I suggest moving the two assignments: range->default_flags = 0; range->pfn_flags_mask = -1UL; to just above the "again:" where the other range.xxx fields are initialized in nouveau_svm_fault(). ret = hmm_range_register(range, mirror, range->start, range->end, PAGE_SHIFT); - if (ret) + if (ret) { + up_read(>vma->vm_mm->mmap_sem; > return (int)ret; + } if (!hmm_range_wait_until_valid(range, NOUVEAU_RANGE_FAULT_TIMEOUT)) { /* You can delete this comment (only the first line is visible here) since it is about the "old API". Also, it should return -EBUSY not -EAGAIN since it means there was a range invalidation collision (similar to hmm_range_fault() if !range->valid). @@ -515,15 +517,14 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range, ret = hmm_range_fault(range, block); nouveau_range_fault() is only called with "block = true" so could eliminate the block parameter and pass true here. if (ret <= 0) { - if (ret == -EBUSY || !ret) { - /* Same as above, drop mmap_sem to match old API. */ - up_read(>vma->vm_mm->mmap_sem); - ret = -EBUSY; - } else if (ret == -EAGAIN) + if (ret == 0) ret = -EBUSY; + if (ret != -EAGAIN) + up_read(>vma->vm_mm->mmap_sem); Can ret == -EAGAIN happen if "block = true"? Generally, I prefer the read_down()/read_up() in the same function (i.e., nouveau_svm_fault()) but I can see why it should be here if hmm_range_fault() can return with mmap_sem unlocked. hmm_range_unregister(range); return ret; } + return 0; } @@ -718,8 +719,8 @@ nouveau_svm_fault(struct nvif_notify *notify) NULL); svmm->vmm->vmm.object.client->super = false; mutex_unlock(>mutex); + up_read(>mm->mmap_sem); } - up_read(>mm->mmap_sem); The "else" case should check for -EBUSY and goto again. /* Cancel any faults in the window whose pages didn't manage * to keep their valid bit, or stay writeable when required. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 5/5] mm: remove the legacy hmm_pfn_* APIs
On 7/3/19 11:45 AM, Christoph Hellwig wrote: Switch the one remaining user in nouveau over to its replacement, and remove all the wrappers. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/hmm.h| 34 -- 2 files changed, 1 insertion(+), 35 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 42c026010938..b9ced2e61667 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -844,7 +844,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, struct page *page; uint64_t addr; - page = hmm_pfn_to_page(range, range->pfns[i]); + page = hmm_device_entry_to_page(range, range->pfns[i]); if (page == NULL) continue; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 657606f48796..cdcd78627393 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -290,40 +290,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, range->flags[HMM_PFN_VALID]; } -/* - * Old API: - * hmm_pfn_to_page() - * hmm_pfn_to_pfn() - * hmm_pfn_from_page() - * hmm_pfn_from_pfn() - * - * This are the OLD API please use new API, it is here to avoid cross-tree - * merge painfullness ie we convert things to new API in stages. - */ -static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, - uint64_t pfn) -{ - return hmm_device_entry_to_page(range, pfn); -} - -static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, - uint64_t pfn) -{ - return hmm_device_entry_to_pfn(range, pfn); -} - -static inline uint64_t hmm_pfn_from_page(const struct hmm_range *range, -struct page *page) -{ - return hmm_device_entry_from_page(range, page); -} - -static inline uint64_t hmm_pfn_from_pfn(const struct hmm_range *range, - unsigned long pfn) -{ - return hmm_device_entry_from_pfn(range, pfn); -} - /* * Mirroring: how to synchronize device page table with CPU page table. * ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 19/22] mm: always return EBUSY for invalid ranges in hmm_range_{fault,snapshot}
On 6/30/19 11:20 PM, Christoph Hellwig wrote: We should not have two different error codes for the same condition. In addition this really complicates the code due to the special handling of EAGAIN that drops the mmap_sem due to the FAULT_FLAG_ALLOW_RETRY logic in the core vm. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell Probably should update the "Return:" comment above hmm_range_snapshot() too. --- mm/hmm.c | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index c85ed7d4e2ce..d125df698e2b 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -974,7 +974,7 @@ long hmm_range_snapshot(struct hmm_range *range) do { /* If range is no longer valid force retry. */ if (!range->valid) - return -EAGAIN; + return -EBUSY; vma = find_vma(hmm->mm, start); if (vma == NULL || (vma->vm_flags & device_vma)) @@ -1069,10 +1069,8 @@ long hmm_range_fault(struct hmm_range *range, bool block) do { /* If range is no longer valid force retry. */ - if (!range->valid) { - up_read(>mm->mmap_sem); - return -EAGAIN; - } + if (!range->valid) + return -EBUSY; vma = find_vma(hmm->mm, start); if (vma == NULL || (vma->vm_flags & device_vma)) ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 3/9] nouveau: factor out device memory address calculation
On 7/29/19 7:28 AM, Christoph Hellwig wrote: Factor out the repeated device memory address calculation into a helper. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 42 +++--- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index e696157f771e..d469bc334438 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -102,6 +102,14 @@ struct nouveau_migrate { unsigned long dma_nr; }; +static unsigned long nouveau_dmem_page_addr(struct page *page) +{ + struct nouveau_dmem_chunk *chunk = page->zone_device_data; + unsigned long idx = page_to_pfn(page) - chunk->pfn_first; + + return (idx << PAGE_SHIFT) + chunk->bo->bo.offset; +} + static void nouveau_dmem_page_free(struct page *page) { struct nouveau_dmem_chunk *chunk = page->zone_device_data; @@ -169,9 +177,7 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, /* Copy things over */ copy = drm->dmem->migrate.copy_func; for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct nouveau_dmem_chunk *chunk; struct page *spage, *dpage; - u64 src_addr, dst_addr; dpage = migrate_pfn_to_page(dst_pfns[i]); if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) @@ -194,14 +200,10 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, continue; } - dst_addr = fault->dma[fault->npages++]; - - chunk = spage->zone_device_data; - src_addr = page_to_pfn(spage) - chunk->pfn_first; - src_addr = (src_addr << PAGE_SHIFT) + chunk->bo->bo.offset; - - ret = copy(drm, 1, NOUVEAU_APER_HOST, dst_addr, - NOUVEAU_APER_VRAM, src_addr); + ret = copy(drm, 1, NOUVEAU_APER_HOST, + fault->dma[fault->npages++], + NOUVEAU_APER_VRAM, + nouveau_dmem_page_addr(spage)); if (ret) { dst_pfns[i] = MIGRATE_PFN_ERROR; __free_page(dpage); @@ -687,18 +689,12 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, /* Copy things over */ copy = drm->dmem->migrate.copy_func; for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct nouveau_dmem_chunk *chunk; struct page *spage, *dpage; - u64 src_addr, dst_addr; dpage = migrate_pfn_to_page(dst_pfns[i]); if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) continue; - chunk = dpage->zone_device_data; - dst_addr = page_to_pfn(dpage) - chunk->pfn_first; - dst_addr = (dst_addr << PAGE_SHIFT) + chunk->bo->bo.offset; - spage = migrate_pfn_to_page(src_pfns[i]); if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { nouveau_dmem_page_free_locked(drm, dpage); @@ -716,10 +712,10 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, continue; } - src_addr = migrate->dma[migrate->dma_nr++]; - - ret = copy(drm, 1, NOUVEAU_APER_VRAM, dst_addr, - NOUVEAU_APER_HOST, src_addr); + ret = copy(drm, 1, NOUVEAU_APER_VRAM, + nouveau_dmem_page_addr(dpage), + NOUVEAU_APER_HOST, + migrate->dma[migrate->dma_nr++]); if (ret) { nouveau_dmem_page_free_locked(drm, dpage); dst_pfns[i] = 0; @@ -846,7 +842,6 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, npages = (range->end - range->start) >> PAGE_SHIFT; for (i = 0; i < npages; ++i) { - struct nouveau_dmem_chunk *chunk; struct page *page; uint64_t addr; @@ -864,10 +859,7 @@ nouveau_dmem_convert_pfn(struct nouveau_drm *drm, continue; } - chunk = page->zone_device_data; - addr = page_to_pfn(page) - chunk->pfn_first; - addr = (addr + chunk->bo->bo.mem.start) << PAGE_SHIFT; - + addr = nouveau_dmem_page_addr(page); range->pfns[i] &= ((1UL << range->pfn_shift) - 1); range->pfns[i] |= (addr >> PAGE_SHIFT) << range->pfn_shift; } ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 8/9] mm: remove the unused MIGRATE_PFN_DEVICE flag
On 7/29/19 7:28 AM, Christoph Hellwig wrote: No one ever checks this flag, and we could easily get that information from the page if needed. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 +-- include/linux/migrate.h| 1 - mm/migrate.c | 4 ++-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 6cb930755970..f04686a2c21f 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -582,8 +582,7 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, *dma_addr)) goto out_dma_unmap; - return migrate_pfn(page_to_pfn(dpage)) | - MIGRATE_PFN_LOCKED | MIGRATE_PFN_DEVICE; + return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; out_dma_unmap: dma_unmap_page(dev, *dma_addr, PAGE_SIZE, DMA_BIDIRECTIONAL); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 229153c2c496..8b46cfdb1a0e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -166,7 +166,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED(1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) -#define MIGRATE_PFN_DEVICE (1UL << 4) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) diff --git a/mm/migrate.c b/mm/migrate.c index dc4e60a496f2..74735256e260 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2237,8 +2237,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, goto next; page = device_private_entry_to_page(entry); - mpfn = migrate_pfn(page_to_pfn(page))| - MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE; + mpfn = migrate_pfn(page_to_pfn(page)) | + MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) mpfn |= MIGRATE_PFN_WRITE; } else { ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 7/9] mm: remove the unused MIGRATE_PFN_ERROR flag
On 7/29/19 7:28 AM, Christoph Hellwig wrote: We don't use this flag anymore, so remove it. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- include/linux/migrate.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 093d67fcf6dd..229153c2c496 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -167,7 +167,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #define MIGRATE_PFN_LOCKED(1UL << 2) #define MIGRATE_PFN_WRITE (1UL << 3) #define MIGRATE_PFN_DEVICE(1UL << 4) -#define MIGRATE_PFN_ERROR (1UL << 5) #define MIGRATE_PFN_SHIFT 6 The MIGRATE_PFN_SHIFT could be reduced to 5 since it is only used to make room for the flags. static inline struct page *migrate_pfn_to_page(unsigned long mpfn) ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 6/9] nouveau: simplify nouveau_dmem_migrate_vma
On 7/29/19 7:28 AM, Christoph Hellwig wrote: Factor the main copy page to vram routine out into a helper that acts on a single page and which doesn't require the nouveau_dmem_migrate structure for argument passing. As an added benefit the new version only allocates the dma address array once and reuses it for each subsequent chunk of work. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 185 - 1 file changed, 56 insertions(+), 129 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 036e6c07d489..6cb930755970 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -44,8 +44,6 @@ #define DMEM_CHUNK_SIZE (2UL << 20) #define DMEM_CHUNK_NPAGES (DMEM_CHUNK_SIZE >> PAGE_SHIFT) -struct nouveau_migrate; - enum nouveau_aper { NOUVEAU_APER_VIRT, NOUVEAU_APER_VRAM, @@ -86,15 +84,6 @@ static inline struct nouveau_dmem *page_to_dmem(struct page *page) return container_of(page->pgmap, struct nouveau_dmem, pagemap); } -struct nouveau_migrate { - struct vm_area_struct *vma; - struct nouveau_drm *drm; - struct nouveau_fence *fence; - unsigned long npages; - dma_addr_t *dma; - unsigned long dma_nr; -}; - static unsigned long nouveau_dmem_page_addr(struct page *page) { struct nouveau_dmem_chunk *chunk = page->zone_device_data; @@ -569,131 +558,67 @@ nouveau_dmem_init(struct nouveau_drm *drm) drm->dmem = NULL; } -static void -nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, - const unsigned long *src_pfns, - unsigned long *dst_pfns, - unsigned long start, - unsigned long end, - struct nouveau_migrate *migrate) +static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, + struct vm_area_struct *vma, unsigned long addr, + unsigned long src, dma_addr_t *dma_addr) { - struct nouveau_drm *drm = migrate->drm; struct device *dev = drm->dev->dev; - unsigned long addr, i, npages = 0; - nouveau_migrate_copy_t copy; - int ret; - - /* First allocate new memory */ - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct page *dpage, *spage; - - dst_pfns[i] = 0; - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; - - dpage = nouveau_dmem_page_alloc_locked(drm); - if (!dpage) - continue; - - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | - MIGRATE_PFN_LOCKED | - MIGRATE_PFN_DEVICE; - npages++; - } - - if (!npages) - return; - - /* Allocate storage for DMA addresses, so we can unmap later. */ - migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL); - if (!migrate->dma) - goto error; - migrate->dma_nr = 0; - - /* Copy things over */ - copy = drm->dmem->migrate.copy_func; - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct page *spage, *dpage; - - dpage = migrate_pfn_to_page(dst_pfns[i]); - if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) - continue; - - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { - nouveau_dmem_page_free_locked(drm, dpage); - dst_pfns[i] = 0; - continue; - } - - migrate->dma[migrate->dma_nr] = - dma_map_page_attrs(dev, spage, 0, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC); - if (dma_mapping_error(dev, migrate->dma[migrate->dma_nr])) { - nouveau_dmem_page_free_locked(drm, dpage); - dst_pfns[i] = 0; - continue; - } - - ret = copy(drm, 1, NOUVEAU_APER_VRAM, - nouveau_dmem_page_addr(dpage), - NOUVEAU_APER_HOST, - migrate->dma[migrate->dma_nr++]); - if (ret) { - nouveau_dmem_page_free_locked(drm, dpage); - dst_pfns[i] = 0; - continue; -
Re: [PATCH 5/9] nouveau: simplify nouveau_dmem_migrate_to_ram
On 7/29/19 7:28 AM, Christoph Hellwig wrote: Factor the main copy page to ram routine out into a helper that acts on a single page and which doesn't require the nouveau_dmem_fault structure for argument passing. Also remove the loop over multiple pages as we only handle one at the moment, although the structure of the main worker function makes it relatively easy to add multi page support back if needed in the future. But at least for now this avoid the needed to dynamically allocate memory for the dma addresses in what is essentially the page fault path. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 158 ++--- 1 file changed, 39 insertions(+), 119 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 21052a4aaf69..036e6c07d489 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -86,13 +86,6 @@ static inline struct nouveau_dmem *page_to_dmem(struct page *page) return container_of(page->pgmap, struct nouveau_dmem, pagemap); } -struct nouveau_dmem_fault { - struct nouveau_drm *drm; - struct nouveau_fence *fence; - dma_addr_t *dma; - unsigned long npages; -}; - struct nouveau_migrate { struct vm_area_struct *vma; struct nouveau_drm *drm; @@ -146,130 +139,55 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence) } } -static void -nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, - const unsigned long *src_pfns, - unsigned long *dst_pfns, - unsigned long start, - unsigned long end, - struct nouveau_dmem_fault *fault) +static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, + struct vm_area_struct *vma, unsigned long addr, + unsigned long src, unsigned long *dst, dma_addr_t *dma_addr) { - struct nouveau_drm *drm = fault->drm; struct device *dev = drm->dev->dev; - unsigned long addr, i, npages = 0; - nouveau_migrate_copy_t copy; - int ret; - - - /* First allocate new memory */ - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct page *dpage, *spage; - - dst_pfns[i] = 0; - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; + struct page *dpage, *spage; - dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr); - if (!dpage) { - dst_pfns[i] = MIGRATE_PFN_ERROR; - continue; - } - lock_page(dpage); - - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | - MIGRATE_PFN_LOCKED; - npages++; - } + spage = migrate_pfn_to_page(src); + if (!spage || !(src & MIGRATE_PFN_MIGRATE)) + return 0; - /* Allocate storage for DMA addresses, so we can unmap later. */ - fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL); - if (!fault->dma) + dpage = alloc_page_vma(GFP_HIGHUSER, args->vma, addr); + if (!dpage) goto error; + lock_page(dpage); - /* Copy things over */ - copy = drm->dmem->migrate.copy_func; - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct page *spage, *dpage; - - dpage = migrate_pfn_to_page(dst_pfns[i]); - if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) - continue; - - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { - dst_pfns[i] = MIGRATE_PFN_ERROR; - __free_page(dpage); - continue; - } - - fault->dma[fault->npages] = - dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC); - if (dma_mapping_error(dev, fault->dma[fault->npages])) { - dst_pfns[i] = MIGRATE_PFN_ERROR; - __free_page(dpage); - continue; - } - - ret = copy(drm, 1, NOUVEAU_APER_HOST, - fault->dma[fault->npages++], - NOUVEAU_APER_VRAM, - nouveau_dmem_page_addr(spage)); - if (ret) { - dst_pfns[i] = MIGRATE_PFN_ERROR; -
Re: [PATCH 9/9] mm: remove the MIGRATE_PFN_WRITE flag
On 7/29/19 7:28 AM, Christoph Hellwig wrote: The MIGRATE_PFN_WRITE is only used locally in migrate_vma_collect_pmd, where it can be replaced with a simple boolean local variable. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- include/linux/migrate.h | 1 - mm/migrate.c| 9 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 8b46cfdb1a0e..ba74ef5a7702 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -165,7 +165,6 @@ static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm, #define MIGRATE_PFN_VALID (1UL << 0) #define MIGRATE_PFN_MIGRATE (1UL << 1) #define MIGRATE_PFN_LOCKED(1UL << 2) -#define MIGRATE_PFN_WRITE (1UL << 3) #define MIGRATE_PFN_SHIFT 6 static inline struct page *migrate_pfn_to_page(unsigned long mpfn) diff --git a/mm/migrate.c b/mm/migrate.c index 74735256e260..724f92dcc31b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2212,6 +2212,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, unsigned long mpfn, pfn; struct page *page; swp_entry_t entry; + bool writable = false; pte_t pte; pte = *ptep; @@ -2240,7 +2241,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, mpfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; if (is_write_device_private_entry(entry)) - mpfn |= MIGRATE_PFN_WRITE; + writable = true; } else { if (is_zero_pfn(pfn)) { mpfn = MIGRATE_PFN_MIGRATE; @@ -2250,7 +2251,8 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, } page = vm_normal_page(migrate->vma, addr, pte); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; - mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; + if (pte_write(pte)) + writable = true; } /* FIXME support THP */ @@ -2284,8 +2286,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, ptep_get_and_clear(mm, addr, ptep); /* Setup special migration page table entry */ - entry = make_migration_entry(page, mpfn & -MIGRATE_PFN_WRITE); + entry = make_migration_entry(page, writable); swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); MIGRATE_PFN_WRITE may mot being used but that seems like a bug to me. If a page is migrated to device memory, it could be mapped at the same time to avoid a device page fault but it would need the flag to know whether to map it RW or RO. But I suppose that could be inferred from the vma->vm_flags. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 2/9] nouveau: reset dma_nr in nouveau_dmem_migrate_alloc_and_copy
On 7/29/19 7:28 AM, Christoph Hellwig wrote: When we start a new batch of dma_map operations we need to reset dma_nr, as we start filling a newly allocated array. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 38416798abd4..e696157f771e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -682,6 +682,7 @@ nouveau_dmem_migrate_alloc_and_copy(struct vm_area_struct *vma, migrate->dma = kmalloc(sizeof(*migrate->dma) * npages, GFP_KERNEL); if (!migrate->dma) goto error; + migrate->dma_nr = 0; /* Copy things over */ copy = drm->dmem->migrate.copy_func; ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 1/9] mm: turn migrate_vma upside down
On 7/29/19 7:28 AM, Christoph Hellwig wrote: There isn't any good reason to pass callbacks to migrate_vma. Instead we can just export the three steps done by this function to drivers and let them sequence the operation without callbacks. This removes a lot of boilerplate code as-is, and will allow the drivers to drastically improve code flow and error handling further on. Signed-off-by: Christoph Hellwig Except for a few white space errors ( and $), looks OK. Reviewed-by: Ralph Campbell --- Documentation/vm/hmm.rst | 55 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 122 +++-- include/linux/migrate.h| 118 ++-- mm/migrate.c | 242 +++-- 4 files changed, 193 insertions(+), 344 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index ddcb5ca8b296..ad880e3996b1 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -339,58 +339,9 @@ Migration to and from device memory === Because the CPU cannot access device memory, migration must use the device DMA -engine to perform copy from and to device memory. For this we need a new -migration helper:: - - int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long mentries, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); - -Unlike other migration functions it works on a range of virtual address, there -are two reasons for that. First, device DMA copy has a high setup overhead cost -and thus batching multiple pages is needed as otherwise the migration overhead -makes the whole exercise pointless. The second reason is because the -migration might be for a range of addresses the device is actively accessing. - -The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) -controls destination memory allocation and copy operation. Second one is there -to allow the device driver to perform cleanup operations after migration:: - - struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, -const unsigned long *src, -unsigned long *dst, -unsigned long start, -unsigned long end, -void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - }; - -It is important to stress that these migration helpers allow for holes in the -virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pinned, page is locked, ...). This helper does not -fail but just skips over those pages. - -The alloc_and_copy() might decide to not migrate all pages in the -range (for reasons under the callback control). For those, the callback just -has to leave the corresponding dst entry empty. - -Finally, the migration of the struct page might fail (for file backed page) for -various reasons (failure to freeze reference, or update page cache, ...). If -that happens, then the finalize_and_map() can catch any pages that were not -migrated. Note those pages were still copied to a new page and thus we wasted -bandwidth but this is considered as a rare event and a price that we are -willing to pay to keep all the code simpler. +engine to perform copy from and to device memory. For this we need a new to +use migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() +helpers. Memory cgroup (memcg) and rss accounting diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 345c63cb752a..38416798abd4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -131,9 +131,8 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, unsigned long *dst_pfns, unsigned long start, unsigned long end, - void *private) + struct nouveau_dmem_fault *fault) { - struct nouveau_dmem_fault *fault = private; struct nouveau_drm *drm = fault->drm; struct device *dev = drm->dev->dev; unsigned long addr, i, npages = 0; @@ -230,14 +229,9 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, } } -void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, -
Re: [PATCH 4/9] nouveau: factor out dmem fence completion
On 7/29/19 7:28 AM, Christoph Hellwig wrote: Factor out the end of fencing logic from the two migration routines. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 33 -- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index d469bc334438..21052a4aaf69 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -133,6 +133,19 @@ static void nouveau_dmem_page_free(struct page *page) spin_unlock(>lock); } +static void nouveau_dmem_fence_done(struct nouveau_fence **fence) +{ + if (fence) { + nouveau_fence_wait(*fence, true, false); + nouveau_fence_unref(fence); + } else { + /* +* FIXME wait for channel to be IDLE before calling finalizing +* the hmem object. +*/ + } +} + static void nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, const unsigned long *src_pfns, @@ -236,15 +249,7 @@ nouveau_dmem_fault_finalize_and_map(struct nouveau_dmem_fault *fault) { struct nouveau_drm *drm = fault->drm; - if (fault->fence) { - nouveau_fence_wait(fault->fence, true, false); - nouveau_fence_unref(>fence); - } else { - /* -* FIXME wait for channel to be IDLE before calling finalizing -* the hmem object below (nouveau_migrate_hmem_fini()). -*/ - } + nouveau_dmem_fence_done(>fence); while (fault->npages--) { dma_unmap_page(drm->dev->dev, fault->dma[fault->npages], @@ -748,15 +753,7 @@ nouveau_dmem_migrate_finalize_and_map(struct nouveau_migrate *migrate) { struct nouveau_drm *drm = migrate->drm; - if (migrate->fence) { - nouveau_fence_wait(migrate->fence, true, false); - nouveau_fence_unref(>fence); - } else { - /* -* FIXME wait for channel to be IDLE before finalizing -* the hmem object below (nouveau_migrate_hmem_fini()) ? -*/ - } + nouveau_dmem_fence_done(>fence); while (migrate->dma_nr--) { dma_unmap_page(drm->dev->dev, migrate->dma[migrate->dma_nr], ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v3 hmm 11/11] mm/mmu_notifiers: remove unregister_no_release
On 8/6/19 4:15 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe mmu_notifier_unregister_no_release() and mmu_notifier_call_srcu() no longer have any users, they have all been converted to use mmu_notifier_put(). So delete this difficult to use interface. Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 1/5] mm: Check if mmu notifier callbacks are allowed to fail
On 8/14/19 3:14 PM, Andrew Morton wrote: On Wed, 14 Aug 2019 22:20:23 +0200 Daniel Vetter wrote: Just a bit of paranoia, since if we start pushing this deep into callchains it's hard to spot all places where an mmu notifier implementation might fail when it's not allowed to. Inspired by some confusion we had discussing i915 mmu notifiers and whether we could use the newly-introduced return value to handle some corner cases. Until we realized that these are only for when a task has been killed by the oom reaper. An alternative approach would be to split the callback into two versions, one with the int return value, and the other with void return value like in older kernels. But that's a lot more churn for fairly little gain I think. Summary from the m-l discussion on why we want something at warning level: This allows automated tooling in CI to catch bugs without humans having to look at everything. If we just upgrade the existing pr_info to a pr_warn, then we'll have false positives. And as-is, no one will ever spot the problem since it's lost in the massive amounts of overall dmesg noise. ... --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -179,6 +179,8 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) pr_info("%pS callback failed with %d in %sblockable context.\n", mn->ops->invalidate_range_start, _ret, !mmu_notifier_range_blockable(range) ? "non-" : ""); + WARN_ON(mmu_notifier_range_blockable(range) || + ret != -EAGAIN); ret = _ret; } } A problem with WARN_ON(a || b) is that if it triggers, we don't know whether it was because of a or because of b. Or both. So I'd suggest WARN_ON(a); WARN_ON(b); This won't quite work. It is OK to have mmu_notifier_range_blockable(range) be true or false. sync_cpu_device_pagetables() shouldn't return -EAGAIN unless blockable is true. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v3 hmm 00/11] Add mmu_notifier_get/put for managing mmu notifier registrations
On 8/6/19 4:15 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe This series introduces a new registration flow for mmu_notifiers based on the idea that the user would like to get a single refcounted piece of memory for a mm, keyed to its use. For instance many users of mmu_notifiers use an interval tree or similar to dispatch notifications to some object. There are many objects but only one notifier subscription per mm holding the tree. Of the 12 places that call mmu_notifier_register: - 7 are maintaining some kind of obvious mapping of mm_struct to mmu_notifier registration, ie in some linked list or hash table. Of the 7 this series converts 4 (gru, hmm, RDMA, radeon) - 3 (hfi1, gntdev, vhost) are registering multiple notifiers, but each one immediately does some VA range filtering, ie with an interval tree. These would be better with a global subsystem-wide range filter and could convert to this API. - 2 (kvm, amd_iommu) are deliberately using a single mm at a time, and really can't use this API. One of the intel-svm's modes is also in this list The 3/7 unconverted drivers are: - intel-svm This driver tracks mm's in a global linked list 'global_svm_list' and would benefit from this API. Its flow is a bit complex, since it also wants a set of non-shared notifiers. - i915_gem_usrptr This driver tracks mm's in a per-device hash table (dev_priv->mm_structs), but only has an optional use of mmu_notifiers. Since it still seems to need the hash table it is difficult to convert. - amdkfd/kfd_process This driver is using a global SRCU hash table to track mm's The control flow here is very complicated and the driver is relying on this hash table to be fast on the ioctl syscall path. It would definitely benefit, but only if the ioctl path didn't need to do the search so often. This series is already entangled with patches in the hmm & RDMA tree and will require some git topic branches for the RDMA ODP stuff. I intend for it to go through the hmm tree. There is a git version here: https://github.com/jgunthorpe/linux/commits/mmu_notifier Which has the required pre-patches for the RDMA ODP conversion that are still being reviewed. Jason Gunthorpe (11): mm/mmu_notifiers: hoist do_mmu_notifier_register down_write to the caller mm/mmu_notifiers: do not speculatively allocate a mmu_notifier_mm mm/mmu_notifiers: add a get/put scheme for the registration misc/sgi-gru: use mmu_notifier_get/put for struct gru_mm_struct hmm: use mmu_notifier_get/put for 'struct hmm' RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm' RDMA/odp: remove ib_ucontext from ib_umem drm/radeon: use mmu_notifier_get/put for struct radeon_mn drm/amdkfd: fix a use after free race with mmu_notifer unregister drm/amdkfd: use mmu_notifier_put mm/mmu_notifiers: remove unregister_no_release drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 3 - drivers/gpu/drm/amd/amdkfd/kfd_process.c | 88 - drivers/gpu/drm/nouveau/nouveau_drm.c| 3 + drivers/gpu/drm/radeon/radeon.h | 3 - drivers/gpu/drm/radeon/radeon_device.c | 2 - drivers/gpu/drm/radeon/radeon_drv.c | 2 + drivers/gpu/drm/radeon/radeon_mn.c | 157 drivers/infiniband/core/umem.c | 4 +- drivers/infiniband/core/umem_odp.c | 183 ++ drivers/infiniband/core/uverbs_cmd.c | 3 - drivers/infiniband/core/uverbs_main.c| 1 + drivers/infiniband/hw/mlx5/main.c| 5 - drivers/misc/sgi-gru/grufile.c | 1 + drivers/misc/sgi-gru/grutables.h | 2 - drivers/misc/sgi-gru/grutlbpurge.c | 84 +++-- include/linux/hmm.h | 12 +- include/linux/mm_types.h | 6 - include/linux/mmu_notifier.h | 40 +++- include/rdma/ib_umem.h | 2 +- include/rdma/ib_umem_odp.h | 10 +- include/rdma/ib_verbs.h | 3 - kernel/fork.c| 1 - mm/hmm.c | 121 +++- mm/mmu_notifier.c| 230 +-- 25 files changed, 408 insertions(+), 559 deletions(-) For the core MM, HMM, and nouveau changes you can add: Tested-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v3 hmm 03/11] mm/mmu_notifiers: add a get/put scheme for the registration
On 8/6/19 4:15 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe Many places in the kernel have a flow where userspace will create some object and that object will need to connect to the subsystem's mmu_notifier subscription for the duration of its lifetime. In this case the subsystem is usually tracking multiple mm_structs and it is difficult to keep track of what struct mmu_notifier's have been allocated for what mm's. Since this has been open coded in a variety of exciting ways, provide core functionality to do this safely. This approach uses the strct mmu_notifier_ops * as a key to determine if s/strct/struct the subsystem has a notifier registered on the mm or not. If there is a registration then the existing notifier struct is returned, otherwise the ops->alloc_notifiers() is used to create a new per-subsystem notifier for the mm. The destroy side incorporates an async call_srcu based destruction which will avoid bugs in the callers such as commit 6d7c3cde93c1 ("mm/hmm: fix use after free with struct hmm in the mmu notifiers"). Since we are inside the mmu notifier core locking is fairly simple, the allocation uses the same approach as for mmu_notifier_mm, the write side of the mmap_sem makes everything deterministic and we only need to do hlist_add_head_rcu() under the mm_take_all_locks(). The new users count and the discoverability in the hlist is fully serialized by the mmu_notifier_mm->lock. Co-developed-by: Christoph Hellwig Signed-off-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell --- include/linux/mmu_notifier.h | 35 mm/mmu_notifier.c| 156 +-- 2 files changed, 185 insertions(+), 6 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6ad9..31aa971315a142 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -211,6 +211,19 @@ struct mmu_notifier_ops { */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); + + /* +* These callbacks are used with the get/put interface to manage the +* lifetime of the mmu_notifier memory. alloc_notifier() returns a new +* notifier for use with the mm. +* +* free_notifier() is only called after the mmu_notifier has been +* fully put, calls to any ops callback are prevented and no ops +* callbacks are currently running. It is called from a SRCU callback +* and cannot sleep. +*/ + struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); + void (*free_notifier)(struct mmu_notifier *mn); }; /* @@ -227,6 +240,9 @@ struct mmu_notifier_ops { struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; + struct mm_struct *mm; + struct rcu_head rcu; + unsigned int users; }; static inline int mm_has_notifiers(struct mm_struct *mm) @@ -234,6 +250,21 @@ static inline int mm_has_notifiers(struct mm_struct *mm) return unlikely(mm->mmu_notifier_mm); } +struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, +struct mm_struct *mm); +static inline struct mmu_notifier * +mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) +{ + struct mmu_notifier *ret; + + down_write(>mmap_sem); + ret = mmu_notifier_get_locked(ops, mm); + up_write(>mmap_sem); + return ret; +} +void mmu_notifier_put(struct mmu_notifier *mn); +void mmu_notifier_synchronize(void); + extern int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *mn, @@ -581,6 +612,10 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) #define pudp_huge_clear_flush_notify pudp_huge_clear_flush #define set_pte_at_notify set_pte_at +static inline void mmu_notifier_synchronize(void) +{ +} + #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 696810f632ade1..4a770b5211b71d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -248,6 +248,9 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) lockdep_assert_held_write(>mmap_sem); BUG_ON(atomic_read(>mm_users) <= 0); + mn->mm = mm; + mn->users = 1; + if (!mm->mmu_notifier_mm) { /* * kmalloc cannot be called under mm_take_all_locks(), but we @@ -295,18 +298,24 @@ int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmu_notifier_register); -/* +/** + * mmu_notifier_register - Register a notif
Re: [PATCH v3 hmm 02/11] mm/mmu_notifiers: do not speculatively allocate a mmu_notifier_mm
On 8/6/19 4:15 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe A prior commit e0f3c3f78da2 ("mm/mmu_notifier: init notifier if necessary") made an attempt at doing this, but had to be reverted as calling the GFP_KERNEL allocator under the i_mmap_mutex causes deadlock, see commit 35cfa2b0b491 ("mm/mmu_notifier: allocate mmu_notifier in advance"). However, we can avoid that problem by doing the allocation only under the mmap_sem, which is already happening. Since all writers to mm->mmu_notifier_mm hold the write side of the mmap_sem reading it under that sem is deterministic and we can use that to decide if the allocation path is required, without speculation. The actual update to mmu_notifier_mm must still be done under the mm_take_all_locks() to ensure read-side coherency. Signed-off-by: Jason Gunthorpe Looks good to me. Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v3 hmm 05/11] hmm: use mmu_notifier_get/put for 'struct hmm'
On 8/6/19 4:15 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe This is a significant simplification, it eliminates all the remaining 'hmm' stuff in mm_struct, eliminates krefing along the critical notifier paths, and takes away all the ugly locking and abuse of page_table_lock. mmu_notifier_get() provides the single struct hmm per struct mm which eliminates mm->hmm. It also directly guarantees that no mmu_notifier op callback is callable while concurrent free is possible, this eliminates all the krefs inside the mmu_notifier callbacks. The remaining krefs in the range code were overly cautious, drivers are already not permitted to free the mirror while a range exists. Signed-off-by: Jason Gunthorpe Looks good. Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: turn hmm migrate_vma upside down v3
On 8/14/19 12:59 AM, Christoph Hellwig wrote: Hi Jérôme, Ben and Jason, below is a series against the hmm tree which starts revamping the migrate_vma functionality. The prime idea is to export three slightly lower level functions and thus avoid the need for migrate_vma_ops callbacks. Diffstat: 7 files changed, 282 insertions(+), 614 deletions(-) A git tree is also available at: git://git.infradead.org/users/hch/misc.git migrate_vma-cleanup.3 Gitweb: http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/migrate_vma-cleanup.3 Changes since v2: - don't unmap pages when returning 0 from nouveau_dmem_migrate_to_ram - minor style fixes - add a new patch to remove CONFIG_MIGRATE_VMA_HELPER Changes since v1: - fix a few whitespace issues - drop the patch to remove MIGRATE_PFN_WRITE for now - various spelling fixes - clear cpages and npages in migrate_vma_setup - fix the nouveau_dmem_fault_copy_one return value - minor improvements to some nouveau internal calling conventions Some of the patches seem to have been mangled in the mail. I was able to edit them and apply to Jason's tree https://github.com/jgunthorpe/linux.git mmu_notifier branch. So for the series you can add: Tested-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v3 hmm 01/11] mm/mmu_notifiers: hoist do_mmu_notifier_register down_write to the caller
On 8/6/19 4:15 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe This simplifies the code to not have so many one line functions and extra logic. __mmu_notifier_register() simply becomes the entry point to register the notifier, and the other one calls it under lock. Also add a lockdep_assert to check that the callers are holding the lock as expected. Suggested-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe Nice clean up. Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH] nouveau/hmm: map pages after migration
On 8/10/19 4:13 AM, Christoph Hellwig wrote: On something vaguely related to this patch: You use the NVIF_VMM_PFNMAP_V0_V* defines from nvif/if000c.h, which are a little odd as we only ever set these bits, but they also don't seem to appear to be in values that are directly fed to the hardware. On the other hand mmu/vmm.h defines a set of NVIF_VMM_PFNMAP_V0_* Yes, I see NVKM_VMM_PFN_* constants with similar names and identical values, and those are used in mmu/vmmgp100.c and what appears to finally do the low-level dma mapping and talking to the hardware. Are these two sets of constants supposed to be the same? Are the actual hardware values or just a driver internal interface? It looks a bit odd to me too. I don't really know the structure/history of nouveau. Perhaps Ben Skeggs can shed more light on your question. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 04/15] mm: remove the pgmap field from struct hmm_vma_walk
On 8/16/19 10:28 AM, Jason Gunthorpe wrote: On Fri, Aug 16, 2019 at 10:21:41AM -0700, Dan Williams wrote: We can do a get_dev_pagemap inside the page_walk and touch the pgmap, or we can do the 'device mutex && retry' pattern and touch the pgmap in the driver, under that lock. However in all cases the current get_dev_pagemap()'s in the page walk are not necessary, and we can delete them. Yes, as long as 'struct page' instances resulting from that lookup are not passed outside of that lock. Indeed. Also, I was reflecting over lunch that the hmm_range_fault should only return DEVICE_PRIVATE pages for the caller's device (see other thread with HCH), and in this case, the caller should also be responsible to ensure that the driver is not calling hmm_range_fault at the same time it is deleting it's own DEVICE_PRIVATE mapping - ie by fencing its page fault handler. Yes, that would make it a one step process to access another device's migrated memory pages. Right now, it has to be a two step process where the caller calls hmm_range_fault, check the struct page to see if it is device private and not owned, then call hmm_range_fault again with range->pfns[i] |= range->flags[HMM_PFN_DEVICE_PRIVATE] to cause the other device to migrate the page back to system memory. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 1/2] mm/hmm: hmm_range_fault() NULL pointer bug
On 8/26/19 11:09 AM, Jason Gunthorpe wrote: On Mon, Aug 26, 2019 at 11:02:12AM -0700, Ralph Campbell wrote: On 8/24/19 3:37 PM, Christoph Hellwig wrote: On Fri, Aug 23, 2019 at 03:17:52PM -0700, Ralph Campbell wrote: Although hmm_range_fault() calls find_vma() to make sure that a vma exists before calling walk_page_range(), hmm_vma_walk_hole() can still be called with walk->vma == NULL if the start and end address are not contained within the vma range. Should we convert to walk_vma_range instead? Or keep walk_page_range but drop searching the vma ourselves? Except for that the patch looks good to me: Reviewed-by: Christoph Hellwig I think keeping the call to walk_page_range() makes sense. Jason is hoping to be able to snapshot a range with & without vmas and have the pfns[] filled with empty/valid entries as appropriate. I plan to repost my patch changing hmm_range_fault() to use walk.test_walk which will remove the call to find_vma(). Jason had some concerns about testing it so that's why I have been working on some HMM self tests before resending it. I'm really excited to see tests for hmm_range_fault()! Did you find this bug with the tests?? Jason Yes, I found both bugs with the tests. I started with Jerome's hmm_dummy driver and user level test code. Hopefully I can send it out this week. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 1/2] mm/hmm: hmm_range_fault() NULL pointer bug
On 8/24/19 3:37 PM, Christoph Hellwig wrote: On Fri, Aug 23, 2019 at 03:17:52PM -0700, Ralph Campbell wrote: Although hmm_range_fault() calls find_vma() to make sure that a vma exists before calling walk_page_range(), hmm_vma_walk_hole() can still be called with walk->vma == NULL if the start and end address are not contained within the vma range. Should we convert to walk_vma_range instead? Or keep walk_page_range but drop searching the vma ourselves? Except for that the patch looks good to me: Reviewed-by: Christoph Hellwig I think keeping the call to walk_page_range() makes sense. Jason is hoping to be able to snapshot a range with & without vmas and have the pfns[] filled with empty/valid entries as appropriate. I plan to repost my patch changing hmm_range_fault() to use walk.test_walk which will remove the call to find_vma(). Jason had some concerns about testing it so that's why I have been working on some HMM self tests before resending it. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 2/2] mm/hmm: hmm_range_fault() infinite loop
On 8/27/19 11:41 AM, Jason Gunthorpe wrote: On Fri, Aug 23, 2019 at 03:17:53PM -0700, Ralph Campbell wrote: Signed-off-by: Ralph Campbell mm/hmm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hmm.c b/mm/hmm.c index 29371485fe94..4882b83aeccb 100644 +++ b/mm/hmm.c @@ -292,6 +292,9 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; + if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) + return -EPERM; Can walk->vma be NULL here? hmm_vma_do_fault() touches it unconditionally. Jason walk->vma can be NULL. hmm_vma_do_fault() no longer touches it unconditionally, that is what the preceding patch fixes. I suppose I could change hmm_vma_walk_hole_() to check for NULL and fill in the pfns[] array, I just chose to handle it in hmm_vma_do_fault(). ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 1/2] mm/hmm: hmm_range_fault() NULL pointer bug
Although hmm_range_fault() calls find_vma() to make sure that a vma exists before calling walk_page_range(), hmm_vma_walk_hole() can still be called with walk->vma == NULL if the start and end address are not contained within the vma range. hmm_range_fault() /* calls find_vma() but no range check */ walk_page_range() /* calls find_vma(), sets walk->vma = NULL */ __walk_page_range() walk_pgd_range() walk_p4d_range() walk_pud_range() hmm_vma_walk_hole() hmm_vma_walk_hole_() hmm_vma_do_fault() handle_mm_fault(vma=0) Signed-off-by: Ralph Campbell --- mm/hmm.c | 13 + 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index fc05c8fe78b4..29371485fe94 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -229,6 +229,9 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; + if (!vma) + goto err; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) flags |= FAULT_FLAG_ALLOW_RETRY; if (write_fault) @@ -239,12 +242,14 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, /* Note, handle_mm_fault did up_read(>mmap_sem)) */ return -EAGAIN; } - if (ret & VM_FAULT_ERROR) { - *pfn = range->values[HMM_PFN_ERROR]; - return -EFAULT; - } + if (ret & VM_FAULT_ERROR) + goto err; return -EBUSY; + +err: + *pfn = range->values[HMM_PFN_ERROR]; + return -EFAULT; } static int hmm_pfns_bad(unsigned long addr, -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 2/2] mm/hmm: hmm_range_fault() infinite loop
Normally, callers to handle_mm_fault() are supposed to check the vma->vm_flags first. hmm_range_fault() checks for VM_READ but doesn't check for VM_WRITE if the caller requests a page to be faulted in with write permission (via the hmm_range.pfns[] value). If the vma is write protected, this can result in an infinite loop: hmm_range_fault() walk_page_range() ... hmm_vma_walk_hole() hmm_vma_walk_hole_() hmm_vma_do_fault() handle_mm_fault(FAULT_FLAG_WRITE) /* returns VM_FAULT_WRITE */ /* returns -EBUSY */ /* returns -EBUSY */ /* returns -EBUSY */ /* loops on -EBUSY and range->valid */ Prevent this by checking for vma->vm_flags & VM_WRITE before calling handle_mm_fault(). Signed-off-by: Ralph Campbell --- mm/hmm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hmm.c b/mm/hmm.c index 29371485fe94..4882b83aeccb 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -292,6 +292,9 @@ static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, hmm_vma_walk->last = addr; i = (addr - range->start) >> PAGE_SHIFT; + if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE)) + return -EPERM; + for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = range->values[HMM_PFN_NONE]; if (fault || write_fault) { -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 0/2] mm/hmm: two bug fixes for hmm_range_fault()
I have been working on converting Jerome's hmm_dummy driver and self tests into a stand-alone set of tests to be included in tools/testing/selftests/vm and came across these two bug fixes in the process. The tests aren't quite ready to be posted as a patch. I'm posting the fixes now since I thought they shouldn't wait. They should probably have a fixes line but with all the HMM changes, I wasn't sure exactly which commit to use. These are based on top of Jason's latest hmm branch. Ralph Campbell (2): mm/hmm: hmm_range_fault() NULL pointer bug mm/hmm: hmm_range_fault() infinite loop mm/hmm.c | 16 1 file changed, 12 insertions(+), 4 deletions(-) -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 2/4] mm/hmm: allow snapshot of the special zero page
Allow hmm_range_fault() to return success (0) when the CPU pagetable entry points to the special shared zero page. The caller can then handle the zero page by possibly clearing device private memory instead of DMAing a zero page. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hmm.c b/mm/hmm.c index 06041d4399ff..7217912bef13 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -532,7 +532,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, return -EBUSY; } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { *pfn = range->values[HMM_PFN_SPECIAL]; - return -EFAULT; + return is_zero_pfn(pte_pfn(pte)) ? 0 : -EFAULT; } *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags; -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 1/4] mm/hmm: make full use of walk_page_range()
hmm_range_fault() calls find_vma() and walk_page_range() in a loop. This is unnecessary duplication since walk_page_range() calls find_vma() in a loop already. Simplify hmm_range_fault() by defining a walk_test() callback function to filter unhandled vmas. This also fixes a bug where hmm_range_fault() was not checking start >= vma->vm_start before checking vma->vm_flags so hmm_range_fault() could return an error based on the wrong vma for the requested range. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 113 --- 1 file changed, 50 insertions(+), 63 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 902f5fa6bf93..06041d4399ff 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -252,18 +252,17 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, return -EFAULT; } -static int hmm_pfns_bad(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_pfns_fill(unsigned long addr, +unsigned long end, +struct hmm_range *range, +enum hmm_pfn_value_e value) { - struct hmm_vma_walk *hmm_vma_walk = walk->private; - struct hmm_range *range = hmm_vma_walk->range; uint64_t *pfns = range->pfns; unsigned long i; i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) - pfns[i] = range->values[HMM_PFN_ERROR]; + pfns[i] = range->values[value]; return 0; } @@ -584,7 +583,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, } return 0; } else if (!pmd_present(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) { /* @@ -612,7 +611,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, * recover. */ if (pmd_bad(pmd)) - return hmm_pfns_bad(start, end, walk); + return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR); ptep = pte_offset_map(pmdp, addr); i = (addr - range->start) >> PAGE_SHIFT; @@ -770,13 +769,36 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, #define hmm_vma_walk_hugetlb_entry NULL #endif /* CONFIG_HUGETLB_PAGE */ -static void hmm_pfns_clear(struct hmm_range *range, - uint64_t *pfns, - unsigned long addr, - unsigned long end) +static int hmm_vma_walk_test(unsigned long start, +unsigned long end, +struct mm_walk *walk) { - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = range->values[HMM_PFN_NONE]; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + + /* If range is no longer valid, force retry. */ + if (!range->valid) + return -EBUSY; + + /* +* Skip vma ranges that don't have struct page backing them or +* map I/O devices directly. +*/ + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) + return -EFAULT; + + /* +* If the vma does not allow read access, then assume that it does not +* allow write access either. HMM does not support architectures +* that allow write without read. +*/ + if (!(vma->vm_flags & VM_READ)) { + (void) hmm_pfns_fill(start, end, range, HMM_PFN_NONE); + return -EPERM; + } + + return 0; } /* @@ -857,6 +879,7 @@ static const struct mm_walk_ops hmm_walk_ops = { .pmd_entry = hmm_vma_walk_pmd, .pte_hole = hmm_vma_walk_hole, .hugetlb_entry = hmm_vma_walk_hugetlb_entry, + .test_walk = hmm_vma_walk_test, }; /** @@ -889,63 +912,27 @@ static const struct mm_walk_ops hmm_walk_ops = { */ long hmm_range_fault(struct hmm_range *range, unsigned int flags) { - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; + unsigned long start = range->start; + struct hmm_vma_walk hmm_vma_walk = { + .range = range, + .last = start, + .flags = flags, + }; struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; int ret; lockdep_assert_held(>mmu_notifier.mm->mmap_sem); do { - /* If range is no longer valid force retry. */ - if (!rang
[PATCH 0/4] HMM tests and minor fixes
These changes are based on Jason's latest hmm branch. Patch 1 was previously posted here [1] but was dropped from the orginal series. Hopefully, the tests will reduce concerns about edge conditions. I'm sure more tests could be usefully added but I thought this was a good starting point. [1] https://lore.kernel.org/linux-mm/20190726005650.2566-6-rcampb...@nvidia.com/ Ralph Campbell (4): mm/hmm: make full use of walk_page_range() mm/hmm: allow snapshot of the special zero page mm/hmm: allow hmm_range_fault() of mmap(PROT_NONE) mm/hmm/test: add self tests for HMM MAINTAINERS|3 + drivers/char/Kconfig | 11 + drivers/char/Makefile |1 + drivers/char/hmm_dmirror.c | 1504 include/Kbuild |1 + include/uapi/linux/hmm_dmirror.h | 74 ++ mm/hmm.c | 117 +- tools/testing/selftests/vm/.gitignore |1 + tools/testing/selftests/vm/Makefile|3 + tools/testing/selftests/vm/config |3 + tools/testing/selftests/vm/hmm-tests.c | 1304 tools/testing/selftests/vm/run_vmtests | 16 + tools/testing/selftests/vm/test_hmm.sh | 105 ++ 13 files changed, 3079 insertions(+), 64 deletions(-) create mode 100644 drivers/char/hmm_dmirror.c create mode 100644 include/uapi/linux/hmm_dmirror.h create mode 100644 tools/testing/selftests/vm/hmm-tests.c create mode 100755 tools/testing/selftests/vm/test_hmm.sh -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 3/4] mm/hmm: allow hmm_range_fault() of mmap(PROT_NONE)
Allow hmm_range_fault() to return success (0) when the range has no access (!(vma->vm_flags & VM_READ)). The range->pfns[] array will be filled with range->values[HMM_PFN_NONE] in this case. This allows the caller to get a snapshot of a range without having to lookup the vma before calling hmm_range_fault(). If the call to hmm_range_fault() is not a snapshot, the caller can still check that pfns have the desired access permissions. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/hmm.c b/mm/hmm.c index 7217912bef13..16c834e5d1c0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -795,7 +795,9 @@ static int hmm_vma_walk_test(unsigned long start, */ if (!(vma->vm_flags & VM_READ)) { (void) hmm_pfns_fill(start, end, range, HMM_PFN_NONE); - return -EPERM; + + /* Skip this vma and continue processing the next vma. */ + return 1; } return 0; -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 4/4] mm/hmm/test: add self tests for HMM
Add self tests for HMM. Signed-off-by: Ralph Campbell --- MAINTAINERS|3 + drivers/char/Kconfig | 11 + drivers/char/Makefile |1 + drivers/char/hmm_dmirror.c | 1504 include/Kbuild |1 + include/uapi/linux/hmm_dmirror.h | 74 ++ tools/testing/selftests/vm/.gitignore |1 + tools/testing/selftests/vm/Makefile|3 + tools/testing/selftests/vm/config |3 + tools/testing/selftests/vm/hmm-tests.c | 1304 tools/testing/selftests/vm/run_vmtests | 16 + tools/testing/selftests/vm/test_hmm.sh | 105 ++ 12 files changed, 3026 insertions(+) create mode 100644 drivers/char/hmm_dmirror.c create mode 100644 include/uapi/linux/hmm_dmirror.h create mode 100644 tools/testing/selftests/vm/hmm-tests.c create mode 100755 tools/testing/selftests/vm/test_hmm.sh diff --git a/MAINTAINERS b/MAINTAINERS index 43604d6ab96c..8ab242d91876 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7318,8 +7318,11 @@ M: Jérôme Glisse L: linux...@kvack.org S: Maintained F: mm/hmm* +F: drivers/char/hmm* F: include/linux/hmm* +F: include/uapi/linux/hmm* F: Documentation/vm/hmm.rst +F: tools/testing/selftests/vm/*hmm* HOST AP DRIVER M: Jouni Malinen diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig index 3e866885a405..b4ad868ead63 100644 --- a/drivers/char/Kconfig +++ b/drivers/char/Kconfig @@ -557,6 +557,17 @@ config ADI and SSM (Silicon Secured Memory). Intended consumers of this driver include crash and makedumpfile. +config HMM_DMIRROR + tristate "HMM driver for testing Heterogeneous Memory Management" + depends on HMM_MIRROR + depends on DEVICE_PRIVATE + help + This is a pseudo device driver solely for testing HMM. + Say Y here if you want to build the HMM test driver. + Doing so will allow you to run tools/testing/selftest/vm/hmm-tests. + + If in doubt, say "N". + endmenu config RANDOM_TRUST_CPU diff --git a/drivers/char/Makefile b/drivers/char/Makefile index fbea7dd12932..c9ddd8e550c5 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -54,3 +54,4 @@ js-rtc-y = rtc.o obj-$(CONFIG_XILLYBUS) += xillybus/ obj-$(CONFIG_POWERNV_OP_PANEL) += powernv-op-panel.o obj-$(CONFIG_ADI) += adi.o +obj-$(CONFIG_HMM_DMIRROR) += hmm_dmirror.o diff --git a/drivers/char/hmm_dmirror.c b/drivers/char/hmm_dmirror.c new file mode 100644 index ..ccb4e03a03b5 --- /dev/null +++ b/drivers/char/hmm_dmirror.c @@ -0,0 +1,1504 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2013 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Jérôme Glisse + */ +/* + * This is a driver to exercice the HMM (heterogeneous memory management) + * mirror and zone device private memory migration APIs of the kernel. + * Userspace programs can register with the driver to mirror their own address + * space and can use the device to read/write any valid virtual address. + * + * In some ways it can also serve as an example driver for people wanting to use + * HMM inside their own device driver. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define DMIRROR_NDEVICES 2 +#define DMIRROR_RANGE_FAULT_TIMEOUT1000 +#define DEVMEM_CHUNK_SIZE (256 * 1024 * 1024U) +#define DEVMEM_CHUNKS_RESERVE 16 + +static const struct dev_pagemap_ops dmirror_devmem_ops; +static dev_t dmirror_dev; +static struct platform_device *dmirror_platform_devices[DMIRROR_NDEVICES]; +static struct page *dmirror_zero_page; + +struct dmirror_device; + +struct dmirror_bounce { + void*ptr; + unsigned long size; + unsigned long addr; + unsigned long cpages; +}; + +#define DPT_SHIFT PAGE_SHIFT +#define DPT_VALID (1UL << 0) +#define DPT_WRITE (1UL << 1) +#define DPT_DPAGE (1UL << 2) +#define DPT_ZPAGE 0x20UL + +const uint64_t dmirror_hmm_flags[HMM_PFN_FLAG_MAX] = { + [HMM_PFN_VALID] = DPT_VALID, + [HMM_PFN_WRITE] = DPT_WRITE, + [HMM_PFN_DEVICE_PRIVATE] = DPT_DPAGE, +}; + +static const uint64_
Re: [PATCH 2/4] mm/hmm: allow snapshot of the special zero page
On 9/12/19 1:26 AM, Christoph Hellwig wrote: On Wed, Sep 11, 2019 at 03:28:27PM -0700, Ralph Campbell wrote: Allow hmm_range_fault() to return success (0) when the CPU pagetable entry points to the special shared zero page. The caller can then handle the zero page by possibly clearing device private memory instead of DMAing a zero page. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hmm.c b/mm/hmm.c index 06041d4399ff..7217912bef13 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -532,7 +532,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, return -EBUSY; } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) { *pfn = range->values[HMM_PFN_SPECIAL]; - return -EFAULT; + return is_zero_pfn(pte_pfn(pte)) ? 0 : -EFAULT; Any chance to just use a normal if here: if (!is_zero_pfn(pte_pfn(pte))) return -EFAULT; return 0; Sure, no problem. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 1/4] mm/hmm: make full use of walk_page_range()
On 9/12/19 1:26 AM, Christoph Hellwig wrote: +static int hmm_pfns_fill(unsigned long addr, +unsigned long end, +struct hmm_range *range, +enum hmm_pfn_value_e value) Nit: can we use the space a little more efficient, e.g.: static int hmm_pfns_fill(unsigned long addr, unsigned long end, struct hmm_range *range, enum hmm_pfn_value_e value) +static int hmm_vma_walk_test(unsigned long start, +unsigned long end, +struct mm_walk *walk) Same here. + if (!(vma->vm_flags & VM_READ)) { + (void) hmm_pfns_fill(start, end, range, HMM_PFN_NONE); There should be no need for the void cast here. OK. I'll post a v2 with the these changes. Thanks for the reviews. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 08/13] mm: remove the mask variable in hmm_vma_walk_hugetlb_entry
On 7/29/19 10:51 PM, Christoph Hellwig wrote: The pagewalk code already passes the value as the hmask parameter. Signed-off-by: Christoph Hellwig --- mm/hmm.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index f26d6abc4ed2..88b77a4a6a1e 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -771,19 +771,16 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { #ifdef CONFIG_HUGETLB_PAGE - unsigned long addr = start, i, pfn, mask; + unsigned long addr = start, i, pfn; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - struct hstate *h = hstate_vma(vma); uint64_t orig_pfn, cpu_flags; bool fault, write_fault; spinlock_t *ptl; pte_t entry; int ret = 0; - mask = huge_page_size(h) - 1; - ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte); entry = huge_ptep_get(pte); @@ -799,7 +796,7 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, goto unlock; } - pfn = pte_pfn(entry) + ((start & mask) >> PAGE_SHIFT); + pfn = pte_pfn(entry) + ((start & hmask) >> PAGE_SHIFT); This needs to be "~hmask" so that the upper bits of the start address are not added to the pfn. It's the middle bits of the address that offset into the huge page that are needed. for (; addr < end; addr += PAGE_SIZE, i++, pfn++) range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags; ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 1/9] mm: turn migrate_vma upside down
On 7/29/19 7:28 AM, Christoph Hellwig wrote: There isn't any good reason to pass callbacks to migrate_vma. Instead we can just export the three steps done by this function to drivers and let them sequence the operation without callbacks. This removes a lot of boilerplate code as-is, and will allow the drivers to drastically improve code flow and error handling further on. Signed-off-by: Christoph Hellwig --- Documentation/vm/hmm.rst | 55 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 122 +++-- include/linux/migrate.h| 118 ++-- mm/migrate.c | 242 +++-- 4 files changed, 193 insertions(+), 344 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index ddcb5ca8b296..ad880e3996b1 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -339,58 +339,9 @@ Migration to and from device memory === Because the CPU cannot access device memory, migration must use the device DMA -engine to perform copy from and to device memory. For this we need a new -migration helper:: - - int migrate_vma(const struct migrate_vma_ops *ops, - struct vm_area_struct *vma, - unsigned long mentries, - unsigned long start, - unsigned long end, - unsigned long *src, - unsigned long *dst, - void *private); - -Unlike other migration functions it works on a range of virtual address, there -are two reasons for that. First, device DMA copy has a high setup overhead cost -and thus batching multiple pages is needed as otherwise the migration overhead -makes the whole exercise pointless. The second reason is because the -migration might be for a range of addresses the device is actively accessing. - -The migrate_vma_ops struct defines two callbacks. First one (alloc_and_copy()) -controls destination memory allocation and copy operation. Second one is there -to allow the device driver to perform cleanup operations after migration:: - - struct migrate_vma_ops { - void (*alloc_and_copy)(struct vm_area_struct *vma, -const unsigned long *src, -unsigned long *dst, -unsigned long start, -unsigned long end, -void *private); - void (*finalize_and_map)(struct vm_area_struct *vma, - const unsigned long *src, - const unsigned long *dst, - unsigned long start, - unsigned long end, - void *private); - }; - -It is important to stress that these migration helpers allow for holes in the -virtual address range. Some pages in the range might not be migrated for all -the usual reasons (page is pinned, page is locked, ...). This helper does not -fail but just skips over those pages. - -The alloc_and_copy() might decide to not migrate all pages in the -range (for reasons under the callback control). For those, the callback just -has to leave the corresponding dst entry empty. - -Finally, the migration of the struct page might fail (for file backed page) for -various reasons (failure to freeze reference, or update page cache, ...). If -that happens, then the finalize_and_map() can catch any pages that were not -migrated. Note those pages were still copied to a new page and thus we wasted -bandwidth but this is considered as a rare event and a price that we are -willing to pay to keep all the code simpler. +engine to perform copy from and to device memory. For this we need a new to +use migrate_vma_setup(), migrate_vma_pages(), and migrate_vma_finalize() +helpers. Memory cgroup (memcg) and rss accounting diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 345c63cb752a..38416798abd4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -131,9 +131,8 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, unsigned long *dst_pfns, unsigned long start, unsigned long end, - void *private) + struct nouveau_dmem_fault *fault) { - struct nouveau_dmem_fault *fault = private; struct nouveau_drm *drm = fault->drm; struct device *dev = drm->dev->dev; unsigned long addr, i, npages = 0; @@ -230,14 +229,9 @@ nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, } } -void nouveau_dmem_fault_finalize_and_map(struct vm_area_struct *vma, -const unsigned long *src_pfns, -const unsigned long
[PATCH] nouveau/hmm: map pages after migration
When memory is migrated to the GPU it is likely to be accessed by GPU code soon afterwards. Instead of waiting for a GPU fault, map the migrated memory into the GPU page tables with the same access permissions as the source CPU page table entries. This preserves copy on write semantics. Signed-off-by: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: "Jérôme Glisse" Cc: Ben Skeggs --- This patch is based on top of Christoph Hellwig's 9 patch series https://lore.kernel.org/linux-mm/20190729234611.gc7...@redhat.com/T/#u "turn the hmm migrate_vma upside down" but without patch 9 "mm: remove the unused MIGRATE_PFN_WRITE" and adds a use for the flag. drivers/gpu/drm/nouveau/nouveau_dmem.c | 45 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 86 ++ drivers/gpu/drm/nouveau/nouveau_svm.h | 19 ++ 3 files changed, 133 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index ef9de82b0744..c83e6f118817 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -25,11 +25,13 @@ #include "nouveau_dma.h" #include "nouveau_mem.h" #include "nouveau_bo.h" +#include "nouveau_svm.h" #include #include #include #include +#include #include #include @@ -560,11 +562,12 @@ nouveau_dmem_init(struct nouveau_drm *drm) } static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, - struct vm_area_struct *vma, unsigned long addr, - unsigned long src, dma_addr_t *dma_addr) + struct vm_area_struct *vma, unsigned long src, + dma_addr_t *dma_addr, u64 *pfn) { struct device *dev = drm->dev->dev; struct page *dpage, *spage; + unsigned long paddr; spage = migrate_pfn_to_page(src); if (!spage || !(src & MIGRATE_PFN_MIGRATE)) @@ -572,17 +575,21 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, dpage = nouveau_dmem_page_alloc_locked(drm); if (!dpage) - return 0; + goto out; *dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); if (dma_mapping_error(dev, *dma_addr)) goto out_free_page; + paddr = nouveau_dmem_page_addr(dpage); if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM, - nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST, - *dma_addr)) + paddr, NOUVEAU_APER_HOST, *dma_addr)) goto out_dma_unmap; + *pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM | + ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT); + if (src & MIGRATE_PFN_WRITE) + *pfn |= NVIF_VMM_PFNMAP_V0_W; return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; out_dma_unmap: @@ -590,18 +597,19 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, out_free_page: nouveau_dmem_page_free_locked(drm, dpage); out: + *pfn = NVIF_VMM_PFNMAP_V0_NONE; return 0; } static void nouveau_dmem_migrate_chunk(struct migrate_vma *args, - struct nouveau_drm *drm, dma_addr_t *dma_addrs) + struct nouveau_drm *drm, dma_addr_t *dma_addrs, u64 *pfns) { struct nouveau_fence *fence; unsigned long addr = args->start, nr_dma = 0, i; for (i = 0; addr < args->end; i++) { args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->vma, - addr, args->src[i], _addrs[nr_dma]); + args->src[i], _addrs[nr_dma], [i]); if (args->dst[i]) nr_dma++; addr += PAGE_SIZE; @@ -615,10 +623,6 @@ static void nouveau_dmem_migrate_chunk(struct migrate_vma *args, dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE, DMA_BIDIRECTIONAL); } - /* -* FIXME optimization: update GPU page table to point to newly migrated -* memory. -*/ migrate_vma_finalize(args); } @@ -631,11 +635,12 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, unsigned long npages = (end - start) >> PAGE_SHIFT; unsigned long max = min(SG_MAX_SINGLE_ALLOC, npages); dma_addr_t *dma_addrs; + u64 *pfns; struct migrate_vma args = { .vma= vma, .start = start, }; - unsigned long c, i; + unsigned long i; int ret = -ENOMEM; args.src = kcalloc(max, sizeof(args.src), GFP_KERNEL); @@ -649,19 +654,25 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, if (!dma
Re: [PATCH 6/9] nouveau: simplify nouveau_dmem_migrate_to_ram
On 8/8/19 8:33 AM, Christoph Hellwig wrote: Factor the main copy page to ram routine out into a helper that acts on a single page and which doesn't require the nouveau_dmem_fault structure for argument passing. Also remove the loop over multiple pages as we only handle one at the moment, although the structure of the main worker function makes it relatively easy to add multi page support back if needed in the future. But at least for now this avoid the needed to dynamically allocate memory for the dma addresses in what is essentially the page fault path. Signed-off-by: Christoph Hellwig Reviewed-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 159 +++-- 1 file changed, 40 insertions(+), 119 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 21052a4aaf69..473195762974 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -86,13 +86,6 @@ static inline struct nouveau_dmem *page_to_dmem(struct page *page) return container_of(page->pgmap, struct nouveau_dmem, pagemap); } -struct nouveau_dmem_fault { - struct nouveau_drm *drm; - struct nouveau_fence *fence; - dma_addr_t *dma; - unsigned long npages; -}; - struct nouveau_migrate { struct vm_area_struct *vma; struct nouveau_drm *drm; @@ -146,130 +139,57 @@ static void nouveau_dmem_fence_done(struct nouveau_fence **fence) } } -static void -nouveau_dmem_fault_alloc_and_copy(struct vm_area_struct *vma, - const unsigned long *src_pfns, - unsigned long *dst_pfns, - unsigned long start, - unsigned long end, - struct nouveau_dmem_fault *fault) +static vm_fault_t nouveau_dmem_fault_copy_one(struct nouveau_drm *drm, + struct vm_fault *vmf, struct migrate_vma *args, + dma_addr_t *dma_addr) { - struct nouveau_drm *drm = fault->drm; struct device *dev = drm->dev->dev; - unsigned long addr, i, npages = 0; - nouveau_migrate_copy_t copy; - int ret; - + struct page *dpage, *spage; + vm_fault_t ret = VM_FAULT_SIGBUS; You can remove this line and return VM_FAULT_SIGBUS in the error path below. - /* First allocate new memory */ - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct page *dpage, *spage; - - dst_pfns[i] = 0; - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; - - dpage = alloc_page_vma(GFP_HIGHUSER, vma, addr); - if (!dpage) { - dst_pfns[i] = MIGRATE_PFN_ERROR; - continue; - } - lock_page(dpage); - - dst_pfns[i] = migrate_pfn(page_to_pfn(dpage)) | - MIGRATE_PFN_LOCKED; - npages++; - } + spage = migrate_pfn_to_page(args->src[0]); + if (!spage || !(args->src[0] & MIGRATE_PFN_MIGRATE)) + return 0; - /* Allocate storage for DMA addresses, so we can unmap later. */ - fault->dma = kmalloc(sizeof(*fault->dma) * npages, GFP_KERNEL); - if (!fault->dma) + dpage = alloc_page_vma(GFP_HIGHUSER, vmf->vma, vmf->address); + if (!dpage) goto error; + lock_page(dpage); - /* Copy things over */ - copy = drm->dmem->migrate.copy_func; - for (addr = start, i = 0; addr < end; addr += PAGE_SIZE, i++) { - struct page *spage, *dpage; - - dpage = migrate_pfn_to_page(dst_pfns[i]); - if (!dpage || dst_pfns[i] == MIGRATE_PFN_ERROR) - continue; - - spage = migrate_pfn_to_page(src_pfns[i]); - if (!spage || !(src_pfns[i] & MIGRATE_PFN_MIGRATE)) { - dst_pfns[i] = MIGRATE_PFN_ERROR; - __free_page(dpage); - continue; - } - - fault->dma[fault->npages] = - dma_map_page_attrs(dev, dpage, 0, PAGE_SIZE, - PCI_DMA_BIDIRECTIONAL, - DMA_ATTR_SKIP_CPU_SYNC); - if (dma_mapping_error(dev, fault->dma[fault->npages])) { - dst_pfns[i] = MIGRATE_PFN_ERROR; - __free_page(dpage); - continue; - } + *dma_addr = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(dev, *dma_addr)) + goto error_free_page; - ret = copy(drm, 1, NOUVEAU
Re: [PATCH] nouveau/hmm: map pages after migration
On 8/8/19 12:07 AM, Christoph Hellwig wrote: On Wed, Aug 07, 2019 at 08:02:14AM -0700, Ralph Campbell wrote: When memory is migrated to the GPU it is likely to be accessed by GPU code soon afterwards. Instead of waiting for a GPU fault, map the migrated memory into the GPU page tables with the same access permissions as the source CPU page table entries. This preserves copy on write semantics. Signed-off-by: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: "Jérôme Glisse" Cc: Ben Skeggs --- This patch is based on top of Christoph Hellwig's 9 patch series https://lore.kernel.org/linux-mm/20190729234611.gc7...@redhat.com/T/#u "turn the hmm migrate_vma upside down" but without patch 9 "mm: remove the unused MIGRATE_PFN_WRITE" and adds a use for the flag. This looks useful. I've already dropped that patch for the pending resend. Thanks. static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, - struct vm_area_struct *vma, unsigned long addr, - unsigned long src, dma_addr_t *dma_addr) + struct vm_area_struct *vma, unsigned long src, + dma_addr_t *dma_addr, u64 *pfn) I'll pick up the removal of the not needed addr argument for the patch introducing nouveau_dmem_migrate_copy_one, thanks, static void nouveau_dmem_migrate_chunk(struct migrate_vma *args, - struct nouveau_drm *drm, dma_addr_t *dma_addrs) + struct nouveau_drm *drm, dma_addr_t *dma_addrs, u64 *pfns) { struct nouveau_fence *fence; unsigned long addr = args->start, nr_dma = 0, i; for (i = 0; addr < args->end; i++) { args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->vma, - addr, args->src[i], _addrs[nr_dma]); + args->src[i], _addrs[nr_dma], [i]); Nit: I find the [i] way to pass the argument a little weird to read. Why not "pfns + i"? OK, will do in v2. Should I convert to "dma_addrs + nr_dma" too? +u64 * +nouveau_pfns_alloc(unsigned long npages) +{ + struct nouveau_pfnmap_args *args; + + args = kzalloc(sizeof(*args) + npages * sizeof(args->p.phys[0]), Can we use struct_size here? Yes, good suggestion. + int ret; + + if (!svm) + return; + + mutex_lock(>mutex); + svmm = nouveau_find_svmm(svm, mm); + if (!svmm) { + mutex_unlock(>mutex); + return; + } + mutex_unlock(>mutex); Given that nouveau_find_svmm doesn't take any kind of reference, what gurantees svmm doesn't go away after dropping the lock? I asked Ben and Jerome about this too. I'm still looking into it. @@ -44,5 +49,19 @@ static inline int nouveau_svmm_bind(struct drm_device *device, void *p, { return -ENOSYS; } + +u64 *nouveau_pfns_alloc(unsigned long npages) +{ + return NULL; +} + +void nouveau_pfns_free(u64 *pfns) +{ +} + +void nouveau_pfns_map(struct nouveau_drm *drm, struct mm_struct *mm, + unsigned long addr, u64 *pfns, unsigned long npages) +{ +} #endif /* IS_ENABLED(CONFIG_DRM_NOUVEAU_SVM) */ nouveau_dmem.c and nouveau_svm.c are both built conditional on CONFIG_DRM_NOUVEAU_SVM, so there is no need for stubs here. Good point. I'll remove them in v2. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: hmm_range_fault related fixes and legacy API removal v2
On 7/22/19 2:44 AM, Christoph Hellwig wrote: Hi Jérôme, Ben and Jason, below is a series against the hmm tree which fixes up the mmap_sem locking in nouveau and while at it also removes leftover legacy HMM APIs only used by nouveau. The first 4 patches are a bug fix for nouveau, which I suspect should go into this merge window even if the code is marked as staging, just to avoid people copying the breakage. Changes since v1: - don't return the valid state from hmm_range_unregister - additional nouveau cleanups I ran some OpenCL tests from Jerome with nouveau and this series, 5.3.0-rc1, and my two HMM fixes: ("mm/hmm: fix ZONE_DEVICE anon page mapping reuse") ("mm/hmm: Fix bad subpage pointer in try_to_unmap_one") You can add for the series: Tested-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] mm/hmm: replace hmm_update with mmu_notifier_range
The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes a struct hmm_update which is a simplified version of struct mmu_notifier_range. This is unnecessary so replace hmm_update with mmu_notifier_range directly. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig Cc: Ben Skeggs --- This is based on 5.3.0-rc1 plus Christoph Hellwig's 6 patches ("hmm_range_fault related fixes and legacy API removal v2"). Jason, I believe this is the patch you were requesting. drivers/gpu/drm/nouveau/nouveau_svm.c | 4 ++-- include/linux/hmm.h | 31 --- mm/hmm.c | 13 --- 3 files changed, 10 insertions(+), 38 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index a9c5c58d425b..6298d2dadb55 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) static int nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); unsigned long start = update->start; unsigned long limit = update->end; - if (!update->blockable) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 9f32586684c9..659e25a15700 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, struct hmm_mirror; -/* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - /* * struct hmm_mirror_ops - HMM mirror device operations callback * @@ -383,9 +360,9 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror -* @update: update information (see struct hmm_update) -* Return: -EAGAIN if update.blockable false and callback need to -* block, 0 otherwise. +* @update: update information (see struct mmu_notifier_range) +* Return: -EAGAIN if mmu_notifier_range_blockable(update) is false +* and callback needs to block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -397,7 +374,7 @@ struct hmm_mirror_ops { * synchronous call. */ int (*sync_cpu_device_pagetables)(struct hmm_mirror *mirror, - const struct hmm_update *update); + const struct mmu_notifier_range *update); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 16b6731a34db..b810a4fa3de9 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -165,7 +165,6 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, { struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; - struct hmm_update update; struct hmm_range *range; unsigned long flags; int ret = 0; @@ -173,15 +172,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, if (!kref_get_unless_zero(>kref)) return 0; - update.start = nrange->start; - update.end = nrange->end; - update.event = HMM_UPDATE_INVALIDATE; - update.blockable = mmu_notifier_range_blockable(nrange); - spin_lock_irqsave(>ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, >ranges, list) { - if (update.end < range->start || update.start >= range->end) + if (nrange->end < range->start || nrange->start >= range->end) continue; range->valid = false; @@ -198,9 +192,10 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, list_for_each_entry(mirror, >mirrors, l
Re: [PATCH] mm/hmm: replace hmm_update with mmu_notifier_range
On 7/24/19 6:14 PM, Jason Gunthorpe wrote: On Tue, Jul 23, 2019 at 02:05:06PM -0700, Ralph Campbell wrote: The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes a struct hmm_update which is a simplified version of struct mmu_notifier_range. This is unnecessary so replace hmm_update with mmu_notifier_range directly. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig Cc: Ben Skeggs This is based on 5.3.0-rc1 plus Christoph Hellwig's 6 patches ("hmm_range_fault related fixes and legacy API removal v2"). Jason, I believe this is the patch you were requesting. Doesn't this need revision to include amgpu? drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c: .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_gfx, drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c: .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_hsa, Thanks, Jason Yes. I have added this to v2 which I'll send out with Christoph's 2 patches and the hmm_range.vma removal patch you suggested. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v2 4/7] mm: merge hmm_range_snapshot into hmm_range_fault
From: Christoph Hellwig Add a HMM_FAULT_SNAPSHOT flag so that hmm_range_snapshot can be merged into the almost identical hmm_range_fault function. Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe --- Documentation/vm/hmm.rst | 17 include/linux/hmm.h | 4 +- mm/hmm.c | 85 +--- 3 files changed, 13 insertions(+), 93 deletions(-) diff --git a/Documentation/vm/hmm.rst b/Documentation/vm/hmm.rst index 710ce1c701bf..ddcb5ca8b296 100644 --- a/Documentation/vm/hmm.rst +++ b/Documentation/vm/hmm.rst @@ -192,15 +192,14 @@ read only, or fully unmap, etc.). The device must complete the update before the driver callback returns. When the device driver wants to populate a range of virtual addresses, it can -use either:: +use:: - long hmm_range_snapshot(struct hmm_range *range); - long hmm_range_fault(struct hmm_range *range, bool block); + long hmm_range_fault(struct hmm_range *range, unsigned int flags); -The first one (hmm_range_snapshot()) will only fetch present CPU page table +With the HMM_RANGE_SNAPSHOT flag, it will only fetch present CPU page table entries and will not trigger a page fault on missing or non-present entries. -The second one does trigger a page fault on missing or read-only entries if -write access is requested (see below). Page faults use the generic mm page +Without that flag, it does trigger a page fault on missing or read-only entries +if write access is requested (see below). Page faults use the generic mm page fault code path just like a CPU page fault. Both functions copy CPU page table entries into their pfns array argument. Each @@ -227,20 +226,20 @@ The usage pattern is:: /* * Just wait for range to be valid, safe to ignore return value as we - * will use the return value of hmm_range_snapshot() below under the + * will use the return value of hmm_range_fault() below under the * mmap_sem to ascertain the validity of the range. */ hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC); again: down_read(>mmap_sem); - ret = hmm_range_snapshot(); + ret = hmm_range_fault(, HMM_RANGE_SNAPSHOT); if (ret) { up_read(>mmap_sem); if (ret == -EBUSY) { /* * No need to check hmm_range_wait_until_valid() return value - * on retry we will get proper error with hmm_range_snapshot() + * on retry we will get proper error with hmm_range_fault() */ hmm_range_wait_until_valid(, TIMEOUT_IN_MSEC); goto again; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 15f1b113be3c..f3693dcc8b98 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -412,7 +412,9 @@ void hmm_range_unregister(struct hmm_range *range); */ #define HMM_FAULT_ALLOW_RETRY (1 << 0) -long hmm_range_snapshot(struct hmm_range *range); +/* Don't fault in missing PTEs, just snapshot the current state. */ +#define HMM_FAULT_SNAPSHOT (1 << 1) + long hmm_range_fault(struct hmm_range *range, unsigned int flags); long hmm_range_dma_map(struct hmm_range *range, diff --git a/mm/hmm.c b/mm/hmm.c index 84f2791d3510..1bc014cddd78 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -280,7 +280,6 @@ struct hmm_vma_walk { struct hmm_range*range; struct dev_pagemap *pgmap; unsigned long last; - boolfault; unsigned intflags; }; @@ -373,7 +372,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { struct hmm_range *range = hmm_vma_walk->range; - if (!hmm_vma_walk->fault) + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) return; /* @@ -418,7 +417,7 @@ static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, { unsigned long i; - if (!hmm_vma_walk->fault) { + if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) { *fault = *write_fault = false; return; } @@ -936,85 +935,6 @@ void hmm_range_unregister(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_unregister); -/* - * hmm_range_snapshot() - snapshot CPU page table for a range - * @range: range - * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid - * permission (for instance asking for write and range is read only), - * -EBUSY if you need to retry, -EFAULT invalid (ie either no valid - * vma or it is illegal to access that range), number of valid pages - * in range->pfns[] (from range start address). - * - * This snapshots the CPU page table for a range of virtual addresses. Snapshot - * validity is tracked by range struct. See in include/linux/hmm.h for example - * on how to use.
[PATCH v2 5/7] mm/hmm: make full use of walk_page_range()
hmm_range_fault() calls find_vma() and walk_page_range() in a loop. This is unnecessary duplication since walk_page_range() calls find_vma() in a loop already. Simplify hmm_range_fault() by defining a walk_test() callback function to filter unhandled vmas. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 130 --- 1 file changed, 57 insertions(+), 73 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 1bc014cddd78..838cd1d50497 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -840,13 +840,44 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, #endif } -static void hmm_pfns_clear(struct hmm_range *range, - uint64_t *pfns, - unsigned long addr, - unsigned long end) +static int hmm_vma_walk_test(unsigned long start, +unsigned long end, +struct mm_walk *walk) { - for (; addr < end; addr += PAGE_SIZE, pfns++) - *pfns = range->values[HMM_PFN_NONE]; + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + struct vm_area_struct *vma = walk->vma; + + /* If range is no longer valid, force retry. */ + if (!range->valid) + return -EBUSY; + + /* +* Skip vma ranges that don't have struct page backing them or +* map I/O devices directly. +* TODO: handle peer-to-peer device mappings. +*/ + if (vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) + return -EFAULT; + + if (is_vm_hugetlb_page(vma)) { + if (huge_page_shift(hstate_vma(vma)) != range->page_shift && + range->page_shift != PAGE_SHIFT) + return -EINVAL; + } else { + if (range->page_shift != PAGE_SHIFT) + return -EINVAL; + } + + /* +* If vma does not allow read access, then assume that it does not +* allow write access, either. HMM does not support architectures +* that allow write without read. +*/ + if (!(vma->vm_flags & VM_READ)) + return -EPERM; + + return 0; } /* @@ -965,82 +996,35 @@ EXPORT_SYMBOL(hmm_range_unregister); */ long hmm_range_fault(struct hmm_range *range, unsigned int flags) { - const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; - unsigned long start = range->start, end; - struct hmm_vma_walk hmm_vma_walk; + unsigned long start = range->start; + struct hmm_vma_walk hmm_vma_walk = {}; struct hmm *hmm = range->hmm; - struct vm_area_struct *vma; - struct mm_walk mm_walk; + struct mm_walk mm_walk = {}; int ret; lockdep_assert_held(>mm->mmap_sem); - do { - /* If range is no longer valid force retry. */ - if (!range->valid) - return -EBUSY; + hmm_vma_walk.range = range; + hmm_vma_walk.last = start; + hmm_vma_walk.flags = flags; + mm_walk.private = _vma_walk; - vma = find_vma(hmm->mm, start); - if (vma == NULL || (vma->vm_flags & device_vma)) - return -EFAULT; - - if (is_vm_hugetlb_page(vma)) { - if (huge_page_shift(hstate_vma(vma)) != - range->page_shift && - range->page_shift != PAGE_SHIFT) - return -EINVAL; - } else { - if (range->page_shift != PAGE_SHIFT) - return -EINVAL; - } + mm_walk.mm = hmm->mm; + mm_walk.pud_entry = hmm_vma_walk_pud; + mm_walk.pmd_entry = hmm_vma_walk_pmd; + mm_walk.pte_hole = hmm_vma_walk_hole; + mm_walk.hugetlb_entry = hmm_vma_walk_hugetlb_entry; + mm_walk.test_walk = hmm_vma_walk_test; - if (!(vma->vm_flags & VM_READ)) { - /* -* If vma do not allow read access, then assume that it -* does not allow write access, either. HMM does not -* support architecture that allow write without read. -*/ - hmm_pfns_clear(range, range->pfns, - range->start, range->end); - return -EPERM; - } + do { + ret = walk_page_range(start, range->end, _walk); + start = hmm_vma_walk.last; - range->vma = vma; - hmm_vma_walk.pgmap = NULL; - hmm_vma_walk.last = start; -
[PATCH v2 2/7] mm/hmm: a few more C style and comment clean ups
A few more comments and minor programming style clean ups. There should be no functional changes. Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 39 +-- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 4040b4427635..362944b0fbca 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -32,7 +32,7 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops; * hmm_get_or_create - register HMM against an mm (HMM internal) * * @mm: mm struct to attach to - * Returns: returns an HMM object, either by referencing the existing + * Return: an HMM object, either by referencing the existing * (per-process) object, or by creating a new one. * * This is not intended to be used directly by device drivers. If mm already @@ -325,8 +325,8 @@ static int hmm_pfns_bad(unsigned long addr, } /* - * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) - * @start: range virtual start address (inclusive) + * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s) + * @addr: range virtual start address (inclusive) * @end: range virtual end address (exclusive) * @fault: should we fault or not ? * @write_fault: write fault ? @@ -376,9 +376,9 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* * So we not only consider the individual per page request we also * consider the default flags requested for the range. The API can -* be use in 2 fashions. The first one where the HMM user coalesce -* multiple page fault into one request and set flags per pfns for -* of those faults. The second one where the HMM user want to pre- +* be used 2 ways. The first one where the HMM user coalesces +* multiple page faults into one request and sets flags per pfn for +* those faults. The second one where the HMM user wants to pre- * fault a range with specific flags. For the latter one it is a * waste to have the user pre-fill the pfn arrays with a default * flags value. @@ -388,7 +388,7 @@ static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, /* We aren't ask to do anything ... */ if (!(pfns & range->flags[HMM_PFN_VALID])) return; - /* If this is device memory than only fault if explicitly requested */ + /* If this is device memory then only fault if explicitly requested */ if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { /* Do we fault on device memory ? */ if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { @@ -502,7 +502,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, hmm_vma_walk->last = end; return 0; #else - /* If THP is not enabled then we should never reach that code ! */ + /* If THP is not enabled then we should never reach this code ! */ return -EINVAL; #endif } @@ -522,7 +522,6 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; bool fault, write_fault; uint64_t cpu_flags; pte_t pte = *ptep; @@ -571,8 +570,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, if (fault || write_fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; - migration_entry_wait(vma->vm_mm, -pmdp, addr); + migration_entry_wait(walk->mm, pmdp, addr); return -EBUSY; } return 0; @@ -620,13 +618,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; - struct vm_area_struct *vma = walk->vma; uint64_t *pfns = range->pfns; unsigned long addr = start, i; pte_t *ptep; pmd_t pmd; - again: pmd = READ_ONCE(*pmdp); if (pmd_none(pmd)) @@ -648,7 +644,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, 0, , _fault); if (fault || write_fault) { hmm_vma_walk->last = addr; - pmd_migration_entry_wait(vma->vm_mm, pmdp); + pmd_migration_entry_wait(walk->mm, pmdp); return -EBUSY; } return 0; @@ -657,11 +653,11 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
[PATCH v2 1/7] mm/hmm: replace hmm_update with mmu_notifier_range
The hmm_mirror_ops callback function sync_cpu_device_pagetables() passes a struct hmm_update which is a simplified version of struct mmu_notifier_range. This is unnecessary so replace hmm_update with mmu_notifier_range directly. Signed-off-by: Ralph Campbell Reviewed: Christoph Hellwig Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Ben Skeggs --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 +++ drivers/gpu/drm/nouveau/nouveau_svm.c | 4 ++-- include/linux/hmm.h| 31 -- mm/hmm.c | 13 --- 4 files changed, 14 insertions(+), 42 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3971c201f320..cf945080dff3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -196,12 +196,12 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * potentially dirty. */ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ @@ -244,12 +244,12 @@ static int amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ static int amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); unsigned long start = update->start; unsigned long end = update->end; - bool blockable = update->blockable; + bool blockable = mmu_notifier_range_blockable(update); struct interval_tree_node *it; /* notification is exclusive, but interval is inclusive */ diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 545100f7c594..79b29c918717 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -252,13 +252,13 @@ nouveau_svmm_invalidate(struct nouveau_svmm *svmm, u64 start, u64 limit) static int nouveau_svmm_sync_cpu_device_pagetables(struct hmm_mirror *mirror, - const struct hmm_update *update) + const struct mmu_notifier_range *update) { struct nouveau_svmm *svmm = container_of(mirror, typeof(*svmm), mirror); unsigned long start = update->start; unsigned long limit = update->end; - if (!update->blockable) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; SVMM_DBG(svmm, "invalidate %016lx-%016lx", start, limit); diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 9f32586684c9..659e25a15700 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -340,29 +340,6 @@ static inline uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range, struct hmm_mirror; -/* - * enum hmm_update_event - type of update - * @HMM_UPDATE_INVALIDATE: invalidate range (no indication as to why) - */ -enum hmm_update_event { - HMM_UPDATE_INVALIDATE, -}; - -/* - * struct hmm_update - HMM update information for callback - * - * @start: virtual start address of the range to update - * @end: virtual end address of the range to update - * @event: event triggering the update (what is happening) - * @blockable: can the callback block/sleep ? - */ -struct hmm_update { - unsigned long start; - unsigned long end; - enum hmm_update_event event; - bool blockable; -}; - /* * struct hmm_mirror_ops - HMM mirror device operations callback * @@ -383,9 +360,9 @@ struct hmm_mirror_ops { /* sync_cpu_device_pagetables() - synchronize page tables * * @mirror: pointer to struct hmm_mirror -* @update: update information (see struct hmm_update) -* Return: -EAGAIN if update.blockable false and callback need to -* block, 0 otherwise. +* @update: update information (see struct mmu_notifier_range) +* Return: -EAGAIN if mmu_notifier_range_blockable(update) is false +* and callback needs to block, 0 otherwise. * * This callback ultimately originates from mmu_notifiers when the CPU * page table is updated. The device driver must update its page table @@ -397,7 +374,7 @@ struct hmm_mirror_ops { * synchronous call.
[PATCH v2 6/7] mm/hmm: remove hugetlbfs check in hmm_vma_walk_pmd
walk_page_range() will only call hmm_vma_walk_hugetlb_entry() for hugetlbfs pages and doesn't call hmm_vma_walk_pmd() in this case. Therefore, it is safe to remove the check for vma->vm_flags & VM_HUGETLB in hmm_vma_walk_pmd(). Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe Cc: Christoph Hellwig --- mm/hmm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index 838cd1d50497..29f322ca5d58 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -630,9 +630,6 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, if (pmd_none(pmd)) return hmm_vma_walk_hole(start, end, walk); - if (pmd_huge(pmd) && (range->vma->vm_flags & VM_HUGETLB)) - return hmm_pfns_bad(start, end, walk); - if (thp_migration_supported() && is_pmd_migration_entry(pmd)) { bool fault, write_fault; unsigned long npages; -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v2 3/7] mm/hmm: replace the block argument to hmm_range_fault with a flags value
From: Christoph Hellwig This allows easier expansion to other flags, and also makes the callers a little easier to read. Signed-off-by: Christoph Hellwig Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 2 +- include/linux/hmm.h | 11 +++- mm/hmm.c| 74 - 4 files changed, 48 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index e51b48ac48eb..12a59ac83f72 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -832,7 +832,7 @@ int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo, struct page **pages) down_read(>mmap_sem); - r = hmm_range_fault(range, true); + r = hmm_range_fault(range, 0); if (unlikely(r < 0)) { if (likely(r == -EAGAIN)) { /* diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 79b29c918717..49b520c60fc5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -505,7 +505,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) return -EBUSY; } - ret = hmm_range_fault(range, true); + ret = hmm_range_fault(range, 0); if (ret <= 0) { if (ret == 0) ret = -EBUSY; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 659e25a15700..15f1b113be3c 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -406,12 +406,19 @@ int hmm_range_register(struct hmm_range *range, unsigned long end, unsigned page_shift); void hmm_range_unregister(struct hmm_range *range); + +/* + * Retry fault if non-blocking, drop mmap_sem and return -EAGAIN in that case. + */ +#define HMM_FAULT_ALLOW_RETRY (1 << 0) + long hmm_range_snapshot(struct hmm_range *range); -long hmm_range_fault(struct hmm_range *range, bool block); +long hmm_range_fault(struct hmm_range *range, unsigned int flags); + long hmm_range_dma_map(struct hmm_range *range, struct device *device, dma_addr_t *daddrs, - bool block); + unsigned int flags); long hmm_range_dma_unmap(struct hmm_range *range, struct vm_area_struct *vma, struct device *device, diff --git a/mm/hmm.c b/mm/hmm.c index 362944b0fbca..84f2791d3510 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -281,7 +281,7 @@ struct hmm_vma_walk { struct dev_pagemap *pgmap; unsigned long last; boolfault; - boolblock; + unsigned intflags; }; static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, @@ -293,8 +293,11 @@ static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, struct vm_area_struct *vma = walk->vma; vm_fault_t ret; - flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= write_fault ? FAULT_FLAG_WRITE : 0; + if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY) + flags |= FAULT_FLAG_ALLOW_RETRY; + if (write_fault) + flags |= FAULT_FLAG_WRITE; + ret = handle_mm_fault(vma, addr, flags); if (ret & VM_FAULT_RETRY) { /* Note, handle_mm_fault did up_read(>mmap_sem)) */ @@ -1012,26 +1015,26 @@ long hmm_range_snapshot(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_snapshot); -/* - * hmm_range_fault() - try to fault some address in a virtual address range - * @range: range being faulted - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Return: number of valid pages in range->pfns[] (from range start - * address). This may be zero. If the return value is negative, - * then one of the following values may be returned: +/** + * hmm_range_fault - try to fault some address in a virtual address range + * @range: range being faulted + * @flags: HMM_FAULT_* flags * - * -EINVAL invalid arguments or mm or virtual address are in an - *invalid vma (for instance device file vma). - * -ENOMEM: Out of memory. - * -EPERM: Invalid permission (for instance asking for write and - *range is read only). - * -EAGAIN: If you need to retry and mmap_sem was drop. This can only - *happens if block argument is false. - * -EBUSY: If the the range is being invalidated and you should wait - *for invalidation to finish. - * -EFAULT: Invalid (ie
[PATCH v2 7/7] mm/hmm: remove hmm_range vma
Since hmm_range_fault() doesn't use the struct hmm_range vma field, remove it. Suggested-by: Jason Gunthorpe Signed-off-by: Ralph Campbell Cc: "Jérôme Glisse" Cc: Christoph Hellwig --- drivers/gpu/drm/nouveau/nouveau_svm.c | 7 +++ include/linux/hmm.h | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 49b520c60fc5..a74530b5a523 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -496,12 +496,12 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) range->start, range->end, PAGE_SHIFT); if (ret) { - up_read(>vma->vm_mm->mmap_sem); + up_read(>hmm->mm->mmap_sem); return (int)ret; } if (!hmm_range_wait_until_valid(range, HMM_RANGE_DEFAULT_TIMEOUT)) { - up_read(>vma->vm_mm->mmap_sem); + up_read(>hmm->mm->mmap_sem); return -EBUSY; } @@ -509,7 +509,7 @@ nouveau_range_fault(struct hmm_mirror *mirror, struct hmm_range *range) if (ret <= 0) { if (ret == 0) ret = -EBUSY; - up_read(>vma->vm_mm->mmap_sem); + up_read(>hmm->mm->mmap_sem); hmm_range_unregister(range); return ret; } @@ -682,7 +682,6 @@ nouveau_svm_fault(struct nvif_notify *notify) args.i.p.addr + args.i.p.size, fn - fi); /* Have HMM fault pages within the fault window to the GPU. */ - range.vma = vma; range.start = args.i.p.addr; range.end = args.i.p.addr + args.i.p.size; range.pfns = args.phys; diff --git a/include/linux/hmm.h b/include/linux/hmm.h index f3693dcc8b98..68949cf815f9 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -164,7 +164,6 @@ enum hmm_pfn_value_e { */ struct hmm_range { struct hmm *hmm; - struct vm_area_struct *vma; struct list_headlist; unsigned long start; unsigned long end; -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v2 0/7] mm/hmm: more HMM clean up
Here are seven more patches for things I found to clean up. This was based on top of Christoph's seven patches: "hmm_range_fault related fixes and legacy API removal v3". I assume this will go into Jason's tree since there will likely be more HMM changes in this cycle. Changes from v1 to v2: Added AMD GPU to hmm_update removal. Added 2 patches from Christoph. Added 2 patches as a result of Jason's suggestions. Christoph Hellwig (2): mm/hmm: replace the block argument to hmm_range_fault with a flags value mm: merge hmm_range_snapshot into hmm_range_fault Ralph Campbell (5): mm/hmm: replace hmm_update with mmu_notifier_range mm/hmm: a few more C style and comment clean ups mm/hmm: make full use of walk_page_range() mm/hmm: remove hugetlbfs check in hmm_vma_walk_pmd mm/hmm: remove hmm_range vma Documentation/vm/hmm.rst| 17 +- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/nouveau/nouveau_svm.c | 13 +- include/linux/hmm.h | 47 ++-- mm/hmm.c| 340 6 files changed, 150 insertions(+), 277 deletions(-) -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: hmm_range_fault related fixes and legacy API removal v3
On 7/25/19 5:16 PM, Jason Gunthorpe wrote: On Wed, Jul 24, 2019 at 08:52:51AM +0200, Christoph Hellwig wrote: Hi Jérôme, Ben and Jason, below is a series against the hmm tree which fixes up the mmap_sem locking in nouveau and while at it also removes leftover legacy HMM APIs only used by nouveau. The first 4 patches are a bug fix for nouveau, which I suspect should go into this merge window even if the code is marked as staging, just to avoid people copying the breakage. Changes since v2: - new patch from Jason to document FAULT_FLAG_ALLOW_RETRY semantics better - remove -EAGAIN handling in nouveau earlier I don't see Ralph's tested by, do you think it changed enough to require testing again? If so, Ralph would you be so kind? In any event, I'm sending this into linux-next and intend to forward the first four next week. Thanks, Jason I have been testing Christoph's v3 with my set of v2 changes so feel free to add my tested-by. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 2/7] mm/hmm: a few more C style and comment clean ups
On 7/25/19 11:23 PM, Christoph Hellwig wrote: Note: it seems like you've only CCed me on patches 2-7, but not on the cover letter and patch 1. I'll try to find them later, but to make Ccs useful they should normally cover the whole series. Otherwise this looks fine to me: Reviewed-by: Christoph Hellwig Thanks for the review and sorry about the oversight on CCs. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v4 hmm 12/12] mm/hmm: Fix error flows in hmm_invalidate_range_start
On 6/24/19 2:01 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe If the trylock on the hmm->mirrors_sem fails the function will return without decrementing the notifiers that were previously incremented. Since the caller will not call invalidate_range_end() on EAGAIN this will result in notifiers becoming permanently incremented and deadlock. If the sync_cpu_device_pagetables() required blocking the function will not return EAGAIN even though the device continues to touch the pages. This is a violation of the mmu notifier contract. Switch, and rename, the ranges_lock to a spin lock so we can reliably obtain it without blocking during error unwind. The error unwind is necessary since the notifiers count must be held incremented across the call to sync_cpu_device_pagetables() as we cannot allow the range to become marked valid by a parallel invalidate_start/end() pair while doing sync_cpu_device_pagetables(). Signed-off-by: Jason Gunthorpe Reviewed-by: Ralph Campbell Reviewed-by: Christoph Hellwig Tested-by: Philip Yang --- include/linux/hmm.h | 2 +- mm/hmm.c| 72 +++-- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index bf013e96525771..0fa8ea34ccef6d 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -86,7 +86,7 @@ struct hmm { struct mm_struct*mm; struct kref kref; - struct mutexlock; + spinlock_t ranges_lock; struct list_headranges; struct list_headmirrors; struct mmu_notifier mmu_notifier; diff --git a/mm/hmm.c b/mm/hmm.c index b224ea635a7716..89549eac03d506 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -64,7 +64,7 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) init_rwsem(>mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(>ranges); - mutex_init(>lock); + spin_lock_init(>ranges_lock); kref_init(>kref); hmm->notifiers = 0; hmm->mm = mm; @@ -144,6 +144,23 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) hmm_put(hmm); } +static void notifiers_decrement(struct hmm *hmm) +{ + lockdep_assert_held(>ranges_lock); + Why not acquire the lock here and release at the end instead of asserting the lock is held? It looks like everywhere notifiers_decrement() is called does that. + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; + + list_for_each_entry(range, >ranges, list) { + if (range->valid) + continue; + range->valid = true; + } + wake_up_all(>wq); + } +} + static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { @@ -151,6 +168,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; + unsigned long flags; int ret = 0; if (!kref_get_unless_zero(>kref)) @@ -161,12 +179,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, update.event = HMM_UPDATE_INVALIDATE; update.blockable = mmu_notifier_range_blockable(nrange); - if (mmu_notifier_range_blockable(nrange)) - mutex_lock(>lock); - else if (!mutex_trylock(>lock)) { - ret = -EAGAIN; - goto out; - } + spin_lock_irqsave(>ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, >ranges, list) { if (update.end < range->start || update.start >= range->end) @@ -174,7 +187,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, range->valid = false; } - mutex_unlock(>lock); + spin_unlock_irqrestore(>ranges_lock, flags); if (mmu_notifier_range_blockable(nrange)) down_read(>mirrors_sem); @@ -182,16 +195,26 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, ret = -EAGAIN; goto out; } + list_for_each_entry(mirror, >mirrors, list) { - int ret; + int rc; - ret = mirror->ops->sync_cpu_device_pagetables(mirror, ); - if (!update.blockable && ret == -EAGAIN) + rc = mirror->ops->sync_cpu_device_pagetables(mirror, ); + if (rc) { + if (WARN_ON(update.blockable || rc != -EAGAIN)) + continue; + ret = -EAGAIN; break; + } } up_read(>mirrors_sem); out:
Re: [PATCH] drm/nouveau/dmem: missing mutex_lock in error path
On 6/13/19 5:49 PM, John Hubbard wrote: On 6/13/19 5:11 PM, Ralph Campbell wrote: In nouveau_dmem_pages_alloc(), the drm->dmem->mutex is unlocked before calling nouveau_dmem_chunk_alloc(). Reacquire the lock before continuing to the next page. Signed-off-by: Ralph Campbell --- I found this while testing Jason Gunthorpe's hmm tree but this is independent of those changes. I guess it could go through David Airlie's tree for nouveau or Jason's tree. Hi Ralph, btw, was this the fix for the crash you were seeing? It might be nice to mention in the commit description, if you are seeing real symptoms. drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 27aa4e72abe9..00f7236af1b9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -379,9 +379,10 @@ nouveau_dmem_pages_alloc(struct nouveau_drm *drm, ret = nouveau_dmem_chunk_alloc(drm); if (ret) { if (c) - break; Actually, the pre-existing code is a little concerning. Your change preserves the behavior, but it seems questionable to be doing a "return 0" (whether via the above break, or your change) when it's in this partially allocated state. It's reporting success when it only allocates part of what was requested, and it doesn't fill in the pages array either. + return 0; return ret; } + mutex_lock(>dmem->mutex); continue; } The above comment is about pre-existing potential problems, but your patch itself looks correct, so: Reviewed-by: John Hubbard thanks, The crash was the NULL pointer bug in Christoph's patch #10. I sent a separate reply for that. Below is the console output I got, then I made the changes just based on code inspection. Do you think I should include it in the change log? As for the "return 0", If you follow the call chain, nouveau_dmem_pages_alloc() is only ever called for one page so this currently "works" but I agree it is a bit of a time bomb. There are a number of other bugs that I can see that need fixing but I think those should be separate patches. [ 1294.871933] = [ 1294.876656] WARNING: bad unlock balance detected! [ 1294.881375] 5.2.0-rc3+ #5 Not tainted [ 1294.885048] - [ 1294.889773] test-malloc-vra/6299 is trying to release lock (>dmem->mutex) at: [ 1294.897482] [] nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1294.905782] but there are no more locks to release! [ 1294.910690] [ 1294.910690] other info that might help us debug this: [ 1294.917249] 1 lock held by test-malloc-vra/6299: [ 1294.921881] #0: 16e10454 (>mmap_sem#2){}, at: nouveau_svmm_bind+0x142/0x210 [nouveau] [ 1294.931313] [ 1294.931313] stack backtrace: [ 1294.935702] CPU: 4 PID: 6299 Comm: test-malloc-vra Not tainted 5.2.0-rc3+ #5 [ 1294.942786] Hardware name: ASUS X299-A/PRIME X299-A, BIOS 1401 05/21/2018 [ 1294.949590] Call Trace: [ 1294.952059] dump_stack+0x7c/0xc0 [ 1294.955469] ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1294.962213] print_unlock_imbalance_bug.cold.52+0xca/0xcf [ 1294.967641] lock_release+0x306/0x380 [ 1294.971383] ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1294.978089] ? lock_downgrade+0x2d0/0x2d0 [ 1294.982121] ? find_held_lock+0xac/0xd0 [ 1294.985979] __mutex_unlock_slowpath+0x8f/0x3f0 [ 1294.990540] ? wait_for_completion+0x230/0x230 [ 1294.995002] ? rwlock_bug.part.2+0x60/0x60 [ 1294.999197] nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1295.005751] ? page_mapping+0x98/0x110 [ 1295.009511] migrate_vma+0xa74/0x1090 [ 1295.013186] ? move_to_new_page+0x480/0x480 [ 1295.017400] ? __kmalloc+0x153/0x300 [ 1295.021052] ? nouveau_dmem_migrate_vma+0xd8/0x1e0 [nouveau] [ 1295.026796] nouveau_dmem_migrate_vma+0x157/0x1e0 [nouveau] [ 1295.032466] ? nouveau_dmem_init+0x490/0x490 [nouveau] [ 1295.037612] ? vmacache_find+0xc2/0x110 [ 1295.041537] nouveau_svmm_bind+0x1b4/0x210 [nouveau] [ 1295.046583] ? nouveau_svm_fault+0x13e0/0x13e0 [nouveau] [ 1295.051912] drm_ioctl_kernel+0x14d/0x1a0 [ 1295.055930] ? drm_setversion+0x330/0x330 [ 1295.059971] drm_ioctl+0x308/0x530 [ 1295.063384] ? drm_version+0x150/0x150 [ 1295.067153] ? find_held_lock+0xac/0xd0 [ 1295.070996] ? __pm_runtime_resume+0x3f/0xa0 [ 1295.075285] ? mark_held_locks+0x29/0xa0 [ 1295.079230] ? _raw_spin_unlock_irqrestore+0x3c/0x50 [ 1295.084232] ? lockdep_hardirqs_on+0x17d/0x250 [ 1295.088768] nouveau_drm_ioctl+0x9a/0x100 [nouveau] [ 1295.093661] do_vfs_ioctl+0x137/0x
[PATCH v2] drm/nouveau/dmem: missing mutex_lock in error path
In nouveau_dmem_pages_alloc(), the drm->dmem->mutex is unlocked before calling nouveau_dmem_chunk_alloc() as shown when CONFIG_PROVE_LOCKING is enabled: [ 1294.871933] = [ 1294.876656] WARNING: bad unlock balance detected! [ 1294.881375] 5.2.0-rc3+ #5 Not tainted [ 1294.885048] - [ 1294.889773] test-malloc-vra/6299 is trying to release lock (>dmem->mutex) at: [ 1294.897482] [] nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1294.905782] but there are no more locks to release! [ 1294.910690] [ 1294.910690] other info that might help us debug this: [ 1294.917249] 1 lock held by test-malloc-vra/6299: [ 1294.921881] #0: 16e10454 (>mmap_sem#2){}, at: nouveau_svmm_bind+0x142/0x210 [nouveau] [ 1294.931313] [ 1294.931313] stack backtrace: [ 1294.935702] CPU: 4 PID: 6299 Comm: test-malloc-vra Not tainted 5.2.0-rc3+ #5 [ 1294.942786] Hardware name: ASUS X299-A/PRIME X299-A, BIOS 1401 05/21/2018 [ 1294.949590] Call Trace: [ 1294.952059] dump_stack+0x7c/0xc0 [ 1294.955469] ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1294.962213] print_unlock_imbalance_bug.cold.52+0xca/0xcf [ 1294.967641] lock_release+0x306/0x380 [ 1294.971383] ? nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1294.978089] ? lock_downgrade+0x2d0/0x2d0 [ 1294.982121] ? find_held_lock+0xac/0xd0 [ 1294.985979] __mutex_unlock_slowpath+0x8f/0x3f0 [ 1294.990540] ? wait_for_completion+0x230/0x230 [ 1294.995002] ? rwlock_bug.part.2+0x60/0x60 [ 1294.999197] nouveau_dmem_migrate_alloc_and_copy+0x79f/0xbf0 [nouveau] [ 1295.005751] ? page_mapping+0x98/0x110 [ 1295.009511] migrate_vma+0xa74/0x1090 [ 1295.013186] ? move_to_new_page+0x480/0x480 [ 1295.017400] ? __kmalloc+0x153/0x300 [ 1295.021052] ? nouveau_dmem_migrate_vma+0xd8/0x1e0 [nouveau] [ 1295.026796] nouveau_dmem_migrate_vma+0x157/0x1e0 [nouveau] [ 1295.032466] ? nouveau_dmem_init+0x490/0x490 [nouveau] [ 1295.037612] ? vmacache_find+0xc2/0x110 [ 1295.041537] nouveau_svmm_bind+0x1b4/0x210 [nouveau] [ 1295.046583] ? nouveau_svm_fault+0x13e0/0x13e0 [nouveau] [ 1295.051912] drm_ioctl_kernel+0x14d/0x1a0 [ 1295.055930] ? drm_setversion+0x330/0x330 [ 1295.059971] drm_ioctl+0x308/0x530 [ 1295.063384] ? drm_version+0x150/0x150 [ 1295.067153] ? find_held_lock+0xac/0xd0 [ 1295.070996] ? __pm_runtime_resume+0x3f/0xa0 [ 1295.075285] ? mark_held_locks+0x29/0xa0 [ 1295.079230] ? _raw_spin_unlock_irqrestore+0x3c/0x50 [ 1295.084232] ? lockdep_hardirqs_on+0x17d/0x250 [ 1295.088768] nouveau_drm_ioctl+0x9a/0x100 [nouveau] [ 1295.093661] do_vfs_ioctl+0x137/0x9a0 [ 1295.097341] ? ioctl_preallocate+0x140/0x140 [ 1295.101623] ? match_held_lock+0x1b/0x230 [ 1295.105646] ? match_held_lock+0x1b/0x230 [ 1295.109660] ? find_held_lock+0xac/0xd0 [ 1295.113512] ? __do_page_fault+0x324/0x630 [ 1295.117617] ? lock_downgrade+0x2d0/0x2d0 [ 1295.121648] ? mark_held_locks+0x79/0xa0 [ 1295.125583] ? handle_mm_fault+0x352/0x430 [ 1295.129687] ksys_ioctl+0x60/0x90 [ 1295.133020] ? mark_held_locks+0x29/0xa0 [ 1295.136964] __x64_sys_ioctl+0x3d/0x50 [ 1295.140726] do_syscall_64+0x68/0x250 [ 1295.144400] entry_SYSCALL_64_after_hwframe+0x49/0xbe [ 1295.149465] RIP: 0033:0x7f1a3495809b [ 1295.153053] Code: 0f 1e fa 48 8b 05 ed bd 0c 00 64 c7 00 26 00 00 00 48 c7 c0 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d bd bd 0c 00 f7 d8 64 89 01 48 [ 1295.171850] RSP: 002b:7ffef7ed1358 EFLAGS: 0246 ORIG_RAX: 0010 [ 1295.179451] RAX: ffda RBX: 7ffef7ed1628 RCX: 7f1a3495809b [ 1295.186601] RDX: 7ffef7ed13b0 RSI: 40406449 RDI: 0004 [ 1295.193759] RBP: 7ffef7ed13b0 R08: R09: 0157e770 [ 1295.200917] R10: 0151c010 R11: 0246 R12: 40406449 [ 1295.208083] R13: 0004 R14: R15: Reacquire the lock before continuing to the next page. Signed-off-by: Ralph Campbell --- I found this while testing Jason Gunthorpe's hmm tree but this is independent of those changes. Jason thinks it is best to go through David Airlie's nouveau tree. Changes for v2: Updated change log to include console output. drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 27aa4e72abe9..00f7236af1b9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -379,9 +379,10 @@ nouveau_dmem_pages_alloc(struct nouveau_drm *drm, ret = nouveau_dmem_chunk_alloc(drm); if (ret) { if (c) - break; + return 0;
Re: [PATCH 10/22] memremap: add a migrate callback to struct dev_pagemap_ops
E */ - static void pgmap_array_delete(struct resource *res) { xa_store_range(_array, PHYS_PFN(res->start), PHYS_PFN(res->end), diff --git a/mm/hmm.c b/mm/hmm.c index 6dc769feb2e1..aab799677c7d 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1330,15 +1330,12 @@ static void hmm_devmem_ref_kill(struct dev_pagemap *pgmap) percpu_ref_kill(pgmap->ref); } -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) +static vm_fault_t hmm_devmem_migrate(struct vm_fault *vmf) { - struct hmm_devmem *devmem = page->pgmap->data; + struct hmm_devmem *devmem = vmf->page->pgmap->data; - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); + return devmem->ops->fault(devmem, vmf->vma, vmf->address, vmf->page, + vmf->flags, vmf->pmd); } static void hmm_devmem_free(struct page *page, void *data) @@ -1351,6 +1348,7 @@ static void hmm_devmem_free(struct page *page, void *data) static const struct dev_pagemap_ops hmm_pagemap_ops = { .page_free = hmm_devmem_free, .kill = hmm_devmem_ref_kill, + .migrate= hmm_devmem_migrate, }; /* @@ -1405,7 +1403,6 @@ struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; devmem->pfn_last = devmem->pfn_first + (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; devmem->pagemap.res = *devmem->resource; diff --git a/mm/memory.c b/mm/memory.c index ddf20bd0c317..cbf3cb598436 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2782,13 +2782,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { - /* -* For un-addressable device memory we call the pgmap -* fault handler callback. The callback must migrate -* the page back to some CPU accessible page. -*/ - ret = device_private_entry_fault(vma, vmf->address, entry, -vmf->flags, vmf->pmd); + vmf->page = device_private_entry_to_page(entry); + ret = page->pgmap->ops->migrate(vmf); This needs to either initialize "page" or be changed to "vmf->page". Otherwise, it is a NULL pointer dereference. } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { You can add: Reviewed-by: Ralph Campbell ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/nouveau/dmem: missing mutex_lock in error path
In nouveau_dmem_pages_alloc(), the drm->dmem->mutex is unlocked before calling nouveau_dmem_chunk_alloc(). Reacquire the lock before continuing to the next page. Signed-off-by: Ralph Campbell --- I found this while testing Jason Gunthorpe's hmm tree but this is independant of those changes. I guess it could go through David Airlie's tree for nouveau or Jason's tree. drivers/gpu/drm/nouveau/nouveau_dmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 27aa4e72abe9..00f7236af1b9 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -379,9 +379,10 @@ nouveau_dmem_pages_alloc(struct nouveau_drm *drm, ret = nouveau_dmem_chunk_alloc(drm); if (ret) { if (c) - break; + return 0; return ret; } + mutex_lock(>dmem->mutex); continue; } -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH 18/22] mm: mark DEVICE_PUBLIC as broken
On 6/13/19 12:44 PM, Jason Gunthorpe wrote: On Thu, Jun 13, 2019 at 11:43:21AM +0200, Christoph Hellwig wrote: The code hasn't been used since it was added to the tree, and doesn't appear to actually be usable. Mark it as BROKEN until either a user comes along or we finally give up on it. Signed-off-by: Christoph Hellwig mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index 0d2ba7e1f43e..406fa45e9ecc 100644 +++ b/mm/Kconfig @@ -721,6 +721,7 @@ config DEVICE_PRIVATE config DEVICE_PUBLIC bool "Addressable device memory (like GPU memory)" depends on ARCH_HAS_HMM + depends on BROKEN select HMM select DEV_PAGEMAP_OPS This seems a bit harsh, we do have another kconfig that selects this one today: config DRM_NOUVEAU_SVM bool "(EXPERIMENTAL) Enable SVM (Shared Virtual Memory) support" depends on ARCH_HAS_HMM depends on DRM_NOUVEAU depends on STAGING select HMM_MIRROR select DEVICE_PRIVATE default n help Say Y here if you want to enable experimental support for Shared Virtual Memory (SVM). Maybe it should be depends on STAGING not broken? or maybe nouveau_svm doesn't actually need DEVICE_PRIVATE? Jason I think you are confusing DEVICE_PRIVATE for DEVICE_PUBLIC. DRM_NOUVEAU_SVM does use DEVICE_PRIVATE but not DEVICE_PUBLIC. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2 00/15] Consolidate the mmu notifier interval_tree and locking
On 10/28/19 1:10 PM, Jason Gunthorpe wrote: From: Jason Gunthorpe 8 of the mmu_notifier using drivers (i915_gem, radeon_mn, umem_odp, hfi1, scif_dma, vhost, gntdev, hmm) drivers are using a common pattern where they only use invalidate_range_start/end and immediately check the invalidating range against some driver data structure to tell if the driver is interested. Half of them use an interval_tree, the others are simple linear search lists. Of the ones I checked they largely seem to have various kinds of races, bugs and poor implementation. This is a result of the complexity in how the notifier interacts with get_user_pages(). It is extremely difficult to use it correctly. Consolidate all of this code together into the core mmu_notifier and provide a locking scheme similar to hmm_mirror that allows the user to safely use get_user_pages() and reliably know if the page list still matches the mm. This new arrangment plays nicely with the !blockable mode for OOM. Scanning the interval tree is done such that the intersection test will always succeed, and since there is no invalidate_range_end exposed to drivers the scheme safely allows multiple drivers to be subscribed. Four places are converted as an example of how the new API is used. Four are left for future patches: - i915_gem has complex locking around destruction of a registration, needs more study - hfi1 (2nd user) needs access to the rbtree - scif_dma has a complicated logic flow - vhost's mmu notifiers are already being rewritten This series, and the other code it depends on is available on my github: https://github.com/jgunthorpe/linux/commits/mmu_notifier v2 changes: - Add mmu_range_set_seq() to set the mrn sequence number under the driver lock and make the locking more understandable - Add some additional comments around locking/READ_ONCe - Make the WARN_ON flow in mn_itree_invalidate a bit easier to follow - Fix wrong WARN_ON Jason Gunthorpe (15): mm/mmu_notifier: define the header pre-processor parts even if disabled mm/mmu_notifier: add an interval tree notifier mm/hmm: allow hmm_range to be used with a mmu_range_notifier or hmm_mirror mm/hmm: define the pre-processor related parts of hmm.h even if disabled RDMA/odp: Use mmu_range_notifier_insert() RDMA/hfi1: Use mmu_range_notifier_inset for user_exp_rcv drm/radeon: use mmu_range_notifier_insert xen/gntdev: Use select for DMA_SHARED_BUFFER xen/gntdev: use mmu_range_notifier_insert nouveau: use mmu_notifier directly for invalidate_range_start nouveau: use mmu_range_notifier instead of hmm_mirror drm/amdgpu: Call find_vma under mmap_sem drm/amdgpu: Use mmu_range_insert instead of hmm_mirror drm/amdgpu: Use mmu_range_notifier instead of hmm_mirror mm/hmm: remove hmm_mirror and related Documentation/vm/hmm.rst | 105 +--- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 +- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c| 14 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c| 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c| 457 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h| 53 -- drivers/gpu/drm/amd/amdgpu/amdgpu_object.h| 13 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 111 ++-- drivers/gpu/drm/nouveau/nouveau_svm.c | 231 +--- drivers/gpu/drm/radeon/radeon.h | 9 +- drivers/gpu/drm/radeon/radeon_mn.c| 219 ++- drivers/infiniband/core/device.c | 1 - drivers/infiniband/core/umem_odp.c| 288 + drivers/infiniband/hw/hfi1/file_ops.c | 2 +- drivers/infiniband/hw/hfi1/hfi.h | 2 +- drivers/infiniband/hw/hfi1/user_exp_rcv.c | 146 ++--- drivers/infiniband/hw/hfi1/user_exp_rcv.h | 3 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 7 +- drivers/infiniband/hw/mlx5/mr.c | 3 +- drivers/infiniband/hw/mlx5/odp.c | 50 +- drivers/xen/Kconfig | 3 +- drivers/xen/gntdev-common.h | 8 +- drivers/xen/gntdev.c | 180 ++ include/linux/hmm.h | 195 +-- include/linux/mmu_notifier.h | 144 - include/rdma/ib_umem_odp.h| 65 +-- include/rdma/ib_verbs.h | 2 - kernel/fork.c | 1 - mm/Kconfig| 2 +- mm/hmm.c | 275 + mm/mmu_notifier.c | 546 +- 32 files changed, 1225 insertions(+), 1922 deletions(-) You can add my Tested-by for the mm and nouveau changes. IOW, patches 1-4, 10-11, and 15. Tested-by: Ralph Campbell ___ dri-devel mailing list dri-devel
Re: [PATCH v3 02/14] mm/mmu_notifier: add an interval tree notifier
On 11/13/19 8:46 AM, Jason Gunthorpe wrote: On Wed, Nov 13, 2019 at 05:59:52AM -0800, Christoph Hellwig wrote: +int mmu_interval_notifier_insert(struct mmu_interval_notifier *mni, + struct mm_struct *mm, unsigned long start, + unsigned long length, + const struct mmu_interval_notifier_ops *ops); +int mmu_interval_notifier_insert_locked( + struct mmu_interval_notifier *mni, struct mm_struct *mm, + unsigned long start, unsigned long length, + const struct mmu_interval_notifier_ops *ops); Very inconsistent indentation between these two related functions. clang-format.. The kernel config is set to prefer a line up under the ( if all the arguments will fit within the 80 cols otherwise it does a 1 tab continuation indent. + /* +* The inv_end incorporates a deferred mechanism like +* rtnl_unlock(). Adds and removes are queued until the final inv_end +* happens then they are progressed. This arrangement for tree updates +* is used to avoid using a blocking lock during +* invalidate_range_start. Nitpick: That comment can be condensed into one less line: The rtnl_unlock can move up a line too. My editor is failing me on this. + /* +* TODO: Since we already have a spinlock above, this would be faster +* as wake_up_q +*/ + if (need_wake) + wake_up_all(_mm->wq); So why is this important enough for a TODO comment, but not important enough to do right away? Lets drop the comment, I'm noto sure wake_up_q is even a function this layer should be calling. Actually, I think you can remove the "need_wake" variable since it is unconditionally set to "true". Also, the comment in__mmu_interval_notifier_insert() says "mni->mr_invalidate_seq" and I think that should be "mni->invalidate_seq". ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
Re: [PATCH v2] nouveau/hmm: map pages after migration
On 3/3/20 4:42 AM, Jason Gunthorpe wrote: On Mon, Mar 02, 2020 at 05:00:23PM -0800, Ralph Campbell wrote: When memory is migrated to the GPU, it is likely to be accessed by GPU code soon afterwards. Instead of waiting for a GPU fault, map the migrated memory into the GPU page tables with the same access permissions as the source CPU page table entries. This preserves copy on write semantics. Signed-off-by: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: "Jérôme Glisse" Cc: Ben Skeggs --- Originally this patch was targeted for Jason's rdma tree since other HMM related changes were queued there. Now that those have been merged, this patch just contains changes to nouveau so it could go through any tree. I guess Ben Skeggs' tree would be appropriate. Yep +static inline struct nouveau_pfnmap_args * +nouveau_pfns_to_args(void *pfns) don't use static inline inside C files OK. +{ + struct nvif_vmm_pfnmap_v0 *p = + container_of(pfns, struct nvif_vmm_pfnmap_v0, phys); + + return container_of(p, struct nouveau_pfnmap_args, p); And this should just be return container_of(pfns, struct nouveau_pfnmap_args, p.phys); Much simpler, thanks. +static struct nouveau_svmm * +nouveau_find_svmm(struct nouveau_svm *svm, struct mm_struct *mm) +{ + struct nouveau_ivmm *ivmm; + + list_for_each_entry(ivmm, >inst, head) { + if (ivmm->svmm->notifier.mm == mm) + return ivmm->svmm; + } + return NULL; +} Is this re-implementing mmu_notifier_get() ? Jason Not quite. This is being called from an ioctl() call on the GPU device file which calls nouveau_svmm_bind() which locks mmap_sem for reading, walks the vmas for the address range given in the ioctl() data, and migrates the pages to the GPU memory. mmu_notifier_get() would try to lock mmap_sem for writing so that would deadlock. But it is similar in that the GPU specific process context (nouveau_svmm) needs to be found for the given ioctl caller. If find_get_mmu_notifier() was exported, I think that could work. Now that I look at this again, there is an easier way to find the svmm and I see some other bugs that need fixing. I'll post a v3 as soon as I get those written and tested. Thanks for the review. ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v3 1/4] nouveau/hmm: fix vma range check for migration
find_vma_intersection(mm, start, end) only guarantees that end is greater than or equal to vma->vm_start but doesn't guarantee that start is greater than or equal to vma->vm_start. The calculation for the intersecting range in nouveau_svmm_bind() isn't accounting for this and can call migrate_vma_setup() with a starting address less than vma->vm_start. This results in migrate_vma_setup() returning -EINVAL for the range instead of nouveau skipping that part of the range and migrating the rest. Signed-off-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_svm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index df9bf1fd1bc0..169320409286 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -179,6 +179,7 @@ nouveau_svmm_bind(struct drm_device *dev, void *data, if (!vma) break; + addr = max(addr, vma->vm_start); next = min(vma->vm_end, end); /* This is a best effort so we ignore errors */ nouveau_dmem_migrate_vma(cli->drm, vma, addr, next); -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v3 3/4] nouveau: remove useless SVM range check
When nouveau processes GPU faults, it checks to see if the fault address falls within the "unmanaged" range which is reserved for fixed allocations instead of addresses chosen by the core mm code. If start is greater than or equal to svmm->unmanaged.limit, then limit will also be greater than svmm->unmanaged.limit which is greater than svmm->unmanaged.start and the start = max_t(u64, start, svmm->unmanaged.limit) will change nothing. Just remove the useless lines of code. Signed-off-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_svm.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index c567526b75b8..8dfa5cb74826 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -663,9 +663,6 @@ nouveau_svm_fault(struct nvif_notify *notify) limit = start + (ARRAY_SIZE(args.phys) << PAGE_SHIFT); if (start < svmm->unmanaged.limit) limit = min_t(u64, limit, svmm->unmanaged.start); - else - if (limit > svmm->unmanaged.start) - start = max_t(u64, start, svmm->unmanaged.limit); SVMM_DBG(svmm, "wndw %016llx-%016llx", start, limit); mm = svmm->notifier.mm; -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v3 2/4] nouveau/hmm: check for SVM initialized before migrating
When migrating system memory to GPU memory, check that SVM has been enabled. Even though most errors can be ignored since migration is a performance optimization, return an error because this is a violation of the API. Signed-off-by: Ralph Campbell --- drivers/gpu/drm/nouveau/nouveau_svm.c | 5 + 1 file changed, 5 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c b/drivers/gpu/drm/nouveau/nouveau_svm.c index 169320409286..c567526b75b8 100644 --- a/drivers/gpu/drm/nouveau/nouveau_svm.c +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c @@ -171,6 +171,11 @@ nouveau_svmm_bind(struct drm_device *dev, void *data, mm = get_task_mm(current); down_read(>mmap_sem); + if (!cli->svm.svmm) { + up_read(>mmap_sem); + return -EINVAL; + } + for (addr = args->va_start, end = args->va_start + size; addr < end;) { struct vm_area_struct *vma; unsigned long next; -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v3 4/4] nouveau/hmm: map pages after migration
When memory is migrated to the GPU, it is likely to be accessed by GPU code soon afterwards. Instead of waiting for a GPU fault, map the migrated memory into the GPU page tables with the same access permissions as the source CPU page table entries. This preserves copy on write semantics. Signed-off-by: Ralph Campbell Cc: Christoph Hellwig Cc: Jason Gunthorpe Cc: "Jérôme Glisse" Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 46 +--- drivers/gpu/drm/nouveau/nouveau_dmem.h | 2 + drivers/gpu/drm/nouveau/nouveau_svm.c | 59 +- drivers/gpu/drm/nouveau/nouveau_svm.h | 5 +++ 4 files changed, 95 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 0ad5d87b5a8e..981c05a2a6ca 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -25,11 +25,13 @@ #include "nouveau_dma.h" #include "nouveau_mem.h" #include "nouveau_bo.h" +#include "nouveau_svm.h" #include #include #include #include +#include #include #include @@ -558,10 +560,11 @@ nouveau_dmem_init(struct nouveau_drm *drm) } static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, - unsigned long src, dma_addr_t *dma_addr) + unsigned long src, dma_addr_t *dma_addr, u64 *pfn) { struct device *dev = drm->dev->dev; struct page *dpage, *spage; + unsigned long paddr; spage = migrate_pfn_to_page(src); if (!spage || !(src & MIGRATE_PFN_MIGRATE)) @@ -569,17 +572,21 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, dpage = nouveau_dmem_page_alloc_locked(drm); if (!dpage) - return 0; + goto out; *dma_addr = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_BIDIRECTIONAL); if (dma_mapping_error(dev, *dma_addr)) goto out_free_page; + paddr = nouveau_dmem_page_addr(dpage); if (drm->dmem->migrate.copy_func(drm, 1, NOUVEAU_APER_VRAM, - nouveau_dmem_page_addr(dpage), NOUVEAU_APER_HOST, - *dma_addr)) + paddr, NOUVEAU_APER_HOST, *dma_addr)) goto out_dma_unmap; + *pfn = NVIF_VMM_PFNMAP_V0_V | NVIF_VMM_PFNMAP_V0_VRAM | + ((paddr >> PAGE_SHIFT) << NVIF_VMM_PFNMAP_V0_ADDR_SHIFT); + if (src & MIGRATE_PFN_WRITE) + *pfn |= NVIF_VMM_PFNMAP_V0_W; return migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; out_dma_unmap: @@ -587,18 +594,20 @@ static unsigned long nouveau_dmem_migrate_copy_one(struct nouveau_drm *drm, out_free_page: nouveau_dmem_page_free_locked(drm, dpage); out: + *pfn = NVIF_VMM_PFNMAP_V0_NONE; return 0; } static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm, - struct migrate_vma *args, dma_addr_t *dma_addrs) + struct nouveau_svmm *svmm, struct migrate_vma *args, + dma_addr_t *dma_addrs, u64 *pfns) { struct nouveau_fence *fence; unsigned long addr = args->start, nr_dma = 0, i; for (i = 0; addr < args->end; i++) { args->dst[i] = nouveau_dmem_migrate_copy_one(drm, args->src[i], - dma_addrs + nr_dma); + dma_addrs + nr_dma, pfns + i); if (args->dst[i]) nr_dma++; addr += PAGE_SIZE; @@ -607,20 +616,18 @@ static void nouveau_dmem_migrate_chunk(struct nouveau_drm *drm, nouveau_fence_new(drm->dmem->migrate.chan, false, ); migrate_vma_pages(args); nouveau_dmem_fence_done(); + nouveau_pfns_map(svmm, args->vma->vm_mm, args->start, pfns, i); while (nr_dma--) { dma_unmap_page(drm->dev->dev, dma_addrs[nr_dma], PAGE_SIZE, DMA_BIDIRECTIONAL); } - /* -* FIXME optimization: update GPU page table to point to newly migrated -* memory. -*/ migrate_vma_finalize(args); } int nouveau_dmem_migrate_vma(struct nouveau_drm *drm, +struct nouveau_svmm *svmm, struct vm_area_struct *vma, unsigned long start, unsigned long end) @@ -632,7 +639,8 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, .vma= vma, .start = start, }; - unsigned long c, i; + unsigned long i; + u64 *pfns; int ret = -ENOMEM; args.src = kcalloc(max, sizeof(*args.src), GFP_KERNEL); @@ -646,19 +654,25 @@ nouveau_dmem_migrate_vma(struct nouveau_drm *drm, if (!dma_addrs)
[PATCH v3 0/4] nouveau/hmm: map pages after migration
Originally patch 4 was targeted for Jason's rdma tree since other HMM related changes were queued there. Now that those have been merged, these patches just contain changes to nouveau so they could go through any tree. I guess Ben Skeggs' tree would be appropriate. Changes since v2: Added patches 1-3 to fix some minor issues. Eliminated nouveau_find_svmm() since it is easily found. Applied Jason Gunthorpe's suggestions for nouveau_pfns_to_args(). Changes since v1: Rebase to linux-5.6.0-rc4 Address Christoph Hellwig's comments Ralph Campbell (4): nouveau/hmm: fix vma range check for migration nouveau/hmm: check for SVM initialized before migrating nouveau: remove useless SVM range check nouveau/hmm: map pages after migration drivers/gpu/drm/nouveau/nouveau_dmem.c | 46 +++-- drivers/gpu/drm/nouveau/nouveau_dmem.h | 2 + drivers/gpu/drm/nouveau/nouveau_svm.c | 69 -- drivers/gpu/drm/nouveau/nouveau_svm.h | 5 ++ 4 files changed, 102 insertions(+), 20 deletions(-) -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel