[PATCH v6 6/8] mm/mmu_notifier: use correct mmu_notifier events for each invalidation
From: Jérôme Glisse This update each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 4 ++-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c| 14 ++ mm/hugetlb.c| 8 mm/khugepaged.c | 2 +- mm/ksm.c| 4 ++-- mm/madvise.c| 2 +- mm/memory.c | 14 +++--- mm/migrate.c| 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fcbd0e574917..3b93ce496dd4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,8 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, - NULL, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 77c3f079c723..79c84bb48ea9 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); 
VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4309939be22d..f0ad70c29500 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1184,9 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1349,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); spin_lock(vmf->ptl); @@ -2026,7 +2024,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(); @@ -2245,7 +2243,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c20a8d2de3f3..44fe3565ef37 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3247,7 +3247,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED |
[PATCH v6 7/8] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening This patch is just passing down the new informations by adding it to the mmu_notifier_range structure. Changes since v1: - Initialize flags field from mmu_notifier_range_init() arguments Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 62f94cd85455..0379956fff23 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -58,10 +58,12 @@ struct mmu_notifier_mm { #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; - range->flags = 0; + range->flags = flags; } #define 
ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v6 8/8] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
From: Jérôme Glisse Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device drivers or anyone who wishes to optimize out updates when they know that they already have the range mapped read only. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 mm/mmu_notifier.c| 10 ++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 0379956fff23..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) @@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index abd88c466eb2..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); }
EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v6 2/8] mm/mmu_notifier: convert user range->blockable to helper function
From: Jérôme Glisse Use the mmu_notifier_range_blockable() helper function instead of directly dereferencing the range->blockable field. This is done to make it easier to change the mmu_notifier range field. This patch is the outcome of the following coccinelle patch: %<--- @@ identifier I1, FN; @@ FN(..., struct mmu_notifier_range *I1, ...) { <... -I1->blockable +mmu_notifier_range_blockable(I1) ...> } --->% spatch --in-place --sp-file blockable.spatch --dir . Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 ++-- drivers/infiniband/core/umem_odp.c | 5 +++-- drivers/xen/gntdev.c| 6 +++--- mm/hmm.c| 6 +++--- mm/mmu_notifier.c | 2 +- virt/kvm/kvm_main.c | 3 ++- 8 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..58ed401c5996 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!range->blockable) { + if 
(!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } @@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end = range->end - 1; - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); @@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 1d3f9a31ad61..777b3f8727e7 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, while (it) { struct drm_i915_gem_object *obj; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; break; } diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..c9bd1278f573 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * the tear down. 
*/ - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(>lock); else if (!mutex_trylock(>lock)) return -EAGAIN; @@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; goto out_unlock; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e6ec79ad9cc8..59ef912fbe03 100644 ---
[PATCH v6 5/8] mm/mmu_notifier: contextual information for event triggering invalidation v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. Changes since v1: - add the flags parameter to init range flags This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<-- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... 
mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } -->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 5 - kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 12 mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 25 - mm/migrate.c | 5 - mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 3 ++- mm/rmap.c| 6 -- 14 files changed, 62 insertions(+), 30 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..fcbd0e574917 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,7 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2386e71ac1b8..62f94cd85455 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct 
mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma,
[PATCH v6 4/8] mm/mmu_notifier: contextual information for event enums
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c8672c366f67..2386e71ac1b8 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry 
(many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v6 1/8] mm/mmu_notifier: helper to test if a range invalidation is blockable
From: Jérôme Glisse Simple helpers to test if range invalidation is blockable. Later patches use coccinelle to convert all direct dereferences of range-> blockable to use this function instead so that we can convert the blockable field to an unsigned for more flags. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 11 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..e630def131ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return range->blockable; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, #define mmu_notifier_range_init(range, mm, start, end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v6 3/8] mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags
From: Jérôme Glisse Use an unsigned field for flags other than blockable and convert the blockable field to be one of those flags. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e630def131ce..c8672c366f67 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,11 +25,13 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; }; struct mmu_notifier_ops { @@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { - return range->blockable; + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) @@ -275,7 +277,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -284,7 +286,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; 
return __mmu_notifier_invalidate_range_start(range); } return 0; @@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, range->mm = mm; range->start = start; range->end = end; + range->flags = 0; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- 2.20.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v6 0/8] mmu notifier provide context informations
From: Jérôme Glisse (Andrew this apply on top of my HMM patchset as otherwise you will have conflict with changes to mm/hmm.c) Changes since v5: - drop KVM bits waiting for KVM people to express interest if they do not then i will post patchset to remove change_pte_notify as without the changes in v5 change_pte_notify is just useless (it it is useless today upstream it is just wasting cpu cycles) - rebase on top of lastest Linus tree Previous cover letter with minor update: Here i am not posting users of this, they already have been posted to appropriate mailing list [6] and will be merge through the appropriate tree once this patchset is upstream. Note that this serie does not change any behavior for any existing code. It just pass down more information to mmu notifier listener. The rational for this patchset: CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Without this serie, driver are force to assume that every notification is an munmap which triggers useless trashing within drivers that associate structure with range of virtual address. 
Each driver is force to free up its tracking structure and then restore it on next device page fault. With this serie we can also optimize device page table update [6]. More over this can also be use to optimize out some page table updates like for KVM where we can update the secondary MMU directly from the callback instead of clearing it. ACKS AMD/RADEON https://lkml.org/lkml/2019/2/1/395 ACKS RDMA https://lkml.org/lkml/2018/12/6/1473 Cheers, Jérôme [1] v1 https://lkml.org/lkml/2018/3/23/1049 [2] v2 https://lkml.org/lkml/2018/12/5/10 [3] v3 https://lkml.org/lkml/2018/12/13/620 [4] v4 https://lkml.org/lkml/2019/1/23/838 [5] v5 https://lkml.org/lkml/2019/2/19/752 [6] patches to use this: https://lkml.org/lkml/2019/1/23/833 https://lkml.org/lkml/2019/1/23/834 https://lkml.org/lkml/2019/1/23/832 https://lkml.org/lkml/2019/1/23/831 Cc: Andrew Morton Cc: linux...@kvack.org Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Alex Deucher Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ben Skeggs Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann Jérôme Glisse (8): mm/mmu_notifier: helper to test if a range invalidation is blockable mm/mmu_notifier: convert user range->blockable to helper function mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags mm/mmu_notifier: contextual information for event enums mm/mmu_notifier: contextual information for event triggering invalidation v2 mm/mmu_notifier: use correct mmu_notifier events for each invalidation mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2 mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 ++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- 
drivers/gpu/drm/radeon/radeon_mn.c | 4 +- drivers/infiniband/core/umem_odp.c | 5 +- drivers/xen/gntdev.c| 6 +-- fs/proc/task_mmu.c | 3 +- include/linux/mmu_notifier.h| 63 +++-- kernel/events/uprobes.c | 3 +- mm/hmm.c| 6 +-- mm/huge_memory.c| 14 +++--- mm/hugetlb.c| 12 +++-- mm/khugepaged.c | 3 +- mm/ksm.c| 6 ++- mm/madvise.c| 3 +- mm/memory.c | 25 ++ mm/migrate.c| 5 +- mm/mmu_notifier.c | 12 - mm/mprotect.c | 4 +- mm/mremap.c | 3 +- mm/oom_kill.c | 3 +- mm/rmap.c |
[PATCH] gpu/nouveau: empty chunks do not have a buffer object associated with them.
From: Jérôme Glisse Empty chunks do not have a bo associated with them so no need to pin/unpin on suspend/resume. This fixes suspend/resume on 5.1rc1 when NOUVEAU_SVM is enabled. Signed-off-by: Jérôme Glisse Reviewed-by: Tobias Klausmann Tested-by: Tobias Klausmann Cc: Ben Skeggs Cc: dri-devel@lists.freedesktop.org Cc: nouv...@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter --- drivers/gpu/drm/nouveau/nouveau_dmem.c | 8 1 file changed, 8 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index aa9fec80492d..a510dbe9a9cb 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -456,11 +456,6 @@ nouveau_dmem_resume(struct nouveau_drm *drm) /* FIXME handle pin failure */ WARN_ON(ret); } - list_for_each_entry (chunk, >dmem->chunk_empty, list) { - ret = nouveau_bo_pin(chunk->bo, TTM_PL_FLAG_VRAM, false); - /* FIXME handle pin failure */ - WARN_ON(ret); - } mutex_unlock(>dmem->mutex); } @@ -479,9 +474,6 @@ nouveau_dmem_suspend(struct nouveau_drm *drm) list_for_each_entry (chunk, >dmem->chunk_full, list) { nouveau_bo_unpin(chunk->bo); } - list_for_each_entry (chunk, >dmem->chunk_empty, list) { - nouveau_bo_unpin(chunk->bo); - } mutex_unlock(>dmem->mutex); } -- 2.17.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v5 9/9] mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2
From: Jérôme Glisse When notifying change for a range use MMU_NOTIFIER_USE_CHANGE_PTE flag for page table update that use set_pte_at_notify() and where the we are going either from read and write to read only with same pfn or read only to read and write with new pfn. Note that set_pte_at_notify() itself should only be use in rare cases ie we do not want to use it when we are updating a significant range of virtual addresses and thus a significant number of pte. Instead for those cases the event provided to mmu notifer invalidate_range_start() callback should be use for optimization. Changes since v1: - Use the new unsigned flags field in struct mmu_notifier_range - Use the new flags parameter to mmu_notifier_range_init() - Explicitly list all the patterns where we can use change_pte() Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 34 -- mm/ksm.c | 11 ++- mm/memory.c | 5 +++-- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b6c004bd9f6a..0230a4b06b46 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -40,6 +40,26 @@ enum mmu_notifier_event { MMU_NOTIFY_SOFT_DIRTY, }; +/* + * @MMU_NOTIFIER_RANGE_BLOCKABLE: can the mmu notifier range_start/range_end + * callback block or not ? If set then the callback can block. 
+ * + * @MMU_NOTIFIER_USE_CHANGE_PTE: only set when the page table it updated with + * the set_pte_at_notify() the valid patterns for this are: + * - pte read and write to read only same pfn + * - pte read only to read and write (pfn can change or stay the same) + * - pte read only to read only with different pfn + * It is illegal to set in any other circumstances. + * + * Note that set_pte_at_notify() should not be use outside of the above cases. + * When updating a range in batch (like write protecting a range) it is better + * to rely on invalidate_range_start() and struct mmu_notifier_range to infer + * the kind of update that is happening (as an example you can look at the + * mmu_notifier_range_update_to_read_only() function). + */ +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) +#define MMU_NOTIFIER_USE_CHANGE_PTE (1 << 1) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -55,8 +75,6 @@ struct mmu_notifier_mm { spinlock_t lock; }; -#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) - struct mmu_notifier_range { struct vm_area_struct *vma; struct mm_struct *mm; @@ -268,6 +286,12 @@ mmu_notifier_range_blockable(const struct mmu_notifier_range *range) return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } +static inline bool +mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range) +{ + return (range->flags & MMU_NOTIFIER_USE_CHANGE_PTE); +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -509,6 +533,12 @@ mmu_notifier_range_blockable(const struct mmu_notifier_range *range) return true; } +static inline bool +mmu_notifier_range_use_change_pte(const struct mmu_notifier_range *range) +{ + return false; +} + static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; diff --git a/mm/ksm.c b/mm/ksm.c index b782fadade8f..41e51882f999 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1066,9 +1066,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, BUG_ON(PageTransCompound(page)); - 
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, - pvmw.address, - pvmw.address + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, + MMU_NOTIFIER_USE_CHANGE_PTE, vma, mm, + pvmw.address, pvmw.address + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); if (!page_vma_mapped_walk(&pvmw)) @@ -1155,8 +1155,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (!pmd) goto out; - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, - addr + PAGE_SIZE); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, + MMU_NOTIFIER_USE_CHANGE_PTE, + vma, mm, addr, addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); diff
[PATCH v5 8/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
From: Jérôme Glisse Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device driver or anyone who wish to optimize out update when they know that they already have the range map read only. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 mm/mmu_notifier.c| 10 ++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 0379956fff23..b6c004bd9f6a 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -259,6 +259,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) @@ -568,6 +570,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index abd88c466eb2..ee36068077b6 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + 
+bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still has the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v5 5/9] mm/mmu_notifier: contextual information for event triggering invalidation v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. Changes since v1: - add the flags parameter to init range flags This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<-- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +unsigned flags, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, flags, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... 
mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, 0, NULL, E2, E3, E4) ...> } -->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 5 - kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 12 mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 25 - mm/migrate.c | 5 - mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 3 ++- mm/rmap.c| 6 -- 14 files changed, 62 insertions(+), 30 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 92a91e7816d8..fcbd0e574917 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,7 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2386e71ac1b8..62f94cd85455 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -356,6 +356,9 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void 
mmu_notifier_range_init(struct mmu_notifier_range *range, + enum mmu_notifier_event event, + unsigned flags, + struct vm_area_struct *vma, struct mm_struct *mm,
[PATCH v5 2/9] mm/mmu_notifier: convert user range->blockable to helper function
From: Jérôme Glisse Use the mmu_notifier_range_blockable() helper function instead of directly dereferencing the range->blockable field. This is done to make it easier to change the mmu_notifier range field. This patch is the outcome of the following coccinelle patch: %<--- @@ identifier I1, FN; @@ FN(..., struct mmu_notifier_range *I1, ...) { <... -I1->blockable +mmu_notifier_range_blockable(I1) ...> } --->% spatch --in-place --sp-file blockable.spatch --dir . Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 ++-- drivers/infiniband/core/umem_odp.c | 5 +++-- drivers/xen/gntdev.c| 6 +++--- mm/hmm.c| 6 +++--- mm/mmu_notifier.c | 2 +- virt/kvm/kvm_main.c | 3 ++- 8 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..58ed401c5996 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -256,14 +256,14 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); 
return -EAGAIN; } @@ -299,7 +299,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end = range->end - 1; - if (amdgpu_mn_read_lock(amn, range->blockable)) + if (amdgpu_mn_read_lock(amn, mmu_notifier_range_blockable(range))) return -EAGAIN; it = interval_tree_iter_first(&amn->objects, range->start, end); @@ -307,7 +307,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 1d3f9a31ad61..777b3f8727e7 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -122,7 +122,7 @@ userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, while (it) { struct drm_i915_gem_object *obj; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; break; } diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..c9bd1278f573 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -133,7 +133,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, /* TODO we should be able to split locking for interval tree and * the tear down.
*/ - if (range->blockable) + if (mmu_notifier_range_blockable(range)) mutex_lock(&rmn->lock); else if (!mutex_trylock(&rmn->lock)) return -EAGAIN; @@ -144,7 +144,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; - if (!range->blockable) { + if (!mmu_notifier_range_blockable(range)) { ret = -EAGAIN; goto out_unlock; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 012044f16d1c..3a3f1538d295 100644 --- a/drivers/infiniband/core/umem_odp.c +++
[PATCH v5 4/9] mm/mmu_notifier: contextual information for event enums
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index c8672c366f67..2386e71ac1b8 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() 
or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v5 7/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening This patch is just passing down the new informations by adding it to the mmu_notifier_range structure. Changes since v1: - Initialize flags field from mmu_notifier_range_init() arguments Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 62f94cd85455..0379956fff23 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -58,10 +58,12 @@ struct mmu_notifier_mm { #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; + enum mmu_notifier_event event; }; struct mmu_notifier_ops { @@ -363,10 +365,12 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; - range->flags = 0; + range->flags = flags; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- 
2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v5 0/9] mmu notifier provide context informations
From: Jérôme Glisse Since last version [4] i added the extra bits needed for the change_pte optimization (which is a KSM thing). Here i am not posting users of this, they will be posted to the appropriate sub-systems (KVM, GPU, RDMA, ...) once this serie get upstream. If you want to look at users of this see [5] [6]. If this gets in 5.1 then i will be submitting those users for 5.2 (including KVM if KVM folks feel comfortable with it). Note that this serie does not change any behavior for any existing code. It just pass down more informations to mmu notifier listener. The rational for this patchset: CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Without this serie, driver are force to assume that every notification is an munmap which triggers useless trashing within drivers that associate structure with range of virtual address. Each driver is force to free up its tracking structure and then restore it on next device page fault. With this serie we can also optimize device page table update [5]. 
More over this can also be use to optimize out some page table updates like for KVM where we can update the secondary MMU directly from the callback instead of clearing it. Patches to leverage this serie will be posted separately to each sub- system. Cheers, Jérôme [1] v1 https://lkml.org/lkml/2018/3/23/1049 [2] v2 https://lkml.org/lkml/2018/12/5/10 [3] v3 https://lkml.org/lkml/2018/12/13/620 [4] v4 https://lkml.org/lkml/2019/1/23/838 [5] patches to use this: https://lkml.org/lkml/2019/1/23/833 https://lkml.org/lkml/2019/1/23/834 https://lkml.org/lkml/2019/1/23/832 https://lkml.org/lkml/2019/1/23/831 [6] KVM restore change pte optimization https://patchwork.kernel.org/cover/10791179/ Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann Jérôme Glisse (9): mm/mmu_notifier: helper to test if a range invalidation is blockable mm/mmu_notifier: convert user range->blockable to helper function mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags mm/mmu_notifier: contextual information for event enums mm/mmu_notifier: contextual information for event triggering invalidation v2 mm/mmu_notifier: use correct mmu_notifier events for each invalidation mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening v2 mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper mm/mmu_notifier: set MMU_NOTIFIER_USE_CHANGE_PTE flag where appropriate v2 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 8 +-- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/gpu/drm/radeon/radeon_mn.c | 4 +- drivers/infiniband/core/umem_odp.c | 5 +- drivers/xen/gntdev.c| 6 
+- fs/proc/task_mmu.c | 3 +- include/linux/mmu_notifier.h| 93 +++-- kernel/events/uprobes.c | 3 +- mm/hmm.c| 6 +- mm/huge_memory.c| 14 ++-- mm/hugetlb.c| 12 ++-- mm/khugepaged.c | 3 +- mm/ksm.c| 9 ++- mm/madvise.c| 3 +- mm/memory.c | 26 --- mm/migrate.c| 5 +- mm/mmu_notifier.c | 12 +++- mm/mprotect.c | 4 +- mm/mremap.c | 3 +- mm/oom_kill.c | 3 +- mm/rmap.c | 6 +- virt/kvm/kvm_main.c | 3 +- 22 files changed, 180 insertions(+), 53 deletions(-) -- 2.17.2 ___
[PATCH v5 6/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation
From: Jérôme Glisse This update each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 4 ++-- kernel/events/uprobes.c | 2 +- mm/huge_memory.c| 14 ++ mm/hugetlb.c| 8 mm/khugepaged.c | 2 +- mm/ksm.c| 4 ++-- mm/madvise.c| 2 +- mm/memory.c | 14 +++--- mm/migrate.c| 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 33 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fcbd0e574917..3b93ce496dd4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1151,8 +1151,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, - NULL, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY, + 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 46f546bdba00..8e8342080013 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git 
a/mm/huge_memory.c b/mm/huge_memory.c index c9d638f1b34e..1da6ca0f0f6d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1184,9 +1184,8 @@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1349,9 +1348,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); spin_lock(vmf->ptl); @@ -2028,7 +2026,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(); @@ -2247,7 +2245,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d9e5c5a4c004..a58115c6b0a3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3250,7 +3250,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { -
[PATCH v5 1/9] mm/mmu_notifier: helper to test if a range invalidation is blockable
From: Jérôme Glisse Simple helpers to test if range invalidation is blockable. Latter patches use cocinnelle to convert all direct dereference of range-> blockable to use this function instead so that we can convert the blockable field to an unsigned for more flags. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 11 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..e630def131ce 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -226,6 +226,12 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return range->blockable; +} + static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) @@ -455,6 +461,11 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, #define mmu_notifier_range_init(range, mm, start, end) \ _mmu_notifier_range_init(range, start, end) +static inline bool +mmu_notifier_range_blockable(const struct mmu_notifier_range *range) +{ + return true; +} static inline int mm_has_notifiers(struct mm_struct *mm) { -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v5 3/9] mm/mmu_notifier: convert mmu_notifier_range->blockable to a flags
From: Jérôme Glisse Use an unsigned field for flags other than blockable and convert the blockable field to be one of those flags. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Joonas Lahtinen Cc: Jani Nikula Cc: Rodrigo Vivi Cc: Jan Kara Cc: Andrea Arcangeli Cc: Peter Xu Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index e630def131ce..c8672c366f67 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,11 +25,13 @@ struct mmu_notifier_mm { spinlock_t lock; }; +#define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; - bool blockable; + unsigned flags; }; struct mmu_notifier_ops { @@ -229,7 +231,7 @@ extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { - return range->blockable; + return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) @@ -275,7 +277,7 @@ static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = true; + range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } } @@ -284,7 +286,7 @@ static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) { - range->blockable = false; + range->flags &= ~MMU_NOTIFIER_RANGE_BLOCKABLE; return 
__mmu_notifier_invalidate_range_start(range); } return 0; @@ -331,6 +333,7 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, range->mm = mm; range->start = start; range->end = end; + range->flags = 0; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 4/5] mm/hmm: add support for peer to peer to HMM device memory
From: Jérôme Glisse Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- include/linux/hmm.h | 47 + mm/hmm.c| 63 + 2 files changed, 105 insertions(+), 5 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 4a1454e3efba..7a3ac182cc48 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -710,6 +710,53 @@ struct hmm_devmem_ops { const struct page *page, unsigned int flags, pmd_t *pmdp); + + /* +* p2p_map() - map page for peer to peer between device +* @devmem: device memory structure (see struct hmm_devmem) +* @range: range of virtual address that is being mapped +* @device: device the range is being map to +* @addr: first virtual address in the range to consider +* @pa: device address (where actual mapping is store) +* Returns: number of page successfuly mapped, 0 otherwise +* +* Map page belonging to devmem to another device for peer to peer +* access. Device can decide not to map in which case memory will +* be migrated to main memory. +* +* Also there is no garantee that all the pages in the range does +* belongs to the devmem so it is up to the function to check that +* every single page does belong to devmem. +* +* Note for now we do not care about error exect error, so on failure +* function should just return 0. 
+*/ + long (*p2p_map)(struct hmm_devmem *devmem, + struct hmm_range *range, + struct device *device, + unsigned long addr, + dma_addr_t *pas); + + /* +* p2p_unmap() - unmap page from peer to peer between device +* @devmem: device memory structure (see struct hmm_devmem) +* @range: range of virtual address that is being mapped +* @device: device the range is being map to +* @addr: first virtual address in the range to consider +* @pa: device address (where actual mapping is store) +* Returns: number of page successfuly unmapped, 0 otherwise +* +* Unmap page belonging to devmem previously map with p2p_map(). +* +* Note there is no garantee that all the pages in the range does +* belongs to the devmem so it is up to the function to check that +* every single page does belong to devmem. +*/ + unsigned long (*p2p_unmap)(struct hmm_devmem *devmem, + struct hmm_range *range, + struct device *device, + unsigned long addr, + dma_addr_t *pas); }; /* diff --git a/mm/hmm.c b/mm/hmm.c index 1a444885404e..fd49b1e116d0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1193,16 +1193,19 @@ long hmm_range_dma_map(struct hmm_range *range, dma_addr_t *daddrs, bool block) { - unsigned long i, npages, mapped, page_size; + unsigned long i, npages, mapped, page_size, addr; long ret; +again: ret = hmm_range_fault(range, block); if (ret <= 0) return ret ? ret : -EBUSY; + mapped = 0; + addr = range->start; page_size = hmm_range_page_size(range); npages = (range->end - range->start) >> range->page_shift; - for (i = 0, mapped = 0; i < npages; ++i) { + for (i = 0; i < npages; ++i, addr += page_size) { enum dma_data_direction dir = DMA_FROM_DEVICE; struct page *page; @@ -1226,6 +1229,29 @@ long hmm_range_dma_map(struct hmm_range *range, goto unmap; } + if (is_device_private_page(page)) { + struct hmm_devmem *devmem = page->pgmap->data; + + if (!devmem->ops->p2p_map || !devmem->ops->p2p_unmap) { + /* Fall-back to main memory. 
*/ + range->default_flags |= + range->flags[HMM_PFN_DEVICE_PRIVATE]; + goto again; + } + + ret = devmem->ops->p2p_map(devmem, range, device, + addr, daddrs); + if (ret <= 0) { + /* Fall-back to main memory. */ + range->default_flags |= +
[RFC PATCH 3/5] mm/vma: add support for peer to peer to device vma
From: Jérôme Glisse Allow mmap of a device file to export device memory to peer to peer devices. This will allow for instance a network device to access a GPU's memory or to access a storage device's queue directly. The common case will be a vma created by a userspace device driver that is then shared with another userspace device driver, which calls into its kernel device driver to map that vma. The vma does not need to have any valid CPU mapping, so that only peer to peer devices might access its content. Or it could have a valid CPU mapping too, in which case it should point to the same memory for consistency. Note that peer to peer mapping is highly platform and device dependent and it might not work in all cases. However we do expect support for this to grow on more hardware platforms. This patch only adds new callbacks to vm_operations_struct; the bulk of the code lies within the common bus driver (like pci) and the device drivers (both the exporting and importing device). The current design mandates that the importer must obey mmu_notifier and invalidate any peer to peer mapping anytime a notification of invalidation happens for a range that has been peer to peer mapped. This allows the exporting device to easily invalidate mappings for any importer device. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. 
Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-ker...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- include/linux/mm.h | 38 ++ 1 file changed, 38 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index 80bb6408fe73..1bd60a90e575 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -429,6 +429,44 @@ struct vm_operations_struct { pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); + /* +* Optional for device driver that want to allow peer to peer (p2p) +* mapping of their vma (which can be back by some device memory) to +* another device. +* +* Note that the exporting device driver might not have map anything +* inside the vma for the CPU but might still want to allow a peer +* device to access the range of memory corresponding to a range in +* that vma. +* +* FOR PREDICTABILITY IF DRIVER SUCCESSFULY MAP A RANGE ONCE FOR A +* DEVICE THEN FURTHER MAPPING OF THE SAME IF THE VMA IS STILL VALID +* SHOULD ALSO BE SUCCESSFUL. Following this rule allow the importing +* device to map once during setup and report any failure at that time +* to the userspace. Further mapping of the same range might happen +* after mmu notifier invalidation over the range. The exporting device +* can use this to move things around (defrag BAR space for instance) +* or do other similar task. +* +* IMPORTER MUST OBEY mmu_notifier NOTIFICATION AND CALL p2p_unmap() +* WHEN A NOTIFIER IS CALL FOR THE RANGE ! THIS CAN HAPPEN AT ANY +* POINT IN TIME WITH NO LOCK HELD. +* +* In below function, the device argument is the importing device, +* the exporting device is the device to which the vma belongs. 
+*/ + long (*p2p_map)(struct vm_area_struct *vma, + struct device *device, + unsigned long start, + unsigned long end, + dma_addr_t *pa, + bool write); + long (*p2p_unmap)(struct vm_area_struct *vma, + struct device *device, + unsigned long start, + unsigned long end, + dma_addr_t *pa); + /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 5/5] mm/hmm: add support for peer to peer to special device vma
From: Jérôme Glisse Special device vma (mmap of a device file) can correspond to device driver object that some device driver might want to share with other device (giving access to). This add support for HMM to map those special device vma if the owning device (exporter) allows it. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- include/linux/hmm.h | 6 ++ mm/hmm.c| 156 ++-- 2 files changed, 128 insertions(+), 34 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 7a3ac182cc48..98ebe9f52432 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -137,6 +137,7 @@ enum hmm_pfn_flag_e { * result of vmf_insert_pfn() or vm_insert_page(). Therefore, it should not * be mirrored by a device, because the entry will never have HMM_PFN_VALID * set and the pfn value is undefined. + * HMM_PFN_P2P: this entry have been map as P2P ie the dma address is valid * * Driver provide entry value for none entry, error entry and special entry, * driver can alias (ie use same value for error and special for instance). 
It @@ -151,6 +152,7 @@ enum hmm_pfn_value_e { HMM_PFN_ERROR, HMM_PFN_NONE, HMM_PFN_SPECIAL, + HMM_PFN_P2P, HMM_PFN_VALUE_MAX }; @@ -250,6 +252,8 @@ static inline bool hmm_range_valid(struct hmm_range *range) static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, uint64_t pfn) { + if (pfn == range->values[HMM_PFN_P2P]) + return NULL; if (pfn == range->values[HMM_PFN_NONE]) return NULL; if (pfn == range->values[HMM_PFN_ERROR]) @@ -270,6 +274,8 @@ static inline struct page *hmm_pfn_to_page(const struct hmm_range *range, static inline unsigned long hmm_pfn_to_pfn(const struct hmm_range *range, uint64_t pfn) { + if (pfn == range->values[HMM_PFN_P2P]) + return -1UL; if (pfn == range->values[HMM_PFN_NONE]) return -1UL; if (pfn == range->values[HMM_PFN_ERROR]) diff --git a/mm/hmm.c b/mm/hmm.c index fd49b1e116d0..621a4f831483 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -1058,37 +1058,36 @@ long hmm_range_snapshot(struct hmm_range *range) } EXPORT_SYMBOL(hmm_range_snapshot); -/* - * hmm_range_fault() - try to fault some address in a virtual address range - * @range: range being faulted - * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: 0 on success ortherwise: - * -EINVAL: - * Invalid argument - * -ENOMEM: - * Out of memory. - * -EPERM: - * Invalid permission (for instance asking for write and range - * is read only). - * -EAGAIN: - * If you need to retry and mmap_sem was drop. This can only - * happens if block argument is false. - * -EBUSY: - * If the the range is being invalidated and you should wait for - * invalidation to finish. - * -EFAULT: - * Invalid (ie either no valid vma or it is illegal to access that - * range), number of valid pages in range->pfns[] (from range start - * address). - * - * This is similar to a regular CPU page fault except that it will not trigger - * any memory migration if the memory being faulted is not accessible by CPUs - * and caller does not ask for migration. 
- * - * On error, for one virtual address in the range, the function will mark the - * corresponding HMM pfn entry with an error flag. - */ -long hmm_range_fault(struct hmm_range *range, bool block) +static int hmm_vma_p2p_map(struct hmm_range *range, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct device *device, dma_addr_t *pas) +{ + struct hmm_vma_walk hmm_vma_walk; + unsigned long npages, i; + bool fault, write; + uint64_t *pfns; + int ret; + + i = (start - range->start) >> PAGE_SHIFT; + npages = (end - start) >> PAGE_SHIFT; + pfns = >pfns[i]; + pas = [i]; + + hmm_vma_walk.range = range; + hmm_vma_walk.fault = true; + hmm_range_need_fault(_vma_walk, pfns, npages, + 0, , ); + + ret = vma->vm_ops->p2p_map(vma, device, start, end, pas, write); + for (i = 0; i < npages; ++i) { + pfns[i] = ret ? range->values[HMM_PFN_ERROR] : + range->values[HMM_PFN_P2P]; + } + return ret; +} + +static long
[RFC PATCH 2/5] drivers/base: add a function to test peer to peer capability
From: Jérôme Glisse device_test_p2p() return true if two devices can peer to peer to each other. We add a generic function as different inter-connect can support peer to peer and we want to genericaly test this no matter what the inter-connect might be. However this version only support PCIE for now. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-ker...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- drivers/base/core.c| 20 include/linux/device.h | 1 + 2 files changed, 21 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index 0073b09bb99f..56023b00e108 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "base.h" #include "power/power.h" @@ -3167,3 +3168,22 @@ void device_set_of_node_from_dev(struct device *dev, const struct device *dev2) dev->of_node_reused = true; } EXPORT_SYMBOL_GPL(device_set_of_node_from_dev); + +/** + * device_test_p2p - test if two device can peer to peer to each other + * @devA: device A + * @devB: device B + * Returns: true if device can peer to peer to each other, false otherwise + */ +bool device_test_p2p(struct device *devA, struct device *devB) +{ + /* +* For now we only support PCIE peer to peer but other inter-connect +* can be added. 
+*/ + if (pci_test_p2p(devA, devB)) + return true; + + return false; +} +EXPORT_SYMBOL_GPL(device_test_p2p); diff --git a/include/linux/device.h b/include/linux/device.h index 6cb4640b6160..0d532d7f0779 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1250,6 +1250,7 @@ extern int device_online(struct device *dev); extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); extern void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); +bool device_test_p2p(struct device *devA, struct device *devB); static inline int dev_num_vf(struct device *dev) { -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 1/5] pci/p2p: add a function to test peer to peer capability
From: Jérôme Glisse device_test_p2p() return true if two devices can peer to peer to each other. We add a generic function as different inter-connect can support peer to peer and we want to genericaly test this no matter what the inter-connect might be. However this version only support PCIE for now. Signed-off-by: Jérôme Glisse Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-ker...@vger.kernel.org Cc: linux-...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org --- drivers/pci/p2pdma.c | 27 +++ include/linux/pci-p2pdma.h | 6 ++ 2 files changed, 33 insertions(+) diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c index c52298d76e64..620ac60babb5 100644 --- a/drivers/pci/p2pdma.c +++ b/drivers/pci/p2pdma.c @@ -797,3 +797,30 @@ ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, return sprintf(page, "%s\n", pci_name(p2p_dev)); } EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show); + +bool pci_test_p2p(struct device *devA, struct device *devB) + { + struct pci_dev *pciA, *pciB; + bool ret; + int tmp; + + /* +* For now we only support PCIE peer to peer but other inter-connect +* can be added. 
false : true; + +out: + pci_dev_put(pciB); + pci_dev_put(pciA); + return ret; +} +EXPORT_SYMBOL_GPL(pci_test_p2p); diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h index bca9bc3e5be7..7671cc499a08 100644 --- a/include/linux/pci-p2pdma.h +++ b/include/linux/pci-p2pdma.h @@ -36,6 +36,7 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev, bool *use_p2pdma); ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev, bool use_p2pdma); +bool pci_test_p2p(struct device *devA, struct device *devB); #else /* CONFIG_PCI_P2PDMA */ static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size, u64 offset) @@ -97,6 +98,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page, { return sprintf(page, "none\n"); } + +static inline bool pci_test_p2p(struct device *devA, struct device *devB) +{ + return false; +} #endif /* CONFIG_PCI_P2PDMA */ -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 0/5] Device peer to peer (p2p) through vma
From: Jérôme Glisse This patchset adds support for peer to peer between devices in two manners. First for device memory used through HMM in a process's regular address space (ie inside a regular vma that is not an mmap of a device file or special file). Second for special vma ie mmap of a device file; in this case some device drivers might want to allow another device to directly access the memory used for those special vma (note that the memory might not even be mapped to the CPU in this case). There are many use cases for this; they mainly fall into 2 categories: [A]-Allow a device to directly map and control another device's command queue. [B]-Allow a device to access another device's memory without disrupting the other device's computation. Corresponding workloads: [1]-A network device directly accesses and controls a block device's command queue so that it can do storage access without involving the CPU. This falls into [A]. [2]-An accelerator device is doing heavy computation and a network device is monitoring progress. Direct access to the accelerator's memory by the network device avoids the need to use much slower system memory. This falls into [B]. [3]-An accelerator device is doing heavy computation and a network device is streaming out the result. This avoids the need to first bounce the result through system memory (it saves both system memory and bandwidth). This falls into [B]. [4]-Chaining device computation. For instance a camera device takes a picture, streams it to a color correction device that streams it to final memory. This falls into [A and B]. People have more ideas on how to use this than I can list here. The intention of this patchset is to provide the means to achieve those and much more. I have done testing using nouveau and a Mellanox mlx5 where the mlx5 device can directly access GPU memory [1]. I intend to use this inside nouveau and help porting AMD ROCm RDMA to use this [2]. I believe other people have expressed interest in working on using this with network devices and block devices. 
From an implementation point of view this just adds 2 new callbacks to the vm_operations struct (for special device vma support) and 2 new callbacks to the HMM device memory structure for HMM device memory support. For now it needs the IOMMU off with ACS disabled and both devices to be on the same PCIE sub-tree (cannot cross the root complex). However the intention here is different from some other peer to peer work in that we do want to support IOMMU and are fine with going through the root complex in that case. In other words, the bandwidth advantage of avoiding the root complex is of less importance than the programming model for the feature. We do actually expect that this will be used mostly with the IOMMU enabled and thus with having to go through the root bridge. Another difference from other p2p solutions is that we do require that the importing device abide by mmu notifier invalidation so that the exporting device can always invalidate a mapping at any point in time. For this reason we do not need a struct page for the device memory. Also in all the cases the policy and final decision on whether to map or not is solely under the control of the exporting device. Finally the device memory might not even be mapped to the CPU and thus we have to go through the exporting device driver to get the physical address at which the memory is accessible. The core changes are minimal (adding new callbacks to some structs). IOMMU support will need little change too. Most of the code is in the drivers to implement export policy and BAR space management. Very gross playground with IOMMU support in [3] (top 3 patches). Cheers, Jérôme [1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-p2p [2] https://github.com/RadeonOpenCompute/ROCnRDMA [3] https://cgit.freedesktop.org/~glisse/linux/log/?h=wip-hmm-p2p Cc: Logan Gunthorpe Cc: Greg Kroah-Hartman Cc: Rafael J. 
Wysocki Cc: Bjorn Helgaas Cc: Christian Koenig Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: linux-...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Joerg Roedel Cc: io...@lists.linux-foundation.org Jérôme Glisse (5): pci/p2p: add a function to test peer to peer capability drivers/base: add a function to test peer to peer capability mm/vma: add support for peer to peer to device vma mm/hmm: add support for peer to peer to HMM device memory mm/hmm: add support for peer to peer to special device vma drivers/base/core.c| 20 drivers/pci/p2pdma.c | 27 + include/linux/device.h | 1 + include/linux/hmm.h| 53 + include/linux/mm.h | 38 +++ include/linux/pci-p2pdma.h | 6 + mm/hmm.c | 219 ++--- 7 files changed, 325 insertions(+), 39 deletions(-) -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org
[PATCH v4 9/9] RDMA/umem_odp: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/infiniband/core/umem_odp.c | 22 +++--- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index a4ec43093cb3..fa4e7fdcabfc 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -140,8 +140,15 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { + bool update_to_read_only = *((bool *)cookie); + ib_umem_notifier_start_account(item); - item->umem.context->invalidate_range(item, start, end); + /* +* If it is already read only and we are updating to read only then we +* do not need to change anything. So save time and skip this one. 
+*/ + if (!update_to_read_only || !item->read_only) + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -150,6 +157,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, { struct ib_ucontext_per_mm *per_mm = container_of(mn, struct ib_ucontext_per_mm, mn); + bool update_to_read_only; if (range->blockable) down_read(_mm->umem_rwsem); @@ -166,10 +174,13 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return 0; } + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + return rbt_ib_umem_for_each_in_range(_mm->umem_tree, range->start, range->end, invalidate_range_start_trampoline, -range->blockable, NULL); +range->blockable, +_to_read_only); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -363,6 +374,9 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, goto out_odp_data; } + /* Assume read only at first, each time GUP is call this is updated. */ + odp_data->read_only = true; + odp_data->dma_list = vzalloc(array_size(pages, sizeof(*odp_data->dma_list))); if (!odp_data->dma_list) { @@ -619,8 +633,10 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, goto out_put_task; } - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & ODP_WRITE_ALLOWED_BIT) { + umem_odp->read_only = false; flags |= FOLL_WRITE; + } start_idx = (user_virt - ib_umem_start(umem)) >> page_shift; k = start_idx; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0b1446fe2fab..8256668c6170 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -76,6 +76,7 @@ struct ib_umem_odp { struct completion notifier_completion; int dying; struct work_struct work; + bool read_only; }; static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v4 8/9] gpu/drm/i915: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/i915/i915_gem_userptr.c | 16 1 file changed, 16 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 9558582c105e..23330ac3d7ea 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -59,6 +59,7 @@ struct i915_mmu_object { struct interval_tree_node it; struct list_head link; struct work_struct work; + bool read_only; bool attached; }; @@ -119,6 +120,7 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, container_of(_mn, struct i915_mmu_notifier, mn); struct i915_mmu_object *mo; struct interval_tree_node *it; + bool update_to_read_only; LIST_HEAD(cancelled); unsigned long end; @@ -128,6 +130,8 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, /* interval ranges are inclusive, but invalidate range is exclusive */ end = range->end - 1; + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + spin_lock(>lock); it = interval_tree_iter_first(>objects, range->start, end); while (it) { @@ -145,6 +149,17 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, * object if it is not in the process of being destroyed. 
*/ mo = container_of(it, struct i915_mmu_object, it); + + /* +* If it is already read only and we are updating to +* read only then we do not need to change anything. +* So save time and skip this one. +*/ + if (update_to_read_only && mo->read_only) { + it = interval_tree_iter_next(it, range->start, end); + continue; + } + if (kref_get_unless_zero(>obj->base.refcount)) queue_work(mn->wq, >work); @@ -270,6 +285,7 @@ i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj, mo->mn = mn; mo->obj = obj; mo->it.start = obj->userptr.ptr; + mo->read_only = i915_gem_object_is_readonly(obj); mo->it.last = obj->userptr.ptr + obj->base.size - 1; INIT_WORK(>work, cancel_userptr); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v4 7/9] gpu/drm/amdgpu: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 3e6823fdd939..7880eda064cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -294,6 +294,7 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + bool update_to_read_only; unsigned long end; /* notification is exclusive, but interval is inclusive */ @@ -302,6 +303,8 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; @@ -317,6 +320,16 @@ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, list_for_each_entry(bo, >bos, mn_list) { struct kgd_mem *mem = bo->kfd_bo; + bool read_only; + + /* +* If it is already read only and we are updating to +* read only then we do not need to change anything. +* So save time and skip this one. 
+*/ + read_only = amdgpu_ttm_tt_is_readonly(bo->tbo.ttm); + if (update_to_read_only && read_only) + continue; if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, range->start, -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v4 4/9] mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening This patch is just passing down the new informations by adding it to the mmu_notifier_range structure. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 1 file changed, 4 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index a9808add4070..7514775817de 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -56,9 +56,11 @@ struct mmu_notifier_mm { }; struct mmu_notifier_range { + struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; + enum mmu_notifier_event event; bool blockable; }; @@ -354,6 +356,8 @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { + range->vma = vma; + range->event = event; range->mm = mm; range->start = start; range->end = end; -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v4 5/9] mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper
From: Jérôme Glisse Helper to test if a range is updated to read only (it is still valid to read from the range). This is useful for device driver or anyone who wish to optimize out update when they know that they already have the range map read only. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 4 mm/mmu_notifier.c| 10 ++ 2 files changed, 14 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 7514775817de..be873c431886 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -257,6 +257,8 @@ extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool +mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -553,6 +555,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) { } +#define mmu_notifier_range_update_to_read_only(r) false + #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9c884abc7850..0b2f77715a08 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -395,3 +395,13 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn, mmdrop(mm); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release); + +bool +mmu_notifier_range_update_to_read_only(const struct 
mmu_notifier_range *range) +{ + if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA) + return false; + /* Return true if the vma still have the read flag set. */ + return range->vma->vm_flags & VM_READ; +} +EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v4 6/9] gpu/drm/radeon: optimize out the case when a range is updated to read only
From: Jérôme Glisse When range of virtual address is updated read only and corresponding user ptr object are already read only it is pointless to do anything. Optimize this case out. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/radeon/radeon_mn.c | 13 + 1 file changed, 13 insertions(+) diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index b3019505065a..f77294f58e63 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -124,6 +124,7 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); struct ttm_operation_ctx ctx = { false, false }; struct interval_tree_node *it; + bool update_to_read_only; unsigned long end; int ret = 0; @@ -138,6 +139,8 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, else if (!mutex_trylock(>lock)) return -EAGAIN; + update_to_read_only = mmu_notifier_range_update_to_read_only(range); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct radeon_mn_node *node; @@ -153,10 +156,20 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, it = interval_tree_iter_next(it, range->start, end); list_for_each_entry(bo, >bos, mn_list) { + bool read_only; if (!bo->tbo.ttm || bo->tbo.ttm->state != tt_bound) continue; + /* +* If it is already read only and we are updating to +* read only then we do not need to change anything. +* So save time and skip this one. 
+*/ + read_only = radeon_ttm_tt_is_readonly(bo->tbo.ttm); + if (update_to_read_only && read_only) + continue; + r = radeon_bo_reserve(bo, true); if (r) { DRM_ERROR("(%ld) failed to reserve user bo\n", r); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v4 3/9] mm/mmu_notifier: use correct mmu_notifier events for each invalidation
From: Jérôme Glisse This update each existing invalidation to use the correct mmu notifier event that represent what is happening to the CPU page table. See the patch which introduced the events to see the rational behind this. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 2 +- kernel/events/uprobes.c | 2 +- mm/huge_memory.c| 14 ++ mm/hugetlb.c| 7 --- mm/khugepaged.c | 2 +- mm/ksm.c| 4 ++-- mm/madvise.c| 2 +- mm/memory.c | 16 mm/migrate.c| 4 ++-- mm/mprotect.c | 5 +++-- mm/rmap.c | 6 +++--- 11 files changed, 32 insertions(+), 32 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 57e7f98647d3..cce226f3305f 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,7 +1143,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, + mmu_notifier_range_init(, MMU_NOTIFY_SOFT_DIRTY, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b67fe7e59621..87e76a1dc758 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -174,7 +174,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct mem_cgroup *memcg; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, mm, addr, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, mm, addr, addr + PAGE_SIZE); VM_BUG_ON_PAGE(PageTransHuge(old_page), old_page); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b353e8b7876f..957d23754217 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1182,9 +1182,8 
@@ static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, cond_resched(); } - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1346,9 +1345,8 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, - haddr, - haddr + HPAGE_PMD_SIZE); + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, + haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); spin_lock(vmf->ptl); @@ -2025,7 +2023,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, address & HPAGE_PUD_MASK, (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE); mmu_notifier_invalidate_range_start(); @@ -2244,7 +2242,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; struct mmu_notifier_range range; - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, vma->vm_mm, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, vma->vm_mm, address & HPAGE_PMD_MASK, (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cbda46ad6a30..f691398ac6b6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; if (cow) { - mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, vma, src, + mmu_notifier_range_init(, MMU_NOTIFY_CLEAR, vma, src, vma->vm_start,
[PATCH v4 0/9] mmu notifier provide context informations
From: Jérôme Glisse Hi Andrew, i see that you still have my event patch in you queue [1]. This patchset replace that single patch and is broken down in further step so that it is easier to review and ascertain that no mistake were made during mechanical changes. Here are the step: Patch 1 - add the enum values Patch 2 - coccinelle semantic patch to convert all call site of mmu_notifier_range_init to default enum value and also to passing down the vma when it is available Patch 3 - update many call site to more accurate enum values Patch 4 - add the information to the mmu_notifier_range struct Patch 5 - helper to test if a range is updated to read only All the remaining patches are update to various driver to demonstrate how this new information get use by device driver. I build tested with make all and make all minus everything that enable mmu notifier ie building with MMU_NOTIFIER=no. Also tested with some radeon,amd gpu and intel gpu. If they are no objections i believe best plan would be to merge the the first 5 patches (all mm changes) through your queue for 5.1 and then to delay driver update to each individual driver tree for 5.2. This will allow each individual device driver maintainer time to more thouroughly test this more then my own testing. Note that i also intend to use this feature further in nouveau and HMM down the road. I also expect that other user like KVM might be interested into leveraging this new information to optimize some of there secondary page table invalidation. Here is an explaination on the rational for this patchset: CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. 
- UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). [1] https://www.ozlabs.org/~akpm/mmotm/broken-out/mm-mmu_notifier-contextual-information-for-event-triggering-invalidation-v2.patch Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann Jérôme Glisse (9): mm/mmu_notifier: contextual information for event enums mm/mmu_notifier: contextual information for event triggering invalidation mm/mmu_notifier: use correct mmu_notifier events for each invalidation mm/mmu_notifier: pass down vma and reasons why mmu notifier is happening mm/mmu_notifier: mmu_notifier_range_update_to_read_only() helper gpu/drm/radeon: optimize out the case when a range is updated to read only gpu/drm/amdgpu: optimize out the case when a range is updated to read only gpu/drm/i915: optimize out the case when a range is updated to read only RDMA/umem_odp: optimize out the case when a range is updated to read only drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 13 drivers/gpu/drm/i915/i915_gem_userptr.c | 16 ++ drivers/gpu/drm/radeon/radeon_mn.c | 13 drivers/infiniband/core/umem_odp.c | 22 +++-- fs/proc/task_mmu.c | 3 +- include/linux/mmu_notifier.h| 42 - include/rdma/ib_umem_odp.h | 1 
+ kernel/events/uprobes.c | 3 +- mm/huge_memory.c| 14 + mm/hugetlb.c| 11 --- mm/khugepaged.c | 3 +- mm/ksm.c| 6 ++-- mm/madvise.c| 3 +- mm/memory.c | 25 +-- mm/migrate.c| 5 ++- mm/mmu_notifier.c | 10 ++ mm/mprotect.c | 4 ++- mm/mremap.c | 3 +- mm/oom_kill.c | 3 +- mm/rmap.c | 6 ++-- 20 files changed, 171 insertions(+), 35 deletions(-) -- 2.17.2 ___
[PATCH v4 2/9] mm/mmu_notifier: contextual information for event triggering invalidation
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset do the initial mechanical convertion of all the places that calls mmu_notifier_range_init to also provide the default MMU_NOTIFY_UNMAP event as well as the vma if it is know (most invalidation happens against a given vma). Passing down the vma allows the users of mmu notifier to inspect the new vma page protection. The MMU_NOTIFY_UNMAP is always the safe default as users of mmu notifier should assume that every for the range is going away when that event happens. A latter patch do convert mm call path to use a more appropriate events for each call. This is done as 2 patches so that no call site is forgotten especialy as it uses this following coccinelle patch: %<-- @@ identifier I1, I2, I3, I4; @@ static inline void mmu_notifier_range_init(struct mmu_notifier_range *I1, +enum mmu_notifier_event event, +struct vm_area_struct *vma, struct mm_struct *I2, unsigned long I3, unsigned long I4) { ... } @@ @@ -#define mmu_notifier_range_init(range, mm, start, end) +#define mmu_notifier_range_init(range, event, vma, mm, start, end) @@ expression E1, E3, E4; identifier I1; @@ <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, I1, I1->vm_mm, E3, E4) ...> @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(..., struct vm_area_struct *VMA, ...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN, VMA; @@ FN(...) { struct vm_area_struct *VMA; <... 
mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, VMA, E2, E3, E4) ...> } @@ expression E1, E2, E3, E4; identifier FN; @@ FN(...) { <... mmu_notifier_range_init(E1, +MMU_NOTIFY_UNMAP, NULL, E2, E3, E4) ...> } -->% Applied with: spatch --all-includes --sp-file mmu-notifier.spatch fs/proc/task_mmu.c --in-place spatch --sp-file mmu-notifier.spatch --dir kernel/events/ --in-place spatch --sp-file mmu-notifier.spatch --dir mm --in-place Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 4 +++- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 10 ++ mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 25 - mm/migrate.c | 5 - mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 3 ++- mm/rmap.c| 6 -- 14 files changed, 59 insertions(+), 30 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0ec9edab2f3..57e7f98647d3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1143,7 +1143,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, MMU_NOTIFY_UNMAP, + NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index abc9dbb7bcb6..a9808add4070 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -348,6 +348,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm) static inline void mmu_notifier_range_init(struct mmu_notifier_range 
*range, + enum mmu_notifier_event event, + struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) @@ -482,7 +484,7 @@ static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
[PATCH v4 1/9] mm/mmu_notifier: contextual information for event enums
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). This patch introduce a set of enums that can be associated with each of the events triggering a mmu notifier. Latter patches take advantages of those enum values. - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- include/linux/mmu_notifier.h | 30 ++ 1 file changed, 30 insertions(+) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 4050ec1c3b45..abc9dbb7bcb6 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -10,6 +10,36 @@ struct mmu_notifier; struct mmu_notifier_ops; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, 
...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags). User should soft dirty the page in the end callback to make + * sure that anyone relying on soft dirtyness catch pages that might be written + * through non CPU mappings. + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + #ifdef CONFIG_MMU_NOTIFIER /* -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v3 3/3] mm/mmu_notifier: contextual information for event triggering invalidation v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset adds event information so that users of mmu notifier can differentiate among broad category: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). 
Changes since v1: - use mmu_notifier_range_init() helper to to optimize out the case when mmu notifier is not enabled - use kernel doc format for describing the enum values Signed-off-by: Jérôme Glisse Acked-by: Christian König Acked-by: Jan Kara Acked-by: Felix Kuehling Acked-by: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/dax.c | 7 +++ fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 35 +-- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 10 ++ mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 18 -- mm/migrate.c | 5 +++-- mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 2 +- mm/rmap.c| 6 -- 15 files changed, 90 insertions(+), 29 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 874085bacaf5..6056b03a1626 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -768,6 +768,13 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); + /* +* All the field are populated by follow_pte_pmd() except +* the event field. 
+*/ + mmu_notifier_range_init(, NULL, 0, -1UL, + MMU_NOTIFY_PROTECTION_PAGE); + /* * Note because we provide start/end to follow_pte_pmd it will * call mmu_notifier_invalidate_range_start() on our behalf diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b3ddceb003bc..f68a9ebb0218 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1141,7 +1141,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, mm, 0, -1UL, + MMU_NOTIFY_SOFT_DIRTY); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 39b06772427f..d249e24acea5 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,10 +25,39 @@ struct mmu_notifier_mm { spinlock_t lock; }; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + *
[PATCH v3 2/3] mm/mmu_notifier: use structure for invalidate_range_start/end calls v3
From: Jérôme Glisse To avoid having to change many call sites everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end cakks. No functional changes with this patch. Changes since v2: - fix build warning in migrate.c when CONFIG_MMU_NOTIFIER=n Changes since v1: - introduce mmu_notifier_range_init() as an helper to initialize the range structure allowing to optimize out the case when mmu notifier is not enabled - fix mm/migrate.c migrate_vma_collect() Signed-off-by: Jérôme Glisse Acked-by: Christian König Acked-by: Jan Kara Acked-by: Felix Kuehling Acked-by: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- fs/dax.c | 8 +-- fs/proc/task_mmu.c | 7 ++- include/linux/mm.h | 4 +- include/linux/mmu_notifier.h | 87 +--- kernel/events/uprobes.c | 10 ++-- mm/huge_memory.c | 54 ++-- mm/hugetlb.c | 52 ++- mm/khugepaged.c | 10 ++-- mm/ksm.c | 21 mm/madvise.c | 21 mm/memory.c | 97 ++-- mm/migrate.c | 28 +-- mm/mmu_notifier.c| 35 +++-- mm/mprotect.c| 15 +++--- mm/mremap.c | 10 ++-- mm/oom_kill.c| 17 --- mm/rmap.c| 30 ++- 17 files changed, 259 insertions(+), 247 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 9bcce89ea18e..874085bacaf5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -758,7 +758,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, >i_mmap, index, index) { - unsigned long address, start, end; + struct mmu_notifier_range range; + unsigned long address; cond_resched(); @@ -772,7 +773,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, * call mmu_notifier_invalidate_range_start() on our behalf * before 
taking any lock. */ - if (follow_pte_pmd(vma->vm_mm, address, , , , , )) + if (follow_pte_pmd(vma->vm_mm, address, , + , , )) continue; /* @@ -814,7 +816,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, pte_unmap_unlock(ptep, ptl); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(); } i_mmap_unlock_read(mapping); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c3764c469b..b3ddceb003bc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1096,6 +1096,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; @@ -1139,11 +1140,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, downgrade_write(>mmap_sem); break; } - mmu_notifier_invalidate_range_start(mm, 0, -1); + + mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) - mmu_notifier_invalidate_range_end(mm, 0, -1); + mmu_notifier_invalidate_range_end(); tlb_finish_mmu(, 0, -1); up_read(>mmap_sem); out_mm: diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..e7b6f2b30713 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1397,6 +1397,8 @@ struct mm_walk { void *private; }; +struct mmu_notifier_range; + int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); @@ -1405,7 +1407,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
[PATCH v3 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback v2
From: Jérôme Glisse To avoid having to change many callback definition everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end callback. No functional changes with this patch. Changed since v1: - fix make htmldocs warning in amdgpu_mn.c Signed-off-by: Jérôme Glisse Acked-by: Jan Kara Acked-by: Felix Kuehling Acked-by: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 47 +++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 14 drivers/gpu/drm/radeon/radeon_mn.c | 16 - drivers/infiniband/core/umem_odp.c | 20 +-- drivers/infiniband/hw/hfi1/mmu_rb.c | 13 +++ drivers/misc/mic/scif/scif_dma.c| 11 ++ drivers/misc/sgi-gru/grutlbpurge.c | 14 drivers/xen/gntdev.c| 12 +++ include/linux/mmu_notifier.h| 14 +--- mm/hmm.c| 23 +--- mm/mmu_notifier.c | 21 +-- virt/kvm/kvm_main.c | 14 +++- 12 files changed, 103 insertions(+), 116 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index e55508b39496..3e6823fdd939 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -238,44 +238,40 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * amdgpu_mn_invalidate_range_start_gfx - callback to notify about mm change * * @mn: our notifier - * @mm: the mm this callback is about - * @start: start of updated range - * @end: end of updated range + * @range: mmu notifier context * * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. 
*/ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!blockable) { + if (!range->blockable) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, range->start, end); - amdgpu_mn_invalidate_node(node, start, end); + amdgpu_mn_invalidate_node(node, range->start, end); } return 0; @@ -294,39 +290,38 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node;
[PATCH v3 0/3] mmu notifier contextual information
From: Jérôme Glisse Changes since v2: - fix build warning with CONFIG_MMU_NOTIFIER=n - fix make htmldocs warning Changes since v1: - fix build with CONFIG_MMU_NOTIFIER=n - kernel docs Original cover letter: This patchset add contextual information, why an invalidation is happening, to mmu notifier callback. This is necessary for user of mmu notifier that wish to maintains their own data structure without having to add new fields to struct vm_area_struct (vma). For instance device can have they own page table that mirror the process address space. When a vma is unmap (munmap() syscall) the device driver can free the device page table for the range. Today we do not have any information on why a mmu notifier call back is happening and thus device driver have to assume that it is always an munmap(). This is inefficient at it means that it needs to re-allocate device page table on next page fault and rebuild the whole device driver data structure for the range. Other use case beside munmap() also exist, for instance it is pointless for device driver to invalidate the device page table when the invalidation is for the soft dirtyness tracking. Or device driver can optimize away mprotect() that change the page table permission access for the range. This patchset enable all this optimizations for device driver. I do not include any of those in this serie but other patchset i am posting will leverage this. From code point of view the patchset is pretty simple, the first two patches consolidate all mmu notifier arguments into a struct so that it is easier to add/change arguments. The last patch adds the contextual information (munmap, protection, soft dirty, clear, ...). 
Cheers, Jérôme Cc: Christian König Cc: Jan Kara Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: Arnd Bergmann Jérôme Glisse (3): mm/mmu_notifier: use structure for invalidate_range_start/end callback v2 mm/mmu_notifier: use structure for invalidate_range_start/end calls v3 mm/mmu_notifier: contextual information for event triggering invalidation v2 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 47 - drivers/gpu/drm/i915/i915_gem_userptr.c | 14 ++- drivers/gpu/drm/radeon/radeon_mn.c | 16 ++- drivers/infiniband/core/umem_odp.c | 20 ++-- drivers/infiniband/hw/hfi1/mmu_rb.c | 13 +-- drivers/misc/mic/scif/scif_dma.c| 11 +- drivers/misc/sgi-gru/grutlbpurge.c | 14 ++- drivers/xen/gntdev.c| 12 +-- fs/dax.c| 15 ++- fs/proc/task_mmu.c | 8 +- include/linux/mm.h | 4 +- include/linux/mmu_notifier.h| 132 ++-- kernel/events/uprobes.c | 11 +- mm/hmm.c| 23 ++--- mm/huge_memory.c| 58 +-- mm/hugetlb.c| 54 +- mm/khugepaged.c | 11 +- mm/ksm.c| 23 ++--- mm/madvise.c| 22 ++-- mm/memory.c | 103 +- mm/migrate.c| 29 +++--- mm/mmu_notifier.c | 22 ++-- mm/mprotect.c | 16 +-- mm/mremap.c | 11 +- mm/oom_kill.c | 17 +-- mm/rmap.c | 32 +++--- virt/kvm/kvm_main.c | 14 +-- 27 files changed, 406 insertions(+), 346 deletions(-) -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] dma-buf: balance refcount inbalance
From: Jérôme Glisse The debugfs take reference on fence without dropping them. Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Daniel Vetter Cc: Sumit Semwal Cc: linux-me...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-...@lists.linaro.org Cc: Stéphane Marchesin Cc: sta...@vger.kernel.org --- drivers/dma-buf/dma-buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 13884474d158..69842145c223 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1069,6 +1069,7 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), dma_fence_is_signaled(fence) ? "" : "un"); + dma_fence_put(fence); } rcu_read_unlock(); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] dma-buf: fix debugfs versus rcu and fence dumping v2
From: Jérôme Glisse The debugfs take reference on fence without dropping them. Also the rcu section are not well balance. Fix all that ... Changed since v1: - moved fobj logic around to be rcu safe Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Daniel Vetter Cc: Sumit Semwal Cc: linux-me...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-...@lists.linaro.org Cc: Stéphane Marchesin Cc: sta...@vger.kernel.org --- drivers/dma-buf/dma-buf.c | 21 - 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 13884474d158..9688d99894d6 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1048,27 +1048,38 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) while (true) { seq = read_seqcount_begin(>seq); rcu_read_lock(); - fobj = rcu_dereference(robj->fence); - shared_count = fobj ? fobj->shared_count : 0; fence = rcu_dereference(robj->fence_excl); + fence = dma_fence_get_rcu(fence); if (!read_seqcount_retry(>seq, seq)) break; rcu_read_unlock(); } - - if (fence) + if (fence) { seq_printf(s, "\tExclusive fence: %s %s %ssignalled\n", fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), dma_fence_is_signaled(fence) ? "" : "un"); - for (i = 0; i < shared_count; i++) { + dma_fence_put(fence); + } + + rcu_read_lock(); + fobj = rcu_dereference(robj->fence); + shared_count = fobj ? fobj->shared_count : 0; + for (i = 0, fence = NULL; i < shared_count; i++) { fence = rcu_dereference(fobj->shared[i]); if (!dma_fence_get_rcu(fence)) continue; + rcu_read_unlock(); + seq_printf(s, "\tShared fence: %s %s %ssignalled\n", fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), dma_fence_is_signaled(fence) ? "" : "un"); + dma_fence_put(fence); + + rcu_read_lock(); + fobj = rcu_dereference(robj->fence); + shared_count = fobj ? 
fobj->shared_count : 0; } rcu_read_unlock(); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] dma-buf: fix debugfs versus rcu and fence dumping
From: Jérôme Glisse The debugfs take reference on fence without dropping them. Also the rcu section are not well balance. Fix all that ... Signed-off-by: Jérôme Glisse Cc: Christian König Cc: Daniel Vetter Cc: Sumit Semwal Cc: linux-me...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linaro-mm-...@lists.linaro.org Cc: Stéphane Marchesin Cc: sta...@vger.kernel.org --- drivers/dma-buf/dma-buf.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 13884474d158..f6f4de42ac49 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -1051,24 +1051,31 @@ static int dma_buf_debug_show(struct seq_file *s, void *unused) fobj = rcu_dereference(robj->fence); shared_count = fobj ? fobj->shared_count : 0; fence = rcu_dereference(robj->fence_excl); + fence = dma_fence_get_rcu(fence); if (!read_seqcount_retry(>seq, seq)) break; rcu_read_unlock(); } - - if (fence) + if (fence) { seq_printf(s, "\tExclusive fence: %s %s %ssignalled\n", fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), dma_fence_is_signaled(fence) ? "" : "un"); + dma_fence_put(fence); + } + + rcu_read_lock(); for (i = 0; i < shared_count; i++) { fence = rcu_dereference(fobj->shared[i]); if (!dma_fence_get_rcu(fence)) continue; + rcu_read_unlock(); seq_printf(s, "\tShared fence: %s %s %ssignalled\n", fence->ops->get_driver_name(fence), fence->ops->get_timeline_name(fence), dma_fence_is_signaled(fence) ? "" : "un"); + dma_fence_put(fence); + rcu_read_lock(); } rcu_read_unlock(); -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH v2 3/3] mm/mmu_notifier: contextual information for event triggering invalidation v2
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset adds event information so that users of mmu notifier can differentiate among broad category: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). 
Changes since v1: - use mmu_notifier_range_init() helper to to optimize out the case when mmu notifier is not enabled - use kernel doc format for describing the enum values Signed-off-by: Jérôme Glisse Acked-by: Christian König Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org --- fs/dax.c | 7 +++ fs/proc/task_mmu.c | 3 ++- include/linux/mmu_notifier.h | 35 +-- kernel/events/uprobes.c | 3 ++- mm/huge_memory.c | 12 mm/hugetlb.c | 10 ++ mm/khugepaged.c | 3 ++- mm/ksm.c | 6 -- mm/madvise.c | 3 ++- mm/memory.c | 18 -- mm/migrate.c | 5 +++-- mm/mprotect.c| 3 ++- mm/mremap.c | 3 ++- mm/oom_kill.c| 2 +- mm/rmap.c| 6 -- 15 files changed, 90 insertions(+), 29 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 874085bacaf5..6056b03a1626 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -768,6 +768,13 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, address = pgoff_address(index, vma); + /* +* All the field are populated by follow_pte_pmd() except +* the event field. 
+*/ + mmu_notifier_range_init(, NULL, 0, -1UL, + MMU_NOTIFY_PROTECTION_PAGE); + /* * Note because we provide start/end to follow_pte_pmd it will * call mmu_notifier_invalidate_range_start() on our behalf diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b3ddceb003bc..f68a9ebb0218 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1141,7 +1141,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, break; } - mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_range_init(, mm, 0, -1UL, + MMU_NOTIFY_SOFT_DIRTY); mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 39b06772427f..d249e24acea5 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,10 +25,39 @@ struct mmu_notifier_mm { spinlock_t lock; }; +/** + * enum mmu_notifier_event - reason for the mmu notifier callback + * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that + * move the range + * + * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like + * madvise() or replacing a page by another one, ...). + * + * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range + * ie using the vma access permission (vm_page_prot) to update the whole range + * is enough no need to inspect changes to the CPU page table (mprotect() + * syscall) + * + * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for + * pages in the range so to mirror those changes the user must inspect the CPU + * page table (from the end callback). + * + * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same + * access flags) + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, +
[PATCH v2 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback
From: Jérôme Glisse To avoid having to change many callback definition everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end callback. No functional changes with this patch. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 43 +++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 14 drivers/gpu/drm/radeon/radeon_mn.c | 16 - drivers/infiniband/core/umem_odp.c | 20 +--- drivers/infiniband/hw/hfi1/mmu_rb.c | 13 +++- drivers/misc/mic/scif/scif_dma.c| 11 ++- drivers/misc/sgi-gru/grutlbpurge.c | 14 drivers/xen/gntdev.c| 12 +++ include/linux/mmu_notifier.h| 14 +--- mm/hmm.c| 23 ++--- mm/mmu_notifier.c | 21 ++-- virt/kvm/kvm_main.c | 14 +++- 12 files changed, 102 insertions(+), 113 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index e55508b39496..5bc7e59a05a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -246,36 +246,34 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * potentially dirty. 
*/ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!blockable) { + if (!range->blockable) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, range->start, end); - amdgpu_mn_invalidate_node(node, start, end); + amdgpu_mn_invalidate_node(node, range->start, end); } return 0; @@ -294,39 +292,38 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * are restorted in amdgpu_mn_invalidate_range_end_hsa. 
*/ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!blockable) { + if (!range->blockable) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, range->start, end);
[PATCH v2 2/3] mm/mmu_notifier: use structure for invalidate_range_start/end calls v2
From: Jérôme Glisse To avoid having to change many call sites everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end cakks. No functional changes with this patch. Changes since v1: - introduce mmu_notifier_range_init() as an helper to initialize the range structure allowing to optimize out the case when mmu notifier is not enabled - fix mm/migrate.c migrate_vma_collect() Signed-off-by: Jérôme Glisse Acked-by: Christian König Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org --- fs/dax.c | 8 +-- fs/proc/task_mmu.c | 7 ++- include/linux/mm.h | 4 +- include/linux/mmu_notifier.h | 87 +--- kernel/events/uprobes.c | 10 ++-- mm/huge_memory.c | 54 ++-- mm/hugetlb.c | 52 ++- mm/khugepaged.c | 10 ++-- mm/ksm.c | 21 mm/madvise.c | 21 mm/memory.c | 97 ++-- mm/migrate.c | 25 +- mm/mmu_notifier.c| 35 +++-- mm/mprotect.c| 15 +++--- mm/mremap.c | 10 ++-- mm/oom_kill.c| 17 --- mm/rmap.c| 30 ++- 17 files changed, 258 insertions(+), 245 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 9bcce89ea18e..874085bacaf5 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -758,7 +758,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, >i_mmap, index, index) { - unsigned long address, start, end; + struct mmu_notifier_range range; + unsigned long address; cond_resched(); @@ -772,7 +773,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. 
*/ - if (follow_pte_pmd(vma->vm_mm, address, , , , , )) + if (follow_pte_pmd(vma->vm_mm, address, , + , , )) continue; /* @@ -814,7 +816,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, pte_unmap_unlock(ptep, ptl); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(); } i_mmap_unlock_read(mapping); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c3764c469b..b3ddceb003bc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1096,6 +1096,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; @@ -1139,11 +1140,13 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, downgrade_write(>mmap_sem); break; } - mmu_notifier_invalidate_range_start(mm, 0, -1); + + mmu_notifier_range_init(, mm, 0, -1UL); + mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) - mmu_notifier_invalidate_range_end(mm, 0, -1); + mmu_notifier_invalidate_range_end(); tlb_finish_mmu(, 0, -1); up_read(>mmap_sem); out_mm: diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..e7b6f2b30713 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1397,6 +1397,8 @@ struct mm_walk { void *private; }; +struct mmu_notifier_range; + int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); @@ -1405,7 +1407,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, -unsigned long *start, unsigned
[PATCH 0/3] mmu notifier contextual informations
From: Jérôme Glisse This patchset add contextual information, why an invalidation is happening, to mmu notifier callback. This is necessary for user of mmu notifier that wish to maintains their own data structure without having to add new fields to struct vm_area_struct (vma). For instance device can have they own page table that mirror the process address space. When a vma is unmap (munmap() syscall) the device driver can free the device page table for the range. Today we do not have any information on why a mmu notifier call back is happening and thus device driver have to assume that it is always an munmap(). This is inefficient at it means that it needs to re-allocate device page table on next page fault and rebuild the whole device driver data structure for the range. Other use case beside munmap() also exist, for instance it is pointless for device driver to invalidate the device page table when the invalidation is for the soft dirtyness tracking. Or device driver can optimize away mprotect() that change the page table permission access for the range. This patchset enable all this optimizations for device driver. I do not include any of those in this serie but other patchset i am posting will leverage this. From code point of view the patchset is pretty simple, the first two patches consolidate all mmu notifier arguments into a struct so that it is easier to add/change arguments. The last patch adds the contextual information (munmap, protection, soft dirty, clear, ...). 
Cheers, Jérôme Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Jérôme Glisse (3): mm/mmu_notifier: use structure for invalidate_range_start/end callback mm/mmu_notifier: use structure for invalidate_range_start/end calls mm/mmu_notifier: contextual information for event triggering invalidation drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 43 - drivers/gpu/drm/i915/i915_gem_userptr.c | 14 ++- drivers/gpu/drm/radeon/radeon_mn.c | 16 ++-- drivers/infiniband/core/umem_odp.c | 20 ++--- drivers/infiniband/hw/hfi1/mmu_rb.c | 13 ++- drivers/misc/mic/scif/scif_dma.c| 11 +-- drivers/misc/sgi-gru/grutlbpurge.c | 14 ++- drivers/xen/gntdev.c| 12 +-- fs/dax.c| 11 ++- fs/proc/task_mmu.c | 10 ++- include/linux/mm.h | 4 +- include/linux/mmu_notifier.h| 106 +++--- kernel/events/uprobes.c | 13 +-- mm/hmm.c| 23 ++--- mm/huge_memory.c| 58 ++-- mm/hugetlb.c| 63 +++-- mm/khugepaged.c | 13 +-- mm/ksm.c| 26 +++--- mm/madvise.c| 22 ++--- mm/memory.c | 112 ++-- mm/migrate.c| 30 --- mm/mmu_notifier.c | 22 +++-- mm/mprotect.c | 17 ++-- mm/mremap.c | 14 +-- mm/oom_kill.c | 20 +++-- mm/rmap.c | 34 --- virt/kvm/kvm_main.c | 14 ++- 27 files changed, 421 insertions(+), 334 deletions(-) -- 2.17.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 3/3] mm/mmu_notifier: contextual information for event triggering invalidation
From: Jérôme Glisse CPU page table update can happens for many reasons, not only as a result of a syscall (munmap(), mprotect(), mremap(), madvise(), ...) but also as a result of kernel activities (memory compression, reclaim, migration, ...). Users of mmu notifier API track changes to the CPU page table and take specific action for them. While current API only provide range of virtual address affected by the change, not why the changes is happening. This patchset adds event information so that users of mmu notifier can differentiate among broad category: - UNMAP: munmap() or mremap() - CLEAR: page table is cleared (migration, compaction, reclaim, ...) - PROTECTION_VMA: change in access protections for the range - PROTECTION_PAGE: change in access protections for page in the range - SOFT_DIRTY: soft dirtyness tracking Being able to identify munmap() and mremap() from other reasons why the page table is cleared is important to allow user of mmu notifier to update their own internal tracking structure accordingly (on munmap or mremap it is not longer needed to track range of virtual address as it becomes invalid). 
Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org --- fs/dax.c | 1 + fs/proc/task_mmu.c | 1 + include/linux/mmu_notifier.h | 33 + kernel/events/uprobes.c | 1 + mm/huge_memory.c | 4 mm/hugetlb.c | 4 mm/khugepaged.c | 1 + mm/ksm.c | 2 ++ mm/madvise.c | 1 + mm/memory.c | 5 + mm/migrate.c | 2 ++ mm/mprotect.c| 1 + mm/mremap.c | 1 + mm/oom_kill.c| 1 + mm/rmap.c| 2 ++ 15 files changed, 60 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index e22508ee19ec..83092c5ac5f0 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -761,6 +761,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, struct mmu_notifier_range range; unsigned long address; + range.event = MMU_NOTIFY_PROTECTION_PAGE; range.mm = vma->vm_mm; cond_resched(); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 53d625925669..4abb1668eeb3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1144,6 +1144,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, range.start = 0; range.end = -1UL; range.mm = mm; + range.event = MMU_NOTIFY_SOFT_DIRTY; mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cbeece8e47d4..3077d487be8b 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -25,10 +25,43 @@ struct mmu_notifier_mm { spinlock_t lock; }; +/* + * What event is triggering the invalidation: + * + * MMU_NOTIFY_UNMAP + *either munmap() that unmap the range or a mremap() that move the range + * + * MMU_NOTIFY_CLEAR + *clear page table entry (many reasons for this like madvise() or replacing + *a page by another 
one, ...). + * + * MMU_NOTIFY_PROTECTION_VMA + *update is due to protection change for the range ie using the vma access + *permission (vm_page_prot) to update the whole range is enough no need to + *inspect changes to the CPU page table (mprotect() syscall) + * + * MMU_NOTIFY_PROTECTION_PAGE + *update is due to change in read/write flag for pages in the range so to + *mirror those changes the user must inspect the CPU page table (from the + *end callback). + * + * + * MMU_NOTIFY_SOFT_DIRTY + *soft dirty accounting (still same page and same access flags) + */ +enum mmu_notifier_event { + MMU_NOTIFY_UNMAP = 0, + MMU_NOTIFY_CLEAR, + MMU_NOTIFY_PROTECTION_VMA, + MMU_NOTIFY_PROTECTION_PAGE, + MMU_NOTIFY_SOFT_DIRTY, +}; + struct mmu_notifier_range { struct mm_struct *mm; unsigned long start; unsigned long end; + enum mmu_notifier_event event; bool blockable; }; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index aa7996ca361e..b6ef3be1c24e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -174,6 +174,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct mmu_notifier_range range; struct
[PATCH 2/3] mm/mmu_notifier: use structure for invalidate_range_start/end calls
From: Jérôme Glisse To avoid having to change many call sites everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end cakks. No functional changes with this patch. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org --- fs/dax.c | 10 +++- fs/proc/task_mmu.c | 9 ++- include/linux/mm.h | 4 +- include/linux/mmu_notifier.h | 59 +-- kernel/events/uprobes.c | 12 ++-- mm/huge_memory.c | 54 ++ mm/hugetlb.c | 59 ++- mm/khugepaged.c | 12 ++-- mm/ksm.c | 24 mm/madvise.c | 21 +++ mm/memory.c | 107 --- mm/migrate.c | 28 - mm/mmu_notifier.c| 35 +++- mm/mprotect.c| 16 -- mm/mremap.c | 13 +++-- mm/oom_kill.c| 19 --- mm/rmap.c| 32 +++ 17 files changed, 276 insertions(+), 238 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 9bcce89ea18e..e22508ee19ec 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -758,7 +758,10 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, >i_mmap, index, index) { - unsigned long address, start, end; + struct mmu_notifier_range range; + unsigned long address; + + range.mm = vma->vm_mm; cond_resched(); @@ -772,7 +775,8 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, * call mmu_notifier_invalidate_range_start() on our behalf * before taking any lock. 
*/ - if (follow_pte_pmd(vma->vm_mm, address, , , , , )) + if (follow_pte_pmd(vma->vm_mm, address, , + , , )) continue; /* @@ -814,7 +818,7 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, pte_unmap_unlock(ptep, ptl); } - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); + mmu_notifier_invalidate_range_end(); } i_mmap_unlock_read(mapping); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c3764c469b..53d625925669 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1096,6 +1096,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, }; @@ -1139,11 +1140,15 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, downgrade_write(>mmap_sem); break; } - mmu_notifier_invalidate_range_start(mm, 0, -1); + + range.start = 0; + range.end = -1UL; + range.mm = mm; + mmu_notifier_invalidate_range_start(); } walk_page_range(0, mm->highest_vm_end, _refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) - mmu_notifier_invalidate_range_end(mm, 0, -1); + mmu_notifier_invalidate_range_end(); tlb_finish_mmu(, 0, -1); up_read(>mmap_sem); out_mm: diff --git a/include/linux/mm.h b/include/linux/mm.h index 5411de93a363..e7b6f2b30713 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1397,6 +1397,8 @@ struct mm_walk { void *private; }; +struct mmu_notifier_range; + int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk); int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk); @@ -1405,7 +1407,7 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, -unsigned long *start, unsigned long *end, +struct mmu_notifier_range *range, pte_t **ptepp, pmd_t 
**pmdpp,
[PATCH 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback
From: Jérôme Glisse To avoid having to change many callback definition everytime we want to add a parameter use a structure to group all parameters for the mmu_notifier invalidate_range_start/end callback. No functional changes with this patch. Signed-off-by: Jérôme Glisse Cc: Andrew Morton Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Jan Kara Cc: Dan Williams Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Michal Hocko Cc: Christian Koenig Cc: Felix Kuehling Cc: Ralph Campbell Cc: John Hubbard Cc: k...@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: linux-fsde...@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 43 +++-- drivers/gpu/drm/i915/i915_gem_userptr.c | 14 drivers/gpu/drm/radeon/radeon_mn.c | 16 - drivers/infiniband/core/umem_odp.c | 20 +--- drivers/infiniband/hw/hfi1/mmu_rb.c | 13 +++- drivers/misc/mic/scif/scif_dma.c| 11 ++- drivers/misc/sgi-gru/grutlbpurge.c | 14 drivers/xen/gntdev.c| 12 +++ include/linux/mmu_notifier.h| 14 +--- mm/hmm.c| 23 ++--- mm/mmu_notifier.c | 21 ++-- virt/kvm/kvm_main.c | 14 +++- 12 files changed, 102 insertions(+), 113 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index e55508b39496..5bc7e59a05a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -246,36 +246,34 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * potentially dirty. 
*/ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; /* TODO we should be able to split locking for interval tree and * amdgpu_mn_invalidate_node */ - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; - if (!blockable) { + if (!range->blockable) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, range->start, end); - amdgpu_mn_invalidate_node(node, start, end); + amdgpu_mn_invalidate_node(node, range->start, end); } return 0; @@ -294,39 +292,38 @@ static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * are restorted in amdgpu_mn_invalidate_range_end_hsa. 
*/ static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) + const struct mmu_notifier_range *range) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; + unsigned long end; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = range->end - 1; - if (amdgpu_mn_read_lock(amn, blockable)) + if (amdgpu_mn_read_lock(amn, range->blockable)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, range->start, end); while (it) { struct amdgpu_mn_node *node; struct amdgpu_bo *bo; - if (!blockable) { + if (!range->blockable) { amdgpu_mn_read_unlock(amn); return -EAGAIN; } node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, range->start, end);
[PATCH 1/2] gpu/radeon: use HMM mirror instead of mmu_notifier
From: Jérôme Glisse HMM provide a sets of helpers to avoid individual drivers re-doing their own. This patch convert the radeon to use HMM mirror to track CPU page table update and invalidate accordingly for userptr object. Signed-off-by: Jérôme Glisse Cc: dri-devel@lists.freedesktop.org Cc: Alex Deucher Cc: Christian König Cc: Felix Kuehling Cc: David (ChunMing) Zhou Cc: Nicolai Hähnle Cc: amd-...@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter --- drivers/gpu/drm/radeon/radeon_mn.c | 126 ++--- 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index f8b35df44c60..a3bf74c1a3fc 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -30,7 +30,7 @@ #include #include -#include +#include #include #include @@ -40,7 +40,7 @@ struct radeon_mn { /* constant after initialisation */ struct radeon_device*rdev; struct mm_struct*mm; - struct mmu_notifier mn; + struct hmm_mirror mirror; /* only used on destruction */ struct work_struct work; @@ -87,72 +87,67 @@ static void radeon_mn_destroy(struct work_struct *work) } mutex_unlock(>lock); mutex_unlock(>mn_lock); - mmu_notifier_unregister(>mn, rmn->mm); + hmm_mirror_unregister(>mirror); kfree(rmn); } /** * radeon_mn_release - callback to notify about mm destruction * - * @mn: our notifier - * @mn: the mm this callback is about + * @mirror: our mirror struct * * Shedule a work item to lazy destroy our notifier. 
*/ -static void radeon_mn_release(struct mmu_notifier *mn, - struct mm_struct *mm) +static void radeon_mirror_release(struct hmm_mirror *mirror) { - struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); + struct radeon_mn *rmn = container_of(mirror, struct radeon_mn, mirror); INIT_WORK(>work, radeon_mn_destroy); schedule_work(>work); } /** - * radeon_mn_invalidate_range_start - callback to notify about mm change + * radeon_sync_cpu_device_pagetables - callback to synchronize with mm changes * - * @mn: our notifier - * @mn: the mm this callback is about - * @start: start of updated range - * @end: end of updated range + * @mirror: our HMM mirror + * @update: update informations (start, end, event, blockable, ...) * - * We block for all BOs between start and end to be idle and - * unmap them by move them into system domain again. + * We block for all BOs between start and end to be idle and unmap them by + * moving them into system domain again (trigger a call to ttm_backend_func. + * unbind see radeon_ttm.c). */ -static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, -struct mm_struct *mm, -unsigned long start, -unsigned long end, -bool blockable) +static int radeon_sync_cpu_device_pagetables(struct hmm_mirror *mirror, +const struct hmm_update *update) { - struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); + struct radeon_mn *rmn = container_of(mirror, struct radeon_mn, mirror); struct ttm_operation_ctx ctx = { false, false }; struct interval_tree_node *it; + unsigned long end; int ret = 0; /* notification is exclusive, but interval is inclusive */ - end -= 1; + end = update->end - 1; /* TODO we should be able to split locking for interval tree and * the tear down. 
*/ - if (blockable) + if (update->blockable) mutex_lock(>lock); else if (!mutex_trylock(>lock)) return -EAGAIN; - it = interval_tree_iter_first(>objects, start, end); + it = interval_tree_iter_first(>objects, update->start, end); while (it) { struct radeon_mn_node *node; struct radeon_bo *bo; long r; - if (!blockable) { + if (!update->blockable) { ret = -EAGAIN; goto out_unlock; } node = container_of(it, struct radeon_mn_node, it); - it = interval_tree_iter_next(it, start, end); + it = interval_tree_iter_next(it, update->start, end); list_for_each_entry(bo, >bos, mn_list) { @@ -178,16 +173,16 @@ static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, radeon_bo_unreserve(bo); } } - + out_unlock: mutex_unlock(>lock); return ret; } -static const struct mmu_notifier_ops radeon_mn_ops = { - .release =
[PATCH 2/2] gpu/radeon: use HMM mirror for userptr buffer object.
From: Jérôme Glisse This replace existing code that rely on get_user_page() aka GUP with code that now use HMM mirror to mirror a range of virtual address as a buffer object accessible by the GPU. There is no functional changes from userspace point of view. From kernel point of view we no longer pin pages for userptr buffer object which is a welcome change (i am assuming that everyone dislike page pin as i do). Signed-off-by: Jérôme Glisse Cc: dri-devel@lists.freedesktop.org Cc: Alex Deucher Cc: Christian König Cc: Felix Kuehling Cc: David (ChunMing) Zhou Cc: Nicolai Hähnle Cc: amd-...@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter --- drivers/gpu/drm/radeon/radeon.h | 14 ++- drivers/gpu/drm/radeon/radeon_gem.c | 16 +-- drivers/gpu/drm/radeon/radeon_mn.c | 157 +++- drivers/gpu/drm/radeon/radeon_ttm.c | 129 +++ 4 files changed, 196 insertions(+), 120 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 1a6f6edb3515..6c83bf911e9c 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -514,6 +514,8 @@ struct radeon_bo { pid_t pid; struct radeon_mn*mn; + uint64_t*pfns; + unsigned long userptr; struct list_headmn_list; }; #define gem_to_radeon_bo(gobj) container_of((gobj), struct radeon_bo, gem_base) @@ -1787,12 +1789,22 @@ void radeon_test_syncing(struct radeon_device *rdev); #if defined(CONFIG_MMU_NOTIFIER) int radeon_mn_register(struct radeon_bo *bo, unsigned long addr); void radeon_mn_unregister(struct radeon_bo *bo); +int radeon_mn_bo_map(struct radeon_bo *bo, struct ttm_dma_tt *dma, bool write); +void radeon_mn_bo_unmap(struct radeon_bo *bo, struct ttm_dma_tt *dma, + bool write); #else static inline int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) { return -ENODEV; } static inline void radeon_mn_unregister(struct radeon_bo *bo) {} +static int radeon_mn_bo_map(struct radeon_bo *bo, struct ttm_dma_tt *dma, + bool write) +{ + return -ENODEV; +} +static void 
radeon_mn_bo_unmap(struct radeon_bo *bo, struct ttm_dma_tt *dma, + bool write) {} #endif /* @@ -2818,7 +2830,7 @@ extern void radeon_legacy_set_clock_gating(struct radeon_device *rdev, int enabl extern void radeon_atom_set_clock_gating(struct radeon_device *rdev, int enable); extern void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain); extern bool radeon_ttm_bo_is_radeon_bo(struct ttm_buffer_object *bo); -extern int radeon_ttm_tt_set_userptr(struct ttm_tt *ttm, uint64_t addr, +extern int radeon_ttm_tt_set_userptr(struct ttm_tt *ttm, struct radeon_bo *bo, uint32_t flags); extern bool radeon_ttm_tt_has_userptr(struct ttm_tt *ttm); extern bool radeon_ttm_tt_is_readonly(struct ttm_tt *ttm); diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index 27d8e7dd2d06..b489025086c4 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -323,15 +323,19 @@ int radeon_gem_userptr_ioctl(struct drm_device *dev, void *data, goto handle_lockup; bo = gem_to_radeon_bo(gobj); - r = radeon_ttm_tt_set_userptr(bo->tbo.ttm, args->addr, args->flags); + + /* +* Always register an HMM mirror (if one is not already registered). +* This means ignoring RADEON_GEM_USERPTR_REGISTER flag but that flag +* is already made mandatory by flags sanity check above. 
+*/ + r = radeon_mn_register(bo, args->addr); if (r) goto release_object; - if (args->flags & RADEON_GEM_USERPTR_REGISTER) { - r = radeon_mn_register(bo, args->addr); - if (r) - goto release_object; - } + r = radeon_ttm_tt_set_userptr(bo->tbo.ttm, bo, args->flags); + if (r) + goto release_object; if (args->flags & RADEON_GEM_USERPTR_VALIDATE) { down_read(>mm->mmap_sem); diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index a3bf74c1a3fc..ff53ffa5deef 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -262,9 +262,18 @@ int radeon_mn_register(struct radeon_bo *bo, unsigned long addr) struct list_head bos; struct interval_tree_node *it; + bo->userptr = addr; + bo->pfns = kvmalloc_array(bo->tbo.num_pages, sizeof(uint64_t), + GFP_KERNEL | __GFP_ZERO); + if (bo->pfns == NULL) + return -ENOMEM; + rmn = radeon_mn_get(rdev); - if (IS_ERR(rmn)) + if (IS_ERR(rmn)) { + kvfree(bo->pfns); +
[PATCH 2/2] gpu/i915: use HMM mirror for userptr buffer object.
From: Jérôme Glisse This replace existing code that rely on get_user_page() aka GUP with code that now use HMM mirror to mirror a range of virtual address as a buffer object accessible by the GPU. There is no functional changes from userspace point of view. From kernel point of view we no longer pin pages for userptr buffer object which is a welcome change (i am assuming that everyone dislike page pin as i do). Another change, from kernel point of view, is that it does no longer have a fast path with get_user_pages_fast() this can eventually added back through HMM. Signed-off-by: Jérôme Glisse Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter Cc: Chris Wilson Cc: Lionel Landwerlin Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: intel-...@lists.freedesktop.org --- drivers/gpu/drm/i915/i915_gem_userptr.c | 206 1 file changed, 102 insertions(+), 104 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 5e09b654b5ad..378aab438ebd 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -464,7 +464,7 @@ __i915_gem_userptr_alloc_pages(struct drm_i915_gem_object *obj, static int __i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, - bool value) + struct hmm_range *range) { int ret = 0; @@ -486,86 +486,120 @@ __i915_gem_userptr_set_active(struct drm_i915_gem_object *obj, /* In order to serialise get_pages with an outstanding * cancel_userptr, we must drop the struct_mutex and try again. 
*/ - if (!value) + if (range) { + if (!hmm_vma_range_done(range)) + ret = -EAGAIN; + else + add_object(obj->userptr.mmu_object); + } else del_object(obj->userptr.mmu_object); - else if (!work_pending(>userptr.mmu_object->work)) - add_object(obj->userptr.mmu_object); - else - ret = -EAGAIN; spin_unlock(>userptr.mmu_object->mirror->lock); #endif return ret; } -static void -__i915_gem_userptr_get_pages_worker(struct work_struct *_work) +static int +i915_gem_userptr_map(struct drm_i915_gem_object *obj, bool try) { - struct get_pages_work *work = container_of(_work, typeof(*work), work); - struct drm_i915_gem_object *obj = work->obj; - const int npages = obj->base.size >> PAGE_SHIFT; +#if defined(CONFIG_HMM_MIRROR) + static const uint64_t i915_range_flags[HMM_PFN_FLAG_MAX] = { + (1 << 0), /* HMM_PFN_VALID */ + (1 << 1), /* HMM_PFN_WRITE */ + 0 /* HMM_PFN_DEVICE_PRIVATE */ + }; + static const uint64_t i915_range_values[HMM_PFN_VALUE_MAX] = { + 0xfffeUL, /* HMM_PFN_ERROR */ + 0, /* HMM_PFN_NONE */ + 0xfffcUL /* HMM_PFN_SPECIAL */ + }; + + const unsigned long npages = obj->base.size >> PAGE_SHIFT; + struct mm_struct *mm = obj->userptr.mm->mm; + struct sg_table *pages; + struct hmm_range range; struct page **pvec; - int pinned, ret; - - ret = -ENOMEM; - pinned = 0; - - pvec = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - if (pvec != NULL) { - struct mm_struct *mm = obj->userptr.mm->mm; - unsigned int flags = 0; - - if (!i915_gem_object_is_readonly(obj)) - flags |= FOLL_WRITE; - - ret = -EFAULT; - if (mmget_not_zero(mm)) { - down_read(>mmap_sem); - while (pinned < npages) { - ret = get_user_pages_remote - (work->task, mm, -obj->userptr.ptr + pinned * PAGE_SIZE, -npages - pinned, -flags, -pvec + pinned, NULL, NULL); - if (ret < 0) - break; - - pinned += ret; - } - up_read(>mmap_sem); - mmput(mm); - } + unsigned long i; + bool write = !i915_gem_object_is_readonly(obj); + int err; + + range.pfns = kvmalloc_array(npages, sizeof(uint64_t), + try ? 
GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN : GFP_KERNEL); + if (range.pfns == NULL) + return try ? -EAGAIN : -ENOMEM; + + range.pfn_shift = 12; + range.start = obj->userptr.ptr; + range.flags = i915_range_flags; + range.values = i915_range_values; + range.end = range.start + obj->base.size; + +
[PATCH 0/2] [i915] Getting rid of GUP and use HMM for user ptr features.
From: Jérôme Glisse [This depends on some HMM patchset queued upstream see branch [1]] This is a simple change to switch to using HMM for user ptr buffer objects, which conveniently avoids pinning pages. I have more things in the pipe to make HMM more useful for such cases (like sharing more resources across multiple mirrors of the same process). Besides avoiding pinning, this is also an attempt to isolate core mm from device drivers by having a clearly defined API and boundary where we can set expectations for everyone, without mm folks having to read and understand driver code and conversely without driver folks having to understand the mm maze. This is also part of what I want to discuss during XDC2018. Consider this as an RFC to start the discussion. [1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-intel-v00 Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter Cc: Chris Wilson Cc: Lionel Landwerlin Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: intel-...@lists.freedesktop.org Jérôme Glisse (2): gpu/i915: use HMM mirror instead of mmu_notifier gpu/i915: use HMM mirror for userptr buffer object. drivers/gpu/drm/i915/Kconfig| 4 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 395 2 files changed, 199 insertions(+), 200 deletions(-) -- 2.17.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 0/2] [radeon] Getting rid of GUP and use HMM for user ptr features.
From: Jérôme Glisse [This depends on some HMM patchset queued upstream see branch [1]] This is a simple change to switch to using HMM for user ptr buffer objects, which conveniently avoids pinning pages. I have more things in the pipe to make HMM more useful for such cases (like sharing more resources across multiple mirrors of the same process). Besides avoiding pinning, this is also an attempt to isolate core mm from device drivers by having a clearly defined API and boundary where we can set expectations for everyone, without mm folks having to read and understand driver code and conversely without driver folks having to understand the mm maze. This is also part of what I want to discuss during XDC2018. Consider this as an RFC to start the discussion. [1] https://cgit.freedesktop.org/~glisse/linux/log/?h=hmm-radeon-v00 Cc: dri-devel@lists.freedesktop.org Cc: Alex Deucher Cc: Christian König Cc: Felix Kuehling Cc: David (ChunMing) Zhou Cc: Nicolai Hähnle Cc: amd-...@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter Jérôme Glisse (2): gpu/radeon: use HMM mirror instead of mmu_notifier gpu/radeon: use HMM mirror for userptr buffer object. drivers/gpu/drm/radeon/radeon.h | 14 +- drivers/gpu/drm/radeon/radeon_gem.c | 16 +- drivers/gpu/drm/radeon/radeon_mn.c | 283 +--- drivers/gpu/drm/radeon/radeon_ttm.c | 129 ++--- 4 files changed, 259 insertions(+), 183 deletions(-) -- 2.17.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 1/2] gpu/i915: use HMM mirror instead of mmu_notifier
From: Jérôme Glisse HMM provide a sets of helpers to avoid individual drivers re-doing their own. This patch convert the radeon to use HMM mirror to track CPU page table update and invalidate accordingly for userptr object. Signed-off-by: Jérôme Glisse Cc: dri-devel@lists.freedesktop.org Cc: David Airlie Cc: Daniel Vetter Cc: Chris Wilson Cc: Lionel Landwerlin Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: intel-...@lists.freedesktop.org --- drivers/gpu/drm/i915/Kconfig| 4 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 189 2 files changed, 97 insertions(+), 96 deletions(-) diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 33a458b7f1fc..40bba0bd8124 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -87,10 +87,10 @@ config DRM_I915_COMPRESS_ERROR config DRM_I915_USERPTR bool "Always enable userptr support" depends on DRM_I915 - select MMU_NOTIFIER + select HMM_MIRROR default y help - This option selects CONFIG_MMU_NOTIFIER if it isn't already + This option selects CONFIG_HMM_MIRROR if it isn't already selected to enabled full userptr support. If in doubt, say "Y". 
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 2c9b284036d1..5e09b654b5ad 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -28,7 +28,7 @@ #include "i915_trace.h" #include "intel_drv.h" #include -#include +#include #include #include #include @@ -36,25 +36,25 @@ struct i915_mm_struct { struct mm_struct *mm; struct drm_i915_private *i915; - struct i915_mmu_notifier *mn; + struct i915_mirror *mirror; struct hlist_node node; struct kref kref; struct work_struct work; }; -#if defined(CONFIG_MMU_NOTIFIER) +#if defined(CONFIG_HMM_MIRROR) #include -struct i915_mmu_notifier { +struct i915_mirror { spinlock_t lock; struct hlist_node node; - struct mmu_notifier mn; + struct hmm_mirror mirror; struct rb_root_cached objects; struct workqueue_struct *wq; }; struct i915_mmu_object { - struct i915_mmu_notifier *mn; + struct i915_mirror *mirror; struct drm_i915_gem_object *obj; struct interval_tree_node it; struct list_head link; @@ -99,7 +99,7 @@ static void add_object(struct i915_mmu_object *mo) if (mo->attached) return; - interval_tree_insert(>it, >mn->objects); + interval_tree_insert(>it, >mirror->objects); mo->attached = true; } @@ -108,33 +108,29 @@ static void del_object(struct i915_mmu_object *mo) if (!mo->attached) return; - interval_tree_remove(>it, >mn->objects); + interval_tree_remove(>it, >mirror->objects); mo->attached = false; } -static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, - struct mm_struct *mm, - unsigned long start, - unsigned long end, - bool blockable) +static int i915_sync_cpu_device_pagetables(struct hmm_mirror *_mirror, + const struct hmm_update *update) { - struct i915_mmu_notifier *mn = - container_of(_mn, struct i915_mmu_notifier, mn); + struct i915_mirror *mirror = + container_of(_mirror, struct i915_mirror, mirror); + /* interval ranges are inclusive, but invalidate range is exclusive */ + unsigned long end = 
update->end - 1; struct i915_mmu_object *mo; struct interval_tree_node *it; LIST_HEAD(cancelled); - if (RB_EMPTY_ROOT(>objects.rb_root)) + if (RB_EMPTY_ROOT(>objects.rb_root)) return 0; - /* interval ranges are inclusive, but invalidate range is exclusive */ - end--; - - spin_lock(>lock); - it = interval_tree_iter_first(>objects, start, end); + spin_lock(>lock); + it = interval_tree_iter_first(>objects, update->start, end); while (it) { - if (!blockable) { - spin_unlock(>lock); + if (!update->blockable) { + spin_unlock(>lock); return -EAGAIN; } /* The mmu_object is released late when destroying the @@ -148,50 +144,56 @@ static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, */ mo = container_of(it, struct i915_mmu_object, it); if (kref_get_unless_zero(>obj->base.refcount)) - queue_work(mn->wq, >work); +
[PATCH] drm/nouveau: remove ghost file
From: Jérôme Glisse This ghost file has been haunting us. Signed-off-by: Jérôme Glisse --- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b. | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b. diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b. b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/gp10b. deleted file mode 100644 index e69de29bb2d1.. -- 2.17.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 08/13] drm/nouveau: special mapping method for HMM (user interface)
From: Jérôme GlisseSigned-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/include/nvif/if000c.h | 17 drivers/gpu/drm/nouveau/include/nvif/vmm.h| 2 + drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h | 25 drivers/gpu/drm/nouveau/nvif/vmm.c| 29 ++ drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c| 49 --- 5 files changed, 99 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000c.h b/drivers/gpu/drm/nouveau/include/nvif/if000c.h index 2928ecd989ad..2c24817ca533 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/if000c.h +++ b/drivers/gpu/drm/nouveau/include/nvif/if000c.h @@ -14,6 +14,8 @@ struct nvif_vmm_v0 { #define NVIF_VMM_V0_PUT0x02 #define NVIF_VMM_V0_MAP0x03 #define NVIF_VMM_V0_UNMAP 0x04 +#define NVIF_VMM_V0_HMM_MAP0x05 +#define NVIF_VMM_V0_HMM_UNMAP 0x06 struct nvif_vmm_page_v0 { __u8 version; @@ -61,4 +63,19 @@ struct nvif_vmm_unmap_v0 { __u8 pad01[7]; __u64 addr; }; + +struct nvif_vmm_hmm_map_v0 { + __u8 version; + __u8 pad01[7]; + __u64 addr; + __u64 npages; + __u64 pages; +}; + +struct nvif_vmm_hmm_unmap_v0 { + __u8 version; + __u8 pad01[7]; + __u64 addr; + __u64 npages; +}; #endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/vmm.h b/drivers/gpu/drm/nouveau/include/nvif/vmm.h index c5db8a2e82df..c5e4adaa0e3c 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/vmm.h +++ b/drivers/gpu/drm/nouveau/include/nvif/vmm.h @@ -39,4 +39,6 @@ void nvif_vmm_put(struct nvif_vmm *, struct nvif_vma *); int nvif_vmm_map(struct nvif_vmm *, u64 addr, u64 size, void *argv, u32 argc, struct nvif_mem *, u64 offset); int nvif_vmm_unmap(struct nvif_vmm *, u64); +int nvif_vmm_hmm_map(struct nvif_vmm *vmm, u64 addr, u64 npages, u64 *pages); +int nvif_vmm_hmm_unmap(struct nvif_vmm *vmm, u64 addr, u64 npages); #endif diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h index 719d50e6296f..8f08718e05aa 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h +++ 
b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h @@ -4,20 +4,6 @@ #include #include -/* Need to change HMM to be more driver friendly */ -#if IS_ENABLED(CONFIG_HMM) -#else -typedef unsigned long hmm_pfn_t; -#define HMM_PFN_VALID (1 << 0) -#define HMM_PFN_READ (1 << 1) -#define HMM_PFN_WRITE (1 << 2) -#define HMM_PFN_ERROR (1 << 3) -#define HMM_PFN_EMPTY (1 << 4) -#define HMM_PFN_SPECIAL (1 << 5) -#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6) -#define HMM_PFN_SHIFT 7 -#endif - struct nvkm_vma { struct list_head head; struct rb_node tree; @@ -79,10 +65,13 @@ struct nvkm_vmm_map { struct nvkm_mm_node *mem; struct scatterlist *sgl; dma_addr_t *dma; -#define NV_HMM_PAGE_FLAG_V HMM_PFN_VALID -#define NV_HMM_PAGE_FLAG_W HMM_PFN_WRITE -#define NV_HMM_PAGE_FLAG_E HMM_PFN_ERROR -#define NV_HMM_PAGE_PFN_SHIFT HMM_PFN_SHIFT +#define NV_HMM_PAGE_FLAG_V (1 << 0) +#define NV_HMM_PAGE_FLAG_R 0 +#define NV_HMM_PAGE_FLAG_W (1 << 1) +#define NV_HMM_PAGE_FLAG_E (-1ULL) +#define NV_HMM_PAGE_FLAG_N 0 +#define NV_HMM_PAGE_FLAG_S (1ULL << 63) +#define NV_HMM_PAGE_PFN_SHIFT 8 u64 *pages; u64 off; diff --git a/drivers/gpu/drm/nouveau/nvif/vmm.c b/drivers/gpu/drm/nouveau/nvif/vmm.c index 31cdb2d2e1ff..27a7b95b4e9c 100644 --- a/drivers/gpu/drm/nouveau/nvif/vmm.c +++ b/drivers/gpu/drm/nouveau/nvif/vmm.c @@ -32,6 +32,35 @@ nvif_vmm_unmap(struct nvif_vmm *vmm, u64 addr) sizeof(struct nvif_vmm_unmap_v0)); } +int +nvif_vmm_hmm_map(struct nvif_vmm *vmm, u64 addr, u64 npages, u64 *pages) +{ + struct nvif_vmm_hmm_map_v0 args; + int ret; + + args.version = 0; + args.addr = addr; + args.npages = npages; + args.pages = (uint64_t)pages; + ret = nvif_object_mthd(>object, NVIF_VMM_V0_HMM_MAP, + , sizeof(args)); + return ret; +} + +int +nvif_vmm_hmm_unmap(struct nvif_vmm *vmm, u64 addr, u64 npages) +{ + struct nvif_vmm_hmm_unmap_v0 args; + int ret; + + args.version = 0; + args.addr = addr; + args.npages = npages; + ret = nvif_object_mthd(>object, NVIF_VMM_V0_HMM_UNMAP, + , sizeof(args)); + return ret; +} 
+ int nvif_vmm_map(struct nvif_vmm *vmm, u64 addr, u64 size, void *argv, u32 argc, struct nvif_mem *mem, u64 offset) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c
[RFC PATCH 07/13] drm/nouveau: special mapping method for HMM
From: Jérôme GlisseHMM does not have any of the usual memory object properties. For HMM inside any range the following is true: - not all page in a range are valid - not all page have same permission (read only, read and write) - not all page are in same memory (system memory, GPU memory) Signed-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h | 21 + drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c | 105 - drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h | 6 ++ drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c | 73 ++ 4 files changed, 204 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h index baab93398e54..719d50e6296f 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h @@ -2,6 +2,21 @@ #ifndef __NVKM_MMU_H__ #define __NVKM_MMU_H__ #include +#include + +/* Need to change HMM to be more driver friendly */ +#if IS_ENABLED(CONFIG_HMM) +#else +typedef unsigned long hmm_pfn_t; +#define HMM_PFN_VALID (1 << 0) +#define HMM_PFN_READ (1 << 1) +#define HMM_PFN_WRITE (1 << 2) +#define HMM_PFN_ERROR (1 << 3) +#define HMM_PFN_EMPTY (1 << 4) +#define HMM_PFN_SPECIAL (1 << 5) +#define HMM_PFN_DEVICE_UNADDRESSABLE (1 << 6) +#define HMM_PFN_SHIFT 7 +#endif struct nvkm_vma { struct list_head head; @@ -56,6 +71,7 @@ void nvkm_vmm_part(struct nvkm_vmm *, struct nvkm_memory *inst); int nvkm_vmm_get(struct nvkm_vmm *, u8 page, u64 size, struct nvkm_vma **); void nvkm_vmm_put(struct nvkm_vmm *, struct nvkm_vma **); + struct nvkm_vmm_map { struct nvkm_memory *memory; u64 offset; @@ -63,6 +79,11 @@ struct nvkm_vmm_map { struct nvkm_mm_node *mem; struct scatterlist *sgl; dma_addr_t *dma; +#define NV_HMM_PAGE_FLAG_V HMM_PFN_VALID +#define NV_HMM_PAGE_FLAG_W HMM_PFN_WRITE +#define NV_HMM_PAGE_FLAG_E HMM_PFN_ERROR +#define NV_HMM_PAGE_PFN_SHIFT HMM_PFN_SHIFT + u64 *pages; u64 off; const struct 
nvkm_vmm_page *page; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c index 20d31526ba8f..96671987ce53 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c @@ -75,7 +75,7 @@ struct nvkm_vmm_iter { struct nvkm_vmm *vmm; u64 cnt; u16 max, lvl; - u64 start, addr; + u64 start, addr, *pages; u32 pte[NVKM_VMM_LEVELS_MAX]; struct nvkm_vmm_pt *pt[NVKM_VMM_LEVELS_MAX]; int flush; @@ -281,6 +281,59 @@ nvkm_vmm_unref_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes) return true; } +static bool +nvkm_vmm_unref_hmm_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes) +{ + const struct nvkm_vmm_desc *desc = it->desc; + const int type = desc->type == SPT; + struct nvkm_vmm_pt *pgt = it->pt[0]; + struct nvkm_mmu_pt *pt; + int mapped; + + pt = pgt->pt[type]; + mapped = desc->func->hmm_unmap(it->vmm, pt, ptei, ptes, NULL); + if (mapped <= 0) + return false; + ptes = mapped; + + /* Dual-PTs need special handling, unless PDE becoming invalid. */ + if (desc->type == SPT && (pgt->refs[0] || pgt->refs[1])) + nvkm_vmm_unref_sptes(it, pgt, desc, ptei, ptes); + + /* GPU may have cached the PTs, flush before freeing. */ + nvkm_vmm_flush_mark(it); + nvkm_vmm_flush(it); + + nvkm_kmap(pt->memory); + while (mapped--) { + u64 data = nvkm_ro64(pt->memory, pt->base + ptei * 8); + dma_addr_t dma = (data >> 8) << 12; + + if (!data) { + ptei++; + continue; + } + dma_unmap_page(it->vmm->mmu->subdev.device->dev, dma, + PAGE_SIZE, DMA_BIDIRECTIONAL); + VMM_WO064(pt, it->vmm, ptei++ * 8, 0UL); + } + nvkm_done(pt->memory); + + /* Drop PTE references. */ + pgt->refs[type] -= ptes; + + /* PT no longer neeed? Destroy it. */ + if (!pgt->refs[type]) { + it->lvl++; + TRA(it, "%s empty", nvkm_vmm_desc_type(desc)); + it->lvl--; + nvkm_vmm_unref_pdes(it); + return false; /* PTE writes for unmap() not necessary. 
*/ + } + + return true; +} + static void nvkm_vmm_ref_sptes(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgt, const struct nvkm_vmm_desc *desc, u32 ptei, u32 ptes) @@ -349,6 +402,32 @@ nvkm_vmm_ref_sptes(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgt, } } +static bool +nvkm_vmm_ref_hmm_ptes(struct nvkm_vmm_iter *it, u32 ptei, u32 ptes) +{ + const struct
[RFC PATCH 13/13] drm/nouveau: HACK FOR HMM AREA
From: Jérôme GlisseAllow userspace to create a virtual address range hole for GEM object. Signed-off-by: Jérôme Glisse --- drivers/gpu/drm/nouveau/nouveau_ttm.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c b/drivers/gpu/drm/nouveau/nouveau_ttm.c index dff51a0ee028..eafde4c6b7d4 100644 --- a/drivers/gpu/drm/nouveau/nouveau_ttm.c +++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c @@ -172,6 +172,13 @@ nouveau_ttm_mmap(struct file *filp, struct vm_area_struct *vma) if (unlikely(vma->vm_pgoff < DRM_FILE_PAGE_OFFSET)) return drm_legacy_mmap(filp, vma); + /* Hack for HMM */ + if (vma->vm_pgoff < (DRM_FILE_PAGE_OFFSET + (4UL << 30))) { + struct nouveau_cli *cli = file_priv->driver_priv; + + return nouveau_vmm_hmm(cli, filp, vma); + } + return ttm_bo_mmap(filp, vma, >ttm.bdev); } @@ -305,7 +312,7 @@ nouveau_ttm_init(struct nouveau_drm *drm) drm->ttm.bo_global_ref.ref.object, _bo_driver, dev->anon_inode->i_mapping, - DRM_FILE_PAGE_OFFSET, + DRM_FILE_PAGE_OFFSET + (4UL << 30), drm->client.mmu.dmabits <= 32 ? true : false); if (ret) { NV_ERROR(drm, "error initialising bo driver, %d\n", ret); -- 2.14.3 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 12/13] drm/nouveau: HMM area creation helpers for nouveau client
From: Jérôme GlisseHelpers to create area of virtual address under HMM control for a nouveau client. GPU access to HMM area are valid as long as the hole vma exist in the process virtual address space. Signed-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_hmm.c | 28 drivers/gpu/drm/nouveau/nouveau_hmm.h | 1 + drivers/gpu/drm/nouveau/nouveau_vmm.c | 83 +++ drivers/gpu/drm/nouveau/nouveau_vmm.h | 12 + 4 files changed, 124 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nouveau_hmm.c b/drivers/gpu/drm/nouveau/nouveau_hmm.c index a4c6f687f6a8..680e29bbf367 100644 --- a/drivers/gpu/drm/nouveau/nouveau_hmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_hmm.c @@ -245,6 +245,31 @@ nouveau_vmm_sync_pagetables(struct hmm_mirror *mirror, unsigned long start, unsigned long end) { + struct nouveau_hmm *hmm; + struct nouveau_cli *cli; + + hmm = container_of(mirror, struct nouveau_hmm, mirror); + if (!hmm->hole.vma || hmm->hole.start == hmm->hole.end) + return; + + /* Ignore area inside hole */ + end = min(end, TASK_SIZE); + if (start >= hmm->hole.start && end <= hmm->hole.end) + return; + if (start < hmm->hole.start && end > hmm->hole.start) { + nouveau_vmm_sync_pagetables(mirror, update, start, + hmm->hole.start); + start = hmm->hole.end; + } else if (start < hmm->hole.end && start >= hmm->hole.start) { + start = hmm->hole.end; + } + if (end <= start) + return; + + cli = container_of(hmm, struct nouveau_cli, hmm); + mutex_lock(>mutex); + nvif_vmm_hmm_unmap(>vmm.vmm, start, (end - start) >> PAGE_SHIFT); + mutex_unlock(>mutex); } static const struct hmm_mirror_ops nouveau_hmm_mirror_ops = { @@ -254,6 +279,8 @@ static const struct hmm_mirror_ops nouveau_hmm_mirror_ops = { void nouveau_hmm_fini(struct nouveau_cli *cli) { + struct nouveau_hmm *hmm = >hmm; + if (!cli->hmm.enabled) return; @@ -262,6 +289,7 @@ nouveau_hmm_fini(struct nouveau_cli *cli) nvif_object_fini(>hmm.rpfb); hmm_mirror_unregister(>hmm.mirror); + nvif_vmm_hmm_fini(>vmm.vmm, hmm->hole.start, 
hmm->hole.end); nouveau_vmm_sync_pagetables(>hmm.mirror, HMM_UPDATE_INVALIDATE, PAGE_SIZE, TASK_SIZE); } diff --git a/drivers/gpu/drm/nouveau/nouveau_hmm.h b/drivers/gpu/drm/nouveau/nouveau_hmm.h index 47f31cf8ac56..bc68dcf0748b 100644 --- a/drivers/gpu/drm/nouveau/nouveau_hmm.h +++ b/drivers/gpu/drm/nouveau/nouveau_hmm.h @@ -33,6 +33,7 @@ #if defined(CONFIG_HMM_MIRROR) && defined(CONFIG_DEVICE_PRIVATE) struct nouveau_hmm { + struct nouveau_vmm_hole hole; struct nvif_object rpfb; struct nvif_notify pending; struct task_struct *task; diff --git a/drivers/gpu/drm/nouveau/nouveau_vmm.c b/drivers/gpu/drm/nouveau/nouveau_vmm.c index f5371d96b003..8e6c47a99edb 100644 --- a/drivers/gpu/drm/nouveau/nouveau_vmm.c +++ b/drivers/gpu/drm/nouveau/nouveau_vmm.c @@ -115,6 +115,89 @@ nouveau_vma_new(struct nouveau_bo *nvbo, struct nouveau_vmm *vmm, return ret; } +static int +vmm_hole_fault(struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static void +vmm_hole_open(struct vm_area_struct *vma) +{ + struct nouveau_cli *cli = vma->vm_private_data; + struct nouveau_vmm_hole *hole = >hmm.hole; + + /* +* No need for atomic this happen under mmap_sem write lock. Make sure +* this assumption holds with a BUG_ON() +*/ + BUG_ON(down_read_trylock(>vm_mm->mmap_sem)); + hole->count++; +} + +static void +vmm_hole_close(struct vm_area_struct *vma) +{ + struct nouveau_cli *cli = vma->vm_private_data; + struct nouveau_vmm_hole *hole = >hmm.hole; + + /* +* No need for atomic this happen under mmap_sem write lock with one +* exception when a process is being kill (from do_exit()). For that +* reasons we don't test with BUG_ON(). 
+*/ + if ((--hole->count) <= 0) { + nouveau_hmm_fini(cli); + hole->vma = NULL; + } +} + +static int +vmm_hole_access(struct vm_area_struct *vma, unsigned long addr, + void *buf, int len, int write) +{ + return -EIO; +} + +static const struct vm_operations_struct vmm_hole_vm_ops = { + .access = vmm_hole_access, + .close = vmm_hole_close, + .fault = vmm_hole_fault, + .open = vmm_hole_open, +}; + +int +nouveau_vmm_hmm(struct nouveau_cli *cli, struct file *file, + struct vm_area_struct *vma) +{ + struct nouveau_vmm_hole *hole = >hmm.hole; + unsigned long size = vma->vm_end - vma->vm_start; +
[RFC PATCH 04/13] drm/nouveau/mmu/gp100: allow gcc/tex to generate replayable faults
From: Ben SkeggsSigned-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c index 059fafe0e771..8752d9ce4af0 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgp100.c @@ -315,7 +315,10 @@ gp100_vmm_flush(struct nvkm_vmm *vmm, int depth) int gp100_vmm_join(struct nvkm_vmm *vmm, struct nvkm_memory *inst) { - const u64 base = BIT_ULL(10) /* VER2 */ | BIT_ULL(11); /* 64KiB */ + const u64 base = BIT_ULL(4) /* FAULT_REPLAY_TEX */ | +BIT_ULL(5) /* FAULT_REPLAY_GCC */ | +BIT_ULL(10) /* VER2 */ | +BIT_ULL(11) /* 64KiB */; return gf100_vmm_join_(vmm, inst, base); } -- 2.14.3 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 06/13] drm/nouveau/fault/gp100: initial implementation of MaxwellFaultBufferA
From: Ben SkeggsSigned-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/include/nvif/class.h | 2 + drivers/gpu/drm/nouveau/include/nvif/clb069.h | 8 ++ .../gpu/drm/nouveau/include/nvkm/engine/fault.h| 1 + drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 6 + drivers/gpu/drm/nouveau/nvkm/engine/device/user.c | 1 + drivers/gpu/drm/nouveau/nvkm/engine/fault/Kbuild | 4 + drivers/gpu/drm/nouveau/nvkm/engine/fault/base.c | 116 ++ drivers/gpu/drm/nouveau/nvkm/engine/fault/gp100.c | 61 + drivers/gpu/drm/nouveau/nvkm/engine/fault/priv.h | 29 + drivers/gpu/drm/nouveau/nvkm/engine/fault/user.c | 136 + drivers/gpu/drm/nouveau/nvkm/engine/fault/user.h | 7 ++ 11 files changed, 371 insertions(+) create mode 100644 drivers/gpu/drm/nouveau/include/nvif/clb069.h create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fault/base.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fault/gp100.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fault/priv.h create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fault/user.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fault/user.h diff --git a/drivers/gpu/drm/nouveau/include/nvif/class.h b/drivers/gpu/drm/nouveau/include/nvif/class.h index a7c5bf572788..98ac250670b7 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/class.h +++ b/drivers/gpu/drm/nouveau/include/nvif/class.h @@ -52,6 +52,8 @@ #define NV04_DISP /* cl0046.h */ 0x0046 +#define MAXWELL_FAULT_BUFFER_A/* clb069.h */ 0xb069 + #define NV03_CHANNEL_DMA /* cl506b.h */ 0x006b #define NV10_CHANNEL_DMA /* cl506b.h */ 0x006e #define NV17_CHANNEL_DMA /* cl506b.h */ 0x176e diff --git a/drivers/gpu/drm/nouveau/include/nvif/clb069.h b/drivers/gpu/drm/nouveau/include/nvif/clb069.h new file mode 100644 index ..b0d509fd8631 --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvif/clb069.h @@ -0,0 +1,8 @@ +#ifndef __NVIF_CLB069_H__ +#define __NVIF_CLB069_H__ + +struct nvb069_vn { +}; + +#define NVB069_VN_NTFY_FAULT 0x00 +#endif diff --git 
a/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h index 398ca5a02eee..08893f13e2f9 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h @@ -1,4 +1,5 @@ #ifndef __NVKM_FAULT_H__ #define __NVKM_FAULT_H__ #include +int gp100_fault_new(struct nvkm_device *, int, struct nvkm_engine **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 2fe862ac0d95..ee67caf95a4e 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2184,6 +2184,7 @@ nv130_chipset = { .ce[5] = gp100_ce_new, .dma = gf119_dma_new, .disp = gp100_disp_new, + .fault = gp100_fault_new, .fifo = gp100_fifo_new, .gr = gp100_gr_new, .sw = gf100_sw_new, @@ -2217,6 +2218,7 @@ nv132_chipset = { .ce[3] = gp102_ce_new, .disp = gp102_disp_new, .dma = gf119_dma_new, + .fault = gp100_fault_new, .fifo = gp100_fifo_new, .gr = gp102_gr_new, .nvdec = gp102_nvdec_new, @@ -2252,6 +2254,7 @@ nv134_chipset = { .ce[3] = gp102_ce_new, .disp = gp102_disp_new, .dma = gf119_dma_new, + .fault = gp100_fault_new, .fifo = gp100_fifo_new, .gr = gp102_gr_new, .nvdec = gp102_nvdec_new, @@ -2287,6 +2290,7 @@ nv136_chipset = { .ce[3] = gp102_ce_new, .disp = gp102_disp_new, .dma = gf119_dma_new, + .fault = gp100_fault_new, .fifo = gp100_fifo_new, .gr = gp102_gr_new, .nvdec = gp102_nvdec_new, @@ -2322,6 +2326,7 @@ nv137_chipset = { .ce[3] = gp102_ce_new, .disp = gp102_disp_new, .dma = gf119_dma_new, + .fault = gp100_fault_new, .fifo = gp100_fifo_new, .gr = gp107_gr_new, .nvdec = gp102_nvdec_new, @@ -2382,6 +2387,7 @@ nv13b_chipset = { .top = gk104_top_new, .ce[2] = gp102_ce_new, .dma = gf119_dma_new, + .fault = gp100_fault_new, .fifo = gp10b_fifo_new, .gr = gp10b_gr_new, .sw = gf100_sw_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c 
b/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c index 17adcb4e8854..5eee439f615c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c @@ -276,6 +276,7 @@ nvkm_udevice_child_get(struct nvkm_object *object, int index, struct nvkm_device *device =
[RFC PATCH 11/13] drm/nouveau: add HMM area creation user interface
From: Jérôme GlisseUser API to create HMM area. Signed-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/include/nvif/if000c.h | 9 + drivers/gpu/drm/nouveau/include/nvif/vmm.h | 2 + drivers/gpu/drm/nouveau/nvif/vmm.c | 51 ++ drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c | 39 4 files changed, 101 insertions(+) diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000c.h b/drivers/gpu/drm/nouveau/include/nvif/if000c.h index 2c24817ca533..0383864b033b 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/if000c.h +++ b/drivers/gpu/drm/nouveau/include/nvif/if000c.h @@ -16,6 +16,8 @@ struct nvif_vmm_v0 { #define NVIF_VMM_V0_UNMAP 0x04 #define NVIF_VMM_V0_HMM_MAP0x05 #define NVIF_VMM_V0_HMM_UNMAP 0x06 +#define NVIF_VMM_V0_HMM_INIT 0x07 +#define NVIF_VMM_V0_HMM_FINI 0x08 struct nvif_vmm_page_v0 { __u8 version; @@ -78,4 +80,11 @@ struct nvif_vmm_hmm_unmap_v0 { __u64 addr; __u64 npages; }; + +struct nvif_vmm_hmm_v0 { + __u8 version; + __u8 pad01[7]; + __u64 start; + __u64 end; +}; #endif diff --git a/drivers/gpu/drm/nouveau/include/nvif/vmm.h b/drivers/gpu/drm/nouveau/include/nvif/vmm.h index c5e4adaa0e3c..f11f8c510ebd 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/vmm.h +++ b/drivers/gpu/drm/nouveau/include/nvif/vmm.h @@ -39,6 +39,8 @@ void nvif_vmm_put(struct nvif_vmm *, struct nvif_vma *); int nvif_vmm_map(struct nvif_vmm *, u64 addr, u64 size, void *argv, u32 argc, struct nvif_mem *, u64 offset); int nvif_vmm_unmap(struct nvif_vmm *, u64); +int nvif_vmm_hmm_init(struct nvif_vmm *vmm, u64 hstart, u64 hend); +void nvif_vmm_hmm_fini(struct nvif_vmm *vmm, u64 hstart, u64 hend); int nvif_vmm_hmm_map(struct nvif_vmm *vmm, u64 addr, u64 npages, u64 *pages); int nvif_vmm_hmm_unmap(struct nvif_vmm *vmm, u64 addr, u64 npages); #endif diff --git a/drivers/gpu/drm/nouveau/nvif/vmm.c b/drivers/gpu/drm/nouveau/nvif/vmm.c index 27a7b95b4e9c..788e02e47750 100644 --- a/drivers/gpu/drm/nouveau/nvif/vmm.c +++ b/drivers/gpu/drm/nouveau/nvif/vmm.c @@ -32,6 +32,57 @@ 
nvif_vmm_unmap(struct nvif_vmm *vmm, u64 addr) sizeof(struct nvif_vmm_unmap_v0)); } +int +nvif_vmm_hmm_init(struct nvif_vmm *vmm, u64 hstart, u64 hend) +{ + struct nvif_vmm_hmm_v0 args; + int ret; + + if (hstart > PAGE_SIZE) { + args.version = 0; + args.start = PAGE_SIZE; + args.end = hstart; + ret = nvif_object_mthd(>object, NVIF_VMM_V0_HMM_INIT, + , sizeof(args)); + if (ret) + return ret; + } + + args.version = 0; + args.start = hend; + args.end = TASK_SIZE; + ret = nvif_object_mthd(>object, NVIF_VMM_V0_HMM_INIT, + , sizeof(args)); + if (ret && hstart > PAGE_SIZE) { + args.version = 0; + args.start = PAGE_SIZE; + args.end = hstart; + nvif_object_mthd(>object, NVIF_VMM_V0_HMM_FINI, +, sizeof(args)); + } + return ret; +} + +void +nvif_vmm_hmm_fini(struct nvif_vmm *vmm, u64 hstart, u64 hend) +{ + struct nvif_vmm_hmm_v0 args; + + if (hstart > PAGE_SIZE) { + args.version = 0; + args.start = PAGE_SIZE; + args.end = hstart; + nvif_object_mthd(>object, NVIF_VMM_V0_HMM_FINI, +, sizeof(args)); + } + + args.version = 0; + args.start = hend; + args.end = TASK_SIZE; + nvif_object_mthd(>object, NVIF_VMM_V0_HMM_FINI, +, sizeof(args)); +} + int nvif_vmm_hmm_map(struct nvif_vmm *vmm, u64 addr, u64 npages, u64 *pages) { diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c index 739f2af02552..34e00aa73fd0 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/uvmm.c @@ -274,6 +274,43 @@ nvkm_uvmm_mthd_page(struct nvkm_uvmm *uvmm, void *argv, u32 argc) return 0; } +static int +nvkm_uvmm_mthd_hmm_init(struct nvkm_uvmm *uvmm, void *argv, u32 argc) +{ + union { + struct nvif_vmm_hmm_v0 v0; + } *args = argv; + struct nvkm_vmm *vmm = uvmm->vmm; + struct nvkm_vma *vma; + int ret = -ENOSYS; + + if ((ret = nvif_unpack(ret, , , args->v0, 0, 0, false))) + return ret; + + mutex_lock(>mutex); + ret = nvkm_vmm_hmm_init(vmm, args->v0.start,
[RFC PATCH 09/13] drm/nouveau: add SVM through HMM support to nouveau client
From: Jérôme GlisseSVM (Share Virtual Memory) through HMM (Heterogeneous Memory Management) to nouveau client. SVM means that any valid pointer (private anonymous, share memory or mmap of regular file) on the CPU is also valid on the GPU. To achieve SVM with nouveau we use HMM kernel infrastructure. There is one nouveau client object created each time the device file is open by a process, this is best we can achieve. Idealy we would like an object that exist for each process address space but there is no such thing in the kernel. Signed-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/Kbuild| 3 + drivers/gpu/drm/nouveau/nouveau_drm.c | 5 + drivers/gpu/drm/nouveau/nouveau_drv.h | 3 + drivers/gpu/drm/nouveau/nouveau_hmm.c | 339 ++ drivers/gpu/drm/nouveau/nouveau_hmm.h | 63 +++ 5 files changed, 413 insertions(+) create mode 100644 drivers/gpu/drm/nouveau/nouveau_hmm.c create mode 100644 drivers/gpu/drm/nouveau/nouveau_hmm.h diff --git a/drivers/gpu/drm/nouveau/Kbuild b/drivers/gpu/drm/nouveau/Kbuild index 9c0c650655e9..8e61e118ccfe 100644 --- a/drivers/gpu/drm/nouveau/Kbuild +++ b/drivers/gpu/drm/nouveau/Kbuild @@ -35,6 +35,9 @@ nouveau-y += nouveau_prime.o nouveau-y += nouveau_sgdma.o nouveau-y += nouveau_ttm.o nouveau-y += nouveau_vmm.o +ifdef CONFIG_HMM_MIRROR +nouveau-$(CONFIG_DEVICE_PRIVATE) += nouveau_hmm.o +endif # DRM - modesetting nouveau-$(CONFIG_DRM_NOUVEAU_BACKLIGHT) += nouveau_backlight.o diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 3e293029e3a6..e67b08ba8b80 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -167,6 +167,7 @@ nouveau_cli_work(struct work_struct *w) static void nouveau_cli_fini(struct nouveau_cli *cli) { + nouveau_hmm_fini(cli); nouveau_cli_work_flush(cli, true); usif_client_fini(cli); nouveau_vmm_fini(>vmm); @@ -965,6 +966,10 @@ nouveau_drm_open(struct drm_device *dev, struct drm_file *fpriv) list_add(>head, >clients); 
mutex_unlock(>client.mutex); + ret = nouveau_hmm_init(cli); + if (ret) + return ret; + done: if (ret && cli) { nouveau_cli_fini(cli); diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h index 96f6bd8aee5d..75c741d5125c 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drv.h +++ b/drivers/gpu/drm/nouveau/nouveau_drv.h @@ -65,6 +65,7 @@ struct platform_device; #include "nouveau_fence.h" #include "nouveau_bios.h" #include "nouveau_vmm.h" +#include "nouveau_hmm.h" struct nouveau_drm_tile { struct nouveau_fence *fence; @@ -104,6 +105,8 @@ struct nouveau_cli { struct list_head notifys; char name[32]; + struct nouveau_hmm hmm; + struct work_struct work; struct list_head worker; struct mutex lock; diff --git a/drivers/gpu/drm/nouveau/nouveau_hmm.c b/drivers/gpu/drm/nouveau/nouveau_hmm.c new file mode 100644 index ..a4c6f687f6a8 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nouveau_hmm.c @@ -0,0 +1,339 @@ +/* + * Copyright (C) 2018 Red Hat All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial + * portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Author: Jérôme Glisse, Ben Skeggs + */ +#include +#include +#include "nouveau_hmm.h" +#include "nouveau_drv.h" +#include "nouveau_bo.h" +#include +#include +#include + +struct fault_entry { + u32 instlo; + u32 insthi; + u32 addrlo; + u32 addrhi; + u32 timelo; + u32 timehi; + u32 rsvd; + u32 info; +}; + +#define NV_PFAULT_ACCESS_R 0 /* read */ +#define NV_PFAULT_ACCESS_W 1 /* write */ +#define NV_PFAULT_ACCESS_A 2 /* atomic */ +#define NV_PFAULT_ACCESS_P 3 /* prefetch */ + +static
[RFC PATCH 10/13] drm/nouveau: add HMM area creation
From: Jérôme GlisseHMM area is a virtual address range under HMM control, GPU access inside such range is like CPU access. For thing to work properly HMM range should cover everything except a reserved range for GEM buffer object. Signed-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c | 63 +++ drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h | 2 + 2 files changed, 65 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c index 96671987ce53..ef4b839932fa 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c @@ -1540,6 +1540,69 @@ nvkm_vmm_get_locked(struct nvkm_vmm *vmm, bool getref, bool mapref, bool sparse, return 0; } +int +nvkm_vmm_hmm_init(struct nvkm_vmm *vmm, u64 start, u64 end, + struct nvkm_vma **pvma) +{ + struct nvkm_vma *vma = NULL, *tmp; + struct rb_node *node; + + /* Locate smallest block that can possibly satisfy the allocation. 
*/ + node = vmm->free.rb_node; + while (node) { + struct nvkm_vma *this = rb_entry(node, typeof(*this), tree); + + if (this->addr <= start && (this->addr + this->size) >= end) { + rb_erase(>tree, >free); + vma = this; + break; + } + node = node->rb_left; + } + + if (vma == NULL) { + return -EINVAL; + } + + if (start != vma->addr) { + if (!(tmp = nvkm_vma_tail(vma, vma->size + vma->addr - start))) { + nvkm_vmm_put_region(vmm, vma); + return -ENOMEM; + } + nvkm_vmm_free_insert(vmm, vma); + vma = tmp; + } + + if (end < (vma->addr + vma->size)) { + if (!(tmp = nvkm_vma_tail(vma, vma->size + vma->addr - end))) { + nvkm_vmm_put_region(vmm, vma); + return -ENOMEM; + } + nvkm_vmm_free_insert(vmm, tmp); + } + + vma->mapref = false; + vma->sparse = false; + vma->page = NVKM_VMA_PAGE_NONE; + vma->refd = NVKM_VMA_PAGE_NONE; + vma->used = true; + nvkm_vmm_node_insert(vmm, vma); + *pvma = vma; + return 0; +} + +void +nvkm_vmm_hmm_fini(struct nvkm_vmm *vmm, u64 start, u64 end) +{ + struct nvkm_vma *vma; + u64 size = (end - start); + + vma = nvkm_vmm_node_search(vmm, start); + if (vma && vma->addr == start && vma->size == size) { + nvkm_vmm_put_locked(vmm, vma); + } +} + int nvkm_vmm_get(struct nvkm_vmm *vmm, u8 page, u64 size, struct nvkm_vma **pvma) { diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h index a630aa2a77e4..04d672a4dccb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h @@ -165,6 +165,8 @@ int nvkm_vmm_get_locked(struct nvkm_vmm *, bool getref, bool mapref, bool sparse, u8 page, u8 align, u64 size, struct nvkm_vma **pvma); void nvkm_vmm_put_locked(struct nvkm_vmm *, struct nvkm_vma *); +int nvkm_vmm_hmm_init(struct nvkm_vmm *, u64, u64, struct nvkm_vma **); +void nvkm_vmm_hmm_fini(struct nvkm_vmm *, u64, u64); void nvkm_vmm_unmap_locked(struct nvkm_vmm *, struct nvkm_vma *); void nvkm_vmm_unmap_region(struct nvkm_vmm *vmm, struct nvkm_vma *vma); void 
nvkm_vmm_hmm_map(struct nvkm_vmm *vmm, u64 addr, u64 npages, u64 *pages); -- 2.14.3 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 01/13] drm/nouveau/vmm: enable page table iterator over non populated range
From: Jérôme GlisseThis patch modify the page table iterator to support empty range when unmaping a range (ie when it is not trying to populate the range). Signed-off-by: Jérôme Glisse Cc: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c | 75 ++- 1 file changed, 51 insertions(+), 24 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c index 93946dcee319..20d31526ba8f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.c @@ -75,6 +75,7 @@ struct nvkm_vmm_iter { struct nvkm_vmm *vmm; u64 cnt; u16 max, lvl; + u64 start, addr; u32 pte[NVKM_VMM_LEVELS_MAX]; struct nvkm_vmm_pt *pt[NVKM_VMM_LEVELS_MAX]; int flush; @@ -485,6 +486,23 @@ nvkm_vmm_ref_swpt(struct nvkm_vmm_iter *it, struct nvkm_vmm_pt *pgd, u32 pdei) return true; } +static inline u64 +nvkm_vmm_iter_addr(const struct nvkm_vmm_iter *it, + const struct nvkm_vmm_desc *desc) +{ + int max = it->max; + u64 addr; + + /* Reconstruct address */ + addr = it->pte[max--]; + do { + addr = addr << desc[max].bits; + addr |= it->pte[max]; + } while (max--); + + return addr; +} + static inline u64 nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, u64 addr, u64 size, const char *name, bool ref, @@ -494,21 +512,23 @@ nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, { const struct nvkm_vmm_desc *desc = page->desc; struct nvkm_vmm_iter it; - u64 bits = addr >> page->shift; + u64 addr_bits = addr >> page->shift; it.page = page; it.desc = desc; it.vmm = vmm; it.cnt = size >> page->shift; it.flush = NVKM_VMM_LEVELS_MAX; + it.start = it.addr = addr; /* Deconstruct address into PTE indices for each mapping level. 
*/ for (it.lvl = 0; desc[it.lvl].bits; it.lvl++) { - it.pte[it.lvl] = bits & ((1 << desc[it.lvl].bits) - 1); - bits >>= desc[it.lvl].bits; + it.pte[it.lvl] = addr_bits & ((1 << desc[it.lvl].bits) - 1); + addr_bits >>= desc[it.lvl].bits; } it.max = --it.lvl; it.pt[it.max] = vmm->pd; + addr_bits = addr >> page->shift; it.lvl = 0; TRA(, "%s: %016llx %016llx %d %lld PTEs", name, @@ -521,7 +541,8 @@ nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, const int type = desc->type == SPT; const u32 pten = 1 << desc->bits; const u32 ptei = it.pte[0]; - const u32 ptes = min_t(u64, it.cnt, pten - ptei); + u32 ptes = min_t(u64, it.cnt, pten - ptei); + u64 tmp; /* Walk down the tree, finding page tables for each level. */ for (; it.lvl; it.lvl--) { @@ -529,9 +550,14 @@ nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, struct nvkm_vmm_pt *pgd = pgt; /* Software PT. */ - if (ref && NVKM_VMM_PDE_INVALID(pgd->pde[pdei])) { - if (!nvkm_vmm_ref_swpt(, pgd, pdei)) - goto fail; + if (NVKM_VMM_PDE_INVALID(pgd->pde[pdei])) { + if (ref) { + if (!nvkm_vmm_ref_swpt(, pgd, pdei)) + goto fail; + } else { + it.pte[it.lvl] += 1; + goto next; + } } it.pt[it.lvl - 1] = pgt = pgd->pde[pdei]; @@ -545,9 +571,16 @@ nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, if (!nvkm_vmm_ref_hwpt(, pgd, pdei)) goto fail; } + + /* With HMM we might walk down un-populated range */ + if (!pgt) { + it.pte[it.lvl] += 1; + goto next; + } } /* Handle PTE updates. */ + it.addr = nvkm_vmm_iter_addr(, desc) << PAGE_SHIFT; if (!REF_PTES || REF_PTES(, ptei, ptes)) { struct nvkm_mmu_pt *pt = pgt->pt[type]; if (MAP_PTES || CLR_PTES) { @@ -558,32 +591,26 @@ nvkm_vmm_iter(struct nvkm_vmm *vmm, const struct nvkm_vmm_page *page, nvkm_vmm_flush_mark(); } } +
[RFC PATCH 02/13] drm/nouveau/core/memory: add some useful accessor macros
From: Jérôme GlisseAdds support for 64-bits read. Signed-off-by: Jérôme Glisse --- drivers/gpu/drm/nouveau/include/nvkm/core/memory.h | 8 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h b/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h index 05f505de0075..d1a886c4d2d9 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/memory.h @@ -82,6 +82,14 @@ void nvkm_memory_tags_put(struct nvkm_memory *, struct nvkm_device *, nvkm_wo32((o), __a + 4, upper_32_bits(__d)); \ } while(0) +#define nvkm_ro64(o,a) ({ \ + u64 _data; \ + _data = nvkm_ro32((o), (a) + 4); \ + _data = _data << 32; \ + _data |= nvkm_ro32((o), (a) + 0); \ + _data; \ +}) + #define nvkm_fill(t,s,o,a,d,c) do { \ u64 _a = (a), _c = (c), _d = (d), _o = _a >> s, _s = _c << s; \ u##t __iomem *_m = nvkm_kmap(o); \ -- 2.14.3 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[RFC PATCH 03/13] drm/nouveau/core: define engine for handling replayable faults
From: Ben SkeggsSigned-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/include/nvkm/core/device.h | 3 +++ drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h | 4 drivers/gpu/drm/nouveau/nvkm/core/subdev.c | 1 + drivers/gpu/drm/nouveau/nvkm/engine/Kbuild | 1 + drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 2 ++ drivers/gpu/drm/nouveau/nvkm/engine/device/priv.h | 1 + drivers/gpu/drm/nouveau/nvkm/engine/fault/Kbuild| 0 7 files changed, 12 insertions(+) create mode 100644 drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/fault/Kbuild diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h index 560265b15ec2..de3d2566ee4d 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h @@ -42,6 +42,7 @@ enum nvkm_devidx { NVKM_ENGINE_CIPHER, NVKM_ENGINE_DISP, NVKM_ENGINE_DMAOBJ, + NVKM_ENGINE_FAULT, NVKM_ENGINE_FIFO, NVKM_ENGINE_GR, NVKM_ENGINE_IFB, @@ -147,6 +148,7 @@ struct nvkm_device { struct nvkm_engine *cipher; struct nvkm_disp *disp; struct nvkm_dma *dma; + struct nvkm_engine *fault; struct nvkm_fifo *fifo; struct nvkm_gr *gr; struct nvkm_engine *ifb; @@ -218,6 +220,7 @@ struct nvkm_device_chip { int (*cipher )(struct nvkm_device *, int idx, struct nvkm_engine **); int (*disp)(struct nvkm_device *, int idx, struct nvkm_disp **); int (*dma )(struct nvkm_device *, int idx, struct nvkm_dma **); + int (*fault )(struct nvkm_device *, int idx, struct nvkm_engine **); int (*fifo)(struct nvkm_device *, int idx, struct nvkm_fifo **); int (*gr )(struct nvkm_device *, int idx, struct nvkm_gr **); int (*ifb )(struct nvkm_device *, int idx, struct nvkm_engine **); diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h new file mode 100644 index ..398ca5a02eee --- /dev/null +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/fault.h @@ -0,0 
+1,4 @@ +#ifndef __NVKM_FAULT_H__ +#define __NVKM_FAULT_H__ +#include +#endif diff --git a/drivers/gpu/drm/nouveau/nvkm/core/subdev.c b/drivers/gpu/drm/nouveau/nvkm/core/subdev.c index a134d225f958..0d50b2206da2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/core/subdev.c +++ b/drivers/gpu/drm/nouveau/nvkm/core/subdev.c @@ -63,6 +63,7 @@ nvkm_subdev_name[NVKM_SUBDEV_NR] = { [NVKM_ENGINE_CIPHER ] = "cipher", [NVKM_ENGINE_DISP] = "disp", [NVKM_ENGINE_DMAOBJ ] = "dma", + [NVKM_ENGINE_FAULT ] = "fault", [NVKM_ENGINE_FIFO] = "fifo", [NVKM_ENGINE_GR ] = "gr", [NVKM_ENGINE_IFB ] = "ifb", diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/Kbuild index 78571e8b01c5..3aa90a6d5392 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/engine/Kbuild @@ -7,6 +7,7 @@ include $(src)/nvkm/engine/cipher/Kbuild include $(src)/nvkm/engine/device/Kbuild include $(src)/nvkm/engine/disp/Kbuild include $(src)/nvkm/engine/dma/Kbuild +include $(src)/nvkm/engine/fault/Kbuild include $(src)/nvkm/engine/fifo/Kbuild include $(src)/nvkm/engine/gr/Kbuild include $(src)/nvkm/engine/mpeg/Kbuild diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 05cd674326a6..2fe862ac0d95 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2466,6 +2466,7 @@ nvkm_device_engine(struct nvkm_device *device, int index) _(CIPHER , device->cipher , device->cipher); _(DISP , device->disp, >disp->engine); _(DMAOBJ , device->dma , >dma->engine); + _(FAULT , device->fault , device->fault); _(FIFO , device->fifo, >fifo->engine); _(GR , device->gr , >gr->engine); _(IFB, device->ifb , device->ifb); @@ -2919,6 +2920,7 @@ nvkm_device_ctor(const struct nvkm_device_func *func, _(NVKM_ENGINE_CIPHER , cipher); _(NVKM_ENGINE_DISP, disp); _(NVKM_ENGINE_DMAOBJ , dma); + _(NVKM_ENGINE_FAULT ,fault); _(NVKM_ENGINE_FIFO, fifo); 
_(NVKM_ENGINE_GR , gr); _(NVKM_ENGINE_IFB , ifb); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/priv.h b/drivers/gpu/drm/nouveau/nvkm/engine/device/priv.h index 08d0bf605722..3be45ac6e58d 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/priv.h @@ -32,6 +32,7 @@ #include #include #include
[RFC PATCH 00/13] SVM (share virtual memory) with HMM in nouveau
From: Jérôme Glisse

(mm is cced just to allow exposure of device driver work without ccing a long list of people. I do not think there is anything useful to discuss from an mm point of view, but I might be wrong, so just for the curious :)). git://people.freedesktop.org/~glisse/linux branch: nouveau-hmm-v00 https://cgit.freedesktop.org/~glisse/linux/log/?h=nouveau-hmm-v00 This patchset adds SVM (Share Virtual Memory) using HMM (Heterogeneous Memory Management) to the nouveau driver. SVM means that GPU threads spawned by the GPU driver for a specific user process can access any valid CPU address in that process. A valid pointer is a pointer inside an area coming from an mmap of private anonymous memory, shared memory, or a regular file. Pointers to an mmap of a device file or special file are not supported. This is an RFC for a few technical reasons listed below, and also because we are still working on a proper open source userspace (namely an OpenCL 2.0 implementation for nouveau inside mesa). Open source userspace is a requirement for the DRM subsystem. I pushed in [1] a simple standalone program that can be used to test SVM through HMM with nouveau. I expect we will have a somewhat working userspace in the coming weeks; work is well underway and some patches have already been posted on the mesa mailing list. There are two aspects that need to be sorted out before this can be considered ready. First, we want to decide how to update the GPU page table from HMM. In this patchset I added new methods to vmm to allow the GPU page table to be updated without an nvkm_memory or nvkm_vma object (see patches 7 and 8, special mapping method for HMM). It just takes an array of pages and flags, and allows both system and device private memory to be interleaved. The second aspect is how to create an HMM-enabled channel. Channel is the term used for an NVidia GPU command queue; each process using nouveau has at least one channel, and it can have multiple channels.
They are not created by the process directly but rather by the device driver backend of a common library like OpenGL, OpenCL or Vulkan. There is work underway to revamp nouveau channel creation with a new userspace API, so we might want to delay upstreaming until this lands. We can still discuss one aspect specific to HMM here, namely the issue around GEM objects used for some specific parts of the GPU. Some engines inside the GPU (an engine is a GPU block, like the display block which is responsible for scanning memory to send out a picture through some connector, for instance HDMI or DisplayPort) can only access memory with a virtual address below (1 << 40). To accommodate those we need to create a "hole" inside the process address space. This patchset has a hack for that (patch 13, HACK FOR HMM AREA): it reserves a range of device file offsets so that a process can mmap this range with PROT_NONE to create a hole (the process must make sure the hole is below 1 << 40). I feel uneasy about doing it this way, but maybe it is ok with other folks. Note that this patchset does not show usage of device private memory, as it depends on other architectural changes to nouveau. However, it is very easy to add with some gross hacks, so if people would like to see it I can also post an RFC for that. As a preview, it only adds two new ioctls which allow userspace to ask for migration of a range of virtual addresses; the expectation is that the userspace library will know best where to place things, and the kernel will try to satisfy this (with no guarantee — it is a best effort). As usual, comments and questions are welcome.
Cheers, Jérôme Glisse [1] https://cgit.freedesktop.org/~glisse/moche Ben Skeggs (4): drm/nouveau/core: define engine for handling replayable faults drm/nouveau/mmu/gp100: allow gcc/tex to generate replayable faults drm/nouveau/mc/gp100-: handle replayable fault interrupt drm/nouveau/fault/gp100: initial implementation of MaxwellFaultBufferA Jérôme Glisse (9): drm/nouveau/vmm: enable page table iterator over non populated range drm/nouveau/core/memory: add some useful accessor macros drm/nouveau: special mapping method for HMM drm/nouveau: special mapping method for HMM (user interface) drm/nouveau: add SVM through HMM support to nouveau client drm/nouveau: add HMM area creation drm/nouveau: add HMM area creation user interface drm/nouveau: HMM area creation helpers for nouveau client drm/nouveau: HACK FOR HMM AREA drivers/gpu/drm/nouveau/Kbuild | 3 + drivers/gpu/drm/nouveau/include/nvif/class.h | 2 + drivers/gpu/drm/nouveau/include/nvif/clb069.h | 8 + drivers/gpu/drm/nouveau/include/nvif/if000c.h | 26 ++ drivers/gpu/drm/nouveau/include/nvif/vmm.h | 4 + drivers/gpu/drm/nouveau/include/nvkm/core/device.h | 3 + drivers/gpu/drm/nouveau/include/nvkm/core/memory.h | 8 + .../gpu/drm/nouveau/include/nvkm/engine/fault.h| 5 + drivers/gpu/drm/nouveau/include/nvkm/subdev/mmu.h | 10 + drivers/gpu/drm/nouveau/nouveau_drm.c | 5 +
[RFC PATCH 05/13] drm/nouveau/mc/gp100-: handle replayable fault interrupt
From: Ben SkeggsSigned-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp100.c | 20 +++- drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp10b.c | 2 +- drivers/gpu/drm/nouveau/nvkm/subdev/mc/priv.h | 2 ++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp100.c index 7321ad3758c3..9ab5bfe1e588 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp100.c @@ -75,10 +75,28 @@ gp100_mc_intr_mask(struct nvkm_mc *base, u32 mask, u32 intr) spin_unlock_irqrestore(>lock, flags); } +const struct nvkm_mc_map +gp100_mc_intr[] = { + { 0x0400, NVKM_ENGINE_DISP }, + { 0x0100, NVKM_ENGINE_FIFO }, + { 0x0200, NVKM_ENGINE_FAULT }, + { 0x4000, NVKM_SUBDEV_IBUS }, + { 0x1000, NVKM_SUBDEV_BUS }, + { 0x0800, NVKM_SUBDEV_FB }, + { 0x0200, NVKM_SUBDEV_LTC }, + { 0x0100, NVKM_SUBDEV_PMU }, + { 0x0020, NVKM_SUBDEV_GPIO }, + { 0x0020, NVKM_SUBDEV_I2C }, + { 0x0010, NVKM_SUBDEV_TIMER }, + { 0x0004, NVKM_SUBDEV_THERM }, + { 0x2000, NVKM_SUBDEV_FB }, + {}, +}; + static const struct nvkm_mc_func gp100_mc = { .init = nv50_mc_init, - .intr = gk104_mc_intr, + .intr = gp100_mc_intr, .intr_unarm = gp100_mc_intr_unarm, .intr_rearm = gp100_mc_intr_rearm, .intr_mask = gp100_mc_intr_mask, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp10b.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp10b.c index 2283e3b74277..ff8629de97d6 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp10b.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/gp10b.c @@ -34,7 +34,7 @@ gp10b_mc_init(struct nvkm_mc *mc) static const struct nvkm_mc_func gp10b_mc = { .init = gp10b_mc_init, - .intr = gk104_mc_intr, + .intr = gp100_mc_intr, .intr_unarm = gp100_mc_intr_unarm, .intr_rearm = gp100_mc_intr_rearm, .intr_mask = gp100_mc_intr_mask, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/priv.h index 8869d79c2b59..d9e3691d45b7 
100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/priv.h @@ -57,4 +57,6 @@ int gp100_mc_new_(const struct nvkm_mc_func *, struct nvkm_device *, int, extern const struct nvkm_mc_map gk104_mc_intr[]; extern const struct nvkm_mc_map gk104_mc_reset[]; + +extern const struct nvkm_mc_map gp100_mc_intr[]; #endif -- 2.14.3 ___ dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH 00/13] mmu_notifier kill invalidate_page callback v2
From: Jérôme Glisse(Sorry for so many list cross-posting and big cc) Changes since v1: - remove more dead code in kvm (no testing impact) - more accurate end address computation (patch 2) in page_mkclean_one and try_to_unmap_one - added tested-by/reviewed-by gotten so far Tested as both host and guest kernel with KVM nothing is burning yet. Previous cover letter: Please help testing ! The invalidate_page callback suffered from 2 pitfalls. First it used to happen after page table lock was release and thus a new page might have been setup for the virtual address before the call to invalidate_page(). This is in a weird way fixed by c7ab0d2fdc840266b39db94538f74207ec2afbf6 which moved the callback under the page table lock. Which also broke several existing user of the mmu_notifier API that assumed they could sleep inside this callback. The second pitfall was invalidate_page being the only callback not taking a range of address in respect to invalidation but was giving an address and a page. Lot of the callback implementer assumed this could never be THP and thus failed to invalidate the appropriate range for THP pages. By killing this callback we unify the mmu_notifier callback API to always take a virtual address range as input. There is now 2 clear API (I am not mentioning the youngess API which is seldomly used): - invalidate_range_start()/end() callback (which allow you to sleep) - invalidate_range() where you can not sleep but happen right after page table update under page table lock Note that a lot of existing user feels broken in respect to range_start/ range_end. Many user only have range_start() callback but there is nothing preventing them to undo what was invalidated in their range_start() callback after it returns but before any CPU page table update take place. The code pattern use in kvm or umem odp is an example on how to properly avoid such race. 
In a nutshell use some kind of sequence number and active range invalidation counter to block anything that might undo what the range_start() callback did. If you do not care about keeping fully in sync with CPU page table (ie you can live with CPU page table pointing to new different page for a given virtual address) then you can take a reference on the pages inside the range_start callback and drop it in range_end or when your driver is done with those pages. Last alternative is to use invalidate_range() if you can do invalidation without sleeping as invalidate_range() callback happens under the CPU page table spinlock right after the page table is updated. Note this is barely tested. I intend to do more testing of next few days but i do not have access to all hardware that make use of the mmu_notifier API. First 2 patches convert existing call of mmu_notifier_invalidate_page() to mmu_notifier_invalidate_range() and bracket those call with call to mmu_notifier_invalidate_range_start()/end(). The next 10 patches remove existing invalidate_page() callback as it can no longer happen. Finaly the last page remove it completely so it can RIP. Jérôme Glisse (13): dax: update to new mmu_notifier semantic mm/rmap: update to new mmu_notifier semantic powerpc/powernv: update to new mmu_notifier semantic drm/amdgpu: update to new mmu_notifier semantic IB/umem: update to new mmu_notifier semantic IB/hfi1: update to new mmu_notifier semantic iommu/amd: update to new mmu_notifier semantic iommu/intel: update to new mmu_notifier semantic misc/mic/scif: update to new mmu_notifier semantic sgi-gru: update to new mmu_notifier semantic xen/gntdev: update to new mmu_notifier semantic KVM: update to new mmu_notifier semantic mm/mmu_notifier: kill invalidate_page Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Andrew Morton Cc: Andrea Arcangeli Cc: Joerg Roedel Cc: Dan Williams Cc: Sudeep Dutt Cc: Ashutosh Dixit Cc: Dimitri Sivanich Cc: Jack Steiner Cc: Paolo Bonzini Cc: Radim Krčmář Cc: linuxppc-...@lists.ozlabs.org Cc: dri-devel@lists.freedesktop.org Cc: amd-...@lists.freedesktop.org Cc: linux-r...@vger.kernel.org Cc: io...@lists.linux-foundation.org Cc: xen-de...@lists.xenproject.org Cc: k...@vger.kernel.org Jérôme Glisse (13): dax: update to new mmu_notifier semantic mm/rmap: update to new mmu_notifier semantic v2 powerpc/powernv: update to new mmu_notifier semantic drm/amdgpu: update to new mmu_notifier semantic IB/umem: update to new mmu_notifier semantic IB/hfi1: update to new mmu_notifier semantic iommu/amd: update to new mmu_notifier semantic iommu/intel: update to new mmu_notifier semantic misc/mic/scif: update to new mmu_notifier semantic sgi-gru: update to new
[PATCH] radeon/kms: fix dp displayport mode validation
From: Jerome Glisse jgli...@redhat.com Check if there is a big enough dp clock enough dp lane to drive the video mode provided. Signed-off-by: Jerome Glisse jgli...@redhat.com Cc: sta...@kernel.org --- drivers/gpu/drm/radeon/atombios_dp.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/atombios_dp.c b/drivers/gpu/drm/radeon/atombios_dp.c index 4e7778d..695de9a 100644 --- a/drivers/gpu/drm/radeon/atombios_dp.c +++ b/drivers/gpu/drm/radeon/atombios_dp.c @@ -187,9 +187,9 @@ static int dp_link_clock_for_mode_clock(u8 dpcd[DP_DPCD_SIZE], int mode_clock) int dp_mode_valid(u8 dpcd[DP_DPCD_SIZE], int mode_clock) { int lanes = dp_lanes_for_mode_clock(dpcd, mode_clock); - int bw = dp_lanes_for_mode_clock(dpcd, mode_clock); + int dp_clock = dp_link_clock_for_mode_clock(dpcd, mode_clock); - if ((lanes == 0) || (bw == 0)) + if ((lanes == 0) || (dp_clock == 0)) return MODE_CLOCK_HIGH; return MODE_OK; -- 1.7.3.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: forbid big bo allocation (fdo 31708) v4
From: Jerome Glisse jgli...@redhat.com Forbid allocating buffer bigger than visible VRAM or GTT, also properly set lpfn field. v2 - use max macro - silence warning v3 - don't explicitly set range limit - use min macro v4 - use max btw GTT VRAM size Cc: stable sta...@kernel.org Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/radeon_object.c | 13 +++-- 1 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 1d06774..2011e00 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -69,7 +69,7 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) u32 c = 0; rbo-placement.fpfn = 0; - rbo-placement.lpfn = rbo-rdev-mc.active_vram_size PAGE_SHIFT; + rbo-placement.lpfn = 0; rbo-placement.placement = rbo-placements; rbo-placement.busy_placement = rbo-placements; if (domain RADEON_GEM_DOMAIN_VRAM) @@ -91,7 +91,8 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, { struct radeon_bo *bo; enum ttm_bo_type type; - int page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long max_size; int r; if (unlikely(rdev-mman.bdev.dev_mapping == NULL)) { @@ -104,6 +105,14 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, } *bo_ptr = NULL; + /* maximun bo size is the max btw visible vram and gtt size */ + max_size = max(rdev-mc.visible_vram_size, rdev-mc.gtt_size); + if ((page_align PAGE_SHIFT) = max_size) { + printk(KERN_WARNING %s:%d alloc size %ldM bigger than %ldMb limit\n, + __func__, __LINE__, page_align (20 - PAGE_SHIFT), max_size 20); + return -ENOMEM; + } + retry: bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); if (bo == NULL) -- 1.7.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: forbid big bo allocation (fdo 31708) v3
From: Jerome Glisse jgli...@redhat.com Forbid allocating buffer bigger than visible VRAM or GTT, also properly set lpfn field. v2 - use max macro - silence warning v3 - don't explicitly set range limit - use min macro Cc: stable sta...@kernel.org Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/radeon_object.c | 13 +++-- 1 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 1d06774..a598d00 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -69,7 +69,7 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) u32 c = 0; rbo-placement.fpfn = 0; - rbo-placement.lpfn = rbo-rdev-mc.active_vram_size PAGE_SHIFT; + rbo-placement.lpfn = 0; rbo-placement.placement = rbo-placements; rbo-placement.busy_placement = rbo-placements; if (domain RADEON_GEM_DOMAIN_VRAM) @@ -91,7 +91,8 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, { struct radeon_bo *bo; enum ttm_bo_type type; - int page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long max_size = 0; int r; if (unlikely(rdev-mman.bdev.dev_mapping == NULL)) { @@ -104,6 +105,14 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, } *bo_ptr = NULL; + /* maximun bo size is the minimun btw visible vram and gtt size */ + max_size = min(rdev-mc.visible_vram_size, rdev-mc.gtt_size); + if ((page_align PAGE_SHIFT) = max_size) { + printk(KERN_WARNING %s:%d alloc size %ldM bigger than %ldMb limit\n, + __func__, __LINE__, page_align (20 - PAGE_SHIFT), max_size 20); + return -ENOMEM; + } + retry: bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); if (bo == NULL) -- 1.7.3.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: forbid big bo allocation (fdo 31708)
From: Jerome Glisse jgli...@redhat.com Forbid allocating buffer bigger than visible VRAM or GTT, also properly set lpfn field. Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/radeon_object.c | 36 ++- 1 files changed, 30 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 1d06774..7ce31be 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -64,23 +64,35 @@ bool radeon_ttm_bo_is_radeon_bo(struct ttm_buffer_object *bo) return false; } +#define MAX(a,b) (((a)(b))?(a):(b)) + void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) { u32 c = 0; rbo-placement.fpfn = 0; - rbo-placement.lpfn = rbo-rdev-mc.active_vram_size PAGE_SHIFT; + rbo-placement.lpfn = 0; rbo-placement.placement = rbo-placements; rbo-placement.busy_placement = rbo-placements; - if (domain RADEON_GEM_DOMAIN_VRAM) + if (domain RADEON_GEM_DOMAIN_VRAM) { + rbo-placement.lpfn = MAX(rbo-placement.lpfn, rbo-rdev-mc.active_vram_size PAGE_SHIFT); rbo-placements[c++] = TTM_PL_FLAG_WC | TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_VRAM; - if (domain RADEON_GEM_DOMAIN_GTT) + } + if (domain RADEON_GEM_DOMAIN_GTT) { + rbo-placement.lpfn = MAX(rbo-placement.lpfn, rbo-rdev-mc.gtt_size PAGE_SHIFT); rbo-placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT; - if (domain RADEON_GEM_DOMAIN_CPU) + } + if (domain RADEON_GEM_DOMAIN_CPU) { + /* 4G limit for CPU domain */ + rbo-placement.lpfn = MAX(rbo-placement.lpfn, 0x PAGE_SHIFT); rbo-placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM; - if (!c) + } + if (!c) { + /* 4G limit for CPU domain */ + rbo-placement.lpfn = MAX(rbo-placement.lpfn, 0x PAGE_SHIFT); rbo-placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM; + } rbo-placement.num_placement = c; rbo-placement.num_busy_placement = c; } @@ -91,7 +103,8 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, { struct radeon_bo *bo; 
enum ttm_bo_type type; - int page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long max_size = 0; int r; if (unlikely(rdev-mman.bdev.dev_mapping == NULL)) { @@ -104,6 +117,17 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, } *bo_ptr = NULL; + /* maximun bo size is the minimun btw visible vram and gtt size */ + max_size = rdev-mc.visible_vram_size; + if (max_size rdev-mc.gtt_size) { + max_size = rdev-mc.gtt_size; + } + if ((page_align PAGE_SHIFT) = max_size) { + printk(KERN_WARNING %s:%d alloc size %ldM bigger than %ldMb limit\n, + __func__, __LINE__, page_align (20 - PAGE_SHIFT), max_size 20); + return -ENOMEM; + } + retry: bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); if (bo == NULL) -- 1.7.3.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: forbid big bo allocation (fdo 31708) v2
From: Jerome Glisse jgli...@redhat.com Forbid allocating buffer bigger than visible VRAM or GTT, also properly set lpfn field. v2 - use max macro - silence warning Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/radeon_object.c | 34 ++- 1 files changed, 28 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 1d06774..c2fa64c 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -69,18 +69,28 @@ void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) u32 c = 0; rbo-placement.fpfn = 0; - rbo-placement.lpfn = rbo-rdev-mc.active_vram_size PAGE_SHIFT; + rbo-placement.lpfn = 0; rbo-placement.placement = rbo-placements; rbo-placement.busy_placement = rbo-placements; - if (domain RADEON_GEM_DOMAIN_VRAM) + if (domain RADEON_GEM_DOMAIN_VRAM) { + rbo-placement.lpfn = max((unsigned)rbo-placement.lpfn, (unsigned)rbo-rdev-mc.active_vram_size PAGE_SHIFT); rbo-placements[c++] = TTM_PL_FLAG_WC | TTM_PL_FLAG_UNCACHED | TTM_PL_FLAG_VRAM; - if (domain RADEON_GEM_DOMAIN_GTT) + } + if (domain RADEON_GEM_DOMAIN_GTT) { + rbo-placement.lpfn = max((unsigned)rbo-placement.lpfn, (unsigned)rbo-rdev-mc.gtt_size PAGE_SHIFT); rbo-placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT; - if (domain RADEON_GEM_DOMAIN_CPU) + } + if (domain RADEON_GEM_DOMAIN_CPU) { + /* 4G limit for CPU domain */ + rbo-placement.lpfn = max(rbo-placement.lpfn, 0x PAGE_SHIFT); rbo-placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM; - if (!c) + } + if (!c) { + /* 4G limit for CPU domain */ + rbo-placement.lpfn = max(rbo-placement.lpfn, 0x PAGE_SHIFT); rbo-placements[c++] = TTM_PL_MASK_CACHING | TTM_PL_FLAG_SYSTEM; + } rbo-placement.num_placement = c; rbo-placement.num_busy_placement = c; } @@ -91,7 +101,8 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, { struct radeon_bo *bo; enum ttm_bo_type type; - int page_align = 
roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long page_align = roundup(byte_align, PAGE_SIZE) PAGE_SHIFT; + unsigned long max_size = 0; int r; if (unlikely(rdev-mman.bdev.dev_mapping == NULL)) { @@ -104,6 +115,17 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, } *bo_ptr = NULL; + /* maximun bo size is the minimun btw visible vram and gtt size */ + max_size = rdev-mc.visible_vram_size; + if (max_size rdev-mc.gtt_size) { + max_size = rdev-mc.gtt_size; + } + if ((page_align PAGE_SHIFT) = max_size) { + printk(KERN_WARNING %s:%d alloc size %ldM bigger than %ldMb limit\n, + __func__, __LINE__, page_align (20 - PAGE_SHIFT), max_size 20); + return -ENOMEM; + } + retry: bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); if (bo == NULL) -- 1.7.3.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: forbid allocating bo bigger than VRAM or GTT (fdo 31708)
From: Jerome Glisse jgli...@redhat.com Forbid allocating buffer bigger than VRAM or GTT, also properly set lpfn field of placement if VRAM is too small. Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/radeon.h|2 +- drivers/gpu/drm/radeon/radeon_object.c | 19 ++- drivers/gpu/drm/radeon/radeon_ttm.c|6 +++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 73f600d..2068cf4 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1361,7 +1361,7 @@ extern void radeon_surface_init(struct radeon_device *rdev); extern int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data); extern void radeon_legacy_set_clock_gating(struct radeon_device *rdev, int enable); extern void radeon_atom_set_clock_gating(struct radeon_device *rdev, int enable); -extern void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain); +extern void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain, u32 size); extern bool radeon_ttm_bo_is_radeon_bo(struct ttm_buffer_object *bo); extern void radeon_vram_location(struct radeon_device *rdev, struct radeon_mc *mc, u64 base); extern void radeon_gtt_location(struct radeon_device *rdev, struct radeon_mc *mc); diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index 8eb1834..a09d076 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -64,12 +64,18 @@ bool radeon_ttm_bo_is_radeon_bo(struct ttm_buffer_object *bo) return false; } -void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain) +void radeon_ttm_placement_from_domain(struct radeon_bo *rbo, u32 domain, u32 size) { u32 c = 0; rbo-placement.fpfn = 0; rbo-placement.lpfn = rbo-rdev-mc.active_vram_size PAGE_SHIFT; + /* size bigger than vram directly fallback to GTT*/ + if (size = rbo-rdev-mc.active_vram_size) { + 
rbo-placement.lpfn = rbo-rdev-mc.gtt_size PAGE_SHIFT; + if (!(domain (RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_CPU))) + domain |= RADEON_GEM_DOMAIN_GTT; + } rbo-placement.placement = rbo-placements; rbo-placement.busy_placement = rbo-placements; if (domain RADEON_GEM_DOMAIN_VRAM) @@ -102,6 +108,9 @@ int radeon_bo_create(struct radeon_device *rdev, struct drm_gem_object *gobj, type = ttm_bo_type_device; } *bo_ptr = NULL; + if (size = rdev-mc.active_vram_size size = rdev-mc.gtt_size) { + return -ENOMEM; + } retry: bo = kzalloc(sizeof(struct radeon_bo), GFP_KERNEL); @@ -111,7 +120,7 @@ retry: bo-gobj = gobj; bo-surface_reg = -1; INIT_LIST_HEAD(bo-list); - radeon_ttm_placement_from_domain(bo, domain); + radeon_ttm_placement_from_domain(bo, domain, size); /* Kernel allocation are uninterruptible */ mutex_lock(rdev-vram_mutex); r = ttm_bo_init(rdev-mman.bdev, bo-tbo, size, type, @@ -197,7 +206,7 @@ int radeon_bo_pin(struct radeon_bo *bo, u32 domain, u64 *gpu_addr) *gpu_addr = radeon_bo_gpu_offset(bo); return 0; } - radeon_ttm_placement_from_domain(bo, domain); + radeon_ttm_placement_from_domain(bo, domain, bo-tbo.num_pages PAGE_SHIFT); if (domain == RADEON_GEM_DOMAIN_VRAM) { /* force to pin into visible video ram */ bo-placement.lpfn = bo-rdev-mc.visible_vram_size PAGE_SHIFT; @@ -343,7 +352,7 @@ int radeon_bo_list_validate(struct list_head *head) domain = lobj-wdomain ? lobj-wdomain : lobj-rdomain; retry: - radeon_ttm_placement_from_domain(bo, domain); + radeon_ttm_placement_from_domain(bo, domain, bo-tbo.num_pages PAGE_SHIFT); r = ttm_bo_validate(bo-tbo, bo-placement, true, false, false); if (unlikely(r)) { @@ -535,7 +544,7 @@ int radeon_bo_fault_reserve_notify(struct ttm_buffer_object *bo) offset = bo-mem.start PAGE_SHIFT; if ((offset + size) rdev-mc.visible_vram_size) { /* hurrah the memory is not visible ! 
*/ - radeon_ttm_placement_from_domain(rbo, RADEON_GEM_DOMAIN_VRAM); + radeon_ttm_placement_from_domain(rbo, RADEON_GEM_DOMAIN_VRAM, bo-num_pages PAGE_SHIFT); rbo-placement.lpfn = rdev-mc.visible_vram_size PAGE_SHIFT; r = ttm_bo_validate(bo, rbo-placement, false, true, false); if (unlikely(r != 0)) diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c
[PATCH] drm/radeon/kms: r6xx/r7xx flush shader cache at fence emision
From: Jerome Glisse jgli...@redhat.com GPU is prone to lockup if we deallocate shader bo right after submitting command using the shader. Force shader cache flush after each batch submission seems to fix the issue. It could fix some of the lockup people were experiencing. Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/r600.c |7 +++ drivers/gpu/drm/radeon/r600_blit_kms.c |2 +- 2 files changed, 8 insertions(+), 1 deletions(-) diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index d0ebae9..4076443 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -2335,6 +2335,13 @@ void r600_fence_ring_emit(struct radeon_device *rdev, { /* Also consider EVENT_WRITE_EOP. it handles the interrupts + timestamps + events */ + /* flush shader */ + radeon_ring_write(rdev, PACKET3(PACKET3_SURFACE_SYNC, 3)); + radeon_ring_write(rdev, PACKET3_SH_ACTION_ENA); + radeon_ring_write(rdev, 0x); + radeon_ring_write(rdev, 0x); + radeon_ring_write(rdev, 10); + radeon_ring_write(rdev, PACKET3(PACKET3_EVENT_WRITE, 0)); radeon_ring_write(rdev, CACHE_FLUSH_AND_INV_EVENT); /* wait for 3D idle clean */ diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index d13622a..0efba07 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -581,7 +581,7 @@ int r600_blit_prepare_copy(struct radeon_device *rdev, int size_bytes) ring_size += 40; /* shaders + def state */ ring_size += 10; /* fence emit for VB IB */ ring_size += 5; /* done copy */ - ring_size += 10; /* fence emit for done copy */ + ring_size += 15; /* fence emit for done copy */ r = radeon_ring_lock(rdev, ring_size); if (r) return r; -- 1.7.2.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: r6xx/r7xx flush shader cache at fence emision V2
From: Jerome Glisse jgli...@redhat.com GPU is prone to lockup if we deallocate shader bo right after submitting command using the shader. Force shader cache flush after each batch submission seems to fix the issue. It could fix some of the lockup people were experiencing. V2 move shader flush after pipeline flush it seems to be more reliable that way (ie if userspace didn't submit a flush pipeline at end of its command buffer we might still lockup while moving shader flush after fence flush pipeline seems to prevent this). Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/r600.c |6 ++ drivers/gpu/drm/radeon/r600_blit_kms.c |2 +- 2 files changed, 7 insertions(+), 1 deletions(-) diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c index d0ebae9..eab8de0 100644 --- a/drivers/gpu/drm/radeon/r600.c +++ b/drivers/gpu/drm/radeon/r600.c @@ -2337,6 +2337,12 @@ void r600_fence_ring_emit(struct radeon_device *rdev, radeon_ring_write(rdev, PACKET3(PACKET3_EVENT_WRITE, 0)); radeon_ring_write(rdev, CACHE_FLUSH_AND_INV_EVENT); + /* flush shader */ + radeon_ring_write(rdev, PACKET3(PACKET3_SURFACE_SYNC, 3)); + radeon_ring_write(rdev, PACKET3_SH_ACTION_ENA); + radeon_ring_write(rdev, 0x); + radeon_ring_write(rdev, 0x); + radeon_ring_write(rdev, 10); /* wait for 3D idle clean */ radeon_ring_write(rdev, PACKET3(PACKET3_SET_CONFIG_REG, 1)); radeon_ring_write(rdev, (WAIT_UNTIL - PACKET3_SET_CONFIG_REG_OFFSET) 2); diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index d13622a..0efba07 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -581,7 +581,7 @@ int r600_blit_prepare_copy(struct radeon_device *rdev, int size_bytes) ring_size += 40; /* shaders + def state */ ring_size += 10; /* fence emit for VB IB */ ring_size += 5; /* done copy */ - ring_size += 10; /* fence emit for done copy */ + ring_size += 15; /* fence emit for done copy */ r = 
radeon_ring_lock(rdev, ring_size); if (r) return r; -- 1.7.2.2 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
[PATCH] drm/radeon/kms: fix GTT/VRAM overlapping test
From: Jerome Glisse jgli...@redhat.com GTT/VRAM overlapping test had a typo which leaded to not detecting case when vram_end gtt_end. This patch fix the logic and should fix #16574 cc: stable Signed-off-by: Jerome Glisse jgli...@redhat.com --- drivers/gpu/drm/radeon/radeon_device.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 4f7a170..69b3c22 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -199,7 +199,7 @@ void radeon_vram_location(struct radeon_device *rdev, struct radeon_mc *mc, u64 mc-mc_vram_size = mc-aper_size; } mc-vram_end = mc-vram_start + mc-mc_vram_size - 1; - if (rdev-flags RADEON_IS_AGP mc-vram_end mc-gtt_start mc-vram_end = mc-gtt_end) { + if (rdev-flags RADEON_IS_AGP mc-vram_end mc-gtt_start mc-vram_start = mc-gtt_end) { dev_warn(rdev-dev, limiting VRAM to PCI aperture size\n); mc-real_vram_size = mc-aper_size; mc-mc_vram_size = mc-aper_size; -- 1.7.1 ___ dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel