Re: [PATCH v2] mm: fix missing wake-up event for FSDAX pages

2022-07-05 Thread Muchun Song
On Tue, Jul 05, 2022 at 08:00:42PM -0700, Andrew Morton wrote:
> On Wed, 6 Jul 2022 10:47:32 +0800 Muchun Song  
> wrote:
> 
> > > If this wakeup is not one of these, then are there reports from the
> > > softlockup detector?
> > > 
> > > Do we have reports of processes permanently stuck in D state?
> > >
> > 
> > No. The task is in TASK_INTERRUPTIBLE state (see
> > __fuse_dax_break_layouts()).
> > The hung task detector only reports tasks in D state (TASK_UNINTERRUPTIBLE).
> 
> Thanks, I updated the changelog a bit.
> 
> : FSDAX page refcounts are 1-based, rather than 0-based: if the refcount
> : is 1, then the page is freed.  FSDAX pages can be pinned through GUP
> : and are later unpinned via unpin_user_page(), which uses a folio
> : variant to put the page.  However, the folio variants did not consider
> : this special case, so a wakeup event is missed (e.g. for the user of
> : __fuse_dax_break_layouts()).  This results in a task being permanently
> : stuck in TASK_INTERRUPTIBLE state.
> : 
> : Since FSDAX pages can only be obtained by GUP users, fix GUP instead
> : of folio_put() to lower overhead.
> 
> I believe these details are helpful for -stable maintainers who are
> wondering why they were sent stuff.  Also for maintainers of
> downstream older kernels who are scratching their heads over some user bug
> report, trying to find a patch which might fix it - for this they want
> to see a description of the user-visible effects, for matching with
> that bug report.
>

Thanks Andrew, it's really helpful.



Re: [PATCH v2] mm: fix missing wake-up event for FSDAX pages

2022-07-05 Thread Muchun Song
On Tue, Jul 05, 2022 at 04:47:10PM -0700, Andrew Morton wrote:
> On Wed, 6 Jul 2022 00:38:41 +0100 Matthew Wilcox  wrote:
> 
> > On Tue, Jul 05, 2022 at 02:18:19PM -0700, Andrew Morton wrote:
> > > On Tue,  5 Jul 2022 20:35:32 +0800 Muchun Song  
> > > wrote:
> > > 
> > > > FSDAX page refcounts are 1-based, rather than 0-based: if the refcount
> > > > is 1, then the page is freed.  FSDAX pages can be pinned through GUP
> > > > and are later unpinned via unpin_user_page(), which uses a folio
> > > > variant to put the page.  However, the folio variants did not consider
> > > > this special case, so a wakeup event is missed (e.g. for the user of
> > > > __fuse_dax_break_layouts()).  Since FSDAX pages can only be obtained
> > > > by GUP users, fix GUP instead of folio_put() to lower overhead.
> > > > 
> > > 
> > > What are the user visible runtime effects of this bug?
> > 
> > "missing wake up event" seems pretty obvious to me?  Something goes to
> > sleep waiting for a page to become unused, and is never woken.
> 
> No, missed wakeups are often obscured by another wakeup coming in
> shortly afterwards.
> 

I should clarify that the task will never be woken up.

> If this wakeup is not one of these, then are there reports from the
> softlockup detector?
> 
> Do we have reports of processes permanently stuck in D state?
>

No. The task is in TASK_INTERRUPTIBLE state (see __fuse_dax_break_layouts()).
The hung task detector only reports tasks in D state (TASK_UNINTERRUPTIBLE).
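
For reference, the waiter in question looks roughly like the following
(a simplified sketch of the helper in fs/fuse/dax.c; details may differ
slightly from the exact tree).  It sleeps interruptibly until the page
refcount drops back to 1, which is precisely the wake_up_var() event
that the current unpin path fails to generate:

static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
                                    loff_t start, loff_t end)
{
        struct page *page;

        page = dax_layout_busy_page_range(inode->i_mapping, start, end);
        if (!page)
                return 0;

        *retry = true;
        /* TASK_INTERRUPTIBLE wait for the refcount to return to 1. */
        return ___wait_var_event(&page->_refcount,
                        atomic_read(&page->_refcount) == 1,
                        TASK_INTERRUPTIBLE, 0, 0,
                        fuse_wait_dax_page(inode));
}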

Thanks.
> 



[PATCH v2] mm: fix missing wake-up event for FSDAX pages

2022-07-05 Thread Muchun Song
FSDAX page refcounts are 1-based, rather than 0-based: if the refcount
is 1, then the page is freed.  FSDAX pages can be pinned through GUP
and are later unpinned via unpin_user_page(), which uses a folio
variant to put the page.  However, the folio variants did not consider
this special case, so a wakeup event is missed (e.g. for the user of
__fuse_dax_break_layouts()).  Since FSDAX pages can only be obtained
by GUP users, fix GUP instead of folio_put() to lower overhead.
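
To illustrate where the fix takes effect, a typical pin/unpin cycle
looks roughly like the following (illustrative sketch only; the address
handling is made up, but pin_user_pages_fast()/unpin_user_pages() are
the real GUP entry points):

        struct page *pages[1];
        long pinned;

        pinned = pin_user_pages_fast(addr, 1, FOLL_WRITE, pages);
        if (pinned == 1) {
                /* ... DMA or direct I/O against the FSDAX page ... */
                unpin_user_pages(pages, 1);
                /*
                 * unpin_user_pages() ends up in gup_put_folio(), which
                 * with this patch calls put_devmap_managed_page_refs()
                 * so that the 2 -> 1 refcount transition triggers
                 * wake_up_var(&page->_refcount).
                 */
        }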

Fixes: d8ddc099c6b3 ("mm/gup: Add gup_put_folio()")
Suggested-by: Matthew Wilcox 
Signed-off-by: Muchun Song 
Cc: 
---
v2:
 - Fix GUP instead of folio_put(), as suggested by Matthew.

 include/linux/mm.h | 14 +++++++++-----
 mm/gup.c           |  6 ++++--
 mm/memremap.c      |  6 +++---
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 517f9deba56f..b324c9fa2940 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1157,23 +1157,27 @@ static inline bool is_zone_movable_page(const struct 
page *page)
 #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
 DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
 
-bool __put_devmap_managed_page(struct page *page);
-static inline bool put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs);
+static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
 {
if (!static_branch_unlikely(&devmap_managed_key))
return false;
if (!is_zone_device_page(page))
return false;
-   return __put_devmap_managed_page(page);
+   return __put_devmap_managed_page_refs(page, refs);
 }
-
 #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
-static inline bool put_devmap_managed_page(struct page *page)
+static inline bool put_devmap_managed_page_refs(struct page *page, int refs)
 {
return false;
 }
 #endif /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
 
+static inline bool put_devmap_managed_page(struct page *page)
+{
+   return put_devmap_managed_page_refs(page, 1);
+}
+
 /* 127: arbitrary random number, small enough to assemble well */
 #define folio_ref_zero_or_close_to_overflow(folio) \
((unsigned int) folio_ref_count(folio) + 127u <= 127u)
diff --git a/mm/gup.c b/mm/gup.c
index 4e1999402e71..965ba755023f 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,7 +87,8 @@ static inline struct folio *try_get_folio(struct page *page, 
int refs)
 * belongs to this folio.
 */
if (unlikely(page_folio(page) != folio)) {
-   folio_put_refs(folio, refs);
+   if (!put_devmap_managed_page_refs(&folio->page, refs))
+   folio_put_refs(folio, refs);
goto retry;
}
 
@@ -176,7 +177,8 @@ static void gup_put_folio(struct folio *folio, int refs, 
unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
 
-   folio_put_refs(folio, refs);
+   if (!put_devmap_managed_page_refs(&folio->page, refs))
+   folio_put_refs(folio, refs);
 }
 
 /**
diff --git a/mm/memremap.c b/mm/memremap.c
index f0955785150f..58b20c3c300b 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -509,7 +509,7 @@ void free_zone_device_page(struct page *page)
 }
 
 #ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs)
 {
if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
return false;
@@ -519,9 +519,9 @@ bool __put_devmap_managed_page(struct page *page)
 * refcount is 1, then the page is free and the refcount is
 * stable because nobody holds a reference on the page.
 */
-   if (page_ref_dec_return(page) == 1)
+   if (page_ref_sub_return(page, refs) == 1)
wake_up_var(&page->_refcount);
return true;
 }
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(__put_devmap_managed_page_refs);
 #endif /* CONFIG_FS_DAX */
-- 
2.11.0




Re: [PATCH] mm: fix missing wake-up event for FSDAX pages

2022-07-04 Thread Muchun Song
On Mon, Jul 04, 2022 at 11:38:16AM +0100, Matthew Wilcox wrote:
> On Mon, Jul 04, 2022 at 03:40:54PM +0800, Muchun Song wrote:
> > FSDAX page refcounts are 1-based, rather than 0-based: if the refcount
> > is 1, then the page is freed.  FSDAX pages can be pinned through GUP
> > and are later unpinned via unpin_user_page(), which uses a folio
> > variant to put the page.  However, the folio variants did not consider
> > this special case, so a wakeup event is missed (e.g. for the user of
> > __fuse_dax_break_layouts()).
> 
> Argh, no.  The 1-based refcounts are a blight on the entire kernel.
> They need to go away, not be pushed into folios as well.  I think

I would be happy if this could go away.

> we're close to having that fixed, but until then, this should do
> the trick?
> 

The following fix looks good to me since it lowers the overhead as
much as possible.

Thanks.

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index cc98ab012a9b..4cef5e0f78b6 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1129,18 +1129,18 @@ static inline bool is_zone_movable_page(const struct 
> page *page)
>  #if defined(CONFIG_ZONE_DEVICE) && defined(CONFIG_FS_DAX)
>  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
>  
> -bool __put_devmap_managed_page(struct page *page);
> -static inline bool put_devmap_managed_page(struct page *page)
> +bool __put_devmap_managed_page(struct page *page, int refs);
> +static inline bool put_devmap_managed_page(struct page *page, int refs)
>  {
>   if (!static_branch_unlikely(&devmap_managed_key))
>   return false;
>   if (!is_zone_device_page(page))
>   return false;
> - return __put_devmap_managed_page(page);
> + return __put_devmap_managed_page(page, refs);
>  }
>  
>  #else /* CONFIG_ZONE_DEVICE && CONFIG_FS_DAX */
> -static inline bool put_devmap_managed_page(struct page *page)
> +static inline bool put_devmap_managed_page(struct page *page, int refs)
>  {
>   return false;
>  }
> @@ -1246,7 +1246,7 @@ static inline void put_page(struct page *page)
>* For some devmap managed pages we need to catch refcount transition
>* from 2 to 1:
>*/
> - if (put_devmap_managed_page(&folio->page))
> + if (put_devmap_managed_page(&folio->page, 1))
>   return;
>   folio_put(folio);
>  }
> diff --git a/mm/gup.c b/mm/gup.c
> index d1132b39aa8f..28df02121c78 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -88,7 +88,8 @@ static inline struct folio *try_get_folio(struct page 
> *page, int refs)
>* belongs to this folio.
>*/
>   if (unlikely(page_folio(page) != folio)) {
> - folio_put_refs(folio, refs);
> + if (!put_devmap_managed_page(&folio->page, refs))
> + folio_put_refs(folio, refs);
>   goto retry;
>   }
>  
> @@ -177,6 +178,8 @@ static void gup_put_folio(struct folio *folio, int refs, 
> unsigned int flags)
>   refs *= GUP_PIN_COUNTING_BIAS;
>   }
>  
> + if (put_devmap_managed_page(&folio->page, refs))
> + return;
>   folio_put_refs(folio, refs);
>  }
>  
> diff --git a/mm/memremap.c b/mm/memremap.c
> index b870a659eee6..b25e40e3a11e 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -499,7 +499,7 @@ void free_zone_device_page(struct page *page)
>  }
>  
>  #ifdef CONFIG_FS_DAX
> -bool __put_devmap_managed_page(struct page *page)
> +bool __put_devmap_managed_page(struct page *page, int refs)
>  {
>   if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
>   return false;
> @@ -509,7 +509,7 @@ bool __put_devmap_managed_page(struct page *page)
>* refcount is 1, then the page is free and the refcount is
>* stable because nobody holds a reference on the page.
>*/
> - if (page_ref_dec_return(page) == 1)
> + if (page_ref_sub_return(page, refs) == 1)
>   wake_up_var(&page->_refcount);
>   return true;
>  }
> diff --git a/mm/swap.c b/mm/swap.c
> index c6194cfa2af6..94e42a9bab92 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -960,7 +960,7 @@ void release_pages(struct page **pages, int nr)
>   unlock_page_lruvec_irqrestore(lruvec, flags);
>   lruvec = NULL;
>   }
> - if (put_devmap_managed_page(&folio->page))
> + if (put_devmap_managed_page(&folio->page, 1))
>   continue;
>   if (folio_put_testzero(folio))
>   free_zone_device_page(&folio->page);
> 



[PATCH] mm: fix missing wake-up event for FSDAX pages

2022-07-04 Thread Muchun Song
FSDAX page refcounts are 1-based, rather than 0-based: if the refcount
is 1, then the page is freed.  FSDAX pages can be pinned through GUP
and are later unpinned via unpin_user_page(), which uses a folio
variant to put the page.  However, the folio variants did not consider
this special case, so a wakeup event is missed (e.g. for the user of
__fuse_dax_break_layouts()).
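
To make the special case explicit (illustrative only, not part of this
patch): for FSDAX pages "no more users" means a refcount of 1 rather
than 0, i.e. a hypothetical helper would look like

        static inline bool fsdax_page_idle(struct page *page)
        {
                return page_ref_count(page) == 1;
        }

whereas the generic folio path only acts on the 1 -> 0 transition,

        if (folio_put_testzero(folio))
                __folio_put(folio);

so the 2 -> 1 transition of an FSDAX page goes unnoticed and
wake_up_var(&page->_refcount) is never called.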

Fixes: d8ddc099c6b3 ("mm/gup: Add gup_put_folio()")
Signed-off-by: Muchun Song 
---
 include/linux/mm.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 517f9deba56f..32aaa7b06f5a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1223,6 +1223,9 @@ static inline __must_check bool try_get_page(struct page 
*page)
  */
 static inline void folio_put(struct folio *folio)
 {
+   if (put_devmap_managed_page(&folio->page))
+   return;
+
if (folio_put_testzero(folio))
__folio_put(folio);
 }
@@ -1243,8 +1246,13 @@ static inline void folio_put(struct folio *folio)
  */
 static inline void folio_put_refs(struct folio *folio, int refs)
 {
-   if (folio_ref_sub_and_test(folio, refs))
-   __folio_put(folio);
+   /*
+* For fsdax managed pages we need to catch refcount transition
+* from 2 to 1:
+*/
+   if (refs > 1)
+   folio_ref_sub(folio, refs - 1);
+   folio_put(folio);
 }
 
 void release_pages(struct page **pages, int nr);
@@ -1268,15 +1276,7 @@ static inline void folios_put(struct folio **folios, 
unsigned int nr)
 
 static inline void put_page(struct page *page)
 {
-   struct folio *folio = page_folio(page);
-
-   /*
-* For some devmap managed pages we need to catch refcount transition
-* from 2 to 1:
-*/
-   if (put_devmap_managed_page(&folio->page))
-   return;
-   folio_put(folio);
+   folio_put(page_folio(page));
 }
 
 /*
-- 
2.11.0




[PATCH v7 6/6] mm: simplify follow_invalidate_pte()

2022-04-02 Thread Muchun Song
The only user (DAX) of the range and pmdpp parameters of follow_invalidate_pte()
is gone, so it is safe to remove them and make it static to simplify the code.
This effectively reverts the following commits:

  097963959594 ("mm: add follow_pte_pmd()")
  a4d1a8852513 ("dax: update to new mmu_notifier semantic")

There is only one caller of follow_invalidate_pte(), so just fold it
into follow_pte() and remove it.
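
For reference, the calling convention that remains after this patch is
the plain follow_pte() one (a sketch of a typical caller along the
lines of the KVM pfn lookup; error handling trimmed):

        pte_t *ptep;
        spinlock_t *ptl;
        unsigned long pfn;

        if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
                return -EFAULT;
        pfn = pte_pfn(*ptep);
        /* The PTE is only stable until the lock is dropped. */
        pte_unmap_unlock(ptep, ptl);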

Signed-off-by: Muchun Song 
Reviewed-by: Christoph Hellwig 
---
 include/linux/mm.h |  3 --
 mm/memory.c| 81 --
 2 files changed, 23 insertions(+), 61 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9bada4096ac..be7ec4c37ebe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1871,9 +1871,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index cc6968dc8e4e..84f7250e6cd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4964,9 +4964,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+  pte_t **ptepp, spinlock_t **ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4989,35 +5009,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
 
-   if (pmd_huge(*pmd)) {
-   if (!pmdpp)
-   goto out;
-
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
-   *ptlp = pmd_lock(mm, pmd);
-   if (pmd_huge(*pmd)) {
-   *pmdpp = pmd;
-   return 0;
-   }
-   spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
-   }
-
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5025,38 +5019,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
-
-/**
- * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
- *
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location 

[PATCH v7 5/6] dax: fix missing writeprotect the pte entry

2022-04-02 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Use pfn_mkclean_range() to clean the pfns to fix this issue.
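
With this change the writeback path ends up doing roughly the following
(sketch only; index, end, pfn and count are taken from the dax entry
being written back):

        i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
                pfn_mkclean_range(pfn, count, index, vma);
                cond_resched();
        }
        i_mmap_unlock_read(mapping);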

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
Reviewed-by: Christoph Hellwig 
---
 fs/dax.c | 99 
 1 file changed, 12 insertions(+), 87 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a372304c9695..1ac12e877f4f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -789,96 +790,12 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
-/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
-{
-   struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
-
-   i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
-   cond_resched();
-
-   if (!(vma->vm_flags & VM_SHARED))
-   continue;
-
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address,
- address + HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   pte = pte_wrprotect(pte);
-   pte = pte_mkclean(pte);
-   set_pte_at(vma->vm_mm, address, ptep, pte);
-unlock_pte:
-   pte_unmap_unlock(ptep, ptl);
-   }
-
-   mmu_notifier_invalidate_range_end(&range);
-   }
-   i_mmap_unlock_read(mapping);
-}
-
 static int dax_writeback_one(struct xa_state *

[PATCH v7 4/6] mm: pvmw: add support for walking devmap pages

2022-04-02 Thread Muchun Song
Currently page_vma_mapped_walk() cannot be used to check whether a huge
devmap page is mapped into a vma.  Add support for walking huge devmap
pages so that DAX can use it in the next patch.
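
For context, the caller added in the next patch sets up the walk
roughly like this (a sketch; the field names follow the pfn-based
page_vma_mapped_walk interface):

        struct page_vma_mapped_walk pvmw = {
                .pfn            = pfn,
                .nr_pages       = nr_pages,
                .pgoff          = pgoff,
                .vma            = vma,
                .flags          = PVMW_SYNC,
        };

        while (page_vma_mapped_walk(&pvmw)) {
                if (pvmw.pte) {
                        /* pte-mapped: clean and write-protect *pvmw.pte */
                } else {
                        /* huge devmap/THP mapping: clean the pmd at pvmw.pmd */
                }
        }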

Signed-off-by: Muchun Song 
---
 mm/page_vma_mapped.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 1187f9c1ec5b..3da82bf65de8 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,16 +210,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
 */
pmde = READ_ONCE(*pvmw->pmd);
 
-   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
+   (pmd_present(pmde) && pmd_devmap(pmde))) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
-   if (likely(pmd_trans_huge(pmde))) {
-   if (pvmw->flags & PVMW_MIGRATION)
-   return not_found(pvmw);
-   if (!check_pmd(pmd_pfn(pmde), pvmw))
-   return not_found(pvmw);
-   return true;
-   }
if (!pmd_present(pmde)) {
swp_entry_t entry;
 
@@ -232,6 +226,13 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
return not_found(pvmw);
return true;
}
+   if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+   if (pvmw->flags & PVMW_MIGRATION)
+   return not_found(pvmw);
+   if (!check_pmd(pmd_pfn(pmde), pvmw))
+   return not_found(pvmw);
+   return true;
+   }
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
-- 
2.11.0




[PATCH v7 3/6] mm: rmap: introduce pfn_mkclean_range() to clean PTEs

2022-04-02 Thread Muchun Song
page_mkclean_one() is supposed to be used with a pfn that has an
associated struct page, but not all pfns (e.g. DAX) have one.
Introduce a new function, pfn_mkclean_range(), to clean the PTEs
(including PMDs) that map a range of pfns with no struct page
associated with them. This helper will be used by the DAX device in
the next patch to make pfns clean.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h |  3 +++
 mm/internal.h| 26 +
 mm/rmap.c| 65 +++-
 3 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b58ddb8b2220..a6ec0d3e40c1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -263,6 +263,9 @@ unsigned long page_address_in_vma(struct page *, struct 
vm_area_struct *);
  */
 int folio_mkclean(struct folio *);
 
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma);
+
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index f45292dc4ef5..664e6d48607c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -516,26 +516,22 @@ void mlock_page_drain(int cpu);
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * Return the start of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
-   } else if (PageHead(page) &&
-  pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+   } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -545,6 +541,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
  * Then at what user virtual address will none of the range be found in vma?
  * Assumes that vma_address() already returned a good starting address.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 723682ddb9e8..ad5cf0e45a73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -929,12 +929,12 @@ int folio_referenced(struct folio *folio, int is_locked,
return pra.referenced;
 }
 
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
-   unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-   DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+   int cleaned = 0;
+   struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
-   int *cleaned = arg;
+   unsigned long address = pvmw->address;
 
/*
 * We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +942,16 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
 */
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
-   vma_address_end(&pvmw));
+   vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
 
-   while (page_vma_mapped_walk(&pvmw)) {
+   while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
 
-   address = pvmw.address;
-   if (pvmw.pte) {
+   address = pvmw->address;
+   if (pvmw->pte) {
pte_t entry;
-  

[PATCH v7 2/6] dax: fix cache flush on PMD-mapped pages

2022-04-02 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover the full pages of a THP except for the head page.
Replace it with flush_cache_range() to fix this issue.  This is just a
documentation issue with respect to properly documenting the expected
usage of cache flushing before modifying the pmd.  However, in practice
this is not a problem due to the fact that DAX is not available on
architectures with virtually indexed caches, per:

  commit d92576f1167c ("dax: does not work correctly with virtual aliasing
  caches")

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
Reviewed-by: Dan Williams 
Reviewed-by: Christoph Hellwig 
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..a372304c9695 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -845,7 +845,8 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH v7 1/6] mm: rmap: fix cache flush on THP pages

2022-04-02 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover the full pages of a THP except for the head page.
Replace it with flush_cache_range() to fix this issue.  At least, no
problems have been found due to this so far, perhaps because architectures
with virtually indexed caches are rare.

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
Reviewed-by: Yang Shi 
Reviewed-by: Dan Williams 
Reviewed-by: Christoph Hellwig 
---
 mm/rmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index fc46a3d7b704..723682ddb9e8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -970,7 +970,8 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, folio_pfn(folio));
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




[PATCH v7 0/6] Fix some bugs related to rmap and dax

2022-04-02 Thread Muchun Song
This series is based on next-20220225.

Patches 1-2 fix a cache flush bug; because subsequent patches depend on
those changes, they are placed in this series.  Patches 3-4 are
preparation for fixing a dax bug in patch 5.  Patch 6 is a code cleanup
since the previous patch removes the usage of follow_invalidate_pte().

v7:
- Remove redundant "*" above vma_address() reported by Christoph.
- Fix oops (reported by Qian) on arm64 by using "pmd_present() && pmd_devmap()"
  to work around the bug in pmd_leaf() on arm64, which is fixed in another
  patch [1].

[1] https://lore.kernel.org/all/20220403024928.4125-1-songmuc...@bytedance.com/

v6:
- Collect Reviewed-by from Christoph Hellwig.
- Fold dax_entry_mkclean() into dax_writeback_one().

v5:
- Collect Reviewed-by from Dan Williams.
- Fix panic reported by kernel test robot .
- Remove pmdpp parameter from follow_invalidate_pte() and fold it into 
follow_pte().

v4:
- Fix compilation error on riscv.

v3:
- Based on next-20220225.

v2:
- Avoid the overly long line in lots of places suggested by Christoph.
- Fix a compiler warning reported by kernel test robot since pmd_pfn()
  is not defined when !CONFIG_TRANSPARENT_HUGEPAGE on powerpc architecture.
- Split a new patch 4 for preparation of fixing the dax bug.

Muchun Song (6):
  mm: rmap: fix cache flush on THP pages
  dax: fix cache flush on PMD-mapped pages
  mm: rmap: introduce pfn_mkclean_range() to clean PTEs
  mm: pvmw: add support for walking devmap pages
  dax: fix missing writeprotect the pte entry
  mm: simplify follow_invalidate_pte()

 fs/dax.c | 98 +++-
 include/linux/mm.h   |  3 --
 include/linux/rmap.h |  3 ++
 mm/internal.h| 26 +-
 mm/memory.c  | 81 ---
 mm/page_vma_mapped.c | 17 -
 mm/rmap.c| 68 +---
 7 files changed, 120 insertions(+), 176 deletions(-)

-- 
2.11.0




Re: [PATCH v5 0/6] Fix some bugs related to rmap and dax

2022-04-02 Thread Muchun Song
On Thu, Mar 31, 2022 at 11:55 PM Qian Cai  wrote:
>
> On Fri, Mar 18, 2022 at 03:45:23PM +0800, Muchun Song wrote:
> > This series is based on next-20220225.
> >
> > Patches 1-2 fix a cache flush bug; because subsequent patches depend on
> > those changes, they are placed in this series.  Patches 3-4 are
> > preparation for fixing a dax bug in patch 5.  Patch 6 is a code cleanup
> > since the previous patch removes the usage of follow_invalidate_pte().
>
> Reverting this series fixed boot crashes.
>
>  KASAN: null-ptr-deref in range [0x0018-0x001f]
>  Mem abort info:
>ESR = 0x9604
>EC = 0x25: DABT (current EL), IL = 32 bits
>SET = 0, FnV = 0
>EA = 0, S1PTW = 0
>FSC = 0x04: level 0 translation fault
>  Data abort info:
>ISV = 0, ISS = 0x0004
>CM = 0, WnR = 0
>  [dfff8003] address between user and kernel address ranges
>  Internal error: Oops: 9604 [#1] PREEMPT SMP
>  Modules linked in: cdc_ether usbnet ipmi_devintf ipmi_msghandler 
> cppc_cpufreq fuse ip_tables x_tables ipv6 btrfs blake2b_generic libcrc32c xor 
> xor_neon raid6_pq zstd_compress dm_mod nouveau crct10dif_ce drm_ttm_helper 
> mlx5_core ttm drm_dp_helper drm_kms_helper nvme mpt3sas nvme_core xhci_pci 
> raid_class drm xhci_pci_renesas
>  CPU: 3 PID: 1707 Comm: systemd-udevd Not tainted 
> 5.17.0-next-20220331-4-g2d550916a6b9 #51
>  pstate: 104000c9 (nzcV daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>  pc : __lock_acquire
>  lr : lock_acquire.part.0
>  sp : 800030a16fd0
>  x29: 800030a16fd0 x28: dd876c4e9f90 x27: 0018
>  x26:  x25: 0018 x24: 
>  x23: 08022beacf00 x22: dd8772507660 x21: 
>  x20:  x19:  x18: dd8772417d2c
>  x17: dd876c5bc2e0 x16: 1fffe100457d5b06 x15: 0094
>  x14: f1f1 x13: f3f3f3f3 x12: 08022beacf08
>  x11: 1bb0ee482fa5 x10: dd8772417d28 x9 : 
>  x8 : 0003 x7 : dd876c4e9f90 x6 : 
>  x5 :  x4 : 0001 x3 : 
>  x2 :  x1 : 0003 x0 : dfff8000
>  Call trace:
>   __lock_acquire
>   lock_acquire.part.0
>   lock_acquire
>   _raw_spin_lock
>   page_vma_mapped_walk
>   try_to_migrate_one
>   rmap_walk_anon
>   try_to_migrate
>   __unmap_and_move
>   unmap_and_move
>   migrate_pages
>   migrate_misplaced_page
>   do_huge_pmd_numa_page
>   __handle_mm_fault
>   handle_mm_fault
>   do_translation_fault
>   do_mem_abort
>   el0_da
>   el0t_64_sync_handler
>   el0t_64_sync
>  Code: d65f03c0 d343ff61 d2d0 f2fbffe0 (38e06820)

Hi,

I have found the root cause. The implementation of pmd_leaf() on
arm64 is wrong: it does not consider PROT_NONE-mapped PMDs, which
does not match the expectations of pmd_leaf().  I'll send a fix
for arm64 along the lines of the following.

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 94e147e5456c..09eaae46a19b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -535,7 +535,7 @@ extern pgprot_t phys_mem_access_prot(struct file
*file, unsigned long pfn,
 PMD_TYPE_TABLE)
 #define pmd_sect(pmd)  ((pmd_val(pmd) & PMD_TYPE_MASK) == \
 PMD_TYPE_SECT)
-#define pmd_leaf(pmd)  pmd_sect(pmd)
+#define pmd_leaf(pmd)  (pmd_present(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
 #define pmd_bad(pmd)   (!pmd_table(pmd))

 #define pmd_leaf_size(pmd) (pmd_cont(pmd) ? CONT_PMD_SIZE : PMD_SIZE)

Thanks.



Re: [PATCH v5 0/6] Fix some bugs related to rmap and dax

2022-04-01 Thread Muchun Song
On Fri, Apr 1, 2022 at 11:44 AM Muchun Song  wrote:
>
> On Thu, Mar 31, 2022 at 11:55 PM Qian Cai  wrote:
> >
> > On Fri, Mar 18, 2022 at 03:45:23PM +0800, Muchun Song wrote:
> > > This series is based on next-20220225.
> > >
> > > Patches 1-2 fix a cache flush bug; because subsequent patches depend on
> > > those changes, they are placed in this series.  Patches 3-4 are
> > > preparation for fixing a dax bug in patch 5.  Patch 6 is a code cleanup
> > > since the previous patch removes the usage of follow_invalidate_pte().
> >
> > Reverting this series fixed boot crashes.
> >
> >  KASAN: null-ptr-deref in range [0x0018-0x001f]
> >  Mem abort info:
> >ESR = 0x9604
> >EC = 0x25: DABT (current EL), IL = 32 bits
> >SET = 0, FnV = 0
> >EA = 0, S1PTW = 0
> >FSC = 0x04: level 0 translation fault
> >  Data abort info:
> >ISV = 0, ISS = 0x0004
> >CM = 0, WnR = 0
> >  [dfff8003] address between user and kernel address ranges
> >  Internal error: Oops: 9604 [#1] PREEMPT SMP
> >  Modules linked in: cdc_ether usbnet ipmi_devintf ipmi_msghandler 
> > cppc_cpufreq fuse ip_tables x_tables ipv6 btrfs blake2b_generic libcrc32c 
> > xor xor_neon raid6_pq zstd_compress dm_mod nouveau crct10dif_ce 
> > drm_ttm_helper mlx5_core ttm drm_dp_helper drm_kms_helper nvme mpt3sas 
> > nvme_core xhci_pci raid_class drm xhci_pci_renesas
> >  CPU: 3 PID: 1707 Comm: systemd-udevd Not tainted 
> > 5.17.0-next-20220331-4-g2d550916a6b9 #51
> >  pstate: 104000c9 (nzcV daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> >  pc : __lock_acquire
> >  lr : lock_acquire.part.0
> >  sp : 800030a16fd0
> >  x29: 800030a16fd0 x28: dd876c4e9f90 x27: 0018
> >  x26:  x25: 0018 x24: 
> >  x23: 08022beacf00 x22: dd8772507660 x21: 
> >  x20:  x19:  x18: dd8772417d2c
> >  x17: dd876c5bc2e0 x16: 1fffe100457d5b06 x15: 0094
> >  x14: f1f1 x13: f3f3f3f3 x12: 08022beacf08
> >  x11: 1bb0ee482fa5 x10: dd8772417d28 x9 : 
> >  x8 : 0003 x7 : dd876c4e9f90 x6 : 
> >  x5 :  x4 : 0001 x3 : 
> >  x2 :  x1 : 0003 x0 : dfff8000
> >  Call trace:
> >   __lock_acquire
> >   lock_acquire.part.0
> >   lock_acquire
> >   _raw_spin_lock
> >   page_vma_mapped_walk
> >   try_to_migrate_one
> >   rmap_walk_anon
> >   try_to_migrate
> >   __unmap_and_move
> >   unmap_and_move
> >   migrate_pages
> >   migrate_misplaced_page
> >   do_huge_pmd_numa_page
> >   __handle_mm_fault
> >   handle_mm_fault
> >   do_translation_fault
> >   do_mem_abort
> >   el0_da
> >   el0t_64_sync_handler
> >   el0t_64_sync
> >  Code: d65f03c0 d343ff61 d2d0 f2fbffe0 (38e06820)
> >  ---[ end trace  ]---
> >  Kernel panic - not syncing: Oops: Fatal exception
> >  SMP: stopping secondary CPUs
> >  Kernel Offset: 0x5d8763da from 0x8800
> >  PHYS_OFFSET: 0x8000
> >  CPU features: 0x000,00085c0d,19801c82
> >  Memory Limit: none
> >  ---[ end Kernel panic - not syncing: Oops: Fatal exception ]---
>
> Thanks for your report. Would you mind providing the .config?

Hi Qian Cai,

Would you mind helping me test if the following patch works properly?
Thanks.

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index b3bf802a6435..3da82bf65de8 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,7 +210,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 */
pmde = READ_ONCE(*pvmw->pmd);

-   if (pmd_leaf(pmde) || is_pmd_migration_entry(pmde)) {
+   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde) ||
+   (pmd_present(pmde) && pmd_devmap(pmde))) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
if (!pmd_present(pmde)) {



Re: [PATCH v5 0/6] Fix some bugs related to rmap and dax

2022-03-31 Thread Muchun Song
On Thu, Mar 31, 2022 at 11:55 PM Qian Cai  wrote:
>
> On Fri, Mar 18, 2022 at 03:45:23PM +0800, Muchun Song wrote:
> > This series is based on next-20220225.
> >
> > Patches 1-2 fix a cache flush bug; because subsequent patches depend on
> > those changes, they are placed in this series.  Patches 3-4 are
> > preparation for fixing a dax bug in patch 5.  Patch 6 is a code cleanup
> > since the previous patch removes the usage of follow_invalidate_pte().
>
> Reverting this series fixed boot crashes.
>
>  KASAN: null-ptr-deref in range [0x0018-0x001f]
>  Mem abort info:
>ESR = 0x9604
>EC = 0x25: DABT (current EL), IL = 32 bits
>SET = 0, FnV = 0
>EA = 0, S1PTW = 0
>FSC = 0x04: level 0 translation fault
>  Data abort info:
>ISV = 0, ISS = 0x0004
>CM = 0, WnR = 0
>  [dfff8003] address between user and kernel address ranges
>  Internal error: Oops: 9604 [#1] PREEMPT SMP
>  Modules linked in: cdc_ether usbnet ipmi_devintf ipmi_msghandler 
> cppc_cpufreq fuse ip_tables x_tables ipv6 btrfs blake2b_generic libcrc32c xor 
> xor_neon raid6_pq zstd_compress dm_mod nouveau crct10dif_ce drm_ttm_helper 
> mlx5_core ttm drm_dp_helper drm_kms_helper nvme mpt3sas nvme_core xhci_pci 
> raid_class drm xhci_pci_renesas
>  CPU: 3 PID: 1707 Comm: systemd-udevd Not tainted 
> 5.17.0-next-20220331-4-g2d550916a6b9 #51
>  pstate: 104000c9 (nzcV daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>  pc : __lock_acquire
>  lr : lock_acquire.part.0
>  sp : 800030a16fd0
>  x29: 800030a16fd0 x28: dd876c4e9f90 x27: 0018
>  x26:  x25: 0018 x24: 
>  x23: 08022beacf00 x22: dd8772507660 x21: 
>  x20:  x19:  x18: dd8772417d2c
>  x17: dd876c5bc2e0 x16: 1fffe100457d5b06 x15: 0094
>  x14: f1f1 x13: f3f3f3f3 x12: 08022beacf08
>  x11: 1bb0ee482fa5 x10: dd8772417d28 x9 : 
>  x8 : 0003 x7 : dd876c4e9f90 x6 : 
>  x5 :  x4 : 0001 x3 : 
>  x2 :  x1 : 0003 x0 : dfff8000
>  Call trace:
>   __lock_acquire
>   lock_acquire.part.0
>   lock_acquire
>   _raw_spin_lock
>   page_vma_mapped_walk
>   try_to_migrate_one
>   rmap_walk_anon
>   try_to_migrate
>   __unmap_and_move
>   unmap_and_move
>   migrate_pages
>   migrate_misplaced_page
>   do_huge_pmd_numa_page
>   __handle_mm_fault
>   handle_mm_fault
>   do_translation_fault
>   do_mem_abort
>   el0_da
>   el0t_64_sync_handler
>   el0t_64_sync
>  Code: d65f03c0 d343ff61 d2d0 f2fbffe0 (38e06820)
>  ---[ end trace  ]---
>  Kernel panic - not syncing: Oops: Fatal exception
>  SMP: stopping secondary CPUs
>  Kernel Offset: 0x5d8763da from 0x8800
>  PHYS_OFFSET: 0x8000
>  CPU features: 0x000,00085c0d,19801c82
>  Memory Limit: none
>  ---[ end Kernel panic - not syncing: Oops: Fatal exception ]---

Thanks for your report. Would you mind providing the .config?



Re: [PATCH v6 3/6] mm: rmap: introduce pfn_mkclean_range() to clean PTEs

2022-03-30 Thread Muchun Song
On Wed, Mar 30, 2022 at 1:47 PM Christoph Hellwig  wrote:
>
> On Tue, Mar 29, 2022 at 09:48:50PM +0800, Muchun Song wrote:
> > + * * Return the start of user virtual address at the specific offset within
>
> Double "*" here.

Thanks for pointing this out.

>
> Also Shiyang has been wanting a quite similar vma_pgoff_address for use
> in dax.c.  Maybe we'll need to look into moving this to linux/mm.h.
>

I saw Shiyang is ready to rebase onto this patch.  So should I
move it to linux/mm.h, or should Shiyang do that?

Thanks.



[PATCH v6 6/6] mm: simplify follow_invalidate_pte()

2022-03-29 Thread Muchun Song
The only user (DAX) of the range and pmdpp parameters of follow_invalidate_pte()
is gone, so it is safe to remove them and make it static to simplify the code.
This effectively reverts the following commits:

  097963959594 ("mm: add follow_pte_pmd()")
  a4d1a8852513 ("dax: update to new mmu_notifier semantic")

There is only one caller of follow_invalidate_pte(), so just fold it
into follow_pte() and remove it.

Signed-off-by: Muchun Song 
Reviewed-by: Christoph Hellwig 
---
 include/linux/mm.h |  3 --
 mm/memory.c| 81 --
 2 files changed, 23 insertions(+), 61 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9bada4096ac..be7ec4c37ebe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1871,9 +1871,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index cc6968dc8e4e..84f7250e6cd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4964,9 +4964,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+  pte_t **ptepp, spinlock_t **ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4989,35 +5009,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
 
-   if (pmd_huge(*pmd)) {
-   if (!pmdpp)
-   goto out;
-
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
-   *ptlp = pmd_lock(mm, pmd);
-   if (pmd_huge(*pmd)) {
-   *pmdpp = pmd;
-   return 0;
-   }
-   spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
-   }
-
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5025,38 +5019,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
-
-/**
- * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
- *
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location 

[PATCH v6 5/6] dax: fix missing writeprotect the pte entry

2022-03-29 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Use pfn_mkclean_range() to clean the pfns to fix this issue.

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
Reviewed-by: Christoph Hellwig 
---
 fs/dax.c | 99 
 1 file changed, 12 insertions(+), 87 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a372304c9695..1ac12e877f4f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -789,96 +790,12 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
-/* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
-{
-   struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
-
-   i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
-   cond_resched();
-
-   if (!(vma->vm_flags & VM_SHARED))
-   continue;
-
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address,
- address + HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   pte = pte_wrprotect(pte);
-   pte = pte_mkclean(pte);
-   set_pte_at(vma->vm_mm, address, ptep, pte);
-unlock_pte:
-   pte_unmap_unlock(ptep, ptl);
-   }
-
-   mmu_notifier_invalidate_range_end(&range);
-   }
-   i_mmap_unlock_read(mapping);
-}
-
 static int dax_writeback_one(struct xa_state *

[PATCH v6 4/6] mm: pvmw: add support for walking devmap pages

2022-03-29 Thread Muchun Song
Currently page_vma_mapped_walk() cannot be used to check whether a huge
devmap page is mapped into a vma.  Add support for walking huge devmap
pages so that DAX can use it in the next patch.

Signed-off-by: Muchun Song 
---
 mm/page_vma_mapped.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 1187f9c1ec5b..b3bf802a6435 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,16 +210,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
 */
pmde = READ_ONCE(*pvmw->pmd);
 
-   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+   if (pmd_leaf(pmde) || is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
-   if (likely(pmd_trans_huge(pmde))) {
-   if (pvmw->flags & PVMW_MIGRATION)
-   return not_found(pvmw);
-   if (!check_pmd(pmd_pfn(pmde), pvmw))
-   return not_found(pvmw);
-   return true;
-   }
if (!pmd_present(pmde)) {
swp_entry_t entry;
 
@@ -232,6 +225,13 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
return not_found(pvmw);
return true;
}
+   if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+   if (pvmw->flags & PVMW_MIGRATION)
+   return not_found(pvmw);
+   if (!check_pmd(pmd_pfn(pmde), pvmw))
+   return not_found(pvmw);
+   return true;
+   }
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
-- 
2.11.0




[PATCH v6 3/6] mm: rmap: introduce pfn_mkclean_range() to clean PTEs

2022-03-29 Thread Muchun Song
page_mkclean_one() is supposed to be used with a pfn that has an
associated struct page, but not all pfns (e.g. DAX) have one.
Introduce a new function, pfn_mkclean_range(), to clean the PTEs
(including PMDs) that map a range of pfns with no struct page
associated with them. This helper will be used by the DAX device in
the next patch to make pfns clean.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h |  3 +++
 mm/internal.h| 26 +
 mm/rmap.c| 65 +++-
 3 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b58ddb8b2220..a6ec0d3e40c1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -263,6 +263,9 @@ unsigned long page_address_in_vma(struct page *, struct 
vm_area_struct *);
  */
 int folio_mkclean(struct folio *);
 
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma);
+
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index f45292dc4ef5..ff873944749f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -516,26 +516,22 @@ void mlock_page_drain(int cpu);
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * * Return the start of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
-   } else if (PageHead(page) &&
-  pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+   } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -545,6 +541,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
  * Then at what user virtual address will none of the range be found in vma?
  * Assumes that vma_address() already returned a good starting address.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 723682ddb9e8..ad5cf0e45a73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -929,12 +929,12 @@ int folio_referenced(struct folio *folio, int is_locked,
return pra.referenced;
 }
 
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
-   unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-   DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+   int cleaned = 0;
+   struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
-   int *cleaned = arg;
+   unsigned long address = pvmw->address;
 
/*
 * We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +942,16 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
 */
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
-   vma_address_end(&pvmw));
+   vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
 
-   while (page_vma_mapped_walk(&pvmw)) {
+   while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
 
-   address = pvmw.address;
-   if (pvmw.pte) {
+   address = pvmw->address;
+   if (pvmw->pte) {
pte_t entry;
-  

[PATCH v6 2/6] dax: fix cache flush on PMD-mapped pages

2022-03-29 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover the full pages of a THP except for the head page.
Replace it with flush_cache_range() to fix this issue.  This is just a
documentation issue with respect to properly documenting the expected
usage of cache flushing before modifying the pmd.  However, in practice
this is not a problem due to the fact that DAX is not available on
architectures with virtually indexed caches, per:

  commit d92576f1167c ("dax: does not work correctly with virtual aliasing
  caches")

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
Reviewed-by: Dan Williams 
Reviewed-by: Christoph Hellwig 
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..a372304c9695 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -845,7 +845,8 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH v6 1/6] mm: rmap: fix cache flush on THP pages

2022-03-29 Thread Muchun Song
The flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover all the subpages of a THP, only the head page.
Replace it with flush_cache_range() to fix this issue. No problems have
been observed from this so far, probably because few architectures have
virtually indexed caches.
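
To illustrate the difference, the sketch below shows what full coverage of
a PMD-mapped THP would take with the per-page helper versus the single
range call used by this fix (illustrative fragment only; i, vma, address
and folio are assumed to be the surrounding locals):

	/* per-subpage flushing: one call for each base page of the THP */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		flush_cache_page(vma, address + i * PAGE_SIZE,
				 folio_pfn(folio) + i);

	/* equivalent single call, as used by this patch */
	flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);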

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
Reviewed-by: Yang Shi 
Reviewed-by: Dan Williams 
Reviewed-by: Christoph Hellwig 
---
 mm/rmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index fc46a3d7b704..723682ddb9e8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -970,7 +970,8 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, folio_pfn(folio));
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




[PATCH v6 0/6] Fix some bugs related to rmap and dax

2022-03-29 Thread Muchun Song
This series is based on next-20220225.

Patches 1-2 fix a cache flush bug; because subsequent patches depend on
those changes, they are placed in this series.  Patches 3-4 are
preparation for fixing a dax bug in patch 5.  Patch 6 is code cleanup
since the previous patch removes the usage of follow_invalidate_pte().

v6:
- Collect Reviewed-by from Christoph Hellwig.
- Fold dax_entry_mkclean() into dax_writeback_one().

v5:
- Collect Reviewed-by from Dan Williams.
- Fix panic reported by kernel test robot .
- Remove pmdpp parameter from follow_invalidate_pte() and fold it into 
follow_pte().

v4:
- Fix compilation error on riscv.

v3:
- Based on next-20220225.

v2:
- Avoid the overly long line in lots of places suggested by Christoph.
- Fix a compiler warning reported by kernel test robot since pmd_pfn()
  is not defined when !CONFIG_TRANSPARENT_HUGEPAGE on powerpc architecture.
- Split a new patch 4 for preparation of fixing the dax bug.

Muchun Song (6):
  mm: rmap: fix cache flush on THP pages
  dax: fix cache flush on PMD-mapped pages
  mm: rmap: introduce pfn_mkclean_range() to cleans PTEs
  mm: pvmw: add support for walking devmap pages
  dax: fix missing writeprotect the pte entry
  mm: simplify follow_invalidate_pte()

 fs/dax.c | 98 +++-
 include/linux/mm.h   |  3 --
 include/linux/rmap.h |  3 ++
 mm/internal.h| 26 +-
 mm/memory.c  | 81 ---
 mm/page_vma_mapped.c | 16 -
 mm/rmap.c| 68 +---
 7 files changed, 119 insertions(+), 176 deletions(-)

-- 
2.11.0




Re: [PATCH v5 5/6] dax: fix missing writeprotect the pte entry

2022-03-22 Thread Muchun Song
On Tue, Mar 22, 2022 at 4:37 PM Christoph Hellwig  wrote:
>
> > +static void dax_entry_mkclean(struct address_space *mapping, unsigned long 
> > pfn,
> > +   unsigned long npfn, pgoff_t start)
> >  {
> >   struct vm_area_struct *vma;
> > + pgoff_t end = start + npfn - 1;
> >
> >   i_mmap_lock_read(mapping);
> > + vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
> > + pfn_mkclean_range(pfn, npfn, start, vma);
> >   cond_resched();
> >   }
> >   i_mmap_unlock_read(mapping);
>
>
> Is there any point in even keeping this helper vs just open coding it
> in the only caller below?

Good point. I'll fold dax_entry_mkclean() into the caller.
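
For reference, a rough sketch of what the fold could look like inside
dax_writeback_one() (assumption: mapping, vma, index, end, pfn and count
are the locals already available there; the final change may differ):

	/* walk all mappings of the range and writeprotect them in place */
	i_mmap_lock_read(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
		pfn_mkclean_range(pfn, count, index, vma);
		cond_resched();
	}
	i_mmap_unlock_read(mapping);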

>
> Otherwise looks good:
>
> Reviewed-by: Christoph Hellwig 

Thanks.



[PATCH v5 6/6] mm: simplify follow_invalidate_pte()

2022-03-18 Thread Muchun Song
The only user (DAX) of the range and pmdpp parameters of
follow_invalidate_pte() is gone, so it is safe to remove them and make the
function static to simplify the code.  This is a partial revert of the
following commits:

  097963959594 ("mm: add follow_pte_pmd()")
  a4d1a8852513 ("dax: update to new mmu_notifier semantic")

There is only one caller of follow_invalidate_pte() left, so just fold it
into follow_pte() and remove it.
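
For context, the caller pattern that remains after the fold is the plain
follow_pte() lookup, roughly as in the sketch below (error handling
trimmed; vma and address are assumed to come from the caller):

	pte_t *ptep;
	spinlock_t *ptl;
	unsigned long pfn;

	/* look up the pte and take its page table lock */
	if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
		return -EINVAL;
	pfn = pte_pfn(*ptep);
	/* *ptep is only stable until the lock is released */
	pte_unmap_unlock(ptep, ptl);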

Signed-off-by: Muchun Song 
---
 include/linux/mm.h |  3 --
 mm/memory.c| 81 --
 2 files changed, 23 insertions(+), 61 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9bada4096ac..be7ec4c37ebe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1871,9 +1871,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index cc6968dc8e4e..84f7250e6cd1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4964,9 +4964,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed.  The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function.  While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+  pte_t **ptepp, spinlock_t **ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4989,35 +5009,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
pmd = pmd_offset(pud, address);
VM_BUG_ON(pmd_trans_huge(*pmd));
 
-   if (pmd_huge(*pmd)) {
-   if (!pmdpp)
-   goto out;
-
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
-   *ptlp = pmd_lock(mm, pmd);
-   if (pmd_huge(*pmd)) {
-   *pmdpp = pmd;
-   return 0;
-   }
-   spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
-   }
-
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5025,38 +5019,9 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
-
-/**
- * follow_pte - look up PTE at a user virtual address
- * @mm: the mm_struct of the target address space
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
- *
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location is stored in @ptlp.
- * The contents of

[PATCH v5 5/6] dax: fix missing writeprotect the pte entry

2022-03-18 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Use pfn_mkclean_range() to clean the pfns to fix this issue.
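
For illustration, a minimal userspace sequence that exercises this path
could look like the sketch below.  The file path is an assumption (any
file on a DAX mount), the two mappings stand in for processes A and B
above, and whether the kernel really uses a PMD mapping depends on
alignment and filesystem support:

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/mnt/dax/file", O_RDWR);
		/* 'a' plays process A (2MB, PMD-sized), 'b' plays process B (4K) */
		char *a = mmap(NULL, 2UL << 20, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 0);
		char *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_SHARED, fd, 4096);

		a[0] = 1;	/* 1) dirty the PMD entry */
		b[0] = 2;	/* 2) dirty the 4K pte */
		fsync(fd);	/* 3) should clean and writeprotect both mappings */
		b[0] = 3;	/* 4) no fault if the pte was left writeable */
		fsync(fd);	/* 5) radix tree entry is clean, nothing is flushed */
		return 0;
	}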

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 83 ++--
 1 file changed, 7 insertions(+), 76 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a372304c9695..7fd4a16769f9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -789,87 +790,17 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
 /* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, unsigned long pfn,
+ unsigned long npfn, pgoff_t start)
 {
struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
+   pgoff_t end = start + npfn - 1;
 
i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
+   vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+   pfn_mkclean_range(pfn, npfn, start, vma);
cond_resched();
-
-   if (!(vma->vm_flags & VM_SHARED))
-   continue;
-
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address,
- address + HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   pte = pte_wrprotect(pte);
-   pte = pte_mkclean(pte);
- 

[PATCH v5 4/6] mm: pvmw: add support for walking devmap pages

2022-03-18 Thread Muchun Song
page_vma_mapped_walk() cannot currently be used to check whether a huge
devmap page is mapped into a vma.  Add support for walking huge devmap
pages so that DAX can use it in the next patch.
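
For context, the intended use is to drive the walk from a bare pfn range
rather than a folio, roughly like the sketch below (field names follow
struct page_vma_mapped_walk after it was converted to work on PFNs; the
surrounding locals pfn, nr_pages, pgoff and vma are assumptions):

	struct page_vma_mapped_walk pvmw = {
		.pfn		= pfn,
		.nr_pages	= nr_pages,
		.pgoff		= pgoff,
		.vma		= vma,
		.flags		= PVMW_SYNC,
	};

	pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);

	while (page_vma_mapped_walk(&pvmw)) {
		/* pvmw.pte is set for a pte mapping, pvmw.pmd for a huge
		 * (including devmap) mapping of the range */
	}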

Signed-off-by: Muchun Song 
---
 mm/page_vma_mapped.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 1187f9c1ec5b..b3bf802a6435 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,16 +210,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
 */
pmde = READ_ONCE(*pvmw->pmd);
 
-   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+   if (pmd_leaf(pmde) || is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
-   if (likely(pmd_trans_huge(pmde))) {
-   if (pvmw->flags & PVMW_MIGRATION)
-   return not_found(pvmw);
-   if (!check_pmd(pmd_pfn(pmde), pvmw))
-   return not_found(pvmw);
-   return true;
-   }
if (!pmd_present(pmde)) {
swp_entry_t entry;
 
@@ -232,6 +225,13 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
return not_found(pvmw);
return true;
}
+   if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
+   if (pvmw->flags & PVMW_MIGRATION)
+   return not_found(pvmw);
+   if (!check_pmd(pmd_pfn(pmde), pvmw))
+   return not_found(pvmw);
+   return true;
+   }
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
-- 
2.11.0




[PATCH v5 3/6] mm: rmap: introduce pfn_mkclean_range() to cleans PTEs

2022-03-18 Thread Muchun Song
The page_mkclean_one() is supposed to be used with a pfn that has an
associated struct page, but not all pfns (e.g. DAX) have a struct
page. Introduce a new function pfn_mkclean_range() to clean the PTEs
(including PMDs) mapping a range of pfns that have no struct page
associated with them. This helper will be used by the DAX device in the
next patch to make pfns clean.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h |  3 +++
 mm/internal.h| 26 +
 mm/rmap.c| 65 +++-
 3 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b58ddb8b2220..a6ec0d3e40c1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -263,6 +263,9 @@ unsigned long page_address_in_vma(struct page *, struct 
vm_area_struct *);
  */
 int folio_mkclean(struct folio *);
 
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma);
+
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index f45292dc4ef5..ff873944749f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -516,26 +516,22 @@ void mlock_page_drain(int cpu);
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * Return the start of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
-   } else if (PageHead(page) &&
-  pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+   } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -545,6 +541,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
  * Then at what user virtual address will none of the range be found in vma?
  * Assumes that vma_address() already returned a good starting address.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 723682ddb9e8..ad5cf0e45a73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -929,12 +929,12 @@ int folio_referenced(struct folio *folio, int is_locked,
return pra.referenced;
 }
 
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
-   unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-   DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+   int cleaned = 0;
+   struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
-   int *cleaned = arg;
+   unsigned long address = pvmw->address;
 
/*
 * We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +942,16 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
 */
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
-   vma_address_end(&pvmw));
+   vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
 
-   while (page_vma_mapped_walk(&pvmw)) {
+   while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
 
-   address = pvmw.address;
-   if (pvmw.pte) {
+   address = pvmw->address;
+   if (pvmw->pte) {
pte_t entry;
-  

[PATCH v5 2/6] dax: fix cache flush on PMD-mapped pages

2022-03-18 Thread Muchun Song
The flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover all the subpages of a THP, only the head page.
Replace it with flush_cache_range() to fix this issue.  This is just a
documentation issue with respect to properly documenting the expected
usage of cache flushing before modifying the pmd.  However, in practice
this is not a problem because DAX is not available on architectures
with virtually indexed caches, per:

  commit d92576f1167c ("dax: does not work correctly with virtual aliasing 
caches")

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
Reviewed-by: Dan Williams 
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..a372304c9695 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -845,7 +845,8 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH v5 1/6] mm: rmap: fix cache flush on THP pages

2022-03-18 Thread Muchun Song
The flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover all the subpages of a THP, only the head page.
Replace it with flush_cache_range() to fix this issue. No problems have
been observed from this so far, probably because few architectures have
virtually indexed caches.

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
Reviewed-by: Yang Shi 
Reviewed-by: Dan Williams 
---
 mm/rmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index fc46a3d7b704..723682ddb9e8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -970,7 +970,8 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, folio_pfn(folio));
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




[PATCH v5 0/6] Fix some bugs related to rmap and dax

2022-03-18 Thread Muchun Song
This series is based on next-20220225.

Patches 1-2 fix a cache flush bug; because subsequent patches depend on
those changes, they are placed in this series.  Patches 3-4 are
preparation for fixing a dax bug in patch 5.  Patch 6 is code cleanup
since the previous patch removes the usage of follow_invalidate_pte().

v5:
- Collect Reviewed-by from Dan Williams.
- Fix panic reported by kernel test robot .
- Remove pmdpp parameter from follow_invalidate_pte() and fold it into 
follow_pte().

v4:
- Fix compilation error on riscv.

v3:
- Based on next-20220225.

v2:
- Avoid the overly long line in lots of places suggested by Christoph.
- Fix a compiler warning reported by kernel test robot since pmd_pfn()
  is not defined when !CONFIG_TRANSPARENT_HUGEPAGE on powerpc architecture.
- Split a new patch 4 for preparation of fixing the dax bug.

Muchun Song (6):
  mm: rmap: fix cache flush on THP pages
  dax: fix cache flush on PMD-mapped pages
  mm: rmap: introduce pfn_mkclean_range() to cleans PTEs
  mm: pvmw: add support for walking devmap pages
  dax: fix missing writeprotect the pte entry
  mm: simplify follow_invalidate_pte()

 fs/dax.c | 82 +---
 include/linux/mm.h   |  3 --
 include/linux/rmap.h |  3 ++
 mm/internal.h| 26 +++--
 mm/memory.c  | 81 +++
 mm/page_vma_mapped.c | 16 +-
 mm/rmap.c| 68 +++
 7 files changed, 114 insertions(+), 165 deletions(-)

-- 
2.11.0




Re: [PATCH v4 5/6] dax: fix missing writeprotect the pte entry

2022-03-15 Thread Muchun Song
On Tue, Mar 15, 2022 at 4:50 AM Dan Williams  wrote:
>
> On Fri, Mar 11, 2022 at 1:06 AM Muchun Song  wrote:
> >
> > On Thu, Mar 10, 2022 at 8:59 AM Dan Williams  
> > wrote:
> > >
> > > On Wed, Mar 2, 2022 at 12:30 AM Muchun Song  
> > > wrote:
> > > >
> > > > Currently dax_mapping_entry_mkclean() fails to clean and write protect
> > > > the pte entry within a DAX PMD entry during an *sync operation. This
> > > > can result in data loss in the following sequence:
> > > >
> > > >   1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
> > > >  making the pmd entry dirty and writeable.
> > > >   2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
> > > >  write to the same file, dirtying PMD radix tree entry (already
> > > >  done in 1)) and making the pte entry dirty and writeable.
> > > >   3) fsync, flushing out PMD data and cleaning the radix tree entry. We
> > > >  currently fail to mark the pte entry as clean and write protected
> > > >  since the vma of process B is not covered in dax_entry_mkclean().
> > > >   4) process B writes to the pte. These don't cause any page faults 
> > > > since
> > > >  the pte entry is dirty and writeable. The radix tree entry remains
> > > >  clean.
> > > >   5) fsync, which fails to flush the dirty PMD data because the radix 
> > > > tree
> > > >  entry was clean.
> > > >   6) crash - dirty data that should have been fsync'd as part of 5) 
> > > > could
> > > >  still have been in the processor cache, and is lost.
> > >
> > > Excellent description.
> > >
> > > >
> > > > Just to use pfn_mkclean_range() to clean the pfns to fix this issue.
> > >
> > > So the original motivation for CONFIG_FS_DAX_LIMITED was for archs
> > > that do not have spare PTE bits to indicate pmd_devmap(). So this fix
> > > can only work in the CONFIG_FS_DAX_LIMITED=n case and in that case it
> > > seems you can use the current page_mkclean_one(), right?
> >
> > I don't know the history of CONFIG_FS_DAX_LIMITED.
> > page_mkclean_one() need a struct page associated with
> > the pfn,  do the struct pages exist when CONFIG_FS_DAX_LIMITED
> > and ! FS_DAX_PMD?
>
> CONFIG_FS_DAX_LIMITED was created to preserve some DAX use for S390
> which does not have CONFIG_ARCH_HAS_PTE_DEVMAP. Without PTE_DEVMAP
> then get_user_pages() for DAX mappings fails.
>
> To your question, no, there are no pages at all in the
> CONFIG_FS_DAX_LIMITED=y case. So page_mkclean_one() could only be
> deployed for PMD mappings, but I think it is reasonable to just
> disable PMD mappings for the CONFIG_FS_DAX_LIMITED=y case.
>
> Going forward the hope is to remove the ARCH_HAS_PTE_DEVMAP
> requirement for DAX, and use PTE_SPECIAL for the S390 case. However,
> that still wants to have 'struct page' availability as an across the
> board requirement.

Got it. Thanks for your patient explanation.

>
> > If yes, I think you are right. But I don't
> > see this guarantee. I am not familiar with DAX code, so what am
> > I missing here?
>
> Perhaps I missed a 'struct page' dependency? I thought the bug you are
> fixing only triggers in the presence of PMDs. The

Right.

> CONFIG_FS_DAX_LIMITED=y case can still use the current "page-less"
> mkclean path for PTEs.

But I think introducing pfn_mkclean_range() could make the code
simpler and easier to maintain here since it can handle both PTE
and PMD mappings.  And page_vma_mapped_walk() has been able to work
on bare PFNs since commit [1], which is the case here, so we do not
need extra code to handle the page-less case.  What do you
think?

[1] 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=b786e44a4dbfe64476e7120ec7990b89a37be37d



Re: [PATCH v4 5/6] dax: fix missing writeprotect the pte entry

2022-03-11 Thread Muchun Song
On Thu, Mar 10, 2022 at 8:59 AM Dan Williams  wrote:
>
> On Wed, Mar 2, 2022 at 12:30 AM Muchun Song  wrote:
> >
> > Currently dax_mapping_entry_mkclean() fails to clean and write protect
> > the pte entry within a DAX PMD entry during an *sync operation. This
> > can result in data loss in the following sequence:
> >
> >   1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
> >  making the pmd entry dirty and writeable.
> >   2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
> >  write to the same file, dirtying PMD radix tree entry (already
> >  done in 1)) and making the pte entry dirty and writeable.
> >   3) fsync, flushing out PMD data and cleaning the radix tree entry. We
> >  currently fail to mark the pte entry as clean and write protected
> >  since the vma of process B is not covered in dax_entry_mkclean().
> >   4) process B writes to the pte. These don't cause any page faults since
> >  the pte entry is dirty and writeable. The radix tree entry remains
> >  clean.
> >   5) fsync, which fails to flush the dirty PMD data because the radix tree
> >  entry was clean.
> >   6) crash - dirty data that should have been fsync'd as part of 5) could
> >  still have been in the processor cache, and is lost.
>
> Excellent description.
>
> >
> > Just to use pfn_mkclean_range() to clean the pfns to fix this issue.
>
> So the original motivation for CONFIG_FS_DAX_LIMITED was for archs
> that do not have spare PTE bits to indicate pmd_devmap(). So this fix
> can only work in the CONFIG_FS_DAX_LIMITED=n case and in that case it
> seems you can use the current page_mkclean_one(), right?

I don't know the history of CONFIG_FS_DAX_LIMITED.
page_mkclean_one() needs a struct page associated with
the pfn.  Do struct pages exist when CONFIG_FS_DAX_LIMITED
and !FS_DAX_PMD? If yes, I think you are right. But I don't
see this guarantee. I am not familiar with the DAX code, so what am
I missing here?

Thanks.



Re: [PATCH v4 2/6] dax: fix cache flush on PMD-mapped pages

2022-03-10 Thread Muchun Song
On Thu, Mar 10, 2022 at 8:06 AM Dan Williams  wrote:
>
> On Wed, Mar 2, 2022 at 12:29 AM Muchun Song  wrote:
> >
> > The flush_cache_page() only remove a PAGE_SIZE sized range from the cache.
> > However, it does not cover the full pages in a THP except a head page.
> > Replace it with flush_cache_range() to fix this issue.
>
> This needs to clarify that this is just a documentation issue with the
> respect to properly documenting the expected usage of cache flushing
> before modifying the pmd. However, in practice this is not a problem
> due to the fact that DAX is not available on architectures with
> virtually indexed caches per:

Right. I'll add this into the commit log.

>
> d92576f1167c dax: does not work correctly with virtual aliasing caches
>
> Otherwise, you can add:
>
> Reviewed-by: Dan Williams 

Thanks.



[PATCH v4 6/6] mm: remove range parameter from follow_invalidate_pte()

2022-03-02 Thread Muchun Song
The only user (DAX) of the range parameter of follow_invalidate_pte()
is gone, so it is safe to remove the range parameter and make the
function static to simplify the code.

Signed-off-by: Muchun Song 
---
 include/linux/mm.h |  3 ---
 mm/memory.c| 23 +++
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9bada4096ac..be7ec4c37ebe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1871,9 +1871,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index cc6968dc8e4e..278ab6d62b54 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4964,9 +4964,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+static int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+pte_t **ptepp, pmd_t **pmdpp, spinlock_t 
**ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4993,31 +4992,17 @@ int follow_invalidate_pte(struct mm_struct *mm, 
unsigned long address,
if (!pmdpp)
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
}
 
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5025,8 +5010,6 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
@@ -5055,7 +5038,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp)
 {
-   return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+   return follow_invalidate_pte(mm, address, ptepp, NULL, ptlp);
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
-- 
2.11.0




[PATCH v4 5/6] dax: fix missing writeprotect the pte entry

2022-03-02 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Use pfn_mkclean_range() to clean the pfns to fix this issue.

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 83 ++--
 1 file changed, 7 insertions(+), 76 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a372304c9695..7fd4a16769f9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -789,87 +790,17 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
 /* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, unsigned long pfn,
+ unsigned long npfn, pgoff_t start)
 {
struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
+   pgoff_t end = start + npfn - 1;
 
i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
+   vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+   pfn_mkclean_range(pfn, npfn, start, vma);
cond_resched();
-
-   if (!(vma->vm_flags & VM_SHARED))
-   continue;
-
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address,
- address + HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   pte = pte_wrprotect(pte);
-   pte = pte_mkclean(pte);
- 

[PATCH v4 4/6] mm: pvmw: add support for walking devmap pages

2022-03-02 Thread Muchun Song
page_vma_mapped_walk() cannot currently be used to check whether a huge
devmap page is mapped into a vma.  Add support for walking huge devmap
pages so that DAX can use it in the next patch.

Signed-off-by: Muchun Song 
---
 mm/page_vma_mapped.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 1187f9c1ec5b..f9ffa84adf4d 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,10 +210,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
 */
pmde = READ_ONCE(*pvmw->pmd);
 
-   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+   if (pmd_trans_huge(pmde) || pmd_devmap(pmde) ||
+   is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
-   if (likely(pmd_trans_huge(pmde))) {
+   if (likely(pmd_trans_huge(pmde) || pmd_devmap(pmde))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
if (!check_pmd(pmd_pfn(pmde), pvmw))
-- 
2.11.0




[PATCH v4 3/6] mm: rmap: introduce pfn_mkclean_range() to cleans PTEs

2022-03-02 Thread Muchun Song
The page_mkclean_one() is supposed to be used with a pfn that has an
associated struct page, but not all pfns (e.g. DAX) have a struct
page. Introduce a new function pfn_mkclean_range() to clean the PTEs
(including PMDs) mapping a range of pfns that have no struct page
associated with them. This helper will be used by the DAX device in the
next patch to make pfns clean.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h |  3 +++
 mm/internal.h| 26 +
 mm/rmap.c| 65 +++-
 3 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b58ddb8b2220..a6ec0d3e40c1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -263,6 +263,9 @@ unsigned long page_address_in_vma(struct page *, struct 
vm_area_struct *);
  */
 int folio_mkclean(struct folio *);
 
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma);
+
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index f45292dc4ef5..ff873944749f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -516,26 +516,22 @@ void mlock_page_drain(int cpu);
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * Return the start of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
-   } else if (PageHead(page) &&
-  pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+   } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -545,6 +541,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
  * Then at what user virtual address will none of the range be found in vma?
  * Assumes that vma_address() already returned a good starting address.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 723682ddb9e8..ad5cf0e45a73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -929,12 +929,12 @@ int folio_referenced(struct folio *folio, int is_locked,
return pra.referenced;
 }
 
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
-   unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-   DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+   int cleaned = 0;
+   struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
-   int *cleaned = arg;
+   unsigned long address = pvmw->address;
 
/*
 * We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +942,16 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
 */
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
-   vma_address_end(&pvmw));
+   vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
 
-   while (page_vma_mapped_walk(&pvmw)) {
+   while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
 
-   address = pvmw.address;
-   if (pvmw.pte) {
+   address = pvmw->address;
+   if (pvmw->pte) {
pte_t entry;
-  

[PATCH v4 2/6] dax: fix cache flush on PMD-mapped pages

2022-03-02 Thread Muchun Song
The flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover all the subpages of a THP, only the head page.
Replace it with flush_cache_range() to fix this issue.

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..a372304c9695 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -845,7 +845,8 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH v4 1/6] mm: rmap: fix cache flush on THP pages

2022-03-02 Thread Muchun Song
The flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
However, it does not cover all the subpages of a THP, only the head page.
Replace it with flush_cache_range() to fix this issue. No problems have
been observed from this so far, probably because few architectures have
virtually indexed caches.

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
Reviewed-by: Yang Shi 
---
 mm/rmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index fc46a3d7b704..723682ddb9e8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -970,7 +970,8 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, folio_pfn(folio));
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




[PATCH v4 0/6] Fix some bugs related to rmap and dax

2022-03-02 Thread Muchun Song
This series is based on next-20220225.

Patches 1-2 fix a cache flush bug; because subsequent patches depend on
those changes, they are placed in this series.  Patches 3-4 are
preparation for fixing a dax bug in patch 5.  Patch 6 is code cleanup
since the previous patch removes the usage of follow_invalidate_pte().

v4:
- Fix compilation error on riscv.

v3:
- Based on next-20220225.

v2:
- Avoid the overly long line in lots of places suggested by Christoph.
- Fix a compiler warning reported by kernel test robot since pmd_pfn()
  is not defined when !CONFIG_TRANSPARENT_HUGEPAGE on powerpc architecture.
- Split a new patch 4 for preparation of fixing the dax bug.

Muchun Song (6):
  mm: rmap: fix cache flush on THP pages
  dax: fix cache flush on PMD-mapped pages
  mm: rmap: introduce pfn_mkclean_range() to cleans PTEs
  mm: pvmw: add support for walking devmap pages
  dax: fix missing writeprotect the pte entry
  mm: remove range parameter from follow_invalidate_pte()

 fs/dax.c | 82 +---
 include/linux/mm.h   |  3 --
 include/linux/rmap.h |  3 ++
 mm/internal.h| 26 +++--
 mm/memory.c  | 23 ++-
 mm/page_vma_mapped.c |  5 ++--
 mm/rmap.c| 68 +++
 7 files changed, 89 insertions(+), 121 deletions(-)

-- 
2.11.0




Re: [PATCH v3 4/6] mm: pvmw: add support for walking devmap pages

2022-02-28 Thread Muchun Song
On Tue, Mar 1, 2022 at 5:26 AM Andrew Morton  wrote:
>
> On Mon, 28 Feb 2022 14:35:34 +0800 Muchun Song  
> wrote:
>
> > The devmap pages can not use page_vma_mapped_walk() to check if a huge
> > devmap page is mapped into a vma.  Add support for walking huge devmap
> > pages so that DAX can use it in the next patch.
> >
>
> x86_64 allnoconfig:
>
> In file included from :
> In function 'check_pmd',
> inlined from 'page_vma_mapped_walk' at mm/page_vma_mapped.c:219:10:
> ././include/linux/compiler_types.h:347:45: error: call to 
> '__compiletime_assert_232' declared with attribute error: BUILD_BUG failed
>   347 | _compiletime_assert(condition, msg, __compiletime_assert_, 
> __COUNTER__)
>   | ^
> ././include/linux/compiler_types.h:328:25: note: in definition of macro 
> '__compiletime_assert'
>   328 | prefix ## suffix();   
>   \
>   | ^~
> ././include/linux/compiler_types.h:347:9: note: in expansion of macro 
> '_compiletime_assert'
>   347 | _compiletime_assert(condition, msg, __compiletime_assert_, 
> __COUNTER__)
>   | ^~~
> ./include/linux/build_bug.h:39:37: note: in expansion of macro 
> 'compiletime_assert'
>39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
>   | ^~
> ./include/linux/build_bug.h:59:21: note: in expansion of macro 
> 'BUILD_BUG_ON_MSG'
>59 | #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
>   | ^~~~
> ./include/linux/huge_mm.h:307:28: note: in expansion of macro 'BUILD_BUG'
>   307 | #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
>   |^
> ./include/linux/huge_mm.h:104:26: note: in expansion of macro 
> 'HPAGE_PMD_SHIFT'
>   104 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
>   |  ^~~
> ./include/linux/huge_mm.h:105:26: note: in expansion of macro 
> 'HPAGE_PMD_ORDER'
>   105 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER)
>   |  ^~~
> mm/page_vma_mapped.c:113:20: note: in expansion of macro 'HPAGE_PMD_NR'
>   113 | if ((pfn + HPAGE_PMD_NR - 1) < pvmw->pfn)
>   |^~~~
> make[1]: *** [scripts/Makefile.build:288: mm/page_vma_mapped.o] Error 1
> make: *** [Makefile:1971: mm] Error 2
>
>
> because check_pmd() uses HPAGE_PMD_NR and
>
> #else /* CONFIG_TRANSPARENT_HUGEPAGE */
> #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
>
> I don't immediately see why this patch triggers it...

Maybe the reason is as follows.

The first check_pmd() is wrapped inside an `if (pmd_trans_huge(pmde))`
block; since pmd_trans_huge() just returns 0 here, check_pmd() will be
optimized out.  There is an `if (!thp_migration_supported()) return;` block
before the second check_pmd(); however, thp_migration_supported()
returns 0 on riscv, so the second check_pmd() can be optimized out as
well.  I think I should replace `pmd_leaf` with `pmd_trans_huge() ||
pmd_devmap()` to fix it.
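
A reduced illustration of that effect (userspace C, gcc with optimization;
this mimics the !CONFIG_TRANSPARENT_HUGEPAGE stubs and is not kernel
code): the kernel's BUILD_BUG() ends up as a call to a function marked
__attribute__((error)), so it only breaks the build if the call survives
dead-code elimination.

	/* fails the build only if the call is still emitted after DCE */
	extern void __build_bug(void) __attribute__((error("BUILD_BUG failed")));

	#define BUILD_BUG()		__build_bug()
	#define HPAGE_PMD_SHIFT		({ BUILD_BUG(); 0; })	/* !THP stub */
	#define HPAGE_PMD_NR		(1 << HPAGE_PMD_SHIFT)
	#define pmd_trans_huge(pmd)	0			/* !THP stub */

	static int check_pmd(unsigned long pfn)
	{
		return pfn + HPAGE_PMD_NR - 1;	/* expands to BUILD_BUG() */
	}

	int old_walk(unsigned long pmd, unsigned long pfn)
	{
		/* pmd_trans_huge() folds to 0, check_pmd() is eliminated: builds */
		if (pmd_trans_huge(pmd))
			return check_pmd(pfn);
		return 0;
	}

	int new_walk(unsigned long pmd, unsigned long pfn)
	{
		/* the condition is not a compile-time constant, so check_pmd()
		 * stays reachable and the error attribute fires at build time */
		if (pmd & 1)	/* stands in for pmd_leaf() */
			return check_pmd(pfn);
		return 0;
	}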

Thanks.



[PATCH v3 6/6] mm: remove range parameter from follow_invalidate_pte()

2022-02-27 Thread Muchun Song
The only user (DAX) of the range parameter of follow_invalidate_pte()
is gone, so it is safe to remove the range parameter and make the
function static to simplify the code.

Signed-off-by: Muchun Song 
---
 include/linux/mm.h |  3 ---
 mm/memory.c| 23 +++
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c9bada4096ac..be7ec4c37ebe 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1871,9 +1871,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index cc6968dc8e4e..278ab6d62b54 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4964,9 +4964,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+static int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+pte_t **ptepp, pmd_t **pmdpp, spinlock_t 
**ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4993,31 +4992,17 @@ int follow_invalidate_pte(struct mm_struct *mm, 
unsigned long address,
if (!pmdpp)
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
}
 
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -5025,8 +5010,6 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
@@ -5055,7 +5038,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp)
 {
-   return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+   return follow_invalidate_pte(mm, address, ptepp, NULL, ptlp);
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
-- 
2.11.0




[PATCH v3 5/6] dax: fix missing writeprotect the pte entry

2022-02-27 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Use pfn_mkclean_range() to clean the pfns to fix this issue.

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 83 ++--
 1 file changed, 7 insertions(+), 76 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a372304c9695..7fd4a16769f9 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -789,87 +790,17 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
 /* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, unsigned long pfn,
+ unsigned long npfn, pgoff_t start)
 {
struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
+   pgoff_t end = start + npfn - 1;
 
i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
+   vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+   pfn_mkclean_range(pfn, npfn, start, vma);
cond_resched();
-
-   if (!(vma->vm_flags & VM_SHARED))
-   continue;
-
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
- &pmdp, &ptl))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address,
- address + HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   pte = pte_wrprotect(pte);
-   pte = pte_mkclean(pte);
- 

[PATCH v3 4/6] mm: pvmw: add support for walking devmap pages

2022-02-27 Thread Muchun Song
page_vma_mapped_walk() cannot currently be used to check whether a huge
devmap page is mapped into a vma.  Add support for walking huge devmap
pages so that DAX can use it in the next patch.

Signed-off-by: Muchun Song 
---
 mm/page_vma_mapped.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 1187f9c1ec5b..3f337e4e7f5f 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -210,10 +210,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw)
 */
pmde = READ_ONCE(*pvmw->pmd);
 
-   if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+   if (pmd_leaf(pmde) || is_pmd_migration_entry(pmde)) {
pvmw->ptl = pmd_lock(mm, pvmw->pmd);
pmde = *pvmw->pmd;
-   if (likely(pmd_trans_huge(pmde))) {
+   if (likely(pmd_leaf(pmde))) {
if (pvmw->flags & PVMW_MIGRATION)
return not_found(pvmw);
if (!check_pmd(pmd_pfn(pmde), pvmw))
-- 
2.11.0




[PATCH v3 3/6] mm: rmap: introduce pfn_mkclean_range() to cleans PTEs

2022-02-27 Thread Muchun Song
The page_mkclean_one() is supposed to be used with a pfn that has an
associated struct page, but not all pfns (e.g. DAX) have a struct
page. Introduce a new function pfn_mkclean_range() to clean the PTEs
(including PMDs) mapping a range of pfns that have no struct page
associated with them. This helper will be used by the DAX device in the
next patch to make pfns clean.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h |  3 +++
 mm/internal.h| 26 +
 mm/rmap.c| 65 +++-
 3 files changed, 74 insertions(+), 20 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index b58ddb8b2220..a6ec0d3e40c1 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -263,6 +263,9 @@ unsigned long page_address_in_vma(struct page *, struct 
vm_area_struct *);
  */
 int folio_mkclean(struct folio *);
 
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma);
+
 void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index f45292dc4ef5..ff873944749f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -516,26 +516,22 @@ void mlock_page_drain(int cpu);
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * Return the start of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
-   } else if (PageHead(page) &&
-  pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+   } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -545,6 +541,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
  * Then at what user virtual address will none of the range be found in vma?
  * Assumes that vma_address() already returned a good starting address.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 723682ddb9e8..ad5cf0e45a73 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -929,12 +929,12 @@ int folio_referenced(struct folio *folio, int is_locked,
return pra.referenced;
 }
 
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
-   unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-   DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+   int cleaned = 0;
+   struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
-   int *cleaned = arg;
+   unsigned long address = pvmw->address;
 
/*
 * We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +942,16 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
 */
mmu_notifier_range_init(, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
-   vma_address_end());
+   vma_address_end(pvmw));
mmu_notifier_invalidate_range_start();
 
-   while (page_vma_mapped_walk()) {
+   while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
 
-   address = pvmw.address;
-   if (pvmw.pte) {
+   address = pvmw->address;
+   if (pvmw->pte) {
pte_t entry;
-  

[PATCH v3 2/6] dax: fix cache flush on PMD-mapped pages

2022-02-27 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
For a PMD-mapped THP, however, it covers only the head page rather than
the whole huge page.  Replace it with flush_cache_range() to fix this
issue.
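
For scale, a worked example of the size mismatch (the 4 KiB base page
and 2 MiB PMD sizes are the common x86-64 values and are an assumption
here):

	/*
	 *   flush_cache_page(vma, address, pfn)
	 *       flushes PAGE_SIZE      =   4 KiB
	 *   flush_cache_range(vma, address, address + HPAGE_PMD_SIZE)
	 *       flushes HPAGE_PMD_SIZE = 512 * 4 KiB = 2 MiB
	 *
	 * so the old call could leave 511 of the 512 subpages stale on an
	 * architecture with virtually indexed caches.
	 */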

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 67a08a32fccb..a372304c9695 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -845,7 +845,8 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH v3 1/6] mm: rmap: fix cache flush on THP pages

2022-02-27 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
For a PMD-mapped THP, however, it covers only the head page rather than
the whole huge page.  Replace it with flush_cache_range() to fix this
issue.  No problems have been observed from this so far, probably
because few architectures have virtually indexed caches.

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
Reviewed-by: Yang Shi 
---
 mm/rmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index fc46a3d7b704..723682ddb9e8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -970,7 +970,8 @@ static bool page_mkclean_one(struct folio *folio, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, folio_pfn(folio));
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




[PATCH v3 0/6] Fix some bugs related to ramp and dax

2022-02-27 Thread Muchun Song
This series is based on next-20220225.

Patches 1-2 fix a cache flush bug; because subsequent patches depend on
those changes, they are placed in this series.  Patches 3-4 are
preparation for fixing a DAX bug in patch 5.  Patch 6 is a code cleanup
since the previous patch removes the last usage of
follow_invalidate_pte().

v3:
- Based on next-20220225.

v2:
- Avoid overly long lines in lots of places, as suggested by Christoph.
- Fix a compiler warning reported by kernel test robot since pmd_pfn()
  is not defined when !CONFIG_TRANSPARENT_HUGEPAGE on powerpc architecture.
- Split a new patch 4 for preparation of fixing the dax bug.

Muchun Song (6):
  mm: rmap: fix cache flush on THP pages
  dax: fix cache flush on PMD-mapped pages
  mm: rmap: introduce pfn_mkclean_range() to cleans PTEs
  mm: pvmw: add support for walking devmap pages
  dax: fix missing writeprotect the pte entry
  mm: remove range parameter from follow_invalidate_pte()

 fs/dax.c | 82 +---
 include/linux/mm.h   |  3 --
 include/linux/rmap.h |  3 ++
 mm/internal.h| 26 +++--
 mm/memory.c  | 23 ++-
 mm/page_vma_mapped.c |  4 +--
 mm/rmap.c| 68 +++
 7 files changed, 88 insertions(+), 121 deletions(-)

-- 
2.11.0




[PATCH v2 6/6] mm: remove range parameter from follow_invalidate_pte()

2022-02-02 Thread Muchun Song
The only user (DAX) of the range parameter of follow_invalidate_pte()
is gone, so it is safe to remove the range parameter and make the
function static to simplify the code.

Signed-off-by: Muchun Song 
---
 include/linux/mm.h |  3 ---
 mm/memory.c| 23 +++
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d211a06784d5..7895b17f6847 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1814,9 +1814,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 514a81cdd1ae..e8ce066be5f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4869,9 +4869,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+static int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+pte_t **ptepp, pmd_t **pmdpp, spinlock_t 
**ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4898,31 +4897,17 @@ int follow_invalidate_pte(struct mm_struct *mm, 
unsigned long address,
if (!pmdpp)
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
}
 
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4930,8 +4915,6 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
@@ -4960,7 +4943,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp)
 {
-   return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+   return follow_invalidate_pte(mm, address, ptepp, NULL, ptlp);
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
-- 
2.11.0




[PATCH v2 5/6] dax: fix missing writeprotect the pte entry

2022-02-02 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Use pfn_mkclean_range() to clean the pfns to fix this issue.
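
For context, a caller-side sketch (the existing call site lives in
dax_writeback_one() and is not part of the hunk below; the local names
pfn, count and index are assumptions for illustration):

	/*
	 * pfn/count describe the pfns backing the DAX entry and index is
	 * the file page offset the entry starts at; clean every mapping
	 * of that range.
	 */
	dax_entry_mkclean(mapping, pfn, count, index);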

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 83 ++--
 1 file changed, 7 insertions(+), 76 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index e031e4b6c13c..b64ac02d55d7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -801,87 +802,17 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
 /* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, unsigned long pfn,
+ unsigned long npfn, pgoff_t start)
 {
struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
+   pgoff_t end = start + npfn - 1;
 
i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, >i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
+   vma_interval_tree_foreach(vma, >i_mmap, start, end) {
+   pfn_mkclean_range(pfn, npfn, start, vma);
cond_resched();
-
-   if (!(vma->vm_flags & VM_SHARED))
-   continue;
-
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, , ,
- , ))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address,
- address + HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   pte = pte_wrprotect(pte);
-   pte = pte_mkclean(pte);
- 

[PATCH v2 4/6] mm: rmap: introduce pfn_mkclean_range() to cleans PTEs

2022-02-02 Thread Muchun Song
page_mkclean_one() is supposed to be used with a pfn that has an
associated struct page, but not all pfns (e.g. DAX) have one.  Introduce
a new function, pfn_mkclean_range(), to clean the PTEs (including PMDs)
that map a range of pfns with no struct page associated with them.  This
helper will be used by the DAX device in the next patch to make pfns
clean.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h |  3 ++
 mm/internal.h| 26 ++--
 mm/rmap.c| 84 +---
 3 files changed, 86 insertions(+), 27 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 78373935ad49..668a1e81b442 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -241,6 +241,9 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk 
*pvmw);
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+int pfn_mkclean_range(unsigned long pfn, int npfn, pgoff_t pgoff,
+ struct vm_area_struct *vma);
+
 /*
  * Cleans the PTEs of shared mappings.
  * (and since clean PTEs should also be readonly, write protects them too)
diff --git a/mm/internal.h b/mm/internal.h
index 5458cd08df33..dc71256e568f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -449,26 +449,22 @@ extern void clear_page_mlock(struct page *page);
 extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
 
 /*
- * At what user virtual address is page expected in vma?
- * Returns -EFAULT if all of the page is outside the range of vma.
- * If page is a compound head, the entire compound page is considered.
+ * Return the start of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page);
if (pgoff >= vma->vm_pgoff) {
address = vma->vm_start +
((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address >= vma->vm_end)
address = -EFAULT;
-   } else if (PageHead(page) &&
-  pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+   } else if (pgoff + nr_pages - 1 >= vma->vm_pgoff) {
/* Test above avoids possibility of wrap to 0 on 32-bit */
address = vma->vm_start;
} else {
@@ -478,6 +474,18 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
+ * Return the start of user virtual address of a page within a vma.
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address(page_to_pgoff(page), compound_nr(page), vma);
+}
+
+/*
  * Return the end of user virtual address at the specific offset within
  * a vma.
  */
diff --git a/mm/rmap.c b/mm/rmap.c
index 0ba12dc9fae3..8f1860dc22bc 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -928,34 +928,33 @@ int page_referenced(struct page *page,
return pra.referenced;
 }
 
-static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
-   unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
 {
-   struct page_vma_mapped_walk pvmw = {
-   .page = page,
-   .vma = vma,
-   .address = address,
-   .flags = PVMW_SYNC,
-   };
+   int cleaned = 0;
+   struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
-   int *cleaned = arg;
+   unsigned long end;
+
+   if (pvmw->flags & PVMW_PFN_WALK)
+   end = vma_pgoff_address_end(pvmw->index, pvmw->nr, vma);
+   else
+   end = vma_address_end(pvmw->page, vma);
 
/*
 * We have to assume the worse case ie pmd for invalidation. Note that
 * the page can not be free from this function.
 */
-   mmu_notifier_range_init(, MMU_NOTIFY_PROTECTION_PAGE,
-   0, vma, vma->vm_mm, address,
-   vma_address_end(page, vma));
+   mmu_notifier_range_init(, MMU_NOTIFY_PROTECTION_PAGE, 0, vma,
+   vma->vm_mm, pvmw->address, end);
mmu_notifier_invalidate_range_start();
 
-   while (page_vma_ma

[PATCH v2 3/6] mm: page_vma_mapped: support checking if a pfn is mapped into a vma

2022-02-02 Thread Muchun Song
page_vma_mapped_walk() is supposed to check if a page is mapped into a
vma.  However, not all page frames (e.g. PFN_DEV) have an associated
struct page.  Anyone who wants to check whether such a pfn (without a
struct page) is mapped into a vma would otherwise have to duplicate much
of this function.  So add support for checking if a pfn is mapped into a
vma.  In the next patch, DAX will use this new feature.
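
A caller-side sketch of the new mode (field values are illustrative; the
real user is the DAX change later in this series):

	struct page_vma_mapped_walk pvmw = {
		.pfn	 = pfn,		/* first pfn, no struct page needed */
		.nr	 = npfn,	/* number of pfns in the range */
		.index	 = pgoff,	/* file offset the range is mapped at */
		.vma	 = vma,
		.address = address,	/* start address of the range in vma */
		.flags	 = PVMW_SYNC | PVMW_PFN_WALK,
	};

	while (page_vma_mapped_walk(&pvmw)) {
		/* pvmw.pte or pvmw.pmd points at a mapping of the range */
	}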

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h| 14 --
 include/linux/swapops.h | 13 +++---
 mm/internal.h   | 28 +---
 mm/page_vma_mapped.c| 68 +++--
 4 files changed, 83 insertions(+), 40 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 221c3c6438a7..78373935ad49 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -204,9 +204,18 @@ int make_device_exclusive_range(struct mm_struct *mm, 
unsigned long start,
 #define PVMW_SYNC  (1 << 0)
 /* Look for migarion entries rather than present PTEs */
 #define PVMW_MIGRATION (1 << 1)
+/* Walk the page table by checking the pfn instead of a struct page */
+#define PVMW_PFN_WALK  (1 << 2)
 
 struct page_vma_mapped_walk {
-   struct page *page;
+   union {
+   struct page *page;
+   struct {
+   unsigned long pfn;
+   unsigned int nr;
+   pgoff_t index;
+   };
+   };
struct vm_area_struct *vma;
unsigned long address;
pmd_t *pmd;
@@ -218,7 +227,8 @@ struct page_vma_mapped_walk {
 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 {
/* HugeTLB pte is set to the relevant page table entry without 
pte_mapped. */
-   if (pvmw->pte && !PageHuge(pvmw->page))
+   if (pvmw->pte && (pvmw->flags & PVMW_PFN_WALK ||
+ !PageHuge(pvmw->page)))
pte_unmap(pvmw->pte);
if (pvmw->ptl)
spin_unlock(pvmw->ptl);
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index d356ab4047f7..d28bf65fd6a5 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -247,17 +247,22 @@ static inline int is_writable_migration_entry(swp_entry_t 
entry)
 
 #endif
 
-static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
+static inline unsigned long pfn_swap_entry_to_pfn(swp_entry_t entry)
 {
-   struct page *p = pfn_to_page(swp_offset(entry));
+   unsigned long pfn = swp_offset(entry);
 
/*
 * Any use of migration entries may only occur while the
 * corresponding page is locked
 */
-   BUG_ON(is_migration_entry(entry) && !PageLocked(p));
+   BUG_ON(is_migration_entry(entry) && !PageLocked(pfn_to_page(pfn)));
+
+   return pfn;
+}
 
-   return p;
+static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry)
+{
+   return pfn_to_page(pfn_swap_entry_to_pfn(entry));
 }
 
 /*
diff --git a/mm/internal.h b/mm/internal.h
index deb9bda18e59..5458cd08df33 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -478,25 +478,35 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
- * Then at what user virtual address will none of the page be found in vma?
- * Assumes that vma_address() already returned a good starting address.
- * If page is a compound head, the entire compound page is considered.
+ * Return the end of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address_end(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address_end(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
-   unsigned long address;
+   unsigned long address = vma->vm_start;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page) + compound_nr(page);
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+   address += (pgoff + nr_pages - vma->vm_pgoff) << PAGE_SHIFT;
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address > vma->vm_end)
address = vma->vm_end;
return address;
 }
 
+/*
+ * Return the end of user virtual address of a page within a vma. Assumes that
+ * vma_address() already returned a good starting address. If page is a 
compound
+ * head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address_end(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address_end(page_to_pgoff(page), compound_nr(page)

[PATCH v2 2/6] dax: fix cache flush on PMD-mapped pages

2022-02-02 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
For a PMD-mapped THP, however, it covers only the head page rather than
the whole huge page.  Replace it with flush_cache_range() to fix this
issue.

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 88be1c02a151..e031e4b6c13c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -857,7 +857,8 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH v2 1/6] mm: rmap: fix cache flush on THP pages

2022-02-02 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
For a PMD-mapped THP, however, it covers only the head page rather than
the whole huge page.  Replace it with flush_cache_range() to fix this
issue.  No problems have been observed from this so far, probably
because few architectures have virtually indexed caches.

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
Reviewed-by: Yang Shi 
---
 mm/rmap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index b0fd9dc19eba..0ba12dc9fae3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -974,7 +974,8 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, page_to_pfn(page));
+   flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




[PATCH v2 0/6] Fix some bugs related to ramp and dax

2022-02-02 Thread Muchun Song
Patches 1-2 fix a cache flush bug; because subsequent patches depend on
those changes, they are placed in this series.  Patches 3-4 are
preparation for fixing a DAX bug in patch 5.  Patch 6 is a code cleanup
since the previous patch removes the last usage of
follow_invalidate_pte().

Changes in v2:
  - Avoid overly long lines in lots of places, as suggested by Christoph.
  - Fix a compiler warning reported by kernel test robot since pmd_pfn()
is not defined when !CONFIG_TRANSPARENT_HUGEPAGE on powerpc architecture.
  - Split a new patch 4 for preparation of fixing the dax bug.

Muchun Song (6):
  mm: rmap: fix cache flush on THP pages
  dax: fix cache flush on PMD-mapped pages
  mm: page_vma_mapped: support checking if a pfn is mapped into a vma
  mm: rmap: introduce pfn_mkclean_range() to cleans PTEs
  dax: fix missing writeprotect the pte entry
  mm: remove range parameter from follow_invalidate_pte()

 fs/dax.c| 82 --
 include/linux/mm.h  |  3 --
 include/linux/rmap.h| 17 --
 include/linux/swapops.h | 13 +---
 mm/internal.h   | 52 +++--
 mm/memory.c | 23 ++---
 mm/page_vma_mapped.c| 68 --
 mm/rmap.c   | 87 ++---
 8 files changed, 180 insertions(+), 165 deletions(-)

-- 
2.11.0




Re: [PATCH 4/5] dax: fix missing writeprotect the pte entry

2022-01-24 Thread Muchun Song
On Mon, Jan 24, 2022 at 3:41 PM Christoph Hellwig  wrote:
>
> On Fri, Jan 21, 2022 at 03:55:14PM +0800, Muchun Song wrote:
> > Reuse some infrastructure of page_mkclean_one() to let DAX can handle
> > similar case to fix this issue.
>
> Can you split out some of the infrastructure changes into proper
> well-documented preparation patches?

Will do. I'll introduce page_vma_mkclean_one in a preparation patch
and then fix the DAX issue in a separate patch. Thanks for your
suggestions.

>
> > + pgoff_t pgoff_end = pgoff_start + npfn - 1;
> >
> >   i_mmap_lock_read(mapping);
> > - vma_interval_tree_foreach(vma, >i_mmap, index, index) {
> > - struct mmu_notifier_range range;
> > - unsigned long address;
> > -
> > + vma_interval_tree_foreach(vma, >i_mmap, pgoff_start, 
> > pgoff_end) {
>
> Please avoid the overly long lines here.  Just using start and end
> might be an easy option.
>

Will do.

Thanks.



Re: [PATCH 3/5] mm: page_vma_mapped: support checking if a pfn is mapped into a vma

2022-01-24 Thread Muchun Song
On Mon, Jan 24, 2022 at 3:36 PM Christoph Hellwig  wrote:
>
> On Fri, Jan 21, 2022 at 03:55:13PM +0800, Muchun Song wrote:
> > + if (pvmw->pte && ((pvmw->flags & PVMW_PFN_WALK) || 
> > !PageHuge(pvmw->page)))
>
> Please avoid the overly long line here and in a few other places.

OK.

>
> > +/*
> > + * Then at what user virtual address will none of the page be found in vma?
>
> Doesn't parse, what is this trying to say?

Well, I am also confused.

BTW, this was not introduced by me; it was introduced by:

  commit 37ffe9f4d7ff ("mm/thp: fix vma_address() if virtual address
below file offset")

If it is really confusing, I can replace this line with:

"Return the end user virtual address of a page within a vma"

Thanks.



Re: [PATCH 1/5] mm: rmap: fix cache flush on THP pages

2022-01-24 Thread Muchun Song
On Mon, Jan 24, 2022 at 3:34 PM Christoph Hellwig  wrote:
>
> On Fri, Jan 21, 2022 at 03:55:11PM +0800, Muchun Song wrote:
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/mm/rmap.c b/mm/rmap.c
> > index b0fd9dc19eba..65670cb805d6 100644
> > --- a/mm/rmap.c
> > +++ b/mm/rmap.c
> > @@ -974,7 +974,7 @@ static bool page_mkclean_one(struct page *page, struct 
> > vm_area_struct *vma,
> >   if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
> >   continue;
> >
> > - flush_cache_page(vma, address, page_to_pfn(page));
> > + flush_cache_range(vma, address, address + 
> > HPAGE_PMD_SIZE);
>
> Do we need a flush_cache_folio here given that we must be dealing with
> what effectively is a folio here?

I think that is a future improvement. Keeping this patch minimal
should make it easy for someone to backport. If we would rather not
leave that to a follow-up, I think it is better to introduce
flush_cache_folio in this patch. What do you think?

>
> Also please avoid the overly long line.
>

OK.

Thanks.



[PATCH 5/5] mm: remove range parameter from follow_invalidate_pte()

2022-01-20 Thread Muchun Song
The only user (DAX) of the range parameter of follow_invalidate_pte()
is gone, so it is safe to remove the range parameter and make the
function static to simplify the code.

Signed-off-by: Muchun Song 
---
 include/linux/mm.h |  3 ---
 mm/memory.c| 23 +++
 2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d211a06784d5..7895b17f6847 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1814,9 +1814,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long 
addr,
unsigned long end, unsigned long floor, unsigned long ceiling);
 int
 copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct 
*src_vma);
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp);
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp);
 int follow_pfn(struct vm_area_struct *vma, unsigned long address,
diff --git a/mm/memory.c b/mm/memory.c
index 514a81cdd1ae..e8ce066be5f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4869,9 +4869,8 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 
unsigned long address)
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
 
-int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range, pte_t **ptepp,
- pmd_t **pmdpp, spinlock_t **ptlp)
+static int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+pte_t **ptepp, pmd_t **pmdpp, spinlock_t 
**ptlp)
 {
pgd_t *pgd;
p4d_t *p4d;
@@ -4898,31 +4897,17 @@ int follow_invalidate_pte(struct mm_struct *mm, 
unsigned long address,
if (!pmdpp)
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
-   NULL, mm, address & PMD_MASK,
-   (address & PMD_MASK) + 
PMD_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
*pmdpp = pmd;
return 0;
}
spin_unlock(*ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
}
 
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
 
-   if (range) {
-   mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
-   address & PAGE_MASK,
-   (address & PAGE_MASK) + PAGE_SIZE);
-   mmu_notifier_invalidate_range_start(range);
-   }
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
goto unlock;
@@ -4930,8 +4915,6 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
return 0;
 unlock:
pte_unmap_unlock(ptep, *ptlp);
-   if (range)
-   mmu_notifier_invalidate_range_end(range);
 out:
return -EINVAL;
 }
@@ -4960,7 +4943,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned 
long address,
 int follow_pte(struct mm_struct *mm, unsigned long address,
   pte_t **ptepp, spinlock_t **ptlp)
 {
-   return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
+   return follow_invalidate_pte(mm, address, ptepp, NULL, ptlp);
 }
 EXPORT_SYMBOL_GPL(follow_pte);
 
-- 
2.11.0




[PATCH 4/5] dax: fix missing writeprotect the pte entry

2022-01-20 Thread Muchun Song
Currently dax_mapping_entry_mkclean() fails to clean and write protect
the pte entry within a DAX PMD entry during an *sync operation. This
can result in data loss in the following sequence:

  1) process A mmap write to DAX PMD, dirtying PMD radix tree entry and
 making the pmd entry dirty and writeable.
  2) process B mmap with the @offset (e.g. 4K) and @length (e.g. 4K)
 write to the same file, dirtying PMD radix tree entry (already
 done in 1)) and making the pte entry dirty and writeable.
  3) fsync, flushing out PMD data and cleaning the radix tree entry. We
 currently fail to mark the pte entry as clean and write protected
 since the vma of process B is not covered in dax_entry_mkclean().
  4) process B writes to the pte. These don't cause any page faults since
 the pte entry is dirty and writeable. The radix tree entry remains
 clean.
  5) fsync, which fails to flush the dirty PMD data because the radix tree
 entry was clean.
  6) crash - dirty data that should have been fsync'd as part of 5) could
 still have been in the processor cache, and is lost.

Reuse some infrastructure of page_mkclean_one() to let DAX handle a
similar case and fix this issue.

Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 78 +---
 include/linux/rmap.h |  9 ++
 mm/internal.h| 27 --
 mm/rmap.c| 69 ++
 4 files changed, 85 insertions(+), 98 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 2955ec65eb65..7d4e3e68b861 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -801,86 +802,21 @@ static void *dax_insert_entry(struct xa_state *xas,
return entry;
 }
 
-static inline
-unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
-{
-   unsigned long address;
-
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-   VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-   return address;
-}
-
 /* Walk all mappings of a given index of a file and writeprotect them */
-static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
-   unsigned long pfn)
+static void dax_entry_mkclean(struct address_space *mapping, unsigned long pfn,
+ unsigned long npfn, pgoff_t pgoff_start)
 {
struct vm_area_struct *vma;
-   pte_t pte, *ptep = NULL;
-   pmd_t *pmdp = NULL;
-   spinlock_t *ptl;
+   pgoff_t pgoff_end = pgoff_start + npfn - 1;
 
i_mmap_lock_read(mapping);
-   vma_interval_tree_foreach(vma, >i_mmap, index, index) {
-   struct mmu_notifier_range range;
-   unsigned long address;
-
+   vma_interval_tree_foreach(vma, >i_mmap, pgoff_start, 
pgoff_end) {
cond_resched();
 
if (!(vma->vm_flags & VM_SHARED))
continue;
 
-   address = pgoff_address(index, vma);
-
-   /*
-* follow_invalidate_pte() will use the range to call
-* mmu_notifier_invalidate_range_start() on our behalf before
-* taking any lock.
-*/
-   if (follow_invalidate_pte(vma->vm_mm, address, , ,
- , ))
-   continue;
-
-   /*
-* No need to call mmu_notifier_invalidate_range() as we are
-* downgrading page table protection not changing it to point
-* to a new page.
-*
-* See Documentation/vm/mmu_notifier.rst
-*/
-   if (pmdp) {
-#ifdef CONFIG_FS_DAX_PMD
-   pmd_t pmd;
-
-   if (pfn != pmd_pfn(*pmdp))
-   goto unlock_pmd;
-   if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
-   goto unlock_pmd;
-
-   flush_cache_range(vma, address, address + 
HPAGE_PMD_SIZE);
-   pmd = pmdp_invalidate(vma, address, pmdp);
-   pmd = pmd_wrprotect(pmd);
-   pmd = pmd_mkclean(pmd);
-   set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-unlock_pmd:
-#endif
-   spin_unlock(ptl);
-   } else {
-   if (pfn != pte_pfn(*ptep))
-   goto unlock_pte;
-   if (!pte_dirty(*ptep) && !pte_write(*ptep))
-   goto unlock_pte;
-
-   flush_cache_page(vma, address, pfn);
-   pte = ptep_clear_flush(vma, address, ptep);
-   

[PATCH 3/5] mm: page_vma_mapped: support checking if a pfn is mapped into a vma

2022-01-20 Thread Muchun Song
page_vma_mapped_walk() is supposed to check if a page is mapped into a
vma.  However, not all page frames (e.g. PFN_DEV) have an associated
struct page.  Anyone who wants to check whether such a pfn (without a
struct page) is mapped into a vma would otherwise have to duplicate much
of this function.  So add support for checking if a pfn is mapped into a
vma.  In the next patch, DAX will use this new feature.

Signed-off-by: Muchun Song 
---
 include/linux/rmap.h | 13 +--
 mm/internal.h| 25 +---
 mm/page_vma_mapped.c | 65 +---
 3 files changed, 70 insertions(+), 33 deletions(-)

diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index 221c3c6438a7..7628474732e7 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -204,9 +204,18 @@ int make_device_exclusive_range(struct mm_struct *mm, 
unsigned long start,
 #define PVMW_SYNC  (1 << 0)
 /* Look for migarion entries rather than present PTEs */
 #define PVMW_MIGRATION (1 << 1)
+/* Walk the page table by checking the pfn instead of a struct page */
+#define PVMW_PFN_WALK  (1 << 2)
 
 struct page_vma_mapped_walk {
-   struct page *page;
+   union {
+   struct page *page;
+   struct {
+   unsigned long pfn;
+   unsigned int nr;
+   pgoff_t index;
+   };
+   };
struct vm_area_struct *vma;
unsigned long address;
pmd_t *pmd;
@@ -218,7 +227,7 @@ struct page_vma_mapped_walk {
 static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
 {
/* HugeTLB pte is set to the relevant page table entry without 
pte_mapped. */
-   if (pvmw->pte && !PageHuge(pvmw->page))
+   if (pvmw->pte && ((pvmw->flags & PVMW_PFN_WALK) || 
!PageHuge(pvmw->page)))
pte_unmap(pvmw->pte);
if (pvmw->ptl)
spin_unlock(pvmw->ptl);
diff --git a/mm/internal.h b/mm/internal.h
index deb9bda18e59..d6e3e8e1be2d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -478,25 +478,34 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 }
 
 /*
- * Then at what user virtual address will none of the page be found in vma?
- * Assumes that vma_address() already returned a good starting address.
- * If page is a compound head, the entire compound page is considered.
+ * Return the end of user virtual address at the specific offset within
+ * a vma.
  */
 static inline unsigned long
-vma_address_end(struct page *page, struct vm_area_struct *vma)
+vma_pgoff_address_end(pgoff_t pgoff, unsigned long nr_pages,
+ struct vm_area_struct *vma)
 {
-   pgoff_t pgoff;
unsigned long address;
 
-   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
-   pgoff = page_to_pgoff(page) + compound_nr(page);
-   address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+   address = vma->vm_start + ((pgoff + nr_pages - vma->vm_pgoff) << 
PAGE_SHIFT);
/* Check for address beyond vma (or wrapped through 0?) */
if (address < vma->vm_start || address > vma->vm_end)
address = vma->vm_end;
return address;
 }
 
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address_end(struct page *page, struct vm_area_struct *vma)
+{
+   VM_BUG_ON_PAGE(PageKsm(page), page);/* KSM page->index unusable */
+   return vma_pgoff_address_end(page_to_pgoff(page), compound_nr(page), 
vma);
+}
+
 static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
struct file *fpin)
 {
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index f7b331081791..c8819770d457 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -53,10 +53,16 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
return true;
 }
 
-static inline bool pfn_is_match(struct page *page, unsigned long pfn)
+static inline bool pfn_is_match(struct page_vma_mapped_walk *pvmw, unsigned 
long pfn)
 {
-   unsigned long page_pfn = page_to_pfn(page);
+   struct page *page;
+   unsigned long page_pfn;
 
+   if (pvmw->flags & PVMW_PFN_WALK)
+   return pfn >= pvmw->pfn && pfn - pvmw->pfn < pvmw->nr;
+
+   page = pvmw->page;
+   page_pfn = page_to_pfn(page);
/* normal page and hugetlbfs page */
if (!PageTransCompound(page) || PageHuge(page))
return page_pfn == pfn;
@@ -116,7 +122,7 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw

[PATCH 2/5] dax: fix cache flush on PMD-mapped pages

2022-01-20 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
For a PMD-mapped THP, however, it covers only the head page rather than
the whole huge page.  Replace it with flush_cache_range() to fix this
issue.

Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean")
Signed-off-by: Muchun Song 
---
 fs/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 88be1c02a151..2955ec65eb65 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -857,7 +857,7 @@ static void dax_entry_mkclean(struct address_space 
*mapping, pgoff_t index,
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
 
-   flush_cache_page(vma, address, pfn);
+   flush_cache_range(vma, address, address + 
HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
-- 
2.11.0




[PATCH 1/5] mm: rmap: fix cache flush on THP pages

2022-01-20 Thread Muchun Song
flush_cache_page() only removes a PAGE_SIZE sized range from the cache.
For a PMD-mapped THP, however, it covers only the head page rather than
the whole huge page.  Replace it with flush_cache_range() to fix this
issue.  No problems have been observed from this so far, probably
because few architectures have virtually indexed caches.

Fixes: f27176cfc363 ("mm: convert page_mkclean_one() to use 
page_vma_mapped_walk()")
Signed-off-by: Muchun Song 
---
 mm/rmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/rmap.c b/mm/rmap.c
index b0fd9dc19eba..65670cb805d6 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -974,7 +974,7 @@ static bool page_mkclean_one(struct page *page, struct 
vm_area_struct *vma,
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
 
-   flush_cache_page(vma, address, page_to_pfn(page));
+   flush_cache_range(vma, address, address + 
HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
-- 
2.11.0




Re: [External] Re: [PATCH v20 6/9] mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page

2021-04-20 Thread Muchun Song
On Tue, Apr 20, 2021 at 7:20 AM Mike Kravetz  wrote:
>
> On 4/15/21 1:40 AM, Muchun Song wrote:
> > When we free a HugeTLB page to the buddy allocator, we need to allocate
> > the vmemmap pages associated with it. However, we may not be able to
> > allocate the vmemmap pages when the system is under memory pressure. In
> > this case, we just refuse to free the HugeTLB page. This changes behavior
> > in some corner cases as listed below:
> >
> >  1) Failing to free a huge page triggered by the user (decrease nr_pages).
> >
> > User needs to try again later.
> >
> >  2) Failing to free a surplus huge page when freed by the application.
> >
> > Try again later when freeing a huge page next time.
> >
> >  3) Failing to dissolve a free huge page on ZONE_MOVABLE via
> > offline_pages().
> >
> > This can happen when we have plenty of ZONE_MOVABLE memory, but
> > not enough kernel memory to allocate vmemmmap pages.  We may even
> > be able to migrate huge page contents, but will not be able to
> > dissolve the source huge page.  This will prevent an offline
> > operation and is unfortunate as memory offlining is expected to
> > succeed on movable zones.  Users that depend on memory hotplug
> > to succeed for movable zones should carefully consider whether the
> > memory savings gained from this feature are worth the risk of
> > possibly not being able to offline memory in certain situations.
> >
> >  4) Failing to dissolve a huge page on CMA/ZONE_MOVABLE via
> > alloc_contig_range() - once we have that handling in place. Mainly
> > affects CMA and virtio-mem.
> >
> > Similar to 3). virito-mem will handle migration errors gracefully.
> > CMA might be able to fallback on other free areas within the CMA
> > region.
> >
> > Vmemmap pages are allocated from the page freeing context. In order for
> > those allocations to be not disruptive (e.g. trigger oom killer)
> > __GFP_NORETRY is used. hugetlb_lock is dropped for the allocation
> > because a non sleeping allocation would be too fragile and it could fail
> > too easily under memory pressure. GFP_ATOMIC or other modes to access
> > memory reserves is not used because we want to prevent consuming
> > reserves under heavy hugetlb freeing.
> >
> > Signed-off-by: Muchun Song 
> > ---
> >  Documentation/admin-guide/mm/hugetlbpage.rst|  8 +++
> >  Documentation/admin-guide/mm/memory-hotplug.rst | 13 
> >  include/linux/hugetlb.h |  3 +
> >  include/linux/mm.h  |  2 +
> >  mm/hugetlb.c| 85 
> > -
> >  mm/hugetlb_vmemmap.c| 34 ++
> >  mm/hugetlb_vmemmap.h|  6 ++
> >  mm/sparse-vmemmap.c | 75 +-
> >  8 files changed, 210 insertions(+), 16 deletions(-)
> >
> > diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst 
> > b/Documentation/admin-guide/mm/hugetlbpage.rst
> > index f7b1c7462991..6988895d09a8 100644
> > --- a/Documentation/admin-guide/mm/hugetlbpage.rst
> > +++ b/Documentation/admin-guide/mm/hugetlbpage.rst
> > @@ -60,6 +60,10 @@ HugePages_Surp
> >  the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
> >  maximum number of surplus huge pages is controlled by
> >  ``/proc/sys/vm/nr_overcommit_hugepages``.
> > + Note: When the feature of freeing unused vmemmap pages associated
> > + with each hugetlb page is enabled, the number of surplus huge pages
> > + may be temporarily larger than the maximum number of surplus huge
> > + pages when the system is under memory pressure.
> >  Hugepagesize
> >   is the default hugepage size (in Kb).
> >  Hugetlb
> > @@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task.  A 
> > user with root
> >  privileges can dynamically allocate more or free some persistent huge pages
> >  by increasing or decreasing the value of ``nr_hugepages``.
> >
> > +Note: When the feature of freeing unused vmemmap pages associated with each
> > +hugetlb page is enabled, we can fail to free the huge pages triggered by
> > +the user when ths system is under memory pressure.  Please try again later.
> > +
> >  Pages that are used as huge pages are reserved inside the kernel and cannot
> >  be used for other purposes.  Huge pages cannot be swapped out under
> >  memory pressure.
> >

Re: [External] [PATCH v4 5/5] mm/memcg: Improve refill_obj_stock() performance

2021-04-19 Thread Muchun Song
On Mon, Apr 19, 2021 at 8:01 AM Waiman Long  wrote:
>
> There are two issues with the current refill_obj_stock() code. First of
> all, when nr_bytes reaches over PAGE_SIZE, it calls drain_obj_stock() to
> atomically flush out remaining bytes to obj_cgroup, clear cached_objcg
> and do a obj_cgroup_put(). It is likely that the same obj_cgroup will
> be used again which leads to another call to drain_obj_stock() and
> obj_cgroup_get() as well as atomically retrieve the available byte from
> obj_cgroup. That is costly. Instead, we should just uncharge the excess
> pages, reduce the stock bytes and be done with it. The drain_obj_stock()
> function should only be called when obj_cgroup changes.
>
> Secondly, when charging an object of size not less than a page in
> obj_cgroup_charge(), it is possible that the remaining bytes to be
> refilled to the stock will overflow a page and cause refill_obj_stock()
> to uncharge 1 page. To avoid the additional uncharge in this case,
> a new overfill flag is added to refill_obj_stock() which will be set
> when called from obj_cgroup_charge().
>
> Signed-off-by: Waiman Long 
> ---
>  mm/memcontrol.c | 23 +--
>  1 file changed, 17 insertions(+), 6 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a6dd18f6d8a8..d13961352eef 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3357,23 +3357,34 @@ static bool obj_stock_flush_required(struct 
> memcg_stock_pcp *stock,
> return false;
>  }
>
> -static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> +static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> +bool overfill)
>  {
> unsigned long flags;
> struct obj_stock *stock = get_obj_stock();
> +   unsigned int nr_pages = 0;
>
> if (stock->cached_objcg != objcg) { /* reset if necessary */
> -   drain_obj_stock(stock);
> +   if (stock->cached_objcg)
> +   drain_obj_stock(stock);
> obj_cgroup_get(objcg);
> stock->cached_objcg = objcg;
> stock->nr_bytes = atomic_xchg(>nr_charged_bytes, 0);
> }
> stock->nr_bytes += nr_bytes;
>
> -   if (stock->nr_bytes > PAGE_SIZE)
> -   drain_obj_stock(stock);
> +   if (!overfill && (stock->nr_bytes > PAGE_SIZE)) {
> +   nr_pages = stock->nr_bytes >> PAGE_SHIFT;
> +   stock->nr_bytes &= (PAGE_SIZE - 1);
> +   }
>
> put_obj_stock(flags);
> +
> +   if (nr_pages) {
> +   rcu_read_lock();
> +   __memcg_kmem_uncharge(obj_cgroup_memcg(objcg), nr_pages);
> +   rcu_read_unlock();
> +   }

It is not safe to call __memcg_kmem_uncharge() under the RCU lock
without holding a reference to the memcg. More details can be found
at the following link.

https://lore.kernel.org/linux-mm/20210319163821.20704-2-songmuc...@bytedance.com/

In the above patchset, we introduced obj_cgroup_uncharge_pages() to
uncharge pages from an object cgroup. You can use that safe API.
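
For reference, the essential shape of that helper (paraphrased from the
linked series, not copied verbatim) is:

	static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
					      unsigned int nr_pages)
	{
		/* Pin the memcg so it cannot go away during the uncharge. */
		struct mem_cgroup *memcg = get_mem_cgroup_from_objcg(objcg);

		__memcg_kmem_uncharge(memcg, nr_pages);
		css_put(&memcg->css);
	}

so the rcu_read_lock()/rcu_read_unlock() pair above is neither needed
nor sufficient.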

Thanks.

>  }
>
>  int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
> @@ -3410,7 +3421,7 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t 
> gfp, size_t size)
>
> ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
> if (!ret && nr_bytes)
> -   refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
> +   refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, true);
>
> css_put(>css);
> return ret;
> @@ -3418,7 +3429,7 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t 
> gfp, size_t size)
>
>  void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
>  {
> -   refill_obj_stock(objcg, size);
> +   refill_obj_stock(objcg, size, false);
>  }
>
>  #endif /* CONFIG_MEMCG_KMEM */
> --
> 2.18.1
>


Re: [External] [PATCH v4] dma-buf: Add DmaBufTotal counter in meminfo

2021-04-17 Thread Muchun Song
On Sat, Apr 17, 2021 at 9:44 PM  wrote:
>
> On 4/17/21 3:07 PM, Muchun Song wrote:
> > On Sat, Apr 17, 2021 at 6:41 PM Peter Enderborg
> >  wrote:
> >> This adds a total used dma-buf memory. Details
> >> can be found in debugfs, however it is not for everyone
> >> and not always available. dma-buf are indirect allocated by
> >> userspace. So with this value we can monitor and detect
> >> userspace applications that have problems.
> >>
> >> Signed-off-by: Peter Enderborg 
> >> ---
> >>  drivers/dma-buf/dma-buf.c | 13 +
> >>  fs/proc/meminfo.c |  5 -
> >>  include/linux/dma-buf.h   |  1 +
> >>  3 files changed, 18 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> >> index f264b70c383e..197e5c45dd26 100644
> >> --- a/drivers/dma-buf/dma-buf.c
> >> +++ b/drivers/dma-buf/dma-buf.c
> >> @@ -37,6 +37,7 @@ struct dma_buf_list {
> >>  };
> >>
> >>  static struct dma_buf_list db_list;
> >> +static atomic_long_t dma_buf_global_allocated;
> >>
> >>  static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int 
> >> buflen)
> >>  {
> >> @@ -79,6 +80,7 @@ static void dma_buf_release(struct dentry *dentry)
> >> if (dmabuf->resv == (struct dma_resv *)[1])
> >> dma_resv_fini(dmabuf->resv);
> >>
> >> +   atomic_long_sub(dmabuf->size, _buf_global_allocated);
> >> module_put(dmabuf->owner);
> >> kfree(dmabuf->name);
> >> kfree(dmabuf);
> >> @@ -586,6 +588,7 @@ struct dma_buf *dma_buf_export(const struct 
> >> dma_buf_export_info *exp_info)
> >> mutex_lock(_list.lock);
> >> list_add(>list_node, _list.head);
> >> mutex_unlock(_list.lock);
> >> +   atomic_long_add(dmabuf->size, _buf_global_allocated);
> >>
> >> return dmabuf;
> >>
> >> @@ -1346,6 +1349,16 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct 
> >> dma_buf_map *map)
> >>  }
> >>  EXPORT_SYMBOL_GPL(dma_buf_vunmap);
> >>
> >> +/**
> >> + * dma_buf_allocated_pages - Return the used nr of pages
> >> + * allocated for dma-buf
> >> + */
> >> +long dma_buf_allocated_pages(void)
> >> +{
> >> +   return atomic_long_read(_buf_global_allocated) >> PAGE_SHIFT;
> >> +}
> >> +EXPORT_SYMBOL_GPL(dma_buf_allocated_pages);
> > dma_buf_allocated_pages is only called from fs/proc/meminfo.c.
> > I am confused why it should be exported. If it won't be called
> > from the driver module, we should not export it.
>
> Ah. I thought you did not want the GPL restriction. I don't have real
> opinion about it. It's written to be following the rest of the module.
> It is not needed for the usage of dma-buf in kernel module. But I
> don't see any reason for hiding it either.

Modules do not need dma_buf_allocated_pages(); hiding it prevents
modules from calling it. So I think the EXPORT_SYMBOL_GPL is
unnecessary. If one day someone wants to call it from a module, it
will not be too late to export it at that time.

>
>
> > Thanks.
> >
> >> +
> >>  #ifdef CONFIG_DEBUG_FS
> >>  static int dma_buf_debug_show(struct seq_file *s, void *unused)
> >>  {
> >> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> >> index 6fa761c9cc78..ccc7c40c8db7 100644
> >> --- a/fs/proc/meminfo.c
> >> +++ b/fs/proc/meminfo.c
> >> @@ -16,6 +16,7 @@
> >>  #ifdef CONFIG_CMA
> >>  #include 
> >>  #endif
> >> +#include 
> >>  #include 
> >>  #include "internal.h"
> >>
> >> @@ -145,7 +146,9 @@ static int meminfo_proc_show(struct seq_file *m, void 
> >> *v)
> >> show_val_kb(m, "CmaFree:",
> >> global_zone_page_state(NR_FREE_CMA_PAGES));
> >>  #endif
> >> -
> >> +#ifdef CONFIG_DMA_SHARED_BUFFER
> >> +   show_val_kb(m, "DmaBufTotal:", dma_buf_allocated_pages());
> >> +#endif
> >> hugetlb_report_meminfo(m);
> >>
> >> arch_report_meminfo(m);
> >> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> >> index efdc56b9d95f..5b05816bd2cd 100644
> >> --- a/include/linux/dma-buf.h
> >> +++ b/include/linux/dma-buf.h
> >> @@ -507,4 +507,5 @@ int dma_buf_mmap(struct dma_buf *, struct 
> >> vm_area_struct *,
> >>  unsigned long);
> >>  int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
> >>  void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
> >> +long dma_buf_allocated_pages(void);
> >>  #endif /* __DMA_BUF_H__ */
> >> --
> >> 2.17.1
> >>


Re: [External] [PATCH v4] dma-buf: Add DmaBufTotal counter in meminfo

2021-04-17 Thread Muchun Song
On Sat, Apr 17, 2021 at 6:41 PM Peter Enderborg
 wrote:
>
> This adds a total used dma-buf memory. Details
> can be found in debugfs, however it is not for everyone
> and not always available. dma-buf are indirect allocated by
> userspace. So with this value we can monitor and detect
> userspace applications that have problems.
>
> Signed-off-by: Peter Enderborg 
> ---
>  drivers/dma-buf/dma-buf.c | 13 +
>  fs/proc/meminfo.c |  5 -
>  include/linux/dma-buf.h   |  1 +
>  3 files changed, 18 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index f264b70c383e..197e5c45dd26 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -37,6 +37,7 @@ struct dma_buf_list {
>  };
>
>  static struct dma_buf_list db_list;
> +static atomic_long_t dma_buf_global_allocated;
>
>  static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
>  {
> @@ -79,6 +80,7 @@ static void dma_buf_release(struct dentry *dentry)
> if (dmabuf->resv == (struct dma_resv *)[1])
> dma_resv_fini(dmabuf->resv);
>
> +   atomic_long_sub(dmabuf->size, _buf_global_allocated);
> module_put(dmabuf->owner);
> kfree(dmabuf->name);
> kfree(dmabuf);
> @@ -586,6 +588,7 @@ struct dma_buf *dma_buf_export(const struct 
> dma_buf_export_info *exp_info)
> mutex_lock(_list.lock);
> list_add(>list_node, _list.head);
> mutex_unlock(_list.lock);
> +   atomic_long_add(dmabuf->size, _buf_global_allocated);
>
> return dmabuf;
>
> @@ -1346,6 +1349,16 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct 
> dma_buf_map *map)
>  }
>  EXPORT_SYMBOL_GPL(dma_buf_vunmap);
>
> +/**
> + * dma_buf_allocated_pages - Return the used nr of pages
> + * allocated for dma-buf
> + */
> +long dma_buf_allocated_pages(void)
> +{
> +   return atomic_long_read(_buf_global_allocated) >> PAGE_SHIFT;
> +}
> +EXPORT_SYMBOL_GPL(dma_buf_allocated_pages);

dma_buf_allocated_pages() is only called from fs/proc/meminfo.c.
I am confused about why it should be exported. If it won't be called
from a driver module, we should not export it.

Thanks.

> +
>  #ifdef CONFIG_DEBUG_FS
>  static int dma_buf_debug_show(struct seq_file *s, void *unused)
>  {
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 6fa761c9cc78..ccc7c40c8db7 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -16,6 +16,7 @@
>  #ifdef CONFIG_CMA
>  #include <linux/cma.h>
>  #endif
> +#include <linux/dma-buf.h>
>  #include <asm/page.h>
>  #include "internal.h"
>
> @@ -145,7 +146,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "CmaFree:",
> global_zone_page_state(NR_FREE_CMA_PAGES));
>  #endif
> -
> +#ifdef CONFIG_DMA_SHARED_BUFFER
> +   show_val_kb(m, "DmaBufTotal:", dma_buf_allocated_pages());
> +#endif
> hugetlb_report_meminfo(m);
>
> arch_report_meminfo(m);
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index efdc56b9d95f..5b05816bd2cd 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -507,4 +507,5 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct 
> *,
>  unsigned long);
>  int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
>  void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +long dma_buf_allocated_pages(void);
>  #endif /* __DMA_BUF_H__ */
> --
> 2.17.1
>


[PATCH v3 7/8] mm: memcontrol: move obj_cgroup_uncharge_pages() out of css_set_lock

2021-04-16 Thread Muchun Song
The css_set_lock is used to guard the list of inherited objcgs. So there
is no need to uncharge kernel memory under css_set_lock. Just move it
out of the lock.

Signed-off-by: Muchun Song 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Johannes Weiner 
---
 mm/memcontrol.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c4eebe2a2914..e0c398fe7443 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -289,9 +289,10 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
 
-   spin_lock_irqsave(&css_set_lock, flags);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
+
+   spin_lock_irqsave(&css_set_lock, flags);
list_del(&objcg->list);
spin_unlock_irqrestore(&css_set_lock, flags);
 
-- 
2.11.0



[PATCH v3 8/8] mm: vmscan: remove noinline_for_stack

2021-04-16 Thread Muchun Song
The noinline_for_stack annotation was introduced by commit 666356297ec4
("vmscan: set up pagevec as late as possible in shrink_inactive_list()");
its purpose was to delay the allocation of the pagevec as late as
possible to save stack memory. But commit 2bcf88796381 ("mm: take
pagevecs off reclaim stack") replaced the pagevecs with lists of
pages_to_free, so we no longer need noinline_for_stack. Just remove it
(and let the compiler decide whether to inline).

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Acked-by: Roman Gushchin 
Reviewed-by: Shakeel Butt 
Acked-by: Michal Hocko 
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bc5cf409958..2d2727b78df9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2014,8 +2014,8 @@ static int too_many_isolated(struct pglist_data *pgdat, 
int file,
  *
  * Returns the number of pages moved to the given lruvec.
  */
-static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
-struct list_head *list)
+static unsigned int move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
 {
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
@@ -2095,7 +2095,7 @@ static int current_may_throttle(void)
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
  */
-static noinline_for_stack unsigned long
+static unsigned long
 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 struct scan_control *sc, enum lru_list lru)
 {
-- 
2.11.0



[PATCH v3 3/8] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

2021-04-16 Thread Muchun Song
All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page)
as the 2nd parameter to it (except isolate_migratepages_block()). But
for isolate_migratepages_block(), the page_pgdat(page) is also equal
to the local variable @pgdat. So mem_cgroup_page_lruvec() does not
need the pgdat parameter. Just remove it to simplify the code.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 include/linux/memcontrol.h | 10 +-
 mm/compaction.c|  2 +-
 mm/memcontrol.c|  9 +++--
 mm/swap.c  |  2 +-
 mm/workingset.c|  2 +-
 5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c193be760709..f2a5aaba3577 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -743,13 +743,12 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
mem_cgroup *memcg,
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
- * @pgdat: pgdat of the page
  *
  * This function relies on page->mem_cgroup being stable.
  */
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-   struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 {
+   pg_data_t *pgdat = page_pgdat(page);
struct mem_cgroup *memcg = page_memcg(page);
 
VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
@@ -1221,9 +1220,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
mem_cgroup *memcg,
return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-   struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 {
+   pg_data_t *pgdat = page_pgdat(page);
+
return &pgdat->__lruvec;
 }
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c5028bfbd56..1c500e697c88 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -994,7 +994,7 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
 
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 50e3cf1e263e..caf193088beb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1181,9 +1181,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct 
page *page)
 struct lruvec *lock_page_lruvec(struct page *page)
 {
struct lruvec *lruvec;
-   struct pglist_data *pgdat = page_pgdat(page);
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
spin_lock(&lruvec->lru_lock);
 
lruvec_memcg_debug(lruvec, page);
@@ -1194,9 +1193,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
 struct lruvec *lock_page_lruvec_irq(struct page *page)
 {
struct lruvec *lruvec;
-   struct pglist_data *pgdat = page_pgdat(page);
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irq(&lruvec->lru_lock);
 
lruvec_memcg_debug(lruvec, page);
@@ -1207,9 +1205,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
 struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long 
*flags)
 {
struct lruvec *lruvec;
-   struct pglist_data *pgdat = page_pgdat(page);
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
 
lruvec_memcg_debug(lruvec, page);
diff --git a/mm/swap.c b/mm/swap.c
index a75a8265302b..e0d5699213cc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, 
unsigned int nr_pages)
 
 void lru_note_cost_page(struct page *page)
 {
-   lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+   lru_note_cost(mem_cgroup_page_lruvec(page),
  page_is_file_lru(page), thp_nr_pages(page));
 }
 
diff --git a/mm/workingset.c b/mm/workingset.c
index b7cdeca5a76d..4f7a306ce75a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
-   lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+   lruvec = mem_cgroup_page_lruvec(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
 out:
rcu_read_unlock();
-- 
2.11.0



[PATCH v3 6/8] mm: memcontrol: simplify the logic of objcg pinning memcg

2021-04-16 Thread Muchun Song
The obj_cgroup_release() and memcg_reparent_objcgs() are serialized by
the css_set_lock. We do not need to care about objcg->memcg being
released in the process of obj_cgroup_release(). So there is no need
to pin memcg before releasing objcg. Remove that pinning logic to
simplify the code.

There are only two places that modify objcg->memcg. One is the
initialization of objcg->memcg in memcg_online_kmem(), the other
is objcg reparenting in memcg_reparent_objcgs(). It is also
impossible for the two to run in parallel. So xchg() is unnecessary
and it is enough to use WRITE_ONCE().

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
---
 mm/memcontrol.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index caf193088beb..c4eebe2a2914 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -261,7 +261,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup 
*objcg,
 static void obj_cgroup_release(struct percpu_ref *ref)
 {
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
-   struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
@@ -291,11 +290,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
nr_pages = nr_bytes >> PAGE_SHIFT;
 
spin_lock_irqsave(&css_set_lock, flags);
-   memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
list_del(&objcg->list);
-   mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
 
percpu_ref_exit(ref);
@@ -330,17 +327,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup 
*memcg,
 
spin_lock_irq(&css_set_lock);
 
-   /* Move active objcg to the parent's list */
-   xchg(&objcg->memcg, parent);
-   css_get(&parent->css);
-   list_add(&objcg->list, &parent->objcg_list);
-
-   /* Move already reparented objcgs to the parent's list */
-   list_for_each_entry(iter, &memcg->objcg_list, list) {
-   css_get(&parent->css);
-   xchg(&iter->memcg, parent);
-   css_put(&memcg->css);
-   }
+   /* 1) Ready to reparent active objcg. */
+   list_add(&objcg->list, &memcg->objcg_list);
+   /* 2) Reparent active objcg and already reparented objcgs to parent. */
+   list_for_each_entry(iter, &memcg->objcg_list, list)
+   WRITE_ONCE(iter->memcg, parent);
+   /* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
 
spin_unlock_irq(&css_set_lock);
-- 
2.11.0



[PATCH v3 4/8] mm: memcontrol: simplify lruvec_holds_page_lru_lock

2021-04-16 Thread Muchun Song
We already have a helper lruvec_memcg() to get the memcg from lruvec, we
do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use
lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
to simplify the code. We can have a single definition for this function
that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
CONFIG_MEMCG.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 include/linux/memcontrol.h | 31 +++
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f2a5aaba3577..2fc728492c9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct 
page *page)
return mem_cgroup_lruvec(memcg, pgdat);
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
-{
-   pg_data_t *pgdat = page_pgdat(page);
-   const struct mem_cgroup *memcg;
-   struct mem_cgroup_per_node *mz;
-
-   if (mem_cgroup_disabled())
-   return lruvec == &pgdat->__lruvec;
-
-   mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-   memcg = page_memcg(page) ? : root_mem_cgroup;
-
-   return lruvec->pgdat == pgdat && mz->memcg == memcg;
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -1227,14 +1211,6 @@ static inline struct lruvec 
*mem_cgroup_page_lruvec(struct page *page)
return >__lruvec;
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
-{
-   pg_data_t *pgdat = page_pgdat(page);
-
-   return lruvec == &pgdat->__lruvec;
-}
-
 static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 {
 }
@@ -1516,6 +1492,13 @@ static inline void unlock_page_lruvec_irqrestore(struct 
lruvec *lruvec,
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 }
 
+static inline bool lruvec_holds_page_lru_lock(struct page *page,
+ struct lruvec *lruvec)
+{
+   return lruvec_pgdat(lruvec) == page_pgdat(page) &&
+  lruvec_memcg(lruvec) == page_memcg(page);
+}
+
 /* Don't lock again iff page's lruvec locked */
 static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
struct lruvec *locked_lruvec)
-- 
2.11.0



[PATCH v3 5/8] mm: memcontrol: rename lruvec_holds_page_lru_lock to page_matches_lruvec

2021-04-16 Thread Muchun Song
lruvec_holds_page_lru_lock() doesn't check anything about locking and is
used to check whether the page belongs to the lruvec. So rename it to
page_matches_lruvec().

Signed-off-by: Muchun Song 
Acked-by: Michal Hocko 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
---
 include/linux/memcontrol.h | 8 
 mm/vmscan.c| 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2fc728492c9b..0ce97eff79e2 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1492,8 +1492,8 @@ static inline void unlock_page_lruvec_irqrestore(struct 
lruvec *lruvec,
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
+/* Test requires a stable page->memcg binding, see page_memcg() */
+static inline bool page_matches_lruvec(struct page *page, struct lruvec 
*lruvec)
 {
return lruvec_pgdat(lruvec) == page_pgdat(page) &&
   lruvec_memcg(lruvec) == page_memcg(page);
@@ -1504,7 +1504,7 @@ static inline struct lruvec 
*relock_page_lruvec_irq(struct page *page,
struct lruvec *locked_lruvec)
 {
if (locked_lruvec) {
-   if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+   if (page_matches_lruvec(page, locked_lruvec))
return locked_lruvec;
 
unlock_page_lruvec_irq(locked_lruvec);
@@ -1518,7 +1518,7 @@ static inline struct lruvec 
*relock_page_lruvec_irqsave(struct page *page,
struct lruvec *locked_lruvec, unsigned long *flags)
 {
if (locked_lruvec) {
-   if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+   if (page_matches_lruvec(page, locked_lruvec))
return locked_lruvec;
 
unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb8321026c0c..2bc5cf409958 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2062,7 +2062,7 @@ static unsigned noinline_for_stack 
move_pages_to_lru(struct lruvec *lruvec,
 * All pages were isolated from the same lruvec (and isolation
 * inhibits memcg migration).
 */
-   VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
+   VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
-- 
2.11.0



[PATCH v3 1/8] mm: memcontrol: fix page charging in page replacement

2021-04-16 Thread Muchun Song
The pages aren't accounted at the root level, so do not charge the page
to the root memcg in page replacement. Although we do not display the
value (mem_cgroup_usage) and there should not be any actual problem,
there is a WARN_ON_ONCE in page_counter_cancel() that might trigger.
It is better to fix it.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 mm/memcontrol.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ada9e650a5..f229de925aa5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6806,9 +6806,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
page *newpage)
/* Force-charge the new page. The old one will be freed soon */
nr_pages = thp_nr_pages(newpage);
 
-   page_counter_charge(&memcg->memory, nr_pages);
-   if (do_memsw_account())
-   page_counter_charge(&memcg->memsw, nr_pages);
+   if (!mem_cgroup_is_root(memcg)) {
+   page_counter_charge(&memcg->memory, nr_pages);
+   if (do_memsw_account())
+   page_counter_charge(&memcg->memsw, nr_pages);
+   }
 
css_get(&memcg->css);
commit_charge(newpage, memcg);
-- 
2.11.0



[PATCH v3 2/8] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

2021-04-16 Thread Muchun Song
When mm is NULL, we do not need to hold the rcu lock and call css_tryget
for the root memcg. And we also do not need to check !mm in every
iteration of the loop. So bail out early when !mm.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 mm/memcontrol.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f229de925aa5..50e3cf1e263e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -901,20 +901,23 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct 
mm_struct *mm)
if (mem_cgroup_disabled())
return NULL;
 
+   /*
+* Page cache insertions can happen without an
+* actual mm context, e.g. during disk probing
+* on boot, loopback IO, acct() writes etc.
+*
+* No need to css_get on root memcg as the reference
+* counting is disabled on the root level in the
+* cgroup core. See CSS_NO_REF.
+*/
+   if (unlikely(!mm))
+   return root_mem_cgroup;
+
rcu_read_lock();
do {
-   /*
-* Page cache insertions can happen without an
-* actual mm context, e.g. during disk probing
-* on boot, loopback IO, acct() writes etc.
-*/
-   if (unlikely(!mm))
+   memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+   if (unlikely(!memcg))
memcg = root_mem_cgroup;
-   else {
-   memcg = 
mem_cgroup_from_task(rcu_dereference(mm->owner));
-   if (unlikely(!memcg))
-   memcg = root_mem_cgroup;
-   }
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
-- 
2.11.0



[PATCH v3 0/8] memcontrol code cleanup and simplification

2021-04-16 Thread Muchun Song
This patch series is part of [1] patch series. Because those patches are
code cleanup or simplification. I gather those patches into a separate
series to make it easier to review.

[1] 
https://lore.kernel.org/linux-mm/20210409122959.82264-1-songmuc...@bytedance.com/

Changelogs in v3:
  1. Collect Acked-by and Review-by tags.
  2. Add a comment to patch 5 (suggested by Johannes).

  Thanks to Johannes, Shakeel and Michal's review.

Changelogs in v2:
  1. Collect Acked-by and Review-by tags.
  2. Add a new patch to rename lruvec_holds_page_lru_lock to 
page_matches_lruvec.
  3. Add a comment to patch 2.

  Thanks to Roman, Johannes, Shakeel and Michal's review.

Muchun Song (8):
  mm: memcontrol: fix page charging in page replacement
  mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm
  mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec
  mm: memcontrol: simplify lruvec_holds_page_lru_lock
  mm: memcontrol: rename lruvec_holds_page_lru_lock to
page_matches_lruvec
  mm: memcontrol: simplify the logic of objcg pinning memcg
  mm: memcontrol: move obj_cgroup_uncharge_pages() out of css_set_lock
  mm: vmscan: remove noinline_for_stack

 include/linux/memcontrol.h | 43 ++
 mm/compaction.c|  2 +-
 mm/memcontrol.c| 65 +-
 mm/swap.c  |  2 +-
 mm/vmscan.c|  8 +++---
 mm/workingset.c|  2 +-
 6 files changed, 50 insertions(+), 72 deletions(-)

-- 
2.11.0



Re: [External] Re: [PATCH v20 5/9] mm: hugetlb: defer freeing of HugeTLB pages

2021-04-16 Thread Muchun Song
On Sat, Apr 17, 2021 at 7:56 AM Mike Kravetz  wrote:
>
> On 4/15/21 1:40 AM, Muchun Song wrote:
> > In the subsequent patch, we should allocate the vmemmap pages when
> > freeing a HugeTLB page. But update_and_free_page() can be called
> > under any context, so we cannot use GFP_KERNEL to allocate vmemmap
> > pages. However, we can defer the actual freeing in a kworker to
> > prevent from using GFP_ATOMIC to allocate the vmemmap pages.
>
> Thanks!  I knew we would need to introduce a kworker for this when I
> removed the kworker previously used in free_huge_page.

Yeah, but another choice is using GFP_ATOMIC to allocate vmemmap
pages when we are in an atomic context, and GFP_KERNEL otherwise.
In that case, we can drop the kworker.
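
For illustration only, a rough sketch of that alternative (the @atomic
argument, the gfp_t parameter of alloc_huge_page_vmemmap() and the error
handling below are assumptions made for this sketch, not part of the
series):

	/*
	 * Hypothetical variant: the caller says whether blocking is
	 * allowed instead of deferring the freeing to a kworker.
	 */
	static void update_and_free_page(struct hstate *h, struct page *page,
					 bool atomic)
	{
		gfp_t gfp = atomic ? GFP_ATOMIC : GFP_KERNEL;

		if (alloc_huge_page_vmemmap(h, page, gfp)) {
			/*
			 * Vmemmap allocation failed; the error path (e.g.
			 * putting the page back into the pool) is omitted
			 * in this sketch.
			 */
			return;
		}
		__update_and_free_page(h, page);
	}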

>
> > The __update_and_free_page() is where the call to allocate vmemmmap
> > pages will be inserted.
>
> This patch adds the functionality required for __update_and_free_page
> to potentially sleep and fail.  More questions will come up in the
> subsequent patch when code must deal with the failures.

Right. More questions are welcome.

>
> >
> > Signed-off-by: Muchun Song 
> > ---
> >  mm/hugetlb.c | 73 
> > 
> >  mm/hugetlb_vmemmap.c | 12 -
> >  mm/hugetlb_vmemmap.h | 17 
> >  3 files changed, 85 insertions(+), 17 deletions(-)
> >
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 923d05e2806b..eeb8f5480170 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -1376,7 +1376,7 @@ static void remove_hugetlb_page(struct hstate *h, 
> > struct page *page,
> >   h->nr_huge_pages_node[nid]--;
> >  }
> >
> > -static void update_and_free_page(struct hstate *h, struct page *page)
> > +static void __update_and_free_page(struct hstate *h, struct page *page)
> >  {
> >   int i;
> >   struct page *subpage = page;
> > @@ -1399,12 +1399,73 @@ static void update_and_free_page(struct hstate *h, 
> > struct page *page)
> >   }
> >  }
> >
> > +/*
> > + * As update_and_free_page() can be called under any context, so we cannot
> > + * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
> > + * actual freeing in a workqueue to prevent from using GFP_ATOMIC to 
> > allocate
> > + * the vmemmap pages.
> > + *
> > + * free_hpage_workfn() locklessly retrieves the linked list of pages to be
> > + * freed and frees them one-by-one. As the page->mapping pointer is going
> > + * to be cleared in free_hpage_workfn() anyway, it is reused as the 
> > llist_node
> > + * structure of a lockless linked list of huge pages to be freed.
> > + */
> > +static LLIST_HEAD(hpage_freelist);
> > +
> > +static void free_hpage_workfn(struct work_struct *work)
> > +{
> > + struct llist_node *node;
> > +
> > + node = llist_del_all(&hpage_freelist);
> > +
> > + while (node) {
> > + struct page *page;
> > + struct hstate *h;
> > +
> > + page = container_of((struct address_space **)node,
> > +  struct page, mapping);
> > + node = node->next;
> > + page->mapping = NULL;
> > + h = page_hstate(page);
>
> The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate is going to
> trigger because a previous call to remove_hugetlb_page() will
> set_compound_page_dtor(page, NULL_COMPOUND_DTOR)

Sorry, I did not realise that. Thanks for your reminder.

>
> Note how h(hstate) is grabbed before calling update_and_free_page in
> existing code.
>
> We could potentially drop the !PageHuge(page) in page_hstate.  Or,
> perhaps just use 'size_to_hstate(page_size(page))' in free_hpage_workfn.

I prefer not to change the behavior of page_hstate(). So I
should use 'size_to_hstate(page_size(page))' directly.
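
A rough sketch of the loop with that change applied (the tail of the
loop is not visible in the quote above, so the __update_and_free_page()
call and the cond_resched() here are assumptions):

	static void free_hpage_workfn(struct work_struct *work)
	{
		struct llist_node *node;

		node = llist_del_all(&hpage_freelist);

		while (node) {
			struct page *page;
			struct hstate *h;

			page = container_of((struct address_space **)node,
					    struct page, mapping);
			node = node->next;
			page->mapping = NULL;
			/*
			 * page_hstate() cannot be used here because the
			 * compound dtor was already reset by
			 * remove_hugetlb_page(), so derive the hstate
			 * from the page size instead.
			 */
			h = size_to_hstate(page_size(page));

			__update_and_free_page(h, page);

			cond_resched();
		}
	}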

Thanks Mike.


> --
> Mike Kravetz


Re: [External] [PATCH v3] dma-buf: Add DmaBufTotal counter in meminfo

2021-04-16 Thread Muchun Song
On Sat, Apr 17, 2021 at 12:08 AM Peter Enderborg
 wrote:
>
> This adds a total used dma-buf memory. Details
> can be found in debugfs, however it is not for everyone
> and not always available. dma-buf are indirect allocated by
> userspace. So with this value we can monitor and detect
> userspace applications that have problems.

I want to know more details about the problems.
Can you share what problems you have encountered?

Thanks.

>
> Signed-off-by: Peter Enderborg 
> ---
>  drivers/dma-buf/dma-buf.c | 12 
>  fs/proc/meminfo.c |  5 -
>  include/linux/dma-buf.h   |  1 +
>  3 files changed, 17 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index f264b70c383e..d40fff2ae1fa 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -37,6 +37,7 @@ struct dma_buf_list {
>  };
>
>  static struct dma_buf_list db_list;
> +static atomic_long_t dma_buf_global_allocated;
>
>  static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
>  {
> @@ -79,6 +80,7 @@ static void dma_buf_release(struct dentry *dentry)
> if (dmabuf->resv == (struct dma_resv *)&dmabuf[1])
> dma_resv_fini(dmabuf->resv);
>
> +   atomic_long_sub(dmabuf->size, &dma_buf_global_allocated);
> module_put(dmabuf->owner);
> kfree(dmabuf->name);
> kfree(dmabuf);
> @@ -586,6 +588,7 @@ struct dma_buf *dma_buf_export(const struct 
> dma_buf_export_info *exp_info)
> mutex_lock(&db_list.lock);
> list_add(&dmabuf->list_node, &db_list.head);
> mutex_unlock(&db_list.lock);
> +   atomic_long_add(dmabuf->size, &dma_buf_global_allocated);
>
> return dmabuf;
>
> @@ -1346,6 +1349,15 @@ void dma_buf_vunmap(struct dma_buf *dmabuf, struct 
> dma_buf_map *map)
>  }
>  EXPORT_SYMBOL_GPL(dma_buf_vunmap);
>
> +/**
> + * dma_buf_get_size - Return the used nr pages by dma-buf
> + */
> +long dma_buf_allocated_pages(void)
> +{
> +   return atomic_long_read(&dma_buf_global_allocated) >> PAGE_SHIFT;
> +}
> +EXPORT_SYMBOL_GPL(dma_buf_allocated_pages);

Why is "EXPORT_SYMBOL_GPL" needed here?

> +
>  #ifdef CONFIG_DEBUG_FS
>  static int dma_buf_debug_show(struct seq_file *s, void *unused)
>  {
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 6fa761c9cc78..ccc7c40c8db7 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -16,6 +16,7 @@
>  #ifdef CONFIG_CMA
>  #include <linux/cma.h>
>  #endif
> +#include <linux/dma-buf.h>
>  #include <asm/page.h>
>  #include "internal.h"
>
> @@ -145,7 +146,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
> show_val_kb(m, "CmaFree:",
> global_zone_page_state(NR_FREE_CMA_PAGES));
>  #endif
> -
> +#ifdef CONFIG_DMA_SHARED_BUFFER
> +   show_val_kb(m, "DmaBufTotal:", dma_buf_allocated_pages());
> +#endif
> hugetlb_report_meminfo(m);
>
> arch_report_meminfo(m);
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index efdc56b9d95f..5b05816bd2cd 100644
> --- a/include/linux/dma-buf.h
> +++ b/include/linux/dma-buf.h
> @@ -507,4 +507,5 @@ int dma_buf_mmap(struct dma_buf *, struct vm_area_struct 
> *,
>  unsigned long);
>  int dma_buf_vmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
>  void dma_buf_vunmap(struct dma_buf *dmabuf, struct dma_buf_map *map);
> +long dma_buf_allocated_pages(void);
>  #endif /* __DMA_BUF_H__ */
> --
> 2.17.1
>


Re: [External] Re: [PATCH v20 4/9] mm: hugetlb: free the vmemmap pages associated with each HugeTLB page

2021-04-16 Thread Muchun Song
On Sat, Apr 17, 2021 at 5:10 AM Mike Kravetz  wrote:
>
> On 4/15/21 1:40 AM, Muchun Song wrote:
> > Every HugeTLB has more than one struct page structure. We __know__ that
> > we only use the first 4 (__NR_USED_SUBPAGE) struct page structures
> > to store metadata associated with each HugeTLB.
> >
> > There are a lot of struct page structures associated with each HugeTLB
> > page. For tail pages, the value of compound_head is the same. So we can
> > reuse first page of tail page structures. We map the virtual addresses
> > of the remaining pages of tail page structures to the first tail page
> > struct, and then free these page frames. Therefore, we need to reserve
> > two pages as vmemmap areas.
> >
> > When we allocate a HugeTLB page from the buddy, we can free some vmemmap
> > pages associated with each HugeTLB page. It is more appropriate to do it
> > in the prep_new_huge_page().
> >
> > The free_vmemmap_pages_per_hpage(), which indicates how many vmemmap
> > pages associated with a HugeTLB page can be freed, returns zero for
> > now, which means the feature is disabled. We will enable it once all
> > the infrastructure is there.
> >
> > Signed-off-by: Muchun Song 
> > Reviewed-by: Oscar Salvador 
> > Tested-by: Chen Huang 
> > Tested-by: Bodeddula Balasubramaniam 
> > Acked-by: Michal Hocko 
>
> There may need to be some trivial rebasing due to Oscar's changes
> when they go in.

Yeah, thanks for your reminder.

>
> Reviewed-by: Mike Kravetz 
> --
> Mike Kravetz


Re: [External] Re: [PATCH v2 5/8] mm: memcontrol: rename lruvec_holds_page_lru_lock to page_matches_lruvec

2021-04-16 Thread Muchun Song
On Fri, Apr 16, 2021 at 11:20 PM Johannes Weiner  wrote:
>
> On Fri, Apr 16, 2021 at 01:14:04PM +0800, Muchun Song wrote:
> > lruvec_holds_page_lru_lock() doesn't check anything about locking and is
> > used to check whether the page belongs to the lruvec. So rename it to
> > page_matches_lruvec().
> >
> > Signed-off-by: Muchun Song 
>
> The rename makes sense, since the previous name was defined by a
> specific use case rather than what it does. That said, it did imply a
> lock context that makes the test result stable. Without that the
> function could use a short comment, IMO. How about:
>
> /* Test requires a stable page->memcg binding, see page_memcg() */

Make sense. I will add this comment.

>
> With that,
> Acked-by: Johannes Weiner 

Thanks.


[PATCH v2 8/8] mm: vmscan: remove noinline_for_stack

2021-04-15 Thread Muchun Song
The noinline_for_stack annotation was introduced by commit 666356297ec4
("vmscan: set up pagevec as late as possible in shrink_inactive_list()");
its purpose was to delay the allocation of the pagevec as late as
possible to save stack memory. But commit 2bcf88796381 ("mm: take
pagevecs off reclaim stack") replaced the pagevecs with lists of
pages_to_free, so we no longer need noinline_for_stack. Just remove it
(and let the compiler decide whether to inline).

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Acked-by: Roman Gushchin 
Reviewed-by: Shakeel Butt 
Acked-by: Michal Hocko 
---
 mm/vmscan.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2bc5cf409958..2d2727b78df9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2014,8 +2014,8 @@ static int too_many_isolated(struct pglist_data *pgdat, 
int file,
  *
  * Returns the number of pages moved to the given lruvec.
  */
-static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
-struct list_head *list)
+static unsigned int move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
 {
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
@@ -2095,7 +2095,7 @@ static int current_may_throttle(void)
  * shrink_inactive_list() is a helper for shrink_node().  It returns the number
  * of reclaimed pages
  */
-static noinline_for_stack unsigned long
+static unsigned long
 shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 struct scan_control *sc, enum lru_list lru)
 {
-- 
2.11.0



[PATCH v2 7/8] mm: memcontrol: move obj_cgroup_uncharge_pages() out of css_set_lock

2021-04-15 Thread Muchun Song
The css_set_lock is used to guard the list of inherited objcgs. So there
is no need to uncharge kernel memory under css_set_lock. Just move it
out of the lock.

Signed-off-by: Muchun Song 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Johannes Weiner 
---
 mm/memcontrol.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c4eebe2a2914..e0c398fe7443 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -289,9 +289,10 @@ static void obj_cgroup_release(struct percpu_ref *ref)
WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
nr_pages = nr_bytes >> PAGE_SHIFT;
 
-   spin_lock_irqsave(&css_set_lock, flags);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
+
+   spin_lock_irqsave(&css_set_lock, flags);
list_del(&objcg->list);
spin_unlock_irqrestore(&css_set_lock, flags);
 
-- 
2.11.0



[PATCH v2 6/8] mm: memcontrol: simplify the logic of objcg pinning memcg

2021-04-15 Thread Muchun Song
The obj_cgroup_release() and memcg_reparent_objcgs() are serialized by
the css_set_lock. We do not need to care about objcg->memcg being
released in the process of obj_cgroup_release(). So there is no need
to pin memcg before releasing objcg. Remove that pinning logic to
simplify the code.

There are only two places that modify objcg->memcg. One is the
initialization of objcg->memcg in memcg_online_kmem(), the other
is objcg reparenting in memcg_reparent_objcgs(). It is also
impossible for the two to run in parallel. So xchg() is unnecessary
and it is enough to use WRITE_ONCE().

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
---
 mm/memcontrol.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index caf193088beb..c4eebe2a2914 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -261,7 +261,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup 
*objcg,
 static void obj_cgroup_release(struct percpu_ref *ref)
 {
struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
-   struct mem_cgroup *memcg;
unsigned int nr_bytes;
unsigned int nr_pages;
unsigned long flags;
@@ -291,11 +290,9 @@ static void obj_cgroup_release(struct percpu_ref *ref)
nr_pages = nr_bytes >> PAGE_SHIFT;
 
spin_lock_irqsave(&css_set_lock, flags);
-   memcg = obj_cgroup_memcg(objcg);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
list_del(&objcg->list);
-   mem_cgroup_put(memcg);
spin_unlock_irqrestore(&css_set_lock, flags);
 
percpu_ref_exit(ref);
@@ -330,17 +327,12 @@ static void memcg_reparent_objcgs(struct mem_cgroup 
*memcg,
 
spin_lock_irq(&css_set_lock);
 
-   /* Move active objcg to the parent's list */
-   xchg(&objcg->memcg, parent);
-   css_get(&parent->css);
-   list_add(&objcg->list, &parent->objcg_list);
-
-   /* Move already reparented objcgs to the parent's list */
-   list_for_each_entry(iter, &memcg->objcg_list, list) {
-   css_get(&parent->css);
-   xchg(&iter->memcg, parent);
-   css_put(&memcg->css);
-   }
+   /* 1) Ready to reparent active objcg. */
+   list_add(&objcg->list, &memcg->objcg_list);
+   /* 2) Reparent active objcg and already reparented objcgs to parent. */
+   list_for_each_entry(iter, &memcg->objcg_list, list)
+   WRITE_ONCE(iter->memcg, parent);
+   /* 3) Move already reparented objcgs to the parent's list */
list_splice(&memcg->objcg_list, &parent->objcg_list);
 
spin_unlock_irq(&css_set_lock);
-- 
2.11.0



[PATCH v2 5/8] mm: memcontrol: rename lruvec_holds_page_lru_lock to page_matches_lruvec

2021-04-15 Thread Muchun Song
lruvec_holds_page_lru_lock() doesn't check anything about locking and is
used to check whether the page belongs to the lruvec. So rename it to
page_matches_lruvec().

Signed-off-by: Muchun Song 
---
 include/linux/memcontrol.h | 7 +++
 mm/vmscan.c| 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 2fc728492c9b..40b0c31ea4ba 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1492,8 +1492,7 @@ static inline void unlock_page_lruvec_irqrestore(struct 
lruvec *lruvec,
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
+static inline bool page_matches_lruvec(struct page *page, struct lruvec 
*lruvec)
 {
return lruvec_pgdat(lruvec) == page_pgdat(page) &&
   lruvec_memcg(lruvec) == page_memcg(page);
@@ -1504,7 +1503,7 @@ static inline struct lruvec 
*relock_page_lruvec_irq(struct page *page,
struct lruvec *locked_lruvec)
 {
if (locked_lruvec) {
-   if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+   if (page_matches_lruvec(page, locked_lruvec))
return locked_lruvec;
 
unlock_page_lruvec_irq(locked_lruvec);
@@ -1518,7 +1517,7 @@ static inline struct lruvec 
*relock_page_lruvec_irqsave(struct page *page,
struct lruvec *locked_lruvec, unsigned long *flags)
 {
if (locked_lruvec) {
-   if (lruvec_holds_page_lru_lock(page, locked_lruvec))
+   if (page_matches_lruvec(page, locked_lruvec))
return locked_lruvec;
 
unlock_page_lruvec_irqrestore(locked_lruvec, *flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bb8321026c0c..2bc5cf409958 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2062,7 +2062,7 @@ static unsigned noinline_for_stack 
move_pages_to_lru(struct lruvec *lruvec,
 * All pages were isolated from the same lruvec (and isolation
 * inhibits memcg migration).
 */
-   VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
+   VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
-- 
2.11.0



[PATCH v2 4/8] mm: memcontrol: simplify lruvec_holds_page_lru_lock

2021-04-15 Thread Muchun Song
We already have a helper lruvec_memcg() to get the memcg from lruvec, we
do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So use
lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
to simplify the code. We can have a single definition for this function
that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
CONFIG_MEMCG.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 include/linux/memcontrol.h | 31 +++
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f2a5aaba3577..2fc728492c9b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -755,22 +755,6 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct 
page *page)
return mem_cgroup_lruvec(memcg, pgdat);
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
-{
-   pg_data_t *pgdat = page_pgdat(page);
-   const struct mem_cgroup *memcg;
-   struct mem_cgroup_per_node *mz;
-
-   if (mem_cgroup_disabled())
-   return lruvec == &pgdat->__lruvec;
-
-   mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
-   memcg = page_memcg(page) ? : root_mem_cgroup;
-
-   return lruvec->pgdat == pgdat && mz->memcg == memcg;
-}
-
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
 
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
@@ -1227,14 +1211,6 @@ static inline struct lruvec 
*mem_cgroup_page_lruvec(struct page *page)
return >__lruvec;
 }
 
-static inline bool lruvec_holds_page_lru_lock(struct page *page,
- struct lruvec *lruvec)
-{
-   pg_data_t *pgdat = page_pgdat(page);
-
-   return lruvec == &pgdat->__lruvec;
-}
-
 static inline void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
 {
 }
@@ -1516,6 +1492,13 @@ static inline void unlock_page_lruvec_irqrestore(struct 
lruvec *lruvec,
spin_unlock_irqrestore(&lruvec->lru_lock, flags);
 }
 
+static inline bool lruvec_holds_page_lru_lock(struct page *page,
+ struct lruvec *lruvec)
+{
+   return lruvec_pgdat(lruvec) == page_pgdat(page) &&
+  lruvec_memcg(lruvec) == page_memcg(page);
+}
+
 /* Don't lock again iff page's lruvec locked */
 static inline struct lruvec *relock_page_lruvec_irq(struct page *page,
struct lruvec *locked_lruvec)
-- 
2.11.0



[PATCH v2 3/8] mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec

2021-04-15 Thread Muchun Song
All the callers of mem_cgroup_page_lruvec() just pass page_pgdat(page)
as the 2nd parameter to it (except isolate_migratepages_block()). But
for isolate_migratepages_block(), the page_pgdat(page) is also equal
to the local variable @pgdat. So mem_cgroup_page_lruvec() does not
need the pgdat parameter. Just remove it to simplify the code.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 include/linux/memcontrol.h | 10 +-
 mm/compaction.c|  2 +-
 mm/memcontrol.c|  9 +++--
 mm/swap.c  |  2 +-
 mm/workingset.c|  2 +-
 5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index c193be760709..f2a5aaba3577 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -743,13 +743,12 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
mem_cgroup *memcg,
 /**
  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
  * @page: the page
- * @pgdat: pgdat of the page
  *
  * This function relies on page->mem_cgroup being stable.
  */
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-   struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 {
+   pg_data_t *pgdat = page_pgdat(page);
struct mem_cgroup *memcg = page_memcg(page);
 
VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page);
@@ -1221,9 +1220,10 @@ static inline struct lruvec *mem_cgroup_lruvec(struct 
mem_cgroup *memcg,
return &pgdat->__lruvec;
 }
 
-static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page,
-   struct pglist_data *pgdat)
+static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page)
 {
+   pg_data_t *pgdat = page_pgdat(page);
+
return &pgdat->__lruvec;
 }
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c5028bfbd56..1c500e697c88 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -994,7 +994,7 @@ isolate_migratepages_block(struct compact_control *cc, 
unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
 
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 50e3cf1e263e..caf193088beb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1181,9 +1181,8 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct 
page *page)
 struct lruvec *lock_page_lruvec(struct page *page)
 {
struct lruvec *lruvec;
-   struct pglist_data *pgdat = page_pgdat(page);
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
spin_lock(&lruvec->lru_lock);
 
lruvec_memcg_debug(lruvec, page);
@@ -1194,9 +1193,8 @@ struct lruvec *lock_page_lruvec(struct page *page)
 struct lruvec *lock_page_lruvec_irq(struct page *page)
 {
struct lruvec *lruvec;
-   struct pglist_data *pgdat = page_pgdat(page);
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irq(&lruvec->lru_lock);
 
lruvec_memcg_debug(lruvec, page);
@@ -1207,9 +1205,8 @@ struct lruvec *lock_page_lruvec_irq(struct page *page)
 struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long 
*flags)
 {
struct lruvec *lruvec;
-   struct pglist_data *pgdat = page_pgdat(page);
 
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
+   lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
 
lruvec_memcg_debug(lruvec, page);
diff --git a/mm/swap.c b/mm/swap.c
index a75a8265302b..e0d5699213cc 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -313,7 +313,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, 
unsigned int nr_pages)
 
 void lru_note_cost_page(struct page *page)
 {
-   lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+   lru_note_cost(mem_cgroup_page_lruvec(page),
  page_is_file_lru(page), thp_nr_pages(page));
 }
 
diff --git a/mm/workingset.c b/mm/workingset.c
index b7cdeca5a76d..4f7a306ce75a 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -408,7 +408,7 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
-   lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+   lruvec = mem_cgroup_page_lruvec(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
 out:
rcu_read_unlock();
-- 
2.11.0



[PATCH v2 2/8] mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm

2021-04-15 Thread Muchun Song
When mm is NULL, we do not need to hold the rcu lock and call css_tryget
for the root memcg. And we also do not need to check !mm in every
iteration of the loop. So bail out early when !mm.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 mm/memcontrol.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f229de925aa5..50e3cf1e263e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -901,20 +901,23 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct 
mm_struct *mm)
if (mem_cgroup_disabled())
return NULL;
 
+   /*
+* Page cache insertions can happen without an
+* actual mm context, e.g. during disk probing
+* on boot, loopback IO, acct() writes etc.
+*
+* No need to css_get on root memcg as the reference
+* counting is disabled on the root level in the
+* cgroup core. See CSS_NO_REF.
+*/
+   if (unlikely(!mm))
+   return root_mem_cgroup;
+
rcu_read_lock();
do {
-   /*
-* Page cache insertions can happen without an
-* actual mm context, e.g. during disk probing
-* on boot, loopback IO, acct() writes etc.
-*/
-   if (unlikely(!mm))
+   memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+   if (unlikely(!memcg))
memcg = root_mem_cgroup;
-   else {
-   memcg = 
mem_cgroup_from_task(rcu_dereference(mm->owner));
-   if (unlikely(!memcg))
-   memcg = root_mem_cgroup;
-   }
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
-- 
2.11.0



[PATCH v2 0/8] memcontrol code cleanup and simplification

2021-04-15 Thread Muchun Song
This patch series is part of [1] patch series. Because those patches are
code cleanup or simplification. I gather those patches into a separate
series to make it easier to review.

[1] 
https://lore.kernel.org/linux-mm/20210409122959.82264-1-songmuc...@bytedance.com/

Changelogs in v2:
  1. Collect Acked-by and Review-by tags.
  2. Add a new patch to rename lruvec_holds_page_lru_lock to 
page_matches_lruvec.
  3. Add a comment to patch 2.

  Thanks to Roman, Johannes, Shakeel and Michal's review.

Muchun Song (8):
  mm: memcontrol: fix page charging in page replacement
  mm: memcontrol: bail out early when !mm in get_mem_cgroup_from_mm
  mm: memcontrol: remove the pgdata parameter of mem_cgroup_page_lruvec
  mm: memcontrol: simplify lruvec_holds_page_lru_lock
  mm: memcontrol: rename lruvec_holds_page_lru_lock to
page_matches_lruvec
  mm: memcontrol: simplify the logic of objcg pinning memcg
  mm: memcontrol: move obj_cgroup_uncharge_pages() out of css_set_lock
  mm: vmscan: remove noinline_for_stack

 include/linux/memcontrol.h | 42 +-
 mm/compaction.c|  2 +-
 mm/memcontrol.c| 65 +-
 mm/swap.c  |  2 +-
 mm/vmscan.c|  8 +++---
 mm/workingset.c|  2 +-
 6 files changed, 49 insertions(+), 72 deletions(-)

-- 
2.11.0



[PATCH v2 1/8] mm: memcontrol: fix page charging in page replacement

2021-04-15 Thread Muchun Song
The pages aren't accounted at the root level, so do not charge the page
to the root memcg in page replacement. Although we do not display the
value (mem_cgroup_usage) and there should not be any actual problem,
there is a WARN_ON_ONCE in page_counter_cancel() that might trigger.
It is better to fix it.

Signed-off-by: Muchun Song 
Acked-by: Johannes Weiner 
Reviewed-by: Shakeel Butt 
Acked-by: Roman Gushchin 
Acked-by: Michal Hocko 
---
 mm/memcontrol.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 64ada9e650a5..f229de925aa5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6806,9 +6806,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct 
page *newpage)
/* Force-charge the new page. The old one will be freed soon */
nr_pages = thp_nr_pages(newpage);
 
-   page_counter_charge(&memcg->memory, nr_pages);
-   if (do_memsw_account())
-   page_counter_charge(&memcg->memsw, nr_pages);
+   if (!mem_cgroup_is_root(memcg)) {
+   page_counter_charge(&memcg->memory, nr_pages);
+   if (do_memsw_account())
+   page_counter_charge(&memcg->memsw, nr_pages);
+   }
 
css_get(&memcg->css);
commit_charge(newpage, memcg);
-- 
2.11.0



Re: [External] Re: [PATCH 4/7] mm: memcontrol: simplify lruvec_holds_page_lru_lock

2021-04-15 Thread Muchun Song
On Thu, Apr 15, 2021 at 1:49 AM Johannes Weiner  wrote:
>
> On Wed, Apr 14, 2021 at 06:00:42PM +0800, Muchun Song wrote:
> > On Wed, Apr 14, 2021 at 5:44 PM Michal Hocko  wrote:
> > >
> > > On Tue 13-04-21 14:51:50, Muchun Song wrote:
> > > > We already have a helper lruvec_memcg() to get the memcg from lruvec, we
> > > > do not need to do it ourselves in the lruvec_holds_page_lru_lock(). So 
> > > > use
> > > > lruvec_memcg() instead. And if mem_cgroup_disabled() returns false, the
> > > > page_memcg(page) (the LRU pages) cannot be NULL. So remove the odd logic
> > > > of "memcg = page_memcg(page) ? : root_mem_cgroup". And use lruvec_pgdat
> > > > to simplify the code. We can have a single definition for this function
> > > > that works for !CONFIG_MEMCG, CONFIG_MEMCG + mem_cgroup_disabled() and
> > > > CONFIG_MEMCG.
> > >
> > > Neat. While you are at it wouldn't it make sesne to rename the function
> > > as well. I do not want to bikeshed but this is really a misnomer. it
> > > doesn't check anything about locking. page_belongs_lruvec?
> >
> > Right. lruvec_holds_page_lru_lock is used to check whether
> > the page belongs to the lruvec. page_belongs_lruvec
> > is obviously clearer. I can rename it to
> > page_belongs_lruvec in the next version.
>
> This sounds strange to me, I think 'belongs' needs a 'to' to be
> correct, so page_belongs_to_lruvec(). Still kind of a mouthful.
>
> page_matches_lruvec()?
>

I prefer this name. If you also agree, I will use this name.

Thanks.

> page_from_lruvec()?


[PATCH v20 8/9] mm: memory_hotplug: disable memmap_on_memory when hugetlb_free_vmemmap enabled

2021-04-15 Thread Muchun Song
The memory_hotplug.memmap_on_memory parameter is not compatible with
hugetlb_free_vmemmap. So disable it when hugetlb_free_vmemmap is
enabled.

Signed-off-by: Muchun Song 
---
 Documentation/admin-guide/kernel-parameters.txt |  4 
 drivers/acpi/acpi_memhotplug.c  |  1 +
 mm/memory_hotplug.c | 18 +-
 3 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 9e655f5206ac..1f648b3e6120 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2893,6 +2893,10 @@
Note that even when enabled, there are a few cases where
the feature is not effective.
 
+   This is not compatible with hugetlb_free_vmemmap. If
+   both parameters are enabled, hugetlb_free_vmemmap takes
+   precedence over memory_hotplug.memmap_on_memory.
+
memtest=[KNL,X86,ARM,PPC,RISCV] Enable memtest
Format: <integer>
default : 0 <disable>
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 8cc195c4c861..0d7f595ee441 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 68923c19bdea..c45ed6c0cd9f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -981,6 +981,7 @@ static int online_memory_block(struct memory_block *mem, 
void *arg)
 
 bool mhp_supports_memmap_on_memory(unsigned long size)
 {
+   bool supported;
unsigned long nr_vmemmap_pages = size / PAGE_SIZE;
unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page);
unsigned long remaining_size = size - vmemmap_size;
@@ -1011,11 +1012,18 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
 *   altmap as an alternative source of memory, and we do not 
exactly
 *   populate a single PMD.
 */
-   return memmap_on_memory &&
-  IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
-  size == memory_block_size_bytes() &&
-  IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
-  IS_ALIGNED(remaining_size, pageblock_nr_pages << PAGE_SHIFT);
+   supported = memmap_on_memory &&
+   IS_ENABLED(CONFIG_MHP_MEMMAP_ON_MEMORY) &&
+   size == memory_block_size_bytes() &&
+   IS_ALIGNED(vmemmap_size, PMD_SIZE) &&
+   IS_ALIGNED(remaining_size, pageblock_nr_pages << 
PAGE_SHIFT);
+
+   if (supported && is_hugetlb_free_vmemmap_enabled()) {
+   pr_info("Cannot enable memory_hotplug.memmap_on_memory, it is 
not compatible with hugetlb_free_vmemmap\n");
+   supported = false;
+   }
+
+   return supported;
 }
 
 /*
-- 
2.11.0



[PATCH v20 9/9] mm: hugetlb: introduce nr_free_vmemmap_pages in the struct hstate

2021-04-15 Thread Muchun Song
All the infrastructure is ready, so we introduce the nr_free_vmemmap_pages
field in the hstate to indicate how many vmemmap pages associated with
a HugeTLB page can be freed to the buddy allocator. And initialize it
in hugetlb_vmemmap_init(). This patch is the actual enablement of the
feature.

There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
is enabled, so add a BUILD_BUG_ON to catch invalid usage of the tail
struct page.

Signed-off-by: Muchun Song 
Acked-by: Mike Kravetz 
Reviewed-by: Oscar Salvador 
Reviewed-by: Miaohe Lin 
Tested-by: Chen Huang 
Tested-by: Bodeddula Balasubramaniam 
---
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb.c|  1 +
 mm/hugetlb_vmemmap.c| 33 +
 mm/hugetlb_vmemmap.h| 10 ++
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4015cedacf91..710d821fbca6 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -602,6 +602,9 @@ struct hstate {
unsigned int nr_huge_pages_node[MAX_NUMNODES];
unsigned int free_huge_pages_node[MAX_NUMNODES];
unsigned int surplus_huge_pages_node[MAX_NUMNODES];
+#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+   unsigned int nr_free_vmemmap_pages;
+#endif
 #ifdef CONFIG_CGROUP_HUGETLB
/* cgroup control files */
struct cftype cgroup_files_dfl[7];
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1c37f0098e00..8adf52f4c7e4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3361,6 +3361,7 @@ void __init hugetlb_add_hstate(unsigned int order)
h->next_nid_to_free = first_memory_node;
snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
huge_page_size(h)/1024);
+   hugetlb_vmemmap_init(h);
 
parsed_hstate = h;
 }
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 3070e1465b1b..f9f9bb212319 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -262,3 +262,36 @@ void free_huge_page_vmemmap(struct hstate *h, struct page 
*head)
 
SetHPageVmemmapOptimized(head);
 }
+
+void __init hugetlb_vmemmap_init(struct hstate *h)
+{
+   unsigned int nr_pages = pages_per_huge_page(h);
+   unsigned int vmemmap_pages;
+
+   /*
+* There are only (RESERVE_VMEMMAP_SIZE / sizeof(struct page)) struct
+* page structs that can be used when CONFIG_HUGETLB_PAGE_FREE_VMEMMAP,
+* so add a BUILD_BUG_ON to catch invalid usage of the tail struct page.
+*/
+   BUILD_BUG_ON(__NR_USED_SUBPAGE >=
+RESERVE_VMEMMAP_SIZE / sizeof(struct page));
+
+   if (!hugetlb_free_vmemmap_enabled)
+   return;
+
+   vmemmap_pages = (nr_pages * sizeof(struct page)) >> PAGE_SHIFT;
+   /*
+* The head page and the first tail page are not to be freed to buddy
+* allocator, the other pages will map to the first tail page, so they
+* can be freed.
+*
+* Could RESERVE_VMEMMAP_NR be greater than @vmemmap_pages? It is true
+* on some architectures (e.g. aarch64). See Documentation/arm64/
+* hugetlbpage.rst for more details.
+*/
+   if (likely(vmemmap_pages > RESERVE_VMEMMAP_NR))
+   h->nr_free_vmemmap_pages = vmemmap_pages - RESERVE_VMEMMAP_NR;
+
+   pr_info("can free %d vmemmap pages for %s\n", h->nr_free_vmemmap_pages,
+   h->name);
+}
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index a37771b0b82a..cb2bef8f9e73 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -13,17 +13,15 @@
 #ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
 int alloc_huge_page_vmemmap(struct hstate *h, struct page *head);
 void free_huge_page_vmemmap(struct hstate *h, struct page *head);
+void hugetlb_vmemmap_init(struct hstate *h);
 
 /*
  * How many vmemmap pages associated with a HugeTLB page that can be freed
  * to the buddy allocator.
- *
- * Todo: Returns zero for now, which means the feature is disabled. We will
- * enable it once all the infrastructure is there.
  */
 static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 {
-   return 0;
+   return h->nr_free_vmemmap_pages;
 }
 #else
 static inline int alloc_huge_page_vmemmap(struct hstate *h, struct page *head)
@@ -35,6 +33,10 @@ static inline void free_huge_page_vmemmap(struct hstate *h, 
struct page *head)
 {
 }
 
+static inline void hugetlb_vmemmap_init(struct hstate *h)
+{
+}
+
 static inline unsigned int free_vmemmap_pages_per_hpage(struct hstate *h)
 {
return 0;
-- 
2.11.0



[PATCH v20 7/9] mm: hugetlb: add a kernel parameter hugetlb_free_vmemmap

2021-04-15 Thread Muchun Song
Add a kernel parameter hugetlb_free_vmemmap to enable the feature of
freeing unused vmemmap pages associated with each hugetlb page on boot.

We disable PMD mapping of vmemmap pages on x86-64 when this feature is
enabled, because vmemmap_remap_free() depends on vmemmap being base-page
mapped.
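
For context, a minimal sketch of how such a boot parameter is typically
wired up with early_param(); the flag and handler names below are
illustrative and may differ from the actual hunk in mm/hugetlb_vmemmap.c:

/* Illustrative sketch, not the literal code added by this patch. */
bool hugetlb_free_vmemmap_enabled;

static int __init early_hugetlb_free_vmemmap_param(char *buf)
{
        if (!buf)
                return -EINVAL;

        if (!strcmp(buf, "on"))
                hugetlb_free_vmemmap_enabled = true;
        else if (!strcmp(buf, "off"))
                hugetlb_free_vmemmap_enabled = false;
        else
                return -EINVAL;

        return 0;
}
early_param("hugetlb_free_vmemmap", early_hugetlb_free_vmemmap_param);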

Signed-off-by: Muchun Song 
Reviewed-by: Oscar Salvador 
Reviewed-by: Barry Song 
Reviewed-by: Miaohe Lin 
Tested-by: Chen Huang 
Tested-by: Bodeddula Balasubramaniam 
---
 Documentation/admin-guide/kernel-parameters.txt | 17 +
 Documentation/admin-guide/mm/hugetlbpage.rst|  3 +++
 arch/x86/mm/init_64.c   |  8 ++--
 include/linux/hugetlb.h | 19 +++
 mm/hugetlb_vmemmap.c| 24 
 5 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 3bf052d14504..9e655f5206ac 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1611,6 +1611,23 @@
Documentation/admin-guide/mm/hugetlbpage.rst.
Format: size[KMG]
 
+   hugetlb_free_vmemmap=
+   [KNL] Requires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
+   enabled.
+   Allows heavy hugetlb users to free up some more
+   memory (6 * PAGE_SIZE for each 2MB hugetlb page).
+   This feature is not free though. Large page
+   tables are not used to back vmemmap pages which
+   can lead to a performance degradation for some
+   workloads. Also there will be memory allocation
+   required when hugetlb pages are freed from the
+   pool which can lead to corner cases under heavy
+   memory pressure.
+   Format: { on | off (default) }
+
+   on:  enable the feature
+   off: disable the feature
+
hung_task_panic=
[KNL] Should the hung task detector generate panics.
Format: 0 | 1
diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst 
b/Documentation/admin-guide/mm/hugetlbpage.rst
index 6988895d09a8..8abaeb144e44 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -153,6 +153,9 @@ default_hugepagesz
 
will all result in 256 2M huge pages being allocated.  Valid default
huge page size is architecture dependent.
+hugetlb_free_vmemmap
+   When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
+   unused vmemmap pages associated with each HugeTLB page.
 
 When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
 indicates the current number of pre-allocated huge pages of the default size.
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 65ea58527176..9d9d18d0c2a1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1609,7 +1610,8 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node,
VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
 
-   if (end - start < PAGES_PER_SECTION * sizeof(struct page))
+   if ((is_hugetlb_free_vmemmap_enabled()  && !altmap) ||
+   end - start < PAGES_PER_SECTION * sizeof(struct page))
err = vmemmap_populate_basepages(start, end, node, NULL);
else if (boot_cpu_has(X86_FEATURE_PSE))
err = vmemmap_populate_hugepages(start, end, node, altmap);
@@ -1637,6 +1639,8 @@ void register_page_bootmem_memmap(unsigned long 
section_nr,
pmd_t *pmd;
unsigned int nr_pmd_pages;
struct page *page;
+   bool base_mapping = !boot_cpu_has(X86_FEATURE_PSE) ||
+   is_hugetlb_free_vmemmap_enabled();
 
for (; addr < end; addr = next) {
pte_t *pte = NULL;
@@ -1662,7 +1666,7 @@ void register_page_bootmem_memmap(unsigned long 
section_nr,
}
get_page_bootmem(section_nr, pud_page(*pud), MIX_SECTION_INFO);
 
-   if (!boot_cpu_has(X86_FEATURE_PSE)) {
+   if (base_mapping) {
next = (addr + PAGE_SIZE) & PAGE_MASK;
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 6e970a7d3480..4015cedacf91 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -894,6 +894,20 @@ static inline void huge_ptep_modify_prot_commit(struct 
vm_area_struct *vma,

[PATCH v20 6/9] mm: hugetlb: alloc the vmemmap pages associated with each HugeTLB page

2021-04-15 Thread Muchun Song
When we free a HugeTLB page to the buddy allocator, we need to allocate
the vmemmap pages associated with it. However, we may not be able to
allocate the vmemmap pages when the system is under memory pressure. In
this case, we just refuse to free the HugeTLB page. This changes behavior
in some corner cases as listed below:

 1) Failing to free a huge page triggered by the user (decrease nr_pages).

User needs to try again later.

 2) Failing to free a surplus huge page when freed by the application.

Try again later when freeing a huge page next time.

 3) Failing to dissolve a free huge page on ZONE_MOVABLE via
offline_pages().

This can happen when we have plenty of ZONE_MOVABLE memory, but
not enough kernel memory to allocate vmemmap pages.  We may even
be able to migrate huge page contents, but will not be able to
dissolve the source huge page.  This will prevent an offline
operation and is unfortunate as memory offlining is expected to
succeed on movable zones.  Users that depend on memory hotplug
to succeed for movable zones should carefully consider whether the
memory savings gained from this feature are worth the risk of
possibly not being able to offline memory in certain situations.

 4) Failing to dissolve a huge page on CMA/ZONE_MOVABLE via
alloc_contig_range() - once we have that handling in place. Mainly
affects CMA and virtio-mem.

Similar to 3). virtio-mem will handle migration errors gracefully.
CMA might be able to fallback on other free areas within the CMA
region.

Vmemmap pages are allocated from the page freeing context. In order for
those allocations not to be disruptive (e.g. not to trigger the OOM
killer), __GFP_NORETRY is used. hugetlb_lock is dropped for the
allocation because a non-sleeping allocation would be too fragile and
could fail too easily under memory pressure. GFP_ATOMIC or other modes
that access memory reserves are not used because we want to prevent
consuming reserves under heavy hugetlb freeing.
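
As a rough illustration of that allocation policy (the helper below is
illustrative, not one of the functions added by this patch), each
replacement vmemmap page is requested roughly like this:

/* Illustrative only: allocate one replacement vmemmap page without
 * triggering the OOM killer or dipping into memory reserves. */
static struct page *alloc_one_vmemmap_page(int nid)
{
        gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY;

        /* May fail under memory pressure; the caller then refuses to
         * free the HugeTLB page, as described above. */
        return alloc_pages_node(nid, gfp_mask, 0);
}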

Signed-off-by: Muchun Song 
---
 Documentation/admin-guide/mm/hugetlbpage.rst|  8 +++
 Documentation/admin-guide/mm/memory-hotplug.rst | 13 
 include/linux/hugetlb.h |  3 +
 include/linux/mm.h  |  2 +
 mm/hugetlb.c| 85 -
 mm/hugetlb_vmemmap.c| 34 ++
 mm/hugetlb_vmemmap.h|  6 ++
 mm/sparse-vmemmap.c | 75 +-
 8 files changed, 210 insertions(+), 16 deletions(-)

diff --git a/Documentation/admin-guide/mm/hugetlbpage.rst 
b/Documentation/admin-guide/mm/hugetlbpage.rst
index f7b1c7462991..6988895d09a8 100644
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -60,6 +60,10 @@ HugePages_Surp
 the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
 maximum number of surplus huge pages is controlled by
 ``/proc/sys/vm/nr_overcommit_hugepages``.
+   Note: When the feature of freeing unused vmemmap pages associated
+   with each hugetlb page is enabled, the number of surplus huge pages
+   may be temporarily larger than the maximum number of surplus huge
+   pages when the system is under memory pressure.
 Hugepagesize
is the default hugepage size (in Kb).
 Hugetlb
@@ -80,6 +84,10 @@ returned to the huge page pool when freed by a task.  A user 
with root
 privileges can dynamically allocate more or free some persistent huge pages
 by increasing or decreasing the value of ``nr_hugepages``.
 
+Note: When the feature of freeing unused vmemmap pages associated with each
+hugetlb page is enabled, we can fail to free the huge pages triggered by
+the user when the system is under memory pressure.  Please try again later.
+
 Pages that are used as huge pages are reserved inside the kernel and cannot
 be used for other purposes.  Huge pages cannot be swapped out under
 memory pressure.
diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst 
b/Documentation/admin-guide/mm/memory-hotplug.rst
index 05d51d2d8beb..c6bae2d77160 100644
--- a/Documentation/admin-guide/mm/memory-hotplug.rst
+++ b/Documentation/admin-guide/mm/memory-hotplug.rst
@@ -357,6 +357,19 @@ creates ZONE_MOVABLE as following.
Unfortunately, there is no information to show which memory block belongs
to ZONE_MOVABLE. This is TBD.
 
+   Memory offlining can fail when dissolving a free huge page on ZONE_MOVABLE
+   and the feature of freeing unused vmemmap pages associated with each hugetlb
+   page is enabled.
+
+   This can happen when we have plenty of ZONE_MOVABLE memory, but not enough
+   kernel memory to allocate vmemmap pages.  We may even be able to migrate
+   huge page contents, but will not be able to dissolve the source huge page.
+   This will prevent an offline operation and is unfortunate as memory 
offlining

[PATCH v20 5/9] mm: hugetlb: defer freeing of HugeTLB pages

2021-04-15 Thread Muchun Song
In the subsequent patch, we will allocate the vmemmap pages when
freeing a HugeTLB page. But update_and_free_page() can be called
under any context, so we cannot use GFP_KERNEL to allocate vmemmap
pages. However, we can defer the actual freeing to a workqueue to
avoid having to use GFP_ATOMIC for the vmemmap allocation.

__update_and_free_page() is where the call to allocate vmemmap
pages will be inserted.

Signed-off-by: Muchun Song 
---
 mm/hugetlb.c | 73 
 mm/hugetlb_vmemmap.c | 12 -
 mm/hugetlb_vmemmap.h | 17 
 3 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 923d05e2806b..eeb8f5480170 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1376,7 +1376,7 @@ static void remove_hugetlb_page(struct hstate *h, struct 
page *page,
h->nr_huge_pages_node[nid]--;
 }
 
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void __update_and_free_page(struct hstate *h, struct page *page)
 {
int i;
struct page *subpage = page;
@@ -1399,12 +1399,73 @@ static void update_and_free_page(struct hstate *h, 
struct page *page)
}
 }
 
+/*
+ * As update_and_free_page() can be called under any context, so we cannot
+ * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the
+ * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate
+ * the vmemmap pages.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+   struct llist_node *node;
+
+   node = llist_del_all(&hpage_freelist);
+
+   while (node) {
+   struct page *page;
+   struct hstate *h;
+
+   page = container_of((struct address_space **)node,
+struct page, mapping);
+   node = node->next;
+   page->mapping = NULL;
+   h = page_hstate(page);
+
+   __update_and_free_page(h, page);
+
+   cond_resched();
+   }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+   if (free_vmemmap_pages_per_hpage(h))
+   flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+bool atomic)
+{
+   if (!free_vmemmap_pages_per_hpage(h) || !atomic) {
+   __update_and_free_page(h, page);
+   return;
+   }
+
+   /*
+* Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+*
+* Only call schedule_work() if hpage_freelist is previously
+* empty. Otherwise, schedule_work() had been called but the workfn
+* hasn't retrieved the list yet.
+*/
+   if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+   schedule_work(&free_hpage_work);
+}
+
 static void update_and_free_pages_bulk(struct hstate *h, struct list_head 
*list)
 {
struct page *page, *t_page;
 
list_for_each_entry_safe(page, t_page, list, lru) {
-   update_and_free_page(h, page);
+   update_and_free_page(h, page, false);
cond_resched();
}
 }
@@ -1471,12 +1532,12 @@ void free_huge_page(struct page *page)
if (HPageTemporary(page)) {
remove_hugetlb_page(h, page, false);
spin_unlock_irqrestore(&hugetlb_lock, flags);
-   update_and_free_page(h, page);
+   update_and_free_page(h, page, true);
} else if (h->surplus_huge_pages_node[nid]) {
/* remove the page from active list */
remove_hugetlb_page(h, page, true);
spin_unlock_irqrestore(&hugetlb_lock, flags);
-   update_and_free_page(h, page);
+   update_and_free_page(h, page, true);
} else {
arch_clear_hugepage_flags(page);
enqueue_huge_page(h, page);
@@ -1785,7 +1846,7 @@ int dissolve_free_huge_page(struct page *page)
remove_hugetlb_page(h, page, false);
h->max_huge_pages--;
spin_unlock_irq(&hugetlb_lock);
-   update_and_free_page(h, head);
+   update_and_free_page(h, head, false);
return 0;
}
 out:
@@ -2627,6 +2688,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned 
long count, int nid,
 * pages in hstate via the proc/sysfs interfaces.
 */
mutex_lock(&h->resize_lock);
+   flush_free_hpage_work(h);
spin_lock_irq(&hugetlb_lock);
 
/*
@@ -

[PATCH v20 4/9] mm: hugetlb: free the vmemmap pages associated with each HugeTLB page

2021-04-15 Thread Muchun Song
Every HugeTLB page has more than one struct page structure. We __know__
that we only use the first 4 (__NR_USED_SUBPAGE) struct page structures
to store metadata associated with each HugeTLB page.

There are a lot of struct page structures associated with each HugeTLB
page. For the tail pages, the value of compound_head is the same, so we
can reuse the first page of the tail page structures. We map the virtual
addresses of the remaining pages of the tail page structures to the
first tail page struct and then free those page frames. Therefore, we
only need to reserve two pages as vmemmap areas.

When we allocate a HugeTLB page from the buddy allocator, we can free
some of the vmemmap pages associated with it. It is most appropriate to
do this in prep_new_huge_page().

The free_vmemmap_pages_per_hpage(), which indicates how many vmemmap
pages associated with a HugeTLB page can be freed, returns zero for
now, which means the feature is disabled. We will enable it once all
the infrastructure is there.
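
To make the remapping concrete, here is a sketch for a 2 MB HugeTLB page
on x86-64 with 4 KB base pages (8 vmemmap pages per huge page, of which
2 are kept and 6 are freed). The constants are illustrative; the real
call site is free_huge_page_vmemmap() in mm/hugetlb_vmemmap.c:

/* Sketch only; the real code derives these offsets from the hstate. */
static void sketch_free_vmemmap(struct hstate *h, struct page *head)
{
        unsigned long vmemmap_addr = (unsigned long)head;
        unsigned long vmemmap_end, vmemmap_reuse;

        if (!free_vmemmap_pages_per_hpage(h))
                return;

        /* Keep the two pages backing the head and first tail struct pages. */
        vmemmap_addr += 2 * PAGE_SIZE;
        vmemmap_end = vmemmap_addr + 6 * PAGE_SIZE;
        /* Remap the freed range to the page that holds the first tail
         * struct pages, whose contents the freed pages duplicate. */
        vmemmap_reuse = vmemmap_addr - PAGE_SIZE;

        vmemmap_remap_free(vmemmap_addr, vmemmap_end, vmemmap_reuse);
}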

Signed-off-by: Muchun Song 
Reviewed-by: Oscar Salvador 
Tested-by: Chen Huang 
Tested-by: Bodeddula Balasubramaniam 
Acked-by: Michal Hocko 
---
 include/linux/bootmem_info.h |  28 +-
 include/linux/mm.h   |   3 +
 mm/Makefile  |   1 +
 mm/hugetlb.c |   2 +
 mm/hugetlb_vmemmap.c | 218 +++
 mm/hugetlb_vmemmap.h |  20 
 mm/sparse-vmemmap.c  | 194 ++
 7 files changed, 465 insertions(+), 1 deletion(-)
 create mode 100644 mm/hugetlb_vmemmap.c
 create mode 100644 mm/hugetlb_vmemmap.h

diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
index 4ed6dee1adc9..2bc8b1f69c93 100644
--- a/include/linux/bootmem_info.h
+++ b/include/linux/bootmem_info.h
@@ -2,7 +2,7 @@
 #ifndef __LINUX_BOOTMEM_INFO_H
 #define __LINUX_BOOTMEM_INFO_H
 
-#include 
+#include 
 
 /*
  * Types for free bootmem stored in page->lru.next. These have to be in
@@ -22,6 +22,27 @@ void __init register_page_bootmem_info_node(struct 
pglist_data *pgdat);
 void get_page_bootmem(unsigned long info, struct page *page,
  unsigned long type);
 void put_page_bootmem(struct page *page);
+
+/*
+ * Any memory allocated via the memblock allocator and not via the
+ * buddy will be marked reserved already in the memmap. For those
+ * pages, we can call this function to free it to buddy allocator.
+ */
+static inline void free_bootmem_page(struct page *page)
+{
+   unsigned long magic = (unsigned long)page->freelist;
+
+   /*
+* The reserve_bootmem_region sets the reserved flag on bootmem
+* pages.
+*/
+   VM_BUG_ON_PAGE(page_ref_count(page) != 2, page);
+
+   if (magic == SECTION_INFO || magic == MIX_SECTION_INFO)
+   put_page_bootmem(page);
+   else
+   VM_BUG_ON_PAGE(1, page);
+}
 #else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
@@ -35,6 +56,11 @@ static inline void get_page_bootmem(unsigned long info, 
struct page *page,
unsigned long type)
 {
 }
+
+static inline void free_bootmem_page(struct page *page)
+{
+   free_reserved_page(page);
+}
 #endif
 
 #endif /* __LINUX_BOOTMEM_INFO_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 25b9041f9925..a4d160ddb749 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3046,6 +3046,9 @@ static inline void print_vma_addr(char *prefix, unsigned 
long rip)
 }
 #endif
 
+void vmemmap_remap_free(unsigned long start, unsigned long end,
+   unsigned long reuse);
+
 void *sparse_buffer_alloc(unsigned long size);
 struct page * __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap);
diff --git a/mm/Makefile b/mm/Makefile
index d0ccddae7a45..40ee404e200e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -75,6 +75,7 @@ obj-$(CONFIG_FRONTSWAP)   += frontswap.o
 obj-$(CONFIG_ZSWAP)+= zswap.o
 obj-$(CONFIG_HAS_DMA)  += dmapool.o
 obj-$(CONFIG_HUGETLBFS)+= hugetlb.o
+obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)+= hugetlb_vmemmap.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM)+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d81d5947ed..923d05e2806b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include "internal.h"
+#include "hugetlb_vmemmap.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -1485,6 +1486,7 @@ void free_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
+   free_huge_page_vmemmap(h, page);
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
hugetlb_set_page_subpool(page, NULL);
diff --git a/mm/hugetlb_vmemmap

[PATCH v20 3/9] mm: hugetlb: gather discrete indexes of tail page

2021-04-15 Thread Muchun Song
A HugeTLB page has more metadata to save in its struct pages than the
head struct page alone can hold, so we have to abuse other tail struct
pages to store that metadata. In order to avoid conflicts caused by
subsequent use of more tail struct pages, gather these discrete tail
struct page indexes in one place. This makes it easier to add a new
tail page index later.
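
A hypothetical illustration of that last point (SUBPAGE_INDEX_FOO and
its accessors are not part of this patch): a future piece of metadata
slots in as one more enum entry before __NR_USED_SUBPAGE, and its
accessors use the symbolic index instead of a magic 'hpage + N' offset:

enum {
        SUBPAGE_INDEX_SUBPOOL = 1,      /* reuse page->private */
#ifdef CONFIG_CGROUP_HUGETLB
        SUBPAGE_INDEX_CGROUP,           /* reuse page->private */
        SUBPAGE_INDEX_CGROUP_RSVD,      /* reuse page->private */
        __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
#endif
        SUBPAGE_INDEX_FOO,              /* hypothetical new metadata */
        __NR_USED_SUBPAGE,
};

static inline unsigned long hugetlb_page_foo(struct page *hpage)
{
        return page_private(hpage + SUBPAGE_INDEX_FOO);
}

static inline void hugetlb_set_page_foo(struct page *hpage, unsigned long foo)
{
        set_page_private(hpage + SUBPAGE_INDEX_FOO, foo);
}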

Signed-off-by: Muchun Song 
Reviewed-by: Oscar Salvador 
Reviewed-by: Miaohe Lin 
Tested-by: Chen Huang 
Tested-by: Bodeddula Balasubramaniam 
Acked-by: Michal Hocko 
---
 include/linux/hugetlb.h| 21 +++--
 include/linux/hugetlb_cgroup.h | 19 +++
 2 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 09f1fd12a6fa..0abed7e766b8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -29,6 +29,23 @@ typedef struct { unsigned long pd; } hugepd_t;
 #include 
 #include 
 
+/*
+ * For HugeTLB page, there are more metadata to save in the struct page. But
+ * the head struct page cannot meet our needs, so we have to abuse other tail
+ * struct page to store the metadata. In order to avoid conflicts caused by
+ * subsequent use of more tail struct pages, we gather these discrete indexes
+ * of tail struct page here.
+ */
+enum {
+   SUBPAGE_INDEX_SUBPOOL = 1,  /* reuse page->private */
+#ifdef CONFIG_CGROUP_HUGETLB
+   SUBPAGE_INDEX_CGROUP,   /* reuse page->private */
+   SUBPAGE_INDEX_CGROUP_RSVD,  /* reuse page->private */
+   __MAX_CGROUP_SUBPAGE_INDEX = SUBPAGE_INDEX_CGROUP_RSVD,
+#endif
+   __NR_USED_SUBPAGE,
+};
+
 struct hugepage_subpool {
spinlock_t lock;
long count;
@@ -626,13 +643,13 @@ extern unsigned int default_hstate_idx;
  */
 static inline struct hugepage_subpool *hugetlb_page_subpool(struct page *hpage)
 {
-   return (struct hugepage_subpool *)(hpage+1)->private;
+   return (void *)page_private(hpage + SUBPAGE_INDEX_SUBPOOL);
 }
 
 static inline void hugetlb_set_page_subpool(struct page *hpage,
struct hugepage_subpool *subpool)
 {
-   set_page_private(hpage+1, (unsigned long)subpool);
+   set_page_private(hpage + SUBPAGE_INDEX_SUBPOOL, (unsigned long)subpool);
 }
 
 static inline struct hstate *hstate_file(struct file *f)
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index 0bff345c4bc6..0b8d1fdda3a1 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -21,15 +21,16 @@ struct hugetlb_cgroup;
 struct resv_map;
 struct file_region;
 
+#ifdef CONFIG_CGROUP_HUGETLB
 /*
  * Minimum page order trackable by hugetlb cgroup.
  * At least 4 pages are necessary for all the tracking information.
- * The second tail page (hpage[2]) is the fault usage cgroup.
- * The third tail page (hpage[3]) is the reservation usage cgroup.
+ * The second tail page (hpage[SUBPAGE_INDEX_CGROUP]) is the fault
+ * usage cgroup. The third tail page (hpage[SUBPAGE_INDEX_CGROUP_RSVD])
+ * is the reservation usage cgroup.
  */
-#define HUGETLB_CGROUP_MIN_ORDER   2
+#define HUGETLB_CGROUP_MIN_ORDER order_base_2(__MAX_CGROUP_SUBPAGE_INDEX + 1)
 
-#ifdef CONFIG_CGROUP_HUGETLB
 enum hugetlb_memory_event {
HUGETLB_MAX,
HUGETLB_NR_MEMORY_EVENTS,
@@ -66,9 +67,9 @@ __hugetlb_cgroup_from_page(struct page *page, bool rsvd)
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return NULL;
if (rsvd)
-   return (struct hugetlb_cgroup *)page[3].private;
+   return (void *)page_private(page + SUBPAGE_INDEX_CGROUP_RSVD);
else
-   return (struct hugetlb_cgroup *)page[2].private;
+   return (void *)page_private(page + SUBPAGE_INDEX_CGROUP);
 }
 
 static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page 
*page)
@@ -90,9 +91,11 @@ static inline int __set_hugetlb_cgroup(struct page *page,
if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
return -1;
if (rsvd)
-   page[3].private = (unsigned long)h_cg;
+   set_page_private(page + SUBPAGE_INDEX_CGROUP_RSVD,
+(unsigned long)h_cg);
else
-   page[2].private = (unsigned long)h_cg;
+   set_page_private(page + SUBPAGE_INDEX_CGROUP,
+(unsigned long)h_cg);
return 0;
 }
 
-- 
2.11.0



[PATCH v20 2/9] mm: hugetlb: introduce a new config HUGETLB_PAGE_FREE_VMEMMAP

2021-04-15 Thread Muchun Song
The option HUGETLB_PAGE_FREE_VMEMMAP allows for the freeing of
some vmemmap pages associated with pre-allocated HugeTLB pages.
For example, on x86_64, 6 vmemmap pages of 4 KB each can be saved for
each 2 MB HugeTLB page, and 4094 vmemmap pages of 4 KB each can be
saved for each 1 GB HugeTLB page.
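
Those numbers follow from simple arithmetic, assuming 4 KB base pages,
a 64-byte struct page and two vmemmap pages always kept per HugeTLB
page:

    2 MB page:    512 struct pages * 64 B = 32 KB  =    8 vmemmap pages; 8 - 2 = 6 freed
    1 GB page: 262144 struct pages * 64 B = 16 MB  = 4096 vmemmap pages; 4096 - 2 = 4094 freed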

When a HugeTLB page is allocated or freed, the vmemmap array
representing the range associated with the page will need to be
remapped. When a page is allocated, vmemmap pages are freed
after remapping. When a page is freed, previously discarded
vmemmap pages must be allocated before remapping.

The config option is introduced early so that supporting code
can be written to depend on the option. The initial version of
the code only provides support for x86-64.

If CONFIG_HAVE_BOOTMEM_INFO_NODE is enabled, the vmemmap freeing code
depends on it to free the vmemmap pages. Otherwise, it just uses
free_reserved_page() to free the vmemmap pages. The routine
Therefore, make sure register_page_bootmem_info is enabled if
HUGETLB_PAGE_FREE_VMEMMAP is defined.

Signed-off-by: Muchun Song 
Reviewed-by: Oscar Salvador 
Acked-by: Mike Kravetz 
Reviewed-by: Miaohe Lin 
Tested-by: Chen Huang 
Tested-by: Bodeddula Balasubramaniam 
Reviewed-by: Balbir Singh 
---
 arch/x86/mm/init_64.c | 2 +-
 fs/Kconfig| 5 +
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3aaf1d30c777..65ea58527176 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1270,7 +1270,7 @@ static struct kcore_list kcore_vsyscall;
 
 static void __init register_page_bootmem_info(void)
 {
-#ifdef CONFIG_NUMA
+#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)
int i;
 
for_each_online_node(i)
diff --git a/fs/Kconfig b/fs/Kconfig
index dcd9161fbeba..6ce6fdac00a3 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -240,6 +240,11 @@ config HUGETLBFS
 config HUGETLB_PAGE
def_bool HUGETLBFS
 
+config HUGETLB_PAGE_FREE_VMEMMAP
+   def_bool HUGETLB_PAGE
+   depends on X86_64
+   depends on SPARSEMEM_VMEMMAP
+
 config MEMFD_CREATE
def_bool TMPFS || HUGETLBFS
 
-- 
2.11.0



[PATCH v20 1/9] mm: memory_hotplug: factor out bootmem core functions to bootmem_info.c

2021-04-15 Thread Muchun Song
Move the common bootmem info registration API into its own file,
bootmem_info.c. A later patch will use {get,put}_page_bootmem() to
initialize the struct pages for vmemmap pages or to free the vmemmap
pages back to the buddy allocator, so move these helpers out of
CONFIG_MEMORY_HOTPLUG_SPARSE. This is just code movement without any
functional change.

Signed-off-by: Muchun Song 
Acked-by: Mike Kravetz 
Reviewed-by: Oscar Salvador 
Reviewed-by: David Hildenbrand 
Reviewed-by: Miaohe Lin 
Tested-by: Chen Huang 
Tested-by: Bodeddula Balasubramaniam 
---
 arch/sparc/mm/init_64.c|   1 +
 arch/x86/mm/init_64.c  |   3 +-
 include/linux/bootmem_info.h   |  40 +
 include/linux/memory_hotplug.h |  27 -
 mm/Makefile|   1 +
 mm/bootmem_info.c  | 127 +
 mm/memory_hotplug.c| 116 -
 mm/sparse.c|   1 +
 8 files changed, 172 insertions(+), 144 deletions(-)
 create mode 100644 include/linux/bootmem_info.h
 create mode 100644 mm/bootmem_info.c

diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c
index e454f179cf5d..ac9d8b161e0c 100644
--- a/arch/sparc/mm/init_64.c
+++ b/arch/sparc/mm/init_64.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index e527d829e1ed..3aaf1d30c777 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1623,7 +1624,7 @@ int __meminit vmemmap_populate(unsigned long start, 
unsigned long end, int node,
return err;
 }
 
-#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && 
defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
 void register_page_bootmem_memmap(unsigned long section_nr,
  struct page *start_page, unsigned long 
nr_pages)
 {
diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h
new file mode 100644
index ..4ed6dee1adc9
--- /dev/null
+++ b/include/linux/bootmem_info.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_BOOTMEM_INFO_H
+#define __LINUX_BOOTMEM_INFO_H
+
+#include 
+
+/*
+ * Types for free bootmem stored in page->lru.next. These have to be in
+ * some random range in unsigned long space for debugging purposes.
+ */
+enum {
+   MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
+   SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
+   MIX_SECTION_INFO,
+   NODE_INFO,
+   MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
+};
+
+#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
+
+void get_page_bootmem(unsigned long info, struct page *page,
+ unsigned long type);
+void put_page_bootmem(struct page *page);
+#else
+static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
+{
+}
+
+static inline void put_page_bootmem(struct page *page)
+{
+}
+
+static inline void get_page_bootmem(unsigned long info, struct page *page,
+   unsigned long type)
+{
+}
+#endif
+
+#endif /* __LINUX_BOOTMEM_INFO_H */
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a85d4b7d15c2..ff1449f645f1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -18,18 +18,6 @@ struct vmem_altmap;
 #ifdef CONFIG_MEMORY_HOTPLUG
 struct page *pfn_to_online_page(unsigned long pfn);
 
-/*
- * Types for free bootmem stored in page->lru.next. These have to be in
- * some random range in unsigned long space for debugging purposes.
- */
-enum {
-   MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12,
-   SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE,
-   MIX_SECTION_INFO,
-   NODE_INFO,
-   MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO,
-};
-
 /* Types for control the zone type of onlined and offlined memory */
 enum {
/* Offline the memory. */
@@ -220,17 +208,6 @@ static inline void arch_refresh_nodedata(int nid, 
pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
-#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
-#else
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-#endif
-extern void put_page_bootmem(struct page *page);
-extern void get_page_bootmem(unsigned long ingo, struct page *page,
-unsigned long type);
-
 void get_online_mems(void);
 void put_online_mems(void);
 
@@ -258,10 +235,6 @@ static inline void zone_span_writelock(struct zone *zone) 
{}
 static inline void zone_span_writeunlock(struct zone *zone) {}
 static inline void zone_seqlock_init(struct zone *zone) {}
 
-static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
-{
-}
-
 static inline int t
