Re: [linus:master] [mm] d99e3140a4: BUG:KCSAN:data-race_in_folio_remove_rmap_ptes/print_report

2024-05-28 Thread Miaohe Lin
On 2024/5/28 15:43, David Hildenbrand wrote:
> Am 28.05.24 um 09:11 schrieb kernel test robot:
>>
>>
>> Hello,
>>
>> kernel test robot noticed 
>> "BUG:KCSAN:data-race_in_folio_remove_rmap_ptes/print_report" on:
>>
>> commit: d99e3140a4d33e26066183ff727d8f02f56bec64 ("mm: turn 
>> folio_test_hugetlb into a PageType")
>> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
>>
>> [test failed on linus/master  c760b3725e52403dc1b28644fb09c47a83cacea6]
>> [test failed on linux-next/master 3689b0ef08b70e4e03b82ebd37730a03a672853a]
>>
>> in testcase: trinity
>> version: trinity-i386-abe9de86-1_20230429
>> with following parameters:
>>
>> runtime: 300s
>> group: group-04
>> nr_groups: 5
>>
>>
>>
>> compiler: gcc-13
>> test machine: qemu-system-x86_64 -enable-kvm -cpu SandyBridge -smp 2 -m 16G
>>
>> (please refer to attached dmesg/kmsg for entire log/backtrace)
>>
>>
>> We noticed this issue does not always happen, and there are different
>> random KCSAN issues for both this commit and its parent. But the four
>> reports below happen only on this commit, at a non-trivial rate, and stay
>> clean on the parent.
>>
> 
> Likely that's just a page_type check racing against concurrent
> mapcount changes.
> 
> In __folio_rmap_sanity_checks() we check
> VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);
> 
> That check is there to make sure we don't get hugetlb folios in the wrong
> rmap code path. It can easily race with concurrent mapcount changes, just
> like any other page_type check that ends up in folio_test_type()/
> page_has_type(), e.g. from PFN walkers.
> 
> Load tearing in these functions shouldn't really result in false positives
> (what we care about), but READ_ONCE shouldn't hurt or make a difference.
> 
> 
> From b03dc9bf27571442d886d8da624a4e4f737433f2 Mon Sep 17 00:00:00 2001
> From: David Hildenbrand 
> Date: Tue, 28 May 2024 09:37:20 +0200
> Subject: [PATCH] mm: read page_type using READ_ONCE
> 
> KCSAN complains about possible data races: while we check for a
> page_type -- for example for sanity checks -- we might concurrently
> modify the mapcount that overlays page_type.
> 
> Let's use READ_ONCE to avoid load tearing (shouldn't make a difference)
> and to make KCSAN happy.
> 
> Note: nothing should really be broken besides wrong KCSAN complaints.
> 
> Reported-by: kernel test robot 
> Closes: https://lore.kernel.org/oe-lkp/202405281431.c46a3be9-...@intel.com
> Signed-off-by: David Hildenbrand 

LGTM. Thanks for fixing.

Reviewed-by: Miaohe Lin 
Thanks.
.
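For readers outside the mm code, here is a minimal stand-alone sketch of the idea in David's patch (whose diff body is not carried in this archive): mark the racy read of the type/mapcount word with READ_ONCE() so checkers like KCSAN treat it as intentional. Everything below is illustrative userspace code modelled loosely on the kernel helpers, not the mm implementation; build with -pthread.

/*
 * Illustrative userspace model of the race KCSAN complains about: one
 * thread "maps" the page (modifying the word that overlays page_type)
 * while another thread only checks the type.  The check tolerates a
 * racy value; READ_ONCE() simply marks the access as intentional.
 */
#include <pthread.h>
#include <stdio.h>

#define READ_ONCE(x)	(*(const volatile typeof(x) *)&(x))

#define PAGE_TYPE_BASE	0xf0000000u	/* stand-in value, not the kernel's */

static unsigned int page_type = PAGE_TYPE_BASE;	/* overlays "mapcount" */

static int page_has_type(void)
{
	/* one marked load instead of a plain (data-racy) load */
	return (int)READ_ONCE(page_type) < 0;
}

static void *mapper(void *arg)
{
	(void)arg;
	for (int i = 0; i < 100000; i++)	/* concurrent mapcount updates */
		__atomic_fetch_add(&page_type, 1, __ATOMIC_RELAXED);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, mapper, NULL);
	for (int i = 0; i < 100000; i++)
		(void)page_has_type();		/* sanity-check style read */
	pthread_join(t, NULL);
	printf("final word: %#x\n", page_type);
	return 0;
}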




Re: [PATCH] memory tier: rename destroy_memory_type() to put_memory_type()

2023-07-06 Thread Miaohe Lin
On 2023/7/6 19:58, Xiao Yang wrote:
> On 2023/7/6 14:39, Miaohe Lin wrote:
>> It appears that destroy_memory_type() isn't a very good name because
>> we usually do not actually free the memory_type here; we only drop a
>> reference to it. So rename it to a more appropriate name, i.e.
>> put_memory_type().
>>
>> Suggested-by: Huang, Ying 
>> Signed-off-by: Miaohe Lin 
>> ---
>>   drivers/dax/kmem.c   | 4 ++--
>>   include/linux/memory-tiers.h | 4 ++--
>>   mm/memory-tiers.c    | 6 +++---
>>   3 files changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
>> index 898ca9505754..c57acb73e3db 100644
>> --- a/drivers/dax/kmem.c
>> +++ b/drivers/dax/kmem.c
>> @@ -264,7 +264,7 @@ static int __init dax_kmem_init(void)
>>   return rc;
>>     error_dax_driver:
>> -    destroy_memory_type(dax_slowmem_type);
>> +    put_memory_type(dax_slowmem_type);
>>   err_dax_slowmem_type:
>>   kfree_const(kmem_name);
>>   return rc;
>> @@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void)
>>   dax_driver_unregister(&device_dax_kmem_driver);
>>   if (!any_hotremove_failed)
>>   kfree_const(kmem_name);
>> -    destroy_memory_type(dax_slowmem_type);
>> +    put_memory_type(dax_slowmem_type);
>>   }
>>     MODULE_AUTHOR("Intel Corporation");
>> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
>> index fc9647b1b4f9..437441cdf78f 100644
>> --- a/include/linux/memory-tiers.h
>> +++ b/include/linux/memory-tiers.h
>> @@ -33,7 +33,7 @@ struct memory_dev_type {
>>   #ifdef CONFIG_NUMA
>>   extern bool numa_demotion_enabled;
>>   struct memory_dev_type *alloc_memory_type(int adistance);
>> -void destroy_memory_type(struct memory_dev_type *memtype);
>> +void put_memory_type(struct memory_dev_type *memtype);
>>   void init_node_memory_type(int node, struct memory_dev_type *default_type);
>>   void clear_node_memory_type(int node, struct memory_dev_type *memtype);
>>   #ifdef CONFIG_MIGRATION
>> @@ -68,7 +68,7 @@ static inline struct memory_dev_type 
>> *alloc_memory_type(int adistance)
>>   return NULL;
>>   }
>>   -static inline void destroy_memory_type(struct memory_dev_type *memtype)
>> +static inline void put_memory_type(struct memory_dev_type *memtype)
>>   {
>>     }
>> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
>> index 1719fa3bcf02..c49ab03f49b1 100644
>> --- a/mm/memory-tiers.c
>> +++ b/mm/memory-tiers.c
>> @@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int 
>> adistance)
>>   }
>>   EXPORT_SYMBOL_GPL(alloc_memory_type);
>>   -void destroy_memory_type(struct memory_dev_type *memtype)
>> +void put_memory_type(struct memory_dev_type *memtype)
>>   {
>>   kref_put(&memtype->kref, release_memtype);
>>   }
>> -EXPORT_SYMBOL_GPL(destroy_memory_type);
>> +EXPORT_SYMBOL_GPL(put_memory_type);
>>     void init_node_memory_type(int node, struct memory_dev_type *memtype)
>>   {
>> @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct 
>> memory_dev_type *memtype)
>>    */
>>   if (!node_memory_types[node].map_count) {
>>   node_memory_types[node].memtype = NULL;
>> -    destroy_memory_type(memtype);
>> +    put_memory_type(memtype);
> Hi Miaohe,
> 
> I didn't find that destroy_memory_type(memtype) is called here on mainline 
> kernel. Did I miss something?

It's on linux-next tree:
 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=e5bf0402b80d80b66e9765b2c160b5199a5c7d3b

> 
> Other than that, it looks good to me.
> Reviewed-by: Xiao Yang 

Thanks for your review and comment.




[PATCH] memory tier: rename destroy_memory_type() to put_memory_type()

2023-07-06 Thread Miaohe Lin
It appears that destroy_memory_type() isn't a very good name because
we usually do not actually free the memory_type here; we only drop a
reference to it. So rename it to a more appropriate name, i.e.
put_memory_type().

Suggested-by: Huang, Ying 
Signed-off-by: Miaohe Lin 
---
 drivers/dax/kmem.c   | 4 ++--
 include/linux/memory-tiers.h | 4 ++--
 mm/memory-tiers.c| 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 898ca9505754..c57acb73e3db 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -264,7 +264,7 @@ static int __init dax_kmem_init(void)
return rc;
 
 error_dax_driver:
-   destroy_memory_type(dax_slowmem_type);
+   put_memory_type(dax_slowmem_type);
 err_dax_slowmem_type:
kfree_const(kmem_name);
return rc;
@@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void)
	dax_driver_unregister(&device_dax_kmem_driver);
if (!any_hotremove_failed)
kfree_const(kmem_name);
-   destroy_memory_type(dax_slowmem_type);
+   put_memory_type(dax_slowmem_type);
 }
 
 MODULE_AUTHOR("Intel Corporation");
diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
index fc9647b1b4f9..437441cdf78f 100644
--- a/include/linux/memory-tiers.h
+++ b/include/linux/memory-tiers.h
@@ -33,7 +33,7 @@ struct memory_dev_type {
 #ifdef CONFIG_NUMA
 extern bool numa_demotion_enabled;
 struct memory_dev_type *alloc_memory_type(int adistance);
-void destroy_memory_type(struct memory_dev_type *memtype);
+void put_memory_type(struct memory_dev_type *memtype);
 void init_node_memory_type(int node, struct memory_dev_type *default_type);
 void clear_node_memory_type(int node, struct memory_dev_type *memtype);
 #ifdef CONFIG_MIGRATION
@@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int 
adistance)
return NULL;
 }
 
-static inline void destroy_memory_type(struct memory_dev_type *memtype)
+static inline void put_memory_type(struct memory_dev_type *memtype)
 {
 
 }
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 1719fa3bcf02..c49ab03f49b1 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance)
 }
 EXPORT_SYMBOL_GPL(alloc_memory_type);
 
-void destroy_memory_type(struct memory_dev_type *memtype)
+void put_memory_type(struct memory_dev_type *memtype)
 {
	kref_put(&memtype->kref, release_memtype);
 }
-EXPORT_SYMBOL_GPL(destroy_memory_type);
+EXPORT_SYMBOL_GPL(put_memory_type);
 
 void init_node_memory_type(int node, struct memory_dev_type *memtype)
 {
@@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct 
memory_dev_type *memtype)
 */
if (!node_memory_types[node].map_count) {
node_memory_types[node].memtype = NULL;
-   destroy_memory_type(memtype);
+   put_memory_type(memtype);
}
	mutex_unlock(&memory_tier_lock);
 }
-- 
2.33.0
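To illustrate why "put" is the better verb here: dropping a reference only frees the object when it is the last reference, which is exactly the kref_put() behaviour the kernel helper wraps. The sketch below is a stand-alone userspace analogue with made-up internals, not the memory-tiers code.

/*
 * Stand-alone userspace analogue of the renamed helper: a "put" only
 * drops the caller's reference; the object is freed only when the last
 * reference goes away.  Plain int refcount: single-threaded demo only.
 */
#include <stdio.h>
#include <stdlib.h>

struct memory_dev_type {
	int refcount;		/* struct kref in the kernel */
	int adistance;
};

static struct memory_dev_type *alloc_memory_type(int adistance)
{
	struct memory_dev_type *memtype = malloc(sizeof(*memtype));

	if (memtype) {
		memtype->refcount = 1;
		memtype->adistance = adistance;
	}
	return memtype;
}

static void put_memory_type(struct memory_dev_type *memtype)
{
	/* like kref_put(): release only on the final put */
	if (--memtype->refcount == 0) {
		printf("last reference dropped, freeing\n");
		free(memtype);
	}
}

int main(void)
{
	struct memory_dev_type *memtype = alloc_memory_type(10);

	memtype->refcount++;		/* a second user takes a reference  */
	put_memory_type(memtype);	/* does NOT free, another ref remains */
	put_memory_type(memtype);	/* frees here */
	return 0;
}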




[PATCH v3 0/4] close various race windows for swap

2021-04-20 Thread Miaohe Lin
Hi all,
When I was investigating the swap code, I found some possible race
windows. This series aims to fix all of these races. But using the current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take a really
long time. To reduce the performance overhead on the hot path as
much as possible, it appears we can use percpu_ref to close this
race window (as suggested by Huang, Ying). Patch 1 adds percpu_ref
support for swap, and most of the remaining patches use it to
close various race windows. More details can be found in the respective
changelogs. Thanks!
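For reviewers who want the shape of the scheme before reading the diffs, below is a condensed, illustrative sketch of the percpu_ref lifecycle the series relies on: readers pay only a per-CPU get/put on the hot path, while swapoff pays the full cost of killing the ref and waiting for readers to drain. It uses only the stock percpu_ref/completion APIs; the demo_* names are made up and are not part of the series.

/*
 * Condensed, illustrative lifecycle (not the series itself).
 */
#include <linux/percpu-refcount.h>
#include <linux/completion.h>
#include <linux/gfp.h>

static struct percpu_ref demo_users;
static DECLARE_COMPLETION(demo_drained);

static void demo_ref_release(struct percpu_ref *ref)
{
	complete(&demo_drained);		/* last reference is gone */
}

static int demo_enable(void)			/* swapon-like side */
{
	/* holds one internal reference until percpu_ref_kill() */
	return percpu_ref_init(&demo_users, demo_ref_release, 0, GFP_KERNEL);
}

static bool demo_reader(void)			/* get_swap_device()-like, hot path */
{
	if (!percpu_ref_tryget_live(&demo_users))
		return false;			/* device is (being) swapped off */
	/* ... safe to dereference the swap device here ... */
	percpu_ref_put(&demo_users);
	return true;
}

static void demo_disable(void)			/* swapoff-like, slow path */
{
	percpu_ref_kill(&demo_users);		/* new tryget_live() now fails */
	wait_for_completion(&demo_drained);	/* wait for in-flight readers */
}

A later swapon can make the same reference live again with percpu_ref_resurrect(), which is what enable_swap_info() does in the patches that follow.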

v2->v3:
  some commit log and comment enhance per Huang, Ying
  remove ref_initialized field
  squash PATCH 1-2

v1->v2:
  reorganize the patch-2/5
  various enhance and fixup per Huang, Ying
  Many thanks for the comments of Huang, Ying, Dennis Zhou and Tim Chen.

Miaohe Lin (4):
  mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  swap: fix do_swap_page() race with swapoff
  mm/swap: remove confusing checking for non_swap_entry() in
swap_ra_info()
  mm/shmem: fix shmem_swapin() race with swapoff

 include/linux/swap.h | 14 ++--
 mm/memory.c  |  9 +
 mm/shmem.c   |  6 
 mm/swap_state.c  |  6 
 mm/swapfile.c| 79 +++-
 5 files changed, 76 insertions(+), 38 deletions(-)

-- 
2.19.1



[PATCH v3 2/4] swap: fix do_swap_page() race with swapoff

2021-04-20 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                           CPU 2
-----                                           -----
do_swap_page
  if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
  swap_readpage
    if (data_race(sis->flags & SWP_FS_OPS)) {
                                                swapoff
                                                  p->flags &= ~SWP_VALID;
                                                  ..
                                                  synchronize_rcu();
                                                  ..
                                                  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]

Note that for the pages that are swapped in through swap cache, this isn't
an issue. Because the page is locked, and the swap entry will be marked
with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
unlocked.

Using the current get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take a really long
time. And this race may not be really pernicious because swapoff is usually
done only at system shutdown. To reduce the performance overhead on the
hot path as much as possible, it appears we can use percpu_ref to close
this race window (as suggested by Huang, Ying).

Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
Reported-by: kernel test robot  (auto build test ERROR)
Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h | 9 +
 mm/memory.c  | 9 +
 2 files changed, 18 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index c9e7fea10b83..46d51d058d05 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -527,6 +527,15 @@ static inline struct swap_info_struct 
*swp_swap_info(swp_entry_t entry)
return NULL;
 }
 
+static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+   return NULL;
+}
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+}
+
 #define swap_address_space(entry)  (NULL)
 #define get_nr_swap_pages()0L
 #define total_swap_pages   0L
diff --git a/mm/memory.c b/mm/memory.c
index 27014c3bde9f..7a2fe12cf641 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
+   struct swap_info_struct *si = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
 
+   /* Prevent swapoff from happening to us. */
+   si = get_swap_device(entry);
+   if (unlikely(!si))
+   goto out;
 
delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
@@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+   if (si)
+   put_swap_device(si);
return ret;
 out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock_page(swapcache);
put_page(swapcache);
}
+   if (si)
+   put_swap_device(si);
return ret;
 }
 
-- 
2.19.1



[PATCH v3 1/4] mm/swapfile: use percpu_ref to serialize against concurrent swapoff

2021-04-20 Thread Miaohe Lin
Using the current get/put_swap_device() to guard against concurrent swapoff
for some swap ops, e.g. swap_readpage(), looks terrible because they
might take a really long time. This patch adds percpu_ref support to
serialize against concurrent swapoff. Also remove the SWP_VALID flag,
because it was only used together with the RCU solution.

Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h |  5 +--
 mm/swapfile.c| 79 +++-
 2 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..c9e7fea10b83 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -177,7 +177,6 @@ enum {
SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
-   SWP_VALID   = (1 << 13),/* swap is valid to be operated on? */
/* add others here before... */
SWP_SCANNING= (1 << 14),/* refcount in scan_swap_map */
 };
@@ -240,6 +239,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+   struct percpu_ref users;/* indicate and keep swap device valid. 
*/
unsigned long   flags;  /* SWP_USED etc: see above */
signed shortprio;   /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */
@@ -260,6 +260,7 @@ struct swap_info_struct {
struct block_device *bdev;  /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size;/* seldom referenced */
+   struct completion comp; /* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
atomic_t frontswap_pages;   /* frontswap pages in-use counter */
@@ -511,7 +512,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-   rcu_read_unlock();
+   percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..a640fc84be5b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
	spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+   struct swap_info_struct *si;
+
+   si = container_of(ref, struct swap_info_struct, users);
+   complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
struct swap_cluster_info *ci = si->cluster_info;
@@ -1270,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct 
swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1  CPU2
  *   do_swap_page()
@@ -1309,21 +1312,27 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
-   rcu_read_lock();
-   if (data_race(!(si->flags & SWP_VALID)))
-   goto unlock_out;
+   if (!percpu_ref_tryget_live(&si->users))
+   goto out;
+   /*
+* Guarantee the si->users are checked before accessing other
+* fields of swap_info_struct.
+*
+* Paired with the spin_unlock() after setup_swap_info() in
+* enable_swap_info().
+*/
+   smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
-   goto unlock_out;
+ 
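(The archive truncates the rest of this hunk.) Because the smp_rmb() above and its pairing are easy to misread, here is a simplified, illustrative sketch of the ordering the new comment describes; demo_swap_info is a cut-down stand-in and this is not the actual swapfile.c code.

#include <linux/percpu-refcount.h>
#include <linux/spinlock.h>
#include <asm/barrier.h>

struct demo_swap_info {			/* cut-down stand-in, not the real struct */
	struct percpu_ref users;
	spinlock_t lock;
	unsigned long max;
	unsigned char *swap_map;
};

/* Writer, enable_swap_info()-like: publish the fields, then go live. */
static void demo_enable(struct demo_swap_info *si, unsigned long maxpages,
			unsigned char *swap_map)
{
	spin_lock(&si->lock);
	si->max = maxpages;		/* publish fields ...               */
	si->swap_map = swap_map;	/* ... while holding the lock       */
	spin_unlock(&si->lock);		/* RELEASE: orders the stores above */
	/* legal only because the ref is currently dead (initialized dead or
	 * killed by an earlier disable); afterwards the device looks live  */
	percpu_ref_resurrect(&si->users);
}

/* Reader, get_swap_device()-like: observe "live", then read the fields. */
static struct demo_swap_info *demo_get(struct demo_swap_info *si,
				       unsigned long offset)
{
	if (!percpu_ref_tryget_live(&si->users))
		return NULL;		/* swapoff already killed the ref   */
	smp_rmb();			/* pairs with the writer's release  */
	if (offset < si->max)		/* si->max, si->swap_map now valid  */
		return si;
	percpu_ref_put(&si->users);
	return NULL;
}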

[PATCH v3 4/4] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-20 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                           CPU 2
-----                                           -----
shmem_swapin
  swap_cluster_readahead
    if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
                                                swapoff
                                                  percpu_ref_kill(&si->users)
                                                  synchronize_rcu()
                                                  wait_for_completion
                                                  ..
                                                  si->swap_file = NULL;
    struct inode *inode = si->swap_file->f_mapping->host;[oops!]

Close this race window by using get/put_swap_device() to guard against
concurrent swapoff.

Fixes: 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or 
not")
Signed-off-by: Miaohe Lin 
---
 mm/shmem.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 26c76b13ad23..936ba5595297 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
vm_area_struct *vma)
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
 {
+   struct swap_info_struct *si;
struct vm_area_struct pvma;
struct page *page;
struct vm_fault vmf = {
		.vma = &pvma,
};
 
+   /* Prevent swapoff from happening to us. */
+   si = get_swap_device(swap);
+   if (unlikely(!si))
+   return NULL;
	shmem_pseudo_vma_init(&pvma, info, index);
	page = swap_cluster_readahead(swap, gfp, &vmf);
	shmem_pseudo_vma_destroy(&pvma);
+   put_swap_device(si);
 
return page;
 }
-- 
2.19.1



[PATCH v3 3/4] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()

2021-04-20 Thread Miaohe Lin
The non_swap_entry() check was introduced for VMA-based swap readahead
by commit ec560175c0b6 ("mm, swap: VMA based swap readahead"), and was then
moved into swap_ra_info() by commit eaf649ebc3ac ("mm: swap: clean up swap
readahead"). But this makes the code confusing. The non_swap_entry() check
looks racy: once we have released the pte lock, somebody else might have
faulted in this pte, so we would have to check whether it is a swap pte first
to guard against such a race, or the swap type would be unexpected. But the
swap entry isn't actually used in this function, and we do enough checking
when we really operate on the PTE entries later. So the non_swap_entry()
check is not really needed here and should be removed to avoid confusion.

Signed-off-by: Miaohe Lin 
---
 mm/swap_state.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..df5405384520 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
-   swp_entry_t entry;
unsigned long faddr, pfn, fpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
@@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 
faddr = vmf->address;
orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-   entry = pte_to_swp_entry(*pte);
-   if (unlikely(non_swap_entry(entry))) {
-   pte_unmap(orig_pte);
-   return;
-   }
 
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
-- 
2.19.1
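To make the "enough checking ... later" point in the changelog concrete: the VMA readahead loop re-validates every PTE right before acting on it, along these lines (illustrative fragment only, not the exact mm/swap_state.c loop):

	/*
	 * Illustrative fragment: each PTE the readahead loop reads is
	 * re-validated right before use, so an entry that raced with us
	 * and is no longer a swap entry is simply skipped.
	 */
	for (pte = first_pte; pte < first_pte + nr_pte; pte++) {
		pte_t pentry = *pte;

		if (!is_swap_pte(pentry))
			continue;	/* empty or already faulted back in */
		entry = pte_to_swp_entry(pentry);
		if (unlikely(non_swap_entry(entry)))
			continue;	/* migration/device/HWPoison entry  */
		/* ... start readahead I/O for this swap entry ... */
	}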



Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:52, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 15:09, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> On 2021/4/19 10:48, Huang, Ying wrote:
>>>>> Miaohe Lin  writes:
>>>>>
>>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>>> patch adds the percpu_ref support for swap.
>>>>>>
>>>>>> Signed-off-by: Miaohe Lin 
>>>>>> ---
>>>>>>  include/linux/swap.h |  3 +++
>>>>>>  mm/swapfile.c| 33 +
>>>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>>>
>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>> index 144727041e78..8be36eb58b7a 100644
>>>>>> --- a/include/linux/swap.h
>>>>>> +++ b/include/linux/swap.h
>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>   */
>>>>>>  struct swap_info_struct {
>>>>>> +struct percpu_ref users;/* serialization against 
>>>>>> concurrent swapoff */
>>>>>
>>>>> The comments aren't general enough.  We use this to check whether the
>>>>> swap device has been fully initialized, etc. May be something as below?
>>>>>
>>>>> /* indicate and keep swap device valid */
>>>>
>>>> Looks good.
>>>>
>>>>>
>>>>>>  unsigned long   flags;  /* SWP_USED etc: see above */
>>>>>>  signed shortprio;   /* swap priority of this type */
>>>>>>  struct plist_node list; /* entry in swap_active_head */
>>>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>>>  struct block_device *bdev;  /* swap device or bdev of swap 
>>>>>> file */
>>>>>>  struct file *swap_file; /* seldom referenced */
>>>>>>  unsigned int old_block_size;/* seldom referenced */
>>>>>> +bool ref_initialized;   /* seldom referenced */
>>>>>> +struct completion comp; /* seldom referenced */
>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>  unsigned long *frontswap_map;   /* frontswap in-use, one bit 
>>>>>> per page */
>>>>>>  atomic_t frontswap_pages;   /* frontswap pages in-use 
>>>>>> counter */
>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>> index 149e77454e3c..66515a3a2824 100644
>>>>>> --- a/mm/swapfile.c
>>>>>> +++ b/mm/swapfile.c
>>>>>> @@ -39,6 +39,7 @@
>>>>>>  #include 
>>>>>>  #include 
>>>>>>  #include 
>>>>>> +#include 
>>>>>>  
>>>>>>  #include 
>>>>>>  #include 
>>>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct 
>>>>>> *work)
>>>>>>  spin_unlock(>lock);
>>>>>>  }
>>>>>>  
>>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>>> +{
>>>>>> +struct swap_info_struct *si;
>>>>>> +
>>>>>> +si = container_of(ref, struct swap_info_struct, users);
>>>>>> +complete(>comp);
>>>>>> +}
>>>>>> +
>>>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long 
>>>>>> idx)
>>>>>>  {
>>>>>>  struct swap_cluster_info *ci = si->cluster_info;
>>>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct 
>>>>>> swap_info_struct *p, int prio,
>>>>>>   * Guarantee swap_map, cluster_info, etc. fields are valid
>>>>>>   * between get/put_swap_device() if SWP_VALID bit is set
>>>>>>   */
>>>>>> -synchronize_rcu();
>>>>>
>>>>> You cannot remove this without changing get/put_swap_device().  It's
>>>>> better to squash at least PATCH 1-2.
>>>>
>>>> Will squash PATCH 1-2. Tha

Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:41, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 15:04, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> On 2021/4/19 10:15, Huang, Ying wrote:
>>>>> Miaohe Lin  writes:
>>>>>
>>>>>> When I was investigating the swap code, I found the below possible race
>>>>>> window:
>>>>>>
>>>>>> CPU 1   CPU 2
>>>>>> -   -
>>>>>> shmem_swapin
>>>>>>   swap_cluster_readahead
>>>>>> if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>>>> swapoff
>>>>>>   si->flags &= 
>>>>>> ~SWP_VALID;
>>>>>>   ..
>>>>>>   synchronize_rcu();
>>>>>>   ..
>>>>>
>>>>> You have removed these code in the previous patches of the series.  And
>>>>> they are not relevant in this patch.
>>>>
>>>> Yes, I should change these. Thanks.
>>>>
>>>>>
>>>>>>   si->swap_file = NULL;
>>>>>> struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>>>
>>>>>> Close this race window by using get/put_swap_device() to guard against
>>>>>> concurrent swapoff.
>>>>>>
>>>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>>>
>>>>> No.  This isn't the commit that introduces the race condition.  Please
>>>>> recheck your git blame result.
>>>>>
>>>>
>>>> I think this is really hard to find exact commit. I used git blame and 
>>>> found
>>>> this race should be existed when this is introduced. Any suggestion ?
>>>> Thanks.
>>>
>>> I think the commit that introduces the race condition is commit
>>> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
>>> not")
>>>
>>
>> Thanks.
>> The commit log only describes one race condition. And for that one, this 
>> should be correct
>> Fixes tag. But there are still many other race conditions inside 
>> swap_cluster_readahead,
>> such as swap_readpage() called from swap_cluster_readahead. This tag could 
>> not cover the
>> all race windows.
> 
> No. swap_readpage() in swap_cluster_readahead() is OK.  Because
> __read_swap_cache_async() is called before that, so the swap entry will
> be marked with SWAP_HAS_CACHE, and page will be locked.
> 

Oh... I missed this. Many thanks for the reminder.
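For readers following the thread, the path Ying refers to looks roughly like this inside swap_cluster_readahead() (condensed fragment with added annotations; see mm/swap_state.c for the real loop):

	/* Condensed from the readahead loop discussed above. */
	page = __read_swap_cache_async(entry, gfp_mask, vma, addr,
				       &page_allocated);
	if (!page)
		continue;
	/*
	 * If we just added the page to the swap cache, the entry now has
	 * SWAP_HAS_CACHE set and the page is locked, so swapoff() cannot
	 * release the device while swap_readpage() is still running.
	 */
	if (page_allocated)
		swap_readpage(page, false);
	put_page(page);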

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>>> Best Regards,
>>>>> Huang, Ying
>>>>>
>>>>>> Signed-off-by: Miaohe Lin 
>>>>>> ---
>>>>>>  mm/shmem.c | 6 ++
>>>>>>  1 file changed, 6 insertions(+)
>>>>>>
>>>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>>>> index 26c76b13ad23..936ba5595297 100644
>>>>>> --- a/mm/shmem.c
>>>>>> +++ b/mm/shmem.c
>>>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
>>>>>> vm_area_struct *vma)
>>>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>>>  struct shmem_inode_info *info, pgoff_t index)
>>>>>>  {
>>>>>> +struct swap_info_struct *si;
>>>>>>  struct vm_area_struct pvma;
>>>>>>  struct page *page;
>>>>>>  struct vm_fault vmf = {
>>>>>>  .vma = ,
>>>>>>  };
>>>>>>  
>>>>>> +/* Prevent swapoff from happening to us. */
>>>>>> +si = get_swap_device(swap);
>>>>>> +if (unlikely(!si))
>>>>>> +return NULL;
>>>>>>  shmem_pseudo_vma_init(, info, index);
>>>>>>  page = swap_cluster_readahead(swap, gfp, );
>>>>>>  shmem_pseudo_vma_destroy();
>>>>>> +put_swap_device(si);
>>>>>>  
>>>>>>  return page;
>>>>>>  }
>>>>> .
>>>>>
>>> .
>>>
> .
> 



Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:09, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 10:48, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>> patch adds the percpu_ref support for swap.
>>>>
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  include/linux/swap.h |  3 +++
>>>>  mm/swapfile.c| 33 +
>>>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 144727041e78..8be36eb58b7a 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>   * The in-memory structure used to track swap areas.
>>>>   */
>>>>  struct swap_info_struct {
>>>> +  struct percpu_ref users;/* serialization against concurrent 
>>>> swapoff */
>>>
>>> The comments aren't general enough.  We use this to check whether the
>>> swap device has been fully initialized, etc. May be something as below?
>>>
>>> /* indicate and keep swap device valid */
>>
>> Looks good.
>>
>>>
>>>>unsigned long   flags;  /* SWP_USED etc: see above */
>>>>signed shortprio;   /* swap priority of this type */
>>>>struct plist_node list; /* entry in swap_active_head */
>>>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>>>struct block_device *bdev;  /* swap device or bdev of swap file */
>>>>struct file *swap_file; /* seldom referenced */
>>>>unsigned int old_block_size;/* seldom referenced */
>>>> +  bool ref_initialized;   /* seldom referenced */
>>>> +  struct completion comp; /* seldom referenced */
>>>>  #ifdef CONFIG_FRONTSWAP
>>>>unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
>>>>atomic_t frontswap_pages;   /* frontswap pages in-use counter */
>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>> index 149e77454e3c..66515a3a2824 100644
>>>> --- a/mm/swapfile.c
>>>> +++ b/mm/swapfile.c
>>>> @@ -39,6 +39,7 @@
>>>>  #include 
>>>>  #include 
>>>>  #include 
>>>> +#include 
>>>>  
>>>>  #include 
>>>>  #include 
>>>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct 
>>>> *work)
>>>>spin_unlock(>lock);
>>>>  }
>>>>  
>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>> +{
>>>> +  struct swap_info_struct *si;
>>>> +
>>>> +  si = container_of(ref, struct swap_info_struct, users);
>>>> +  complete(>comp);
>>>> +}
>>>> +
>>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>>  {
>>>>struct swap_cluster_info *ci = si->cluster_info;
>>>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct 
>>>> *p, int prio,
>>>> * Guarantee swap_map, cluster_info, etc. fields are valid
>>>> * between get/put_swap_device() if SWP_VALID bit is set
>>>> */
>>>> -  synchronize_rcu();
>>>
>>> You cannot remove this without changing get/put_swap_device().  It's
>>> better to squash at least PATCH 1-2.
>>
>> Will squash PATCH 1-2. Thanks.
>>
>>>
>>>> +  percpu_ref_resurrect(>users);
>>>>spin_lock(_lock);
>>>>spin_lock(>lock);
>>>>_enable_swap_info(p);
>>>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
>>>> specialfile)
>>>>p->flags &= ~SWP_VALID; /* mark swap device as invalid */
>>>>spin_unlock(>lock);
>>>>spin_unlock(_lock);
>>>> +
>>>> +  percpu_ref_kill(>users);
>>>>/*
>>>> -   * wait for swap operations protected by get/put_swap_device()
>>>> -   * to complete
>>>> +   * We need synchronize_rcu() here to protect the accessing
>>>> +   * to the swap cache data structure.
>>>> */
>>>>synchronize_rcu();
>>>> +  /*
>>>> +   * Wait for swap op

Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 15:04, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/19 10:15, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1   CPU 2
>>>> -   -
>>>> shmem_swapin
>>>>   swap_cluster_readahead
>>>> if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>>>> swapoff
>>>>   si->flags &= ~SWP_VALID;
>>>>   ..
>>>>   synchronize_rcu();
>>>>   ..
>>>
>>> You have removed these code in the previous patches of the series.  And
>>> they are not relevant in this patch.
>>
>> Yes, I should change these. Thanks.
>>
>>>
>>>>   si->swap_file = NULL;
>>>> struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>>>
>>>> Close this race window by using get/put_swap_device() to guard against
>>>> concurrent swapoff.
>>>>
>>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>>
>>> No.  This isn't the commit that introduces the race condition.  Please
>>> recheck your git blame result.
>>>
>>
>> I think this is really hard to find exact commit. I used git blame and found
>> this race should be existed when this is introduced. Any suggestion ?
>> Thanks.
> 
> I think the commit that introduces the race condition is commit
> 8fd2e0b505d1 ("mm: swap: check if swap backing device is congested or
> not")
> 

Thanks.
The commit log only describes one race condition, and for that one this should
be the correct Fixes tag. But there are still many other race conditions inside
swap_cluster_readahead(), such as the swap_readpage() called from
swap_cluster_readahead(). This tag cannot cover all of those race windows.

> Best Regards,
> Huang, Ying
> 
>>> Best Regards,
>>> Huang, Ying
>>>
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  mm/shmem.c | 6 ++
>>>>  1 file changed, 6 insertions(+)
>>>>
>>>> diff --git a/mm/shmem.c b/mm/shmem.c
>>>> index 26c76b13ad23..936ba5595297 100644
>>>> --- a/mm/shmem.c
>>>> +++ b/mm/shmem.c
>>>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
>>>> vm_area_struct *vma)
>>>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>>>struct shmem_inode_info *info, pgoff_t index)
>>>>  {
>>>> +  struct swap_info_struct *si;
>>>>struct vm_area_struct pvma;
>>>>struct page *page;
>>>>struct vm_fault vmf = {
>>>>.vma = ,
>>>>};
>>>>  
>>>> +  /* Prevent swapoff from happening to us. */
>>>> +  si = get_swap_device(swap);
>>>> +  if (unlikely(!si))
>>>> +  return NULL;
>>>>shmem_pseudo_vma_init(, info, index);
>>>>page = swap_cluster_readahead(swap, gfp, );
>>>>shmem_pseudo_vma_destroy();
>>>> +  put_swap_device(si);
>>>>  
>>>>return page;
>>>>  }
>>> .
>>>
> .
> 



Re: [PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 10:54, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> Use percpu_ref to serialize against concurrent swapoff. Also remove the
>> SWP_VALID flag because it's used together with RCU solution.
>>
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h |  3 +--
>>  mm/swapfile.c| 43 +--
>>  2 files changed, 18 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 8be36eb58b7a..993693b38109 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -177,7 +177,6 @@ enum {
>>  SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
>>  SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
>>  SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
>> -SWP_VALID   = (1 << 13),/* swap is valid to be operated on? */
>>  /* add others here before... */
>>  SWP_SCANNING= (1 << 14),/* refcount in scan_swap_map */
>>  };
>> @@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>  
>>  static inline void put_swap_device(struct swap_info_struct *si)
>>  {
>> -rcu_read_unlock();
>> +percpu_ref_put(>users);
>>  }
>>  
>>  #else /* CONFIG_SWAP */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 66515a3a2824..90e197bc2eeb 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct 
>> swap_info_struct *p,
>>   * via preventing the swap device from being swapoff, until
>>   * put_swap_device() is called.  Otherwise return NULL.
>>   *
>> - * The entirety of the RCU read critical section must come before the
>> - * return from or after the call to synchronize_rcu() in
>> - * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
>> - * true, the si->map, si->cluster_info, etc. must be valid in the
>> - * critical section.
>> - *
>>   * Notice that swapoff or swapoff+swapon can still happen before the
>> - * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
>> - * in put_swap_device() if there isn't any other way to prevent
>> - * swapoff, such as page lock, page table lock, etc.  The caller must
>> - * be prepared for that.  For example, the following situation is
>> - * possible.
>> + * percpu_ref_tryget_live() in get_swap_device() or after the
>> + * percpu_ref_put() in put_swap_device() if there isn't any other way
>> + * to prevent swapoff, such as page lock, page table lock, etc.  The
>> + * caller must be prepared for that.  For example, the following
>> + * situation is possible.
>>   *
>>   *   CPU1   CPU2
>>   *   do_swap_page()
>> @@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
>> entry)
>>  si = swp_swap_info(entry);
>>  if (!si)
>>  goto bad_nofile;
>> -
>> -rcu_read_lock();
>> -if (data_race(!(si->flags & SWP_VALID)))
>> -goto unlock_out;
>> +if (!percpu_ref_tryget_live(>users))
>> +goto out;
>> +/*
>> + * Guarantee we will not reference uninitialized fields
>> + * of swap_info_struct.
>> + */
> 
> /*
>  * Guarantee the si->users are checked before accessing other fields of
>  * swap_info_struct.
> */
> 
>> +smp_rmb();
> 
> Usually, smp_rmb() need to be paired with smp_wmb().  Some comments are
> needed for that.  Here smb_rmb() is paired with the spin_unlock() after
> setup_swap_info() in enable_swap_info().
> 
>>  offset = swp_offset(entry);
>>  if (offset >= si->max)
>> -goto unlock_out;
>> +goto put_out;
>>  
>>  return si;
>>  bad_nofile:
>>  pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
>>  out:
>>  return NULL;
>> -unlock_out:
>> -rcu_read_unlock();
>> +put_out:
>> +percpu_ref_put(>users);
>>  return NULL;
>>  }
>>  
>> @@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct 
>> *p, int prio,
>>  
>>  static void _enable_swap_info(struct swap_info_struct *p)
>>  {
>> -p->flags |= SWP_WRITEOK | SWP_VALID;
>> +p->flags |= SWP_WRITEOK;
>>  atomic_long_add(p->pages, _swap_pages)

Re: [PATCH v2 3/5] swap: fix do_swap_page() race with swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 10:23, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1CPU 2
>> --
>> do_swap_page
> 
> This is OK for swap cache cases.  So
> 
>   if (data_race(si->flags & SWP_SYNCHRONOUS_IO))
> 
> should be shown here.

Ok.

> 
>>   swap_readpage(skip swap cache case)
>> if (data_race(sis->flags & SWP_FS_OPS)) {
>>  swapoff
>>p->flags = &= ~SWP_VALID;
>>..
>>synchronize_rcu();
>>..
>>p->swap_file = NULL;
>> struct file *swap_file = sis->swap_file;
>> struct address_space *mapping = swap_file->f_mapping;[oops!]
>>
>> Note that for the pages that are swapped in through swap cache, this isn't
>> an issue. Because the page is locked, and the swap entry will be marked
>> with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
>> unlocked.
>>
>> Using current get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really long
>> time. And this race may not be really pernicious because swapoff is usually
>> done when system shutdown only. To reduce the performance overhead on the
>> hot-path as much as possible, it appears we can use the percpu_ref to close
>> this race window(as suggested by Huang, Ying).
> 
> I still suggest to squash PATCH 1-3, at least PATCH 1-2.  That will
> change the relevant code together and make it easier to review.
> 

Will squash PATCH 1-2. Thanks.

> Best Regards,
> Huang, Ying
> 
>> Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous 
>> device")
>> Reported-by: kernel test robot  (auto build test ERROR)
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h | 9 +
>>  mm/memory.c  | 9 +
>>  2 files changed, 18 insertions(+)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 993693b38109..523c2411a135 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -528,6 +528,15 @@ static inline struct swap_info_struct 
>> *swp_swap_info(swp_entry_t entry)
>>  return NULL;
>>  }
>>  
>> +static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
>> +{
>> +return NULL;
>> +}
>> +
>> +static inline void put_swap_device(struct swap_info_struct *si)
>> +{
>> +}
>> +
>>  #define swap_address_space(entry)   (NULL)
>>  #define get_nr_swap_pages() 0L
>>  #define total_swap_pages0L
>> diff --git a/mm/memory.c b/mm/memory.c
>> index 27014c3bde9f..7a2fe12cf641 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  {
>>  struct vm_area_struct *vma = vmf->vma;
>>  struct page *page = NULL, *swapcache;
>> +struct swap_info_struct *si = NULL;
>>  swp_entry_t entry;
>>  pte_t pte;
>>  int locked;
>> @@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  goto out;
>>  }
>>  
>> +/* Prevent swapoff from happening to us. */
>> +si = get_swap_device(entry);
>> +if (unlikely(!si))
>> +goto out;
>>  
>>  delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
>>  page = lookup_swap_cache(entry, vma, vmf->address);
>> @@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock:
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  out:
>> +if (si)
>> +put_swap_device(si);
>>  return ret;
>>  out_nomap:
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>> @@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock_page(swapcache);
>>  put_page(swapcache);
>>  }
>> +if (si)
>> +put_swap_device(si);
>>  return ret;
>>  }
> .
> 



Re: [PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-19 Thread Miaohe Lin
On 2021/4/19 10:15, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1   CPU 2
>> -   -
>> shmem_swapin
>>   swap_cluster_readahead
>> if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
>> swapoff
>>   si->flags &= ~SWP_VALID;
>>   ..
>>   synchronize_rcu();
>>   ..
> 
> You have removed these code in the previous patches of the series.  And
> they are not relevant in this patch.

Yes, I should change these. Thanks.

> 
>>   si->swap_file = NULL;
>> struct inode *inode = si->swap_file->f_mapping->host;[oops!]
>>
>> Close this race window by using get/put_swap_device() to guard against
>> concurrent swapoff.
>>
>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> 
> No.  This isn't the commit that introduces the race condition.  Please
> recheck your git blame result.
> 

I think it is really hard to find the exact commit. I used git blame and found
that this race should already have existed when this code was introduced. Any
suggestion? Thanks.

> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/shmem.c | 6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/mm/shmem.c b/mm/shmem.c
>> index 26c76b13ad23..936ba5595297 100644
>> --- a/mm/shmem.c
>> +++ b/mm/shmem.c
>> @@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
>> vm_area_struct *vma)
>>  static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
>>  struct shmem_inode_info *info, pgoff_t index)
>>  {
>> +struct swap_info_struct *si;
>>  struct vm_area_struct pvma;
>>  struct page *page;
>>  struct vm_fault vmf = {
>>  .vma = ,
>>  };
>>  
>> +/* Prevent swapoff from happening to us. */
>> +si = get_swap_device(swap);
>> +if (unlikely(!si))
>> +return NULL;
>>  shmem_pseudo_vma_init(, info, index);
>>  page = swap_cluster_readahead(swap, gfp, );
>>  shmem_pseudo_vma_destroy();
>> +put_swap_device(si);
>>  
>>  return page;
>>  }
> .
> 



Re: [PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()

2021-04-19 Thread Miaohe Lin
On 2021/4/19 9:53, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> While we released the pte lock, somebody else might faulted in this pte.
>> So we should check whether it's swap pte first to guard against such race
>> or swp_type would be unexpected. But the swap_entry isn't used in this
>> function and we will have enough checking when we really operate the PTE
>> entries later. So checking for non_swap_entry() is not really needed here
>> and should be removed to avoid confusion.
> 
> Please rephrase the change log to describe why we have the code and why
> it's unnecessary now.  You can dig the git history via git-blame to find
> out it.
> 

Will try to do it. Thanks.

> The patch itself looks good to me.
> 
> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/swap_state.c | 6 --
>>  1 file changed, 6 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 272ea2108c9d..df5405384520 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  {
>>  struct vm_area_struct *vma = vmf->vma;
>>  unsigned long ra_val;
>> -swp_entry_t entry;
>>  unsigned long faddr, pfn, fpfn;
>>  unsigned long start, end;
>>  pte_t *pte, *orig_pte;
>> @@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  
>>  faddr = vmf->address;
>>  orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>> -entry = pte_to_swp_entry(*pte);
>> -if ((unlikely(non_swap_entry(entry {
>> -pte_unmap(orig_pte);
>> -return;
>> -}
>>  
>>  fpfn = PFN_DOWN(faddr);
>>  ra_val = GET_SWAP_RA_VAL(vma);
> .
> 



Re: [PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-19 Thread Miaohe Lin
On 2021/4/19 10:48, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> patch adds the percpu_ref support for swap.
>>
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h |  3 +++
>>  mm/swapfile.c| 33 +
>>  2 files changed, 32 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..8be36eb58b7a 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +struct percpu_ref users;/* serialization against concurrent 
>> swapoff */
> 
> The comments aren't general enough.  We use this to check whether the
> swap device has been fully initialized, etc. May be something as below?
> 
> /* indicate and keep swap device valid */

Looks good.

> 
>>  unsigned long   flags;  /* SWP_USED etc: see above */
>>  signed shortprio;   /* swap priority of this type */
>>  struct plist_node list; /* entry in swap_active_head */
>> @@ -260,6 +261,8 @@ struct swap_info_struct {
>>  struct block_device *bdev;  /* swap device or bdev of swap file */
>>  struct file *swap_file; /* seldom referenced */
>>  unsigned int old_block_size;/* seldom referenced */
>> +bool ref_initialized;   /* seldom referenced */
>> +struct completion comp; /* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
>>  atomic_t frontswap_pages;   /* frontswap pages in-use counter */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..66515a3a2824 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  #include 
>>  #include 
>> @@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
>>  spin_unlock(>lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +struct swap_info_struct *si;
>> +
>> +si = container_of(ref, struct swap_info_struct, users);
>> +complete(>comp);
>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  struct swap_cluster_info *ci = si->cluster_info;
>> @@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct 
>> *p, int prio,
>>   * Guarantee swap_map, cluster_info, etc. fields are valid
>>   * between get/put_swap_device() if SWP_VALID bit is set
>>   */
>> -synchronize_rcu();
> 
> You cannot remove this without changing get/put_swap_device().  It's
> better to squash at least PATCH 1-2.

Will squash PATCH 1-2. Thanks.

> 
>> +percpu_ref_resurrect(>users);
>>  spin_lock(_lock);
>>  spin_lock(>lock);
>>  _enable_swap_info(p);
>> @@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
>> specialfile)
>>  p->flags &= ~SWP_VALID; /* mark swap device as invalid */
>>  spin_unlock(>lock);
>>  spin_unlock(_lock);
>> +
>> +percpu_ref_kill(>users);
>>  /*
>> - * wait for swap operations protected by get/put_swap_device()
>> - * to complete
>> + * We need synchronize_rcu() here to protect the accessing
>> + * to the swap cache data structure.
>>   */
>>  synchronize_rcu();
>> +/*
>> + * Wait for swap operations protected by get/put_swap_device()
>> + * to complete.
>> + */
> 
> I think the comments (after some revision) can be moved before
> percpu_ref_kill().  The synchronize_rcu() comments can be merged.
> 

Ok.

>> +wait_for_completion(>comp);
>>  
>>  flush_work(>discard_work);
>>  
>> @@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct 
>> *si)
>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  {
>>  struct swap_info_struct *p;
>> -struct filename *name;
>> +struct filename *name = NULL;
>>  struct file *swap_file = NULL;
>>  struct address_space *mapping;
>>  int prio;
>> @@ -3163,6 +3179,15 @@ SYSCALL_DEFIN

[PATCH v2 0/5] close various race windows for swap

2021-04-17 Thread Miaohe Lin
Hi all,
When I was investigating the swap code, I found some possible race
windows. This series aims to fix all of these races. But using the current
get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take a really
long time. To reduce the performance overhead on the hot path as
much as possible, it appears we can use percpu_ref to close this
race window (as suggested by Huang, Ying). Patch 1 adds percpu_ref
support for swap, and most of the remaining patches use it to
close various race windows. More details can be found in the respective
changelogs. Thanks!

v1->v2:
  reorganize the patch-2/5
  various enhance and fixup per Huang, Ying
  Many thanks for the comments of Huang, Ying, Dennis Zhou and Tim Chen.

Miaohe Lin (5):
  mm/swapfile: add percpu_ref support for swap
  mm/swapfile: use percpu_ref to serialize against concurrent swapoff
  swap: fix do_swap_page() race with swapoff
  mm/swap: remove confusing checking for non_swap_entry() in
swap_ra_info()
  mm/shmem: fix shmem_swapin() race with swapoff

 include/linux/swap.h | 15 +++--
 mm/memory.c  |  9 ++
 mm/shmem.c   |  6 
 mm/swap_state.c  |  6 
 mm/swapfile.c| 74 +++-
 5 files changed, 73 insertions(+), 37 deletions(-)

-- 
2.19.1



[PATCH v2 5/5] mm/shmem: fix shmem_swapin() race with swapoff

2021-04-17 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                           CPU 2
-----                                           -----
shmem_swapin
  swap_cluster_readahead
    if (likely(si->flags & (SWP_BLKDEV | SWP_FS_OPS))) {
                                                swapoff
                                                  si->flags &= ~SWP_VALID;
                                                  ..
                                                  synchronize_rcu();
                                                  ..
                                                  si->swap_file = NULL;
    struct inode *inode = si->swap_file->f_mapping->host;[oops!]

Close this race window by using get/put_swap_device() to guard against
concurrent swapoff.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Miaohe Lin 
---
 mm/shmem.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/mm/shmem.c b/mm/shmem.c
index 26c76b13ad23..936ba5595297 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1492,15 +1492,21 @@ static void shmem_pseudo_vma_destroy(struct 
vm_area_struct *vma)
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
 {
+   struct swap_info_struct *si;
struct vm_area_struct pvma;
struct page *page;
struct vm_fault vmf = {
		.vma = &pvma,
};
 
+   /* Prevent swapoff from happening to us. */
+   si = get_swap_device(swap);
+   if (unlikely(!si))
+   return NULL;
	shmem_pseudo_vma_init(&pvma, info, index);
	page = swap_cluster_readahead(swap, gfp, &vmf);
	shmem_pseudo_vma_destroy(&pvma);
+   put_swap_device(si);
 
return page;
 }
-- 
2.19.1



[PATCH v2 3/5] swap: fix do_swap_page() race with swapoff

2021-04-17 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                           CPU 2
-----                                           -----
do_swap_page
  swap_readpage(skip swap cache case)
    if (data_race(sis->flags & SWP_FS_OPS)) {
                                                swapoff
                                                  p->flags &= ~SWP_VALID;
                                                  ..
                                                  synchronize_rcu();
                                                  ..
                                                  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]

Note that for the pages that are swapped in through swap cache, this isn't
an issue. Because the page is locked, and the swap entry will be marked
with SWAP_HAS_CACHE, so swapoff() can not proceed until the page has been
unlocked.

Using the current get/put_swap_device() to guard against concurrent swapoff for
swap_readpage() looks terrible because swap_readpage() may take a really long
time. And this race may not be really pernicious because swapoff is usually
done only at system shutdown. To reduce the performance overhead on the
hot path as much as possible, it appears we can use percpu_ref to close
this race window (as suggested by Huang, Ying).

Fixes: 0bcac06f27d7 ("mm,swap: skip swapcache for swapin of synchronous device")
Reported-by: kernel test robot  (auto build test ERROR)
Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h | 9 +
 mm/memory.c  | 9 +
 2 files changed, 18 insertions(+)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 993693b38109..523c2411a135 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -528,6 +528,15 @@ static inline struct swap_info_struct 
*swp_swap_info(swp_entry_t entry)
return NULL;
 }
 
+static inline struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+   return NULL;
+}
+
+static inline void put_swap_device(struct swap_info_struct *si)
+{
+}
+
 #define swap_address_space(entry)  (NULL)
 #define get_nr_swap_pages()0L
 #define total_swap_pages   0L
diff --git a/mm/memory.c b/mm/memory.c
index 27014c3bde9f..7a2fe12cf641 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
+   struct swap_info_struct *si = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3338,6 +3339,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out;
}
 
+   /* Prevent swapoff from happening to us. */
+   si = get_swap_device(entry);
+   if (unlikely(!si))
+   goto out;
 
delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
@@ -3514,6 +3519,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+   if (si)
+   put_swap_device(si);
return ret;
 out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3532,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock_page(swapcache);
put_page(swapcache);
}
+   if (si)
+   put_swap_device(si);
return ret;
 }
 
-- 
2.19.1



[PATCH v2 2/5] mm/swapfile: use percpu_ref to serialize against concurrent swapoff

2021-04-17 Thread Miaohe Lin
Use percpu_ref to serialize against concurrent swapoff. Also remove the
SWP_VALID flag, because it was only used together with the RCU solution.

Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h |  3 +--
 mm/swapfile.c| 43 +--
 2 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 8be36eb58b7a..993693b38109 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -177,7 +177,6 @@ enum {
SWP_PAGE_DISCARD = (1 << 10),   /* freed swap page-cluster discards */
SWP_STABLE_WRITES = (1 << 11),  /* no overwrite PG_writeback pages */
SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */
-   SWP_VALID   = (1 << 13),/* swap is valid to be operated on? */
/* add others here before... */
SWP_SCANNING= (1 << 14),/* refcount in scan_swap_map */
 };
@@ -514,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-   rcu_read_unlock();
+   percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 66515a3a2824..90e197bc2eeb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1279,18 +1279,12 @@ static unsigned char __swap_entry_free_locked(struct 
swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1  CPU2
  *   do_swap_page()
@@ -1318,21 +1312,24 @@ struct swap_info_struct *get_swap_device(swp_entry_t 
entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
-   rcu_read_lock();
-   if (data_race(!(si->flags & SWP_VALID)))
-   goto unlock_out;
+   if (!percpu_ref_tryget_live(&si->users))
+   goto out;
+   /*
+* Guarantee we will not reference uninitialized fields
+* of swap_info_struct.
+*/
+   smp_rmb();
offset = swp_offset(entry);
if (offset >= si->max)
-   goto unlock_out;
+   goto put_out;
 
return si;
 bad_nofile:
pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
return NULL;
-unlock_out:
-   rcu_read_unlock();
+put_out:
+   percpu_ref_put(&si->users);
return NULL;
 }
 
@@ -2475,7 +2472,7 @@ static void setup_swap_info(struct swap_info_struct *p, 
int prio,
 
 static void _enable_swap_info(struct swap_info_struct *p)
 {
-   p->flags |= SWP_WRITEOK | SWP_VALID;
+   p->flags |= SWP_WRITEOK;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
 
@@ -2507,7 +2504,7 @@ static void enable_swap_info(struct swap_info_struct *p, 
int prio,
spin_unlock(&swap_lock);
/*
 * Guarantee swap_map, cluster_info, etc. fields are valid
-* between get/put_swap_device() if SWP_VALID bit is set
+* between get/put_swap_device().
 */
percpu_ref_resurrect(&p->users);
spin_lock(&swap_lock);
@@ -2625,12 +2622,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
specialfile)
 
reenable_swap_slots_cache_unlock();
 
-   spin_lock(&swap_lock);
-   spin_lock(&p->lock);
-   p->flags &= ~SWP_VALID; /* mark swap device as invalid */
-   spin_unlock(&p->lock);
-   spin_unlock(&swap_lock);
-
    percpu_ref_kill(&p->users);
/*
 * We need synchronize_rcu() here to protect the accessing
-- 
2.19.1



[PATCH v2 4/5] mm/swap: remove confusing checking for non_swap_entry() in swap_ra_info()

2021-04-17 Thread Miaohe Lin
While we released the pte lock, somebody else might have faulted in this
pte. So we should check whether it's a swap pte first to guard against such
a race, or the swp_type would be unexpected. But the swap entry isn't used
in this function, and we will do enough checking when we really operate the
PTE entries later. So checking for non_swap_entry() is not really needed
here and should be removed to avoid confusion.

Signed-off-by: Miaohe Lin 
---
 mm/swap_state.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..df5405384520 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -721,7 +721,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
-   swp_entry_t entry;
unsigned long faddr, pfn, fpfn;
unsigned long start, end;
pte_t *pte, *orig_pte;
@@ -739,11 +738,6 @@ static void swap_ra_info(struct vm_fault *vmf,
 
faddr = vmf->address;
orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-   entry = pte_to_swp_entry(*pte);
-   if ((unlikely(non_swap_entry(entry)))) {
-   pte_unmap(orig_pte);
-   return;
-   }
 
fpfn = PFN_DOWN(faddr);
ra_val = GET_SWAP_RA_VAL(vma);
-- 
2.19.1



[PATCH v2 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-17 Thread Miaohe Lin
We will use percpu-refcount to serialize against concurrent swapoff. This
patch adds the percpu_ref support for swap.

Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h |  3 +++
 mm/swapfile.c| 33 +
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..8be36eb58b7a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -240,6 +240,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+   struct percpu_ref users;/* serialization against concurrent 
swapoff */
unsigned long   flags;  /* SWP_USED etc: see above */
signed shortprio;   /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */
@@ -260,6 +261,8 @@ struct swap_info_struct {
struct block_device *bdev;  /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size;/* seldom referenced */
+   bool ref_initialized;   /* seldom referenced */
+   struct completion comp; /* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
atomic_t frontswap_pages;   /* frontswap pages in-use counter */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..66515a3a2824 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -511,6 +512,14 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+   struct swap_info_struct *si;
+
+   si = container_of(ref, struct swap_info_struct, users);
+   complete(&si->comp);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
struct swap_cluster_info *ci = si->cluster_info;
@@ -2500,7 +2509,7 @@ static void enable_swap_info(struct swap_info_struct *p, 
int prio,
 * Guarantee swap_map, cluster_info, etc. fields are valid
 * between get/put_swap_device() if SWP_VALID bit is set
 */
-   synchronize_rcu();
+   percpu_ref_resurrect(&p->users);
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p);
@@ -2621,11 +2630,18 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
specialfile)
p->flags &= ~SWP_VALID; /* mark swap device as invalid */
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+
+   percpu_ref_kill(&p->users);
/*
-* wait for swap operations protected by get/put_swap_device()
-* to complete
+* We need synchronize_rcu() here to protect the accessing
+* to the swap cache data structure.
 */
synchronize_rcu();
+   /*
+* Wait for swap operations protected by get/put_swap_device()
+* to complete.
+*/
+   wait_for_completion(&p->comp);

flush_work(&p->discard_work);
 
@@ -3132,7 +3148,7 @@ static bool swap_discardable(struct swap_info_struct *si)
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
struct swap_info_struct *p;
-   struct filename *name;
+   struct filename *name = NULL;
struct file *swap_file = NULL;
struct address_space *mapping;
int prio;
@@ -3163,6 +3179,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, 
specialfile, int, swap_flags)
 
INIT_WORK(&p->discard_work, swap_discard_work);
 
+   if (!p->ref_initialized) {
+   error = percpu_ref_init(&p->users, swap_users_ref_free,
+   PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+   if (unlikely(error))
+   goto bad_swap;
+   init_completion(&p->comp);
+   p->ref_initialized = true;
+   }
+
name = getname(specialfile);
if (IS_ERR(name)) {
error = PTR_ERR(name);
-- 
2.19.1
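
To recap how the new si->users reference is meant to cycle across
swapon/swapoff (a sketch distilled from the hunks above; no behaviour beyond
the patch is implied):

    /* swapon, the first time this swap_info_struct slot is used: */
    error = percpu_ref_init(&p->users, swap_users_ref_free,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL);  /* born dead */
    init_completion(&p->comp);

    /* enable_swap_info(): allow get_swap_device() to succeed */
    percpu_ref_resurrect(&p->users);

    /* swapoff: */
    percpu_ref_kill(&p->users);     /* new percpu_ref_tryget_live() calls fail */
    wait_for_completion(&p->comp);  /* swap_users_ref_free() ran: last ref dropped */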



Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-16 Thread Miaohe Lin
On 2021/4/16 14:25, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/15 22:31, Dennis Zhou wrote:
>>> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>>>> Dennis Zhou  writes:
>>>>
>>>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>>>> Dennis Zhou  writes:
>>>>>>
>>>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>>>> Dennis Zhou  writes:
>>>>>>>>
>>>>>>>>> Hello,
>>>>>>>>>
>>>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>
>>>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>>>
>>>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>>>> "Huang, Ying"  writes:
>>>>>>>>>>>>>>
>>>>>>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent 
>>>>>>>>>>>>>>>> swapoff. This
>>>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin 
>>>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>>>  mm/swapfile.c| 25 ++---
>>>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>>>
>>>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>>>> +  struct percpu_ref users;/* serialization 
>>>>>>>>>>>>>>>> against concurrent swapoff */
>>>>>>>>>>>>>>>>unsigned long   flags;  /* SWP_USED etc: see 
>>>>>>>>>>>>>>>> above */
>>>>>>>>>>>>>>>>signed shortprio;   /* swap priority of 
>>>>>>>>>>>>>>>> this type */
>>>>>>>>>>>>>>>>struct plist_node list; /* entry in 
>>>>>>>>>>>>>>>> swap_active_head */
>>>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>>>struct block_device *bdev;  /* swap device or bdev 
>>>>>>>>>>>>>>>> of swap file */
>>>>>>>>>>>>>>>>struct file *swap_file; /* seldom referenced */
>>>>>>>>>>>>>>>>unsigned int old_block_size;/* seldom referenced */
>>>>>>>>>>>>>>>> +  struct completion comp; /* seldom referenced */
>>>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>>>unsigned long *frontswap_map;   /* frontswap in-use, 
>>>>>>>>>>>>>>>

Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-15 Thread Miaohe Lin
On 2021/4/15 22:31, Dennis Zhou wrote:
> On Thu, Apr 15, 2021 at 01:24:31PM +0800, Huang, Ying wrote:
>> Dennis Zhou  writes:
>>
>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>> Dennis Zhou  writes:
>>>>
>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>> Dennis Zhou  writes:
>>>>>>
>>>>>>> Hello,
>>>>>>>
>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>> Miaohe Lin  writes:
>>>>>>>>
>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>
>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>> "Huang, Ying"  writes:
>>>>>>>>>>>>
>>>>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>>>>
>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent 
>>>>>>>>>>>>>> swapoff. This
>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin 
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>  mm/swapfile.c| 25 ++---
>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>> +struct percpu_ref users;/* serialization 
>>>>>>>>>>>>>> against concurrent swapoff */
>>>>>>>>>>>>>>  unsigned long   flags;  /* SWP_USED etc: see 
>>>>>>>>>>>>>> above */
>>>>>>>>>>>>>>  signed shortprio;   /* swap priority of 
>>>>>>>>>>>>>> this type */
>>>>>>>>>>>>>>  struct plist_node list; /* entry in 
>>>>>>>>>>>>>> swap_active_head */
>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>  struct block_device *bdev;  /* swap device or bdev 
>>>>>>>>>>>>>> of swap file */
>>>>>>>>>>>>>>  struct file *swap_file; /* seldom referenced */
>>>>>>>>>>>>>>  unsigned int old_block_size;/* seldom referenced */
>>>>>>>>>>>>>> +struct completion comp; /* seldom referenced */
>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>  unsigned long *frontswap_map;   /* frontswap in-use, 
>>>>>>>>>>>>>> one bit per page */
>>>>>>>>>>>>>>  atomic_t frontswap_pages;   /* frontswap pages 
>>>>>>>>>>>>>> in-use counter */
>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>> --- a/mm/swapfile.c
>&g

Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-15 Thread Miaohe Lin
On 2021/4/15 12:20, Dennis Zhou wrote:
> On Thu, Apr 15, 2021 at 11:16:42AM +0800, Miaohe Lin wrote:
>> On 2021/4/14 22:53, Dennis Zhou wrote:
>>> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>>>> Dennis Zhou  writes:
>>>>
>>>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>>>> Dennis Zhou  writes:
>>>>>>
>>>>>>> Hello,
>>>>>>>
>>>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>>>> Miaohe Lin  writes:
>>>>>>>>
>>>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>
>>>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>>>> "Huang, Ying"  writes:
>>>>>>>>>>>>
>>>>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>>>>
>>>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent 
>>>>>>>>>>>>>> swapoff. This
>>>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> Signed-off-by: Miaohe Lin 
>>>>>>>>>>>>>> ---
>>>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>>>  mm/swapfile.c| 25 ++---
>>>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>>>   */
>>>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>>>> +struct percpu_ref users;/* serialization 
>>>>>>>>>>>>>> against concurrent swapoff */
>>>>>>>>>>>>>>  unsigned long   flags;  /* SWP_USED etc: see 
>>>>>>>>>>>>>> above */
>>>>>>>>>>>>>>  signed shortprio;   /* swap priority of 
>>>>>>>>>>>>>> this type */
>>>>>>>>>>>>>>  struct plist_node list; /* entry in 
>>>>>>>>>>>>>> swap_active_head */
>>>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>>>  struct block_device *bdev;  /* swap device or bdev 
>>>>>>>>>>>>>> of swap file */
>>>>>>>>>>>>>>  struct file *swap_file; /* seldom referenced */
>>>>>>>>>>>>>>  unsigned int old_block_size;/* seldom referenced */
>>>>>>>>>>>>>> +struct completion comp; /* seldom referenced */
>>>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>>>  unsigned long *frontswap_map;   /* frontswap in-use, 
>>>>>>>>>>>>>> one bit per page */
>>>>>>>>>>>>>>  atomic_t frontswap_pages;   /* frontswap pages 
>>>>>>>>>>>>>> in-use counter */
>>>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>>>> --- a/mm/swapfile.c
>>>

Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-14 Thread Miaohe Lin
On 2021/4/15 0:13, Tim Chen wrote:
> 
> 
> On 4/13/21 6:04 PM, Huang, Ying wrote:
>> Tim Chen  writes:
>>
>>> On 4/12/21 6:27 PM, Huang, Ying wrote:
>>>

 This isn't the commit that introduces the race.  You can use `git blame`
 find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
 swap: skip swapcache for swapin of synchronous device".

 And I suggest to merge 1/5 and 2/5 to make it easy to get the full
 picture.
>>>
>>> I'll suggest make fix to do_swap_page race with get/put_swap_device
>>> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
>>> be combined together.
>>
>> The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
>> think it's good to wrap swap_read_page() with it.  After all, some
>> complex operations are done in swap_read_page(), including
>> blk_io_schedule().
>>
> 
> In that case then have the patches to make get/put_swap_device to use
> percpu_ref first.  And the patch to to fix the race in do_swap_page
> later in another patch.
> 
> Patch 2 is mixing the two.
> 

Looks like a good way to organize this patch series. Many thanks!

> Tim
> .
> 



Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-14 Thread Miaohe Lin
On 2021/4/14 22:53, Dennis Zhou wrote:
> On Wed, Apr 14, 2021 at 01:44:58PM +0800, Huang, Ying wrote:
>> Dennis Zhou  writes:
>>
>>> On Wed, Apr 14, 2021 at 11:59:03AM +0800, Huang, Ying wrote:
>>>> Dennis Zhou  writes:
>>>>
>>>>> Hello,
>>>>>
>>>>> On Wed, Apr 14, 2021 at 10:06:48AM +0800, Huang, Ying wrote:
>>>>>> Miaohe Lin  writes:
>>>>>>
>>>>>>> On 2021/4/14 9:17, Huang, Ying wrote:
>>>>>>>> Miaohe Lin  writes:
>>>>>>>>
>>>>>>>>> On 2021/4/12 15:24, Huang, Ying wrote:
>>>>>>>>>> "Huang, Ying"  writes:
>>>>>>>>>>
>>>>>>>>>>> Miaohe Lin  writes:
>>>>>>>>>>>
>>>>>>>>>>>> We will use percpu-refcount to serialize against concurrent 
>>>>>>>>>>>> swapoff. This
>>>>>>>>>>>> patch adds the percpu_ref support for later fixup.
>>>>>>>>>>>>
>>>>>>>>>>>> Signed-off-by: Miaohe Lin 
>>>>>>>>>>>> ---
>>>>>>>>>>>>  include/linux/swap.h |  2 ++
>>>>>>>>>>>>  mm/swapfile.c| 25 ++---
>>>>>>>>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>>>>>>>>
>>>>>>>>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>>>>>>>>> index 144727041e78..849ba5265c11 100644
>>>>>>>>>>>> --- a/include/linux/swap.h
>>>>>>>>>>>> +++ b/include/linux/swap.h
>>>>>>>>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>>>>>>>>   * The in-memory structure used to track swap areas.
>>>>>>>>>>>>   */
>>>>>>>>>>>>  struct swap_info_struct {
>>>>>>>>>>>> +  struct percpu_ref users;/* serialization against 
>>>>>>>>>>>> concurrent swapoff */
>>>>>>>>>>>>unsigned long   flags;  /* SWP_USED etc: see above */
>>>>>>>>>>>>signed shortprio;   /* swap priority of this type */
>>>>>>>>>>>>struct plist_node list; /* entry in swap_active_head */
>>>>>>>>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>>>>>>>>struct block_device *bdev;  /* swap device or bdev of swap 
>>>>>>>>>>>> file */
>>>>>>>>>>>>struct file *swap_file; /* seldom referenced */
>>>>>>>>>>>>unsigned int old_block_size;/* seldom referenced */
>>>>>>>>>>>> +  struct completion comp; /* seldom referenced */
>>>>>>>>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>>>>>>>>unsigned long *frontswap_map;   /* frontswap in-use, one bit 
>>>>>>>>>>>> per page */
>>>>>>>>>>>>atomic_t frontswap_pages;   /* frontswap pages in-use 
>>>>>>>>>>>> counter */
>>>>>>>>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>>>>>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>>>>>>>>> --- a/mm/swapfile.c
>>>>>>>>>>>> +++ b/mm/swapfile.c
>>>>>>>>>>>> @@ -39,6 +39,7 @@
>>>>>>>>>>>>  #include 
>>>>>>>>>>>>  #include 
>>>>>>>>>>>>  #include 
>>>>>>>>>>>> +#include 
>>>>>>>>>>>>  
>>>>>>>>>>>>  #include 
>>>>>>>>>>>>  #include 
>>>>>>>>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct 
>>>>>>>>>>>> work_struct *work)
>>>>>>>>>>>>spin_unlock(>lock);
>>>>>>

Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-13 Thread Miaohe Lin
On 2021/4/14 11:07, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/13 9:27, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1  CPU 2
>>>> -  -
>>>> do_swap_page
>>>>   synchronous swap_readpage
>>>> alloc_page_vma
>>>>swapoff
>>>>  release swap_file, bdev, or ...
>>>>   swap_readpage
>>>>check sis->flags is ok
>>>>  access swap_file, bdev...[oops!]
>>>>si->flags = 0
>>>>
>>>> Using current get/put_swap_device() to guard against concurrent swapoff for
>>>> swap_readpage() looks terrible because swap_readpage() may take really long
>>>> time. And this race may not be really pernicious because swapoff is usually
>>>> done when system shutdown only. To reduce the performance overhead on the
>>>> hot-path as much as possible, it appears we can use the percpu_ref to close
>>>> this race window(as suggested by Huang, Ying).
>>>>
>>>> Fixes: 235b62176712 ("mm/swap: add cluster lock")
>>>
>>> This isn't the commit that introduces the race.  You can use `git blame`
>>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>>> swap: skip swapcache for swapin of synchronous device".
>>>
>>
>> Sorry about it! What I refer to is commit eb085574a752 ("mm, swap: fix race 
>> between
>> swapoff and some swap operations"). And I think this commit does not fix the 
>> race
>> condition completely, so I reuse the Fixes tag inside it.
>>
>>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>>> picture.
>>>
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  include/linux/swap.h |  2 +-
>>>>  mm/memory.c  | 10 ++
>>>>  mm/swapfile.c| 28 +++-
>>>>  3 files changed, 22 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>> index 849ba5265c11..9066addb57fd 100644
>>>> --- a/include/linux/swap.h
>>>> +++ b/include/linux/swap.h
>>>> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>>>  
>>>>  static inline void put_swap_device(struct swap_info_struct *si)
>>>>  {
>>>> -  rcu_read_unlock();
>>>> +  percpu_ref_put(>users);
>>>>  }
>>>>  
>>>>  #else /* CONFIG_SWAP */
>>>> diff --git a/mm/memory.c b/mm/memory.c
>>>> index cc71a445c76c..8543c47b955c 100644
>>>> --- a/mm/memory.c
>>>> +++ b/mm/memory.c
>>>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>  {
>>>>struct vm_area_struct *vma = vmf->vma;
>>>>struct page *page = NULL, *swapcache;
>>>> +  struct swap_info_struct *si = NULL;
>>>>swp_entry_t entry;
>>>>pte_t pte;
>>>>int locked;
>>>> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>>>}
>>>>  
>>>>
>>>
>>> I suggest to add comments here as follows (words copy from Matthew Wilcox)
>>>
>>> /* Prevent swapoff from happening to us */
>>
>> Ok.
>>
>>>
>>>> +  si = get_swap_device(entry);
>>>> +  /* In case we raced with swapoff. */
>>>> +  if (unlikely(!si))
>>>> +  goto out;
>>>> +
>>>
>>> Because we wrap the whole do_swap_page() with get/put_swap_device()
>>> now.  We can remove several get/put_swap_device() for function called by
>>> do_swap_page().  That can be another optimization patch.
>>
>> I tried to remove several get/put_swap_device() for function called
>> by do_swap_page() only before I send this series. But it seems they have
>> other callers without proper get/put_swap_device().
> 
> Then we need to revise these callers instead.  Anyway, can be another
> series.

Yes, it can be another series.
Thanks.

> 
> Best Regards,
> Huang, Ying
> 
> .
> 



Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-13 Thread Miaohe Lin
On 2021/4/13 9:27, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1                                 CPU 2
>> -----                                 -----
>> do_swap_page
>>   synchronous swap_readpage
>>     alloc_page_vma
>>                                       swapoff
>>                                         release swap_file, bdev, or ...
>>   swap_readpage
>>     check sis->flags is ok
>>     access swap_file, bdev...[oops!]
>>                                       si->flags = 0
>>
>> Using current get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really long
>> time. And this race may not be really pernicious because swapoff is usually
>> done when system shutdown only. To reduce the performance overhead on the
>> hot-path as much as possible, it appears we can use the percpu_ref to close
>> this race window(as suggested by Huang, Ying).
>>
>> Fixes: 235b62176712 ("mm/swap: add cluster lock")
> 
> This isn't the commit that introduces the race.  You can use `git blame`
> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
> swap: skip swapcache for swapin of synchronous device".
> 

Sorry about it! What I refer to is commit eb085574a752 ("mm, swap: fix race 
between
swapoff and some swap operations"). And I think this commit does not fix the 
race
condition completely, so I reuse the Fixes tag inside it.

> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
> picture.
> 
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h |  2 +-
>>  mm/memory.c  | 10 ++
>>  mm/swapfile.c| 28 +++-
>>  3 files changed, 22 insertions(+), 18 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 849ba5265c11..9066addb57fd 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
>>  
>>  static inline void put_swap_device(struct swap_info_struct *si)
>>  {
>> -rcu_read_unlock();
>> +percpu_ref_put(>users);
>>  }
>>  
>>  #else /* CONFIG_SWAP */
>> diff --git a/mm/memory.c b/mm/memory.c
>> index cc71a445c76c..8543c47b955c 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  {
>>  struct vm_area_struct *vma = vmf->vma;
>>  struct page *page = NULL, *swapcache;
>> +struct swap_info_struct *si = NULL;
>>  swp_entry_t entry;
>>  pte_t pte;
>>  int locked;
>> @@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  }
>>  
>>
> 
> I suggest to add comments here as follows (words copy from Matthew Wilcox)
> 
>   /* Prevent swapoff from happening to us */

Ok.

> 
>> +si = get_swap_device(entry);
>> +/* In case we raced with swapoff. */
>> +if (unlikely(!si))
>> +goto out;
>> +
> 
> Because we wrap the whole do_swap_page() with get/put_swap_device()
> now.  We can remove several get/put_swap_device() for function called by
> do_swap_page().  That can be another optimization patch.

I tried to remove several get/put_swap_device() calls for functions called
by do_swap_page() before I sent this series. But it seems they have
other callers without proper get/put_swap_device().

> 
>>  delayacct_set_flag(DELAYACCT_PF_SWAPIN);
>>  page = lookup_swap_cache(entry, vma, vmf->address);
>>  swapcache = page;
>> @@ -3514,6 +3520,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock:
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  out:
>> +if (si)
>> +put_swap_device(si);
>>  return ret;
>>  out_nomap:
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>> @@ -3525,6 +3533,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>>  unlock_page(swapcache);
>>  put_page(swapcache);
>>  }
>> +if (si)
>> +put_swap_device(si);
>>  return ret;
>>  }
>>  
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 724173cd7d0c..01032c72ceae 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -1280,18 +1280,12 @@ static unsigned char __swap_entry_free_locked(struct 
>> swap_info_struc

Re: [PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff

2021-04-13 Thread Miaohe Lin
On 2021/4/13 9:36, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> swap_cluster_readahead() could race with swapoff and might dereference
>> si->swap_file after it's released by swapoff. Close this race window by
>> using get/put_swap_device() pair.
> 
> I think we should fix the callers instead to reduce the overhead.  Now,
> do_swap_page() has been fixed.  We need to fix shmem_swapin().
> 

Will do. Many thanks.

> Best Regards,
> Huang, Ying
> 
>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/swap_state.c | 11 +--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 3bf0d0c297bc..eba6b0cf6cf9 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -626,12 +626,17 @@ struct page *swap_cluster_readahead(swp_entry_t entry, 
>> gfp_t gfp_mask,
>>  unsigned long offset = entry_offset;
>>  unsigned long start_offset, end_offset;
>>  unsigned long mask;
>> -struct swap_info_struct *si = swp_swap_info(entry);
>> +struct swap_info_struct *si;
>>  struct blk_plug plug;
>>  bool do_poll = true, page_allocated;
>>  struct vm_area_struct *vma = vmf->vma;
>>  unsigned long addr = vmf->address;
>>  
>> +si = get_swap_device(entry);
>> +/* In case we raced with swapoff. */
>> +if (!si)
>> +return NULL;
>> +
>>  mask = swapin_nr_pages(offset) - 1;
>>  if (!mask)
>>  goto skip;
>> @@ -673,7 +678,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, 
>> gfp_t gfp_mask,
>>  
>>  lru_add_drain();/* Push any new pages onto the LRU now */
>>  skip:
>> -return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
>> +page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
>> +put_swap_device(si);
>> +return page;
>>  }
>>  
>>  int init_swap_address_space(unsigned int type, unsigned long nr_pages)
> .
> 
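
For context, "fixing the caller" on the shmem side would look roughly like
the sketch below (a hypothetical illustration of the idea, not a posted
patch; the shmem_swapin() body is simplified from the existing code plus the
get/put pair):

    static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
                                     struct shmem_inode_info *info, pgoff_t index)
    {
        struct vm_area_struct pvma;
        struct vm_fault vmf = { .vma = &pvma, };
        struct swap_info_struct *si;
        struct page *page;

        /* Prevent swapoff from happening to us. */
        si = get_swap_device(swap);
        if (unlikely(!si))
            return NULL;

        shmem_pseudo_vma_init(&pvma, info, index);
        page = swap_cluster_readahead(swap, gfp, &vmf);
        shmem_pseudo_vma_destroy(&pvma);

        put_swap_device(si);
        return page;
    }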



Re: [PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff

2021-04-13 Thread Miaohe Lin
On 2021/4/13 9:33, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> The function get_shadow_from_swap_cache() can race with swapoff, though
>> it's only called by do_swap_page() now.
>>
>> Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous 
>> LRU")
>> Signed-off-by: Miaohe Lin 
> 
> This is unnecessary.  The only caller has guaranteed the swap device
> from swapoff

Ok. This patch is just in case get_shadow_from_swap_cache() would be
called from elsewhere someday.

It's unnecessary and can be dropped now.

> 
> Best Regards,
> Huang, Ying
> 
>> ---
>>  mm/swap_state.c | 9 ++---
>>  1 file changed, 6 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 272ea2108c9d..709c260d644a 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -83,11 +83,14 @@ void show_swap_cache_info(void)
>>  
>>  void *get_shadow_from_swap_cache(swp_entry_t entry)
>>  {
>> -struct address_space *address_space = swap_address_space(entry);
>> -pgoff_t idx = swp_offset(entry);
>> +struct swap_info_struct *si;
>>  struct page *page;
>>  
>> -page = xa_load(_space->i_pages, idx);
>> +si = get_swap_device(entry);
>> +if (!si)
>> +return NULL;
>> +page = xa_load(_address_space(entry)->i_pages, swp_offset(entry));
>> +put_swap_device(si);
>>  if (xa_is_value(page))
>>  return page;
>>  return NULL;
> 
> .
> 



Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-13 Thread Miaohe Lin
On 2021/4/14 9:04, Huang, Ying wrote:
> Tim Chen  writes:
> 
>> On 4/12/21 6:27 PM, Huang, Ying wrote:
>>
>>>
>>> This isn't the commit that introduces the race.  You can use `git blame`
>>> find out the correct commit.  For this it's commit 0bcac06f27d7 "mm,
>>> swap: skip swapcache for swapin of synchronous device".
>>>
>>> And I suggest to merge 1/5 and 2/5 to make it easy to get the full
>>> picture.
>>
>> I'll suggest make fix to do_swap_page race with get/put_swap_device
>> as a first patch. Then the per_cpu_ref stuff in patch 1 and patch 2 can
>> be combined together.
> 
> The original get/put_swap_device() use rcu_read_lock/unlock().  I don't
> think it's good to wrap swap_read_page() with it.  After all, some
> complex operations are done in swap_read_page(), including
> blk_io_schedule().
> 

The patch was split to make it easier to review originally, i.e. 1/5 introduces
the percpu_ref to swap and 2/5 uses it to fix the race between do_swap_page()
and swapoff.
Btw, I have no preference for merging 1/5 and 2/5 or not.

> Best Regards,
> Huang, Ying
> .
> 



Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-13 Thread Miaohe Lin
On 2021/4/14 9:17, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/12 15:24, Huang, Ying wrote:
>>> "Huang, Ying"  writes:
>>>
>>>> Miaohe Lin  writes:
>>>>
>>>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>>>> patch adds the percpu_ref support for later fixup.
>>>>>
>>>>> Signed-off-by: Miaohe Lin 
>>>>> ---
>>>>>  include/linux/swap.h |  2 ++
>>>>>  mm/swapfile.c| 25 ++---
>>>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>>>> index 144727041e78..849ba5265c11 100644
>>>>> --- a/include/linux/swap.h
>>>>> +++ b/include/linux/swap.h
>>>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>>>   * The in-memory structure used to track swap areas.
>>>>>   */
>>>>>  struct swap_info_struct {
>>>>> + struct percpu_ref users;/* serialization against concurrent 
>>>>> swapoff */
>>>>>   unsigned long   flags;  /* SWP_USED etc: see above */
>>>>>   signed shortprio;   /* swap priority of this type */
>>>>>   struct plist_node list; /* entry in swap_active_head */
>>>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>>>>   struct block_device *bdev;  /* swap device or bdev of swap file */
>>>>>   struct file *swap_file; /* seldom referenced */
>>>>>   unsigned int old_block_size;/* seldom referenced */
>>>>> + struct completion comp; /* seldom referenced */
>>>>>  #ifdef CONFIG_FRONTSWAP
>>>>>   unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
>>>>>   atomic_t frontswap_pages;   /* frontswap pages in-use counter */
>>>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>>>> index 149e77454e3c..724173cd7d0c 100644
>>>>> --- a/mm/swapfile.c
>>>>> +++ b/mm/swapfile.c
>>>>> @@ -39,6 +39,7 @@
>>>>>  #include 
>>>>>  #include 
>>>>>  #include 
>>>>> +#include 
>>>>>  
>>>>>  #include 
>>>>>  #include 
>>>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct 
>>>>> *work)
>>>>>   spin_unlock(>lock);
>>>>>  }
>>>>>  
>>>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>>>> +{
>>>>> + struct swap_info_struct *si;
>>>>> +
>>>>> + si = container_of(ref, struct swap_info_struct, users);
>>>>> + complete(>comp);
>>>>> + percpu_ref_exit(>users);
>>>>
>>>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>>>> get_swap_device(), better to add comments there.
>>>
>>> I just noticed that the comments of percpu_ref_tryget_live() says,
>>>
>>>  * This function is safe to call as long as @ref is between init and exit.
>>>
>>> While we need to call get_swap_device() almost at any time, so it's
>>> better to avoid to call percpu_ref_exit() at all.  This will waste some
>>> memory, but we need to follow the API definition to avoid potential
>>> issues in the long term.
>>
>> I have to admit that I'am not really familiar with percpu_ref. So I read the
>> implementation code of the percpu_ref and found percpu_ref_tryget_live() 
>> could
>> be called after exit now. But you're right we need to follow the API 
>> definition
>> to avoid potential issues in the long term.
>>
>>>
>>> And we need to call percpu_ref_init() before insert the swap_info_struct
>>> into the swap_info[].
>>
>> If we remove the call to percpu_ref_exit(), we should not use 
>> percpu_ref_init()
>> here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init() 
>> while
>> this is not the case as we do not call percpu_ref_exit(). Maybe 
>> percpu_ref_reinit()
>> or percpu_ref_resurrect() will do the work.
>>
>> One more thing, how could I distinguish the killed percpu_ref from newly 
>> allocated one?
>> It seems percpu_ref_is_dying is only safe to call when @ref is between init 
>> and exit.
>> Maybe I could do this in alloc_swap_info()?
>

Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-13 Thread Miaohe Lin
On 2021/4/12 15:24, Huang, Ying wrote:
> "Huang, Ying"  writes:
> 
>> Miaohe Lin  writes:
>>
>>> We will use percpu-refcount to serialize against concurrent swapoff. This
>>> patch adds the percpu_ref support for later fixup.
>>>
>>> Signed-off-by: Miaohe Lin 
>>> ---
>>>  include/linux/swap.h |  2 ++
>>>  mm/swapfile.c| 25 ++---
>>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 144727041e78..849ba5265c11 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>>   * The in-memory structure used to track swap areas.
>>>   */
>>>  struct swap_info_struct {
>>> +   struct percpu_ref users;/* serialization against concurrent 
>>> swapoff */
>>> unsigned long   flags;  /* SWP_USED etc: see above */
>>> signed shortprio;   /* swap priority of this type */
>>> struct plist_node list; /* entry in swap_active_head */
>>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>> struct block_device *bdev;  /* swap device or bdev of swap file */
>>> struct file *swap_file; /* seldom referenced */
>>> unsigned int old_block_size;/* seldom referenced */
>>> +   struct completion comp; /* seldom referenced */
>>>  #ifdef CONFIG_FRONTSWAP
>>> unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
>>> atomic_t frontswap_pages;   /* frontswap pages in-use counter */
>>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>>> index 149e77454e3c..724173cd7d0c 100644
>>> --- a/mm/swapfile.c
>>> +++ b/mm/swapfile.c
>>> @@ -39,6 +39,7 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>  
>>>  #include 
>>>  #include 
>>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>> spin_unlock(>lock);
>>>  }
>>>  
>>> +static void swap_users_ref_free(struct percpu_ref *ref)
>>> +{
>>> +   struct swap_info_struct *si;
>>> +
>>> +   si = container_of(ref, struct swap_info_struct, users);
>>> +   complete(>comp);
>>> +   percpu_ref_exit(>users);
>>
>> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
>> get_swap_device(), better to add comments there.
> 
> I just noticed that the comments of percpu_ref_tryget_live() says,
> 
>  * This function is safe to call as long as @ref is between init and exit.
> 
> While we need to call get_swap_device() almost at any time, so it's
> better to avoid to call percpu_ref_exit() at all.  This will waste some
> memory, but we need to follow the API definition to avoid potential
> issues in the long term.

I have to admit that I'm not really familiar with percpu_ref. So I read the
implementation code of percpu_ref and found that percpu_ref_tryget_live()
could currently be called after exit. But you're right, we need to follow the
API definition to avoid potential issues in the long term.

> 
> And we need to call percpu_ref_init() before insert the swap_info_struct
> into the swap_info[].

If we remove the call to percpu_ref_exit(), we should not use percpu_ref_init()
here because *percpu_ref->data is assumed to be NULL* in percpu_ref_init(),
while this is not the case as we do not call percpu_ref_exit(). Maybe
percpu_ref_reinit() or percpu_ref_resurrect() will do the work.

One more thing, how could I distinguish a killed percpu_ref from a newly
allocated one? It seems percpu_ref_is_dying() is only safe to call when @ref
is between init and exit. Maybe I could do this in alloc_swap_info()?

> 
>>> +}
>>> +
>>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>>  {
>>> struct swap_cluster_info *ci = si->cluster_info;
>>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct 
>>> *p, int prio,
>>>  * Guarantee swap_map, cluster_info, etc. fields are valid
>>>  * between get/put_swap_device() if SWP_VALID bit is set
>>>  */
>>> -   synchronize_rcu();
>>> +   percpu_ref_reinit(>users);
>>
>> Although the effect is same, I think it's better to use
>> percpu_ref_resurrect() here to improve code readability.
> 
> Check the original commit description for commit eb085574a752 "mm, swap:
> fix race between swapoff 
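
For what it's worth, the v2 series (shown earlier in this archive) settles
this by never calling percpu_ref_exit() and by doing the init only once per
swap_info_struct slot, roughly:

    if (!p->ref_initialized) {
        error = percpu_ref_init(&p->users, swap_users_ref_free,
                                PERCPU_REF_INIT_DEAD, GFP_KERNEL);
        if (unlikely(error))
            goto bad_swap;
        init_completion(&p->comp);
        p->ref_initialized = true;  /* reused slots are only resurrected later */
    }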

Re: [PATCH v2 5/5] mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()

2021-04-12 Thread Miaohe Lin
On 2021/4/13 2:51, Mike Kravetz wrote:
> On 4/10/21 12:23 AM, Miaohe Lin wrote:
>> The local variable pseudo_vma is not used anymore.
>>
>> Signed-off-by: Miaohe Lin 
> 
> Thanks,
> 
> That should have been removed with 1b426bac66e6 ("hugetlb: use same fault
> hash key for shared and private mappings").
> 

Yep. Many thanks for Reviewed-by tag! :)

> Reviewed-by: Mike Kravetz 
> 



Re: [PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-12 Thread Miaohe Lin
On 2021/4/12 11:30, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> We will use percpu-refcount to serialize against concurrent swapoff. This
>> patch adds the percpu_ref support for later fixup.
>>
>> Signed-off-by: Miaohe Lin 
>> ---
>>  include/linux/swap.h |  2 ++
>>  mm/swapfile.c| 25 ++---
>>  2 files changed, 24 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 144727041e78..849ba5265c11 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -240,6 +240,7 @@ struct swap_cluster_list {
>>   * The in-memory structure used to track swap areas.
>>   */
>>  struct swap_info_struct {
>> +struct percpu_ref users;/* serialization against concurrent 
>> swapoff */
>>  unsigned long   flags;  /* SWP_USED etc: see above */
>>  signed shortprio;   /* swap priority of this type */
>>  struct plist_node list; /* entry in swap_active_head */
>> @@ -260,6 +261,7 @@ struct swap_info_struct {
>>  struct block_device *bdev;  /* swap device or bdev of swap file */
>>  struct file *swap_file; /* seldom referenced */
>>  unsigned int old_block_size;/* seldom referenced */
>> +struct completion comp; /* seldom referenced */
>>  #ifdef CONFIG_FRONTSWAP
>>  unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
>>  atomic_t frontswap_pages;   /* frontswap pages in-use counter */
>> diff --git a/mm/swapfile.c b/mm/swapfile.c
>> index 149e77454e3c..724173cd7d0c 100644
>> --- a/mm/swapfile.c
>> +++ b/mm/swapfile.c
>> @@ -39,6 +39,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  
>>  #include 
>>  #include 
>> @@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
>>  spin_unlock(>lock);
>>  }
>>  
>> +static void swap_users_ref_free(struct percpu_ref *ref)
>> +{
>> +struct swap_info_struct *si;
>> +
>> +si = container_of(ref, struct swap_info_struct, users);
>> +complete(>comp);
>> +percpu_ref_exit(>users);
> 
> Because percpu_ref_exit() is used, we cannot use percpu_ref_tryget() in
> get_swap_device(), better to add comments there.

Will do.

> 
>> +}
>> +
>>  static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
>>  {
>>  struct swap_cluster_info *ci = si->cluster_info;
>> @@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct 
>> *p, int prio,
>>   * Guarantee swap_map, cluster_info, etc. fields are valid
>>   * between get/put_swap_device() if SWP_VALID bit is set
>>   */
>> -synchronize_rcu();
>> +percpu_ref_reinit(>users);
> 
> Although the effect is same, I think it's better to use
> percpu_ref_resurrect() here to improve code readability.
> 

Agree.

>>  spin_lock(_lock);
>>  spin_lock(>lock);
>>  _enable_swap_info(p);
>> @@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
>> specialfile)
>>  p->flags &= ~SWP_VALID; /* mark swap device as invalid */
>>  spin_unlock(>lock);
>>  spin_unlock(_lock);
>> +
>> +percpu_ref_kill(>users);
>>  /*
>>   * wait for swap operations protected by get/put_swap_device()
>>   * to complete
>>   */
>> -synchronize_rcu();
>> +wait_for_completion(>comp);
> 
> Better to move percpu_ref_kill() after the comments.  And maybe revise
> the comments.

Will do.

> 
>>  
>>  flush_work(>discard_work);
>>  
>> @@ -3132,7 +3144,7 @@ static bool swap_discardable(struct swap_info_struct 
>> *si)
>>  SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>>  {
>>  struct swap_info_struct *p;
>> -struct filename *name;
>> +struct filename *name = NULL;
>>  struct file *swap_file = NULL;
>>  struct address_space *mapping;
>>  int prio;
>> @@ -3163,6 +3175,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, 
>> specialfile, int, swap_flags)
>>  
>>  INIT_WORK(>discard_work, swap_discard_work);
>>  
>> +init_completion(>comp);
>> +error = percpu_ref_init(>users, swap_users_ref_free,
>> +PERCPU_REF_INIT_DEAD, GFP_KERNEL);
>> +if (unlikely(error))
>> +goto bad_swap;
>> +
>>  name = getname(specialfile);
>>  if (IS_ERR(name)) {
>>  error = PTR_ERR(name);
>> @@ -3356,6 +3374,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, 
>> specialfile, int, swap_flags)
>>  bad_swap_unlock_inode:
>>  inode_unlock(inode);
>>  bad_swap:
>> +percpu_ref_exit(>users);
> 
> Usually the resource freeing order matches their allocating order
> reversely.  So, if there's no special reason, please follow that rule.
> 

My oversight. Will fix it in V2.

> Best Regards,
> Huang, Ying
> 
>>  free_percpu(p->percpu_cluster);
>>  p->percpu_cluster = NULL;
>>  free_percpu(p->cluster_next_cpu);
> .
> 

Many thanks for review and nice suggestion! :)


Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-11 Thread Miaohe Lin
On 2021/4/12 9:44, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/10 1:17, Tim Chen wrote:
>>>
>>>
>>> On 4/9/21 1:42 AM, Miaohe Lin wrote:
>>>> On 2021/4/9 5:34, Tim Chen wrote:
>>>>>
>>>>>
>>>>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>>>>> When I was investigating the swap code, I found the below possible race
>>>>>> window:
>>>>>>
>>>>>> CPU 1CPU 2
>>>>>> --
>>>>>> do_swap_page
>>>>>>   synchronous swap_readpage
>>>>>> alloc_page_vma
>>>>>>  swapoff
>>>>>>release swap_file, bdev, or ...
>>>>>
>>>>
>>>> Many thanks for quick review and reply!
>>>>
>>>>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>>>>> happens after we have cleared the SWP_VALID bit in si->flags in 
>>>>> destroy_swap_extents
>>>>> if I read the swapoff code correctly.
>>>> Agree. Let's look this more close:
>>>> CPU1                                           CPU2
>>>> -----                                          -----
>>>> swap_readpage
>>>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>>>                                                swapoff
>>>>                                                  p->swap_file = NULL;
>>>>   struct file *swap_file = sis->swap_file;
>>>>   struct address_space *mapping = swap_file->f_mapping;[oops!]
>>>>                                                  ...
>>>>                                                  p->flags = 0;
>>>> ...
>>>>
>>>> Does this make sense for you?
>>>
>>> p->swapfile = NULL happens after the 
>>> p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence 
>>> in swapoff().
>>>
>>> So I don't think the sequence you illustrated on CPU2 is in the right order.
>>> That said, without get_swap_device/put_swap_device in swap_readpage, you 
>>> could
>>> potentially blow pass synchronize_rcu() on CPU2 and causes a problem.  so I 
>>> think
>>> the problematic race looks something like the following:
>>>
>>>
>>> CPU1                                          CPU2
>>> -----                                         -----
>>> swap_readpage
>>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>>                                               swapoff
>>>                                                 p->flags &= ~SWP_VALID;
>>>                                                 ..
>>>                                                 synchronize_rcu();
>>>                                                 ..
>>>                                                 p->swap_file = NULL;
>>>   struct file *swap_file = sis->swap_file;
>>>   struct address_space *mapping = swap_file->f_mapping;[oops!]
>>>                                                 ...
>>> ...
>>>
>>
>> Agree. This is also what I meant to illustrate. And you provide a better 
>> one. Many thanks!
> 
> For the pages that are swapped in through swap cache.  That isn't an
> issue.  Because the page is locked, the swap entry will be marked with
> SWAP_HAS_CACHE, so swapoff() cannot proceed until the page has been
> unlocked.
> 
> So the race is for the fast path as follows,
> 
>   if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
>   __swap_count(entry) == 1)
> 
> I found it in your original patch description.  But please make it more
> explicit to reduce the potential confusing.

Sure. Should I rephrase the commit log to clarify this or add a comment in the 
code?

Thanks.

> 
> Best Regards,
> Huang, Ying
> .
> 
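
To spell the fast path out (a simplified sketch of the swapcache-bypass
branch in do_swap_page() before the fix, based on the lines quoted above;
error checks are elided):

    if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
        __swap_count(entry) == 1) {
        /* skip swap cache */
        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
        /* swapoff can run to completion anywhere in this window ... */
        swap_readpage(page, true);  /* may touch si->swap_file / si->bdev
                                     * after swapoff has released them */
    }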



Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()

2021-04-11 Thread Miaohe Lin
On 2021/4/12 8:55, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/4/9 16:50, Huang, Ying wrote:
>>> Miaohe Lin  writes:
>>>
>>>> While we released the pte lock, somebody else might faulted in this pte.
>>>> So we should check whether it's swap pte first to guard against such race
>>>> or swp_type would be unexpected. And we can also avoid some unnecessary
>>>> readahead cpu cycles possibly.
>>>>
>>>> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  mm/swap_state.c | 13 +
>>>>  1 file changed, 9 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>>>> index 709c260d644a..3bf0d0c297bc 100644
>>>> --- a/mm/swap_state.c
>>>> +++ b/mm/swap_state.c
>>>> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>>  {
>>>>struct vm_area_struct *vma = vmf->vma;
>>>>unsigned long ra_val;
>>>> -  swp_entry_t entry;
>>>> +  swp_entry_t swap_entry;
>>>>unsigned long faddr, pfn, fpfn;
>>>>unsigned long start, end;
>>>> -  pte_t *pte, *orig_pte;
>>>> +  pte_t *pte, *orig_pte, entry;
>>>>unsigned int max_win, hits, prev_win, win, left;
>>>>  #ifndef CONFIG_64BIT
>>>>pte_t *tpte;
>>>> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>>>>  
>>>>faddr = vmf->address;
>>>>orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>>>> -  entry = pte_to_swp_entry(*pte);
>>>> -  if ((unlikely(non_swap_entry(entry {
>>>> +  entry = *pte;
>>>> +  if (unlikely(!is_swap_pte(entry))) {
>>>> +  pte_unmap(orig_pte);
>>>> +  return;
>>>> +  }
>>>> +  swap_entry = pte_to_swp_entry(entry);
>>>> +  if ((unlikely(non_swap_entry(swap_entry {
>>>>pte_unmap(orig_pte);
>>>>return;
>>>>}
>>>
>>> This isn't a real issue.  entry or swap_entry isn't used in this
>>
>> Agree. It seems the entry or swap_entry here is just used for check whether
>> pte is still valid swap_entry.
> 
> If you check the git history, you will find that the check has been
> necessary before.  Because the function is used earlier in
> do_swap_page() at that time.
> 

I see. Many thanks for explanation. :)

> Best Regards,
> Huang, Ying
> .
> 



Re: [Question] Is there a race window between swapoff vs synchronous swap_readpage

2021-04-11 Thread Miaohe Lin
On 2021/3/30 15:27, Yu Zhao wrote:
> On Tue, Mar 30, 2021 at 12:57 AM Huang, Ying  wrote:
>>
>> Yu Zhao  writes:
>>
>>> On Mon, Mar 29, 2021 at 9:44 PM Huang, Ying  wrote:
>>>>
>>>> Miaohe Lin  writes:
>>>>
>>>>> On 2021/3/30 9:57, Huang, Ying wrote:
>>>>>> Hi, Miaohe,
>>>>>>
>>>>>> Miaohe Lin  writes:
>>>>>>
>>>>>>> Hi all,
>>>>>>> I am investigating the swap code, and I found the below possible race 
>>>>>>> window:
>>>>>>>
>>>>>>> CPU 1   CPU 2
>>>>>>> -   -
>>>>>>> do_swap_page
>>>>>>>   skip swapcache case (synchronous swap_readpage)
>>>>>>> alloc_page_vma
>>>>>>> swapoff
>>>>>>>   release 
>>>>>>> swap_file, bdev, or ...
>>>>>>>   swap_readpage
>>>>>>> check sis->flags is ok
>>>>>>>   access swap_file, bdev or ...[oops!]
>>>>>>> si->flags = 0
>>>>>>>
>>>>>>> The swapcache case is ok because swapoff will wait on the page_lock of 
>>>>>>> swapcache page.
>>>>>>> Is this will really happen or Am I miss something ?
>>>>>>> Any reply would be really grateful. Thanks! :)
>>>>>>
>>>>>> This appears possible.  Even for swapcache case, we can't guarantee the
>>>>>
>>>>> Many thanks for reply!
>>>>>
>>>>>> swap entry gotten from the page table is always valid too.  The
>>>>>
>>>>> The page table may change at any time. And we may thus do some useless 
>>>>> work.
>>>>> But the pte_same() check could handle these races correctly if these do 
>>>>> not
>>>>> result in oops.
>>>>>
>>>>>> underlying swap device can be swapped off at the same time.  So we use
>>>>>> get/put_swap_device() for that.  Maybe we need similar stuff here.
>>>>>
>>>>> Using get/put_swap_device() to guard against swapoff for swap_readpage() 
>>>>> sounds
>>>>> really bad as swap_readpage() may take really long time. Also such race 
>>>>> may not be
>>>>> really hurtful because swapoff is usually done when system shutdown only.
>>>>> I can not figure some simple and stable stuff out to fix this. Any 
>>>>> suggestions or
>>>>> could anyone help get rid of such race?
>>>>
>>>> Some reference counting on the swap device can prevent swap device from
>>>> swapping-off.  To reduce the performance overhead on the hot-path as
>>>> much as possible, it appears we can use the percpu_ref.
>>>
>>> Hi,
>>>
>>> I've been seeing crashes when testing the latest kernels with
>>>   stress-ng --class vm -a 20 -t 600s --temp-path /tmp
>>>
>>> I haven't had time to look into them yet:
>>>
>>> DEBUG_VM:
>>>   BUG: unable to handle page fault for address: 905c33c9a000
>>>   Call Trace:
>>>get_swap_pages+0x278/0x590
>>>get_swap_page+0x1ab/0x280
>>>add_to_swap+0x7d/0x130
>>>shrink_page_list+0xf84/0x25f0
>>>reclaim_pages+0x313/0x430
>>>madvise_cold_or_pageout_pte_range+0x95c/0xaa0
>>
>> If my understanding were correct, two bugs are reported?  One above and
>> one below?  If so, and the above one is reported firstly.  Can you share
>> the full bug message reported in dmesg?
> 
> No, they are from two different kernel configs. I saw the first crash
> and didn't know what to look. So I turned on KASAN to see if it gives
> more clue. Unfortunately I haven't had time to spend more time on it.
> 
>> Can you convert the call trace to source line?  And the commit of the
>> kernel?  Or the full kconfig?  So I can build it by myself.
> 
> It seems to be very reproducible if you enable these three options, on
> 5.12, 5.11, 5.10 which is where I gave up trying.
> 
>>> CONFIG_MEMCG_SWAP=y
>>> CONFIG_THP_SWAP=y
>>> CONFIG_
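
Regarding the hot-path overhead mentioned above: in percpu mode a percpu_ref
get is roughly a per-CPU increment under RCU, which is why it was suggested
over a plain atomic refcount. A conceptual sketch of what
percpu_ref_tryget_live() does (the helper names below are made up for
illustration and are not the real internals):

    bool ret = false;

    rcu_read_lock();
    if (ref_in_percpu_mode(ref)) {          /* hypothetical helper */
        this_cpu_inc(*percpu_count);        /* cheap, no shared cacheline bouncing */
        ret = true;
    } else if (!ref_is_dead(ref)) {         /* hypothetical helper */
        ret = atomic_long_inc_not_zero(ref_atomic_count(ref)); /* slow path */
    }
    rcu_read_unlock();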

Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

2021-04-11 Thread Miaohe Lin
On 2021/4/11 4:02, Yu Zhao wrote:
> On Sat, Apr 10, 2021 at 5:01 AM Miaohe Lin  wrote:
>>
>> On 2021/4/10 18:42, Yu Zhao wrote:
>>> On Sat, Apr 10, 2021 at 1:30 AM Miaohe Lin  wrote:
>>>>
>>>> Hi:
>>>> On 2021/4/5 18:20, Miaohe Lin wrote:
>>>>> frontswap_register_ops can race with swapon. Consider the following scene:
>>>>
>>>> Any comment or suggestion? Or is this race window too theoretical to fix?
>>>> Thanks.
>>>
>>> Let me run a stress test and get back to you (within a day or so).
>>
>> That's very kind of you. Many thanks!
> 
> I'm still getting the following crash. Probably I should try the other
> series you sent earlier too?
> 
>   BUG: unable to handle page fault for address: aa7937d82000
>   RIP: 0010:scan_swap_map_slots+0x12b/0x7f0
>   Call Trace:
>   get_swap_pages+0x278/0x590
>get_swap_page+0x1ab/0x280
>add_to_swap+0x7d/0x130
>shrink_page_list+0xf84/0x25f0
>reclaim_pages+0x313/0x430
>   madvise_cold_or_pageout_pte_range+0x95c/0xaa0
>walk_p4d_range+0x43f/0x790
>walk_pgd_range+0xf1/0x150
>__walk_page_range+0x6f/0x1b0
>walk_page_range+0xbe/0x1e
>do_madvise+0x271/0x720

Sorry about it! This patch fixes the frontswap_register_ops() race with
swapon/swapoff, and the earlier patch fixes the race I found when I was
investigating the swap code. So this crash may not be addressed by them.

> 
>>>>> CPU1  CPU2
>>>>>   
>>>>> frontswap_register_ops
>>>>>   fill bitmap a
>>>>>   ops->init
>>>>>   sys_swapon
>>>>> enable_swap_info
>>>>>   frontswap_init without new ops
>>>>>   add ops to frontswap_ops list
>>>>>   check if swap_active_head changed
>>>>>   add to swap_active_head
>>>>>
>>>>> So the frontswap_ops init is missed on the new swap device. Consider the
>>>>> another scene:
>>>>> CPU1CPU2
>>>>> 
>>>>> frontswap_register_ops
>>>>>   fill bitmap a
>>>>>   ops->init
>>>>>   add ops to frontswap_ops list
>>>>> sys_swapon
>>>>>   enable_swap_info
>>>>> frontswap_init with new ops
>>>>> add to swap_active_head
>>>>>   check if swap_active_head changed
>>>>>   ops->init for new swap device [twice!]
>>>>>
>>>>> The frontswap_ops init will be called two times on the new swap device 
>>>>> this
>>>>> time. frontswap_register_ops can also race with swapoff. Consider the
>>>>> following scene:
>>>>>
>>>>> CPU1CPU2
>>>>> 
>>>>>         sys_swapoff
>>>>> removed from swap_active_head
>>>>> frontswap_register_ops
>>>>>   fill bitmap a
>>>>>   ops->init without swap device
>>>>>   add ops to frontswap_ops list
>>>>> invalidate_area with new ops
>>>>>   check if swap_active_head changed
>>>>>
>>>>> We could call invalidate_area on a swap device under swapoff with 
>>>>> frontswap
>>>>> is uninitialized yet. Fix all these by using swapon_mutex to guard against
>>>>> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
>>>>> devices under swapoff.
>>>>>
>>>>> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
>>>>> Signed-off-by: Miaohe Lin 
>>>>> ---
>>>>>  include/linux/swapfile.h |  2 ++
>>>>>  mm/frontswap.c   | 40 +---
>>>>>  mm/swapfile.c| 13 -
>>>>>  3 files changed, 31 insertions(+), 24 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/swapfile.h 

Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

2021-04-10 Thread Miaohe Lin
On 2021/4/10 18:42, Yu Zhao wrote:
> On Sat, Apr 10, 2021 at 1:30 AM Miaohe Lin  wrote:
>>
>> Hi:
>> On 2021/4/5 18:20, Miaohe Lin wrote:
>>> frontswap_register_ops can race with swapon. Consider the following scene:
>>
>> Any comment or suggestion? Or is this race window too theoretical to fix?
>> Thanks.
> 
> Let me run a stress test and get back to you (within a day or so).

That's very kind of you. Many thanks!

> 
>>>
>>> CPU1  CPU2
>>>   
>>> frontswap_register_ops
>>>   fill bitmap a
>>>   ops->init
>>>   sys_swapon
>>> enable_swap_info
>>>   frontswap_init without new ops
>>>   add ops to frontswap_ops list
>>>   check if swap_active_head changed
>>>   add to swap_active_head
>>>
>>> So the frontswap_ops init is missed on the new swap device. Consider the
>>> another scene:
>>> CPU1CPU2
>>> 
>>> frontswap_register_ops
>>>   fill bitmap a
>>>   ops->init
>>>   add ops to frontswap_ops list
>>> sys_swapon
>>>   enable_swap_info
>>> frontswap_init with new ops
>>> add to swap_active_head
>>>   check if swap_active_head changed
>>>   ops->init for new swap device [twice!]
>>>
>>> The frontswap_ops init will be called two times on the new swap device this
>>> time. frontswap_register_ops can also race with swapoff. Consider the
>>> following scene:
>>>
>>> CPU1CPU2
>>> 
>>> sys_swapoff
>>> removed from swap_active_head
>>> frontswap_register_ops
>>>   fill bitmap a
>>>   ops->init without swap device
>>>   add ops to frontswap_ops list
>>>         invalidate_area with new ops
>>>   check if swap_active_head changed
>>>
>>> We could call invalidate_area on a swap device under swapoff with frontswap
>>> is uninitialized yet. Fix all these by using swapon_mutex to guard against
>>> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
>>> devices under swapoff.
>>>
>>> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
>>> Signed-off-by: Miaohe Lin 
>>> ---
>>>  include/linux/swapfile.h |  2 ++
>>>  mm/frontswap.c   | 40 +---
>>>  mm/swapfile.c| 13 -
>>>  3 files changed, 31 insertions(+), 24 deletions(-)
>>>
>>> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
>>> index e06febf62978..7ae15d917828 100644
>>> --- a/include/linux/swapfile.h
>>> +++ b/include/linux/swapfile.h
>>> @@ -9,8 +9,10 @@
>>>  extern spinlock_t swap_lock;
>>>  extern struct plist_head swap_active_head;
>>>  extern struct swap_info_struct *swap_info[];
>>> +extern struct mutex swapon_mutex;
>>>  extern int try_to_unuse(unsigned int, bool, unsigned long);
>>>  extern unsigned long generic_max_swapfile_size(void);
>>>  extern unsigned long max_swapfile_size(void);
>>> +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
>>>
>>>  #endif /* _LINUX_SWAPFILE_H */
>>> diff --git a/mm/frontswap.c b/mm/frontswap.c
>>> index 130e301c5ac0..c16bfc7550b5 100644
>>> --- a/mm/frontswap.c
>>> +++ b/mm/frontswap.c
>>> @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>>>
>>>   bitmap_zero(a, MAX_SWAPFILES);
>>>   bitmap_zero(b, MAX_SWAPFILES);
>>> -
>>> + mutex_lock(_mutex);
>>>   spin_lock(_lock);
>>>   plist_for_each_entry(si, _active_head, list) {
>>>   if (!WARN_ON(!si->frontswap_map))
>>>   set_bit(si->type, a);
>>>   }
>>> + /*
>>> +  * There might be some swap devices under swapoff, i.e. they are
>>

Re: [PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

2021-04-10 Thread Miaohe Lin
Hi:
On 2021/4/5 18:20, Miaohe Lin wrote:
> frontswap_register_ops can race with swapon. Consider the following scene:

Any comment or suggestion? Or is this race window too theoretical to fix?
Thanks.

> 
> CPU1  CPU2
>   
> frontswap_register_ops
>   fill bitmap a
>   ops->init
>   sys_swapon
> enable_swap_info
>   frontswap_init without new ops
>   add ops to frontswap_ops list
>   check if swap_active_head changed
>   add to swap_active_head
> 
> So the frontswap_ops init is missed on the new swap device. Consider the
> another scene:
> CPU1CPU2
> 
> frontswap_register_ops
>   fill bitmap a
>   ops->init
>   add ops to frontswap_ops list
> sys_swapon
>   enable_swap_info
> frontswap_init with new ops
> add to swap_active_head
>   check if swap_active_head changed
>   ops->init for new swap device [twice!]
> 
> The frontswap_ops init will be called two times on the new swap device this
> time. frontswap_register_ops can also race with swapoff. Consider the
> following scene:
> 
> CPU1CPU2
> 
> sys_swapoff
> removed from swap_active_head
> frontswap_register_ops
>   fill bitmap a
>   ops->init without swap device
>   add ops to frontswap_ops list
> invalidate_area with new ops
>   check if swap_active_head changed
> 
> We could call invalidate_area on a swap device under swapoff with frontswap
> is uninitialized yet. Fix all these by using swapon_mutex to guard against
> race with swapon and add swap_info_get_if_under_swapoff() to collect swap
> devices under swapoff.
> 
> Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
> Signed-off-by: Miaohe Lin 
> ---
>  include/linux/swapfile.h |  2 ++
>  mm/frontswap.c   | 40 +---
>  mm/swapfile.c| 13 -
>  3 files changed, 31 insertions(+), 24 deletions(-)
> 
> diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
> index e06febf62978..7ae15d917828 100644
> --- a/include/linux/swapfile.h
> +++ b/include/linux/swapfile.h
> @@ -9,8 +9,10 @@
>  extern spinlock_t swap_lock;
>  extern struct plist_head swap_active_head;
>  extern struct swap_info_struct *swap_info[];
> +extern struct mutex swapon_mutex;
>  extern int try_to_unuse(unsigned int, bool, unsigned long);
>  extern unsigned long generic_max_swapfile_size(void);
>  extern unsigned long max_swapfile_size(void);
> +extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
>  
>  #endif /* _LINUX_SWAPFILE_H */
> diff --git a/mm/frontswap.c b/mm/frontswap.c
> index 130e301c5ac0..c16bfc7550b5 100644
> --- a/mm/frontswap.c
> +++ b/mm/frontswap.c
> @@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>  
>   bitmap_zero(a, MAX_SWAPFILES);
>   bitmap_zero(b, MAX_SWAPFILES);
> -
> + mutex_lock(&swapon_mutex);
>   spin_lock(&swap_lock);
>   plist_for_each_entry(si, &swap_active_head, list) {
>   if (!WARN_ON(!si->frontswap_map))
>   set_bit(si->type, a);
>   }
> + /*
> +  * There might be some swap devices under swapoff, i.e. they are
> +  * removed from swap_active_head but frontswap_invalidate_area()
> +  * is not called yet due to swapon_mutex is held here. We must
> +  * collect these swap devices and call ops->init on them or they
> +  * might invalidate frontswap area while frontswap is uninitialized.
> +  */
> + for_each_clear_bit(i, a, MAX_SWAPFILES) {
> + si = swap_info_get_if_under_swapoff(i);
> + if (!si || !si->frontswap_map)
> + continue;
> + set_bit(si->type, b);
> + }
> + bitmap_or(a, a, b, MAX_SWAPFILES);
>   spin_unlock(&swap_lock);
>  
>   /* the new ops needs to know the currently active swap devices */
> @@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
>   ops->next = frontswap_ops;
>   } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
>  
> - static_bra

[PATCH v2 3/5] mm/hugeltb: clarify (chg - freed) won't go negative in hugetlb_unreserve_pages()

2021-04-10 Thread Miaohe Lin
The resv_map could be NULL since this routine can be called in the evict
inode path for all hugetlbfs inodes and we will have chg = 0 in this case.
But (chg - freed) won't go negative as Mike pointed out:

 "If resv_map is NULL, then no hugetlb pages can be allocated/associated
  with the file.  As a result, remove_inode_hugepages will never find any
  huge pages associated with the inode and the passed value 'freed' will
  always be zero."

Add a comment clarifying this to make it clear and also avoid confusion.

Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9b4c05699a90..387c9a62615e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5435,6 +5435,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
/*
 * If the subpool has a minimum size, the number of global
 * reservations to be released may be adjusted.
+*
+* Note that !resv_map implies freed == 0. So (chg - freed)
+* won't go negative.
 */
gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
hugetlb_acct_memory(h, -gbl_reserve);
-- 
2.19.1



[PATCH v2 1/5] mm/hugeltb: remove redundant VM_BUG_ON() in region_add()

2021-04-10 Thread Miaohe Lin
The same VM_BUG_ON() check is already done in the callee. Remove this extra
one to simplify the code slightly.

Reviewed-by: Mike Kravetz 
Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c22111f3da20..a03a50b7c410 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -556,7 +556,6 @@ static long region_add(struct resv_map *resv, long f, long t,
resv->adds_in_progress -= in_regions_needed;
 
spin_unlock(&resv->lock);
-   VM_BUG_ON(add < 0);
return add;
 }
 
-- 
2.19.1



[PATCH v2 4/5] mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts()

2021-04-10 Thread Miaohe Lin
A rare out of memory error would prevent removal of the reserve map region
for a page. hugetlb_fix_reserve_counts() handles this rare case to avoid
dangling with incorrect counts. Unfortunately, hugepage_subpool_get_pages
and hugetlb_acct_memory could possibly fail too. We should correctly handle
these cases.

Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 387c9a62615e..a14bb1a03ee4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -745,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
 {
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
+   bool reserved = false;
 
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-   if (rsv_adjust) {
+   if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
 
-   hugetlb_acct_memory(h, 1);
+   if (!hugetlb_acct_memory(h, 1))
+   reserved = true;
+   } else if (!rsv_adjust) {
+   reserved = true;
}
+
+   if (!reserved)
+   pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
 }
 
 /*
-- 
2.19.1



[PATCH v2 0/5] Cleanup and fixup for hugetlb

2021-04-10 Thread Miaohe Lin
Hi all,
This series contains cleanups to remove redundant VM_BUG_ON() and
simplify the return code. Also this handles the error case in
hugetlb_fix_reserve_counts() correctly. More details can be found
in the respective changelogs. Thanks!

v1->v2:
  collect Reviewed-by tag
  remove the HPAGE_RESV_OWNER check per Mike
  add a comment to hugetlb_unreserve_pages per Mike
  expand warning message a bit for hugetlb_fix_reserve_counts
  Add a new patch to remove unused variable
  Many thanks for Mike's review and suggestion!

Miaohe Lin (5):
  mm/hugeltb: remove redundant VM_BUG_ON() in region_add()
  mm/hugeltb: simplify the return code of __vma_reservation_common()
  mm/hugeltb: clarify (chg - freed) won't go negative in
hugetlb_unreserve_pages()
  mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts()
  mm/hugetlb: remove unused variable pseudo_vma in
remove_inode_hugepages()

 fs/hugetlbfs/inode.c |  3 ---
 mm/hugetlb.c | 57 +---
 2 files changed, 33 insertions(+), 27 deletions(-)

-- 
2.19.1



[PATCH v2 5/5] mm/hugetlb: remove unused variable pseudo_vma in remove_inode_hugepages()

2021-04-10 Thread Miaohe Lin
The local variable pseudo_vma is not used anymore.

Signed-off-by: Miaohe Lin 
---
 fs/hugetlbfs/inode.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d81f52b87bd7..a2a42335e8fd 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -463,14 +463,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
-   struct vm_area_struct pseudo_vma;
struct pagevec pvec;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
 
-   vma_init(_vma, current->mm);
-   pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
pagevec_init(&pvec);
next = start;
while (next < end) {
-- 
2.19.1



[PATCH v2 2/5] mm/hugeltb: simplify the return code of __vma_reservation_common()

2021-04-10 Thread Miaohe Lin
It's guaranteed that the vma is associated with a resv_map, i.e. either
VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would
have returned via !resv check above. So it's unneeded to check whether
HPAGE_RESV_OWNER is set here. Simplify the return code to make it more
clear.

Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 41 -
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a03a50b7c410..9b4c05699a90 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2163,27 +2163,26 @@ static long __vma_reservation_common(struct hstate *h,
 
if (vma->vm_flags & VM_MAYSHARE)
return ret;
-   else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-   /*
-* In most cases, reserves always exist for private mappings.
-* However, a file associated with mapping could have been
-* hole punched or truncated after reserves were consumed.
-* As subsequent fault on such a range will not use reserves.
-* Subtle - The reserve map for private mappings has the
-* opposite meaning than that of shared mappings.  If NO
-* entry is in the reserve map, it means a reservation exists.
-* If an entry exists in the reserve map, it means the
-* reservation has already been consumed.  As a result, the
-* return value of this routine is the opposite of the
-* value returned from reserve map manipulation routines above.
-*/
-   if (ret)
-   return 0;
-   else
-   return 1;
-   }
-   else
-   return ret < 0 ? ret : 0;
+   /*
+* We know private mapping must have HPAGE_RESV_OWNER set.
+*
+* In most cases, reserves always exist for private mappings.
+* However, a file associated with mapping could have been
+* hole punched or truncated after reserves were consumed.
+* As subsequent fault on such a range will not use reserves.
+* Subtle - The reserve map for private mappings has the
+* opposite meaning than that of shared mappings.  If NO
+* entry is in the reserve map, it means a reservation exists.
+* If an entry exists in the reserve map, it means the
+* reservation has already been consumed.  As a result, the
+* return value of this routine is the opposite of the
+* value returned from reserve map manipulation routines above.
+*/
+   if (ret > 0)
+   return 0;
+   if (ret == 0)
+   return 1;
+   return ret;
 }
 
 static long vma_needs_reservation(struct hstate *h,
-- 
2.19.1



Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-09 Thread Miaohe Lin
On 2021/4/10 1:17, Tim Chen wrote:
> 
> 
> On 4/9/21 1:42 AM, Miaohe Lin wrote:
>> On 2021/4/9 5:34, Tim Chen wrote:
>>>
>>>
>>> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>>>> When I was investigating the swap code, I found the below possible race
>>>> window:
>>>>
>>>> CPU 1  CPU 2
>>>> -  -
>>>> do_swap_page
>>>>   synchronous swap_readpage
>>>> alloc_page_vma
>>>>swapoff
>>>>  release swap_file, bdev, or ...
>>>
>>
>> Many thanks for quick review and reply!
>>
>>> Perhaps I'm missing something.  The release of swap_file, bdev etc
>>> happens after we have cleared the SWP_VALID bit in si->flags in 
>>> destroy_swap_extents
>>> if I read the swapoff code correctly.
>> Agree. Let's look this more close:
>> CPU1 CPU2
>> --
>> swap_readpage
>>   if (data_race(sis->flags & SWP_FS_OPS)) {
>>  swapoff
>>p->swap_file 
>> = NULL;
>> struct file *swap_file = sis->swap_file;
>> struct address_space *mapping = swap_file->f_mapping;[oops!]
>>...
>>p->flags = 0;
>> ...
>>
>> Does this make sense for you?
> 
> p->swapfile = NULL happens after the 
> p->flags &= ~SWP_VALID, synchronize_rcu(), destroy_swap_extents() sequence in 
> swapoff().
> 
> So I don't think the sequence you illustrated on CPU2 is in the right order.
> That said, without get_swap_device/put_swap_device in swap_readpage, you could
> potentially blow pass synchronize_rcu() on CPU2 and causes a problem.  so I 
> think
> the problematic race looks something like the following:
> 
> 
> CPU1  CPU2
> - -
> swap_readpage
>   if (data_race(sis->flags & SWP_FS_OPS)) {
>   swapoff
> p->flags = &= 
> ~SWP_VALID;
> ..
> 
> synchronize_rcu();
> ..
> p->swap_file 
> = NULL;
> struct file *swap_file = sis->swap_file;
> struct address_space *mapping = swap_file->f_mapping;[oops!]
> ...
> ...
> 

Agree. This is also what I meant to illustrate, and you provided a better one.
Many thanks!

> By adding get_swap_device/put_swap_device, then the race is fixed.
> 
> 
> CPU1  CPU2
> - -
> swap_readpage
>   get_swap_device()
>   ..
>   if (data_race(sis->flags & SWP_FS_OPS)) {
>   swapoff
> p->flags = &= 
> ~SWP_VALID;
> ..
> struct file *swap_file = sis->swap_file;
> struct address_space *mapping = swap_file->f_mapping;[valid value]
>   ..
>   put_swap_device()
> 
> synchronize_rcu();
> ..
> p->swap_file 
> = NULL;
> 
> 
>>
>>>>
>>>>   swap_readpage
>>>>check sis->flags is ok
>>>>  access swap_file, bdev...[oops!]
>>>>si->flags = 0
>>>
>>> This happens after we clear the si->flags
>>> synchronize_rcu()
>>> release swap_file, bdev, in 
>>> destroy_swap_extents()
>>>
>&

Re: [PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()

2021-04-09 Thread Miaohe Lin
On 2021/4/9 16:50, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> While we released the pte lock, somebody else might faulted in this pte.
>> So we should check whether it's swap pte first to guard against such race
>> or swp_type would be unexpected. And we can also avoid some unnecessary
>> readahead cpu cycles possibly.
>>
>> Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/swap_state.c | 13 +
>>  1 file changed, 9 insertions(+), 4 deletions(-)
>>
>> diff --git a/mm/swap_state.c b/mm/swap_state.c
>> index 709c260d644a..3bf0d0c297bc 100644
>> --- a/mm/swap_state.c
>> +++ b/mm/swap_state.c
>> @@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  {
>>  struct vm_area_struct *vma = vmf->vma;
>>  unsigned long ra_val;
>> -swp_entry_t entry;
>> +swp_entry_t swap_entry;
>>  unsigned long faddr, pfn, fpfn;
>>  unsigned long start, end;
>> -pte_t *pte, *orig_pte;
>> +pte_t *pte, *orig_pte, entry;
>>  unsigned int max_win, hits, prev_win, win, left;
>>  #ifndef CONFIG_64BIT
>>  pte_t *tpte;
>> @@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
>>  
>>  faddr = vmf->address;
>>  orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
>> -entry = pte_to_swp_entry(*pte);
>> -if ((unlikely(non_swap_entry(entry {
>> +entry = *pte;
>> +if (unlikely(!is_swap_pte(entry))) {
>> +pte_unmap(orig_pte);
>> +return;
>> +}
>> +swap_entry = pte_to_swp_entry(entry);
>> +if ((unlikely(non_swap_entry(swap_entry {
>>  pte_unmap(orig_pte);
>>  return;
>>  }
> 
> This isn't a real issue.  entry or swap_entry isn't used in this

Agree. It seems entry/swap_entry here is just used to check whether the pte is
still a valid swap entry.

> function.  And we have enough checking when we really operate the PTE
> entries later.  But I admit it's confusing.  So I suggest to just remove
> the checking.  We will check it when necessary.

Sounds reasonable. Will do it in v2.

Many thanks for review and reply!
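
(For context, the is_swap_pte() check being discussed is, as far as I can tell,
just the usual helper from include/linux/swapops.h, roughly:

    /* A pte carries a swap entry if it is neither empty nor mapped. */
    static inline int is_swap_pte(pte_t pte)
    {
        return !pte_none(pte) && !pte_present(pte);
    }

so dropping it from swap_ra_info() only removes an early bail-out; as noted above,
the later pte handling does its own checking when it actually operates on the
entries.)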

> 
> Best Regards,
> Huang, Ying
> .
> 



Re: [PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-09 Thread Miaohe Lin
On 2021/4/9 5:34, Tim Chen wrote:
> 
> 
> On 4/8/21 6:08 AM, Miaohe Lin wrote:
>> When I was investigating the swap code, I found the below possible race
>> window:
>>
>> CPU 1CPU 2
>> --
>> do_swap_page
>>   synchronous swap_readpage
>> alloc_page_vma
>>  swapoff
>>release swap_file, bdev, or ...
> 

Many thanks for quick review and reply!

> Perhaps I'm missing something.  The release of swap_file, bdev etc
> happens after we have cleared the SWP_VALID bit in si->flags in 
> destroy_swap_extents
> if I read the swapoff code correctly.
Agree. Let's look at this more closely:
CPU1                                            CPU2
-----                                           -----
swap_readpage
  if (data_race(sis->flags & SWP_FS_OPS)) {
                                                swapoff
                                                  p->swap_file = NULL;
    struct file *swap_file = sis->swap_file;
    struct address_space *mapping = swap_file->f_mapping;[oops!]
                                                  ...
                                                  p->flags = 0;
    ...

Does this make sense to you?

> >
>>   swap_readpage
>>  check sis->flags is ok
>>access swap_file, bdev...[oops!]
>>  si->flags = 0
> 
> This happens after we clear the si->flags
>   synchronize_rcu()
>   release swap_file, bdev, in 
> destroy_swap_extents()
> 
> So I think if we have get_swap_device/put_swap_device in do_swap_page,
> it should fix the race you've pointed out here.  
> Then synchronize_rcu() will wait till we have completed do_swap_page and
> call put_swap_device.

Right, get_swap_device/put_swap_device could fix this race. __But__ rcu_read_lock()
in get_swap_device() disables preemption, and do_swap_page() may take a really long
time because it involves I/O. It may not be acceptable to disable preemption for
such a long time. :(
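
For reference, a minimal sketch of the percpu_ref based get/put_swap_device() that
patches 1/5 and 2/5 below switch to (simplified here; the real patch also keeps the
SWP_VALID and offset checks). percpu_ref_tryget_live()/percpu_ref_put() do not need
an RCU read-side critical section held across the whole operation, so preemption
stays enabled during the I/O:

    struct swap_info_struct *get_swap_device(swp_entry_t entry)
    {
        struct swap_info_struct *si = swp_swap_info(entry);

        if (!si)
            return NULL;
        /* Fails once swapoff has killed the ref via percpu_ref_kill(). */
        if (!percpu_ref_tryget_live(&si->users))
            return NULL;
        return si;
    }

    static inline void put_swap_device(struct swap_info_struct *si)
    {
        percpu_ref_put(&si->users);
    }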

>   
>>
>> Using current get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really long
>> time. And this race may not be really pernicious because swapoff is usually
>> done when system shutdown only. To reduce the performance overhead on the
>> hot-path as much as possible, it appears we can use the percpu_ref to close
>> this race window(as suggested by Huang, Ying).
> 
> I think it is better to break this patch into two.
> > One patch is to fix the race in do_swap_page and swapoff
> by adding get_swap_device/put_swap_device in do_swap_page.
> 
> The second patch is to modify get_swap_device and put_swap_device
> with percpu_ref. But swapoff is a relatively rare events.  

Sounds reasonable. Will do it.

> 
> I am not sure making percpu_ref change for performance is really beneficial.
> Did you encounter a real use case where you see a problem with swapoff?
> The delay in swapoff is primarily in try_to_unuse to bring all
> the swapped off pages back into memory.  Synchronizing with other
> CPU for paging in probably is a small component in overall scheme
> of things.
> 

I can't find a simpler and more stable way to fix this potential and *theoretical*
issue. It could happen in the real world but the race window should be very small.
Since swapoff is usually done only at system shutdown, I'm not really sure whether
this effort is worth it.

But IMO, we should eliminate any potential trouble. :)

> Thanks.
> 

Thanks again.

> Tim
> 
> .
> 


Re: [PATCH 0/5] close various race windows for swap

2021-04-09 Thread Miaohe Lin
On 2021/4/8 22:55, riteshh wrote:
> On 21/04/08 09:08AM, Miaohe Lin wrote:
>> Hi all,
>> When I was investigating the swap code, I found some possible race
>> windows. This series aims to fix all these races. But using current
>> get/put_swap_device() to guard against concurrent swapoff for
>> swap_readpage() looks terrible because swap_readpage() may take really
>> long time. And to reduce the performance overhead on the hot-path as
>> much as possible, it appears we can use the percpu_ref to close this
>> race window(as suggested by Huang, Ying). The patch 1 adds percpu_ref
>> support for swap and the rest of the patches use this to close various
>> race windows. More details can be found in the respective changelogs.
>> Thanks!
>>
>> Miaohe Lin (5):
>>   mm/swapfile: add percpu_ref support for swap
>>   swap: fix do_swap_page() race with swapoff
>>   mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
>>   mm/swap_state: fix potential faulted in race in swap_ra_info()
>>   mm/swap_state: fix swap_cluster_readahead() race with swapoff
> 

Many thanks for the quick response.

> Somehow I see Patch-1 and Patch-2 are missing on linux-mm[1].

I have no idea why Patch-1 and Patch-2 are missing, but they can be found at:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2542188.html

> Also I wanted to ask if you have a way to trigger this in a more controlled
> environment (consistently)?
> 

This is a *theoretical* issue. The race window is very small but not impossible.
Please see the discussion:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg2530094.html

> [1]: 
> https://patchwork.kernel.org/project/linux-mm/cover/20210408130820.48233-1-linmia...@huawei.com/
> 

Thanks again.

> -ritesh
> 
>>
>>  include/linux/swap.h |  4 +++-
>>  mm/memory.c  | 10 +
>>  mm/swap_state.c  | 33 +
>>  mm/swapfile.c| 50 +++-
>>  4 files changed, 68 insertions(+), 29 deletions(-)
>>
>> --
>> 2.19.1
>>
>>
> .
> 



Re: [PATCH 4/4] mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts()

2021-04-09 Thread Miaohe Lin
On 2021/4/9 13:04, Andrew Morton wrote:
> On Fri, 9 Apr 2021 11:17:49 +0800 Miaohe Lin  wrote:
> 
>> On 2021/4/9 7:25, Mike Kravetz wrote:
>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>> A rare out of memory error would prevent removal of the reserve map region
>>>> for a page. hugetlb_fix_reserve_counts() handles this rare case to avoid
>>>> dangling with incorrect counts. Unfortunately, hugepage_subpool_get_pages
>>>> and hugetlb_acct_memory could possibly fail too. We should correctly handle
>>>> these cases.
>>>
>>> Yes, this is a potential issue.
>>>
>>> The 'good news' is that hugetlb_fix_reserve_counts() is unlikely to ever
>>> be called.  To do so would imply we could not allocate a region entry
>>> which is only 6 words in size.  We also keep a 'cache' of entries so we
>>> may not even need to allocate.
>>>
>>> But, as mentioned it is a potential issue.
>>
>> Yes, a potential *theoretical* issue.
>>
>>>
>>>> Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of 
>>>> pages")
>>>
>>> This is likely going to make this get picked by by stable releases.
>>> That is unfortunate as mentioned above this is mostly theoretical.
>>>
>>
>> I will drop this. This does not worth backport.
>>
> 
> -stable have been asked not to backport MM patches unless MM patches
> include "cc:stable".  ie, no making our backporting decisions for us,
> please.
> 

Sorry about that! I will retain the Fixes tag.
Many thanks for pointing this out.

> .
> 


Re: [PATCH 3/4] mm/hugeltb: fix potential wrong gbl_reserve value for hugetlb_acct_memory()

2021-04-09 Thread Miaohe Lin
On 2021/4/9 12:37, Mike Kravetz wrote:
> On 4/8/21 8:01 PM, Miaohe Lin wrote:
>> On 2021/4/9 6:53, Mike Kravetz wrote:
>>>
>>> Yes, add a comment to hugetlb_unreserve_pages saying that !resv_map
>>> implies freed == 0.
>>>
>>
>> Sounds good!
>>
>>> It would also be helpful to check for (chg - freed) == 0 and skip the
>>> calls to hugepage_subpool_put_pages() and hugetlb_acct_memory().  Both
>>> of those routines may perform an unnecessary lock/unlock cycle in this
>>> case.
>>>
>>> A simple
>>> if (chg == free)
>>> return 0;
>>> before the call to hugepage_subpool_put_pages would work.
>>
>> This may not be really helpful because hugepage_subpool_put_pages() and 
>> hugetlb_acct_memory()
>> both would handle delta == 0 case without unnecessary lock/unlock cycle.
>> Does this make sense for you? If so, I will prepare v2 with the changes to 
>> add a comment
>> to hugetlb_unreserve_pages() __without__ the check for (chg - freed) == 0.
> 
> Sorry, I forgot about the recent changes to check for delta == 0.
> No need for the check here, just the comment.
> 

That's all right. Will add the comment in V2.
Thanks.


Re: [PATCH 4/4] mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts()

2021-04-08 Thread Miaohe Lin
On 2021/4/9 7:25, Mike Kravetz wrote:
> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>> A rare out of memory error would prevent removal of the reserve map region
>> for a page. hugetlb_fix_reserve_counts() handles this rare case to avoid
>> dangling with incorrect counts. Unfortunately, hugepage_subpool_get_pages
>> and hugetlb_acct_memory could possibly fail too. We should correctly handle
>> these cases.
> 
> Yes, this is a potential issue.
> 
> The 'good news' is that hugetlb_fix_reserve_counts() is unlikely to ever
> be called.  To do so would imply we could not allocate a region entry
> which is only 6 words in size.  We also keep a 'cache' of entries so we
> may not even need to allocate.
> 
> But, as mentioned it is a potential issue.

Yes, a potential *theoretical* issue.

> 
>> Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of 
>> pages")
> 
> This is likely going to make this get picked by by stable releases.
> That is unfortunate as mentioned above this is mostly theoretical.
> 

I will drop this. It is not worth backporting.

>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/hugetlb.c | 11 +--
>>  1 file changed, 9 insertions(+), 2 deletions(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index bdff8d23803f..ca5464ed04b7 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -745,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
>>  {
>>  struct hugepage_subpool *spool = subpool_inode(inode);
>>  long rsv_adjust;
>> +bool reserved = false;
>>  
>>  rsv_adjust = hugepage_subpool_get_pages(spool, 1);
>> -if (rsv_adjust) {
>> +if (rsv_adjust > 0) {
>>  struct hstate *h = hstate_inode(inode);
>>  
>> -hugetlb_acct_memory(h, 1);
>> +if (!hugetlb_acct_memory(h, 1))
>> +reserved = true;
>> +} else if (!rsv_adjust) {
>> +reserved = true;
>>  }
>> +
>> +if (!reserved)
>> +pr_warn("hugetlb: fix reserve count failed\n");
> 
> We should expand this warning message a bit to indicate what this may
> mean to the user.  Add something like"
>   "Huge Page Reserved count may go negative".
> 

Will add it in v2. Many thanks for the review and nice suggestion! :)


Re: [PATCH 3/4] mm/hugeltb: fix potential wrong gbl_reserve value for hugetlb_acct_memory()

2021-04-08 Thread Miaohe Lin
On 2021/4/9 6:53, Mike Kravetz wrote:
> On 4/7/21 8:26 PM, Miaohe Lin wrote:
>> On 2021/4/8 11:24, Miaohe Lin wrote:
>>> On 2021/4/8 4:53, Mike Kravetz wrote:
>>>> On 4/7/21 12:24 AM, Miaohe Lin wrote:
>>>>> Hi:
>>>>> On 2021/4/7 10:49, Mike Kravetz wrote:
>>>>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>>>>> The resv_map could be NULL since this routine can be called in the evict
>>>>>>> inode path for all hugetlbfs inodes. So we could have chg = 0 and this
>>>>>>> would result in a negative value when chg - freed. This is unexpected 
>>>>>>> for
>>>>>>> hugepage_subpool_put_pages() and hugetlb_acct_memory().
>>>>>>
>>>>>> I am not sure if this is possible.
>>>>>>
>>>>>> It is true that resv_map could be NULL.  However, I believe resv map
>>>>>> can only be NULL for inodes that are not regular or link inodes.  This
>>>>>> is the inode creation code in hugetlbfs_get_inode().
>>>>>>
>>>>>>/*
>>>>>>  * Reserve maps are only needed for inodes that can have 
>>>>>> associated
>>>>>>  * page allocations.
>>>>>>  */
>>>>>> if (S_ISREG(mode) || S_ISLNK(mode)) {
>>>>>> resv_map = resv_map_alloc();
>>>>>> if (!resv_map)
>>>>>> return NULL;
>>>>>> }
>>>>>>
>>>>>
>>>>> Agree.
>>>>>
>>>>>> If resv_map is NULL, then no hugetlb pages can be allocated/associated
>>>>>> with the file.  As a result, remove_inode_hugepages will never find any
>>>>>> huge pages associated with the inode and the passed value 'freed' will
>>>>>> always be zero.
>>>>>>
>>>>>
>>>>> But I am confused now. AFAICS, remove_inode_hugepages() searches the 
>>>>> address_space of
>>>>> the inode to remove the hugepages while does not care if inode has 
>>>>> associated resv_map.
>>>>> How does it prevent hugetlb pages from being allocated/associated with 
>>>>> the file if
>>>>> resv_map is NULL? Could you please explain this more?
>>>>>
>>>>
>>>> Recall that there are only two ways to get huge pages associated with
>>>> a hugetlbfs file: fallocate and mmap/write fault.  Directly writing to
>>>> hugetlbfs files is not supported.
>>>>
>>>> If you take a closer look at hugetlbfs_get_inode, it has that code to
>>>> allocate the resv map mentioned above as well as the following:
>>>>
>>>>switch (mode & S_IFMT) {
>>>>default:
>>>>init_special_inode(inode, mode, dev);
>>>>break;
>>>>case S_IFREG:
>>>>inode->i_op = _inode_operations;
>>>>inode->i_fop = _file_operations;
>>>>break;
>>>>case S_IFDIR:
>>>>inode->i_op = _dir_inode_operations;
>>>>inode->i_fop = _dir_operations;
>>>>
>>>>/* directory inodes start off with i_nlink == 2 (for 
>>>> "." entry) */
>>>>inc_nlink(inode);
>>>>break;
>>>>case S_IFLNK:
>>>>inode->i_op = _symlink_inode_operations;
>>>>inode_nohighmem(inode);
>>>>break;
>>>>}
>>>>
>>>> Notice that only S_IFREG inodes will have i_fop == 
>>>> _file_operations.
>>>> hugetlbfs_file_operations contain the hugetlbfs specific mmap and fallocate
>>>> routines.  Hence, only files with S_IFREG inodes can potentially have
>>>> associated huge pages.  S_IFLNK inodes can as well via file linking.
>>>>
>>>> If an inode is not S_ISREG(mode) || S_ISLNK(mode), then it will not have
>>>> a resv_map.  In addition, it will not have hugetlbfs_file_operations and
>>>> can not have associated huge pages.
>>>>
>>>
>>> Many many thanks for detailed and patien

Re: [PATCH 2/4] mm/hugeltb: simplify the return code of __vma_reservation_common()

2021-04-08 Thread Miaohe Lin
On 2021/4/9 6:40, Mike Kravetz wrote:
> On 4/7/21 7:44 PM, Miaohe Lin wrote:
>> On 2021/4/8 5:23, Mike Kravetz wrote:
>>> On 4/6/21 8:09 PM, Miaohe Lin wrote:
>>>> On 2021/4/7 10:37, Mike Kravetz wrote:
>>>>> On 4/6/21 7:05 PM, Miaohe Lin wrote:
>>>>>> Hi:
>>>>>> On 2021/4/7 8:53, Mike Kravetz wrote:
>>>>>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>>>>>> It's guaranteed that the vma is associated with a resv_map, i.e. either
>>>>>>>> VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would
>>>>>>>> have returned via !resv check above. So ret must be less than 0 in the
>>>>>>>> 'else' case. Simplify the return code to make this clear.
>>>>>>>
>>>>>>> I believe we still neeed that ternary operator in the return statement.
>>>>>>> Why?
>>>>>>>
>>>>>>> There are two basic types of mappings to be concerned with:
>>>>>>> shared and private.
>>>>>>> For private mappings, a task can 'own' the mapping as indicated by
>>>>>>> HPAGE_RESV_OWNER.  Or, it may not own the mapping.  The most common way
>>>>>>> to create a non-owner private mapping is to have a task with a private
>>>>>>> mapping fork.  The parent process will have HPAGE_RESV_OWNER set, the
>>>>>>> child process will not.  The idea is that since the child has a COW copy
>>>>>>> of the mapping it should not consume reservations made by the parent.
>>>>>>
>>>>>> The child process will not have HPAGE_RESV_OWNER set because at fork 
>>>>>> time, we do:
>>>>>>  /*
>>>>>>   * Clear hugetlb-related page reserves for children. This only
>>>>>>   * affects MAP_PRIVATE mappings. Faults generated by the child
>>>>>>   * are not guaranteed to succeed, even if read-only
>>>>>>   */
>>>>>>  if (is_vm_hugetlb_page(tmp))
>>>>>>  reset_vma_resv_huge_pages(tmp);
>>>>>> i.e. we have vma->vm_private_data = (void *)0; for child process and 
>>>>>> vma_resv_map() will
>>>>>> return NULL in this case.
>>>>>> Or am I missed something?
>>>>>>
>>>>>>> Only the parent (HPAGE_RESV_OWNER) is allowed to consume the
>>>>>>> reservations.
>>>>>>> Hope that makens sense?
>>>>>>>
>>>>>>>>
>>>>>>>> Signed-off-by: Miaohe Lin 
>>>>>>>> ---
>>>>>>>>  mm/hugetlb.c | 2 +-
>>>>>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>>>
>>>>>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>>>>>> index a03a50b7c410..b7864abded3d 100644
>>>>>>>> --- a/mm/hugetlb.c
>>>>>>>> +++ b/mm/hugetlb.c
>>>>>>>> @@ -2183,7 +2183,7 @@ static long __vma_reservation_common(struct 
>>>>>>>> hstate *h,
>>>>>>>>return 1;
>>>>>>>>}
>>>>>>>>else
>>>>>>>
>>>>>>> This else also handles the case !HPAGE_RESV_OWNER.  In this case, we
>>>>>>
>>>>>> IMO, for the case !HPAGE_RESV_OWNER, we won't reach here. What do you 
>>>>>> think?
>>>>>>
>>>>>
>>>>> I think you are correct.
>>>>>
>>>>> However, if this is true we should be able to simply the code even
>>>>> further.  There is no need to check for HPAGE_RESV_OWNER because we know
>>>>> it must be set.  Correct?  If so, the code could look something like:
>>>>>
>>>>>   if (vma->vm_flags & VM_MAYSHARE)
>>>>>   return ret;
>>>>>
>>>>>   /* We know private mapping with HPAGE_RESV_OWNER */
>>>>>* ...   *
>>>>>* Add that existing comment */
>>>>>
>>>>>   if (ret > 0)
>>>>>   return 0;
>>>>>   if (ret == 0)
>>>>>   return 1;
>>&g

[PATCH 3/5] mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff

2021-04-08 Thread Miaohe Lin
The function get_shadow_from_swap_cache() can race with swapoff, though
it's only called by do_swap_page() now.

Fixes: aae466b0052e ("mm/swap: implement workingset detection for anonymous LRU")
Signed-off-by: Miaohe Lin 
---
 mm/swap_state.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 272ea2108c9d..709c260d644a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,11 +83,14 @@ void show_swap_cache_info(void)
 
 void *get_shadow_from_swap_cache(swp_entry_t entry)
 {
-   struct address_space *address_space = swap_address_space(entry);
-   pgoff_t idx = swp_offset(entry);
+   struct swap_info_struct *si;
struct page *page;
 
-   page = xa_load(&address_space->i_pages, idx);
+   si = get_swap_device(entry);
+   if (!si)
+   return NULL;
+   page = xa_load(&swap_address_space(entry)->i_pages, swp_offset(entry));
+   put_swap_device(si);
if (xa_is_value(page))
return page;
return NULL;
-- 
2.19.1



[PATCH 2/5] swap: fix do_swap_page() race with swapoff

2021-04-08 Thread Miaohe Lin
When I was investigating the swap code, I found the below possible race
window:

CPU 1                                   CPU 2
-----                                   -----
do_swap_page
  synchronous swap_readpage
    alloc_page_vma
                                        swapoff
                                          release swap_file, bdev, or ...
  swap_readpage
    check sis->flags is ok
      access swap_file, bdev...[oops!]
                                        si->flags = 0

Using the current get/put_swap_device() to guard swap_readpage() against concurrent
swapoff looks terrible because swap_readpage() may take a really long time. And this
race may not be really pernicious because swapoff is usually done only at system
shutdown. To reduce the performance overhead on the hot path as much as possible, it
appears we can use the percpu_ref to close this race window (as suggested by Huang,
Ying).

Fixes: 235b62176712 ("mm/swap: add cluster lock")
Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h |  2 +-
 mm/memory.c  | 10 ++
 mm/swapfile.c| 28 +++-
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 849ba5265c11..9066addb57fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -513,7 +513,7 @@ sector_t swap_page_sector(struct page *page);
 
 static inline void put_swap_device(struct swap_info_struct *si)
 {
-   rcu_read_unlock();
+   percpu_ref_put(&si->users);
 }
 
 #else /* CONFIG_SWAP */
diff --git a/mm/memory.c b/mm/memory.c
index cc71a445c76c..8543c47b955c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3311,6 +3311,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
+   struct swap_info_struct *si = NULL;
swp_entry_t entry;
pte_t pte;
int locked;
@@ -3339,6 +3340,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
}
 
 
+   si = get_swap_device(entry);
+   /* In case we raced with swapoff. */
+   if (unlikely(!si))
+   goto out;
+
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
@@ -3514,6 +3520,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+   if (si)
+   put_swap_device(si);
return ret;
 out_nomap:
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3525,6 +3533,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
unlock_page(swapcache);
put_page(swapcache);
}
+   if (si)
+   put_swap_device(si);
return ret;
 }
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 724173cd7d0c..01032c72ceae 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1280,18 +1280,12 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
  * via preventing the swap device from being swapoff, until
  * put_swap_device() is called.  Otherwise return NULL.
  *
- * The entirety of the RCU read critical section must come before the
- * return from or after the call to synchronize_rcu() in
- * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
- * true, the si->map, si->cluster_info, etc. must be valid in the
- * critical section.
- *
  * Notice that swapoff or swapoff+swapon can still happen before the
- * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
- * in put_swap_device() if there isn't any other way to prevent
- * swapoff, such as page lock, page table lock, etc.  The caller must
- * be prepared for that.  For example, the following situation is
- * possible.
+ * percpu_ref_tryget_live() in get_swap_device() or after the
+ * percpu_ref_put() in put_swap_device() if there isn't any other way
+ * to prevent swapoff, such as page lock, page table lock, etc.  The
+ * caller must be prepared for that.  For example, the following
+ * situation is possible.
  *
  *   CPU1  CPU2
  *   do_swap_page()
@@ -1319,21 +1313,21 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
si = swp_swap_info(entry);
if (!si)
goto bad_nofile;
-
-   rcu_read_lock();
if (data_race(!(si->flags & SWP_VALID)))
-   goto unlock_out;
+   goto out;
+   if (!percpu_ref_tryget_live(&si->users))
+   goto out;
offset = swp_offset(entry);
if (offset >= si->max)
-   goto unlock_out;
+   goto put_out;
 
return si;
 bad_nofile:
pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
 out:
return NULL;
-unlock_out:
-   rcu_read_unlock();
+put_out:
+   percpu_ref_put(&si->users);
return NULL;
 }
 
-- 
2.19.1



[PATCH 1/5] mm/swapfile: add percpu_ref support for swap

2021-04-08 Thread Miaohe Lin
We will use percpu-refcount to serialize against concurrent swapoff. This
patch adds the percpu_ref support for later fixup.

Signed-off-by: Miaohe Lin 
---
 include/linux/swap.h |  2 ++
 mm/swapfile.c| 25 ++---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 144727041e78..849ba5265c11 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -240,6 +240,7 @@ struct swap_cluster_list {
  * The in-memory structure used to track swap areas.
  */
 struct swap_info_struct {
+   struct percpu_ref users;    /* serialization against concurrent swapoff */
unsigned long   flags;  /* SWP_USED etc: see above */
signed shortprio;   /* swap priority of this type */
struct plist_node list; /* entry in swap_active_head */
@@ -260,6 +261,7 @@ struct swap_info_struct {
struct block_device *bdev;  /* swap device or bdev of swap file */
struct file *swap_file; /* seldom referenced */
unsigned int old_block_size;/* seldom referenced */
+   struct completion comp; /* seldom referenced */
 #ifdef CONFIG_FRONTSWAP
unsigned long *frontswap_map;   /* frontswap in-use, one bit per page */
atomic_t frontswap_pages;   /* frontswap pages in-use counter */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 149e77454e3c..724173cd7d0c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -511,6 +512,15 @@ static void swap_discard_work(struct work_struct *work)
spin_unlock(&si->lock);
 }
 
+static void swap_users_ref_free(struct percpu_ref *ref)
+{
+   struct swap_info_struct *si;
+
+   si = container_of(ref, struct swap_info_struct, users);
+   complete(&si->comp);
+   percpu_ref_exit(&si->users);
+}
+
 static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
 {
struct swap_cluster_info *ci = si->cluster_info;
@@ -2500,7 +2510,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
 * Guarantee swap_map, cluster_info, etc. fields are valid
 * between get/put_swap_device() if SWP_VALID bit is set
 */
-   synchronize_rcu();
+   percpu_ref_reinit(&p->users);
spin_lock(&swap_lock);
spin_lock(&p->lock);
_enable_swap_info(p);
@@ -2621,11 +2631,13 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->flags &= ~SWP_VALID; /* mark swap device as invalid */
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
+
+   percpu_ref_kill(&p->users);
/*
 * wait for swap operations protected by get/put_swap_device()
 * to complete
 */
-   synchronize_rcu();
+   wait_for_completion(&p->comp);
 
flush_work(&p->discard_work);
 
@@ -3132,7 +3144,7 @@ static bool swap_discardable(struct swap_info_struct *si)
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
struct swap_info_struct *p;
-   struct filename *name;
+   struct filename *name = NULL;
struct file *swap_file = NULL;
struct address_space *mapping;
int prio;
@@ -3163,6 +3175,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 
INIT_WORK(&p->discard_work, swap_discard_work);
 
+   init_completion(&p->comp);
+   error = percpu_ref_init(&p->users, swap_users_ref_free,
+   PERCPU_REF_INIT_DEAD, GFP_KERNEL);
+   if (unlikely(error))
+   goto bad_swap;
+
name = getname(specialfile);
if (IS_ERR(name)) {
error = PTR_ERR(name);
@@ -3356,6 +3374,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap_unlock_inode:
inode_unlock(inode);
 bad_swap:
+   percpu_ref_exit(&p->users);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
free_percpu(p->cluster_next_cpu);
-- 
2.19.1



[PATCH 5/5] mm/swap_state: fix swap_cluster_readahead() race with swapoff

2021-04-08 Thread Miaohe Lin
swap_cluster_readahead() could race with swapoff and might dereference
si->swap_file after it's released by swapoff. Close this race window by
using get/put_swap_device() pair.

Signed-off-by: Miaohe Lin 
---
 mm/swap_state.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3bf0d0c297bc..eba6b0cf6cf9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -626,12 +626,17 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
unsigned long offset = entry_offset;
unsigned long start_offset, end_offset;
unsigned long mask;
-   struct swap_info_struct *si = swp_swap_info(entry);
+   struct swap_info_struct *si;
struct blk_plug plug;
bool do_poll = true, page_allocated;
struct vm_area_struct *vma = vmf->vma;
unsigned long addr = vmf->address;
 
+   si = get_swap_device(entry);
+   /* In case we raced with swapoff. */
+   if (!si)
+   return NULL;
+
mask = swapin_nr_pages(offset) - 1;
if (!mask)
goto skip;
@@ -673,7 +678,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 
lru_add_drain();/* Push any new pages onto the LRU now */
 skip:
-   return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+   page = read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
+   put_swap_device(si);
+   return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
-- 
2.19.1



[PATCH 4/5] mm/swap_state: fix potential faulted in race in swap_ra_info()

2021-04-08 Thread Miaohe Lin
While we released the pte lock, somebody else might have faulted in this pte.
So we should check whether it's a swap pte first to guard against such a race,
or swp_type would be unexpected. And we can possibly avoid some unnecessary
readahead cpu cycles as well.

Fixes: ec560175c0b6 ("mm, swap: VMA based swap readahead")
Signed-off-by: Miaohe Lin 
---
 mm/swap_state.c | 13 +
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/mm/swap_state.c b/mm/swap_state.c
index 709c260d644a..3bf0d0c297bc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -724,10 +724,10 @@ static void swap_ra_info(struct vm_fault *vmf,
 {
struct vm_area_struct *vma = vmf->vma;
unsigned long ra_val;
-   swp_entry_t entry;
+   swp_entry_t swap_entry;
unsigned long faddr, pfn, fpfn;
unsigned long start, end;
-   pte_t *pte, *orig_pte;
+   pte_t *pte, *orig_pte, entry;
unsigned int max_win, hits, prev_win, win, left;
 #ifndef CONFIG_64BIT
pte_t *tpte;
@@ -742,8 +742,13 @@ static void swap_ra_info(struct vm_fault *vmf,
 
faddr = vmf->address;
orig_pte = pte = pte_offset_map(vmf->pmd, faddr);
-   entry = pte_to_swp_entry(*pte);
-   if ((unlikely(non_swap_entry(entry)))) {
+   entry = *pte;
+   if (unlikely(!is_swap_pte(entry))) {
+   pte_unmap(orig_pte);
+   return;
+   }
+   swap_entry = pte_to_swp_entry(entry);
+   if ((unlikely(non_swap_entry(swap_entry)))) {
pte_unmap(orig_pte);
return;
}
-- 
2.19.1



[PATCH 0/5] close various race windows for swap

2021-04-08 Thread Miaohe Lin
Hi all,
When I was investigating the swap code, I found some possible race
windows. This series aims to fix all these races. But using the current
get/put_swap_device() to guard swap_readpage() against concurrent swapoff
looks terrible because swap_readpage() may take a really long time. To
reduce the performance overhead on the hot path as much as possible, it
appears we can use the percpu_ref to close this race window (as suggested
by Huang, Ying). Patch 1 adds percpu_ref support for swap and the rest of
the patches use it to close various race windows. More details can be found
in the respective changelogs.
Thanks!
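
All of the fixes follow the same bracket pattern around the code that dereferences
the swap device; a hypothetical helper just to illustrate the shape (the name is
made up, not part of the series):

    static bool swap_device_in_use_ok(swp_entry_t entry)
    {
        struct swap_info_struct *si = get_swap_device(entry);

        if (!si)
            return false;   /* raced with swapoff, bail out */
        /*
         * Between get_swap_device() and put_swap_device() it is safe to
         * dereference si->swap_file, si->bdev, the swap cache, etc.
         */
        put_swap_device(si);
        return true;
    }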

Miaohe Lin (5):
  mm/swapfile: add percpu_ref support for swap
  swap: fix do_swap_page() race with swapoff
  mm/swap_state: fix get_shadow_from_swap_cache() race with swapoff
  mm/swap_state: fix potential faulted in race in swap_ra_info()
  mm/swap_state: fix swap_cluster_readahead() race with swapoff

 include/linux/swap.h |  4 +++-
 mm/memory.c  | 10 +
 mm/swap_state.c  | 33 +
 mm/swapfile.c| 50 +++-
 4 files changed, 68 insertions(+), 29 deletions(-)

-- 
2.19.1



Re: [PATCH 3/4] mm/hugeltb: fix potential wrong gbl_reserve value for hugetlb_acct_memory()

2021-04-07 Thread Miaohe Lin
On 2021/4/8 11:24, Miaohe Lin wrote:
> On 2021/4/8 4:53, Mike Kravetz wrote:
>> On 4/7/21 12:24 AM, Miaohe Lin wrote:
>>> Hi:
>>> On 2021/4/7 10:49, Mike Kravetz wrote:
>>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>>> The resv_map could be NULL since this routine can be called in the evict
>>>>> inode path for all hugetlbfs inodes. So we could have chg = 0 and this
>>>>> would result in a negative value when chg - freed. This is unexpected for
>>>>> hugepage_subpool_put_pages() and hugetlb_acct_memory().
>>>>
>>>> I am not sure if this is possible.
>>>>
>>>> It is true that resv_map could be NULL.  However, I believe resv map
>>>> can only be NULL for inodes that are not regular or link inodes.  This
>>>> is the inode creation code in hugetlbfs_get_inode().
>>>>
>>>>/*
>>>>  * Reserve maps are only needed for inodes that can have associated
>>>>  * page allocations.
>>>>  */
>>>> if (S_ISREG(mode) || S_ISLNK(mode)) {
>>>> resv_map = resv_map_alloc();
>>>> if (!resv_map)
>>>> return NULL;
>>>> }
>>>>
>>>
>>> Agree.
>>>
>>>> If resv_map is NULL, then no hugetlb pages can be allocated/associated
>>>> with the file.  As a result, remove_inode_hugepages will never find any
>>>> huge pages associated with the inode and the passed value 'freed' will
>>>> always be zero.
>>>>
>>>
>>> But I am confused now. AFAICS, remove_inode_hugepages() searches the 
>>> address_space of
>>> the inode to remove the hugepages while does not care if inode has 
>>> associated resv_map.
>>> How does it prevent hugetlb pages from being allocated/associated with the 
>>> file if
>>> resv_map is NULL? Could you please explain this more?
>>>
>>
>> Recall that there are only two ways to get huge pages associated with
>> a hugetlbfs file: fallocate and mmap/write fault.  Directly writing to
>> hugetlbfs files is not supported.
>>
>> If you take a closer look at hugetlbfs_get_inode, it has that code to
>> allocate the resv map mentioned above as well as the following:
>>
>>  switch (mode & S_IFMT) {
>>  default:
>>  init_special_inode(inode, mode, dev);
>>  break;
>>  case S_IFREG:
>>  inode->i_op = _inode_operations;
>>  inode->i_fop = _file_operations;
>>  break;
>>  case S_IFDIR:
>>  inode->i_op = _dir_inode_operations;
>>  inode->i_fop = _dir_operations;
>>
>>  /* directory inodes start off with i_nlink == 2 (for 
>> "." entry) */
>>  inc_nlink(inode);
>>  break;
>>  case S_IFLNK:
>>  inode->i_op = _symlink_inode_operations;
>>  inode_nohighmem(inode);
>>  break;
>>  }
>>
>> Notice that only S_IFREG inodes will have i_fop == 
>> _file_operations.
>> hugetlbfs_file_operations contain the hugetlbfs specific mmap and fallocate
>> routines.  Hence, only files with S_IFREG inodes can potentially have
>> associated huge pages.  S_IFLNK inodes can as well via file linking.
>>
>> If an inode is not S_ISREG(mode) || S_ISLNK(mode), then it will not have
>> a resv_map.  In addition, it will not have hugetlbfs_file_operations and
>> can not have associated huge pages.
>>
> 
> Many many thanks for detailed and patient explanation! :) I think I have got 
> the idea!
> 
>> I looked at this closely when adding commits
>> 58b6e5e8f1ad hugetlbfs: fix memory leak for resv_map
>> f27a5136f70a hugetlbfs: always use address space in inode for resv_map 
>> pointer
>>
>> I may not be remembering all of the details correctly.  Commit f27a5136f70a
>> added the comment that resv_map could be NULL to hugetlb_unreserve_pages.
>>
> 
> Since we must have freed == 0 while chg == 0. Should we make this assumption 
> explict
> by something like below?
> 
> WARN_ON(chg < freed);
> 

Or just a comment to avoid confusion?

> Thanks again!
> 


Re: [PATCH 3/4] mm/hugeltb: fix potential wrong gbl_reserve value for hugetlb_acct_memory()

2021-04-07 Thread Miaohe Lin
On 2021/4/8 4:53, Mike Kravetz wrote:
> On 4/7/21 12:24 AM, Miaohe Lin wrote:
>> Hi:
>> On 2021/4/7 10:49, Mike Kravetz wrote:
>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>> The resv_map could be NULL since this routine can be called in the evict
>>>> inode path for all hugetlbfs inodes. So we could have chg = 0 and this
>>>> would result in a negative value when chg - freed. This is unexpected for
>>>> hugepage_subpool_put_pages() and hugetlb_acct_memory().
>>>
>>> I am not sure if this is possible.
>>>
>>> It is true that resv_map could be NULL.  However, I believe resv map
>>> can only be NULL for inodes that are not regular or link inodes.  This
>>> is the inode creation code in hugetlbfs_get_inode().
>>>
>>>/*
>>>  * Reserve maps are only needed for inodes that can have associated
>>>  * page allocations.
>>>  */
>>> if (S_ISREG(mode) || S_ISLNK(mode)) {
>>> resv_map = resv_map_alloc();
>>> if (!resv_map)
>>> return NULL;
>>> }
>>>
>>
>> Agree.
>>
>>> If resv_map is NULL, then no hugetlb pages can be allocated/associated
>>> with the file.  As a result, remove_inode_hugepages will never find any
>>> huge pages associated with the inode and the passed value 'freed' will
>>> always be zero.
>>>
>>
>> But I am confused now. AFAICS, remove_inode_hugepages() searches the 
>> address_space of
>> the inode to remove the hugepages while does not care if inode has 
>> associated resv_map.
>> How does it prevent hugetlb pages from being allocated/associated with the 
>> file if
>> resv_map is NULL? Could you please explain this more?
>>
> 
> Recall that there are only two ways to get huge pages associated with
> a hugetlbfs file: fallocate and mmap/write fault.  Directly writing to
> hugetlbfs files is not supported.
> 
> If you take a closer look at hugetlbfs_get_inode, it has that code to
> allocate the resv map mentioned above as well as the following:
> 
>   switch (mode & S_IFMT) {
>   default:
>   init_special_inode(inode, mode, dev);
>   break;
>   case S_IFREG:
>   inode->i_op = _inode_operations;
>   inode->i_fop = _file_operations;
>   break;
>   case S_IFDIR:
>   inode->i_op = _dir_inode_operations;
>   inode->i_fop = _dir_operations;
> 
>   /* directory inodes start off with i_nlink == 2 (for 
> "." entry) */
>   inc_nlink(inode);
>   break;
>   case S_IFLNK:
>   inode->i_op = _symlink_inode_operations;
>   inode_nohighmem(inode);
>   break;
>   }
> 
> Notice that only S_IFREG inodes will have i_fop == _file_operations.
> hugetlbfs_file_operations contain the hugetlbfs specific mmap and fallocate
> routines.  Hence, only files with S_IFREG inodes can potentially have
> associated huge pages.  S_IFLNK inodes can as well via file linking.
> 
> If an inode is not S_ISREG(mode) || S_ISLNK(mode), then it will not have
> a resv_map.  In addition, it will not have hugetlbfs_file_operations and
> can not have associated huge pages.
> 

Many many thanks for the detailed and patient explanation! :) I think I have got
the idea!

> I looked at this closely when adding commits
> 58b6e5e8f1ad hugetlbfs: fix memory leak for resv_map
> f27a5136f70a hugetlbfs: always use address space in inode for resv_map pointer
> 
> I may not be remembering all of the details correctly.  Commit f27a5136f70a
> added the comment that resv_map could be NULL to hugetlb_unreserve_pages.
> 

Since we must have freed == 0 whenever chg == 0, should we make this assumption
explicit with something like below?

WARN_ON(chg < freed);
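
For illustration, here is a minimal sketch of where such a check could sit in
hugetlb_unreserve_pages(); the placement and the surrounding lines are assumed
for this example and are not taken from any posted patch:

	/*
	 * Sketch only (assumed context): if resv_map was NULL, chg stays 0
	 * and the code below relies on freed being 0 as well.  Make that
	 * explicit so a violation is caught loudly instead of feeding a
	 * negative value to hugepage_subpool_put_pages().
	 */
	WARN_ON(chg < freed);
	gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
	hugetlb_acct_memory(h, -gbl_reserve);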

Thanks again!


Re: [PATCH 2/4] mm/hugeltb: simplify the return code of __vma_reservation_common()

2021-04-07 Thread Miaohe Lin
On 2021/4/8 5:23, Mike Kravetz wrote:
> On 4/6/21 8:09 PM, Miaohe Lin wrote:
>> On 2021/4/7 10:37, Mike Kravetz wrote:
>>> On 4/6/21 7:05 PM, Miaohe Lin wrote:
>>>> Hi:
>>>> On 2021/4/7 8:53, Mike Kravetz wrote:
>>>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>>>> It's guaranteed that the vma is associated with a resv_map, i.e. either
>>>>>> VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would
>>>>>> have returned via !resv check above. So ret must be less than 0 in the
>>>>>> 'else' case. Simplify the return code to make this clear.
>>>>>
>>>>> I believe we still neeed that ternary operator in the return statement.
>>>>> Why?
>>>>>
>>>>> There are two basic types of mappings to be concerned with:
>>>>> shared and private.
>>>>> For private mappings, a task can 'own' the mapping as indicated by
>>>>> HPAGE_RESV_OWNER.  Or, it may not own the mapping.  The most common way
>>>>> to create a non-owner private mapping is to have a task with a private
>>>>> mapping fork.  The parent process will have HPAGE_RESV_OWNER set, the
>>>>> child process will not.  The idea is that since the child has a COW copy
>>>>> of the mapping it should not consume reservations made by the parent.
>>>>
>>>> The child process will not have HPAGE_RESV_OWNER set because at fork time, 
>>>> we do:
>>>>/*
>>>> * Clear hugetlb-related page reserves for children. This only
>>>> * affects MAP_PRIVATE mappings. Faults generated by the child
>>>> * are not guaranteed to succeed, even if read-only
>>>> */
>>>>        if (is_vm_hugetlb_page(tmp))
>>>>reset_vma_resv_huge_pages(tmp);
>>>> i.e. we have vma->vm_private_data = (void *)0; for child process and 
>>>> vma_resv_map() will
>>>> return NULL in this case.
>>>> Or am I missed something?
>>>>
>>>>> Only the parent (HPAGE_RESV_OWNER) is allowed to consume the
>>>>> reservations.
>>>>> Hope that makens sense?
>>>>>
>>>>>>
>>>>>> Signed-off-by: Miaohe Lin 
>>>>>> ---
>>>>>>  mm/hugetlb.c | 2 +-
>>>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>>>> index a03a50b7c410..b7864abded3d 100644
>>>>>> --- a/mm/hugetlb.c
>>>>>> +++ b/mm/hugetlb.c
>>>>>> @@ -2183,7 +2183,7 @@ static long __vma_reservation_common(struct hstate 
>>>>>> *h,
>>>>>>  return 1;
>>>>>>  }
>>>>>>  else
>>>>>
>>>>> This else also handles the case !HPAGE_RESV_OWNER.  In this case, we
>>>>
>>>> IMO, for the case !HPAGE_RESV_OWNER, we won't reach here. What do you 
>>>> think?
>>>>
>>>
>>> I think you are correct.
>>>
>>> However, if this is true we should be able to simply the code even
>>> further.  There is no need to check for HPAGE_RESV_OWNER because we know
>>> it must be set.  Correct?  If so, the code could look something like:
>>>
>>> if (vma->vm_flags & VM_MAYSHARE)
>>> return ret;
>>>
>>> /* We know private mapping with HPAGE_RESV_OWNER */
>>>  * ...   *
>>>  * Add that existing comment */
>>>
>>> if (ret > 0)
>>> return 0;
>>> if (ret == 0)
>>> return 1;
>>> return ret;
>>>
>>
>> Many thanks for good suggestion! What do you mean is this ?
> 
> I think the below changes would work fine.
> 
> However, this patch/discussion has made me ask the question.  Do we need
> the HPAGE_RESV_OWNER flag?  Is the followng true?
> !(vm_flags & VM_MAYSHARE) && vma_resv_map()  ===> HPAGE_RESV_OWNER
> !(vm_flags & VM_MAYSHARE) && !vma_resv_map() ===> !HPAGE_RESV_OWNER
> 

I agree with you.

HPAGE_RESV_OWNER is set in hugetlb_reserve_pages() and there's no way to clear
it in the owner process. The child process cannot inherit both HPAGE_RESV_OWNER
and the resv_map. So a !HPAGE_RESV_OWNER vma knows nothing about the resv_map.

IMO, in the !(vm_flags & VM_MAYSHARE) case, we must have:
!!vma_resv_map() == !!HPAGE_RESV_OWNER
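
A hypothetical way to state that invariant in code; the check itself is only
illustrative and does not exist in mm/hugetlb.c, although vma_resv_map() and
is_vma_resv_set() are the existing static helpers there:

static void assert_private_resv_invariant(struct vm_area_struct *vma)
{
	if (vma->vm_flags & VM_MAYSHARE)
		return;
	/* The owner flag and the reserve map must come and go together. */
	VM_WARN_ON_ONCE(!!vma_resv_map(vma) !=
			!!is_vma_resv_set(vma, HPAGE_RESV_OWNER));
}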

> I am not suggesting we eliminate the flag and make corresponding
> changes.  Just curious if you believe we 'could' remove the flag and
> depend on the above conditions.
> 
> One reason for NOT removing the flag is that that flag itself and
> supporting code and commnets help explain what happens with hugetlb
> reserves for COW mappings.  That code is hard to understand and the
> existing code and coments around HPAGE_RESV_OWNER help with
> understanding.

Agree. This code took me several days to understand...

> 

Thanks.


Re: [PATCH 3/4] mm/hugeltb: fix potential wrong gbl_reserve value for hugetlb_acct_memory()

2021-04-07 Thread Miaohe Lin
Hi:
On 2021/4/7 10:49, Mike Kravetz wrote:
> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>> The resv_map could be NULL since this routine can be called in the evict
>> inode path for all hugetlbfs inodes. So we could have chg = 0 and this
>> would result in a negative value when chg - freed. This is unexpected for
>> hugepage_subpool_put_pages() and hugetlb_acct_memory().
> 
> I am not sure if this is possible.
> 
> It is true that resv_map could be NULL.  However, I believe resv map
> can only be NULL for inodes that are not regular or link inodes.  This
> is the inode creation code in hugetlbfs_get_inode().
> 
>/*
>  * Reserve maps are only needed for inodes that can have associated
>  * page allocations.
>  */
> if (S_ISREG(mode) || S_ISLNK(mode)) {
> resv_map = resv_map_alloc();
> if (!resv_map)
> return NULL;
> }
> 

Agree.

> If resv_map is NULL, then no hugetlb pages can be allocated/associated
> with the file.  As a result, remove_inode_hugepages will never find any
> huge pages associated with the inode and the passed value 'freed' will
> always be zero.
> 

But I am confused now. AFAICS, remove_inode_hugepages() searches the
address_space of the inode to remove the hugepages but does not care whether
the inode has an associated resv_map. How does it prevent hugetlb pages from
being allocated/associated with the file if resv_map is NULL? Could you please
explain this more?

Many thanks.

> Does that sound correct?
>


Re: [PATCH 2/4] mm/hugeltb: simplify the return code of __vma_reservation_common()

2021-04-06 Thread Miaohe Lin
On 2021/4/7 10:37, Mike Kravetz wrote:
> On 4/6/21 7:05 PM, Miaohe Lin wrote:
>> Hi:
>> On 2021/4/7 8:53, Mike Kravetz wrote:
>>> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>>>> It's guaranteed that the vma is associated with a resv_map, i.e. either
>>>> VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would
>>>> have returned via !resv check above. So ret must be less than 0 in the
>>>> 'else' case. Simplify the return code to make this clear.
>>>
>>> I believe we still neeed that ternary operator in the return statement.
>>> Why?
>>>
>>> There are two basic types of mappings to be concerned with:
>>> shared and private.
>>> For private mappings, a task can 'own' the mapping as indicated by
>>> HPAGE_RESV_OWNER.  Or, it may not own the mapping.  The most common way
>>> to create a non-owner private mapping is to have a task with a private
>>> mapping fork.  The parent process will have HPAGE_RESV_OWNER set, the
>>> child process will not.  The idea is that since the child has a COW copy
>>> of the mapping it should not consume reservations made by the parent.
>>
>> The child process will not have HPAGE_RESV_OWNER set because at fork time, 
>> we do:
>>  /*
>>   * Clear hugetlb-related page reserves for children. This only
>>   * affects MAP_PRIVATE mappings. Faults generated by the child
>>   * are not guaranteed to succeed, even if read-only
>>   */
>>  if (is_vm_hugetlb_page(tmp))
>>  reset_vma_resv_huge_pages(tmp);
>> i.e. we have vma->vm_private_data = (void *)0; for child process and 
>> vma_resv_map() will
>> return NULL in this case.
>> Or am I missed something?
>>
>>> Only the parent (HPAGE_RESV_OWNER) is allowed to consume the
>>> reservations.
>>> Hope that makens sense?
>>>
>>>>
>>>> Signed-off-by: Miaohe Lin 
>>>> ---
>>>>  mm/hugetlb.c | 2 +-
>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>
>>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>>> index a03a50b7c410..b7864abded3d 100644
>>>> --- a/mm/hugetlb.c
>>>> +++ b/mm/hugetlb.c
>>>> @@ -2183,7 +2183,7 @@ static long __vma_reservation_common(struct hstate 
>>>> *h,
>>>>return 1;
>>>>}
>>>>else
>>>
>>> This else also handles the case !HPAGE_RESV_OWNER.  In this case, we
>>
>> IMO, for the case !HPAGE_RESV_OWNER, we won't reach here. What do you think?
>>
> 
> I think you are correct.
> 
> However, if this is true we should be able to simply the code even
> further.  There is no need to check for HPAGE_RESV_OWNER because we know
> it must be set.  Correct?  If so, the code could look something like:
> 
>   if (vma->vm_flags & VM_MAYSHARE)
>   return ret;
> 
>   /* We know private mapping with HPAGE_RESV_OWNER */
>* ...   *
>* Add that existing comment */
> 
>   if (ret > 0)
>   return 0;
>   if (ret == 0)
>   return 1;
>   return ret;
> 

Many thanks for the good suggestion! Is this what you mean?

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a03a50b7c410..9b4c05699a90 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2163,27 +2163,26 @@ static long __vma_reservation_common(struct hstate *h,

if (vma->vm_flags & VM_MAYSHARE)
return ret;
-   else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-   /*
-* In most cases, reserves always exist for private mappings.
-* However, a file associated with mapping could have been
-* hole punched or truncated after reserves were consumed.
-* As subsequent fault on such a range will not use reserves.
-* Subtle - The reserve map for private mappings has the
-* opposite meaning than that of shared mappings.  If NO
-* entry is in the reserve map, it means a reservation exists.
-* If an entry exists in the reserve map, it means the
-* reservation has already been consumed.  As a result, the
-* return value of this routine is the opposite of the
-* value returned from reserve map manipulation routines above.
-*/
-   if (ret)
-   return 0;
-   else
-

Re: [PATCH 2/4] mm/hugeltb: simplify the return code of __vma_reservation_common()

2021-04-06 Thread Miaohe Lin
Hi:
On 2021/4/7 8:53, Mike Kravetz wrote:
> On 4/2/21 2:32 AM, Miaohe Lin wrote:
>> It's guaranteed that the vma is associated with a resv_map, i.e. either
>> VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here or we would
>> have returned via !resv check above. So ret must be less than 0 in the
>> 'else' case. Simplify the return code to make this clear.
> 
> I believe we still neeed that ternary operator in the return statement.
> Why?
> 
> There are two basic types of mappings to be concerned with:
> shared and private.
> For private mappings, a task can 'own' the mapping as indicated by
> HPAGE_RESV_OWNER.  Or, it may not own the mapping.  The most common way
> to create a non-owner private mapping is to have a task with a private
> mapping fork.  The parent process will have HPAGE_RESV_OWNER set, the
> child process will not.  The idea is that since the child has a COW copy
> of the mapping it should not consume reservations made by the parent.

The child process will not have HPAGE_RESV_OWNER set because at fork time, we 
do:
/*
 * Clear hugetlb-related page reserves for children. This only
 * affects MAP_PRIVATE mappings. Faults generated by the child
 * are not guaranteed to succeed, even if read-only
 */
if (is_vm_hugetlb_page(tmp))
reset_vma_resv_huge_pages(tmp);
i.e. we have vma->vm_private_data = (void *)0; for the child process, and
vma_resv_map() will return NULL in this case.
Or am I missing something?

> Only the parent (HPAGE_RESV_OWNER) is allowed to consume the
> reservations.
> Hope that makens sense?
> 
>>
>> Signed-off-by: Miaohe Lin 
>> ---
>>  mm/hugetlb.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>> index a03a50b7c410..b7864abded3d 100644
>> --- a/mm/hugetlb.c
>> +++ b/mm/hugetlb.c
>> @@ -2183,7 +2183,7 @@ static long __vma_reservation_common(struct hstate *h,
>>  return 1;
>>  }
>>  else
> 
> This else also handles the case !HPAGE_RESV_OWNER.  In this case, we

IMO, for the case !HPAGE_RESV_OWNER, we won't reach here. What do you think?

> never want to indicate reservations are available.  The ternary makes
> sure a positive value is never returned.
> 

Many thanks for review and reply! :)


[PATCH] mm/frontswap: fix frontswap_register_ops() race with swapon and swapoff

2021-04-05 Thread Miaohe Lin
frontswap_register_ops can race with swapon. Consider the following scene:

CPU1                                            CPU2

frontswap_register_ops
  fill bitmap a
  ops->init
                                                sys_swapon
                                                  enable_swap_info
                                                    frontswap_init without new ops
  add ops to frontswap_ops list
  check if swap_active_head changed
                                                    add to swap_active_head

So the frontswap_ops init is missed on the new swap device. Consider another
scene:
CPU1                                            CPU2

frontswap_register_ops
  fill bitmap a
  ops->init
  add ops to frontswap_ops list
                                                sys_swapon
                                                  enable_swap_info
                                                    frontswap_init with new ops
                                                    add to swap_active_head
  check if swap_active_head changed
  ops->init for new swap device [twice!]

The frontswap_ops init will be called twice on the new swap device this time.
frontswap_register_ops can also race with swapoff. Consider the following
scene:

CPU1                                            CPU2

                                                sys_swapoff
                                                  removed from swap_active_head
frontswap_register_ops
  fill bitmap a
  ops->init without swap device
  add ops to frontswap_ops list
                                                  invalidate_area with new ops
  check if swap_active_head changed

We could call invalidate_area on a swap device under swapoff while frontswap
is still uninitialized for it. Fix all these by using swapon_mutex to guard
against races with swapon, and add swap_info_get_if_under_swapoff() to collect
swap devices under swapoff.

Fixes: d1dc6f1bcf1e ("frontswap: allow multiple backends")
Signed-off-by: Miaohe Lin 
---
 include/linux/swapfile.h |  2 ++
 mm/frontswap.c   | 40 +---
 mm/swapfile.c| 13 -
 3 files changed, 31 insertions(+), 24 deletions(-)

diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e06febf62978..7ae15d917828 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -9,8 +9,10 @@
 extern spinlock_t swap_lock;
 extern struct plist_head swap_active_head;
 extern struct swap_info_struct *swap_info[];
+extern struct mutex swapon_mutex;
 extern int try_to_unuse(unsigned int, bool, unsigned long);
 extern unsigned long generic_max_swapfile_size(void);
 extern unsigned long max_swapfile_size(void);
+extern struct swap_info_struct *swap_info_get_if_under_swapoff(int type);
 
 #endif /* _LINUX_SWAPFILE_H */
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 130e301c5ac0..c16bfc7550b5 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -123,12 +123,26 @@ void frontswap_register_ops(struct frontswap_ops *ops)
 
bitmap_zero(a, MAX_SWAPFILES);
bitmap_zero(b, MAX_SWAPFILES);
-
+   mutex_lock(&swapon_mutex);
spin_lock(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list) {
if (!WARN_ON(!si->frontswap_map))
set_bit(si->type, a);
}
+   /*
+* There might be some swap devices under swapoff, i.e. they are
+* removed from swap_active_head but frontswap_invalidate_area()
+* is not called yet because swapon_mutex is held here. We must
+* collect these swap devices and call ops->init on them or they
+* might invalidate frontswap area while frontswap is uninitialized.
+*/
+   for_each_clear_bit(i, a, MAX_SWAPFILES) {
+   si = swap_info_get_if_under_swapoff(i);
+   if (!si || !si->frontswap_map)
+   continue;
+   set_bit(si->type, b);
+   }
+   bitmap_or(a, a, b, MAX_SWAPFILES);
spin_unlock(&swap_lock);
 
/* the new ops needs to know the currently active swap devices */
@@ -144,29 +158,9 @@ void frontswap_register_ops(struct frontswap_ops *ops)
ops->next = frontswap_ops;
} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
 
-   static_branch_inc(&frontswap_enabled_key);
-
-   spin_lock(&swap_lock);
-   plist_for_each_entry(si, &swap_active_head, list) {
-   if (si->frontswap_map)
-   set_bit(si->type, b);
-   }
-   spin_unlock(&swap_lock);
+   mutex_unlock(&swapon_mutex);
 
-   /*
-* On the very unlikely chance that a swap device was added or
-* removed between setting the "a" list bits and the ops init
-* calls, we re-check and do init or invalidate for any changed
-* bits.
-*/
-   if (unlikely(!bitmap_equal(a, b, MAX_S

[PATCH 3/4] mm/hugeltb: fix potential wrong gbl_reserve value for hugetlb_acct_memory()

2021-04-02 Thread Miaohe Lin
The resv_map could be NULL since this routine can be called in the evict
inode path for all hugetlbfs inodes. So we could have chg = 0, and chg - freed
would then be a negative value. This is unexpected for
hugepage_subpool_put_pages() and hugetlb_acct_memory().

Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b7864abded3d..bdff8d23803f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5413,6 +5413,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long 
start, long end,
long chg = 0;
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+   long delta;
 
/*
 * Since this routine can be called in the evict inode path for all
@@ -5437,7 +5438,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long 
start, long end,
 * If the subpool has a minimum size, the number of global
 * reservations to be released may be adjusted.
 */
-   gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+   delta = chg > 0 ? chg - freed : freed;
+   gbl_reserve = hugepage_subpool_put_pages(spool, delta);
hugetlb_acct_memory(h, -gbl_reserve);
 
return 0;
-- 
2.19.1



[PATCH 4/4] mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts()

2021-04-02 Thread Miaohe Lin
A rare out of memory error would prevent removal of the reserve map region
for a page. hugetlb_fix_reserve_counts() handles this rare case to avoid
being left with incorrect counts. Unfortunately, hugepage_subpool_get_pages()
and hugetlb_acct_memory() could possibly fail too. We should handle these
cases correctly.

Fixes: b5cec28d36f5 ("hugetlbfs: truncate_hugepages() takes a range of pages")
Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bdff8d23803f..ca5464ed04b7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -745,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
 {
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
+   bool reserved = false;
 
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-   if (rsv_adjust) {
+   if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
 
-   hugetlb_acct_memory(h, 1);
+   if (!hugetlb_acct_memory(h, 1))
+   reserved = true;
+   } else if (!rsv_adjust) {
+   reserved = true;
}
+
+   if (!reserved)
+   pr_warn("hugetlb: fix reserve count failed\n");
 }
 
 /*
-- 
2.19.1



[PATCH 1/4] mm/hugeltb: remove redundant VM_BUG_ON() in region_add()

2021-04-02 Thread Miaohe Lin
The same VM_BUG_ON() check is already done in the callee. Remove this extra
one to simplify the code slightly.

Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c22111f3da20..a03a50b7c410 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -556,7 +556,6 @@ static long region_add(struct resv_map *resv, long f, long 
t,
resv->adds_in_progress -= in_regions_needed;
 
spin_unlock(&resv->lock);
-   VM_BUG_ON(add < 0);
return add;
 }
 
-- 
2.19.1



[PATCH 2/4] mm/hugeltb: simplify the return code of __vma_reservation_common()

2021-04-02 Thread Miaohe Lin
It's guaranteed that the vma is associated with a resv_map, i.e. either
VM_MAYSHARE or HPAGE_RESV_OWNER, when the code reaches here, or we would
have returned via the !resv check above. So ret must be less than 0 in the
'else' case. Simplify the return code to make this clear.

Signed-off-by: Miaohe Lin 
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a03a50b7c410..b7864abded3d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2183,7 +2183,7 @@ static long __vma_reservation_common(struct hstate *h,
return 1;
}
else
-   return ret < 0 ? ret : 0;
+   return ret;
 }
 
 static long vma_needs_reservation(struct hstate *h,
-- 
2.19.1



[PATCH 0/4] Cleanup and fixup for hugetlb

2021-04-02 Thread Miaohe Lin
Hi all,
This series contains cleanups to remove a redundant VM_BUG_ON() and
simplify the return code. Also this fixes a potential wrong gbl_reserve
value and handles the error case in hugetlb_fix_reserve_counts(). More
details can be found in the respective changelogs. Thanks!

Miaohe Lin (4):
  mm/hugeltb: remove redundant VM_BUG_ON() in region_add()
  mm/hugeltb: simplify the return code of __vma_reservation_common()
  mm/hugeltb: fix potential wrong gbl_reserve value for
hugetlb_acct_memory()
  mm/hugeltb: handle the error case in hugetlb_fix_reserve_counts()

 mm/hugetlb.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

-- 
2.19.1



Re: [PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Miaohe Lin
On 2021/4/1 11:35, Roman Gushchin wrote:
> On Thu, Apr 01, 2021 at 11:31:16AM +0800, Miaohe Lin wrote:
>> On 2021/4/1 11:01, Muchun Song wrote:
>>> Christian Borntraeger reported a warning about "percpu ref
>>> (obj_cgroup_release) <= 0 (-1) after switching to atomic".
>>> Because we forgot to obtain the reference to the objcg and
>>> wrongly obtain the reference of memcg.
>>>
>>> Reported-by: Christian Borntraeger 
>>> Signed-off-by: Muchun Song 
>>
>> Thanks for the patch.
>> Is a Fixes tag needed?
> 
> No, as the original patch hasn't been merged into the Linus's tree yet.
> So the fix can be simply squashed.
> 
> Btw, the fix looks good to me.
> 
> Acked-by: Roman Gushchin 
> 

I see. Many thanks for explanation!

The code looks good to me.
Reviewed-by: Miaohe Lin 

>>
>>> ---
>>>  include/linux/memcontrol.h | 6 ++
>>>  mm/memcontrol.c| 6 +-
>>>  2 files changed, 11 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index 0e8907957227..c960fd49c3e8 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -804,6 +804,12 @@ static inline void obj_cgroup_get(struct obj_cgroup 
>>> *objcg)
>>> percpu_ref_get(>refcnt);
>>>  }
>>>  
>>> +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
>>> +  unsigned long nr)
>>> +{
>>> +   percpu_ref_get_many(>refcnt, nr);
>>> +}
>>> +
>>>  static inline void obj_cgroup_put(struct obj_cgroup *objcg)
>>>  {
>>> percpu_ref_put(>refcnt);
>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>> index c0b83a396299..64ada9e650a5 100644
>>> --- a/mm/memcontrol.c
>>> +++ b/mm/memcontrol.c
>>> @@ -3133,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned 
>>> int nr)
>>>  
>>> for (i = 1; i < nr; i++)
>>> head[i].memcg_data = head->memcg_data;
>>> -   css_get_many(>css, nr - 1);
>>> +
>>> +   if (PageMemcgKmem(head))
>>> +   obj_cgroup_get_many(__page_objcg(head), nr - 1);
>>> +   else
>>> +   css_get_many(>css, nr - 1);
>>>  }
>>>  
>>>  #ifdef CONFIG_MEMCG_SWAP
>>>
>>
> .
> 



Re: [PATCH] mm: memcontrol: fix forget to obtain the ref to objcg in split_page_memcg

2021-03-31 Thread Miaohe Lin
On 2021/4/1 11:01, Muchun Song wrote:
> Christian Borntraeger reported a warning about "percpu ref
> (obj_cgroup_release) <= 0 (-1) after switching to atomic".
> Because we forgot to obtain the reference to the objcg and
> wrongly obtain the reference of memcg.
> 
> Reported-by: Christian Borntraeger 
> Signed-off-by: Muchun Song 

Thanks for the patch.
Is a Fixes tag needed?

> ---
>  include/linux/memcontrol.h | 6 ++
>  mm/memcontrol.c| 6 +-
>  2 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 0e8907957227..c960fd49c3e8 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -804,6 +804,12 @@ static inline void obj_cgroup_get(struct obj_cgroup 
> *objcg)
>   percpu_ref_get(>refcnt);
>  }
>  
> +static inline void obj_cgroup_get_many(struct obj_cgroup *objcg,
> +unsigned long nr)
> +{
> + percpu_ref_get_many(>refcnt, nr);
> +}
> +
>  static inline void obj_cgroup_put(struct obj_cgroup *objcg)
>  {
>   percpu_ref_put(>refcnt);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index c0b83a396299..64ada9e650a5 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -3133,7 +3133,11 @@ void split_page_memcg(struct page *head, unsigned int 
> nr)
>  
>   for (i = 1; i < nr; i++)
>   head[i].memcg_data = head->memcg_data;
> - css_get_many(>css, nr - 1);
> +
> + if (PageMemcgKmem(head))
> + obj_cgroup_get_many(__page_objcg(head), nr - 1);
> + else
> + css_get_many(>css, nr - 1);
>  }
>  
>  #ifdef CONFIG_MEMCG_SWAP
> 



[PATCH 3/4] ksm: remove dedicated macro KSM_FLAG_MASK

2021-03-30 Thread Miaohe Lin
The macro KSM_FLAG_MASK is used in rmap_walk_ksm() only. So we can replace
~KSM_FLAG_MASK with PAGE_MASK to remove this dedicated macro and make code
more consistent because PAGE_MASK is used elsewhere in this file.

Signed-off-by: Miaohe Lin 
---
 mm/ksm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 5650a282da3e..f9ca6d94bf7f 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -215,8 +215,6 @@ struct rmap_item {
 #define SEQNR_MASK 0x0ff   /* low bits of unstable tree seqnr */
 #define UNSTABLE_FLAG  0x100   /* is a node of the unstable tree */
 #define STABLE_FLAG0x200   /* is listed from the stable tree */
-#define KSM_FLAG_MASK  (SEQNR_MASK|UNSTABLE_FLAG|STABLE_FLAG)
-   /* to mask all the flags */
 
 /* The stable and unstable tree heads */
 static struct rb_root one_stable_tree[1] = { RB_ROOT };
@@ -2631,7 +2629,7 @@ void rmap_walk_ksm(struct page *page, struct 
rmap_walk_control *rwc)
vma = vmac->vma;
 
/* Ignore the stable/unstable/sqnr flags */
-   addr = rmap_item->address & ~KSM_FLAG_MASK;
+   addr = rmap_item->address & PAGE_MASK;
 
if (addr < vma->vm_start || addr >= vma->vm_end)
continue;
-- 
2.19.1



[PATCH 1/4] ksm: remove redundant VM_BUG_ON_PAGE() on stable_tree_search()

2021-03-30 Thread Miaohe Lin
The same VM_BUG_ON_PAGE() check is already done in the callee. Remove the
extra ones in the caller to simplify the code slightly.

Signed-off-by: Miaohe Lin 
---
 mm/ksm.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 1f2c62e1d797..359afb3023b4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1771,7 +1771,6 @@ static struct page *stable_tree_search(struct page *page)
 * stable_node_dup is the dup to replace.
 */
if (stable_node_dup == stable_node) {
-   VM_BUG_ON(is_stable_node_chain(stable_node_dup));
VM_BUG_ON(is_stable_node_dup(stable_node_dup));
/* chain is missing so create it */
stable_node = alloc_stable_node_chain(stable_node_dup,
@@ -1785,7 +1784,6 @@ static struct page *stable_tree_search(struct page *page)
 * of the current nid for this page
 * content.
 */
-   VM_BUG_ON(!is_stable_node_chain(stable_node));
VM_BUG_ON(!is_stable_node_dup(stable_node_dup));
VM_BUG_ON(page_node->head != _nodes);
list_del(_node->list);
-- 
2.19.1



[PATCH 4/4] ksm: fix potential missing rmap_item for stable_node

2021-03-30 Thread Miaohe Lin
When an rmap_item is removed from the stable tree, the STABLE_FLAG of the
rmap_item is cleared but rmap_item->head is left untouched. So the following
scenario might happen:

For the ksm page with rmap_item1:
cmp_and_merge_page
  stable_node->head = &migrate_nodes;
  remove_rmap_item_from_tree, but head still equals stable_node;
  try_to_merge_with_ksm_page failed;
  return;

For the same ksm page with rmap_item2, stable node migration succeeds this
time. stable_node->head no longer equals &migrate_nodes.

For the ksm page with rmap_item1 again:
cmp_and_merge_page
  stable_node->head != &migrate_nodes && rmap_item->head == stable_node
  return;

We would miss the rmap_item for the stable_node, which might result in a
failed rmap_walk_ksm(). Fix this by setting rmap_item->head to NULL when the
rmap_item is removed from the stable tree.

Fixes: 4146d2d673e8 ("ksm: make !merge_across_nodes migration safe")
Signed-off-by: Miaohe Lin 
---
 mm/ksm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/ksm.c b/mm/ksm.c
index f9ca6d94bf7f..8609c67f04a2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -791,6 +791,7 @@ static void remove_rmap_item_from_tree(struct rmap_item 
*rmap_item)
stable_node->rmap_hlist_len--;
 
put_anon_vma(rmap_item->anon_vma);
+   rmap_item->head = NULL;
rmap_item->address &= PAGE_MASK;
 
} else if (rmap_item->address & UNSTABLE_FLAG) {
-- 
2.19.1



[PATCH 2/4] ksm: use GET_KSM_PAGE_NOLOCK to get ksm page in remove_rmap_item_from_tree()

2021-03-30 Thread Miaohe Lin
It's unnecessary to lock the page when getting the ksm page if we're going to
remove the rmap_item, as page migration is irrelevant in this case. Use
GET_KSM_PAGE_NOLOCK instead to save some page lock cycles.

Signed-off-by: Miaohe Lin 
---
 mm/ksm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index 359afb3023b4..5650a282da3e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -778,12 +778,11 @@ static void remove_rmap_item_from_tree(struct rmap_item 
*rmap_item)
struct page *page;
 
stable_node = rmap_item->head;
-   page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
+   page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
if (!page)
goto out;
 
hlist_del(&rmap_item->hlist);
-   unlock_page(page);
put_page(page);
 
if (!hlist_empty(&stable_node->hlist))
-- 
2.19.1



[PATCH 0/4] Cleanup and fixup for ksm

2021-03-30 Thread Miaohe Lin
Hi all,
This series contains cleanups to remove an unnecessary VM_BUG_ON_PAGE and the
dedicated macro KSM_FLAG_MASK. Also this fixes a potential missing rmap_item
for a stable_node, which would result in a failed rmap_walk_ksm(). More details
can be found in the respective changelogs. Thanks!

Miaohe Lin (4):
  ksm: remove redundant VM_BUG_ON_PAGE() on stable_tree_search()
  ksm: use GET_KSM_PAGE_NOLOCK to get ksm page in
remove_rmap_item_from_tree()
  ksm: remove dedicated macro KSM_FLAG_MASK
  ksm: fix potential missing rmap_item for stable_node

 mm/ksm.c | 10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

-- 
2.19.1



Re: [Question] Is there a race window between swapoff vs synchronous swap_readpage

2021-03-30 Thread Miaohe Lin
On 2021/3/30 11:44, Huang, Ying wrote:
> Miaohe Lin  writes:
> 
>> On 2021/3/30 9:57, Huang, Ying wrote:
>>> Hi, Miaohe,
>>>
>>> Miaohe Lin  writes:
>>>
>>>> Hi all,
>>>> I am investigating the swap code, and I found the below possible race 
>>>> window:
>>>>
>>>> CPU 1  CPU 2
>>>> -  -
>>>> do_swap_page
>>>>   skip swapcache case (synchronous swap_readpage)
>>>> alloc_page_vma
>>>>swapoff
>>>>  release swap_file, 
>>>> bdev, or ...
>>>>   swap_readpage
>>>>check sis->flags is ok
>>>>  access swap_file, bdev or ...[oops!]
>>>>si->flags = 0
>>>>
>>>> The swapcache case is ok because swapoff will wait on the page_lock of 
>>>> swapcache page.
>>>> Is this will really happen or Am I miss something ?
>>>> Any reply would be really grateful. Thanks! :)
>>>
>>> This appears possible.  Even for swapcache case, we can't guarantee the
>>
>> Many thanks for reply!
>>
>>> swap entry gotten from the page table is always valid too.  The
>>
>> The page table may change at any time. And we may thus do some useless work.
>> But the pte_same() check could handle these races correctly if these do not
>> result in oops.
>>
>>> underlying swap device can be swapped off at the same time.  So we use
>>> get/put_swap_device() for that.  Maybe we need similar stuff here.
>>
>> Using get/put_swap_device() to guard against swapoff for swap_readpage() 
>> sounds
>> really bad as swap_readpage() may take really long time. Also such race may 
>> not be
>> really hurtful because swapoff is usually done when system shutdown only.
>> I can not figure some simple and stable stuff out to fix this. Any 
>> suggestions or
>> could anyone help get rid of such race?
> 
> Some reference counting on the swap device can prevent swap device from
> swapping-off.  To reduce the performance overhead on the hot-path as
> much as possible, it appears we can use the percpu_ref.
> 

Sounds like a good idea. Many thanks for your suggestion. :)
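
A minimal sketch of what a percpu_ref based guard could look like; the "users"
member and both helper names below are assumptions made for illustration, not
existing kernel interfaces:

/*
 * Illustrative sketch only.  Assume struct swap_info_struct gained a
 * "struct percpu_ref users" member that swapon initializes and swapoff
 * kills before releasing swap_file/bdev.
 */
static bool get_swap_device_ref(struct swap_info_struct *si)
{
	/* Fails once swapoff has started killing the reference. */
	return percpu_ref_tryget_live(&si->users);
}

static void put_swap_device_ref(struct swap_info_struct *si)
{
	percpu_ref_put(&si->users);
}

/*
 * do_swap_page() would wrap the synchronous swap_readpage() in get/put,
 * while swapoff would percpu_ref_kill(&si->users) and wait for the ref
 * to drain before releasing swap_file/bdev.
 */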

> Best Regards,
> Huang, Ying
> .
> 



Re: [Question] Is there a race window between swapoff vs synchronous swap_readpage

2021-03-29 Thread Miaohe Lin
On 2021/3/30 9:57, Huang, Ying wrote:
> Hi, Miaohe,
> 
> Miaohe Lin  writes:
> 
>> Hi all,
>> I am investigating the swap code, and I found the below possible race window:
>>
>> CPU 1CPU 2
>> --
>> do_swap_page
>>   skip swapcache case (synchronous swap_readpage)
>> alloc_page_vma
>>  swapoff
>>release swap_file, 
>> bdev, or ...
>>   swap_readpage
>>  check sis->flags is ok
>>access swap_file, bdev or ...[oops!]
>>  si->flags = 0
>>
>> The swapcache case is ok because swapoff will wait on the page_lock of 
>> swapcache page.
>> Is this will really happen or Am I miss something ?
>> Any reply would be really grateful. Thanks! :)
> 
> This appears possible.  Even for swapcache case, we can't guarantee the

Many thanks for reply!

> swap entry gotten from the page table is always valid too.  The

The page table may change at any time, and we may thus do some useless work.
But the pte_same() check can handle these races correctly as long as they do
not result in an oops.
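
For reference, the recheck pattern referred to above looks roughly like this
in do_swap_page() (a simplified excerpt, quoted from memory rather than from a
specific tree):

	/*
	 * After the page has been read in, retake the PTE lock and bail
	 * out if the PTE changed underneath us.
	 */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
				       &vmf->ptl);
	if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
		goto out_nomap;		/* lost the race; undo and bail */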

> underlying swap device can be swapped off at the same time.  So we use
> get/put_swap_device() for that.  Maybe we need similar stuff here.

Using get/put_swap_device() to guard against swapoff for swap_readpage() sounds
really bad, as swap_readpage() may take a really long time. Also, such a race
may not be really hurtful because swapoff is usually only done at system
shutdown. I cannot figure out a simple and stable fix for this. Any
suggestions, or could anyone help get rid of such a race?

Anyway, thanks again!

> 
> Best Regards,
> Huang, Ying
> .
> 



Re: [PATCH v2 5/8] hugetlb: call update_and_free_page without hugetlb_lock

2021-03-29 Thread Miaohe Lin
On 2021/3/30 7:23, Mike Kravetz wrote:
> With the introduction of remove_hugetlb_page(), there is no need for
> update_and_free_page to hold the hugetlb lock.  Change all callers to
> drop the lock before calling.
> 
> With additional code modifications, this will allow loops which decrease
> the huge page pool to drop the hugetlb_lock with each page to reduce
> long hold times.
> 
> The ugly unlock/lock cycle in free_pool_huge_page will be removed in
> a subsequent patch which restructures free_pool_huge_page.
> 
> Signed-off-by: Mike Kravetz 
> Acked-by: Michal Hocko 
> Reviewed-by: Muchun Song 

Looks good to me. Thanks!
Reviewed-by: Miaohe Lin 

> ---
>  mm/hugetlb.c | 32 +++-
>  1 file changed, 27 insertions(+), 5 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 16beae49..dec7bd0dc63d 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1451,16 +1451,18 @@ static void __free_huge_page(struct page *page)
>  
>   if (HPageTemporary(page)) {
>   remove_hugetlb_page(h, page, false);
> + spin_unlock(_lock);
>   update_and_free_page(h, page);
>   } else if (h->surplus_huge_pages_node[nid]) {
>   /* remove the page from active list */
>   remove_hugetlb_page(h, page, true);
> + spin_unlock(_lock);
>   update_and_free_page(h, page);
>   } else {
>   arch_clear_hugepage_flags(page);
>   enqueue_huge_page(h, page);
> + spin_unlock(_lock);
>   }
> - spin_unlock(_lock);
>  }
>  
>  /*
> @@ -1741,7 +1743,13 @@ static int free_pool_huge_page(struct hstate *h, 
> nodemask_t *nodes_allowed,
>   list_entry(h->hugepage_freelists[node].next,
> struct page, lru);
>   remove_hugetlb_page(h, page, acct_surplus);
> + /*
> +  * unlock/lock around update_and_free_page is temporary
> +  * and will be removed with subsequent patch.
> +  */
> + spin_unlock(_lock);
>   update_and_free_page(h, page);
> + spin_lock(_lock);
>   ret = 1;
>   break;
>   }
> @@ -1810,8 +1818,9 @@ int dissolve_free_huge_page(struct page *page)
>   }
>   remove_hugetlb_page(h, page, false);
>   h->max_huge_pages--;
> + spin_unlock(_lock);
>   update_and_free_page(h, head);
> - rc = 0;
> + return 0;
>   }
>  out:
>   spin_unlock(_lock);
> @@ -2674,22 +2683,35 @@ static void try_to_free_low(struct hstate *h, 
> unsigned long count,
>   nodemask_t *nodes_allowed)
>  {
>   int i;
> + struct page *page, *next;
> + LIST_HEAD(page_list);
>  
>   if (hstate_is_gigantic(h))
>   return;
>  
> + /*
> +  * Collect pages to be freed on a list, and free after dropping lock
> +  */
> + INIT_LIST_HEAD(_list);
>   for_each_node_mask(i, *nodes_allowed) {
> - struct page *page, *next;
>   struct list_head *freel = >hugepage_freelists[i];
>   list_for_each_entry_safe(page, next, freel, lru) {
>   if (count >= h->nr_huge_pages)
> - return;
> + goto out;
>   if (PageHighMem(page))
>   continue;
>   remove_hugetlb_page(h, page, false);
> - update_and_free_page(h, page);
> + list_add(>lru, _list);
>   }
>   }
> +
> +out:
> + spin_unlock(_lock);
> + list_for_each_entry_safe(page, next, _list, lru) {
> + update_and_free_page(h, page);
> + cond_resched();
> + }
> + spin_lock(_lock);
>  }
>  #else
>  static inline void try_to_free_low(struct hstate *h, unsigned long count,
> 



[Question] Is there a race window between swapoff vs synchronous swap_readpage

2021-03-29 Thread Miaohe Lin
Hi all,
I am investigating the swap code, and I found the below possible race window:

CPU 1                                           CPU 2
-----                                           -----
do_swap_page
  skip swapcache case (synchronous swap_readpage)
  alloc_page_vma
                                                swapoff
                                                  release swap_file, bdev, or ...
  swap_readpage
    check sis->flags is ok
    access swap_file, bdev or ... [oops!]
                                                  si->flags = 0

The swapcache case is ok because swapoff will wait on the page_lock of the
swapcache page.
Will this really happen, or am I missing something?
Any reply would be greatly appreciated. Thanks! :)


Re: [PATCH 4/8] hugetlb: create remove_hugetlb_page() to separate functionality

2021-03-26 Thread Miaohe Lin
On 2021/3/27 3:57, Mike Kravetz wrote:
> On 3/25/21 7:10 PM, Miaohe Lin wrote:
>> On 2021/3/25 8:28, Mike Kravetz wrote:
>>> The new remove_hugetlb_page() routine is designed to remove a hugetlb
>>> page from hugetlbfs processing.  It will remove the page from the active
>>> or free list, update global counters and set the compound page
>>> destructor to NULL so that PageHuge() will return false for the 'page'.
>>> After this call, the 'page' can be treated as a normal compound page or
>>> a collection of base size pages.
>>>
>>> remove_hugetlb_page is to be called with the hugetlb_lock held.
>>>
>>> Creating this routine and separating functionality is in preparation for
>>> restructuring code to reduce lock hold times.
>>>
>>> Signed-off-by: Mike Kravetz 
>>> ---
>>>  mm/hugetlb.c | 70 +---
>>>  1 file changed, 45 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
>>> index 404b0b1c5258..3938ec086b5c 100644
>>> --- a/mm/hugetlb.c
>>> +++ b/mm/hugetlb.c
>>> @@ -1327,6 +1327,46 @@ static inline void 
>>> destroy_compound_gigantic_page(struct page *page,
>>> unsigned int order) { }
>>>  #endif
>>>  
>>> +/*
>>> + * Remove hugetlb page from lists, and update dtor so that page appears
>>> + * as just a compound page.  A reference is held on the page.
>>> + * NOTE: hugetlb specific page flags stored in page->private are not
>>> + *  automatically cleared.  These flags may be used in routines
>>> + *  which operate on the resulting compound page.
>>
>> It seems HPageFreed and HPageTemporary is cleared. Which hugetlb specific 
>> page flags
>> is reserverd here and why? Could you please give a simple example to clarify
>> this in the comment to help understand this NOTE?
>>
> 
> I will remove that NOTE: in the comment to avoid any confusion.
> 
> The NOTE was add in the RFC that contained a separate patch to add a flag
> that tracked huge pages allocated from CMA.  That flag needed to remain
> for subsequent freeing of such pages.  This is no longer needed.
> 

Many thanks for the explanation. I was confused about that NOTE. :)

>> The code looks good to me. Many thanks!
>> Reviewed-by: Miaohe Lin 
> 
> Thanks,
> 



Re: [PATCH 8/8] hugetlb: add lockdep_assert_held() calls for hugetlb_lock

2021-03-25 Thread Miaohe Lin
On 2021/3/25 8:28, Mike Kravetz wrote:
> After making hugetlb lock irq safe and separating some functionality
> done under the lock, add some lockdep_assert_held to help verify
> locking.
> 

Looks good to me. Thanks.
Reviewed-by: Miaohe Lin 

> Signed-off-by: Mike Kravetz 
> ---
>  mm/hugetlb.c | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e4c441b878f2..de5b3cf4a155 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1062,6 +1062,8 @@ static bool vma_has_reserves(struct vm_area_struct 
> *vma, long chg)
>  static void enqueue_huge_page(struct hstate *h, struct page *page)
>  {
>   int nid = page_to_nid(page);
> +
> + lockdep_assert_held(_lock);
>   list_move(>lru, >hugepage_freelists[nid]);
>   h->free_huge_pages++;
>   h->free_huge_pages_node[nid]++;
> @@ -1073,6 +1075,7 @@ static struct page *dequeue_huge_page_node_exact(struct 
> hstate *h, int nid)
>   struct page *page;
>   bool pin = !!(current->flags & PF_MEMALLOC_PIN);
>  
> + lockdep_assert_held(_lock);
>   list_for_each_entry(page, >hugepage_freelists[nid], lru) {
>   if (pin && !is_pinnable_page(page))
>   continue;
> @@ -1345,6 +1348,7 @@ static void remove_hugetlb_page(struct hstate *h, 
> struct page *page,
>  {
>   int nid = page_to_nid(page);
>  
> + lockdep_assert_held(_lock);
>   if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
>   return;
>  
> @@ -1690,6 +1694,7 @@ static struct page *remove_pool_huge_page(struct hstate 
> *h,
>   int nr_nodes, node;
>   struct page *page = NULL;
>  
> + lockdep_assert_held(_lock);
>   for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
>   /*
>* If we're returning unused surplus pages, only examine
> @@ -1939,6 +1944,7 @@ static int gather_surplus_pages(struct hstate *h, long 
> delta)
>   long needed, allocated;
>   bool alloc_ok = true;
>  
> + lockdep_assert_held(_lock);
>   needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
>   if (needed <= 0) {
>   h->resv_huge_pages += delta;
> @@ -2032,6 +2038,7 @@ static void return_unused_surplus_pages(struct hstate 
> *h,
>   struct page *page, *t_page;
>   struct list_head page_list;
>  
> + lockdep_assert_held(_lock);
>   /* Uncommit the reservation */
>   h->resv_huge_pages -= unused_resv_pages;
>  
> @@ -2527,6 +2534,7 @@ static void try_to_free_low(struct hstate *h, unsigned 
> long count,
>   struct list_head page_list;
>   struct page *page, *next;
>  
> + lockdep_assert_held(_lock);
>   if (hstate_is_gigantic(h))
>   return;
>  
> @@ -2573,6 +2581,7 @@ static int adjust_pool_surplus(struct hstate *h, 
> nodemask_t *nodes_allowed,
>  {
>   int nr_nodes, node;
>  
> + lockdep_assert_held(_lock);
>   VM_BUG_ON(delta != -1 && delta != 1);
>  
>   if (delta < 0) {
> 



Re: [PATCH 4/8] hugetlb: create remove_hugetlb_page() to separate functionality

2021-03-25 Thread Miaohe Lin
On 2021/3/25 8:28, Mike Kravetz wrote:
> The new remove_hugetlb_page() routine is designed to remove a hugetlb
> page from hugetlbfs processing.  It will remove the page from the active
> or free list, update global counters and set the compound page
> destructor to NULL so that PageHuge() will return false for the 'page'.
> After this call, the 'page' can be treated as a normal compound page or
> a collection of base size pages.
> 
> remove_hugetlb_page is to be called with the hugetlb_lock held.
> 
> Creating this routine and separating functionality is in preparation for
> restructuring code to reduce lock hold times.
> 
> Signed-off-by: Mike Kravetz 
> ---
>  mm/hugetlb.c | 70 +---
>  1 file changed, 45 insertions(+), 25 deletions(-)
> 
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 404b0b1c5258..3938ec086b5c 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -1327,6 +1327,46 @@ static inline void 
> destroy_compound_gigantic_page(struct page *page,
>   unsigned int order) { }
>  #endif
>  
> +/*
> + * Remove hugetlb page from lists, and update dtor so that page appears
> + * as just a compound page.  A reference is held on the page.
> + * NOTE: hugetlb specific page flags stored in page->private are not
> + *automatically cleared.  These flags may be used in routines
> + *which operate on the resulting compound page.

It seems HPageFreed and HPageTemporary are cleared. Which hugetlb-specific page
flags are preserved here and why? Could you please give a simple example in the
comment to clarify this and help understand the NOTE?

The code looks good to me. Many thanks!
Reviewed-by: Miaohe Lin 

> + *
> + * Must be called with hugetlb lock held.
> + */
> +static void remove_hugetlb_page(struct hstate *h, struct page *page,
> + bool adjust_surplus)
> +{
> + int nid = page_to_nid(page);
> +
> + if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
> + return;
> +
> + list_del(>lru);
> +
> + if (HPageFreed(page)) {
> + h->free_huge_pages--;
> + h->free_huge_pages_node[nid]--;
> + ClearHPageFreed(page);
> + }
> + if (adjust_surplus) {
> + h->surplus_huge_pages--;
> + h->surplus_huge_pages_node[nid]--;
> + }
> +
> + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
> + VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
> +
> + ClearHPageTemporary(page);
> + set_page_refcounted(page);
> + set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
> +
> + h->nr_huge_pages--;
> + h->nr_huge_pages_node[nid]--;
> +}
> +
>  static void update_and_free_page(struct hstate *h, struct page *page)
>  {
>   int i;
> @@ -1335,8 +1375,6 @@ static void update_and_free_page(struct hstate *h, 
> struct page *page)
>   if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
>   return;
>  
> - h->nr_huge_pages--;
> - h->nr_huge_pages_node[page_to_nid(page)]--;
>   for (i = 0; i < pages_per_huge_page(h);
>i++, subpage = mem_map_next(subpage, page, i)) {
>   subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
> @@ -1344,10 +1382,6 @@ static void update_and_free_page(struct hstate *h, 
> struct page *page)
>   1 << PG_active | 1 << PG_private |
>   1 << PG_writeback);
>   }
> - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
> - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
> - set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
> - set_page_refcounted(page);
>   if (hstate_is_gigantic(h)) {
>   destroy_compound_gigantic_page(page, huge_page_order(h));
>   free_gigantic_page(page, huge_page_order(h));
> @@ -1415,15 +1449,12 @@ static void __free_huge_page(struct page *page)
>   h->resv_huge_pages++;
>  
>   if (HPageTemporary(page)) {
> - list_del(>lru);
> - ClearHPageTemporary(page);
> + remove_hugetlb_page(h, page, false);
>   update_and_free_page(h, page);
>   } else if (h->surplus_huge_pages_node[nid]) {
>   /* remove the page from active list */
> - list_del(>lru);
> + remove_hugetlb_page(h, page, true);
>   update_and_free_page(h, page);
> - h->surplus_huge_pages--;
> - h->sur

Re: [PATCH 3/8] hugetlb: add per-hstate mutex to synchronize user adjustments

2021-03-25 Thread Miaohe Lin
On 2021/3/25 8:28, Mike Kravetz wrote:
> The helper routine hstate_next_node_to_alloc accesses and modifies the
> hstate variable next_nid_to_alloc.  The helper is used by the routines
> alloc_pool_huge_page and adjust_pool_surplus.  adjust_pool_surplus is
> called with hugetlb_lock held.  However, alloc_pool_huge_page can not
> be called with the hugetlb lock held as it will call the page allocator.
> Two instances of alloc_pool_huge_page could be run in parallel or
> alloc_pool_huge_page could run in parallel with adjust_pool_surplus
> which may result in the variable next_nid_to_alloc becoming invalid
> for the caller and pages being allocated on the wrong node.
> > Both alloc_pool_huge_page and adjust_pool_surplus are only called from
> the routine set_max_huge_pages after boot.  set_max_huge_pages is only
> called as the reusult of a user writing to the proc/sysfs nr_hugepages,
> or nr_hugepages_mempolicy file to adjust the number of hugetlb pages.
> 
> It makes little sense to allow multiple adjustment to the number of
> hugetlb pages in parallel.  Add a mutex to the hstate and use it to only
> allow one hugetlb page adjustment at a time.  This will synchronize
> modifications to the next_nid_to_alloc variable.
> 
> Signed-off-by: Mike Kravetz 
> ---
>  include/linux/hugetlb.h | 1 +
>  mm/hugetlb.c| 5 +
>  2 files changed, 6 insertions(+)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index a7f7d5f328dc..8817ec987d68 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -566,6 +566,7 @@ HPAGEFLAG(Freed, freed)
>  #define HSTATE_NAME_LEN 32
>  /* Defines one hugetlb page size */
>  struct hstate {
> + struct mutex mutex;

I am also with Michal and Oscar here on renaming the mutex to something closer
to its function.

Reviewed-by: Miaohe Lin 

>   int next_nid_to_alloc;
>   int next_nid_to_free;
>   unsigned int order;
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index f9ba63fc1747..404b0b1c5258 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -2616,6 +2616,8 @@ static int set_max_huge_pages(struct hstate *h, 
> unsigned long count, int nid,
>   else
>   return -ENOMEM;
>  
> + /* mutex prevents concurrent adjustments for the same hstate */
> + mutex_lock(>mutex);
>   spin_lock(_lock);
>  
>   /*
> @@ -2648,6 +2650,7 @@ static int set_max_huge_pages(struct hstate *h, 
> unsigned long count, int nid,
>   if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
>   if (count > persistent_huge_pages(h)) {
>   spin_unlock(_lock);
> + mutex_unlock(>mutex);
>   NODEMASK_FREE(node_alloc_noretry);
>   return -EINVAL;
>   }
> @@ -2722,6 +2725,7 @@ static int set_max_huge_pages(struct hstate *h, 
> unsigned long count, int nid,
>  out:
>   h->max_huge_pages = persistent_huge_pages(h);
>   spin_unlock(_lock);
> + mutex_unlock(>mutex);
>  
>   NODEMASK_FREE(node_alloc_noretry);
>  
> @@ -3209,6 +3213,7 @@ void __init hugetlb_add_hstate(unsigned int order)
>   BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
>   BUG_ON(order == 0);
>   h = [hugetlb_max_hstate++];
> + mutex_init(>mutex);
>   h->order = order;
>   h->mask = ~(huge_page_size(h) - 1);
>   for (i = 0; i < MAX_NUMNODES; ++i)
> 



Re: [PATCH 0/8] make hugetlb put_page safe for all calling contexts

2021-03-25 Thread Miaohe Lin
Hi:
On 2021/3/25 8:28, Mike Kravetz wrote:
> This effort is the result a recent bug report [1].  In subsequent
> discussions [2], it was deemed necessary to properly fix the hugetlb

Many thanks for the effort. I have read the discussions and they are pretty
long. Maybe it would be helpful if you gave a brief summary here?

> put_page path (free_huge_page).  This RFC provides a possible way to

trivial: not an RFC here.

> address the issue.  Comments are welcome/encouraged as several attempts
> at this have been made in the past.
> > This series is based on v5.12-rc3-mmotm-2021-03-17-22-24.  At a high
> level, the series provides:
> - Patches 1 & 2 from Roman Gushchin provide cma_release_nowait()

trivial: missing description of Patch 3?

> - Patches 4, 5 & 6 are aimed at reducing lock hold times.  To be clear
>   the goal is to eliminate single lock hold times of a long duration.
>   Overall lock hold time is not addressed.
> - Patch 7 makes hugetlb_lock and subpool lock IRQ safe.  It also reverts
>   the code which defers calls to a workqueue if !in_task.
> - Patch 8 adds some lockdep_assert_held() calls
> 
> [1] https://lore.kernel.org/linux-mm/f1c03b05bc43a...@google.com/
> [2] http://lkml.kernel.org/r/20210311021321.127500-1-mike.krav...@oracle.com
> 
> RFC -> v1
> - Add Roman's cma_release_nowait() patches.  This eliminated the need
>   to do a workqueue handoff in hugetlb code.
> - Use Michal's suggestion to batch pages for freeing.  This eliminated
>   the need to recalculate loop control variables when dropping the lock.
> - Added lockdep_assert_held() calls
> - Rebased to v5.12-rc3-mmotm-2021-03-17-22-24
> 
> Mike Kravetz (6):
>   hugetlb: add per-hstate mutex to synchronize user adjustments
>   hugetlb: create remove_hugetlb_page() to separate functionality
>   hugetlb: call update_and_free_page without hugetlb_lock
>   hugetlb: change free_pool_huge_page to remove_pool_huge_page
>   hugetlb: make free_huge_page irq safe
>   hugetlb: add lockdep_assert_held() calls for hugetlb_lock
> 
> Roman Gushchin (2):
>   mm: cma: introduce cma_release_nowait()
>   mm: hugetlb: don't drop hugetlb_lock around cma_release() call
> 
>  include/linux/cma.h |   2 +
>  include/linux/hugetlb.h |   1 +
>  mm/cma.c|  93 +++
>  mm/cma.h|   5 +
>  mm/hugetlb.c| 354 +---
>  mm/hugetlb_cgroup.c |   8 +-
>  6 files changed, 294 insertions(+), 169 deletions(-)
> 



[PATCH 0/3] Cleanup for khugepaged

2021-03-25 Thread Miaohe Lin
Hi all,
This series contains cleanups to remove an unnecessary out label and a
meaningless !pte_present() check. Also use a helper function to simplify
the code. More details can be found in the respective changelogs.
Thanks!

Miaohe Lin (3):
  khugepaged: use helper function range_in_vma() in
collapse_pte_mapped_thp()
  khugepaged: remove unnecessary out label in collapse_huge_page()
  khugepaged: remove meaningless !pte_present() check in
khugepaged_scan_pmd()

 mm/khugepaged.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

-- 
2.19.1



[PATCH 3/3] khugepaged: remove meaningless !pte_present() check in khugepaged_scan_pmd()

2021-03-25 Thread Miaohe Lin
If we reach here, the pte must satisfy the !is_swap_pte() and !pte_none()
conditions. Since !is_swap_pte() means either pte_none() or pte_present()
holds, the pte is guaranteed to be present here.
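
For reference, the helper this reasoning relies on is defined in
include/linux/swapops.h roughly as:

static inline int is_swap_pte(pte_t pte)
{
	/* A swap pte is one that is neither none nor present. */
	return !pte_none(pte) && !pte_present(pte);
}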

Signed-off-by: Miaohe Lin 
---
 mm/khugepaged.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 58466af69e70..01c2d0ed9abd 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1271,10 +1271,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
}
-   if (!pte_present(pteval)) {
-   result = SCAN_PTE_NON_PRESENT;
-   goto out_unmap;
-   }
if (pte_uffd_wp(pteval)) {
/*
 * Don't collapse the page if any of the small
-- 
2.19.1



[PATCH 2/3] khugepaged: remove unnecessary out label in collapse_huge_page()

2021-03-25 Thread Miaohe Lin
The out label here is unneeded because it just jumps to the out_up_write
label. Remove it to make the code more concise.

Signed-off-by: Miaohe Lin 
---
 mm/khugepaged.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index ad0444f3f487..58466af69e70 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1128,10 +1128,10 @@ static void collapse_huge_page(struct mm_struct *mm,
mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
-   goto out;
+   goto out_up_write;
/* check if the pmd is still valid */
if (mm_find_pmd(mm, address) != pmd)
-   goto out;
+   goto out_up_write;
 
anon_vma_lock_write(vma->anon_vma);
 
@@ -1171,7 +1171,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_unlock(pmd_ptl);
anon_vma_unlock_write(vma->anon_vma);
result = SCAN_FAIL;
-   goto out;
+   goto out_up_write;
}
 
/*
@@ -1215,8 +1215,6 @@ static void collapse_huge_page(struct mm_struct *mm,
mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
-out:
-   goto out_up_write;
 }
 
 static int khugepaged_scan_pmd(struct mm_struct *mm,
-- 
2.19.1



[PATCH 1/3] khugepaged: use helper function range_in_vma() in collapse_pte_mapped_thp()

2021-03-25 Thread Miaohe Lin
We can use the helper function range_in_vma() to check whether the desired
range is inside the vma, which simplifies the code.
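
For reference, the helper being used here is defined in include/linux/mm.h
along these lines:

static inline bool range_in_vma(struct vm_area_struct *vma,
				unsigned long start, unsigned long end)
{
	return (vma && vma->vm_start <= start && end <= vma->vm_end);
}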

Signed-off-by: Miaohe Lin 
---
 mm/khugepaged.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index b81521dfbb1a..ad0444f3f487 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1446,7 +1446,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, 
unsigned long addr)
int i;
 
if (!vma || !vma->vm_file ||
-   vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+   !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
return;
 
/*
-- 
2.19.1



[PATCH v3 1/5] mm/migrate.c: make putback_movable_page() static

2021-03-25 Thread Miaohe Lin
putback_movable_page() is only called by putback_movable_pages(), and we know
the page is locked and that both PageMovable() and PageIsolated() are checked
right before putback_movable_page() is called. So make it static and remove
all three VM_BUG_ON_PAGE() assertions.
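
For context, the only caller, putback_movable_pages(), already establishes
these conditions before the call; roughly (paraphrased for illustration, not
an exact quote):

	list_for_each_entry_safe(page, page2, l, lru) {
		...
		if (unlikely(__PageMovable(page))) {
			/* non-lru movable page: must still be isolated */
			VM_BUG_ON_PAGE(!PageIsolated(page), page);
			lock_page(page);
			if (PageMovable(page))
				putback_movable_page(page);
			else
				__ClearPageIsolated(page);
			unlock_page(page);
			put_page(page);
		} else {
			/* regular LRU page handled here */
			...
		}
	}

So the page is guaranteed to be locked, movable and isolated whenever
putback_movable_page() runs, which is exactly what the removed assertions
re-checked.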

Signed-off-by: Miaohe Lin 
---
 include/linux/migrate.h | 1 -
 mm/migrate.c| 7 +--
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index fdf65f23acec..1d8095069b1c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -44,7 +44,6 @@ extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free,
unsigned long private, enum migrate_mode mode, int reason);
 extern struct page *alloc_migration_target(struct page *page, unsigned long private);
 extern int isolate_movable_page(struct page *page, isolate_mode_t mode);
-extern void putback_movable_page(struct page *page);
 
 extern void migrate_prep(void);
 extern void migrate_prep_local(void);
diff --git a/mm/migrate.c b/mm/migrate.c
index 47df0df8f21a..61e7f848b554 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -140,15 +140,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
return -EBUSY;
 }
 
-/* It should be called on page which is PG_movable */
-void putback_movable_page(struct page *page)
+static void putback_movable_page(struct page *page)
 {
struct address_space *mapping;
 
-   VM_BUG_ON_PAGE(!PageLocked(page), page);
-   VM_BUG_ON_PAGE(!PageMovable(page), page);
-   VM_BUG_ON_PAGE(!PageIsolated(page), page);
-
mapping = page_mapping(page);
mapping->a_ops->putback_page(page);
__ClearPageIsolated(page);
-- 
2.19.1



[PATCH v3 5/5] Revert "mm: migrate: skip shared exec THP for NUMA balancing"

2021-03-25 Thread Miaohe Lin
This reverts commit c77c5cbafe549eb330e8909861a3e16cbda2c848.

Since commit c77c5cbafe54 ("mm: migrate: skip shared exec THP for NUMA
balancing"), NUMA balancing skips shared exec transhuge pages. But this
enhancement is not suitable for transhuge pages, because page_mapcount()
is required to be 1 here since no migration pte dance is done. On the
other hand, a shared exec transhuge page will leave migrate_misplaced_page()
with the pte entry untouched and the page locked. Thus the NUMA page fault
will be triggered again, and a deadlock occurs when we start waiting for
the page lock that we hold ourselves.

Yang Shi said:

 "Thanks for catching this. By relooking the code I think the other
  important reason for removing this is
  migrate_misplaced_transhuge_page() actually can't see shared exec
  file THP at all since page_lock_anon_vma_read() is called before
  and if page is not anonymous page it will just restore the PMD
  without migrating anything.
  The pages for private mapped file vma may be anonymous pages due to
  COW but they can't be THP so it won't trigger THP numa fault at all. I
  think this is why no bug was reported. I overlooked this in the first
  place."

Reviewed-by: Yang Shi 
Signed-off-by: Miaohe Lin 
---
 mm/migrate.c | 18 ++
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index c621c8f6fb7d..51190759e6dd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2105,17 +2105,6 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
 }
 
-static inline bool is_shared_exec_page(struct vm_area_struct *vma,
-  struct page *page)
-{
-   if (page_mapcount(page) != 1 &&
-   (page_is_file_lru(page) || vma_is_shmem(vma)) &&
-   (vma->vm_flags & VM_EXEC))
-   return true;
-
-   return false;
-}
-
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -2133,7 +2122,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
 * Don't migrate file pages that are mapped in multiple processes
 * with execute permissions as they are probably shared libraries.
 */
-   if (is_shared_exec_page(vma, page))
+   if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+   (vma->vm_flags & VM_EXEC))
goto out;
 
/*
@@ -2188,9 +2178,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
 
-   if (is_shared_exec_page(vma, page))
-   goto out;
-
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
@@ -2302,7 +2289,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 out_unlock:
unlock_page(page);
-out:
put_page(page);
return 0;
 }
-- 
2.19.1



[PATCH v3 4/5] mm/migrate.c: use helper migrate_vma_collect_skip() in migrate_vma_collect_hole()

2021-03-25 Thread Miaohe Lin
It is preferable to use the helper function migrate_vma_collect_skip() to
skip the unexpected case, and doing so also removes some duplicated code.
Move migrate_vma_collect_skip() above migrate_vma_collect_hole() to avoid
a compiler warning.

Reviewed-by: David Hildenbrand 
Signed-off-by: Miaohe Lin 
---
 mm/migrate.c | 28 +++-
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 87bbad578127..c621c8f6fb7d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2311,44 +2311,38 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DEVICE_PRIVATE
-static int migrate_vma_collect_hole(unsigned long start,
+static int migrate_vma_collect_skip(unsigned long start,
unsigned long end,
-   __always_unused int depth,
struct mm_walk *walk)
 {
struct migrate_vma *migrate = walk->private;
unsigned long addr;
 
-   /* Only allow populating anonymous memory. */
-   if (!vma_is_anonymous(walk->vma)) {
-   for (addr = start; addr < end; addr += PAGE_SIZE) {
-   migrate->src[migrate->npages] = 0;
-   migrate->dst[migrate->npages] = 0;
-   migrate->npages++;
-   }
-   return 0;
-   }
-
for (addr = start; addr < end; addr += PAGE_SIZE) {
-   migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
-   migrate->npages++;
-   migrate->cpages++;
+   migrate->src[migrate->npages++] = 0;
}
 
return 0;
 }
 
-static int migrate_vma_collect_skip(unsigned long start,
+static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+   __always_unused int depth,
struct mm_walk *walk)
 {
struct migrate_vma *migrate = walk->private;
unsigned long addr;
 
+   /* Only allow populating anonymous memory. */
+   if (!vma_is_anonymous(walk->vma))
+   return migrate_vma_collect_skip(start, end, walk);
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
+   migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
-   migrate->src[migrate->npages++] = 0;
+   migrate->npages++;
+   migrate->cpages++;
}
 
return 0;
-- 
2.19.1



[PATCH v3 2/5] mm/migrate.c: remove unnecessary rc != MIGRATEPAGE_SUCCESS check in 'else' case

2021-03-25 Thread Miaohe Lin
It's guaranteed that in the 'else' case of the rc == MIGRATEPAGE_SUCCESS
check, rc does not equal MIGRATEPAGE_SUCCESS. Remove this unnecessary
check.

Reviewed-by: David Hildenbrand 
Reviewed-by: Yang Shi 
Signed-off-by: Miaohe Lin 
---
 mm/migrate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/migrate.c b/mm/migrate.c
index 61e7f848b554..dacbdc9710ac 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1370,7 +1370,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 out:
if (rc == MIGRATEPAGE_SUCCESS)
putback_active_hugepage(hpage);
-   else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS)
+   else if (rc != -EAGAIN)
	list_move_tail(&hpage->lru, ret);
 
/*
-- 
2.19.1


