Re: [PATCH v2 02/12] mm: remove extra ZONE_DEVICE struct page refcount

2021-09-14 Thread Ralph Campbell

On 9/13/21 9:15 AM, Alex Sierra wrote:

From: Ralph Campbell 

ZONE_DEVICE struct pages have an extra reference count that complicates the
code for put_page() and several places in the kernel that need to check the
reference count to see that a page is not being used (gup, compaction,
migration, etc.). Clean up the code so the reference count doesn't need to
be treated specially for ZONE_DEVICE.

Signed-off-by: Ralph Campbell 
Signed-off-by: Alex Sierra 
Reviewed-by: Christoph Hellwig 
---
v2:
AS: merged this patch in linux 5.11 version

v5:
AS: add condition at try_grab_page to check for the zone device type, while
page ref counter is checked less/equal to zero. In case of device zone, pages
ref counter are initialized to zero.

v7:
AS: fix condition at try_grab_page added at v5, is invalid. It supposed
to fix xfstests/generic/413 test, however, there's a known issue on
this test where DAX mapped area DIO to non-DAX expect to fail.
https://patchwork.kernel.org/project/fstests/patch/1489463960-3579-1-git-send-email-xz...@redhat.com
This condition was removed after rebase over patch series
https://lore.kernel.org/r/20210813044133.1536842-4-jhubb...@nvidia.com
---
  arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
  drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
  fs/dax.c   |  4 +-
  include/linux/dax.h|  2 +-
  include/linux/memremap.h   |  7 +--
  include/linux/mm.h | 11 
  lib/test_hmm.c |  2 +-
  mm/internal.h  |  8 +++
  mm/memremap.c  | 69 +++---
  mm/migrate.c   |  5 --
  mm/page_alloc.c|  3 ++
  mm/swap.c  | 45 ++---
  12 files changed, 45 insertions(+), 115 deletions(-)


I don't see mm/memcontrol.c listed here so I think you will need this fragment 
too:

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 389b5766e74f..0cc9c7c71e79 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5541,11 +5541,7 @@ static struct page *mc_handle_swap_pte(struct 
vm_area_struct *vma,
 */
if (is_device_private_entry(ent)) {
page = pfn_swap_entry_to_page(ent);
-   /*
-* MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
-* a refcount of 1 when free (unlike normal page)
-*/
-   if (!page_ref_add_unless(page, 1, 1))
+   if (!get_page_unless_zero(page))
return NULL;
return page;
}




[PATCH v2 02/12] mm: remove extra ZONE_DEVICE struct page refcount

2021-09-13 Thread Alex Sierra
From: Ralph Campbell 

ZONE_DEVICE struct pages have an extra reference count that complicates the
code for put_page() and several places in the kernel that need to check the
reference count to see that a page is not being used (gup, compaction,
migration, etc.). Clean up the code so the reference count doesn't need to
be treated specially for ZONE_DEVICE.

Signed-off-by: Ralph Campbell 
Signed-off-by: Alex Sierra 
Reviewed-by: Christoph Hellwig 
---
v2:
AS: merged this patch in linux 5.11 version

v5:
AS: add condition at try_grab_page to check for the zone device type, while
page ref counter is checked less/equal to zero. In case of device zone, pages
ref counter are initialized to zero.

v7:
AS: fix condition at try_grab_page added at v5, is invalid. It supposed
to fix xfstests/generic/413 test, however, there's a known issue on
this test where DAX mapped area DIO to non-DAX expect to fail.
https://patchwork.kernel.org/project/fstests/patch/1489463960-3579-1-git-send-email-xz...@redhat.com
This condition was removed after rebase over patch series
https://lore.kernel.org/r/20210813044133.1536842-4-jhubb...@nvidia.com
---
 arch/powerpc/kvm/book3s_hv_uvmem.c |  2 +-
 drivers/gpu/drm/nouveau/nouveau_dmem.c |  2 +-
 fs/dax.c   |  4 +-
 include/linux/dax.h|  2 +-
 include/linux/memremap.h   |  7 +--
 include/linux/mm.h | 11 
 lib/test_hmm.c |  2 +-
 mm/internal.h  |  8 +++
 mm/memremap.c  | 69 +++---
 mm/migrate.c   |  5 --
 mm/page_alloc.c|  3 ++
 mm/swap.c  | 45 ++---
 12 files changed, 45 insertions(+), 115 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 84e5a2dc8be5..acee67710620 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -711,7 +711,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long 
gpa, struct kvm *kvm)
 
dpage = pfn_to_page(uvmem_pfn);
dpage->zone_device_data = pvt;
-   get_page(dpage);
+   init_page_count(dpage);
lock_page(dpage);
return dpage;
 out_clear:
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c 
b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 92987daa5e17..8bc7120e1216 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -324,7 +324,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm)
return NULL;
}
 
-   get_page(page);
+   init_page_count(page);
lock_page(page);
return page;
 }
diff --git a/fs/dax.c b/fs/dax.c
index c387d09e3e5a..1166630b7190 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -571,14 +571,14 @@ static void *grab_mapping_entry(struct xa_state *xas,
 
 /**
  * dax_layout_busy_page_range - find first pinned page in @mapping
- * @mapping: address space to scan for a page with ref count > 1
+ * @mapping: address space to scan for a page with ref count > 0
  * @start: Starting offset. Page containing 'start' is included.
  * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
  *   pages from 'start' till the end of file are included.
  *
  * DAX requires ZONE_DEVICE mapped pages. These pages are never
  * 'onlined' to the page allocator so they are considered idle when
- * page->count == 1. A filesystem uses this interface to determine if
+ * page->count == 0. A filesystem uses this interface to determine if
  * any page in the mapping is busy, i.e. for DMA, or other
  * get_user_pages() usages.
  *
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 8b5da1d60dbc..05fc982ce153 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -245,7 +245,7 @@ static inline bool dax_mapping(struct address_space 
*mapping)
 
 static inline bool dax_page_unused(struct page *page)
 {
-   return page_ref_count(page) == 1;
+   return page_ref_count(page) == 0;
 }
 
 #define dax_wait_page(_inode, _page, _wait_cb) \
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 45a79da89c5f..77ff5fd0685f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -66,9 +66,10 @@ enum memory_type {
 
 struct dev_pagemap_ops {
/*
-* Called once the page refcount reaches 1.  (ZONE_DEVICE pages never
-* reach 0 refcount unless there is a refcount bug. This allows the
-* device driver to implement its own memory management.)
+* Called once the page refcount reaches 0. The reference count
+* should be reset to one with init_page_count(page) before reusing
+* the page. This allows the device driver to implement its own
+* memory management.
 */
void (*page_free)(struct page *page);
 
diff --git