[GIT PULL] DAX and HMAT fixes for v6.1-rc8

2022-12-02 Thread Dan Williams
Hi Linus, please pull from:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm 
tags/dax-fixes-6.1-rc8

...to receive a few bug fixes around the handling of "Soft Reserved"
memory and memory tiering information. Linux is starting to encounter
more real world systems that deploy an ACPI HMAT to describe different
performance classes of memory, as well as the "special purpose" (Linux
"Soft Reserved") designation from EFI. These fixes result from that
testing.

It has all appeared in -next for a while with no known issues.

---

The following changes since commit 094226ad94f471a9f19e8f8e7140a09c2625abaa:

  Linux 6.1-rc5 (2022-11-13 13:12:55 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm 
tags/dax-fixes-6.1-rc8

for you to fetch changes up to 472faf72b33d80aa8e7a99c9410c1a23d3bf0cd8:

  device-dax: Fix duplicate 'hmem' device registration (2022-11-21 15:34:40 
-0800)


dax fixes for v6.1-rc8

- Fix duplicate overlapping device-dax instances for HMAT described
  "Soft Reserved" Memory

- Fix missing node targets in the sysfs representation of memory tiers

- Remove a confusing variable initialization


Dan Williams (1):
  device-dax: Fix duplicate 'hmem' device registration

Vishal Verma (2):
  ACPI: HMAT: remove unnecessary variable initialization
  ACPI: HMAT: Fix initiator registration for single-initiator systems

 drivers/acpi/numa/hmat.c  | 27 ---
 drivers/dax/hmem/device.c | 24 +++-
 2 files changed, 35 insertions(+), 16 deletions(-)



RE: [PATCH v2.1 1/8] fsdax: introduce page->share for fsdax in reflink mode

2022-12-02 Thread Dan Williams
Shiyang Ruan wrote:
> fsdax page is used not only when CoW, but also mapread. To make the it
> easily understood, use 'share' to indicate that the dax page is shared
> by more than one extent.  And add helper functions to use it.
> 
> Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED.
> 
> Signed-off-by: Shiyang Ruan 
> ---
>  fs/dax.c   | 38 ++
>  include/linux/mm_types.h   |  5 -
>  include/linux/page-flags.h |  2 +-
>  3 files changed, 27 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 1c6867810cbd..edbacb273ab5 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
>   for (pfn = dax_to_pfn(entry); \
>   pfn < dax_end_pfn(entry); pfn++)
>  
> -static inline bool dax_mapping_is_cow(struct address_space *mapping)
> +static inline bool dax_page_is_shared(struct page *page)
>  {
> - return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
> + return (unsigned long)page->mapping == PAGE_MAPPING_DAX_SHARED;
>  }
>  
>  /*
> - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
> + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
> + * refcount.
>   */
> -static inline void dax_mapping_set_cow(struct page *page)
> +static inline void dax_page_bump_sharing(struct page *page)

Similar to page_ref naming I would call this page_share_get() and the
corresponding function page_share_put().

>  {
> - if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
> + if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_SHARED) {
>   /*
>* Reset the index if the page was already mapped
>* regularly before.
>*/
>   if (page->mapping)
> - page->index = 1;
> - page->mapping = (void *)PAGE_MAPPING_DAX_COW;
> + page->share = 1;
> + page->mapping = (void *)PAGE_MAPPING_DAX_SHARED;

Small nit, You could save a cast here by defining
PAGE_MAPPING_DAX_SHARED as "((void *) 1)".

>   }
> - page->index++;
> + page->share++;
> +}
> +
> +static inline unsigned long dax_page_drop_sharing(struct page *page)
> +{
> + return --page->share;
>  }
>  
>  /*
> - * When it is called in dax_insert_entry(), the cow flag will indicate that
> + * When it is called in dax_insert_entry(), the shared flag will indicate 
> that
>   * whether this entry is shared by multiple files.  If so, set the 
> page->mapping
> - * FS_DAX_MAPPING_COW, and use page->index as refcount.
> + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
>   */
>  static void dax_associate_entry(void *entry, struct address_space *mapping,
> - struct vm_area_struct *vma, unsigned long address, bool cow)
> + struct vm_area_struct *vma, unsigned long address, bool shared)
>  {
>   unsigned long size = dax_entry_size(entry), pfn, index;
>   int i = 0;
> @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct 
> address_space *mapping,
>   for_each_mapped_pfn(entry, pfn) {
>   struct page *page = pfn_to_page(pfn);
>  
> - if (cow) {
> - dax_mapping_set_cow(page);
> + if (shared) {
> + dax_page_bump_sharing(page);
>   } else {
>   WARN_ON_ONCE(page->mapping);
>   page->mapping = mapping;
> @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct 
> address_space *mapping,
>   struct page *page = pfn_to_page(pfn);
>  
>   WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> - if (dax_mapping_is_cow(page->mapping)) {
> - /* keep the CoW flag if this page is still shared */
> - if (page->index-- > 0)
> + if (dax_page_is_shared(page)) {
> + /* keep the shared flag if this page is still shared */
> + if (dax_page_drop_sharing(page) > 0)
>   continue;

I think part of what makes this hard to read is trying to preserve the
same code paths for shared pages and typical pages.

page_share_put() should, in addition to decrementing the share, clear
out page->mapping value.

>   } else
>   WARN_ON_ONCE(page->mapping && page->mapping != mapping);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 500e536796ca..f46cac3657ad 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -103,7 +103,10 @@ struct page {
>   };
>   /* See page-flags.h for PAGE_MAPPING_FLAGS */
>   struct address_space *mapping;
> - pgoff_t index;  /* Our offset within mapping. */
> + union {
> +   

RE: [PATCH v2 0/8] fsdax,xfs: fix warning messages

2022-12-02 Thread Dan Williams
Shiyang Ruan wrote:
> Changes since v1:
>  1. Added a snippet of the warning message and some of the failed cases
>  2. Separated the patch for easily review
>  3. Added page->share and its helper functions
>  4. Included the patch[1] that removes the restrictions of fsdax and reflink
> [1] 
> https://lore.kernel.org/linux-xfs/1663234002-17-1-git-send-email-ruansy.f...@fujitsu.com/
> 
> Many testcases failed in dax+reflink mode with warning message in dmesg.
> Such as generic/051,075,127.  The warning message is like this:
> [  775.509337] [ cut here ]
> [  775.509636] WARNING: CPU: 1 PID: 16815 at fs/dax.c:386 
> dax_insert_entry.cold+0x2e/0x69
> [  775.510151] Modules linked in: auth_rpcgss oid_registry nfsv4 algif_hash 
> af_alg af_packet nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject 
> nft_ct nft_chain_nat iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 
> nf_defrag_ipv4 ip_set nf_tables nfnetlink ip6table_filter ip6_tables 
> iptable_filter ip_tables x_tables dax_pmem nd_pmem nd_btt sch_fq_codel 
> configfs xfs libcrc32c fuse
> [  775.524288] CPU: 1 PID: 16815 Comm: fsx Kdump: loaded Tainted: GW  
> 6.1.0-rc4+ #164 eb34e4ee4200c7cbbb47de2b1892c5a3e027fd6d
> [  775.524904] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS Arch 
> Linux 1.16.0-3-3 04/01/2014
> [  775.525460] RIP: 0010:dax_insert_entry.cold+0x2e/0x69
> [  775.525797] Code: c7 c7 18 eb e0 81 48 89 4c 24 20 48 89 54 24 10 e8 73 6d 
> ff ff 48 83 7d 18 00 48 8b 54 24 10 48 8b 4c 24 20 0f 84 e3 e9 b9 ff <0f> 0b 
> e9 dc e9 b9 ff 48 c7 c6 a0 20 c3 81 48 c7 c7 f0 ea e0 81 48
> [  775.526708] RSP: :c90001d57b30 EFLAGS: 00010082
> [  775.527042] RAX: 002a RBX:  RCX: 
> 0042
> [  775.527396] RDX: ea000a0f6c80 RSI: 81dfab1b RDI: 
> 
> [  775.527819] RBP: ea000a0f6c40 R08:  R09: 
> 820625e0
> [  775.528241] R10: c90001d579d8 R11: 820d2628 R12: 
> 88815fc98320
> [  775.528598] R13: c90001d57c18 R14:  R15: 
> 0001
> [  775.528997] FS:  7f39fc75d740() GS:88817bc8() 
> knlGS:
> [  775.529474] CS:  0010 DS:  ES:  CR0: 80050033
> [  775.529800] CR2: 7f39fc772040 CR3: 000107eb6001 CR4: 
> 003706e0
> [  775.530214] DR0:  DR1:  DR2: 
> 
> [  775.530592] DR3:  DR6: fffe0ff0 DR7: 
> 0400
> [  775.531002] Call Trace:
> [  775.531230]  
> [  775.531444]  dax_fault_iter+0x267/0x6c0
> [  775.531719]  dax_iomap_pte_fault+0x198/0x3d0
> [  775.532002]  __xfs_filemap_fault+0x24a/0x2d0 [xfs 
> aa8d25411432b306d9554da38096f4ebb86bdfe7]
> [  775.532603]  __do_fault+0x30/0x1e0
> [  775.532903]  do_fault+0x314/0x6c0
> [  775.533166]  __handle_mm_fault+0x646/0x1250
> [  775.533480]  handle_mm_fault+0xc1/0x230
> [  775.533810]  do_user_addr_fault+0x1ac/0x610
> [  775.534110]  exc_page_fault+0x63/0x140
> [  775.534389]  asm_exc_page_fault+0x22/0x30
> [  775.534678] RIP: 0033:0x7f39fc55820a
> [  775.534950] Code: 00 01 00 00 00 74 99 83 f9 c0 0f 87 7b fe ff ff c5 fe 6f 
> 4e 20 48 29 fe 48 83 c7 3f 49 8d 0c 10 48 83 e7 c0 48 01 fe 48 29 f9  a4 
> c4 c1 7e 7f 00 c4 c1 7e 7f 48 20 c5 f8 77 c3 0f 1f 44 00 00
> [  775.535839] RSP: 002b:7ffc66a08118 EFLAGS: 00010202
> [  775.536157] RAX: 7f39fc772001 RBX: 00042001 RCX: 
> 63c1
> [  775.536537] RDX: 6400 RSI: 7f39fac42050 RDI: 
> 7f39fc772040
> [  775.536919] RBP: 6400 R08: 7f39fc772001 R09: 
> 00042000
> [  775.537304] R10: 0001 R11: 0246 R12: 
> 0001
> [  775.537694] R13: 7f39fc772000 R14: 6401 R15: 
> 0003
> [  775.538086]  
> [  775.538333] ---[ end trace  ]---
> 
> This also effects dax+noreflink mode if we run the test after a
> dax+reflink test.  So, the most urgent thing is solving the warning
> messages.
> 
> With these fixes, most warning messages in dax_associate_entry() are
> gone.  But honestly, generic/388 will randomly failed with the warning.
> The case shutdown the xfs when fsstress is running, and do it for many
> times.  I think the reason is that dax pages in use are not able to be
> invalidated in time when fs is shutdown.  The next time dax page to be
> associated, it still remains the mapping value set last time.  I'll keep
> on solving it.

This one also sounds like it is going to be relevant for CXL PMEM, and
the improvements to the reference counting. CXL has a facility where the
driver asserts that no more writes are in-flight to the device so that
the device can assert a clean shutdown. Part of that will be making sure
that page access ends at fs shutdown.

> The warning message in dax_writeback_one() can also be fixed because of
> the dax unshare.
> 
> 
> Shiyang Ruan (8):
>   fsdax: introduce 

Re: [PATCH v2.1 3/8] fsdax: zero the edges if source is HOLE or UNWRITTEN

2022-12-02 Thread Allison Henderson
On Fri, 2022-12-02 at 09:25 +, Shiyang Ruan wrote:
> If srcmap contains invalid data, such as HOLE and UNWRITTEN, the dest
> page should be zeroed.  Otherwise, since it's a pmem, old data may
> remains on the dest page, the result of CoW will be incorrect.
> 
> The function name is also not easy to understand, rename it to
> "dax_iomap_copy_around()", which means it copys data around the
> range.
> 
> Signed-off-by: Shiyang Ruan 
> Reviewed-by: Darrick J. Wong 
> 
I think the new changes look good
Reviewed-by: Allison Henderson 

> ---
>  fs/dax.c | 79 +++---
> --
>  1 file changed, 49 insertions(+), 30 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index a77739f2abe7..f12645d6f3c8 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1092,7 +1092,8 @@ static int dax_iomap_direct_access(const struct
> iomap *iomap, loff_t pos,
>  }
>  
>  /**
> - * dax_iomap_cow_copy - Copy the data from source to destination
> before write
> + * dax_iomap_copy_around - Prepare for an unaligned write to a
> shared/cow page
> + * by copying the data before and after the range to be written.
>   * @pos:   address to do copy from.
>   * @length:size of copy operation.
>   * @align_size:aligned w.r.t align_size (either PMD_SIZE or
> PAGE_SIZE)
> @@ -1101,35 +1102,50 @@ static int dax_iomap_direct_access(const
> struct iomap *iomap, loff_t pos,
>   *
>   * This can be called from two places. Either during DAX write fault
> (page
>   * aligned), to copy the length size data to daddr. Or, while doing
> normal DAX
> - * write operation, dax_iomap_actor() might call this to do the copy
> of either
> + * write operation, dax_iomap_iter() might call this to do the copy
> of either
>   * start or end unaligned address. In the latter case the rest of
> the copy of
> - * aligned ranges is taken care by dax_iomap_actor() itself.
> + * aligned ranges is taken care by dax_iomap_iter() itself.
> + * If the srcmap contains invalid data, such as HOLE and UNWRITTEN,
> zero the
> + * area to make sure no old data remains.
>   */
> -static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t
> align_size,
> +static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t
> align_size,
> const struct iomap *srcmap, void *daddr)
>  {
> loff_t head_off = pos & (align_size - 1);
> size_t size = ALIGN(head_off + length, align_size);
> loff_t end = pos + length;
> loff_t pg_end = round_up(end, align_size);
> +   /* copy_all is usually in page fault case */
> bool copy_all = head_off == 0 && end == pg_end;
> +   /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
> +   bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
> +    srcmap->type == IOMAP_UNWRITTEN;
> void *saddr = 0;
> int ret = 0;
>  
> -   ret = dax_iomap_direct_access(srcmap, pos, size, ,
> NULL);
> -   if (ret)
> -   return ret;
> +   if (!zero_edge) {
> +   ret = dax_iomap_direct_access(srcmap, pos, size,
> , NULL);
> +   if (ret)
> +   return ret;
> +   }
>  
> if (copy_all) {
> -   ret = copy_mc_to_kernel(daddr, saddr, length);
> -   return ret ? -EIO : 0;
> +   if (zero_edge)
> +   memset(daddr, 0, size);
> +   else
> +   ret = copy_mc_to_kernel(daddr, saddr,
> length);
> +   goto out;
> }
>  
> /* Copy the head part of the range */
> if (head_off) {
> -   ret = copy_mc_to_kernel(daddr, saddr, head_off);
> -   if (ret)
> -   return -EIO;
> +   if (zero_edge)
> +   memset(daddr, 0, head_off);
> +   else {
> +   ret = copy_mc_to_kernel(daddr, saddr,
> head_off);
> +   if (ret)
> +   return -EIO;
> +   }
> }
>  
> /* Copy the tail part of the range */
> @@ -1137,12 +1153,19 @@ static int dax_iomap_cow_copy(loff_t pos,
> uint64_t length, size_t align_size,
> loff_t tail_off = head_off + length;
> loff_t tail_len = pg_end - end;
>  
> -   ret = copy_mc_to_kernel(daddr + tail_off, saddr +
> tail_off,
> -   tail_len);
> -   if (ret)
> -   return -EIO;
> +   if (zero_edge)
> +   memset(daddr + tail_off, 0, tail_len);
> +   else {
> +   ret = copy_mc_to_kernel(daddr + tail_off,
> +   saddr + tail_off,
> tail_len);
> +   if (ret)
> +   return -EIO;
> +   }
> }
> -   return 0;
> +out:
> +   if (zero_edge)
> +   

Re: [PATCH v2.1 1/8] fsdax: introduce page->share for fsdax in reflink mode

2022-12-02 Thread Allison Henderson
On Fri, 2022-12-02 at 09:23 +, Shiyang Ruan wrote:
> fsdax page is used not only when CoW, but also mapread. To make the
> it
> easily understood, use 'share' to indicate that the dax page is
> shared
> by more than one extent.  And add helper functions to use it.
> 
> Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED.
> 
The new changes look reasonable to me
Reviewed-by: Allison Henderson 

> Signed-off-by: Shiyang Ruan 
> ---
>  fs/dax.c   | 38 ++--
> --
>  include/linux/mm_types.h   |  5 -
>  include/linux/page-flags.h |  2 +-
>  3 files changed, 27 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 1c6867810cbd..edbacb273ab5 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
> for (pfn = dax_to_pfn(entry); \
> pfn < dax_end_pfn(entry); pfn++)
>  
> -static inline bool dax_mapping_is_cow(struct address_space *mapping)
> +static inline bool dax_page_is_shared(struct page *page)
>  {
> -   return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
> +   return (unsigned long)page->mapping ==
> PAGE_MAPPING_DAX_SHARED;
>  }
>  
>  /*
> - * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the
> refcount.
> + * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase
> the
> + * refcount.
>   */
> -static inline void dax_mapping_set_cow(struct page *page)
> +static inline void dax_page_bump_sharing(struct page *page)
>  {
> -   if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
> +   if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_SHARED) {
> /*
>  * Reset the index if the page was already mapped
>  * regularly before.
>  */
> if (page->mapping)
> -   page->index = 1;
> -   page->mapping = (void *)PAGE_MAPPING_DAX_COW;
> +   page->share = 1;
> +   page->mapping = (void *)PAGE_MAPPING_DAX_SHARED;
> }
> -   page->index++;
> +   page->share++;
> +}
> +
> +static inline unsigned long dax_page_drop_sharing(struct page *page)
> +{
> +   return --page->share;
>  }
>  
>  /*
> - * When it is called in dax_insert_entry(), the cow flag will
> indicate that
> + * When it is called in dax_insert_entry(), the shared flag will
> indicate that
>   * whether this entry is shared by multiple files.  If so, set the
> page->mapping
> - * FS_DAX_MAPPING_COW, and use page->index as refcount.
> + * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
>   */
>  static void dax_associate_entry(void *entry, struct address_space
> *mapping,
> -   struct vm_area_struct *vma, unsigned long address,
> bool cow)
> +   struct vm_area_struct *vma, unsigned long address,
> bool shared)
>  {
> unsigned long size = dax_entry_size(entry), pfn, index;
> int i = 0;
> @@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry,
> struct address_space *mapping,
> for_each_mapped_pfn(entry, pfn) {
> struct page *page = pfn_to_page(pfn);
>  
> -   if (cow) {
> -   dax_mapping_set_cow(page);
> +   if (shared) {
> +   dax_page_bump_sharing(page);
> } else {
> WARN_ON_ONCE(page->mapping);
> page->mapping = mapping;
> @@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry,
> struct address_space *mapping,
> struct page *page = pfn_to_page(pfn);
>  
> WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
> -   if (dax_mapping_is_cow(page->mapping)) {
> -   /* keep the CoW flag if this page is still
> shared */
> -   if (page->index-- > 0)
> +   if (dax_page_is_shared(page)) {
> +   /* keep the shared flag if this page is still
> shared */
> +   if (dax_page_drop_sharing(page) > 0)
> continue;
> } else
> WARN_ON_ONCE(page->mapping && page->mapping
> != mapping);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 500e536796ca..f46cac3657ad 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -103,7 +103,10 @@ struct page {
> };
> /* See page-flags.h for PAGE_MAPPING_FLAGS */
> struct address_space *mapping;
> -   pgoff_t index;  /* Our offset within
> mapping. */
> +   union {
> +   pgoff_t index;  /* Our offset
> within mapping. */
> +   unsigned long share;/* share
> count for fsdax */
> +   };
> 

Re: [PATCH v2.1 1/8] fsdax: introduce page->share for fsdax in reflink mode

2022-12-02 Thread Andrew Morton
On Fri, 2 Dec 2022 09:23:11 + Shiyang Ruan  wrote:

> fsdax page is used not only when CoW, but also mapread. To make the it
> easily understood, use 'share' to indicate that the dax page is shared
> by more than one extent.  And add helper functions to use it.
> 
> Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED.
> 

For those who are wondering what changed, I queued the below incremental
patch.


From: Shiyang Ruan 
Subject: fsdax: introduce page->share for fsdax in reflink mode
Date: Fri, 2 Dec 2022 09:23:11 +

rename several functions

Link: 
https://lkml.kernel.org/r/1669972991-246-1-git-send-email-ruansy.f...@fujitsu.com
Signed-off-by: Shiyang Ruan 
Cc: Alistair Popple 
Cc: Dan Williams 
Cc: Darrick J. Wong 
Cc: Dave Chinner 
Cc: Jason Gunthorpe 
Cc: John Hubbard 
Signed-off-by: Andrew Morton 
---

 fs/dax.c |   12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

--- a/fs/dax.c~fsdax-introduce-page-share-for-fsdax-in-reflink-mode-fix
+++ a/fs/dax.c
@@ -334,7 +334,7 @@ static unsigned long dax_end_pfn(void *e
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
 
-static inline bool dax_mapping_is_shared(struct page *page)
+static inline bool dax_page_is_shared(struct page *page)
 {
return (unsigned long)page->mapping == PAGE_MAPPING_DAX_SHARED;
 }
@@ -343,7 +343,7 @@ static inline bool dax_mapping_is_shared
  * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
  * refcount.
  */
-static inline void dax_mapping_set_shared(struct page *page)
+static inline void dax_page_bump_sharing(struct page *page)
 {
if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_SHARED) {
/*
@@ -357,7 +357,7 @@ static inline void dax_mapping_set_share
page->share++;
 }
 
-static inline unsigned long dax_mapping_decrease_shared(struct page *page)
+static inline unsigned long dax_page_drop_sharing(struct page *page)
 {
return --page->share;
 }
@@ -381,7 +381,7 @@ static void dax_associate_entry(void *en
struct page *page = pfn_to_page(pfn);
 
if (shared) {
-   dax_mapping_set_shared(page);
+   dax_page_bump_sharing(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
@@ -402,9 +402,9 @@ static void dax_disassociate_entry(void
struct page *page = pfn_to_page(pfn);
 
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-   if (dax_mapping_is_shared(page)) {
+   if (dax_page_is_shared(page)) {
/* keep the shared flag if this page is still shared */
-   if (dax_mapping_decrease_shared(page) > 0)
+   if (dax_page_drop_sharing(page) > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
_




[PATCH v2.1 3/8] fsdax: zero the edges if source is HOLE or UNWRITTEN

2022-12-02 Thread Shiyang Ruan
If srcmap contains invalid data, such as HOLE and UNWRITTEN, the dest
page should be zeroed.  Otherwise, since it's a pmem, old data may
remain on the dest page, and the result of CoW will be incorrect.

The function name is also not easy to understand, rename it to
"dax_iomap_copy_around()", which means it copys data around the range.

Signed-off-by: Shiyang Ruan 
Reviewed-by: Darrick J. Wong 
---
 fs/dax.c | 79 +++-
 1 file changed, 49 insertions(+), 30 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index a77739f2abe7..f12645d6f3c8 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1092,7 +1092,8 @@ static int dax_iomap_direct_access(const struct iomap 
*iomap, loff_t pos,
 }
 
 /**
- * dax_iomap_cow_copy - Copy the data from source to destination before write
+ * dax_iomap_copy_around - Prepare for an unaligned write to a shared/cow page
+ * by copying the data before and after the range to be written.
  * @pos:   address to do copy from.
  * @length:size of copy operation.
  * @align_size:aligned w.r.t align_size (either PMD_SIZE or PAGE_SIZE)
@@ -1101,35 +1102,50 @@ static int dax_iomap_direct_access(const struct iomap 
*iomap, loff_t pos,
  *
  * This can be called from two places. Either during DAX write fault (page
  * aligned), to copy the length size data to daddr. Or, while doing normal DAX
- * write operation, dax_iomap_actor() might call this to do the copy of either
+ * write operation, dax_iomap_iter() might call this to do the copy of either
  * start or end unaligned address. In the latter case the rest of the copy of
- * aligned ranges is taken care by dax_iomap_actor() itself.
+ * aligned ranges is taken care by dax_iomap_iter() itself.
+ * If the srcmap contains invalid data, such as HOLE and UNWRITTEN, zero the
+ * area to make sure no old data remains.
  */
-static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
+static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t 
align_size,
const struct iomap *srcmap, void *daddr)
 {
loff_t head_off = pos & (align_size - 1);
size_t size = ALIGN(head_off + length, align_size);
loff_t end = pos + length;
loff_t pg_end = round_up(end, align_size);
+   /* copy_all is usually in page fault case */
bool copy_all = head_off == 0 && end == pg_end;
+   /* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
+   bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
+srcmap->type == IOMAP_UNWRITTEN;
void *saddr = 0;
int ret = 0;
 
-   ret = dax_iomap_direct_access(srcmap, pos, size, , NULL);
-   if (ret)
-   return ret;
+   if (!zero_edge) {
+   ret = dax_iomap_direct_access(srcmap, pos, size, , NULL);
+   if (ret)
+   return ret;
+   }
 
if (copy_all) {
-   ret = copy_mc_to_kernel(daddr, saddr, length);
-   return ret ? -EIO : 0;
+   if (zero_edge)
+   memset(daddr, 0, size);
+   else
+   ret = copy_mc_to_kernel(daddr, saddr, length);
+   goto out;
}
 
/* Copy the head part of the range */
if (head_off) {
-   ret = copy_mc_to_kernel(daddr, saddr, head_off);
-   if (ret)
-   return -EIO;
+   if (zero_edge)
+   memset(daddr, 0, head_off);
+   else {
+   ret = copy_mc_to_kernel(daddr, saddr, head_off);
+   if (ret)
+   return -EIO;
+   }
}
 
/* Copy the tail part of the range */
@@ -1137,12 +1153,19 @@ static int dax_iomap_cow_copy(loff_t pos, uint64_t 
length, size_t align_size,
loff_t tail_off = head_off + length;
loff_t tail_len = pg_end - end;
 
-   ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
-   tail_len);
-   if (ret)
-   return -EIO;
+   if (zero_edge)
+   memset(daddr + tail_off, 0, tail_len);
+   else {
+   ret = copy_mc_to_kernel(daddr + tail_off,
+   saddr + tail_off, tail_len);
+   if (ret)
+   return -EIO;
+   }
}
-   return 0;
+out:
+   if (zero_edge)
+   dax_flush(srcmap->dax_dev, daddr, size);
+   return ret ? -EIO : 0;
 }
 
 /*
@@ -1241,13 +1264,10 @@ static int dax_memzero(struct iomap_iter *iter, loff_t 
pos, size_t size)
if (ret < 0)
return ret;
memset(kaddr + offset, 0, size);
-   if (srcmap->addr != iomap->addr) {
-   ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
- 

[PATCH v2.1 1/8] fsdax: introduce page->share for fsdax in reflink mode

2022-12-02 Thread Shiyang Ruan
fsdax page is used not only when CoW, but also mapread. To make it
easily understood, use 'share' to indicate that the dax page is shared
by more than one extent.  And add helper functions to use it.

Also, the flag needs to be renamed to PAGE_MAPPING_DAX_SHARED.

Signed-off-by: Shiyang Ruan 
---
 fs/dax.c   | 38 ++
 include/linux/mm_types.h   |  5 -
 include/linux/page-flags.h |  2 +-
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 1c6867810cbd..edbacb273ab5 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -334,35 +334,41 @@ static unsigned long dax_end_pfn(void *entry)
for (pfn = dax_to_pfn(entry); \
pfn < dax_end_pfn(entry); pfn++)
 
-static inline bool dax_mapping_is_cow(struct address_space *mapping)
+static inline bool dax_page_is_shared(struct page *page)
 {
-   return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
+   return (unsigned long)page->mapping == PAGE_MAPPING_DAX_SHARED;
 }
 
 /*
- * Set the page->mapping with FS_DAX_MAPPING_COW flag, increase the refcount.
+ * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
+ * refcount.
  */
-static inline void dax_mapping_set_cow(struct page *page)
+static inline void dax_page_bump_sharing(struct page *page)
 {
-   if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
+   if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_SHARED) {
/*
 * Reset the index if the page was already mapped
 * regularly before.
 */
if (page->mapping)
-   page->index = 1;
-   page->mapping = (void *)PAGE_MAPPING_DAX_COW;
+   page->share = 1;
+   page->mapping = (void *)PAGE_MAPPING_DAX_SHARED;
}
-   page->index++;
+   page->share++;
+}
+
+static inline unsigned long dax_page_drop_sharing(struct page *page)
+{
+   return --page->share;
 }
 
 /*
- * When it is called in dax_insert_entry(), the cow flag will indicate that
+ * When it is called in dax_insert_entry(), the shared flag will indicate that
  * whether this entry is shared by multiple files.  If so, set the 
page->mapping
- * FS_DAX_MAPPING_COW, and use page->index as refcount.
+ * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
  */
 static void dax_associate_entry(void *entry, struct address_space *mapping,
-   struct vm_area_struct *vma, unsigned long address, bool cow)
+   struct vm_area_struct *vma, unsigned long address, bool shared)
 {
unsigned long size = dax_entry_size(entry), pfn, index;
int i = 0;
@@ -374,8 +380,8 @@ static void dax_associate_entry(void *entry, struct 
address_space *mapping,
for_each_mapped_pfn(entry, pfn) {
struct page *page = pfn_to_page(pfn);
 
-   if (cow) {
-   dax_mapping_set_cow(page);
+   if (shared) {
+   dax_page_bump_sharing(page);
} else {
WARN_ON_ONCE(page->mapping);
page->mapping = mapping;
@@ -396,9 +402,9 @@ static void dax_disassociate_entry(void *entry, struct 
address_space *mapping,
struct page *page = pfn_to_page(pfn);
 
WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-   if (dax_mapping_is_cow(page->mapping)) {
-   /* keep the CoW flag if this page is still shared */
-   if (page->index-- > 0)
+   if (dax_page_is_shared(page)) {
+   /* keep the shared flag if this page is still shared */
+   if (dax_page_drop_sharing(page) > 0)
continue;
} else
WARN_ON_ONCE(page->mapping && page->mapping != mapping);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 500e536796ca..f46cac3657ad 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -103,7 +103,10 @@ struct page {
};
/* See page-flags.h for PAGE_MAPPING_FLAGS */
struct address_space *mapping;
-   pgoff_t index;  /* Our offset within mapping. */
+   union {
+   pgoff_t index;  /* Our offset within 
mapping. */
+   unsigned long share;/* share count for 
fsdax */
+   };
/**
 * @private: Mapping-private opaque data.
 * Usually used for buffer_heads if PagePrivate.
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 0b0ae5084e60..c8a3aa02278d 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -641,7 +641,7 @@