On Fri, Aug 07, 2020 at 09:13:29PM +0800, Shiyang Ruan wrote:
> Under the mode of both dax and reflink on, one page may be shared by
> multiple files and offsets.  In order to track them in memory-failure or
> other cases, we introduce this function by finding out who is sharing
> this block(the page) in a filesystem.  It returns a list that contains
> all the owners, and the offset in each owner.
> 
> For XFS, rmapbt is used to find out the owners of one block.  So, it
> should be turned on when we want to use dax&reflink feature together.
> 
> Signed-off-by: Shiyang Ruan <[email protected]>
> ---
>  fs/xfs/xfs_super.c  | 67 +++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/dax.h |  7 +++++
>  include/linux/fs.h  |  2 ++
>  3 files changed, 76 insertions(+)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index 379cbff438bc..b71392219c91 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -35,6 +35,9 @@
>  #include "xfs_refcount_item.h"
>  #include "xfs_bmap_item.h"
>  #include "xfs_reflink.h"
> +#include "xfs_alloc.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
>  
>  #include <linux/magic.h>
>  #include <linux/fs_context.h>
> @@ -1097,6 +1100,69 @@ xfs_fs_free_cached_objects(
>       return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
>  }
>  
> +static int _get_shared_files_fn(

Needs an xfs_ prefix...

> +     struct xfs_btree_cur    *cur,
> +     struct xfs_rmap_irec    *rec,
> +     void                    *priv)
> +{
> +     struct list_head        *list = priv;
> +     struct xfs_inode        *ip;
> +     struct shared_files     *sfp;
> +
> +     /* Get files that incore, filter out others that are not in use. */
> +     xfs_iget(cur->bc_mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE, 0, &ip);

No error checking at all?

What if rm_owner refers to metadata?

> +     if (ip && !ip->i_vnode.i_mapping)
> +             return 0;

When is the xfs_inode released?  We don't iput it here, and there's no
way for dax_unlock_page (afaict the only consumer) to do it, so we
leak the reference.

> +
> +     sfp = kmalloc(sizeof(*sfp), GFP_KERNEL);

If there are millions of open files reflinked to this range of pmem this
is going to allocate a /lot/ of memory.

> +     sfp->mapping = ip->i_vnode.i_mapping;

sfp->mapping = VFS_I(ip)->i_mapping;

> +     sfp->index = rec->rm_offset;
> +     list_add_tail(&sfp->list, list);

Why do we leave ->cookie uninitialized?  What does it even do?

> +
> +     return 0;
> +}
> +
> +static int
> +xfs_fs_get_shared_files(
> +     struct super_block      *sb,
> +     pgoff_t                 offset,

Which device does this offset refer to?  XFS supports multiple storage
devices.

Also, uh, is this really a pgoff_t?  If yes, you can't use it with
XFS_B_TO_FSB below without first converting it to a loff_t.

> +     struct list_head        *list)
> +{
> +     struct xfs_mount        *mp = XFS_M(sb);
> +     struct xfs_trans        *tp = NULL;
> +     struct xfs_btree_cur    *cur = NULL;
> +     struct xfs_rmap_irec    rmap_low = { 0 }, rmap_high = { 0 };

rmap_low is already zeroed by this initializer, so the memset(0) later on
is unnecessary; and there's no point zeroing rmap_high here just to
memset it to 0xFF later.

> +     struct xfs_buf          *agf_bp = NULL;
> +     xfs_agblock_t           bno = XFS_B_TO_FSB(mp, offset);

"FSB" refers to xfs_fsblock_t.  You just ripped the upper 32 bits off
the fsblock number.

> +     xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, bno);
> +     int                     error = 0;
> +
> +     error = xfs_trans_alloc_empty(mp, &tp);
> +     if (error)
> +             return error;
> +
> +     error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +     if (error)
> +             return error;
> +
> +     cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agno);
> +
> +     memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));

Not necessary, bc_rec is zero in a freshly created cursor.

> +     /* Construct the range for one rmap search */
> +     memset(&rmap_low, 0, sizeof(rmap_low));
> +     memset(&rmap_high, 0xFF, sizeof(rmap_high));
> +     rmap_low.rm_startblock = rmap_high.rm_startblock = bno;
> +
> +     error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
> +                                  _get_shared_files_fn, list);
> +     if (error == -ECANCELED)
> +             error = 0;
> +
> +     xfs_btree_del_cursor(cur, error);
> +     xfs_trans_brelse(tp, agf_bp);
> +     return error;
> +}

Looking at this, I don't think this is the right way to approach memory
poisoning.  Rather than allocating a (potentially huge) linked list and
passing it to the memory poison code to unmap pages, kill processes, and
free the list, I think:

1) "->get_shared_files" should be more targeted.  Call it ->storage_lost
or something, so that it only has one purpose, which is to react to
asynchronous notifications that storage has been lost.

2) The inner _get_shared_files_fn should directly call back into the
memory manager to remove a poisoned page from the mapping and signal
whatever process might have it mapped.

That way, _get_shared_files_fn can look in the xfs buffer cache to see
if we have a copy in DRAM, and immediately write it back to pmem.

Hmm, and now that you've gotten me rambling about hwpoison, I wonder what
happens if the DRAM backing part of the xfs buffer cache goes bad...

--D

> +
>  static const struct super_operations xfs_super_operations = {
>       .alloc_inode            = xfs_fs_alloc_inode,
>       .destroy_inode          = xfs_fs_destroy_inode,
> @@ -1110,6 +1176,7 @@ static const struct super_operations xfs_super_operations = {
>       .show_options           = xfs_fs_show_options,
>       .nr_cached_objects      = xfs_fs_nr_cached_objects,
>       .free_cached_objects    = xfs_fs_free_cached_objects,
> +     .get_shared_files       = xfs_fs_get_shared_files,
>  };
>  
>  static int
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 6904d4e0b2e0..0a85e321d6b4 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -40,6 +40,13 @@ struct dax_operations {
>  
>  extern struct attribute_group dax_attribute_group;
>  
> +struct shared_files {
> +     struct list_head        list;
> +     struct address_space    *mapping;
> +     pgoff_t                 index;
> +     dax_entry_t             cookie;
> +};
> +
>  #if IS_ENABLED(CONFIG_DAX)
>  struct dax_device *dax_get_by_host(const char *host);
>  struct dax_device *alloc_dax(void *private, const char *host,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f5abba86107d..81de3d2739b9 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1977,6 +1977,8 @@ struct super_operations {
>                                 struct shrink_control *);
>       long (*free_cached_objects)(struct super_block *,
>                                   struct shrink_control *);
> +     int (*get_shared_files)(struct super_block *sb, pgoff_t offset,
> +                             struct list_head *list);
>  };
>  
>  /*
> -- 
> 2.27.0
> 
> 
> 

Reply via email to