On Fri, Sep 24, 2021 at 09:09:58PM +0800, Shiyang Ruan wrote:
> This function is used to handle errors which may cause data lost in
> filesystem.  Such as memory failure in fsdax mode.
> 
> If the rmap feature of XFS enabled, we can query it to find files and
> metadata which are associated with the corrupt data.  For now all we do
> is kill processes with that file mapped into their address spaces, but
> future patches could actually do something about corrupt metadata.
> 
> After that, the memory failure needs to notify the processes who are
> using those files.
> 
> Signed-off-by: Shiyang Ruan <ruansy.f...@fujitsu.com>
> ---
>  drivers/dax/super.c |  19 +++++
>  fs/xfs/xfs_fsops.c  |   3 +
>  fs/xfs/xfs_mount.h  |   1 +
>  fs/xfs/xfs_super.c  | 188 ++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/dax.h |  18 +++++
>  5 files changed, 229 insertions(+)
> 
> diff --git a/drivers/dax/super.c b/drivers/dax/super.c
> index 7d4a11dcba90..22091e7fb0ef 100644
> --- a/drivers/dax/super.c
> +++ b/drivers/dax/super.c
> @@ -135,6 +135,25 @@ struct dax_device *fs_dax_get_by_bdev(struct 
> block_device *bdev)
>  }
>  EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev);
>  
> +void fs_dax_register_holder(struct dax_device *dax_dev, void *holder,
> +             const struct dax_holder_operations *ops)
> +{
> +     dax_set_holder(dax_dev, holder, ops);
> +}
> +EXPORT_SYMBOL_GPL(fs_dax_register_holder);
> +
> +void fs_dax_unregister_holder(struct dax_device *dax_dev)
> +{
> +     dax_set_holder(dax_dev, NULL, NULL);
> +}
> +EXPORT_SYMBOL_GPL(fs_dax_unregister_holder);
> +
> +void *fs_dax_get_holder(struct dax_device *dax_dev)
> +{
> +     return dax_get_holder(dax_dev);
> +}
> +EXPORT_SYMBOL_GPL(fs_dax_get_holder);
> +
>  bool generic_fsdax_supported(struct dax_device *dax_dev,
>               struct block_device *bdev, int blocksize, sector_t start,
>               sector_t sectors)
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index 33e26690a8c4..4c2d3d4ca5a5 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -542,6 +542,9 @@ xfs_do_force_shutdown(
>       } else if (flags & SHUTDOWN_CORRUPT_INCORE) {
>               tag = XFS_PTAG_SHUTDOWN_CORRUPT;
>               why = "Corruption of in-memory data";
> +     } else if (flags & SHUTDOWN_CORRUPT_META) {
> +             tag = XFS_PTAG_SHUTDOWN_CORRUPT;
> +             why = "Corruption of on-disk metadata";
>       } else {
>               tag = XFS_PTAG_SHUTDOWN_IOERROR;
>               why = "Metadata I/O Error";
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index e091f3b3fa15..d0f6da23e3df 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -434,6 +434,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int 
> lags, char *fname,
>  #define SHUTDOWN_LOG_IO_ERROR        0x0002  /* write attempt to the log 
> failed */
>  #define SHUTDOWN_FORCE_UMOUNT        0x0004  /* shutdown from a forced 
> unmount */
>  #define SHUTDOWN_CORRUPT_INCORE      0x0008  /* corrupt in-memory data 
> structures */
> +#define SHUTDOWN_CORRUPT_META        0x0010  /* corrupt metadata on device */
>  
>  #define XFS_SHUTDOWN_STRINGS \
>       { SHUTDOWN_META_IO_ERROR,       "metadata_io" }, \
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index c4e0cd1c1c8c..46fdf44b5ec2 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -37,11 +37,19 @@
>  #include "xfs_reflink.h"
>  #include "xfs_pwork.h"
>  #include "xfs_ag.h"
> +#include "xfs_alloc.h"
> +#include "xfs_rmap.h"
> +#include "xfs_rmap_btree.h"
> +#include "xfs_rtalloc.h"
> +#include "xfs_bit.h"
>  
>  #include <linux/magic.h>
>  #include <linux/fs_context.h>
>  #include <linux/fs_parser.h>
> +#include <linux/mm.h>
> +#include <linux/dax.h>
>  
> +static const struct dax_holder_operations xfs_dax_holder_operations;
>  static const struct super_operations xfs_super_operations;
>  
>  static struct kset *xfs_kset;                /* top-level xfs sysfs dir */
> @@ -377,6 +385,8 @@ xfs_close_devices(
>  
>               xfs_free_buftarg(mp->m_logdev_targp);
>               xfs_blkdev_put(logdev);
> +             if (dax_logdev)
> +                     fs_dax_unregister_holder(dax_logdev);
>               fs_put_dax(dax_logdev);
>       }
>       if (mp->m_rtdev_targp) {
> @@ -385,9 +395,13 @@ xfs_close_devices(
>  
>               xfs_free_buftarg(mp->m_rtdev_targp);
>               xfs_blkdev_put(rtdev);
> +             if (dax_rtdev)
> +                     fs_dax_unregister_holder(dax_rtdev);
>               fs_put_dax(dax_rtdev);
>       }
>       xfs_free_buftarg(mp->m_ddev_targp);
> +     if (dax_ddev)
> +             fs_dax_unregister_holder(dax_ddev);
>       fs_put_dax(dax_ddev);
>  }
>  
> @@ -411,6 +425,9 @@ xfs_open_devices(
>       struct block_device     *logdev = NULL, *rtdev = NULL;
>       int                     error;
>  
> +     if (dax_ddev)
> +             fs_dax_register_holder(dax_ddev, mp,
> +                             &xfs_dax_holder_operations);
>       /*
>        * Open real time and log devices - order is important.
>        */
> @@ -419,6 +436,9 @@ xfs_open_devices(
>               if (error)
>                       goto out;
>               dax_logdev = fs_dax_get_by_bdev(logdev);
> +             if (dax_logdev != dax_ddev && dax_logdev)
> +                     fs_dax_register_holder(dax_logdev, mp,
> +                                     &xfs_dax_holder_operations);
>       }
>  
>       if (mp->m_rtname) {
> @@ -433,6 +453,9 @@ xfs_open_devices(
>                       goto out_close_rtdev;
>               }
>               dax_rtdev = fs_dax_get_by_bdev(rtdev);
> +             if (dax_rtdev)
> +                     fs_dax_register_holder(dax_rtdev, mp,
> +                                     &xfs_dax_holder_operations);
>       }
>  
>       /*
> @@ -1110,6 +1133,171 @@ xfs_fs_free_cached_objects(
>       return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
>  }
>  
> +struct notify_failure_info {
> +     xfs_agblock_t startblock;

Style nit: tabs between type name and variable name.

> +     xfs_filblks_t blockcount;
> +     int flags;

Are these MF_ flags to be passed from memory_failure to
mf_dax_kill_procs?

> +};
> +
> +static loff_t
> +xfs_notify_failure_start(
> +     struct xfs_mount                        *mp,
> +     const struct xfs_rmap_irec              *rec,
> +     const struct notify_failure_info        *notify)
> +{
> +     loff_t start = rec->rm_offset;
> +
> +     if (notify->startblock > rec->rm_startblock)
> +             start += XFS_FSB_TO_B(mp,
> +                             notify->startblock - rec->rm_startblock);

I'm confused, is this supposed to return the file pos(ition) of the
failed range in units of bytes or in fs blocks?

If it's units of bytes (like the loff_t return value implies) then this
should be called xfs_notify_failure_pos.

> +     return start;
> +}
> +
> +static size_t
> +xfs_notify_failure_size(
> +     struct xfs_mount                        *mp,
> +     const struct xfs_rmap_irec              *rec,
> +     const struct notify_failure_info        *notify)
> +{
> +     xfs_agblock_t rec_start = rec->rm_startblock;
> +     xfs_agblock_t rec_end = rec->rm_startblock + rec->rm_blockcount;

These are "next" variables, not "end".  The end of the record is
startblock + blockcount - 1.

> +     xfs_agblock_t notify_start = notify->startblock;
> +     xfs_agblock_t notify_end = notify->startblock + notify->blockcount;
> +     xfs_agblock_t cross_start = max(rec_start, notify_start);
> +     xfs_agblock_t cross_end = min(rec_end, notify_end);
> +
> +     return XFS_FSB_TO_B(mp, cross_end - cross_start);
> +}
> +
> +static int
> +xfs_dax_notify_failure_fn(
> +     struct xfs_btree_cur            *cur,
> +     const struct xfs_rmap_irec      *rec,
> +     void                            *data)
> +{
> +     struct xfs_mount                *mp = cur->bc_mp;
> +     struct xfs_inode                *ip;
> +     struct address_space            *mapping;
> +     struct notify_failure_info      *notify = data;
> +     int                             error = 0;
> +
> +     if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
> +         (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
> +             // TODO check and try to fix metadata
> +             xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_META);
> +             return -EFSCORRUPTED;
> +     }
> +
> +     /* Get files that incore, filter out others that are not in use. */
> +     error = xfs_iget(mp, cur->bc_tp, rec->rm_owner, XFS_IGET_INCORE,
> +                      0, &ip);

If you're going to use _INCORE then you probably want to filter out the
-ENODATA or whatever error code means "inode wasn't loaded", because
returning any nonzero value to rmap_query_range causes it to stop
iterating.

> +     if (error)
> +             return error;
> +
> +     mapping = VFS_I(ip)->i_mapping;
> +     if (IS_ENABLED(CONFIG_MEMORY_FAILURE)) {
> +             loff_t offset = xfs_notify_failure_start(mp, rec, notify);
> +             size_t size = xfs_notify_failure_size(mp, rec, notify);
> +
> +             error = mf_dax_kill_procs(mapping, offset >> PAGE_SHIFT, size,
> +                                       notify->flags);
> +     }
> +     // TODO try to fix data
> +     xfs_irele(ip);
> +
> +     return error;
> +}
> +
> +static loff_t
> +xfs_dax_bdev_offset(
> +     struct xfs_mount *mp,
> +     struct dax_device *dax_dev,
> +     loff_t disk_offset)
> +{
> +     struct block_device *bdev;
> +
> +     if (mp->m_ddev_targp->bt_daxdev == dax_dev)
> +             bdev = mp->m_ddev_targp->bt_bdev;
> +     else if (mp->m_logdev_targp->bt_daxdev == dax_dev)
> +             bdev = mp->m_logdev_targp->bt_bdev;
> +     else
> +             bdev = mp->m_rtdev_targp->bt_bdev;
> +
> +     return disk_offset - (get_start_sect(bdev) << SECTOR_SHIFT);
> +}
> +
> +static int
> +xfs_dax_notify_failure(
> +     struct dax_device       *dax_dev,
> +     loff_t                  offset,
> +     size_t                  len,
> +     int                     flags)

Are @flags supposed to contain the MF_ flags that were passed to
memory_failure()?  The variable name should probably be @mf_flags
throughout the patchset if that's the case.

> +{
> +     struct xfs_mount        *mp = fs_dax_get_holder(dax_dev);
> +     struct xfs_trans        *tp = NULL;
> +     struct xfs_btree_cur    *cur = NULL;
> +     struct xfs_buf          *agf_bp = NULL;
> +     struct xfs_rmap_irec    rmap_low, rmap_high;
> +     loff_t                  bdev_offset = xfs_dax_bdev_offset(mp, dax_dev,
> +                                                               offset);

I don't like using loff_t to represent byte offsets into the physical
device.  loff_t should be used only for file byte offsets, and that's
not what we're storing here.

> +     xfs_fsblock_t           fsbno = XFS_B_TO_FSB(mp, bdev_offset);

I think this is still wrong, since fsblocks are segmented (nonlinear)
addresses.  Pass daddr into xfs_dax_notify_ddev_failure like I lay out
below, and then you can do:

        start_fsbno = XFS_DADDR_TO_FSB(mp, daddr);
        agno = XFS_FSB_TO_AGNO(mp, fsbno);
        notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
        notify.blockcount = XFS_BB_TO_FSB(mp, bblen);

(More on this below)

> +     xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, fsbno);
> +     int                     error = 0;
> +     struct notify_failure_info notify = {
> +             .startblock     = XFS_FSB_TO_AGBNO(mp, fsbno),
> +             .blockcount     = XFS_B_TO_FSB(mp, len),
> +             .flags          = flags,
> +     };
> +
> +     if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
> +             xfs_warn(mp,
> +                      "notify_failure() not supported on realtime device!");
> +             return -EOPNOTSUPP;
> +     }
> +
> +     if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
> +         mp->m_logdev_targp != mp->m_ddev_targp) {
> +             xfs_err(mp, "ondisk log corrupt, shutting down fs!");
> +             xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_META);
> +             return -EFSCORRUPTED;
> +     }

Urk.  xfs_dax_notify_failure should be a short function to dispatch the
notification to the proper handler.  Everything from here down should be
in a separate function xfs_dax_notify_ddev_failure, so that the next
line of xfs_dax_notify_failure is just:

        return xfs_dax_notify_ddev_failure(mp, BTOBB(offset),
                        BTOBB(len), flags);

And then we have

static int
xfs_dax_notify_ddev_failure(
        struct xfs_mount        *mp,
        xfs_daddr_t             daddr,
        xfs_daddr_t             bblen,
        int                     mf_flags)
{
        if (!xfs_has_rmapbt(mp)) {
                xfs_warn(...);

Because otherwise xfs_dax_notify_failure gets cluttered.

> +
> +     if (!xfs_has_rmapbt(mp)) {
> +             xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
> +             return -EOPNOTSUPP;
> +     }
> +
> +     error = xfs_trans_alloc_empty(mp, &tp);
> +     if (error)
> +             return error;
> +
> +     error = xfs_alloc_read_agf(mp, tp, agno, 0, &agf_bp);
> +     if (error)
> +             goto out_cancel_tp;
> +
> +     cur = xfs_rmapbt_init_cursor(mp, tp, agf_bp, agf_bp->b_pag);

What happens if the failure range spans multiple AGs?  I suppose it's
not technically possible today since we only report a single page at a
time.  But for the general case, I think we actually want (building off
the sample code above) this function to do something like this:

        start_fsbno = XFS_DADDR_TO_FSB(mp, daddr);
        agno = XFS_FSB_TO_AGNO(mp, fsbno);
        end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
        end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);

        for (; agno <= end_agbno; agno++) {
                struct xfs_rmap_irec    rmap_low = { };
                struct xfs_rmap_irec    rmap_high;

                notify.startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
                notify.blockcount = XFS_BB_TO_FSB(mp, bblen);

                /*
                 * init transaction, read agf, init cursor...
                 */

                memset(&rmap_high, 0xFF, sizeof(rmap_high));
                rmap_low.rm_startblock = XFS_FSB_TO_AGBNO(mp, fsbno);
                if (agno == end_agbno)
                        rmap_high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
                                                        end_fsbno);

                error = xfs_rmap_query_range(...);
                if (error)
                        fail;

                fsbno = XFS_AGB_TO_FSB(mp, agno + 1, 0);
        }

--D

> +
> +     /* Construct a range for rmap query */
> +     memset(&rmap_low, 0, sizeof(rmap_low));
> +     memset(&rmap_high, 0xFF, sizeof(rmap_high));
> +     rmap_low.rm_startblock = rmap_high.rm_startblock = notify.startblock;
> +     rmap_low.rm_blockcount = rmap_high.rm_blockcount = notify.blockcount;
> +
> +     error = xfs_rmap_query_range(cur, &rmap_low, &rmap_high,
> +                                  xfs_dax_notify_failure_fn, &notify);
> +
> +     xfs_btree_del_cursor(cur, error);
> +     xfs_trans_brelse(tp, agf_bp);
> +
> +out_cancel_tp:
> +     xfs_trans_cancel(tp);
> +     return error;
> +}
> +
> +static const struct dax_holder_operations xfs_dax_holder_operations = {
> +     .notify_failure         = xfs_dax_notify_failure,
> +};
> +
>  static const struct super_operations xfs_super_operations = {
>       .alloc_inode            = xfs_fs_alloc_inode,
>       .destroy_inode          = xfs_fs_destroy_inode,
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 3d90becbd160..0f1fa7a4a616 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -149,6 +149,10 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
>  }
>  
>  struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
> +void fs_dax_register_holder(struct dax_device *dax_dev, void *holder,
> +             const struct dax_holder_operations *ops);
> +void fs_dax_unregister_holder(struct dax_device *dax_dev);
> +void *fs_dax_get_holder(struct dax_device *dax_dev);
>  int dax_writeback_mapping_range(struct address_space *mapping,
>               struct dax_device *dax_dev, struct writeback_control *wbc);
>  
> @@ -179,6 +183,20 @@ static inline struct dax_device 
> *fs_dax_get_by_bdev(struct block_device *bdev)
>       return NULL;
>  }
>  
> +static inline void fs_dax_register_holder(struct dax_device *dax_dev,
> +             void *holder, const struct dax_holder_operations *ops)
> +{
> +}
> +
> +static inline void fs_dax_unregister_holder(struct dax_device *dax_dev)
> +{
> +}
> +
> +static inline void *fs_dax_get_holder(struct dax_device *dax_dev)
> +{
> +     return NULL;
> +}
> +
>  static inline struct page *dax_layout_busy_page(struct address_space 
> *mapping)
>  {
>       return NULL;
> -- 
> 2.33.0
> 
> 
> 

Reply via email to