[Cluster-devel] [PATCH 10/12] xfs: hold i_rwsem until AIO completes

Christoph Hellwig Tue, 14 Jan 2020 08:41:07 -0800

Switch ext4 from the magic i_dio_count scheme to just hold i_rwsem
until the actual I/O has completed to reduce the locking complexity
and avoid nasty bugs due to missing inode_dio_wait calls.


Signed-off-by: Christoph Hellwig <h...@lst.de>
---
 fs/xfs/scrub/bmap.c    |  1 -
 fs/xfs/xfs_bmap_util.c |  3 ---
 fs/xfs/xfs_file.c      | 47 +++++++++++++-----------------------------
 fs/xfs/xfs_icache.c    |  3 +--
 fs/xfs/xfs_ioctl.c     |  1 -
 fs/xfs/xfs_iops.c      |  5 -----
 fs/xfs/xfs_reflink.c   |  2 --
 7 files changed, 15 insertions(+), 47 deletions(-)

diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index fa6ea6407992..d3e4068d3189 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -45,7 +45,6 @@ xchk_setup_inode_bmap(
         */
        if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
            sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
-               inode_dio_wait(VFS_I(sc->ip));
                error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
                if (error)
                        goto out;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e62fb5216341..a454f481107e 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -674,9 +674,6 @@ xfs_free_eofblocks(
                if (error)
                        return error;
 
-               /* wait on dio to ensure i_size has settled */
-               inode_dio_wait(VFS_I(ip));
-
                error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
                                &tp);
                if (error) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 0cc843a4a163..d0ee7d2932e4 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -193,9 +193,11 @@ xfs_file_dio_aio_read(
        } else {
                xfs_ilock(ip, XFS_IOLOCK_SHARED);
        }
-       ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 
+       ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
+                          IOMAP_DIO_RWSEM_SHARED);
+       if (ret != -EIOCBQUEUED)
+               xfs_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
 }
 
@@ -341,15 +343,6 @@ xfs_file_aio_write_checks(
                                xfs_ilock(ip, *iolock);
                                iov_iter_reexpand(from, count);
                        }
-                       /*
-                        * We now have an IO submission barrier in place, but
-                        * AIO can do EOF updates during IO completion and hence
-                        * we now need to wait for all of them to drain. Non-AIO
-                        * DIO will have drained before we are given the
-                        * XFS_IOLOCK_EXCL, and so for most cases this wait is a
-                        * no-op.
-                        */
-                       inode_dio_wait(inode);
                        drained_dio = true;
                        goto restart;
                }
@@ -469,13 +462,7 @@ static const struct iomap_dio_ops xfs_dio_write_ops = {
  * needs to do sub-block zeroing and that requires serialisation against other
  * direct IOs to the same block. In this case we need to serialise the
  * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer.  To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. inode_dio_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
+ * the dio layer.
  */
 STATIC ssize_t
 xfs_file_dio_aio_write(
@@ -546,18 +533,21 @@ xfs_file_dio_aio_write(
         * xfs_file_aio_write_checks() for other reasons.
         */
        if (unaligned_io) {
-               inode_dio_wait(inode);
-               dio_flags = IOMAP_DIO_SYNCHRONOUS;
-       } else if (iolock == XFS_IOLOCK_EXCL) {
-               xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
-               iolock = XFS_IOLOCK_SHARED;
+               dio_flags = IOMAP_DIO_RWSEM_EXCL | IOMAP_DIO_SYNCHRONOUS;
+       } else {
+               if (iolock == XFS_IOLOCK_EXCL) {
+                       xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
+                       iolock = XFS_IOLOCK_SHARED;
+               }
+               dio_flags = IOMAP_DIO_RWSEM_SHARED;
        }
 
        trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
        ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
                           &xfs_dio_write_ops, dio_flags);
 out:
-       xfs_iunlock(ip, iolock);
+       if (ret != -EIOCBQUEUED)
+               xfs_iunlock(ip, iolock);
 
        /*
         * No fallback to buffered IO on errors for XFS, direct IO will either
@@ -819,15 +809,6 @@ xfs_file_fallocate(
        if (error)
                goto out_unlock;
 
-       /*
-        * Must wait for all AIO to complete before we continue as AIO can
-        * change the file size on completion without holding any locks we
-        * currently hold. We must do this first because AIO can update both
-        * the on disk and in memory inode sizes, and the operations that follow
-        * require the in-memory size to be fully up-to-date.
-        */
-       inode_dio_wait(inode);
-
        /*
         * Now AIO and DIO has drained we flush and (if necessary) invalidate
         * the cached range over the first operation we are about to run.
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 8dc2e5414276..9e6f32fd32f5 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1720,8 +1720,7 @@ xfs_prep_free_cowblocks(
         */
        if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
            mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
-           mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
-           atomic_read(&VFS_I(ip)->i_dio_count))
+           mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK))
                return false;
 
        return true;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7b35d62ede9f..331453f2c4be 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -548,7 +548,6 @@ xfs_ioc_space(
        error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
        if (error)
                goto out_unlock;
-       inode_dio_wait(inode);
 
        switch (bf->l_whence) {
        case 0: /*SEEK_SET*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8afe69ca188b..700edeccc6bf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -893,11 +893,6 @@ xfs_setattr_size(
        if (error)
                return error;
 
-       /*
-        * Wait for all direct I/O to complete.
-        */
-       inode_dio_wait(inode);
-
        /*
         * File data changes must be complete before we start the transaction to
         * modify the inode.  This needs to be done before joining the inode to
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index de451235c4ee..f775e60ca6f7 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1525,8 +1525,6 @@ xfs_reflink_unshare(
 
        trace_xfs_reflink_unshare(ip, offset, len);
 
-       inode_dio_wait(inode);
-
        error = iomap_file_unshare(inode, offset, len,
                        &xfs_buffered_write_iomap_ops);
        if (error)
-- 
2.24.1

[Cluster-devel] [PATCH 10/12] xfs: hold i_rwsem until AIO completes

Reply via email to