[PATCH 01/10] fs: Separate out kiocb flags setup based on RWF_* flags

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Signed-off-by: Goldwyn Rodrigues 
---
 fs/read_write.c| 12 +++-
 include/linux/fs.h | 14 ++
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/fs/read_write.c b/fs/read_write.c
index 47c1d4484df9..53c816c61122 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -678,16 +678,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, 
struct iov_iter *iter,
struct kiocb kiocb;
ssize_t ret;
 
-   if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
-   return -EOPNOTSUPP;
-
init_sync_kiocb(, filp);
-   if (flags & RWF_HIPRI)
-   kiocb.ki_flags |= IOCB_HIPRI;
-   if (flags & RWF_DSYNC)
-   kiocb.ki_flags |= IOCB_DSYNC;
-   if (flags & RWF_SYNC)
-   kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+   ret = kiocb_set_rw_flags(, flags);
+   if (ret)
+   return ret;
kiocb.ki_pos = *ppos;
 
if (type == READ)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 803e5a9b2654..f53867140f43 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3056,6 +3056,20 @@ static inline int iocb_flags(struct file *file)
return res;
 }
 
+static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
+{
+   if (unlikely(flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)))
+   return -EOPNOTSUPP;
+
+   if (flags & RWF_HIPRI)
+   ki->ki_flags |= IOCB_HIPRI;
+   if (flags & RWF_DSYNC)
+   ki->ki_flags |= IOCB_DSYNC;
+   if (flags & RWF_SYNC)
+   ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+   return 0;
+}
+
 static inline ino_t parent_ino(struct dentry *dentry)
 {
ino_t res;
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/10] fs: Introduce filemap_range_has_page()

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

filemap_range_has_page() returns true if the file's mapping has
a page within the range mentioned. This function will be used
to check if a write() call will cause a writeback of previous
writes.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Signed-off-by: Goldwyn Rodrigues 
---
 include/linux/fs.h |  2 ++
 mm/filemap.c   | 33 +
 2 files changed, 35 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index f53867140f43..dc0ab585cd56 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2517,6 +2517,8 @@ extern int filemap_fdatawait(struct address_space *);
 extern void filemap_fdatawait_keep_errors(struct address_space *);
 extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
   loff_t lend);
+extern int filemap_range_has_page(struct address_space *, loff_t lstart,
+ loff_t lend);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f1be573a5e6..87aba7698584 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -376,6 +376,39 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
+/**
+ * filemap_range_has_page - check if a page exists in range.
+ * @mapping:   address space structure to check
+ * @start_byte:offset in bytes where the range starts
+ * @end_byte:  offset in bytes where the range ends (inclusive)
+ *
+ * Find at least one page in the range supplied, usually used to check if
+ * direct writing in this range will trigger a writeback.
+ */
+int filemap_range_has_page(struct address_space *mapping,
+  loff_t start_byte, loff_t end_byte)
+{
+   pgoff_t index = start_byte >> PAGE_SHIFT;
+   pgoff_t end = end_byte >> PAGE_SHIFT;
+   struct pagevec pvec;
+   int ret;
+
+   if (end_byte < start_byte)
+   return 0;
+
+   if (mapping->nrpages == 0)
+   return 0;
+
+   pagevec_init(, 0);
+   ret = pagevec_lookup(, mapping, index, 1);
+   if (!ret)
+   return 0;
+   ret = (pvec.pages[0]->index <= end);
+   pagevec_release();
+   return ret;
+}
+EXPORT_SYMBOL(filemap_range_has_page);
+
 static int __filemap_fdatawait_range(struct address_space *mapping,
 loff_t start_byte, loff_t end_byte)
 {
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/10] fs: Introduce IOMAP_NOWAIT

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps.
This is used by XFS in the XFS patch.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Signed-off-by: Goldwyn Rodrigues 
---
 fs/iomap.c| 2 ++
 include/linux/iomap.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index 4b10892967a5..5d85ec6e7b20 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -879,6 +879,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
} else {
dio->flags |= IOMAP_DIO_WRITE;
flags |= IOMAP_WRITE;
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   flags |= IOMAP_NOWAIT;
}
 
ret = filemap_write_and_wait_range(mapping, start, end);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index f753e788da31..69f4e9470084 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -52,6 +52,7 @@ struct iomap {
 #define IOMAP_REPORT   (1 << 2) /* report extent status, e.g. FIEMAP */
 #define IOMAP_FAULT(1 << 3) /* mapping for page fault */
 #define IOMAP_DIRECT   (1 << 4) /* direct I/O */
+#define IOMAP_NOWAIT   (1 << 5) /* Don't wait for writeback */
 
 struct iomap_ops {
/*
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/10] xfs: nowait aio support

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

If IOCB_NOWAIT is set, bail if the i_rwsem is not lockable
immediately.

If IOMAP_NOWAIT is set, return EAGAIN in xfs_file_iomap_begin
if it needs allocation either due to file extension, writing to a hole,
or COW or waiting for other DIOs to finish.

Return -EAGAIN if we don't have extent list in memory.

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Darrick J. Wong 
---
 fs/xfs/xfs_file.c  | 19 ++-
 fs/xfs/xfs_iomap.c | 22 ++
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5fb5a0958a14..f87a8a66e6f7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -541,8 +541,11 @@ xfs_file_dio_aio_write(
iolock = XFS_IOLOCK_SHARED;
}
 
-   xfs_ilock(ip, iolock);
-
+   if (!xfs_ilock_nowait(ip, iolock)) {
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EAGAIN;
+   xfs_ilock(ip, iolock);
+   }
ret = xfs_file_aio_write_checks(iocb, from, );
if (ret)
goto out;
@@ -553,9 +556,15 @@ xfs_file_dio_aio_write(
 * otherwise demote the lock if we had to take the exclusive lock
 * for other reasons in xfs_file_aio_write_checks.
 */
-   if (unaligned_io)
-   inode_dio_wait(inode);
-   else if (iolock == XFS_IOLOCK_EXCL) {
+   if (unaligned_io) {
+   /* If we are going to wait for other DIO to finish, bail */
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   if (atomic_read(>i_dio_count))
+   return -EAGAIN;
+   } else {
+   inode_dio_wait(inode);
+   }
+   } else if (iolock == XFS_IOLOCK_EXCL) {
xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
iolock = XFS_IOLOCK_SHARED;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 94e5bdf7304c..05dc87e8c1f5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -995,6 +995,11 @@ xfs_file_iomap_begin(
lockmode = xfs_ilock_data_map_shared(ip);
}
 
+   if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) {
+   error = -EAGAIN;
+   goto out_unlock;
+   }
+
ASSERT(offset <= mp->m_super->s_maxbytes);
if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
length = mp->m_super->s_maxbytes - offset;
@@ -1016,6 +1021,15 @@ xfs_file_iomap_begin(
 
if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
if (flags & IOMAP_DIRECT) {
+   /*
+* A reflinked inode will result in CoW alloc.
+* FIXME: It could still overwrite on unshared extents
+* and not need allocation.
+*/
+   if (flags & IOMAP_NOWAIT) {
+   error = -EAGAIN;
+   goto out_unlock;
+   }
/* may drop and re-acquire the ilock */
error = xfs_reflink_allocate_cow(ip, , ,
);
@@ -1033,6 +1047,14 @@ xfs_file_iomap_begin(
 
if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, , nimaps)) {
/*
+* If nowait is set bail since we are going to make
+* allocations.
+*/
+   if (flags & IOMAP_NOWAIT) {
+   error = -EAGAIN;
+   goto out_unlock;
+   }
+   /*
 * We cap the maximum length we map here to MAX_WRITEBACK_PAGES
 * pages to keep the chunks of work done where somewhat 
symmetric
 * with the work writeback does. This is a completely arbitrary
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/10] ext4: nowait aio support

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Return EAGAIN if any of the following conditions holds for direct I/O:
  + i_rwsem is not immediately lockable
  + Writing beyond end of file (will trigger allocation)
  + Blocks are not allocated at the write location

Signed-off-by: Goldwyn Rodrigues 
Reviewed-by: Jan Kara 
---
 fs/ext4/file.c | 20 
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 831fd6beebf0..07f08ff2c11b 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -216,7 +216,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
return ext4_dax_write_iter(iocb, from);
 #endif
 
-   inode_lock(inode);
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   if (!inode_trylock(inode))
+   return -EAGAIN;
+   } else {
+   inode_lock(inode);
+   }
+
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
@@ -235,9 +241,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
 
iocb->private = 
/* Check whether we do a DIO overwrite or not */
-   if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio &&
-   ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from)))
-   overwrite = 1;
+   if (o_direct && !unaligned_aio) {
+   if (ext4_overwrite_io(inode, iocb->ki_pos, 
iov_iter_count(from))) {
+   if (ext4_should_dioread_nolock(inode))
+   overwrite = 1;
+   } else if (iocb->ki_flags & IOCB_NOWAIT) {
+   ret = -EAGAIN;
+   goto out;
+   }
+   }
 
ret = __generic_file_write_iter(iocb, from);
inode_unlock(inode);
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/10] btrfs: nowait aio support

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Return EAGAIN if any of the following conditions holds:
 + i_rwsem is not lockable
 + NODATACOW or PREALLOC is not set
 + Cannot nocow at the desired location
 + Writing beyond end of file which is not allocated

Acked-by: David Sterba 
Signed-off-by: Goldwyn Rodrigues 
---
 fs/btrfs/file.c  | 25 -
 fs/btrfs/inode.c |  3 +++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index da1096eb1a40..aae088e49915 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1875,12 +1875,29 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
ssize_t num_written = 0;
bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
ssize_t err;
-   loff_t pos;
-   size_t count;
+   loff_t pos = iocb->ki_pos;
+   size_t count = iov_iter_count(from);
loff_t oldsize;
int clean_page = 0;
 
-   inode_lock(inode);
+   if ((iocb->ki_flags & IOCB_NOWAIT) &&
+   (iocb->ki_flags & IOCB_DIRECT)) {
+   /* Don't sleep on inode rwsem */
+   if (!inode_trylock(inode))
+   return -EAGAIN;
+   /*
+* We will allocate space in case nodatacow is not set,
+* so bail
+*/
+   if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) ||
+   check_can_nocow(BTRFS_I(inode), pos, ) <= 0) {
+   inode_unlock(inode);
+   return -EAGAIN;
+   }
+   } else
+   inode_lock(inode);
+
err = generic_write_checks(iocb, from);
if (err <= 0) {
inode_unlock(inode);
@@ -1914,8 +1931,6 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 */
update_time_for_write(inode);
 
-   pos = iocb->ki_pos;
-   count = iov_iter_count(from);
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
if (start_pos > oldsize) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17cbe9306faf..2ab71b946829 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8755,6 +8755,9 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct 
iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
+   } else if (iocb->ki_flags & IOCB_NOWAIT) {
+   ret = -EAGAIN;
+   goto out;
}
ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 03/10] fs: Use RWF_* flags for AIO operations

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

aio_rw_flags is introduced in struct iocb (using aio_reserved1) which will
carry the RWF_* flags. We cannot use aio_flags because they are not
checked for validity which may break existing applications.

Note, the only place RWF_HIPRI takes effect is dio_await_one().
In all other locations, the aio code returns -EIOCBQUEUED before the
checks for RWF_HIPRI.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Signed-off-by: Goldwyn Rodrigues 
---
 fs/aio.c | 8 +++-
 include/uapi/linux/aio_abi.h | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index f52d925ee259..020fa0045e3c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
ssize_t ret;
 
/* enforce forwards compatibility on users */
-   if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
+   if (unlikely(iocb->aio_reserved2)) {
pr_debug("EINVAL: reserve field set\n");
return -EINVAL;
}
@@ -1586,6 +1586,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
req->common.ki_flags |= IOCB_EVENTFD;
}
 
+   ret = kiocb_set_rw_flags(>common, iocb->aio_rw_flags);
+   if (unlikely(ret)) {
+   pr_debug("EINVAL: aio_rw_flags\n");
+   goto out_put_req;
+   }
+
ret = put_user(KIOCB_KEY, _iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
index bb2554f7fbd1..a2d4a8ac94ca 100644
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -79,7 +79,7 @@ struct io_event {
 struct iocb {
/* these are internal to the kernel/libc. */
__u64   aio_data;   /* data to be returned in event's data */
-   __u32   PADDED(aio_key, aio_reserved1);
+   __u32   PADDED(aio_key, aio_rw_flags);
/* the kernel sets aio_key to the req # */
 
/* common fields */
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/10] fs: return if direct write will trigger writeback

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

Find out if the write will trigger a wait due to writeback. If yes,
return -EAGAIN.

Return -EINVAL for buffered AIO: there are multiple causes of
delay such as page locks, dirty throttling logic, page loading
from disk etc. which cannot be taken care of.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Signed-off-by: Goldwyn Rodrigues 
---
 mm/filemap.c | 17 ++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 097213275461..bc146efa6815 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2675,6 +2675,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
struct iov_iter *from)
 
pos = iocb->ki_pos;
 
+   if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
+   return -EINVAL;
+
if (limit != RLIM_INFINITY) {
if (iocb->ki_pos >= limit) {
send_sig(SIGXFSZ, current, 0);
@@ -2743,9 +2746,17 @@ generic_file_direct_write(struct kiocb *iocb, struct 
iov_iter *from)
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
 
-   written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 
1);
-   if (written)
-   goto out;
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   /* If there are pages to writeback, return */
+   if (filemap_range_has_page(inode->i_mapping, pos,
+  pos + iov_iter_count(from)))
+   return -EAGAIN;
+   } else {
+   written = filemap_write_and_wait_range(mapping, pos,
+   pos + write_len - 1);
+   if (written)
+   goto out;
+   }
 
/*
 * After a write we want buffered reads to be sure to go to disk to get
-- 
2.12.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/10] fs: Introduce RWF_NOWAIT

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

RWF_NOWAIT informs kernel to bail out if an AIO request will block
for reasons such as file allocations, or a writeback triggered,
or would block while allocating requests while performing
direct I/O.

RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags.

The check for -EOPNOTSUPP is placed in generic_file_write_iter(). This
is called by most filesystems, either through fsops.write_iter() or through
the function defined by write_iter(). If not, we perform the check in the
function defined by .write_iter(), which is called for direct IO specifically.

Filesystems xfs, btrfs and ext4 would be supported in the following patches.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Jan Kara 
Signed-off-by: Goldwyn Rodrigues 
---
 fs/9p/vfs_file.c| 3 +++
 fs/aio.c| 6 ++
 fs/ceph/file.c  | 3 +++
 fs/cifs/file.c  | 3 +++
 fs/fuse/file.c  | 3 +++
 fs/nfs/direct.c | 3 +++
 fs/ocfs2/file.c | 3 +++
 include/linux/fs.h  | 5 -
 include/uapi/linux/fs.h | 1 +
 mm/filemap.c| 3 +++
 10 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3de3b4a89d89..403681db7723 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
loff_t origin;
int err = 0;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
retval = generic_write_checks(iocb, from);
if (retval <= 0)
return retval;
diff --git a/fs/aio.c b/fs/aio.c
index 020fa0045e3c..34027b67e2f4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1592,6 +1592,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb 
__user *user_iocb,
goto out_put_req;
}
 
+   if ((req->common.ki_flags & IOCB_NOWAIT) &&
+   !(req->common.ki_flags & IOCB_DIRECT)) {
+   ret = -EOPNOTSUPP;
+   goto out_put_req;
+   }
+
ret = put_user(KIOCB_KEY, _iocb->aio_key);
if (unlikely(ret)) {
pr_debug("EFAULT: aio_key\n");
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 29308a80d66f..366b0bb71f97 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1300,6 +1300,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct 
iov_iter *from)
int err, want, got;
loff_t pos;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
if (ceph_snap(inode) != CEPH_NOSNAP)
return -EROFS;
 
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0fd081bd2a2f..ff84fa9ddb6c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2725,6 +2725,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct 
iov_iter *from)
 * write request.
 */
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
rc = generic_write_checks(iocb, from);
if (rc <= 0)
return rc;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 3ee4fdc3da9e..812c7bd0c290 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1425,6 +1425,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
ssize_t res;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
if (is_bad_inode(inode))
return -EIO;
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 6fb9fad2d1e6..c8e7dd76126c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -979,6 +979,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct 
iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
result = generic_write_checks(iocb, iter);
if (result <= 0)
return result;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index bfeb647459d9..e7f8ba890305 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2235,6 +2235,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
if (count == 0)
return 0;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EOPNOTSUPP;
+
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
inode_lock(inode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dc0ab585cd56..2a7d14af6d12 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -268,6 +268,7 @@ struct writeback_control;
 #define IOCB_DSYNC (1 << 4)
 #define IOCB_SYNC  (1 << 5)
 #define IOCB_WRITE (1 << 6)
+#define IOCB_NOWAIT(1 << 7)
 
 struct kiocb {
struct file *ki_filp;
@@ -3060,7 +3061,7 @@ static inline int iocb_flags(struct file 

[PATCH 07/10] fs: return on congested block device

2017-06-04 Thread Goldwyn Rodrigues
From: Goldwyn Rodrigues 

A new bio operation flag REQ_NOWAIT is introduced to identify bio's
originating from iocb with IOCB_NOWAIT. This flag indicates
to return immediately if a request cannot be made instead
of retrying.

Stacked devices such as md (the ones with make_request_fn hooks)
currently are not supported because it may block for housekeeping.
For example, an md can have a part of the device suspended.
For this reason, only request based devices are supported.
In the future, this feature will be expanded to stacked devices
by teaching them how to handle the REQ_NOWAIT flags.

Reviewed-by: Christoph Hellwig 
Signed-off-by: Goldwyn Rodrigues 
---
 block/blk-core.c  | 24 ++--
 block/blk-mq-sched.c  |  3 +++
 block/blk-mq.c|  2 ++
 fs/direct-io.c| 10 --
 include/linux/bio.h   |  6 ++
 include/linux/blk_types.h |  2 ++
 6 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a7421b772d0e..a6ee659fd56b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1256,6 +1256,11 @@ static struct request *get_request(struct request_queue 
*q, unsigned int op,
if (!IS_ERR(rq))
return rq;
 
+   if (op & REQ_NOWAIT) {
+   blk_put_rl(rl);
+   return ERR_PTR(-EAGAIN);
+   }
+
if (!gfpflags_allow_blocking(gfp_mask) || unlikely(blk_queue_dying(q))) 
{
blk_put_rl(rl);
return rq;
@@ -1900,6 +1905,17 @@ generic_make_request_checks(struct bio *bio)
goto end_io;
}
 
+   /*
+* For a REQ_NOWAIT based request, return -EOPNOTSUPP
+* if the queue is not request based (stacked devices
+* such as md do not handle REQ_NOWAIT yet).
+*/
+
+   if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) {
+   err = -EOPNOTSUPP;
+   goto end_io;
+   }
+
part = bio->bi_bdev->bd_part;
if (should_fail_request(part, bio->bi_iter.bi_size) ||
should_fail_request(_to_disk(part)->part0,
@@ -2057,7 +2073,7 @@ blk_qc_t generic_make_request(struct bio *bio)
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-   if (likely(blk_queue_enter(q, false) == 0)) {
+   if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
struct bio_list lower, same;
 
/* Create a fresh bio_list for all subordinate requests 
*/
@@ -2082,7 +2098,11 @@ blk_qc_t generic_make_request(struct bio *bio)
bio_list_merge(_list_on_stack[0], );
bio_list_merge(_list_on_stack[0], 
_list_on_stack[1]);
} else {
-   bio_io_error(bio);
+   if (unlikely(!blk_queue_dying(q) &&
+   (bio->bi_opf & REQ_NOWAIT)))
+   bio_wouldblock_error(bio);
+   else
+   bio_io_error(bio);
}
bio = bio_list_pop(_list_on_stack[0]);
} while (bio);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 1f5b692526ae..9a1dea8b964e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -83,6 +83,9 @@ struct request *blk_mq_sched_get_request(struct request_queue 
*q,
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
+   if (op & REQ_NOWAIT)
+   data->flags |= BLK_MQ_REQ_NOWAIT;
+
if (e) {
data->flags |= BLK_MQ_REQ_INTERNAL;
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1bcccedcc74f..b0608f1955b2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1556,6 +1556,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue 
*q, struct bio *bio)
rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, );
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
+   if (bio->bi_opf & REQ_NOWAIT)
+   bio_wouldblock_error(bio);
return BLK_QC_T_NONE;
}
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index a04ebea77de8..139ebd5ae1c7 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -480,8 +480,12 @@ static int dio_bio_complete(struct dio *dio, struct bio 
*bio)
unsigned i;
int err;
 
-   if (bio->bi_error)
-   dio->io_error = -EIO;
+   if (bio->bi_error) {
+   if (bio->bi_error == -EAGAIN && (bio->bi_opf & REQ_NOWAIT))
+   dio->io_error = -EAGAIN;
+   else
+   dio->io_error = -EIO;
+   }
 
if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) {
err = bio->bi_error;
@@ -1197,6 +1201,8 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode 
*inode,

[PATCH 0/10 v10] No wait AIO

2017-06-04 Thread Goldwyn Rodrigues
Formerly known as non-blocking AIO.

This series adds nonblocking feature to asynchronous I/O writes.
io_submit() can be delayed because of a number of reasons:
 - Block allocation for files
 - Data writebacks for direct I/O
 - Sleeping because of waiting to acquire i_rwsem
 - Congested block device

The goal of the patch series is to return -EAGAIN/-EWOULDBLOCK if
any of these conditions are met. This way userspace can push most
of the write()s to the kernel to the best of its ability to complete
and if it returns -EAGAIN, can defer it to another thread.

In order to enable this, RWF_NOWAIT is introduced in
uapi/linux/fs.h. If set in aio_rw_flags, it translates to
IOCB_NOWAIT for struct kiocb, REQ_NOWAIT for bio.bi_opf and IOMAP_NOWAIT for
iomap. aio_rw_flags is a new field replacing aio_reserved1. We could
not use aio_flags because it is not currently checked for invalidity
in the kernel.

This feature is provided for asynchronous direct I/O only. I have
tested it against xfs, ext4, and btrfs while I intend to add more filesystems.
The nowait feature is for request based devices. In the future, I intend to
add support to stacked devices such as md.

Applications will have to check for support
by sending an async direct write; any error other than -EAGAIN
would mean it is not supported.

The first two patches are prep patches for nowait I/O.

Changes since v1:
 + changed name from _NONBLOCKING to *_NOWAIT
 + filemap_range_has_page call moved to closer to (just before) calling 
filemap_write_and_wait_range().
 + BIO_NOWAIT limited to get_request()
 + XFS fixes 
- included reflink 
- use of xfs_ilock_nowait() instead of a XFS_IOLOCK_NONBLOCKING flag
- Translate the flag through IOMAP_NOWAIT (iomap) to check for
  block allocation for the file.
 + ext4 coding style

Changes since v2:
 + Using aio_reserved1 as aio_rw_flags instead of aio_flags
 + blk-mq support
 + xfs uptodate with kernel and reflink changes

 Changes since v3:
  + Added FS_NOWAIT, which is set if the filesystem supports NOWAIT feature.
  + Checks in generic_make_request() to make sure BIO_NOWAIT comes in
for async direct writes only.
  + Added QUEUE_FLAG_NOWAIT, which is set if the device supports BIO_NOWAIT.
This is added (rather not set) to block devices such as dm/md currently.

 Changes since v4:
  + Ported AIO code to use RWF_* flags. Check for RWF_* flags in
generic_file_write_iter().
  + Changed IOCB_RW_FLAGS_NOWAIT to RWF_NOWAIT.

 Changes since v5:
  + BIO_NOWAIT to REQ_NOWAIT
  + Common helper for RWF flags.

 Changes since v6:
  + REQ_NOWAIT will be ignored for request based devices since they
cannot block. So, removed QUEUE_FLAG_NOWAIT since it is not
required in the current implementation. It will be resurrected
when we program for stacked devices.
  + changed kiocb_rw_flags() to kiocb_set_rw_flags() in order to accommodate
errors. Moved the checks into the function.

 Changes since v7:
  + split patches into prep so the main patches are smaller and easier
to understand
  + All patches are reviewed or acked!
 
 Changes since v8:
 + Err out AIO reads with -EINVAL flagged as RWF_NOWAIT

 Changes since v9:
 + Retract - Err out AIO reads with -EINVAL flagged as RWF_NOWAIT
 + XFS returns EAGAIN if extent list is not in memory
 + Man page updates to io_submit with iocb description and nowait features.

-- 
Goldwyn


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[BUG] direct I/O pwrite() returning EEXIST on btrfs

2017-06-04 Thread Eric Biggers
Hi,

For at least a few kernel versions now I've been receiving I/O errors in a KVM
guest when testing ext4 with kvm-xfstests on a particular computer.  I've
tracked this down to the host filesystem which is BTRFS, and is sometimes
returning EEXIST to pwrite() calls made by QEMU to write to the disk image using
direct I/O.  The disk image file is in "raw" format, does not have the 'C'
attribute, and the filesystem has snapshots.  This bug still occurs on latest
Linus tree (v4.12-rc3-239-g3c06e6cbdb6a).

The bug has something to do with BTRFS inserting extent_maps into the inode's
extent_map_tree.  The EEXIST error code is coming from merge_extent_mapping(),
as called by btrfs_get_extent(), as called by btrfs_get_blocks_direct().  It's
probably similar to, but not the same as, the bug fixed by 8e2bd3b7fac9 ("Btrfs:
deal with existing encompassing extent map in btrfs_get_extent()").

I haven't found a better way to reproduce it yet and I'm having trouble fully
understanding the relevant BTRFS code, but perhaps this report will ring a bell
for someone else a little sooner.

I also tried adding the following debugging code:

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17cbe9306faf..846ace6ab34b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6768,7 +6768,9 @@ static struct extent_map *prev_extent_map(struct 
extent_map *em)
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
struct extent_map *existing,
struct extent_map *em,
-   u64 map_start)
+   u64 map_start,
+   struct extent_map *prev_ret,
+   struct extent_map *next_ret)
 {
struct extent_map *prev;
struct extent_map *next;
@@ -6786,6 +6788,16 @@ static int merge_extent_mapping(struct extent_map_tree 
*em_tree,
next = next_extent_map(prev);
}
 
+   if (prev)
+   *prev_ret = *prev;
+   else
+   *prev_ret = (struct extent_map){ 0 };
+
+   if (next)
+   *next_ret = *next;
+   else
+   *next_ret = (struct extent_map){ 0 };
+
start = prev ? extent_map_end(prev) : em->start;
start = max_t(u64, start, em->start);
end = next ? next->start : extent_map_end(em);
@@ -6857,7 +6869,7 @@ static noinline int uncompress_inline(struct btrfs_path 
*path,
  */
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page,
-   size_t pg_offset, u64 start, u64 len,
+   size_t pg_offset, const u64 start, const u64 len,
int create)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
@@ -7111,14 +7123,34 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode 
*inode,
 
} else if (start >= extent_map_end(existing) ||
start <= existing->start) {
+   const u64 em_start = em->start;
+   const u64 em_end = extent_map_end(em);
+   const u64 existing_start = existing->start;
+   const u64 existing_end = extent_map_end(existing);
+   struct extent_map prev, next;
+
/*
 * The existing extent map is the one nearest to
 * the [start, start + len) range which overlaps
 */
err = merge_extent_mapping(em_tree, existing,
-  em, start);
+  em, start, &prev, &next);
free_extent_map(existing);
if (err) {
+   if (err == -EEXIST) {
+   pr_warn("merge_extent_mapping() 
returned -EEXIST!\n"
+   "given a request to find the 
extent map for [%llu, %llu)\n"
+   "found and tried to insert 
[%llu, %llu), but it overlapped\n"
+   "existing extent [%llu, %llu), 
then was adjusted to\n"
+   "[%llu, %llu) but still 
overlapped\n"
+   "prev=[%llu, %llu) or 
next=[%llu, %llu)\n",
+   start, start + len,
+   em_start, em_end,
+   existing_start, existing_end,
+   em->start, extent_map_end(em),
+   prev.start, 
extent_map_end(&prev),
+   next.start, 
extent_map_end(&next));
+   }
free_extent_map(em);
   

Re: [PATCH 04/12] fs: ceph: CURRENT_TIME with ktime_get_real_ts()

2017-06-04 Thread Yan, Zheng
On Fri, Jun 2, 2017 at 10:18 PM, Arnd Bergmann  wrote:
> On Fri, Jun 2, 2017 at 2:18 PM, Yan, Zheng  wrote:
>> On Fri, Jun 2, 2017 at 7:33 PM, Arnd Bergmann  wrote:
>>> On Fri, Jun 2, 2017 at 1:18 PM, Yan, Zheng  wrote:
>>> What I meant is another related problem in ceph_mkdir() where the
>>> i_ctime field of the parent inode is different between the persistent
>>> representation in the mds and the in-memory representation.
>>>
>>
>> I don't see any problem in mkdir case. Parent inode's i_ctime in mds is set 
>> to
>> r_stamp. When client receives request reply, it set its in-memory inode's 
>> ctime
>> to the same time stamp.
>
> Ok, I see it now, thanks for the clarification. Most other file systems do 
> this
> the other way round and update all fields in the in-memory inode structure
> first and then write that to persistent storage, so I was getting confused 
> about
> the order of events here.
>
> If I understand it all right, we have three different behaviors in ceph now,
> though the differences are very minor and probably don't ever matter:
>
> - in setattr(), we update ctime in the in-memory inode first and then send
>   the same time to the mds, and expect to set it again when the reply comes.
>
> - in ceph_write_iter write() and mmap/page_mkwrite(), we call
>   file_update_time() to set i_mtime and i_ctime to the same
>   timestamp first once a write is observed by the fs and then take
>   two other timestamps that we send to the mds, and update the
>   in-memory inode a second time when the reply comes. ctime
>   is never older than mtime here, as far as I can tell, but it may
>   be newer when the timer interrupt happens between taking the
>   two stamps.

We don't use request to send i_mtime/i_ctime to mds in this case.
Instead, we use cap flush message. i_mtime/i_ctime are directly
encoded in cap flush message. When mds receives the cap flush message,
it writes i_mtime/i_ctime to persistent storage and sends a cap flush
ack message to client. (when client receives the cap flush ack
message, it does not update i_mtime/i_ctime). There is no issue as you
described.

>
> - in all other calls, we only update the inode (and/or parent inode)
>   after the reply arrives.

There are two cases. 1. Client updates in-memory inode's ctime, it
sends the new ctime to mds through cap flush message. 2. Client sets
mds request's r_stamp and sends the request to mds. MDS updates
relevant inodes' ctime and sends reply to client. Client updates
in-memory inodes' ctime according to the reply.

Regards
Yan, Zheng

>
>Arnd
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html