Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-22 Thread Christoph Hellwig
On Wed, May 23, 2018 at 08:38:06AM +1000, Dave Chinner wrote:
> Ok, I missed that detail as it's in a different patch. It looks like
> if (pos > EOF) it will be zeroed. But in this case I think that pos ==
> EOF and so it was reading instead. That smells like an off-by-one bug
> to me.

This has been fixed in the tree I pushed yesterday already.


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-22 Thread Dave Chinner
On Tue, May 22, 2018 at 10:24:54AM +0200, Christoph Hellwig wrote:
> On Tue, May 22, 2018 at 10:07:45AM +1000, Dave Chinner wrote:
> > > Something doesn't smell right here.  The only pages we need to read in
> > > are the first and last pages in the write_begin range, and only if they
> > > aren't page aligned and the underlying extent is IOMAP_MAPPED, right?
> > 
> > And not beyond EOF, too.
> > 
> > The bufferhead code handles this via the buffer_new() flag - it
> > triggers the skipping of read IO and the states in which it is
> > set are clearly indicated in iomap_to_bh(). That same logic needs to
> > apply here.
> 
> The buffer_new logic itself isn't really something to copy directly
> as it has all kinds of warts..

Sure, my point was that it documents all the cases where we can
avoid reading from disk, not that we should copy the logic.

> > > I also noticed that speculative preallocation kicks in by the second 80M
> > > write() call and writeback for the second call can successfully allocate
> > > the entire preallocation, which means that the third (or nth) write call
> > > can have a real extent already mapped in, and then we end up reading it.
> > 
> > Yeah, that's because there's no check against EOF here. These writes
> > are all beyond EOF, so there shouldn't be any read at all...
> 
> The EOF case is already handled in iomap_block_needs_zeroing.

Ok, I missed that detail as it's in a different patch. It looks like
if (pos > EOF) it will be zeroed. But in this case I think that pos ==
EOF and so it was reading instead. That smells like an off-by-one bug
to me.
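
A minimal illustration of the off-by-one being described here (hypothetical
helper names and numbers, not code from the series): with i_size == 8192
and an appending write starting at pos == 8192, a strict ">" comparison
still forces a read, while ">=" treats the block as having nothing on disk
and zeroes it instead.

static bool block_beyond_eof_strict(struct inode *inode, loff_t block_start)
{
	return block_start > i_size_read(inode);	/* false at pos == EOF, so we read */
}

static bool block_beyond_eof(struct inode *inode, loff_t block_start)
{
	return block_start >= i_size_read(inode);	/* true at pos == EOF, so we zero */
}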

> We just
> need to skip the read for ranges entirely covered by the write.

Yup.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-22 Thread Christoph Hellwig
On Tue, May 22, 2018 at 10:31:03AM +0200, Christoph Hellwig wrote:
> The fix should be as simple as this:

fsx wants some little tweaks:

diff --git a/fs/iomap.c b/fs/iomap.c
index 357711e50cfa..47676d1b957b 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -342,19 +342,19 @@ __iomap_write_begin(struct inode *inode, loff_t pos, 
unsigned len,
loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
unsigned poff = block_start & (PAGE_SIZE - 1);
unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - 
block_start);
+   unsigned from = pos & (PAGE_SIZE - 1);
+   unsigned to = from + len;
int status;
 
WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
 
if (PageUptodate(page))
return 0;
+   if (from <= poff && to >= poff + plen)
+   return 0;
 
if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
-   unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
-   unsigned pend = poff + plen;
-
-   if (poff < from || pend > to)
-   zero_user_segments(page, poff, from, to, pend);
+   zero_user_segments(page, poff, from, to, poff + plen);
} else {
status = iomap_read_page_sync(inode, block_start, page,
poff, plen, iomap);
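
To make the arithmetic above concrete, a stand-alone sketch with made-up
values (plain userspace C, 4k pages assumed; not code from the patch):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	long long pos = 1024;			/* hypothetical write offset */
	unsigned len = 2048;			/* hypothetical write length */
	unsigned poff = 0, plen = page_size;	/* whole-page block */

	unsigned from = pos & (page_size - 1);	/* 1024 */
	unsigned to = from + len;		/* 3072 */

	/* a page-aligned 4096-byte write (from == 0, to == 4096) would take
	 * the early return instead and skip both the zeroing and the read */
	if (from <= poff && to >= poff + plen)
		printf("write covers the block: skip zeroing and read\n");
	else
		printf("hole/post-EOF block: zero [%u,%u) and [%u,%u)\n",
		       poff, from, to, poff + plen);
	return 0;
}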


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-22 Thread Christoph Hellwig
The fix should be as simple as this:

diff --git a/fs/iomap.c b/fs/iomap.c
index 357711e50cfa..212c3c21e51c 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -342,19 +342,19 @@ __iomap_write_begin(struct inode *inode, loff_t pos, 
unsigned len,
loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
unsigned poff = block_start & (PAGE_SIZE - 1);
unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - 
block_start);
+   unsigned from = pos & (PAGE_SIZE - 1);
+   unsigned to = from + len;
int status;
 
WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
 
if (PageUptodate(page))
return 0;
+   if (poff >= from && poff + len <= to)
+   return 0;
 
if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
-   unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
-   unsigned pend = poff + plen;
-
-   if (poff < from || pend > to)
-   zero_user_segments(page, poff, from, to, pend);
+   zero_user_segments(page, poff, from, to, poff + len);
} else {
status = iomap_read_page_sync(inode, block_start, page,
poff, plen, iomap);


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-22 Thread Christoph Hellwig
On Tue, May 22, 2018 at 10:07:45AM +1000, Dave Chinner wrote:
> > Something doesn't smell right here.  The only pages we need to read in
> > are the first and last pages in the write_begin range, and only if they
> > aren't page aligned and the underlying extent is IOMAP_MAPPED, right?
> 
> And not beyond EOF, too.
> 
> The bufferhead code handles this via the buffer_new() flag - it
> triggers the skipping of read IO and the states in which it is
> set are clearly indicated in iomap_to_bh(). That same logic needs to
> apply here.

The buffer_new logic itself isn't really something to copy directly
as it has all kinds of warts..

> > I also noticed that speculative preallocation kicks in by the second 80M
> > write() call and writeback for the second call can successfully allocate
> > the entire preallocation, which means that the third (or nth) write call
> > can have a real extent already mapped in, and then we end up reading it.
> 
> Yeah, that's because there's no check against EOF here. These writes
> are all beyond EOF, so there shouldn't be any read at all...

The EOF case is already handled in iomap_block_needs_zeroing.  We just
need to skip the read for ranges entirely covered by the write.
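
That helper is introduced elsewhere in the series, so for anyone following
only this thread, a sketch of the idea (paraphrased, not the exact code;
the precise EOF comparison is what the off-by-one discussion above is
about):

static inline bool
iomap_block_needs_zeroing(struct inode *inode, loff_t block_start,
		struct iomap *iomap)
{
	/* holes, unwritten extents and blocks at or beyond EOF have
	 * nothing to read from disk; just zero them in the page cache */
	return iomap->type != IOMAP_MAPPED ||
		block_start >= i_size_read(inode);
}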


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-22 Thread Christoph Hellwig
On Mon, May 21, 2018 at 04:27:00PM -0700, Darrick J. Wong wrote:
> Something doesn't smell right here.  The only pages we need to read in
> are the first and last pages in the write_begin range, and only if they
> aren't page aligned and the underlying extent is IOMAP_MAPPED, right?

Yes,  and I'm pretty sure I did get this right before refactoring
everything for sub-blocksize support.


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-21 Thread Dave Chinner
On Mon, May 21, 2018 at 04:27:00PM -0700, Darrick J. Wong wrote:
> On Fri, May 18, 2018 at 06:48:12PM +0200, Christoph Hellwig wrote:
> > For now just limited to blocksize == PAGE_SIZE, where we can simply read
> > in the full page in write begin, and just set the whole page dirty after
> > copying data into it.  This code is enabled by default and XFS will now
> > be fed pages without buffer heads in ->writepage and ->writepages.
> > 
> > If a file system sets the IOMAP_F_BUFFER_HEAD flag on the iomap the old
> > path will still be used; this both helps the transition in XFS and
> > prepares for the gfs2 migration to the iomap infrastructure.
> > 
> > Signed-off-by: Christoph Hellwig 
> > ---
> >  fs/iomap.c| 132 ++
> >  fs/xfs/xfs_iomap.c|   6 +-
> >  include/linux/iomap.h |   2 +
> >  3 files changed, 127 insertions(+), 13 deletions(-)
> > 
> > diff --git a/fs/iomap.c b/fs/iomap.c
> > index 821671af2618..cd4c563db80a 100644
> > --- a/fs/iomap.c
> > +++ b/fs/iomap.c
> > @@ -314,6 +314,58 @@ iomap_write_failed(struct inode *inode, loff_t pos, 
> > unsigned len)
> > truncate_pagecache_range(inode, max(pos, i_size), pos + len);
> >  }
> >  
> > +static int
> > +iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page 
> > *page,
> > +   unsigned poff, unsigned plen, struct iomap *iomap)
> > +{
> > +   struct bio_vec bvec;
> > +   struct bio bio;
> > +   int ret;
> > +
> > +   bio_init(&bio, &bvec, 1);
> > +   bio.bi_opf = REQ_OP_READ;
> > +   bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
> > +   bio_set_dev(&bio, iomap->bdev);
> > +   __bio_add_page(&bio, page, plen, poff);
> > +   ret = submit_bio_wait(&bio);
> > +   if (ret < 0 && iomap_block_needs_zeroing(inode, block_start, iomap))
> > +   zero_user(page, poff, plen);
> > +   return ret;
> > +}
> > +
> > +static int
> > +__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
> > +   struct page *page, struct iomap *iomap)
> > +{
> > +   loff_t block_size = i_blocksize(inode);
> > +   loff_t block_start = pos & ~(block_size - 1);
> > +   loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
> > +   unsigned poff = block_start & (PAGE_SIZE - 1);
> > +   unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - 
> > block_start);
> > +   int status;
> > +
> > +   WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
> > +
> > +   if (PageUptodate(page))
> > +   return 0;
> > +
> > +   if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
> > +   unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
> > +   unsigned pend = poff + plen;
> > +
> > +   if (poff < from || pend > to)
> > +   zero_user_segments(page, poff, from, to, pend);
> > +   } else {
> > +   status = iomap_read_page_sync(inode, block_start, page,
> > +   poff, plen, iomap);
> 
> Something doesn't smell right here.  The only pages we need to read in
> are the first and last pages in the write_begin range, and only if they
> aren't page aligned and the underlying extent is IOMAP_MAPPED, right?

And not beyond EOF, too.

The bufferhead code handles this via the buffer_new() flag - it
triggers the skipping of read IO and the states in which it is
set are clearly indicated in iomap_to_bh(). That same logic needs to
apply here.
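
For reference, a simplified paraphrase of where that path sets buffer_new()
(condensed from fs/buffer.c:iomap_to_bh(); see the real function for the
authoritative list of states):

switch (iomap->type) {
case IOMAP_HOLE:
case IOMAP_DELALLOC:
	/* nothing on disk yet */
	if (!buffer_uptodate(bh) || offset >= i_size_read(inode))
		set_buffer_new(bh);
	break;
case IOMAP_UNWRITTEN:
	/* allocated but never written: always zero the untouched parts */
	set_buffer_new(bh);
	/* fall through */
case IOMAP_MAPPED:
	/* freshly allocated extents and blocks at or beyond EOF */
	if ((iomap->flags & IOMAP_F_NEW) || offset >= i_size_read(inode))
		set_buffer_new(bh);
	break;
}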

> I also noticed that speculative preallocation kicks in by the second 80M
> write() call and writeback for the second call can successfully allocate
> the entire preallocation, which means that the third (or nth) write call
> can have a real extent already mapped in, and then we end up reading it.

Yeah, that's because there's no check against EOF here. These writes
are all beyond EOF, so there shouldn't be any read at all...

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


Re: [PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-21 Thread Darrick J. Wong
On Fri, May 18, 2018 at 06:48:12PM +0200, Christoph Hellwig wrote:
> For now just limited to blocksize == PAGE_SIZE, where we can simply read
> in the full page in write begin, and just set the whole page dirty after
> copying data into it.  This code is enabled by default and XFS will now
> be fed pages without buffer heads in ->writepage and ->writepages.
> 
> If a file system sets the IOMAP_F_BUFFER_HEAD flag on the iomap the old
> path will still be used; this both helps the transition in XFS and
> prepares for the gfs2 migration to the iomap infrastructure.
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/iomap.c| 132 ++
>  fs/xfs/xfs_iomap.c|   6 +-
>  include/linux/iomap.h |   2 +
>  3 files changed, 127 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 821671af2618..cd4c563db80a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -314,6 +314,58 @@ iomap_write_failed(struct inode *inode, loff_t pos, 
> unsigned len)
>   truncate_pagecache_range(inode, max(pos, i_size), pos + len);
>  }
>  
> +static int
> +iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page 
> *page,
> + unsigned poff, unsigned plen, struct iomap *iomap)
> +{
> + struct bio_vec bvec;
> + struct bio bio;
> + int ret;
> +
> + bio_init(&bio, &bvec, 1);
> + bio.bi_opf = REQ_OP_READ;
> + bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
> + bio_set_dev(&bio, iomap->bdev);
> + __bio_add_page(&bio, page, plen, poff);
> + ret = submit_bio_wait(&bio);
> + if (ret < 0 && iomap_block_needs_zeroing(inode, block_start, iomap))
> + zero_user(page, poff, plen);
> + return ret;
> +}
> +
> +static int
> +__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
> + struct page *page, struct iomap *iomap)
> +{
> + loff_t block_size = i_blocksize(inode);
> + loff_t block_start = pos & ~(block_size - 1);
> + loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
> + unsigned poff = block_start & (PAGE_SIZE - 1);
> + unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - 
> block_start);
> + int status;
> +
> + WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
> +
> + if (PageUptodate(page))
> + return 0;
> +
> + if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
> + unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
> + unsigned pend = poff + plen;
> +
> + if (poff < from || pend > to)
> + zero_user_segments(page, poff, from, to, pend);
> + } else {
> + status = iomap_read_page_sync(inode, block_start, page,
> + poff, plen, iomap);

Something doesn't smell right here.  The only pages we need to read in
are the first and last pages in the write_begin range, and only if they
aren't page aligned and the underlying extent is IOMAP_MAPPED, right?

I also noticed that speculative preallocation kicks in by the second 80M
write() call and writeback for the second call can successfully allocate
the entire preallocation, which means that the third (or nth) write call
can have a real extent already mapped in, and then we end up reading it.

--D

> + if (status < 0)
> + return status;
> + SetPageUptodate(page);
> + }
> +
> + return 0;
> +}
> +
>  static int
>  iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned 
> flags,
>   struct page **pagep, struct iomap *iomap)
> @@ -331,7 +383,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, 
> unsigned len, unsigned flags,
>   if (!page)
>   return -ENOMEM;
>  
> - status = __block_write_begin_int(page, pos, len, NULL, iomap);
> + if (iomap->flags & IOMAP_F_BUFFER_HEAD)
> + status = __block_write_begin_int(page, pos, len, NULL, iomap);
> + else
> + status = __iomap_write_begin(inode, pos, len, page, iomap);
>   if (unlikely(status)) {
>   unlock_page(page);
>   put_page(page);
> @@ -344,14 +399,63 @@ iomap_write_begin(struct inode *inode, loff_t pos, 
> unsigned len, unsigned flags,
>   return status;
>  }
>  
> +int
> +iomap_set_page_dirty(struct page *page)
> +{
> + struct address_space *mapping = page_mapping(page);
> + int newly_dirty;
> +
> + if (unlikely(!mapping))
> + return !TestSetPageDirty(page);
> +
> + /*
> +  * Lock out page->mem_cgroup migration to keep PageDirty
> +  * synchronized with per-memcg dirty page counters.
> +  */
> + lock_page_memcg(page);
> + newly_dirty = !TestSetPageDirty(page);
> + if (newly_dirty)
> + __set_page_dirty(page, mapping, 0);
> + unlock_page_memcg(page);
> +
> + if (newly_dirty)
> + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
> + 

[PATCH 16/34] iomap: add initial support for writes without buffer heads

2018-05-18 Thread Christoph Hellwig
For now just limited to blocksize == PAGE_SIZE, where we can simply read
in the full page in write begin, and just set the whole page dirty after
copying data into it.  This code is enabled by default and XFS will now
be fed pages without buffer heads in ->writepage and ->writepages.

If a file system sets the IOMAP_F_BUFFER_HEAD flag on the iomap the old
path will still be used; this both helps the transition in XFS and
prepares for the gfs2 migration to the iomap infrastructure.

Signed-off-by: Christoph Hellwig 
---
 fs/iomap.c| 132 ++
 fs/xfs/xfs_iomap.c|   6 +-
 include/linux/iomap.h |   2 +
 3 files changed, 127 insertions(+), 13 deletions(-)
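
As an illustrative aside (not part of this patch): a filesystem that is not
ready for the new path keeps the old behaviour by setting the flag from its
->iomap_begin callback, roughly like this (hypothetical example):

static int
example_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap)
{
	/* ... fill in iomap->type, addr, offset and length as usual ... */
	iomap->flags |= IOMAP_F_BUFFER_HEAD;	/* stay on the bufferhead path */
	return 0;
}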

diff --git a/fs/iomap.c b/fs/iomap.c
index 821671af2618..cd4c563db80a 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -314,6 +314,58 @@ iomap_write_failed(struct inode *inode, loff_t pos, 
unsigned len)
truncate_pagecache_range(inode, max(pos, i_size), pos + len);
 }
 
+static int
+iomap_read_page_sync(struct inode *inode, loff_t block_start, struct page 
*page,
+   unsigned poff, unsigned plen, struct iomap *iomap)
+{
+   struct bio_vec bvec;
+   struct bio bio;
+   int ret;
+
+   bio_init(&bio, &bvec, 1);
+   bio.bi_opf = REQ_OP_READ;
+   bio.bi_iter.bi_sector = iomap_sector(iomap, block_start);
+   bio_set_dev(&bio, iomap->bdev);
+   __bio_add_page(&bio, page, plen, poff);
+   ret = submit_bio_wait(&bio);
+   if (ret < 0 && iomap_block_needs_zeroing(inode, block_start, iomap))
+   zero_user(page, poff, plen);
+   return ret;
+}
+
+static int
+__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+   struct page *page, struct iomap *iomap)
+{
+   loff_t block_size = i_blocksize(inode);
+   loff_t block_start = pos & ~(block_size - 1);
+   loff_t block_end = (pos + len + block_size - 1) & ~(block_size - 1);
+   unsigned poff = block_start & (PAGE_SIZE - 1);
+   unsigned plen = min_t(loff_t, PAGE_SIZE - poff, block_end - 
block_start);
+   int status;
+
+   WARN_ON_ONCE(i_blocksize(inode) < PAGE_SIZE);
+
+   if (PageUptodate(page))
+   return 0;
+
+   if (iomap_block_needs_zeroing(inode, block_start, iomap)) {
+   unsigned from = pos & (PAGE_SIZE - 1), to = from + len;
+   unsigned pend = poff + plen;
+
+   if (poff < from || pend > to)
+   zero_user_segments(page, poff, from, to, pend);
+   } else {
+   status = iomap_read_page_sync(inode, block_start, page,
+   poff, plen, iomap);
+   if (status < 0)
+   return status;
+   SetPageUptodate(page);
+   }
+
+   return 0;
+}
+
 static int
 iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned 
flags,
struct page **pagep, struct iomap *iomap)
@@ -331,7 +383,10 @@ iomap_write_begin(struct inode *inode, loff_t pos, 
unsigned len, unsigned flags,
if (!page)
return -ENOMEM;
 
-   status = __block_write_begin_int(page, pos, len, NULL, iomap);
+   if (iomap->flags & IOMAP_F_BUFFER_HEAD)
+   status = __block_write_begin_int(page, pos, len, NULL, iomap);
+   else
+   status = __iomap_write_begin(inode, pos, len, page, iomap);
if (unlikely(status)) {
unlock_page(page);
put_page(page);
@@ -344,14 +399,63 @@ iomap_write_begin(struct inode *inode, loff_t pos, 
unsigned len, unsigned flags,
return status;
 }
 
+int
+iomap_set_page_dirty(struct page *page)
+{
+   struct address_space *mapping = page_mapping(page);
+   int newly_dirty;
+
+   if (unlikely(!mapping))
+   return !TestSetPageDirty(page);
+
+   /*
+* Lock out page->mem_cgroup migration to keep PageDirty
+* synchronized with per-memcg dirty page counters.
+*/
+   lock_page_memcg(page);
+   newly_dirty = !TestSetPageDirty(page);
+   if (newly_dirty)
+   __set_page_dirty(page, mapping, 0);
+   unlock_page_memcg(page);
+
+   if (newly_dirty)
+   __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+   return newly_dirty;
+}
+EXPORT_SYMBOL_GPL(iomap_set_page_dirty);
+
+static int
+__iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+   unsigned copied, struct page *page, struct iomap *iomap)
+{
+   unsigned start = pos & (PAGE_SIZE - 1);
+
+   if (unlikely(copied < len)) {
+   /* see block_write_end() for an explanation */
+   if (!PageUptodate(page))
+   copied = 0;
+   if (iomap_block_needs_zeroing(inode, pos, iomap))
+   zero_user(page, start + copied, len - copied);
+   }
+
+   flush_dcache_page(page);
+   SetPageUptodate(page);
+   iomap_set_page_dirty(page);
+