[RFC v2 2/2] pmem: Enable pmem_submit_bio for asynchronous flush

2021-07-26 Thread Pankaj Gupta
From: Pankaj Gupta 

Return from "pmem_submit_bio" when asynchronous flush is in
process in other context.

Signed-off-by: Pankaj Gupta 
---
 drivers/nvdimm/pmem.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 1e0615b8565e..3ca1fa88a5e7 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -201,8 +201,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data;
struct nd_region *nd_region = to_region(pmem);
 
-   if (bio->bi_opf & REQ_PREFLUSH)
-   ret = nvdimm_flush(nd_region, bio);
+   if ((bio->bi_opf & REQ_PREFLUSH) &&
+   nvdimm_flush(nd_region, bio)) {
+
+   /* asynchronous flush completes in other context */
+   if (nd_region->flush)
+   return BLK_QC_T_NONE;
+   }
 
do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue);
if (do_acct)
@@ -222,11 +227,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio)
if (do_acct)
bio_end_io_acct(bio, start);
 
-   if (bio->bi_opf & REQ_FUA)
+   if (bio->bi_opf & REQ_FUA)  {
ret = nvdimm_flush(nd_region, bio);
 
-   if (ret)
-   bio->bi_status = errno_to_blk_status(ret);
+   /* asynchronous flush completes in other context */
+   if (nd_region->flush)
+   return BLK_QC_T_NONE;
+   }
 
bio_endio(bio);
return BLK_QC_T_NONE;
-- 
2.25.1




[RFC v2 0/2] virtio-pmem: Asynchronous flush

2021-07-26 Thread Pankaj Gupta
From: Pankaj Gupta 

 Jeff reported preflush order issue with the existing implementation
 of virtio pmem preflush. Dan suggested[1] to implement asynchronous flush
 for virtio pmem using work queue as done in md/RAID. This patch series
 intends to solve the preflush ordering issue and also makes the flush
 asynchronous for the submitting thread.

 Submitting this patch series for review. Sorry, It took me long time to
 come back to this due to some personal reasons.

 RFC v1 -> RFC v2
 - More testing and bug fix.

 [1] https://marc.info/?l=linux-kernel&m=157446316409937&w=2

Pankaj Gupta (2):
  virtio-pmem: Async virtio-pmem flush
  pmem: enable pmem_submit_bio for asynchronous flush

 drivers/nvdimm/nd_virtio.c   | 72 
 drivers/nvdimm/pmem.c| 17 ++---
 drivers/nvdimm/virtio_pmem.c | 10 -
 drivers/nvdimm/virtio_pmem.h | 14 +++
 4 files changed, 91 insertions(+), 22 deletions(-)

-- 
2.25.1




[RFC v2 1/2] virtio-pmem: Async virtio-pmem flush

2021-07-26 Thread Pankaj Gupta
From: Pankaj Gupta 

Implement asynchronous flush for virtio pmem using work queue
to solve the preflush ordering issue. Also, coalesce the flush
requests when a flush is already in process.

Signed-off-by: Pankaj Gupta 
---
 drivers/nvdimm/nd_virtio.c   | 72 
 drivers/nvdimm/virtio_pmem.c | 10 -
 drivers/nvdimm/virtio_pmem.h | 14 +++
 3 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/drivers/nvdimm/nd_virtio.c b/drivers/nvdimm/nd_virtio.c
index 10351d5b49fa..61b655b583be 100644
--- a/drivers/nvdimm/nd_virtio.c
+++ b/drivers/nvdimm/nd_virtio.c
@@ -97,29 +97,69 @@ static int virtio_pmem_flush(struct nd_region *nd_region)
return err;
 };
 
+static void submit_async_flush(struct work_struct *ws);
+
 /* The asynchronous flush callback function */
 int async_pmem_flush(struct nd_region *nd_region, struct bio *bio)
 {
-   /*
-* Create child bio for asynchronous flush and chain with
-* parent bio. Otherwise directly call nd_region flush.
+   /* queue asynchronous flush and coalesce the flush requests */
+   struct virtio_device *vdev = nd_region->provider_data;
+   struct virtio_pmem *vpmem  = vdev->priv;
+   ktime_t req_start = ktime_get_boottime();
+
+   spin_lock_irq(&vpmem->lock);
+   /* flush requests wait until ongoing flush completes,
+* hence coalescing all the pending requests.
 */
-   if (bio && bio->bi_iter.bi_sector != -1) {
-   struct bio *child = bio_alloc(GFP_ATOMIC, 0);
-
-   if (!child)
-   return -ENOMEM;
-   bio_copy_dev(child, bio);
-   child->bi_opf = REQ_PREFLUSH;
-   child->bi_iter.bi_sector = -1;
-   bio_chain(child, bio);
-   submit_bio(child);
-   return 0;
+   wait_event_lock_irq(vpmem->sb_wait,
+   !vpmem->flush_bio ||
+   ktime_before(req_start, vpmem->prev_flush_start),
+   vpmem->lock);
+   /* new request after previous flush is completed */
+   if (ktime_after(req_start, vpmem->prev_flush_start)) {
+   WARN_ON(vpmem->flush_bio);
+   vpmem->flush_bio = bio;
+   bio = NULL;
+   }
+   spin_unlock_irq(&vpmem->lock);
+
+   if (!bio) {
+   INIT_WORK(&vpmem->flush_work, submit_async_flush);
+   queue_work(vpmem->pmem_wq, &vpmem->flush_work);
+   return 1;
+   }
+
+   /* flush completed in other context while we waited */
+   if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
+   bio->bi_opf &= ~REQ_PREFLUSH;
+   submit_bio(bio);
+   } else if (bio && (bio->bi_opf & REQ_FUA)) {
+   bio->bi_opf &= ~REQ_FUA;
+   bio_endio(bio);
}
-   if (virtio_pmem_flush(nd_region))
-   return -EIO;
 
return 0;
 };
 EXPORT_SYMBOL_GPL(async_pmem_flush);
+
+static void submit_async_flush(struct work_struct *ws)
+{
+   struct virtio_pmem *vpmem = container_of(ws, struct virtio_pmem, 
flush_work);
+   struct bio *bio = vpmem->flush_bio;
+
+   vpmem->start_flush = ktime_get_boottime();
+   bio->bi_status = 
errno_to_blk_status(virtio_pmem_flush(vpmem->nd_region));
+   vpmem->prev_flush_start = vpmem->start_flush;
+   vpmem->flush_bio = NULL;
+   wake_up(&vpmem->sb_wait);
+
+   /* Submit parent bio only for PREFLUSH */
+   if (bio && (bio->bi_opf & REQ_PREFLUSH)) {
+   bio->bi_opf &= ~REQ_PREFLUSH;
+   submit_bio(bio);
+   } else if (bio && (bio->bi_opf & REQ_FUA)) {
+   bio->bi_opf &= ~REQ_FUA;
+   bio_endio(bio);
+   }
+}
 MODULE_LICENSE("GPL");
diff --git a/drivers/nvdimm/virtio_pmem.c b/drivers/nvdimm/virtio_pmem.c
index 726c7354d465..56780a6140c7 100644
--- a/drivers/nvdimm/virtio_pmem.c
+++ b/drivers/nvdimm/virtio_pmem.c
@@ -24,6 +24,7 @@ static int init_vq(struct virtio_pmem *vpmem)
return PTR_ERR(vpmem->req_vq);
 
spin_lock_init(&vpmem->pmem_lock);
+   spin_lock_init(&vpmem->lock);
INIT_LIST_HEAD(&vpmem->req_list);
 
return 0;
@@ -57,7 +58,12 @@ static int virtio_pmem_probe(struct virtio_device *vdev)
dev_err(>dev, "failed to initialize virtio pmem vq's\n");
goto out_err;
}
-
+   vpmem->pmem_wq = alloc_workqueue("vpmem_wq", WQ_MEM_RECLAIM, 0);
+   if (!vpmem->pmem_wq) {
+   err = -ENOMEM;
+   goto out_err;
+   }
+   init_waitqueue_head(&vpmem->sb_wait);
virtio_cread_le(vpmem->vdev, struct virtio_pmem_config,
start, &vpmem->start);
virtio_cread_le(vpmem->vdev, struct virtio_pmem_config,
@@ -90,10 +96,12 @@ static int virtio_pmem_probe(struct virtio_device *vdev)
goto out_nd;
}
nd_region->provider_data = dev_to_virtio(nd_region->dev.parent->parent);
+   vpmem->nd_region = nd_region;

Re: [PATCH 03/27] iomap: mark the iomap argument to iomap_sector const

2021-07-26 Thread Christoph Hellwig
On Mon, Jul 19, 2021 at 09:08:20AM -0700, Darrick J. Wong wrote:
> IMHO, constifiying functions is a good way to signal to /programmers/
> that they're not intended to touch the arguments, so

Yes, that is the point here.  Basically the iomap and iter should
be pretty much const, and we almost get there except for the odd
size changed flag for gfs2.



Re: [PATCH 08/27] iomap: add the new iomap_iter model

2021-07-26 Thread Christoph Hellwig
On Tue, Jul 20, 2021 at 07:48:38AM +1000, Dave Chinner wrote:
> We should avoid namespace conflicts where function names shadow
> object types. iomap_iterate() is fine as the function name - there's
> no need for abbreviation here because it's not an overly long name.
> This will makes it clearly different to the struct iomap_iter that
> is passed to it and it will also make grep, cscope and other
> code searching tools much more precise...

Well, there isn't really a conflict by definition.  I actually like
this choice of names (stolen from the original patch from willy)
as it clearly indicates they go together.

But I'm happy to collect a few more opinions.



Re: [PATCH 08/27] iomap: add the new iomap_iter model

2021-07-26 Thread Christoph Hellwig
On Mon, Jul 19, 2021 at 09:56:00AM -0700, Darrick J. Wong wrote:
> Linus previously complained to me about filesystem code (especially
> iomap since it was "newer") (ab)using loff_t variables to store the
> lengths of byte ranges.  It was "loff_t length;" (or so willy
> recollects) that tripped him up.
> 
> ISTR he also said we should use size_t for all lengths because nobody
> should do operations larger than ~2G, but I reject that because iomap
> has users that iterate large ranges of data without generating any IO
> (e.g. fiemap, seek, swapfile activation).
> 
> So... rather than confusing things even more by mixing u64 and ssize_t
> for lengths, can we introduce a new 64-bit length typedef for iomap?
> Last summer, Dave suggested[1] something like:
> 
>   typedef long long lsize_t;
> 
> That would enable cleanup of all the count/size/length parameters in
> fs/remap_range.c and fs/xfs/xfs_reflink.c to use the new 64-bit length
> type, since those operations have never been limited to 32-bit sizes.

I'd rather avoid playing guinea pig for a somewhat odd new type.  For
now I've switched it to the loff_t as that matches the rest of iomap.
If we switch away either to a new type or s64/u64 we should probably do
it as a big sweep.



Re: [PATCH 20/27] fsdax: switch dax_iomap_rw to use iomap_iter

2021-07-26 Thread Christoph Hellwig
On Tue, Jul 20, 2021 at 08:10:05AM +1000, Dave Chinner wrote:
> At first I wondered "iomi? Strange name, why is this one-off name
> used?" and then I realised it's because this function also takes an
> struct iov_iter named "iter".
> 
> That's going to cause confusion in the long run - iov_iter and
> iomap_iter both being generally named "iter", and then one or the
> other randomly changing when both are used in the same function.
> 
> Would it be better to avoid any possible confusion simply by using
> "iomi" for all iomap_iter variables throughout the patchset from the
> start? That way nobody is going to confuse iov_iter with iomap_iter
> iteration variables and code that uses both types will naturally
> have different, well known names...

Hmm.  iomi comes from the original patch from willy and I kinda hate
it.  But given that we have this clash here (and in the direct I/O code)
I kept using it.

Does anyone have any strong opinions here?



Re: [PATCH 17/27] iomap: switch iomap_seek_hole to use iomap_iter

2021-07-26 Thread Darrick J. Wong
On Mon, Jul 26, 2021 at 10:22:36AM +0200, Christoph Hellwig wrote:
> On Mon, Jul 19, 2021 at 10:22:47AM -0700, Darrick J. Wong wrote:
> > > -static loff_t
> > > -iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length,
> > > -   void *data, struct iomap *iomap, struct iomap *srcmap)
> > > +static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, loff_t 
> > > *pos)
> > 
> > /me wonders if @pos should be named hole_pos (here and in the caller) to
> > make it a little easier to read...
> 
> Sure.
> 
> > ...because what we're really saying here is that if seek_hole_iter found
> > a hole (and returned zero, thereby terminating the loop before iter.len
> > could reach zero), we want to return the position of the hole.
> 
> Yes.
> 
> > > + return size;
> > 
> > Not sure why we return size here...?  Oh, because there's an implicit
> > hole at EOF, so we return i_size.  Uh, does this do the right thing if
> > ->iomap_begin returns posteof mappings?  I don't see anything in
> > iomap_iter_advance that would stop iteration at EOF.
> 
> Nothing in ->iomap_begin checks that, iomap_seek_hole initializes
> iter.len so that it stops at EOF.

Oh, right.  Sorry, I forgot that. :(

--D



Re: [PATCH 16/27] iomap: switch iomap_bmap to use iomap_iter

2021-07-26 Thread Christoph Hellwig
On Mon, Jul 19, 2021 at 10:05:45AM -0700, Darrick J. Wong wrote:
> > bno = 0;
> > -   ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno,
> > - iomap_bmap_actor);
> > +   while ((ret = iomap_iter(&iter, ops)) > 0) {
> > +   if (iter.iomap.type != IOMAP_MAPPED)
> > +   continue;
> 
> There isn't a mapped extent, so return 0 here, right?

We can't just return 0, we always need the final iomap_iter() call
to clean up in case a ->iomap_end method is supplied.  No for bmap
having and needing one is rather theoretical, but people will copy
and paste that once we start breaking the rules.



Re: [PATCH 16/27] iomap: switch iomap_bmap to use iomap_iter

2021-07-26 Thread Darrick J. Wong
On Mon, Jul 26, 2021 at 10:19:42AM +0200, Christoph Hellwig wrote:
> On Mon, Jul 19, 2021 at 10:05:45AM -0700, Darrick J. Wong wrote:
> > >   bno = 0;
> > > - ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno,
> > > -   iomap_bmap_actor);
> > > + while ((ret = iomap_iter(&iter, ops)) > 0) {
> > > + if (iter.iomap.type != IOMAP_MAPPED)
> > > + continue;
> > 
> > There isn't a mapped extent, so return 0 here, right?
> 
> We can't just return 0, we always need the final iomap_iter() call
> to clean up in case a ->iomap_end method is supplied.  No for bmap
> having and needing one is rather theoretical, but people will copy
> and paste that once we start breaking the rules.

Oh, right, I forgot that someone might want to ->iomap_end.  The
"continue" works because we only asked for one block, therefore we know
that we'll never get to the loop body a second time; and we ignore
iter.processed, which also means we never revisit the loop body.

This "continue without setting iter.processed to break out of loop"
pattern is a rather indirect subtlety, since C programmers are taught
that they can break out of a loop using break;.  This new iomap_iter
pattern fubars that longstanding language feature, and the language
around it is soft:

> /**
>  * iomap_iter - iterate over a ranges in a file
>  * @iter: iteration structue
>  * @ops: iomap ops provided by the file system
>  *
>  * Iterate over file system provided contiguous ranges of blocks with the same
>  * state.  Should be called in a loop that continues as long as this function
>  * returns a positive value.  If 0 or a negative value is returned the caller
>  * should break out of the loop - a negative value is an error either from the
>  * file system or from the last iteration stored in @iter.copied.
>  */

The documentation needs to be much more explicit about the fact that you
cannot "break;" your way out of an iomap_iter loop.  I think the comment
should be rewritten along these lines:

"Iterate over filesystem-provided space mappings for the provided file
range.  This function handles cleanup of resources acquired for
iteration when the filesystem indicates there are no more space
mappings, which means that this function must be called in a loop that
continues as long it returns a positive value.  If 0 or a negative value
is returned, the caller must not return to the loop body.  Within a loop
body, there are two ways to break out of the loop body: leave
@iter.processed unchanged, or set it to the usual negative errno."

Hm.

What if we provide an explicit loop break function?  That would be clear
overkill for bmap, but somebody else wanting to break out of a more
complex loop body ought to be able to say "break" to do that, not
"continue with subtleties".

static inline int
iomap_iter_break(struct iomap_iter *iter, int ret)
{
int ret2;

if (!iter->iomap.length || !ops->iomap_end)
return ret;

ret2 = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter),
0, iter->flags, &iter->iomap);
return ret ? ret : ret2;
}

And then then theoretical loop body becomes:

while ((ret = iomap_iter(&iter, ops)) > 0) {
if (iter.iomap.type != WHAT_I_WANT) {
ret = iomap_iter_break(&iter, 0);
break;
}



ret = vfs_do_some_risky_thing(...);
if (ret) {
ret = iomap_iter_break(&iter, ret);
break;
}



iter.processed = iter.iomap.length;
}
return ret;

Clunky, for sure, but at least we still get to use break as the language
designers intended.

--D