Re: [Cluster-devel] [PATCH V12 16/20] block: enable multipage bvecs

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:16AM +0800, Ming Lei wrote:
> This patch pulls the trigger for multi-page bvecs.

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  block/bio.c | 22 +++---
>  fs/iomap.c  |  4 ++--
>  fs/xfs/xfs_aops.c   |  4 ++--
>  include/linux/bio.h |  2 +-
>  4 files changed, 20 insertions(+), 12 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 75fde30af51f..8bf9338d4783 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
>   * @page: page to add
>   * @len: length of the data to add
>   * @off: offset of the data in @page
> + * @same_page: if %true only merge if the new data is in the same physical
> + *   page as the last segment of the bio.
>   *
>   * Try to add the data at @page + @off to the last bvec of @bio.  This is a
>   * a useful optimisation for file systems with a block size smaller than the
> @@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
>   * Return %true on success or %false on failure.
>   */
>  bool __bio_try_merge_page(struct bio *bio, struct page *page,
> - unsigned int len, unsigned int off)
> + unsigned int len, unsigned int off, bool same_page)
>  {
>   if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
>   return false;
>  
>   if (bio->bi_vcnt > 0) {
>   struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
> + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
> + bv->bv_offset + bv->bv_len;
> + phys_addr_t page_addr = page_to_phys(page);
>  
> - if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
> - bv->bv_len += len;
> - bio->bi_iter.bi_size += len;
> - return true;
> - }
> + if (vec_end_addr != page_addr + off)
> + return false;
> + if (same_page && ((vec_end_addr - 1) & PAGE_MASK) != page_addr)
> + return false;
> +
> + bv->bv_len += len;
> + bio->bi_iter.bi_size += len;
> + return true;
>   }
>   return false;
>  }
> @@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
>  int bio_add_page(struct bio *bio, struct page *page,
>unsigned int len, unsigned int offset)
>  {
> - if (!__bio_try_merge_page(bio, page, len, offset)) {
> + if (!__bio_try_merge_page(bio, page, len, offset, false)) {
>   if (bio_full(bio))
>   return 0;
>   __bio_add_page(bio, page, len, offset);
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 1f648d098a3b..ec5527b0fba4 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -313,7 +313,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
> loff_t length, void *data,
>*/
>   sector = iomap_sector(iomap, pos);
>   if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
> - if (__bio_try_merge_page(ctx->bio, page, plen, poff))
> + if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
>   goto done;
>   is_contig = true;
>   }
> @@ -344,7 +344,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
> loff_t length, void *data,
>   ctx->bio->bi_end_io = iomap_read_end_io;
>   }
>  
> - __bio_add_page(ctx->bio, page, plen, poff);
> + bio_add_page(ctx->bio, page, plen, poff);
>  done:
>   /*
>* Move the caller beyond our range so that it keeps making progress.
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 1f1829e506e8..b9fd44168f61 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -616,12 +616,12 @@ xfs_add_to_ioend(
>   bdev, sector);
>   }
>  
> - if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
> + if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
>   if (iop)
>   atomic_inc(&iop->write_count);
>   if (bio_full(wpc->ioend->io_bio))
>   xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
> - __bio_add_page(wpc->ioend->io_bio, page, len, poff);
> + bio_add_page(wpc->ioend->io_bio, page, len, poff);
>   }
>  
>   wpc->ioend->io_size += len;
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index c35997dd02c2..5505f74aef8b 100644
> --- a/include/linux/bio.h
> +++ b/include/lin

Re: [Cluster-devel] [PATCH V12 02/20] btrfs: look at bi_size for repair decisions

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:02AM +0800, Ming Lei wrote:
> From: Christoph Hellwig 
> 
> bio_readpage_error currently uses bi_vcnt to decide if it is worth
> retrying an I/O.  But the vector count is mostly an implementation
> artifact - it really should figure out if there is more than a
> single sector worth retrying.  Use bi_size for that and shift by
> PAGE_SHIFT.  This really should be blocks/sectors, but given that
> btrfs doesn't support a sector size different from the PAGE_SIZE
> using the page size keeps the changes to a minimum.
> 
> Reviewed-by: David Sterba 

Reviewed-by: Omar Sandoval 

> Signed-off-by: Christoph Hellwig 
> ---
>  fs/btrfs/extent_io.c | 2 +-
>  include/linux/bio.h  | 6 --
>  2 files changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 15fd46582bb2..40751e86a2a9 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2368,7 +2368,7 @@ static int bio_readpage_error(struct bio *failed_bio, 
> u64 phy_offset,
>   int read_mode = 0;
>   blk_status_t status;
>   int ret;
> - unsigned failed_bio_pages = bio_pages_all(failed_bio);
> + unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
>  
>   BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
>  
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 056fb627edb3..6f6bc331a5d1 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -263,12 +263,6 @@ static inline void bio_get_last_bvec(struct bio *bio, 
> struct bio_vec *bv)
>   bv->bv_len = iter.bi_bvec_done;
>  }
>  
> -static inline unsigned bio_pages_all(struct bio *bio)
> -{
> - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
> - return bio->bi_vcnt;
> -}
> -
>  static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
>  {
>   WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 17/20] block: always define BIO_MAX_PAGES as 256

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:17AM +0800, Ming Lei wrote:
> Now multi-page bvec can cover CONFIG_THP_SWAP, so we don't need to
> increase BIO_MAX_PAGES for it.
> 
> CONFIG_THP_SWAP needs to split one THP into normal pages and adds
> them all to one bio. With multipage-bvec, it just takes one bvec to
> hold them all.
> 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  include/linux/bio.h | 8 
>  1 file changed, 8 deletions(-)
> 
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 5505f74aef8b..7be48c55b14a 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -34,15 +34,7 @@
>  #define BIO_BUG_ON
>  #endif
>  
> -#ifdef CONFIG_THP_SWAP
> -#if HPAGE_PMD_NR > 256
> -#define BIO_MAX_PAGESHPAGE_PMD_NR
> -#else
>  #define BIO_MAX_PAGES256
> -#endif
> -#else
> -#define BIO_MAX_PAGES256
> -#endif
>  
>  #define bio_prio(bio)(bio)->bi_ioprio
>  #define bio_set_prio(bio, prio)  ((bio)->bi_ioprio = prio)
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 18/20] block: document usage of bio iterator helpers

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:18AM +0800, Ming Lei wrote:
> Now multi-page bvec is supported, some helpers may return page by
> page, meantime some may return segment by segment, this patch
> documents the usage.

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  Documentation/block/biovecs.txt | 25 +
>  1 file changed, 25 insertions(+)
> 
> diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
> index 25689584e6e0..ce6eccaf5df7 100644
> --- a/Documentation/block/biovecs.txt
> +++ b/Documentation/block/biovecs.txt
> @@ -117,3 +117,28 @@ Other implications:
> size limitations and the limitations of the underlying devices. Thus
> there's no need to define ->merge_bvec_fn() callbacks for individual block
> drivers.
> +
> +Usage of helpers:
> +=
> +
> +* The following helpers whose names have the suffix of "_all" can only be 
> used
> +on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
> +shouldn't use them because the bio may have been split before it reached the
> +driver.
> +
> + bio_for_each_segment_all()
> + bio_first_bvec_all()
> + bio_first_page_all()
> + bio_last_bvec_all()
> +
> +* The following helpers iterate over single-page segment. The passed 'struct
> +bio_vec' will contain a single-page IO vector during the iteration
> +
> + bio_for_each_segment()
> + bio_for_each_segment_all()
> +
> +* The following helpers iterate over multi-page bvec. The passed 'struct
> +bio_vec' will contain a multi-page IO vector during the iteration
> +
> + bio_for_each_bvec()
> + rq_for_each_bvec()
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 15/20] block: allow bio_for_each_segment_all() to iterate over multi-page bvec

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:15AM +0800, Ming Lei wrote:
> This patch introduces one extra iterator variable to 
> bio_for_each_segment_all(),
> then we can allow bio_for_each_segment_all() to iterate over multi-page bvec.
> 
> Given it is just one mechannical & simple change on all 
> bio_for_each_segment_all()
> users, this patch does tree-wide change in one single patch, so that we can
> avoid to use a temporary helper for this conversion.
> 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  block/bio.c   | 27 ++-
>  block/bounce.c|  6 --
>  drivers/md/bcache/btree.c |  3 ++-
>  drivers/md/dm-crypt.c |  3 ++-
>  drivers/md/raid1.c|  3 ++-
>  drivers/staging/erofs/data.c  |  3 ++-
>  drivers/staging/erofs/unzip_vle.c |  3 ++-
>  fs/block_dev.c|  6 --
>  fs/btrfs/compression.c|  3 ++-
>  fs/btrfs/disk-io.c|  3 ++-
>  fs/btrfs/extent_io.c  |  9 ++---
>  fs/btrfs/inode.c  |  6 --
>  fs/btrfs/raid56.c |  3 ++-
>  fs/crypto/bio.c   |  3 ++-
>  fs/direct-io.c|  4 +++-
>  fs/exofs/ore.c|  3 ++-
>  fs/exofs/ore_raid.c   |  3 ++-
>  fs/ext4/page-io.c |  3 ++-
>  fs/ext4/readpage.c|  3 ++-
>  fs/f2fs/data.c|  9 ++---
>  fs/gfs2/lops.c|  6 --
>  fs/gfs2/meta_io.c |  3 ++-
>  fs/iomap.c|  6 --
>  fs/mpage.c|  3 ++-
>  fs/xfs/xfs_aops.c |  5 +++--
>  include/linux/bio.h   | 11 +--
>  include/linux/bvec.h  | 30 ++
>  27 files changed, 125 insertions(+), 45 deletions(-)



Re: [Cluster-devel] [PATCH V12 14/20] bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages()

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:14AM +0800, Ming Lei wrote:
> bch_bio_alloc_pages() is always called on one new bio, so it is safe
> to access the bvec table directly. Given it is the only kind of this
> case, open code the bvec table access since bio_for_each_segment_all()
> will be changed to support for iterating over multipage bvec.
> 
> Acked-by: Coly Li 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  drivers/md/bcache/util.c | 6 +-
>  1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index 20eddeac1531..62fb917f7a4f 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
>   int i;
>   struct bio_vec *bv;
>  
> - bio_for_each_segment_all(bv, bio, i) {
> + /*
> +  * This is called on freshly new bio, so it is safe to access the
> +  * bvec table directly.
> +  */
> + for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
>   bv->bv_page = alloc_page(gfp_mask);
>   if (!bv->bv_page) {
>   while (--bv >= bio->bi_io_vec)
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 09/20] block: use bio_for_each_bvec() to compute multi-page bvec count

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:09AM +0800, Ming Lei wrote:
> First it is more efficient to use bio_for_each_bvec() in both
> blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how
> many multi-page bvecs there are in the bio.
> 
> Secondly once bio_for_each_bvec() is used, the bvec may need to be
> splitted because its length can be very longer than max segment size,
> so we have to split the big bvec into several segments.
> 
> Thirdly when splitting multi-page bvec into segments, the max segment
> limit may be reached, so the bio split need to be considered under
> this situation too.

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  block/blk-merge.c | 100 
> +++---
>  1 file changed, 80 insertions(+), 20 deletions(-)
> 
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 51ec6ca56a0a..2d8f388d43de 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -161,6 +161,70 @@ static inline unsigned get_max_io_size(struct 
> request_queue *q,
>   return sectors;
>  }
>  
> +static unsigned get_max_segment_size(struct request_queue *q,
> +  unsigned offset)
> +{
> + unsigned long mask = queue_segment_boundary(q);
> +
> + return min_t(unsigned long, mask - (mask & offset) + 1,
> +  queue_max_segment_size(q));
> +}
> +
> +/*
> + * Split the bvec @bv into segments, and update all kinds of
> + * variables.
> + */
> +static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
> + unsigned *nsegs, unsigned *last_seg_size,
> + unsigned *front_seg_size, unsigned *sectors)
> +{
> + unsigned len = bv->bv_len;
> + unsigned total_len = 0;
> + unsigned new_nsegs = 0, seg_size = 0;
> +
> + /*
> +  * Multipage bvec may be too big to hold in one segment,
> +  * so the current bvec has to be splitted as multiple
> +  * segments.
> +  */
> + while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
> + seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
> + seg_size = min(seg_size, len);
> +
> + new_nsegs++;
> + total_len += seg_size;
> + len -= seg_size;
> +
> + if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
> + break;
> + }
> +
> + if (!new_nsegs)
> + return !!len;
> +
> + /* update front segment size */
> + if (!*nsegs) {
> + unsigned first_seg_size;
> +
> + if (new_nsegs == 1)
> + first_seg_size = get_max_segment_size(q, bv->bv_offset);
> + else
> + first_seg_size = queue_max_segment_size(q);
> +
> + if (*front_seg_size < first_seg_size)
> + *front_seg_size = first_seg_size;
> + }
> +
> + /* update other varibles */
> + *last_seg_size = seg_size;
> + *nsegs += new_nsegs;
> + if (sectors)
> + *sectors += total_len >> 9;
> +
> + /* split in the middle of the bvec if len != 0 */
> + return !!len;
> +}
> +
>  static struct bio *blk_bio_segment_split(struct request_queue *q,
>struct bio *bio,
>struct bio_set *bs,
> @@ -174,7 +238,7 @@ static struct bio *blk_bio_segment_split(struct 
> request_queue *q,
>   struct bio *new = NULL;
>   const unsigned max_sectors = get_max_io_size(q, bio);
>  
> - bio_for_each_segment(bv, bio, iter) {
> + bio_for_each_bvec(bv, bio, iter) {
>   /*
>* If the queue doesn't support SG gaps and adding this
>* offset would create a gap, disallow it.
> @@ -189,8 +253,12 @@ static struct bio *blk_bio_segment_split(struct 
> request_queue *q,
>*/
>   if (nsegs < queue_max_segments(q) &&
>   sectors < max_sectors) {
> - nsegs++;
> - sectors = max_sectors;
> + /* split in the middle of bvec */
> + bv.bv_len = (max_sectors - sectors) << 9;
> - bvec_split_segs(q, &bv, &nsegs,
> - &seg_size,
> - &front_seg_size,
> - &sectors);
>   }
>   goto split;
>   }
> @@ -212,14 +280,12 @@ static struct b

Re: [Cluster-devel] [PATCH V12 13/20] block: loop: pass multi-page bvec to iov_iter

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:13AM +0800, Ming Lei wrote:
> iov_iter is implemented on bvec itererator helpers, so it is safe to pass
> multi-page bvec to it, and this way is much more efficient than passing one
> page in each bvec.
> 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  drivers/block/loop.c | 20 ++--
>  1 file changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 176ab1f28eca..e3683211f12d 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -510,21 +510,22 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>loff_t pos, bool rw)
>  {
>   struct iov_iter iter;
> + struct req_iterator rq_iter;
>   struct bio_vec *bvec;
>   struct request *rq = blk_mq_rq_from_pdu(cmd);
>   struct bio *bio = rq->bio;
>   struct file *file = lo->lo_backing_file;
> + struct bio_vec tmp;
>   unsigned int offset;
> - int segments = 0;
> + int nr_bvec = 0;
>   int ret;
>  
> + rq_for_each_bvec(tmp, rq, rq_iter)
> + nr_bvec++;
> +
>   if (rq->bio != rq->biotail) {
> - struct req_iterator iter;
> - struct bio_vec tmp;
>  
> - __rq_for_each_bio(bio, rq)
> - segments += bio_segments(bio);
> - bvec = kmalloc_array(segments, sizeof(struct bio_vec),
> + bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>GFP_NOIO);
>   if (!bvec)
>   return -EIO;
> @@ -533,10 +534,10 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>   /*
>* The bios of the request may be started from the middle of
>* the 'bvec' because of bio splitting, so we can't directly
> -  * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
> +  * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
>* API will take care of all details for us.
>*/
> - rq_for_each_segment(tmp, rq, iter) {
> + rq_for_each_bvec(tmp, rq, rq_iter) {
>   *bvec = tmp;
>   bvec++;
>   }
> @@ -550,11 +551,10 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>*/
>   offset = bio->bi_iter.bi_bvec_done;
>   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> - segments = bio_segments(bio);
>   }
>   atomic_set(&cmd->ref, 2);
>  
> - iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
> + iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
>   iter.iov_offset = offset;
>  
>   cmd->iocb.ki_pos = pos;
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 01/20] btrfs: remove various bio_offset arguments

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:01AM +0800, Ming Lei wrote:
> From: Christoph Hellwig 
> 
> The btrfs write path passes a bio_offset argument through some deep
> callchains including async offloading.  In the end this is easily
> calculatable using page_offset plus the bvec offset for the first
> page in the bio, and only actually used by by a single function.
> Just move the calculation of the offset there.
> 
> Reviewed-by: David Sterba 
> Signed-off-by: Christoph Hellwig 
> ---
>  fs/btrfs/disk-io.c   | 21 +
>  fs/btrfs/disk-io.h   |  2 +-
>  fs/btrfs/extent_io.c |  9 ++---
>  fs/btrfs/extent_io.h |  5 ++---
>  fs/btrfs/inode.c | 17 -
>  5 files changed, 18 insertions(+), 36 deletions(-)

[snip]

> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 9ea4c6f0352f..c576b3fcaea7 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1920,8 +1920,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned 
> long offset,
>   * At IO completion time the cums attached on the ordered extent record
>   * are inserted into the btree
>   */
> -static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio 
> *bio,
> - u64 bio_offset)
> +static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio 
> *bio)
>  {
>   struct inode *inode = private_data;
>   blk_status_t ret = 0;
> @@ -1973,8 +1972,7 @@ blk_status_t btrfs_submit_bio_done(void *private_data, 
> struct bio *bio,
>   *c-3) otherwise:async submit
>   */
>  static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio 
> *bio,
> -  int mirror_num, unsigned long bio_flags,
> -  u64 bio_offset)
> +  int mirror_num, unsigned long bio_flags)
>  {
>   struct inode *inode = private_data;
>   struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> @@ -2011,8 +2009,7 @@ static blk_status_t btrfs_submit_bio_hook(void 
> *private_data, struct bio *bio,
>   goto mapit;
>   /* we're doing a write, do the async checksumming */
>   ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
> -   bio_offset, inode,
> -   btrfs_submit_bio_start);
> +   inode, btrfs_submit_bio_start);
>   goto out;
>   } else if (!skip_sum) {
>   ret = btrfs_csum_one_bio(inode, bio, 0, 0);
> @@ -8123,10 +8120,13 @@ static void btrfs_endio_direct_write(struct bio *bio)
>  }
>  
>  static blk_status_t btrfs_submit_bio_start_direct_io(void *private_data,
> - struct bio *bio, u64 offset)
> + struct bio *bio)
>  {
>   struct inode *inode = private_data;
> + struct bio_vec *bvec = bio_first_bvec_all(bio);
> + u64 offset = page_offset(bvec->bv_page) + bvec->bv_offset;

Hm, but for direct I/O, these will be user pages (or the zero page), so
page_offset() won't be valid?

>   blk_status_t ret;
> +
>   ret = btrfs_csum_one_bio(inode, bio, offset, 1);
>   BUG_ON(ret); /* -ENOMEM */
>   return 0;



Re: [Cluster-devel] [PATCH V12 06/20] block: rename bvec helpers

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:06AM +0800, Ming Lei wrote:
> We will support multi-page bvec soon, and have to deal with
> single-page vs multi-page bvec. This patch follows Christoph's
> suggestion to rename all the following helpers:
> 
>   for_each_bvec
>   bvec_iter_bvec
>   bvec_iter_len
>   bvec_iter_page
>   bvec_iter_offset
> 
> into:
>   for_each_segment
>   segment_iter_bvec
>   segment_iter_len
>   segment_iter_page
>   segment_iter_offset
> 
> so that these helpers named with 'segment' only deal with single-page
> bvec, or called segment. We will introduce helpers named with 'bvec'
> for multi-page bvec.
> 
> bvec_iter_advance() isn't renamed becasue this helper is always operated
> on real bvec even though multi-page bvec is supported.
> 
> Suggested-by: Christoph Hellwig 

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  .clang-format  |  2 +-
>  drivers/md/dm-integrity.c  |  2 +-
>  drivers/md/dm-io.c |  4 ++--
>  drivers/nvdimm/blk.c   |  4 ++--
>  drivers/nvdimm/btt.c   |  4 ++--
>  include/linux/bio.h| 10 +-
>  include/linux/bvec.h   | 20 +++-
>  include/linux/ceph/messenger.h |  2 +-
>  lib/iov_iter.c |  2 +-
>  net/ceph/messenger.c   | 14 +++---
>  10 files changed, 33 insertions(+), 31 deletions(-)



Re: [Cluster-devel] [PATCH V12 05/20] block: remove bvec_iter_rewind()

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:05AM +0800, Ming Lei wrote:
> Commit 7759eb23fd980 ("block: remove bio_rewind_iter()") removes
> bio_rewind_iter(), then no one uses bvec_iter_rewind() any more,
> so remove it.

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  include/linux/bvec.h | 24 
>  1 file changed, 24 deletions(-)
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index 02c73c6aa805..ba0ae40e77c9 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -92,30 +92,6 @@ static inline bool bvec_iter_advance(const struct bio_vec 
> *bv,
>   return true;
>  }
>  
> -static inline bool bvec_iter_rewind(const struct bio_vec *bv,
> -  struct bvec_iter *iter,
> -  unsigned int bytes)
> -{
> - while (bytes) {
> - unsigned len = min(bytes, iter->bi_bvec_done);
> -
> - if (iter->bi_bvec_done == 0) {
> - if (WARN_ONCE(iter->bi_idx == 0,
> -   "Attempted to rewind iter beyond "
> -   "bvec's boundaries\n")) {
> - return false;
> - }
> - iter->bi_idx--;
> - iter->bi_bvec_done = __bvec_iter_bvec(bv, 
> *iter)->bv_len;
> - continue;
> - }
> - bytes -= len;
> - iter->bi_size += len;
> - iter->bi_bvec_done -= len;
> - }
> - return true;
> -}
> -
>  #define for_each_bvec(bvl, bio_vec, iter, start) \
>   for (iter = (start);\
>(iter).bi_size &&  \
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 04/20] block: don't use bio->bi_vcnt to figure out segment number

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:04AM +0800, Ming Lei wrote:
> It is wrong to use bio->bi_vcnt to figure out how many segments
> there are in the bio even though CLONED flag isn't set on this bio,
> because this bio may be splitted or advanced.
> 
> So always use bio_segments() in blk_recount_segments(), and it shouldn't
> cause any performance loss now because the physical segment number is figured
> out in blk_queue_split() and BIO_SEG_VALID is set meantime since
> bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting").
> 
> Reviewed-by: Christoph Hellwig 

Reviewed-by: Omar Sandoval 

> Fixes: 76d8137a3113 ("blk-merge: recaculate segment if it isn't less than max 
> segments")
> Signed-off-by: Ming Lei 
> ---
>  block/blk-merge.c | 8 +---
>  1 file changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index e69d8f8ba819..51ec6ca56a0a 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -367,13 +367,7 @@ void blk_recalc_rq_segments(struct request *rq)
>  
>  void blk_recount_segments(struct request_queue *q, struct bio *bio)
>  {
> - unsigned short seg_cnt;
> -
> - /* estimate segment number by bi_vcnt for non-cloned bio */
> - if (bio_flagged(bio, BIO_CLONED))
> - seg_cnt = bio_segments(bio);
> - else
> - seg_cnt = bio->bi_vcnt;
> + unsigned short seg_cnt = bio_segments(bio);
>  
>   if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
>   (seg_cnt < queue_max_segments(q)))
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V12 03/20] block: remove the "cluster" flag

2018-11-26 Thread Omar Sandoval
On Mon, Nov 26, 2018 at 10:17:03AM +0800, Ming Lei wrote:
> From: Christoph Hellwig 
> 
> The cluster flag implements some very old SCSI behavior.  As far as I
> can tell the original intent was to enable or disable any kind of
> segment merging.  But the actually visible effect to the LLDD is that
> it limits each segments to be inside a single page, which we can
> also affect by setting the maximum segment size and the segment
> boundary.

Reviewed-by: Omar Sandoval 

One comment typo below.

> Signed-off-by: Christoph Hellwig 
> 
> Replace virt boundary with segment boundary limit.
> 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-merge.c   | 20 
>  block/blk-settings.c|  3 ---
>  block/blk-sysfs.c   |  5 +
>  drivers/scsi/scsi_lib.c | 20 
>  include/linux/blkdev.h  |  6 --
>  5 files changed, 25 insertions(+), 29 deletions(-)
> 

[snip]

> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 0df15cb738d2..78d6d05992b0 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1810,6 +1810,8 @@ static int scsi_map_queues(struct blk_mq_tag_set *set)
>  void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q)
>  {
>   struct device *dev = shost->dma_dev;
> + unsigned max_segment_size = dma_get_max_seg_size(dev);
> + unsigned long segment_boundary = shost->dma_boundary;
>  
>   /*
>* this limit is imposed by hardware restrictions
> @@ -1828,13 +1830,23 @@ void __scsi_init_queue(struct Scsi_Host *shost, 
> struct request_queue *q)
>   blk_queue_max_hw_sectors(q, shost->max_sectors);
>   if (shost->unchecked_isa_dma)
>   blk_queue_bounce_limit(q, BLK_BOUNCE_ISA);
> - blk_queue_segment_boundary(q, shost->dma_boundary);
>   dma_set_seg_boundary(dev, shost->dma_boundary);
>  
> - blk_queue_max_segment_size(q, dma_get_max_seg_size(dev));
> + /*
> +  * Clustering is a really old concept from the stone age of Linux
> +  * SCSI support.  But the basic idea is that we never give the
> +  * driver a segment that spans multiple pages.  For that we need
> +  * to limit the segment size, and set the segment boundary so that
> +  * we never merge a second segment which is no page aligned.

Typo, "which is not page aligned".



Re: [Cluster-devel] [PATCH V10 18/19] block: kill QUEUE_FLAG_NO_SG_MERGE

2018-11-16 Thread Omar Sandoval
On Fri, Nov 16, 2018 at 02:59:22PM +0100, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018 at 06:18:11PM -0800, Omar Sandoval wrote:
> > This commit message wasn't very clear. Is it the case that
> > QUEUE_FLAG_NO_SG_MERGE is no longer set by any drivers?
> 
> I think he wants to say that not doing S/G merging is rather pointless
> with the current setup of the I/O path, as it isn't going to save
> you a significant amount of cycles.

Okay, that makes sense. Ming, you can add

Reviewed-by: Omar Sandoval 



Re: [Cluster-devel] [PATCH V10 03/19] block: use bio_for_each_bvec() to compute multi-page bvec count

2018-11-16 Thread Omar Sandoval
On Fri, Nov 16, 2018 at 10:19:56AM +0100, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018 at 02:18:47PM -0800, Omar Sandoval wrote:
> > My only reason to prefer unsigned int is consistency. unsigned int is
> > much more common in the kernel:
> > 
> > $ ag --cc -s 'unsigned\s+int' | wc -l
> > 129632
> > $ ag --cc -s 'unsigned\s+(?!char|short|int|long)' | wc -l
> > 22435
> > 
> > checkpatch also warns on plain unsigned.
> 
> Talk about chicken and egg.  unsigned is perfectly valid C, and being
> shorter often helps being more readable.  checkpath is as so often
> wrongly opinionated..

Fair enough. Since enough people don't mind bare unsigned in the block
code, I retract my comment :)



Re: [Cluster-devel] [PATCH V10 18/19] block: kill QUEUE_FLAG_NO_SG_MERGE

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:53:05PM +0800, Ming Lei wrote:
> Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"),
> physical segment number is mainly figured out in blk_queue_split() for
> fast path, and the flag of BIO_SEG_VALID is set there too.
> 
> Now only blk_recount_segments() and blk_recalc_rq_segments() use this
> flag.
> 
> Basically blk_recount_segments() is bypassed in fast path given BIO_SEG_VALID
> is set in blk_queue_split().
> 
> For another user of blk_recalc_rq_segments():
> 
> - run in partial completion branch of blk_update_request, which is an unusual 
> case
> 
> - run in blk_cloned_rq_check_limits(), still not a big problem if the flag is 
> killed
> since dm-rq is the only user.
> 
> Multi-page bvec is enabled now, QUEUE_FLAG_NO_SG_MERGE doesn't make sense any 
> more.

This commit message wasn't very clear. Is it the case that
QUEUE_FLAG_NO_SG_MERGE is no longer set by any drivers?



Re: [Cluster-devel] [PATCH V10 17/19] block: don't use bio->bi_vcnt to figure out segment number

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:53:04PM +0800, Ming Lei wrote:
> It is wrong to use bio->bi_vcnt to figure out how many segments
> there are in the bio even though CLONED flag isn't set on this bio,
> because this bio may be splitted or advanced.
> 
> So always use bio_segments() in blk_recount_segments(), and it shouldn't
> cause any performance loss now because the physical segment number is figured
> out in blk_queue_split() and BIO_SEG_VALID is set meantime since
> bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting").
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Fixes: 7f60dcaaf91 ("block: blk-merge: fix blk_recount_segments()")

From what I can tell, the problem was originally introduced by
76d8137a3113 ("blk-merge: recaculate segment if it isn't less than max 
segments")

Is that right?

> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  block/blk-merge.c | 8 +---
>  1 file changed, 1 insertion(+), 7 deletions(-)
> 
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index cb9f49bcfd36..153a659fde74 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -429,13 +429,7 @@ void blk_recalc_rq_segments(struct request *rq)
>  
>  void blk_recount_segments(struct request_queue *q, struct bio *bio)
>  {
> - unsigned short seg_cnt;
> -
> - /* estimate segment number by bi_vcnt for non-cloned bio */
> - if (bio_flagged(bio, BIO_CLONED))
> - seg_cnt = bio_segments(bio);
> - else
> - seg_cnt = bio->bi_vcnt;
> + unsigned short seg_cnt = bio_segments(bio);
>  
>   if (test_bit(QUEUE_FLAG_NO_SG_MERGE, >queue_flags) &&
>   (seg_cnt < queue_max_segments(q)))
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 13/19] iomap & xfs: only account for new added page

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:53:00PM +0800, Ming Lei wrote:
> After multi-page is enabled, one new page may be merged to a segment
> even though it is a new added page.
> 
> This patch deals with this issue by post-check in case of merge, and
> only a freshly new added page need to be dealt with for iomap & xfs.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  fs/iomap.c  | 22 ++
>  fs/xfs/xfs_aops.c   | 10 --
>  include/linux/bio.h | 11 +++
>  3 files changed, 33 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index df0212560b36..a1b97a5c726a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -288,6 +288,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
> loff_t length, void *data,
>   loff_t orig_pos = pos;
>   unsigned poff, plen;
>   sector_t sector;
> + bool need_account = false;
>  
>   if (iomap->type == IOMAP_INLINE) {
>   WARN_ON_ONCE(pos);
> @@ -313,18 +314,15 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
> loff_t length, void *data,
>*/
>   sector = iomap_sector(iomap, pos);
>   if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
> - if (__bio_try_merge_page(ctx->bio, page, plen, poff))
> + if (__bio_try_merge_page(ctx->bio, page, plen, poff)) {
> + need_account = iop && bio_is_last_segment(ctx->bio,
> + page, plen, poff);

It's redundant to make this iop && ... since you already check
iop && need_account below. Maybe rename it to added_page? Also, this
indentation is wack.

>   goto done;
> + }
>   is_contig = true;
>   }
>  
> - /*
> -  * If we start a new segment we need to increase the read count, and we
> -  * need to do so before submitting any previous full bio to make sure
> -  * that we don't prematurely unlock the page.
> -  */
> - if (iop)
> - atomic_inc(>read_count);
> + need_account = true;
>  
>   if (!ctx->bio || !is_contig || bio_full(ctx->bio)) {
>   gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
> @@ -347,6 +345,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, 
> loff_t length, void *data,
>   __bio_add_page(ctx->bio, page, plen, poff);
>  done:
>   /*
> +  * If we add a new page we need to increase the read count, and we
> +  * need to do so before submitting any previous full bio to make sure
> +  * that we don't prematurely unlock the page.
> +  */
> + if (iop && need_account)
> + atomic_inc(>read_count);
> +
> + /*
>* Move the caller beyond our range so that it keeps making progress.
>* For that we have to include any leading non-uptodate ranges, but
>* we can skip trailing ones as they will be handled in the next
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 1f1829e506e8..d8e9cc9f751a 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -603,6 +603,7 @@ xfs_add_to_ioend(
>   unsignedlen = i_blocksize(inode);
>   unsignedpoff = offset & (PAGE_SIZE - 1);
>   sector_tsector;
> + boolneed_account;
>  
>   sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
>   ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
> @@ -617,13 +618,18 @@ xfs_add_to_ioend(
>   }
>  
>   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
> - if (iop)
> - atomic_inc(>write_count);
> + need_account = true;
>   if (bio_full(wpc->ioend->io_bio))
>   xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
>   __bio_add_page(wpc->ioend->io_bio, page, len, poff);
> + } else {
> + need_account = iop && bio_is_last_segment(wpc->ioend->io_bio,
> + page, len, poff);

Same here, no need for iop &&, rename it added_page, indentation is off.

>   }
>  
> + if (iop && need_account)
> + atomic_inc(>write_count);
> +
>   wpc->ioend->io_size += len;
>  }
>  
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1a2430a8b89d..5040e9a2eb09 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -341,6 +341,17 @@ static inline struct bio_vec 

Re: [Cluster-devel] [PATCH V10 15/19] block: always define BIO_MAX_PAGES as 256

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:53:02PM +0800, Ming Lei wrote:
> Now multi-page bvec can cover CONFIG_THP_SWAP, so we don't need to
> increase BIO_MAX_PAGES for it.

You mentioned it in the cover letter, but this needs more explanation
in the commit message. Why did CONFIG_THP_SWAP require > 256? Why does
multipage bvecs remove that requirement?

> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  include/linux/bio.h | 8 
>  1 file changed, 8 deletions(-)
> 
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 5040e9a2eb09..277921ad42e7 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -34,15 +34,7 @@
>  #define BIO_BUG_ON
>  #endif
>  
> -#ifdef CONFIG_THP_SWAP
> -#if HPAGE_PMD_NR > 256
> -#define BIO_MAX_PAGESHPAGE_PMD_NR
> -#else
>  #define BIO_MAX_PAGES256
> -#endif
> -#else
> -#define BIO_MAX_PAGES256
> -#endif
>  
>  #define bio_prio(bio)(bio)->bi_ioprio
>  #define bio_set_prio(bio, prio)  ((bio)->bi_ioprio = prio)
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 16/19] block: document usage of bio iterator helpers

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:53:03PM +0800, Ming Lei wrote:
> Now multi-page bvec is supported, some helpers may return page by
> page, meantime some may return segment by segment, this patch
> documents the usage.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  Documentation/block/biovecs.txt | 26 ++
>  1 file changed, 26 insertions(+)
> 
> diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
> index 25689584e6e0..bfafb70d0d9e 100644
> --- a/Documentation/block/biovecs.txt
> +++ b/Documentation/block/biovecs.txt
> @@ -117,3 +117,29 @@ Other implications:
> size limitations and the limitations of the underlying devices. Thus
> there's no need to define ->merge_bvec_fn() callbacks for individual block
> drivers.
> +
> +Usage of helpers:
> +=
> +
> +* The following helpers whose names have the suffix of "_all" can only be 
> used
> +on non-BIO_CLONED bio, and usually they are used by filesystem code, and 
> driver
> +shouldn't use them because bio may have been split before they got to the 
> driver:

Putting an english teacher hat on, this is quite the run-on sentence.
How about:

* The following helpers whose names have the suffix of "_all" can only be used
on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
shouldn't use them because the bio may have been split before it reached the
driver.

Maybe also an explanation of why the filesystem would want to use these?

> + bio_for_each_segment_all()
> + bio_first_bvec_all()
> + bio_first_page_all()
> + bio_last_bvec_all()
> +
> +* The following helpers iterate over single-page bvec, and the local
> +variable of 'struct bio_vec' or the reference records single-page IO
> +vector during the itearation:


* The following helpers iterate over single-page bvecs. The passed 'struct
bio_vec' will contain a single-page IO vector during the iteration.

> + bio_for_each_segment()
> + bio_for_each_segment_all()
> +
> +* The following helper iterates over multi-page bvec, and each bvec may
> +include multiple physically contiguous pages, and the local variable of
> +'struct bio_vec' or the reference records multi-page IO vector during the
> +itearation:

* The following helper iterates over multi-page bvecs. Each bvec may
include multiple physically contiguous pages. The passed 'struct bio_vec' will
contain a multi-page IO vector during the iteration.

> + bio_for_each_bvec()
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 14/19] block: enable multipage bvecs

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:53:01PM +0800, Ming Lei wrote:
> This patch pulls the trigger for multi-page bvecs.
> 
> Now any request queue which supports queue cluster will see multi-page
> bvecs.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  block/bio.c | 24 ++--
>  1 file changed, 18 insertions(+), 6 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 6486722d4d4b..ed6df6f8e63d 100644
> --- a/block/bio.c
> +++ b/block/bio.c

This comment above __bio_try_merge_page() doesn't make sense after this
change:

 This is a
 a useful optimisation for file systems with a block size smaller than the
 page size.

Can you please get rid of it in this patch?

> @@ -767,12 +767,24 @@ bool __bio_try_merge_page(struct bio *bio, struct page 
> *page,
>  
>   if (bio->bi_vcnt > 0) {
>   struct bio_vec *bv = >bi_io_vec[bio->bi_vcnt - 1];
> -
> - if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
> - bv->bv_len += len;
> - bio->bi_iter.bi_size += len;
> - return true;
> - }
> + struct request_queue *q = NULL;
> +
> + if (page == bv->bv_page && off == (bv->bv_offset + bv->bv_len)
> + && (off + len) <= PAGE_SIZE)
> + goto merge;

The parentheses around (bv->bv_offset + bv->bv_len) and (off + len) are
unnecessary noise.

What's the point of the new (off + len) <= PAGE_SIZE check?

> +
> + if (bio->bi_disk)
> + q = bio->bi_disk->queue;
> +
> + /* disable multi-page bvec too if cluster isn't enabled */
> + if (!q || !blk_queue_cluster(q) ||
> + ((page_to_phys(bv->bv_page) + bv->bv_offset + bv->bv_len) !=
> +  (page_to_phys(page) + off)))

More unnecessary parentheses here.

> + return false;
> + merge:
> + bv->bv_len += len;
> + bio->bi_iter.bi_size += len;
> + return true;
>   }
>   return false;
>  }
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 10/19] block: loop: pass multi-page bvec to iov_iter

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:57PM +0800, Ming Lei wrote:
> iov_iter is implemented with bvec itererator, so it is safe to pass
> multipage bvec to it, and this way is much more efficient than
> passing one page in each bvec.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

Comments below.

> Signed-off-by: Ming Lei 
> ---
>  drivers/block/loop.c | 23 ---
>  1 file changed, 12 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index bf6bc35aaf88..a3fd418ec637 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -515,16 +515,16 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>   struct bio *bio = rq->bio;
>   struct file *file = lo->lo_backing_file;
>   unsigned int offset;
> - int segments = 0;
> + int nr_bvec = 0;
>   int ret;
>  
>   if (rq->bio != rq->biotail) {
> - struct req_iterator iter;
> + struct bvec_iter iter;
>   struct bio_vec tmp;
>  
>   __rq_for_each_bio(bio, rq)
> - segments += bio_segments(bio);
> - bvec = kmalloc_array(segments, sizeof(struct bio_vec),
> + nr_bvec += bio_bvecs(bio);
> + bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
>GFP_NOIO);
>   if (!bvec)
>   return -EIO;
> @@ -533,13 +533,14 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>   /*
>* The bios of the request may be started from the middle of
>* the 'bvec' because of bio splitting, so we can't directly
> -  * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
> +  * copy bio->bi_iov_vec to new bvec. The bio_for_each_bvec
>* API will take care of all details for us.
>*/
> - rq_for_each_segment(tmp, rq, iter) {
> - *bvec = tmp;
> - bvec++;
> - }
> + __rq_for_each_bio(bio, rq)
> + bio_for_each_bvec(tmp, bio, iter) {
> + *bvec = tmp;
> + bvec++;
> + }

Even if they're not strictly necessary, could you please include the
curly braces for __rq_for_each_bio() here?

>   bvec = cmd->bvec;
>   offset = 0;
>   } else {
> @@ -550,11 +551,11 @@ static int lo_rw_aio(struct loop_device *lo, struct 
> loop_cmd *cmd,
>*/
>   offset = bio->bi_iter.bi_bvec_done;
>   bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> - segments = bio_segments(bio);
> + nr_bvec = bio_bvecs(bio);

This scared me for a second, but it's fine to do here because we haven't
actually enabled multipage bvecs yet, right?

>   }
>   atomic_set(>ref, 2);
>  
> - iov_iter_bvec(, rw, bvec, segments, blk_rq_bytes(rq));
> + iov_iter_bvec(, rw, bvec, nr_bvec, blk_rq_bytes(rq));
>   iter.iov_offset = offset;
>  
>   cmd->iocb.ki_pos = pos;
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 12/19] block: allow bio_for_each_segment_all() to iterate over multi-page bvec

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:59PM +0800, Ming Lei wrote:
> This patch introduces one extra iterator variable to 
> bio_for_each_segment_all(),
> then we can allow bio_for_each_segment_all() to iterate over multi-page bvec.
> 
> Given it is just one mechannical & simple change on all 
> bio_for_each_segment_all()
> users, this patch does tree-wide change in one single patch, so that we can
> avoid to use a temporary helper for this conversion.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Alexander Viro 
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: linux-bt...@vger.kernel.org
> Cc: David Sterba 
> Cc: Darrick J. Wong 
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  block/bio.c   | 27 ++-
>  block/blk-zoned.c |  1 +
>  block/bounce.c|  6 --
>  drivers/md/bcache/btree.c |  3 ++-
>  drivers/md/dm-crypt.c |  3 ++-
>  drivers/md/raid1.c|  3 ++-
>  drivers/staging/erofs/data.c  |  3 ++-
>  drivers/staging/erofs/unzip_vle.c |  3 ++-
>  fs/block_dev.c|  6 --
>  fs/btrfs/compression.c|  3 ++-
>  fs/btrfs/disk-io.c|  3 ++-
>  fs/btrfs/extent_io.c  | 12 
>  fs/btrfs/inode.c  |  6 --
>  fs/btrfs/raid56.c |  3 ++-
>  fs/crypto/bio.c   |  3 ++-
>  fs/direct-io.c|  4 +++-
>  fs/exofs/ore.c|  3 ++-
>  fs/exofs/ore_raid.c   |  3 ++-
>  fs/ext4/page-io.c |  3 ++-
>  fs/ext4/readpage.c|  3 ++-
>  fs/f2fs/data.c|  9 ++---
>  fs/gfs2/lops.c|  6 --
>  fs/gfs2/meta_io.c |  3 ++-
>  fs/iomap.c|  6 --
>  fs/mpage.c|  3 ++-
>  fs/xfs/xfs_aops.c |  5 +++--
>  include/linux/bio.h   | 11 +--
>  include/linux/bvec.h  | 31 +++
>  28 files changed, 129 insertions(+), 46 deletions(-)
> 

[snip]

> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 3496c816946e..1a2430a8b89d 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -131,12 +131,19 @@ static inline bool bio_full(struct bio *bio)
>   return bio->bi_vcnt >= bio->bi_max_vecs;
>  }
>  
> +#define bvec_for_each_segment(bv, bvl, i, iter_all)  \
> + for (bv = bvec_init_iter_all(_all);\
> + (iter_all.done < (bvl)->bv_len) &&  \
> + ((bvec_next_segment((bvl), _all)), 1); \

The parentheses around (bvec_next_segment((bvl), _all)) are
unnecessary.

> + iter_all.done += bv->bv_len, i += 1)
> +
>  /*
>   * drivers should _never_ use the all version - the bio may have been split
>   * before it got to the driver and the driver won't own all of it
>   */
> -#define bio_for_each_segment_all(bvl, bio, i)
> \
> - for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
> +#define bio_for_each_segment_all(bvl, bio, i, iter_all)  \
> + for (i = 0, iter_all.idx = 0; iter_all.idx < (bio)->bi_vcnt; 
> iter_all.idx++)\
> + bvec_for_each_segment(bvl, &((bio)->bi_io_vec[iter_all.idx]), 
> i, iter_all)

Would it be possible to move i into iter_all to streamline this a bit?

>  static inline void __bio_advance_iter(struct bio *bio, struct bvec_iter 
> *iter,
> unsigned bytes, bool mp)
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index 01616a0b6220..02f26d2b59ad 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -82,6 +82,12 @@ struct bvec_iter {
>  current bvec */
>  };
>  
> +struct bvec_iter_all {
> + struct bio_vec  bv;
> + int idx;
> + unsigneddone;
> +};
> +
>  /*
>   * various member access, note that bio_data should of course not be used
>   * on highmem page vectors
> @@ -216,6 +222,31 @@ static inline bool mp_bvec_iter_advance(const struct 
> bio_vec *bv,
>   .bi_bvec_done   = 0,\
>  }
>  
> +static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all 
> *iter_all)
> +{
> + iter_all->bv.bv_page = NULL;
> + iter_all->done = 0;
> +
> + return _all->bv;
> +}
> +
> +/* used for chunk_for_each_segment */
> +static inline void bvec_next_segment(const struct bio_vec *bvec,
> + struct bvec_iter_all *iter_all)

Indentation.

> +{
> + struct bio_vec 

Re: [Cluster-devel] [PATCH V10 11/19] bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages()

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:58PM +0800, Ming Lei wrote:
> bch_bio_alloc_pages() is always called on one new bio, so it is safe
> to access the bvec table directly. Given it is the only kind of this
> case, open code the bvec table access since bio_for_each_segment_all()
> will be changed to support for iterating over multipage bvec.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Acked-by: Coly Li 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  drivers/md/bcache/util.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index 20eddeac1531..8517aebcda2d 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -270,7 +270,7 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
>   int i;
>   struct bio_vec *bv;
>  
> - bio_for_each_segment_all(bv, bio, i) {
> + for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++) {

This is missing an i++.



Re: [Cluster-devel] [PATCH V10 07/19] btrfs: use bvec_last_segment to get bio's last page

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:54PM +0800, Ming Lei wrote:
> Preparing for supporting multi-page bvec.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  fs/btrfs/compression.c | 5 -
>  fs/btrfs/extent_io.c   | 5 +++--
>  2 files changed, 7 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
> index 2955a4ea2fa8..161e14b8b180 100644
> --- a/fs/btrfs/compression.c
> +++ b/fs/btrfs/compression.c
> @@ -400,8 +400,11 @@ blk_status_t btrfs_submit_compressed_write(struct inode 
> *inode, u64 start,
>  static u64 bio_end_offset(struct bio *bio)
>  {
>   struct bio_vec *last = bio_last_bvec_all(bio);
> + struct bio_vec bv;
>  
> - return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
> + bvec_last_segment(last, );
> +
> + return page_offset(bv.bv_page) + bv.bv_len + bv.bv_offset;
>  }
>  
>  static noinline int add_ra_bio_pages(struct inode *inode,
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index d228f706ff3e..5d5965297e7e 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2720,11 +2720,12 @@ static int __must_check submit_one_bio(struct bio 
> *bio, int mirror_num,
>  {
>   blk_status_t ret = 0;
>   struct bio_vec *bvec = bio_last_bvec_all(bio);
> - struct page *page = bvec->bv_page;
> + struct bio_vec bv;
>   struct extent_io_tree *tree = bio->bi_private;
>   u64 start;
>  
> - start = page_offset(page) + bvec->bv_offset;
> + bvec_last_segment(bvec, );
> + start = page_offset(bv.bv_page) + bv.bv_offset;
>  
>   bio->bi_private = NULL;
>  
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 09/19] block: introduce bio_bvecs()

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:56PM +0800, Ming Lei wrote:
> There are still cases in which we need to use bio_bvecs() for get the
> number of multi-page segment, so introduce it.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  include/linux/bio.h | 30 +-
>  1 file changed, 25 insertions(+), 5 deletions(-)
> 
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1f0dcf109841..3496c816946e 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -196,7 +196,6 @@ static inline unsigned bio_segments(struct bio *bio)
>* We special case discard/write same/write zeroes, because they
>* interpret bi_size differently:
>*/
> -
>   switch (bio_op(bio)) {
>   case REQ_OP_DISCARD:
>   case REQ_OP_SECURE_ERASE:
> @@ -205,13 +204,34 @@ static inline unsigned bio_segments(struct bio *bio)
>   case REQ_OP_WRITE_SAME:
>   return 1;
>   default:
> - break;
> + bio_for_each_segment(bv, bio, iter)
> + segs++;
> + return segs;
>   }
> +}
>  
> - bio_for_each_segment(bv, bio, iter)
> - segs++;
> +static inline unsigned bio_bvecs(struct bio *bio)
> +{
> + unsigned bvecs = 0;
> + struct bio_vec bv;
> + struct bvec_iter iter;
>  
> - return segs;
> + /*
> +  * We special case discard/write same/write zeroes, because they
> +  * interpret bi_size differently:
> +  */
> + switch (bio_op(bio)) {
> + case REQ_OP_DISCARD:
> + case REQ_OP_SECURE_ERASE:
> + case REQ_OP_WRITE_ZEROES:
> + return 0;
> + case REQ_OP_WRITE_SAME:
> + return 1;
> + default:
> + bio_for_each_bvec(bv, bio, iter)
> + bvecs++;
> + return bvecs;
> + }
>  }
>  
>  /*
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 08/19] btrfs: move bio_pages_all() to btrfs

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:55PM +0800, Ming Lei wrote:
> BTRFS is the only user of this helper, so move this helper into
> BTRFS, and implement it via bio_for_each_segment_all(), since
> bio->bi_vcnt may not equal to number of pages after multipage bvec
> is enabled.

Shouldn't you also get rid of bio_pages_all() in this patch?

> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  fs/btrfs/extent_io.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 5d5965297e7e..874bb9aeebdc 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -2348,6 +2348,18 @@ struct bio *btrfs_create_repair_bio(struct inode 
> *inode, struct bio *failed_bio,
>   return bio;
>  }
>  
> +static unsigned btrfs_bio_pages_all(struct bio *bio)
> +{
> + unsigned i;
> + struct bio_vec *bv;
> +
> + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
> +
> + bio_for_each_segment_all(bv, bio, i)
> + ;
> + return i;
> +}
> +
>  /*
>   * this is a generic handler for readpage errors (default
>   * readpage_io_failed_hook). if other copies exist, read those and write back
> @@ -2368,7 +2380,7 @@ static int bio_readpage_error(struct bio *failed_bio, 
> u64 phy_offset,
>   int read_mode = 0;
>   blk_status_t status;
>   int ret;
> - unsigned failed_bio_pages = bio_pages_all(failed_bio);
> + unsigned failed_bio_pages = btrfs_bio_pages_all(failed_bio);
>  
>   BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
>  
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 06/19] fs/buffer.c: use bvec iterator to truncate the bio

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:53PM +0800, Ming Lei wrote:
> Once multi-page bvec is enabled, the last bvec may include more than one
> page, this patch use bvec_last_segment() to truncate the bio.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  fs/buffer.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 1286c2b95498..fa37ad52e962 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -3032,7 +3032,10 @@ void guard_bio_eod(int op, struct bio *bio)
>  
>   /* ..and clear the end of the buffer for reads */
>   if (op == REQ_OP_READ) {
> - zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
> + struct bio_vec bv;
> +
> + bvec_last_segment(bvec, );
> + zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
>   truncated_bytes);
>   }
>  }
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 05/19] block: introduce bvec_last_segment()

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:52PM +0800, Ming Lei wrote:
> BTRFS and guard_bio_eod() need to get the last singlepage segment
> from one multipage bvec, so introduce this helper to make them happy.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

Minor comments below.

> Signed-off-by: Ming Lei 
> ---
>  include/linux/bvec.h | 25 +
>  1 file changed, 25 insertions(+)
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index 3d61352cd8cf..01616a0b6220 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -216,4 +216,29 @@ static inline bool mp_bvec_iter_advance(const struct 
> bio_vec *bv,
>   .bi_bvec_done   = 0,\
>  }
>  
> +/*
> + * Get the last singlepage segment from the multipage bvec and store it
> + * in @seg
> + */
> +static inline void bvec_last_segment(const struct bio_vec *bvec,
> + struct bio_vec *seg)

Indentation is all messed up here.

> +{
> + unsigned total = bvec->bv_offset + bvec->bv_len;
> + unsigned last_page = total / PAGE_SIZE;
> +
> + if (last_page * PAGE_SIZE == total)
> + last_page--;

I think this could just be

unsigned int last_page = (total - 1) / PAGE_SIZE;

> + seg->bv_page = nth_page(bvec->bv_page, last_page);
> +
> + /* the whole segment is inside the last page */
> + if (bvec->bv_offset >= last_page * PAGE_SIZE) {
> + seg->bv_offset = bvec->bv_offset % PAGE_SIZE;
> + seg->bv_len = bvec->bv_len;
> + } else {
> + seg->bv_offset = 0;
> + seg->bv_len = total - last_page * PAGE_SIZE;
> + }
> +}
> +
>  #endif /* __LINUX_BVEC_ITER_H */
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 03/19] block: use bio_for_each_bvec() to compute multi-page bvec count

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:05:10PM -0500, Mike Snitzer wrote:
> On Thu, Nov 15 2018 at  3:20pm -0500,
> Omar Sandoval  wrote:
> 
> > On Thu, Nov 15, 2018 at 04:52:50PM +0800, Ming Lei wrote:
> > > First it is more efficient to use bio_for_each_bvec() in both
> > > blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how
> > > many multi-page bvecs there are in the bio.
> > > 
> > > Secondly once bio_for_each_bvec() is used, the bvec may need to be
> > > splitted because its length can be very longer than max segment size,
> > > so we have to split the big bvec into several segments.
> > > 
> > > Thirdly when splitting multi-page bvec into segments, the max segment
> > > limit may be reached, so the bio split need to be considered under
> > > this situation too.
> > > 
> > > Cc: Dave Chinner 
> > > Cc: Kent Overstreet 
> > > Cc: Mike Snitzer 
> > > Cc: dm-de...@redhat.com
> > > Cc: Alexander Viro 
> > > Cc: linux-fsde...@vger.kernel.org
> > > Cc: Shaohua Li 
> > > Cc: linux-r...@vger.kernel.org
> > > Cc: linux-er...@lists.ozlabs.org
> > > Cc: David Sterba 
> > > Cc: linux-bt...@vger.kernel.org
> > > Cc: Darrick J. Wong 
> > > Cc: linux-...@vger.kernel.org
> > > Cc: Gao Xiang 
> > > Cc: Christoph Hellwig 
> > > Cc: Theodore Ts'o 
> > > Cc: linux-e...@vger.kernel.org
> > > Cc: Coly Li 
> > > Cc: linux-bca...@vger.kernel.org
> > > Cc: Boaz Harrosh 
> > > Cc: Bob Peterson 
> > > Cc: cluster-devel@redhat.com
> > > Signed-off-by: Ming Lei 
> > > ---
> > >  block/blk-merge.c | 90 
> > > ++-
> > >  1 file changed, 76 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/block/blk-merge.c b/block/blk-merge.c
> > > index 91b2af332a84..6f7deb94a23f 100644
> > > --- a/block/blk-merge.c
> > > +++ b/block/blk-merge.c
> > > @@ -160,6 +160,62 @@ static inline unsigned get_max_io_size(struct 
> > > request_queue *q,
> > >   return sectors;
> > >  }
> > >  
> > > +/*
> > > + * Split the bvec @bv into segments, and update all kinds of
> > > + * variables.
> > > + */
> > > +static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
> > > + unsigned *nsegs, unsigned *last_seg_size,
> > > + unsigned *front_seg_size, unsigned *sectors)
> > > +{
> > > + bool need_split = false;
> > > + unsigned len = bv->bv_len;
> > > + unsigned total_len = 0;
> > > + unsigned new_nsegs = 0, seg_size = 0;
> > 
> > "unsigned int" here and everywhere else.
> 
> Curious why?  I've wondered what govens use of "unsigned" vs "unsigned
> int" recently and haven't found _the_ reason to pick one over the other.

My only reason to prefer unsigned int is consistency. unsigned int is
much more common in the kernel:

$ ag --cc -s 'unsigned\s+int' | wc -l
129632
$ ag --cc -s 'unsigned\s+(?!char|short|int|long)' | wc -l
22435

checkpatch also warns on plain unsigned.



Re: [Cluster-devel] [PATCH V10 04/19] block: use bio_for_each_bvec() to map sg

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:51PM +0800, Ming Lei wrote:
> It is more efficient to use bio_for_each_bvec() to map sg, meantime
> we have to consider splitting multipage bvec as done in 
> blk_bio_segment_split().
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

> Signed-off-by: Ming Lei 
> ---
>  block/blk-merge.c | 72 
> +++
>  1 file changed, 52 insertions(+), 20 deletions(-)
> 
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 6f7deb94a23f..cb9f49bcfd36 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -473,6 +473,56 @@ static int blk_phys_contig_segment(struct request_queue 
> *q, struct bio *bio,
>   return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
>  }
>  
> +static struct scatterlist *blk_next_sg(struct scatterlist **sg,
> + struct scatterlist *sglist)
> +{
> + if (!*sg)
> + return sglist;
> + else {
> + /*
> +  * If the driver previously mapped a shorter
> +  * list, we could see a termination bit
> +  * prematurely unless it fully inits the sg
> +  * table on each mapping. We KNOW that there
> +  * must be more entries here or the driver
> +  * would be buggy, so force clear the
> +  * termination bit to avoid doing a full
> +  * sg_init_table() in drivers for each command.
> +  */
> + sg_unmark_end(*sg);
> + return sg_next(*sg);
> + }
> +}
> +
> +static unsigned blk_bvec_map_sg(struct request_queue *q,
> + struct bio_vec *bvec, struct scatterlist *sglist,
> + struct scatterlist **sg)
> +{
> + unsigned nbytes = bvec->bv_len;
> + unsigned nsegs = 0, total = 0;
> +
> + while (nbytes > 0) {
> + unsigned seg_size;
> + struct page *pg;
> + unsigned offset, idx;
> +
> + *sg = blk_next_sg(sg, sglist);
> +
> + seg_size = min(nbytes, queue_max_segment_size(q));
> + offset = (total + bvec->bv_offset) % PAGE_SIZE;
> + idx = (total + bvec->bv_offset) / PAGE_SIZE;
> + pg = nth_page(bvec->bv_page, idx);
> +
> + sg_set_page(*sg, pg, seg_size, offset);
> +
> + total += seg_size;
> + nbytes -= seg_size;
> + nsegs++;
> + }
> +
> + return nsegs;
> +}
> +
>  static inline void
>  __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
>struct scatterlist *sglist, struct bio_vec *bvprv,
> @@ -490,25 +540,7 @@ __blk_segment_map_sg(struct request_queue *q, struct 
> bio_vec *bvec,
>   (*sg)->length += nbytes;
>   } else {
>  new_segment:
> - if (!*sg)
> - *sg = sglist;
> - else {
> - /*
> -  * If the driver previously mapped a shorter
> -  * list, we could see a termination bit
> -  * prematurely unless it fully inits the sg
> -  * table on each mapping. We KNOW that there
> -  * must be more entries here or the driver
> -  * would be buggy, so force clear the
> -  * termination bit to avoid doing a full
> -  * sg_init_table() in drivers for each command.
> -  */
> - sg_unmark_end(*sg);
> - *sg = sg_next(*sg);
> - }
> -
> - sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
> - (*nsegs)++;
> + (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
>   }
>   *bvprv = *bvec;
>  }
> @@ -530,7 +562,7 @@ static int __blk_bios_map_sg(struct request_queue *q, 
> struct bio *bio,
>   int cluster = blk_queue_cluster(q), nsegs = 0;
>  
>   for_each_bio(bio)
> - bio_for_each_segment(bvec, bio, iter)
> + bio_for_each_bvec(bvec, bio, iter)
>   __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
>&nsegs, &cluster);
>  
> -- 
> 2.9.5
> 



Re: [Cluster-devel] [PATCH V10 03/19] block: use bio_for_each_bvec() to compute multi-page bvec count

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:50PM +0800, Ming Lei wrote:
> First it is more efficient to use bio_for_each_bvec() in both
> blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how
> many multi-page bvecs there are in the bio.
> 
> Secondly once bio_for_each_bvec() is used, the bvec may need to be
> splitted because its length can be very longer than max segment size,
> so we have to split the big bvec into several segments.
> 
> Thirdly when splitting multi-page bvec into segments, the max segment
> limit may be reached, so the bio split need to be considered under
> this situation too.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com
> Signed-off-by: Ming Lei 
> ---
>  block/blk-merge.c | 90 
> ++-
>  1 file changed, 76 insertions(+), 14 deletions(-)
> 
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 91b2af332a84..6f7deb94a23f 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -160,6 +160,62 @@ static inline unsigned get_max_io_size(struct 
> request_queue *q,
>   return sectors;
>  }
>  
> +/*
> + * Split the bvec @bv into segments, and update all kinds of
> + * variables.
> + */
> +static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
> + unsigned *nsegs, unsigned *last_seg_size,
> + unsigned *front_seg_size, unsigned *sectors)
> +{
> + bool need_split = false;
> + unsigned len = bv->bv_len;
> + unsigned total_len = 0;
> + unsigned new_nsegs = 0, seg_size = 0;

"unsigned int" here and everywhere else.

> + if ((*nsegs >= queue_max_segments(q)) || !len)
> + return need_split;
> +
> + /*
> +  * Multipage bvec may be too big to hold in one segment,
> +  * so the current bvec has to be splitted as multiple
> +  * segments.
> +  */
> + while (new_nsegs + *nsegs < queue_max_segments(q)) {
> + seg_size = min(queue_max_segment_size(q), len);
> +
> + new_nsegs++;
> + total_len += seg_size;
> + len -= seg_size;
> +
> + if ((queue_virt_boundary(q) && ((bv->bv_offset +
> + total_len) & queue_virt_boundary(q))) || !len)
> + break;

Checking queue_virt_boundary(q) != 0 is superfluous, and the len check
could just control the loop, i.e.,

while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
seg_size = min(queue_max_segment_size(q), len);

new_nsegs++;
total_len += seg_size;
len -= seg_size;

if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
break;
}

And if you rewrite it this way, I _think_ you can get rid of this
special case:

if ((*nsegs >= queue_max_segments(q)) || !len)
return need_split;

above.

> + }
> +
> + /* split in the middle of the bvec */
> + if (len)
> + need_split = true;

need_split is unnecessary, just return len != 0.

> +
> + /* update front segment size */
> + if (!*nsegs) {
> + unsigned first_seg_size = seg_size;
> +
> + if (new_nsegs > 1)
> + first_seg_size = queue_max_segment_size(q);
> + if (*front_seg_size < first_seg_size)
> + *front_seg_size = first_seg_size;
> + }
> +
> + /* update other varibles */
> + *last_seg_size = seg_size;
> + *nsegs += new_nsegs;
> + if (sectors)
> + *sectors += total_len >> 9;
> +
> + return need_split;
> +}
> +
>  static struct bio *blk_bio_segment_split(struct request_queue *q,
>struct bio *bio,
>struct bio_set *bs,
> @@ -173,7 +229,7 @@ static struct bio *blk_bio_segment_split(struct 
> request_queue *q,
>   struct bio *new = NULL;
>   const unsigned max_sectors = get_max_io_size(q, bio);
>  
> - bio_for_each_segment(bv, bio, iter) {
> + bio_for_each_bvec(bv, bio, iter) {
>   /*
>* If the queue doesn't support SG gaps and adding this
>* offset would create a gap, disallow it.
> @@ -188,8 +244,12 @@ static struct bio *blk_bio_segment_split(struct 
> request_queue *q,
>*/
>   if (nsegs < queue_max_segments(q) &&
>   sectors < max_sectors) {
> -

Re: [Cluster-devel] [PATCH V10 02/19] block: introduce bio_for_each_bvec()

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:49PM +0800, Ming Lei wrote:
> This helper is used for iterating over multi-page bvec for bio
> split & merge code.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

One comment below.

> Signed-off-by: Ming Lei 
> ---
>  include/linux/bio.h  | 34 +++---
>  include/linux/bvec.h | 36 
>  2 files changed, 63 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 056fb627edb3..1f0dcf109841 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -76,6 +76,9 @@
>  #define bio_data_dir(bio) \
>   (op_is_write(bio_op(bio)) ? WRITE : READ)
>  
> +#define bio_iter_mp_iovec(bio, iter) \
> + mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))
> +
>  /*
>   * Check whether this bio carries any data or not. A NULL bio is allowed.
>   */
> @@ -135,18 +138,33 @@ static inline bool bio_full(struct bio *bio)
>  #define bio_for_each_segment_all(bvl, bio, i)
> \
>   for (i = 0, bvl = (bio)->bi_io_vec; i < (bio)->bi_vcnt; i++, bvl++)
>  
> -static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
> - unsigned bytes)
> +static inline void __bio_advance_iter(struct bio *bio, struct bvec_iter 
> *iter,
> +   unsigned bytes, bool mp)
>  {
>   iter->bi_sector += bytes >> 9;
>  
>   if (bio_no_advance_iter(bio))
>   iter->bi_size -= bytes;
>   else
> - bvec_iter_advance(bio->bi_io_vec, iter, bytes);
> + if (!mp)
> + bvec_iter_advance(bio->bi_io_vec, iter, bytes);
> + else
> + mp_bvec_iter_advance(bio->bi_io_vec, iter, bytes);

if (!foo) {} else {} hurts my brain, please do

if (mp)
mp_bvec_iter_advance(bio->bi_io_vec, iter, bytes);
else
bvec_iter_advance(bio->bi_io_vec, iter, bytes);



Re: [Cluster-devel] [PATCH V10 01/19] block: introduce multi-page page bvec helpers

2018-11-15 Thread Omar Sandoval
On Thu, Nov 15, 2018 at 04:52:48PM +0800, Ming Lei wrote:
> This patch introduces helpers of 'mp_bvec_iter_*' for multipage
> bvec support.
> 
> The introduced helpers treate one bvec as real multi-page segment,
> which may include more than one pages.
> 
> The existed helpers of bvec_iter_* are interfaces for supporting current
> bvec iterator which is thought as single-page by drivers, fs, dm and
> etc. These introduced helpers will build single-page bvec in flight, so
> this way won't break current bio/bvec users, which needn't any change.
> 
> Cc: Dave Chinner 
> Cc: Kent Overstreet 
> Cc: Mike Snitzer 
> Cc: dm-de...@redhat.com
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> Cc: Shaohua Li 
> Cc: linux-r...@vger.kernel.org
> Cc: linux-er...@lists.ozlabs.org
> Cc: David Sterba 
> Cc: linux-bt...@vger.kernel.org
> Cc: Darrick J. Wong 
> Cc: linux-...@vger.kernel.org
> Cc: Gao Xiang 
> Cc: Christoph Hellwig 
> Cc: Theodore Ts'o 
> Cc: linux-e...@vger.kernel.org
> Cc: Coly Li 
> Cc: linux-bca...@vger.kernel.org
> Cc: Boaz Harrosh 
> Cc: Bob Peterson 
> Cc: cluster-devel@redhat.com

Reviewed-by: Omar Sandoval 

But a couple of comments below.

> Signed-off-by: Ming Lei 
> ---
>  include/linux/bvec.h | 63 
> +---
>  1 file changed, 60 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bvec.h b/include/linux/bvec.h
> index 02c73c6aa805..8ef904a50577 100644
> --- a/include/linux/bvec.h
> +++ b/include/linux/bvec.h
> @@ -23,6 +23,44 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +
> +/*
> + * What is multi-page bvecs?
> + *
> + * - bvecs stored in bio->bi_io_vec is always multi-page(mp) style
> + *
> + * - bvec(struct bio_vec) represents one physically contiguous I/O
> + *   buffer, now the buffer may include more than one pages after
> + *   multi-page(mp) bvec is supported, and all these pages represented
> + *   by one bvec is physically contiguous. Before mp support, at most
> + *   one page is included in one bvec, we call it single-page(sp)
> + *   bvec.
> + *
> + * - .bv_page of the bvec represents the 1st page in the mp bvec
> + *
> + * - .bv_offset of the bvec represents offset of the buffer in the bvec
> + *
> + * The effect on the current drivers/filesystem/dm/bcache/...:
> + *
> + * - almost everyone supposes that one bvec only includes one single
> + *   page, so we keep the sp interface not changed, for example,
> + *   bio_for_each_segment() still returns bvec with single page
> + *
> + * - bio_for_each_segment*() will be changed to return single-page
> + *   bvec too
> + *
> + * - during iterating, iterator variable(struct bvec_iter) is always
> + *   updated in multipage bvec style and that means bvec_iter_advance()
> + *   is kept not changed
> + *
> + * - returned(copied) single-page bvec is built in flight by bvec
> + *   helpers from the stored multipage bvec
> + *
> + * - In case that some components(such as iov_iter) need to support
> + *   multi-page bvec, we introduce new helpers(mp_bvec_iter_*) for
> + *   them.
> + */

This comment sounds more like a commit message (i.e., how were things
before, and how are we changing them). In a couple of years when I read
this code, I probably won't care how it was changed, just how it works.
So I think a comment explaining the concepts of multi-page and
single-page bvecs is very useful, but please move all of the "foo was
changed" and "before mp support" type stuff to the commit message.

>  /*
>   * was unsigned short, but we might as well be ready for > 64kB I/O pages
> @@ -50,16 +88,35 @@ struct bvec_iter {
>   */
>  #define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx])
>  
> -#define bvec_iter_page(bvec, iter)   \
> +#define mp_bvec_iter_page(bvec, iter)\
>   (__bvec_iter_bvec((bvec), (iter))->bv_page)
>  
> -#define bvec_iter_len(bvec, iter)\
> +#define mp_bvec_iter_len(bvec, iter) \
>   min((iter).bi_size, \
>   __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
>  
> -#define bvec_iter_offset(bvec, iter) \
> +#define mp_bvec_iter_offset(bvec, iter)  \
>   (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
>  
> +#define mp_bvec_iter_page_idx(bvec, iter)\
> + (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
> +
> +/*
> + *  of single-page(sp) segment.
> + *
> + * This helpers are for building sp bvec in flight.
>

Re: [Cluster-devel] [RFC PATCH 1/5] new helper: iov_iter_rw()

2015-03-17 Thread Omar Sandoval
On Tue, Mar 17, 2015 at 10:31:51AM +0100, David Sterba wrote:
 On Mon, Mar 16, 2015 at 05:36:05PM +, Al Viro wrote:
  On Mon, Mar 16, 2015 at 04:33:49AM -0700, Omar Sandoval wrote:
Get either READ or WRITE out of iter->type.
  
  Umm...  
  
 + * Get one of READ or WRITE out of iter->type without any other flags 
   OR'd in
   + * with it.
   + */
   +static inline int iov_iter_rw(const struct iov_iter *i)
   +{
   + return i->type & RW_MASK;
   +}
  
  TBH, I would turn that into a macro.  Reason: indirect includes.
 
 Agreed, but the proposed define is rather cryptic and I was not able to
 understand the meaning on the first glance.
 
 #define iov_iter_rw(i) ((0 ? (struct iov_iter *)0 : (i))->type & RW_MASK)
 
 This worked for me, does not compile with anything else than
 'struct iov_iter*' as i:
 
 #define iov_iter_rw(i)({  \
   struct iov_iter __iter = *(i);  \
  (i)->type & RW_MASK;\
   })
 
 The assignment is optimized out.

[-cc individual fs maintainers to avoid all of these email bounces,
should've looked a bit closer at that get_maintainer.pl output...]

I agree that this is a bit more readable, but it evaluates i twice.
That's an easy fix, just do __iter.type instead of (i)->type, but
there's still the possibility of someone passing in something called
__iter as i, and the fix for that tends to be add more underscores. At
the very least, Al's macro could probably use a comment explaining
what's going on there, though.

-- 
Omar



[Cluster-devel] [RFC PATCH 5/5] direct_IO: remove rw from a_ops->direct_IO()

2015-03-16 Thread Omar Sandoval
Now that no one is using rw, remove it completely.

Signed-off-by: Omar Sandoval osan...@osandov.com
---
 Documentation/filesystems/Locking  | 2 +-
 Documentation/filesystems/vfs.txt  | 2 +-
 drivers/staging/lustre/lustre/llite/rw26.c | 4 ++--
 fs/9p/vfs_addr.c   | 2 +-
 fs/affs/file.c | 3 +--
 fs/block_dev.c | 3 +--
 fs/btrfs/inode.c   | 4 ++--
 fs/ceph/addr.c | 3 +--
 fs/cifs/file.c | 3 +--
 fs/exofs/inode.c   | 4 ++--
 fs/ext2/inode.c| 3 +--
 fs/ext3/inode.c| 4 ++--
 fs/ext4/inode.c| 4 ++--
 fs/f2fs/data.c | 4 ++--
 fs/fat/inode.c | 3 +--
 fs/fuse/file.c | 3 +--
 fs/gfs2/aops.c | 4 ++--
 fs/hfs/inode.c | 4 ++--
 fs/hfsplus/inode.c | 4 ++--
 fs/jfs/inode.c | 4 ++--
 fs/nfs/direct.c| 2 +-
 fs/nilfs2/inode.c  | 3 +--
 fs/ocfs2/aops.c| 4 +---
 fs/reiserfs/inode.c| 4 ++--
 fs/udf/file.c  | 3 +--
 fs/udf/inode.c | 3 +--
 fs/xfs/xfs_aops.c  | 1 -
 include/linux/fs.h | 2 +-
 include/linux/nfs_fs.h | 2 +-
 mm/filemap.c   | 4 ++--
 mm/page_io.c   | 4 +---
 31 files changed, 42 insertions(+), 57 deletions(-)

diff --git a/Documentation/filesystems/Locking 
b/Documentation/filesystems/Locking
index f91926f..b38abaf 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -196,7 +196,7 @@ prototypes:
void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
-   int (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t 
offset);
+   int (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
int (*migratepage)(struct address_space *, struct page *, struct page 
*);
int (*launder_page)(struct page *);
int (*is_partially_uptodate)(struct page *, unsigned long, unsigned 
long);
diff --git a/Documentation/filesystems/vfs.txt 
b/Documentation/filesystems/vfs.txt
index 966b228..d8ebc3c 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -590,7 +590,7 @@ struct address_space_operations {
void (*invalidatepage) (struct page *, unsigned int, unsigned int);
int (*releasepage) (struct page *, int);
void (*freepage)(struct page *);
-   ssize_t (*direct_IO)(int, struct kiocb *, struct iov_iter *iter, loff_t 
offset);
+   ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t 
offset);
/* migrate the contents of a page to the specified target */
int (*migratepage) (struct page *, struct page *);
int (*launder_page) (struct page *);
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c 
b/drivers/staging/lustre/lustre/llite/rw26.c
index 3aa9de6..0d7ce6b 100644
--- a/drivers/staging/lustre/lustre/llite/rw26.c
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -359,8 +359,8 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env 
*env, struct cl_io *io,
  * up to 22MB for 128kB kmalloc and up to 682MB for 4MB kmalloc. */
 #define MAX_DIO_SIZE ((MAX_MALLOC / sizeof(struct brw_page) * PAGE_CACHE_SIZE) 
 \
  ~(DT_MAX_BRW_SIZE - 1))
-static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
-  struct iov_iter *iter, loff_t file_offset)
+static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
+  loff_t file_offset)
 {
struct lu_env *env;
struct cl_io *io;
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index eb14e05..b298a90 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -259,7 +259,7 @@ static int v9fs_launder_page(struct page *page)
  *
  */
 static ssize_t
-v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 {
/*
 * FIXME
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0314acd..ef5dcb4 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -389,8 +389,7 @@ static void affs_write_failed(struct address_space 
*mapping, loff_t to)
 }
 
 static ssize_t
-affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
-  loff_t offset)
+affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 {
struct file *file = iocb->ki_filp

[Cluster-devel] [RFC PATCH 4/5] direct_IO: use iov_iter_rw() instead of rw everywhere

2015-03-16 Thread Omar Sandoval
The rw parameter to direct_IO is redundant with iov_iter->type, and
treated slightly differently just about everywhere it's used: some users
do rw & WRITE, and others do rw == WRITE where they should be doing a
bitwise check. Simplify this with the new iov_iter_rw() helper, which
always returns either READ or WRITE.

Signed-off-by: Omar Sandoval osan...@osandov.com
---
 drivers/staging/lustre/lustre/llite/rw26.c | 18 +-
 fs/affs/file.c |  4 ++--
 fs/btrfs/inode.c   | 12 ++--
 fs/ext2/inode.c|  2 +-
 fs/ext3/inode.c|  8 
 fs/ext4/ext4.h |  4 ++--
 fs/ext4/indirect.c | 10 +-
 fs/ext4/inode.c| 20 ++--
 fs/f2fs/data.c | 16 
 fs/fat/inode.c |  4 ++--
 fs/fuse/file.c | 13 +++--
 fs/gfs2/aops.c |  7 +++
 fs/hfs/inode.c |  2 +-
 fs/hfsplus/inode.c |  2 +-
 fs/jfs/inode.c |  2 +-
 fs/nfs/direct.c|  2 +-
 fs/nilfs2/inode.c  |  4 ++--
 fs/ocfs2/aops.c|  2 +-
 fs/reiserfs/inode.c|  2 +-
 fs/udf/inode.c |  2 +-
 fs/xfs/xfs_aops.c  |  2 +-
 21 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/drivers/staging/lustre/lustre/llite/rw26.c 
b/drivers/staging/lustre/lustre/llite/rw26.c
index 2f21304..3aa9de6 100644
--- a/drivers/staging/lustre/lustre/llite/rw26.c
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -399,7 +399,7 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
 *size changing by concurrent truncates and writes.
 * 1. Need inode mutex to operate transient pages.
 */
-   if (rw == READ)
+   if (iov_iter_rw(iter) == READ)
mutex_lock(inode->i_mutex);
 
LASSERT(obj->cob_transient_pages == 0);
@@ -408,7 +408,7 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
size_t offs;
 
count = min_t(size_t, iov_iter_count(iter), size);
-   if (rw == READ) {
+   if (iov_iter_rw(iter) == READ) {
if (file_offset >= i_size_read(inode))
break;
if (file_offset + count > i_size_read(inode))
@@ -418,11 +418,11 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
result = iov_iter_get_pages_alloc(iter, &pages, count, &offs);
if (likely(result > 0)) {
int n = DIV_ROUND_UP(result + offs, PAGE_SIZE);
-   result = ll_direct_IO_26_seg(env, io, rw, inode,
-file->f_mapping,
-result, file_offset,
-pages, n);
-   ll_free_user_pages(pages, n, rw==READ);
+   result = ll_direct_IO_26_seg(env, io, iov_iter_rw(iter),
+inode, file->f_mapping,
+result, file_offset, pages,
+n);
+   ll_free_user_pages(pages, n, iov_iter_rw(iter) == READ);
}
if (unlikely(result <= 0)) {
/* If we can't allocate a large enough buffer
@@ -449,11 +449,11 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb,
}
 out:
LASSERT(obj->cob_transient_pages == 0);
-   if (rw == READ)
+   if (iov_iter_rw(iter) == READ)
mutex_unlock(inode->i_mutex);
 
if (tot_bytes > 0) {
-   if (rw == WRITE) {
+   if (iov_iter_rw(iter) == WRITE) {
struct lov_stripe_md *lsm;
 
lsm = ccc_inode_lsm_get(inode);
diff --git a/fs/affs/file.c b/fs/affs/file.c
index f3028aa..0314acd 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -398,7 +398,7 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter 
*iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
 
-   if (rw == WRITE) {
+   if (iov_iter_rw(iter) == WRITE) {
loff_t size = offset + count;
 
if (AFFS_I(inode)->mmu_private < size)
@@ -406,7 +406,7 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter 
*iter,
}
 
ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
-   if (ret < 0 && (rw & WRITE))
+   if (ret < 0 && iov_iter_rw(iter) == WRITE)
affs_write_failed(mapping, offset + count

[Cluster-devel] [RFC PATCH 1/5] new helper: iov_iter_rw()

2015-03-16 Thread Omar Sandoval
Get either READ or WRITE out of iter->type.

Signed-off-by: Omar Sandoval osan...@osandov.com
---
 include/linux/uio.h | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 7188029..87a47b3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -10,6 +10,7 @@
 #define __LINUX_UIO_H
 
 #include <linux/kernel.h>
+#include <linux/fs.h>
 #include <uapi/linux/uio.h>
 
 struct page;
@@ -111,6 +112,15 @@ static inline bool iter_is_iovec(struct iov_iter *i)
 }
 
 /*
+ * Get one of READ or WRITE out of iter->type without any other flags OR'd in
+ * with it.
+ */
+static inline int iov_iter_rw(const struct iov_iter *i)
+{
+   return i->type & RW_MASK;
+}
+
+/*
  * Cap the iov_iter by given limit; note that the second argument is
  * *not* the new size - it's upper limit for such.  Passing it a value
  * greater than the amount of data in iov_iter is fine - it'll just do
-- 
2.3.3



[Cluster-devel] [RFC PATCH 0/5] Remove rw parameter from direct_IO()

2015-03-16 Thread Omar Sandoval
Hi,

Al, here's some cleanup that you mentioned back in December that I got
around to (https://lkml.org/lkml/2014/12/15/28).

In summary, the rw parameter to a_ops-direct_IO() is redundant with
.type in struct iov_iter. Additionally, rw is inconsistently checked for
being a WRITE; some filesystems do rw == WRITE, others do rw & WRITE,
and others do both within the same function :) The distinction is that
swapout may OR in the ITER_BVEC flag in the rw passed to ->direct_IO(),
so the two are not equivalent (although this really only happens for
swap-over-NFS, but it's scary nonetheless). After looking through all of
these, it definitely looks like every check means for ANY write, not
just non-kernel writes.

So, the solution presented here is:

- Add a helper, iov_iter_rw(), which always returns either READ or
  WRITE, no ITER_.* or REQ_.* nonsense mixed in. For consistency, the
  return value is always checked for equality
- Get rid of all uses of rw in any implementations of direct_IO,
  starting with the generic code
- Nuke the actual parameter and update the documentation

I decided to squish all of the filesystems together in patch 4 to avoid
inundating the mailing lists with 20+ mostly two-line patches, but I can
split those out if that's any better. Additionally, patch 1 pulls fs.h
into uio.h, which seems undesirable.

These were mostly just compile tested, with a couple of direct I/O
xfstests run on btrfs as quick sanity check, so getting some more eyes
on is probably a good thing. They should apply on top of v4.0-rc4.
Please comment away.

Thank you,

Omar Sandoval (5):
  new helper: iov_iter_rw()
  Remove rw from {,__,do_}blockdev_direct_IO()
  Remove rw from dax_{do_,}io()
  direct_IO: use iov_iter_rw() instead of rw everywhere
  direct_IO: remove rw from a_ops->direct_IO()

 Documentation/filesystems/Locking  |  2 +-
 Documentation/filesystems/vfs.txt  |  2 +-
 drivers/staging/lustre/lustre/llite/rw26.c | 22 -
 fs/9p/vfs_addr.c   |  2 +-
 fs/affs/file.c |  9 +++
 fs/block_dev.c |  8 +++---
 fs/btrfs/inode.c   | 24 +-
 fs/ceph/addr.c |  3 +--
 fs/cifs/file.c |  3 +--
 fs/dax.c   | 27 ++---
 fs/direct-io.c | 39 ++
 fs/exofs/inode.c   |  4 +--
 fs/ext2/inode.c| 11 -
 fs/ext3/inode.c| 14 +--
 fs/ext4/ext4.h |  4 +--
 fs/ext4/indirect.c | 25 ++-
 fs/ext4/inode.c| 28 ++---
 fs/f2fs/data.c | 22 -
 fs/fat/inode.c |  9 +++
 fs/fuse/file.c | 16 ++--
 fs/gfs2/aops.c | 16 ++--
 fs/hfs/inode.c |  8 +++---
 fs/hfsplus/inode.c |  9 +++
 fs/jfs/inode.c |  8 +++---
 fs/nfs/direct.c|  4 +--
 fs/nilfs2/inode.c  | 10 +++-
 fs/ocfs2/aops.c| 22 +++--
 fs/reiserfs/inode.c|  8 +++---
 fs/udf/file.c  |  3 +--
 fs/udf/inode.c |  7 +++---
 fs/xfs/xfs_aops.c  | 12 -
 include/linux/fs.h | 24 +-
 include/linux/nfs_fs.h |  2 +-
 include/linux/uio.h| 10 
 mm/filemap.c   |  4 +--
 mm/page_io.c   |  4 +--
 36 files changed, 206 insertions(+), 219 deletions(-)

-- 
2.3.3



[Cluster-devel] [RFC PATCH 3/5] Remove rw from dax_{do_,}io()

2015-03-16 Thread Omar Sandoval
And use iov_iter_rw() instead.

Signed-off-by: Omar Sandoval osan...@osandov.com
---
 fs/dax.c   | 27 +--
 fs/ext2/inode.c|  4 ++--
 fs/ext4/indirect.c |  4 ++--
 fs/ext4/inode.c|  2 +-
 include/linux/fs.h |  4 ++--
 5 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index ed1619e..a278469 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -98,9 +98,9 @@ static bool buffer_size_valid(struct buffer_head *bh)
	return bh->b_state != 0;
 }
 
-static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
-   loff_t start, loff_t end, get_block_t get_block,
-   struct buffer_head *bh)
+static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
+ loff_t start, loff_t end, get_block_t get_block,
+ struct buffer_head *bh)
 {
ssize_t retval = 0;
loff_t pos = start;
@@ -109,7 +109,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct 
iov_iter *iter,
void *addr;
bool hole = false;
 
-   if (rw != WRITE)
+   if (iov_iter_rw(iter) != WRITE)
end = min(end, i_size_read(inode));
 
	while (pos < end) {
@@ -124,7 +124,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct 
iov_iter *iter,
	bh->b_size = PAGE_ALIGN(end - pos);
	bh->b_state = 0;
retval = get_block(inode, block, bh,
-   rw == WRITE);
+  iov_iter_rw(iter) == WRITE);
if (retval)
break;
if (!buffer_size_valid(bh))
@@ -137,7 +137,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct 
iov_iter *iter,
	bh->b_size -= done;
}
 
-   hole = (rw != WRITE) && !buffer_written(bh);
+   hole = iov_iter_rw(iter) != WRITE &&
!buffer_written(bh);
if (hole) {
addr = NULL;
	size = bh->b_size - first;
@@ -154,7 +154,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct 
iov_iter *iter,
max = min(pos + size, end);
}
 
-   if (rw == WRITE)
+   if (iov_iter_rw(iter) == WRITE)
len = copy_from_iter(addr, max - pos, iter);
else if (!hole)
len = copy_to_iter(addr, max - pos, iter);
@@ -173,7 +173,6 @@ static ssize_t dax_io(int rw, struct inode *inode, struct 
iov_iter *iter,
 
 /**
  * dax_do_io - Perform I/O to a DAX file
- * @rw: READ to read or WRITE to write
  * @iocb: The control block for this I/O
  * @inode: The file which the I/O is directed at
  * @iter: The addresses to do I/O from or to
@@ -189,9 +188,9 @@ static ssize_t dax_io(int rw, struct inode *inode, struct 
iov_iter *iter,
  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
  * is in progress.
  */
-ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
-   struct iov_iter *iter, loff_t pos,
-   get_block_t get_block, dio_iodone_t end_io, int flags)
+ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
+ struct iov_iter *iter, loff_t pos, get_block_t get_block,
+ dio_iodone_t end_io, int flags)
 {
struct buffer_head bh;
ssize_t retval = -EINVAL;
@@ -199,7 +198,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode 
*inode,
 
	memset(&bh, 0, sizeof(bh));
 
-   if ((flags & DIO_LOCKING) && (rw == READ)) {
+   if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
		struct address_space *mapping = inode->i_mapping;
		mutex_lock(&inode->i_mutex);
retval = filemap_write_and_wait_range(mapping, pos, end - 1);
@@ -212,9 +211,9 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode 
*inode,
/* Protects against truncate */
	atomic_inc(&inode->i_dio_count);
 
-   retval = dax_io(rw, inode, iter, pos, end, get_block, bh);
+   retval = dax_io(inode, iter, pos, end, get_block, bh);
 
-   if ((flags & DIO_LOCKING) && (rw == READ))
+   if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
		mutex_unlock(&inode->i_mutex);
 
	if ((retval > 0) && end_io)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 00979a6..cf95bda 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -861,8 +861,8 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter 
*iter,
ssize_t ret;
 
if (IS_DAX(inode))
-   ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
-   NULL, DIO_LOCKING

[Cluster-devel] [RFC PATCH 2/5] Remove rw from {, __, do_}blockdev_direct_IO()

2015-03-16 Thread Omar Sandoval
Most filesystems call through to these at some point, so we'll start
here.

Signed-off-by: Omar Sandoval osan...@osandov.com
---
 fs/affs/file.c  |  2 +-
 fs/block_dev.c  |  5 ++---
 fs/btrfs/inode.c|  8 
 fs/direct-io.c  | 39 ++-
 fs/ext2/inode.c |  2 +-
 fs/ext3/inode.c |  2 +-
 fs/ext4/indirect.c  | 11 ++-
 fs/ext4/inode.c |  2 +-
 fs/f2fs/data.c  |  2 +-
 fs/fat/inode.c  |  2 +-
 fs/gfs2/aops.c  |  5 ++---
 fs/hfs/inode.c  |  2 +-
 fs/hfsplus/inode.c  |  3 +--
 fs/jfs/inode.c  |  2 +-
 fs/nilfs2/inode.c   |  3 +--
 fs/ocfs2/aops.c | 16 +++-
 fs/reiserfs/inode.c |  2 +-
 fs/udf/inode.c  |  2 +-
 fs/xfs/xfs_aops.c   |  9 -
 include/linux/fs.h  | 22 --
 20 files changed, 67 insertions(+), 74 deletions(-)

diff --git a/fs/affs/file.c b/fs/affs/file.c
index d2468bf..f3028aa 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -405,7 +405,7 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter 
*iter,
return 0;
}
 
-   ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
+   ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
	if (ret < 0 && (rw & WRITE))
affs_write_failed(mapping, offset + count);
return ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 975266b..e3a3125 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -153,9 +153,8 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, struct 
iov_iter *iter,
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
 
-   return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter,
-   offset, blkdev_get_block,
-   NULL, NULL, 0);
+   return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+   blkdev_get_block, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da828cf..d6413f4 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8119,10 +8119,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb 
*iocb,
wakeup = false;
}
 
-   ret = __blockdev_direct_IO(rw, iocb, inode,
-   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-   iter, offset, btrfs_get_blocks_direct, NULL,
-   btrfs_submit_direct, flags);
+   ret = __blockdev_direct_IO(iocb, inode,
+  
BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+  iter, offset, btrfs_get_blocks_direct, NULL,
+  btrfs_submit_direct, flags);
	if (rw & WRITE) {
		if (ret < 0 && ret != -EIOCBQUEUED)
btrfs_delalloc_release_space(inode, count);
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b..02a87d9 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1094,10 +1094,10 @@ static inline int drop_refcount(struct dio *dio)
  * for the whole file.
  */
 static inline ssize_t
-do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
-   struct block_device *bdev, struct iov_iter *iter, loff_t offset, 
-   get_block_t get_block, dio_iodone_t end_io,
-   dio_submit_t submit_io, int flags)
+do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
+ struct block_device *bdev, struct iov_iter *iter,
+ loff_t offset, get_block_t get_block, dio_iodone_t end_io,
+ dio_submit_t submit_io, int flags)
 {
	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
@@ -,9 +,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct 
inode *inode,
struct blk_plug plug;
unsigned long align = offset | iov_iter_alignment(iter);
 
-   if (rw & WRITE)
-   rw = WRITE_ODIRECT;
-
/*
 * Avoid references to bdev if not absolutely needed to give
 * the early prefetch in the caller enough time.
@@ -1128,7 +1125,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct 
inode *inode,
}
 
/* watch out for a 0 len io from a tricksy fs */
-   if (rw == READ && !iov_iter_count(iter))
+   if (iov_iter_rw(iter) == READ && !iov_iter_count(iter))
return 0;
 
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
@@ -1144,7 +1141,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct 
inode *inode,
 
	dio->flags = flags;
	if (dio->flags & DIO_LOCKING) {
-   if (rw == READ) {
+   if (iov_iter_rw(iter) == READ) {
			struct address_space *mapping =
				iocb->ki_filp->f_mapping;
 
@@ -1170,19 +1167,19