[Cluster-devel] [PATCH 10/11] gfs2: Clean up assertion, consistency check, and error reporting functions

2019-01-11 Thread Andreas Gruenbacher
Instead of passing __func__, __FILE__, and __LINE__ to each of those
functions, print the location of the caller via:

  printk(%pS", (void *)_RET_IP_).

This gives enough context information to locate where in the code an error
occurred, and reduces the code size by about 2 percent.
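
Editorial aside (not part of the patch): _RET_IP_ expands to
__builtin_return_address(0), and the "%pS" printk format resolves that
address to symbol+offset.  A minimal sketch of the technique; the helper
name is made up, and noinline keeps the return address pointing at the
direct caller:

  #include <linux/kernel.h>
  #include <linux/printk.h>

  static noinline void report_failure(const char *what)
  {
          /* prints e.g. "failure in rgrp at gfs2_rgrp_bh_get+0x8c/0x1a0" */
          printk(KERN_ERR "failure in %s at %pS\n",
                 what, (void *)_RET_IP_);
  }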

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/util.c | 107 -
 fs/gfs2/util.h |  71 +---
 2 files changed, 62 insertions(+), 116 deletions(-)

diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 0a814ccac41d2..b4c72bb799052 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -88,14 +88,12 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, const char *fmt, ...)
  *  -2 if it was already withdrawn
  */
 
-int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
-  const char *function, char *file, unsigned int line)
+int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion)
 {
int me;
me = gfs2_lm_withdraw(sdp,
- "fatal: assertion \"%s\" failed\n"
- "   function = %s, file = %s, line = %u\n",
- assertion, function, file, line);
+ "fatal: assertion \"%s\" failed at %pS\n",
+ assertion, (void *)_RET_IP_);
dump_stack();
return (me) ? -1 : -2;
 }
@@ -106,8 +104,7 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
  *  -2 if we didn't
  */
 
-int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
-  const char *function, char *file, unsigned int line)
+int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion)
 {
if (time_before(jiffies,
sdp->sd_last_warning +
@@ -115,8 +112,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
return -2;
 
if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW)
-   fs_warn(sdp, "warning: assertion \"%s\" failed at function = %s, file = %s, line = %u\n",
-   assertion, function, file, line);
+   fs_warn(sdp, "warning: assertion \"%s\" failed at %pS\n",
+   assertion, (void *)_RET_IP_);
 
if (sdp->sd_args.ar_debug)
BUG();
@@ -124,10 +121,8 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
dump_stack();
 
if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC)
-   panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n"
- "GFS2: fsid=%s:   function = %s, file = %s, line = %u\n",
- sdp->sd_fsname, assertion,
- sdp->sd_fsname, function, file, line);
+   panic("GFS2: fsid=%s: warning: assertion \"%s\" failed at %pS\n",
+ sdp->sd_fsname, assertion, (void *)_RET_IP_);
 
sdp->sd_last_warning = jiffies;
 
@@ -135,61 +130,56 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion,
 }
 
 /**
- * gfs2_consist_i - Flag a filesystem consistency error and withdraw
+ * gfs2_consist - Flag a filesystem consistency error and withdraw
  * Returns: -1 if this call withdrew the machine,
  *  0 if it was already withdrawn
  */
 
-int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function,
-  char *file, unsigned int line)
+int gfs2_consist(struct gfs2_sbd *sdp)
 {
int rv;
rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error - function = %s, file = %s, line = %u\n",
- function, file, line);
+ "fatal: filesystem consistency error at %pS\n",
+ (void *)_RET_IP_);
return rv;
 }
 
 /**
- * gfs2_consist_inode_i - Flag an inode consistency error and withdraw
+ * gfs2_consist_inode - Flag an inode consistency error and withdraw
  * Returns: -1 if this call withdrew the machine,
  *  0 if it was already withdrawn
  */
 
-int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide,
-const char *function, char *file, unsigned int line)
+int gfs2_consist_inode(struct gfs2_inode *ip)
 {
 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
int rv;
rv = gfs2_lm_withdraw(sdp,
- "fatal: filesystem consistency error\n"
- "  inode = %llu %llu\n"
- "  function = %s, file = %s, line = %u\n",
+ "fatal: filesystem consistency error at %pS\n"
+ "  inode = %llu %llu\n",
+ (void *)_RET_IP_,
  (unsigned long long)ip->i_no_formal_ino,
- (unsigned long long)ip->i_no_addr,
- function, 

[Cluster-devel] [PATCH 00/11] Read bitmap buffers on demand

2019-01-11 Thread Andreas Gruenbacher
Here's a bunch of patches that have accumulated here lately.  The most
interesting one is likely "gfs2: Read bitmap buffers on demand" which
should make filesystems with large resource groups more efficient.

Thanks,
Andreas

Andreas Gruenbacher (11):
  gfs2: Fix setting GBF_FULL in gfs2_rbm_find
  gfs2: Rename minext => minlen
  gfs2: Don't use struct gfs2_rbm in struct gfs2_extent
  gfs2: Minor gfs2_free_extlen cleanup
  gfs2: Only use struct gfs2_rbm for bitmap manipulations
  gfs2: Minor gfs2_adjust_reservation and gfs2_alloc_extent cleanups
  gfs2: Minor gfs2_rgrp_send_discards cleanup
  gfs2: Add buffer head to struct gfs2_rgrpd
  gfs2: Read bitmap buffers on demand
  gfs2: Clean up assertion, consistency check, and error reporting
functions
  gfs2: Skip gfs2_metatype_check for cached buffers

 fs/gfs2/bmap.c   |   2 +-
 fs/gfs2/incore.h |  37 +--
 fs/gfs2/lops.c   |   3 +-
 fs/gfs2/meta_io.c|  10 +-
 fs/gfs2/rgrp.c   | 672 +--
 fs/gfs2/rgrp.h   |   3 +-
 fs/gfs2/trace_gfs2.h |  10 +-
 fs/gfs2/trans.h  |   2 +-
 fs/gfs2/util.c   | 128 -
 fs/gfs2/util.h   |  85 ++
 10 files changed, 509 insertions(+), 443 deletions(-)

-- 
2.20.1



[Cluster-devel] [PATCH 02/11] gfs2: Rename minext => minlen

2019-01-11 Thread Andreas Gruenbacher
Rename the minext parameters that represent extent lengths to minlen, to
avoid confusion with maxext, which is an actual extent.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/rgrp.c | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index cb87560c7bcbc..cfef8cc5fa155 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -73,7 +73,7 @@ static const char valid_change[16] = {
1, 0, 0, 0
 };
 
-static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minlen,
 const struct gfs2_inode *ip, bool nowrap);
 
 
@@ -1318,7 +1318,8 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
 
 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 struct buffer_head *bh,
-const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed)
+const struct gfs2_bitmap *bi, unsigned minlen,
+u64 *ptrimmed)
 {
struct super_block *sb = sdp->sd_vfs;
u64 blk;
@@ -1653,7 +1654,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
 * gfs2_reservation_check_and_update - Check for reservations during block alloc
  * @rbm: The current position in the resource group
  * @ip: The inode for which we are searching for blocks
- * @minext: The minimum extent length
+ * @minlen: The minimum extent length
  * @maxext: A pointer to the maximum extent structure
  *
  * This checks the current position in the rgrp to see whether there is
@@ -1667,7 +1668,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
 
 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
 const struct gfs2_inode *ip,
-u32 minext,
+u32 minlen,
 struct gfs2_extent *maxext)
 {
u64 block = gfs2_rbm_to_block(rbm);
@@ -1679,8 +1680,8 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
 * If we have a minimum extent length, then skip over any extent
 * which is less than the min extent length in size.
 */
-   if (minext) {
-   extlen = gfs2_free_extlen(rbm, minext);
+   if (minlen) {
+   extlen = gfs2_free_extlen(rbm, minlen);
if (extlen <= maxext->len)
goto fail;
}
@@ -1691,7 +1692,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
 */
nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
if (nblock == block) {
-   if (!minext || extlen >= minext)
+   if (!minlen || extlen >= minlen)
return 0;
 
if (extlen > maxext->len) {
@@ -1711,7 +1712,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
  * gfs2_rbm_find - Look for blocks of a particular state
  * @rbm: Value/result starting position and final position
  * @state: The state which we want to find
- * @minext: Pointer to the requested extent length (NULL for a single block)
+ * @minlen: Pointer to the requested extent length (NULL for a single block)
  *  This is updated to be the actual reservation size.
  * @ip: If set, check for reservations
  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
@@ -1726,7 +1727,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
  */
 
-static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minlen,
 const struct gfs2_inode *ip, bool nowrap)
 {
struct buffer_head *bh;
@@ -1772,7 +1773,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
 
initial_bii = rbm->bii;
ret = gfs2_reservation_check_and_update(rbm, ip,
-   minext ? *minext : 0,
+   minlen ? *minlen : 0,
 &maxext);
if (ret == 0)
return 0;
@@ -1802,21 +1803,21 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
break;
}
 
-   if (minext == NULL || state != GFS2_BLKST_FREE)
+   if (minlen == NULL || state != GFS2_BLKST_FREE)
return -ENOSPC;
 
/* If the extent was too small, and it's smaller than the smallest
   to have failed before, remember for future reference that it's
   useless to search this rgrp again for this amount or more. */
 

[Cluster-devel] [PATCH 01/11] gfs2: Fix setting GBF_FULL in gfs2_rbm_find

2019-01-11 Thread Andreas Gruenbacher
In gfs2_rbm_find, set the GBF_FULL flag whenever there are no free
blocks in an entire bitmap.  Setting the flag requires that gfs2_bitfit
has scanned the entire bitmap, which is the case exactly when the search
starts at offset 0; it does not depend on initial_offset, so that
variable can be dropped.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/rgrp.c | 13 +
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 831d7cb5a49c4..cb87560c7bcbc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1731,7 +1731,6 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
 {
struct buffer_head *bh;
int initial_bii;
-   u32 initial_offset;
int first_bii = rbm->bii;
u32 first_offset = rbm->offset;
u32 offset;
@@ -1761,10 +1760,12 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
WARN_ON(!buffer_uptodate(bh));
if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
-   initial_offset = rbm->offset;
offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
-   if (offset == BFITNOENT)
-   goto bitmap_full;
+   if (offset == BFITNOENT) {
+   if (state == GFS2_BLKST_FREE && rbm->offset == 0)
+   set_bit(GBF_FULL, &bi->bi_flags);
+   goto next_bitmap;
+   }
rbm->offset = offset;
if (ip == NULL)
return 0;
@@ -1787,10 +1788,6 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
}
return ret;
 
-bitmap_full:   /* Mark bitmap as full and fall through */
-   if ((state == GFS2_BLKST_FREE) && initial_offset == 0)
-   set_bit(GBF_FULL, &bi->bi_flags);
-
 next_bitmap:   /* Find next bitmap in the rgrp */
rbm->offset = 0;
rbm->bii++;
-- 
2.20.1



[Cluster-devel] [PATCH 03/11] gfs2: Don't use struct gfs2_rbm in struct gfs2_extent

2019-01-11 Thread Andreas Gruenbacher
Don't use struct gfs2_rbm in struct gfs2_extent: the extent will always
be in the same resource group, so there is no need to track the resource
group separately.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/rgrp.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index cfef8cc5fa155..2ce46e372b020 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -61,7 +61,8 @@
  */
 
 struct gfs2_extent {
-   struct gfs2_rbm rbm;
+   int bii;
+   u32 offset;
u32 len;
 };
 
@@ -1696,8 +1697,9 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
return 0;
 
if (extlen > maxext->len) {
+   maxext->bii = rbm->bii;
+   maxext->offset = rbm->offset;
maxext->len = extlen;
-   maxext->rbm = *rbm;
}
 fail:
nblock = block + extlen;
@@ -1740,7 +1742,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minlen,
int iters = rbm->rgd->rd_length;
int ret;
struct gfs2_bitmap *bi;
-   struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
+   struct gfs2_extent maxext = { };
 
/* If we are not starting at the beginning of a bitmap, then we
 * need to add one to the bitmap count to ensure that we search
@@ -1816,7 +1818,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minlen,
/* If the maximum extent we found is big enough to fulfill the
   minimum requirements, use it anyway. */
if (maxext.len) {
-   *rbm = maxext.rbm;
+   rbm->bii = maxext.bii;
+   rbm->offset = maxext.offset;
*minlen = maxext.len;
return 0;
}
-- 
2.20.1



[Cluster-devel] [PATCH 06/11] gfs2: Minor gfs2_adjust_reservation and gfs2_alloc_extent cleanups

2019-01-11 Thread Andreas Gruenbacher
Pass the resource group and starting position to gfs2_adjust_reservation
directly instead of passing an rbm.  With this change, gfs2_alloc_blocks
no longer needs the rbm after calling gfs2_alloc_extent, so
gfs2_alloc_extent can modify the rbm instead of creating a copy.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/rgrp.c | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 14ffa86b6b1c5..7a93568fdc709 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2207,34 +2207,32 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
 
 /**
  * gfs2_alloc_extent - allocate an extent from a given bitmap
- * @rbm: the resource group information
+ * @rbm: The position in the resource group
  * @dinode: TRUE if the first block we allocate is for a dinode
  * @n: The extent length (value/result)
  *
+ * Side effects:
+ * - Modifies rbm.
+ *
  * Add the bitmap buffer to the transaction.
  * Set the found bits to @new_state to change block's allocation state.
  */
-static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
+static void gfs2_alloc_extent(struct gfs2_rbm *rbm, bool dinode,
 unsigned int *n)
 {
-   struct gfs2_rbm pos = { .rgd = rbm->rgd, };
const unsigned int elen = *n;
-   u64 block;
int ret;
 
*n = 1;
-   block = gfs2_rbm_to_block(rbm);
gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
-   block++;
while (*n < elen) {
-   ret = gfs2_rbm_from_block(&pos, block);
-   if (ret || gfs2_testbit(&pos, true) != GFS2_BLKST_FREE)
+   ret = gfs2_rbm_incr(rbm);
+   if (ret || gfs2_testbit(rbm, true) != GFS2_BLKST_FREE)
break;
-   gfs2_trans_add_meta(pos.rgd->rd_gl, rbm_bi(&pos)->bi_bh);
-   gfs2_setbit(&pos, true, GFS2_BLKST_USED);
+   gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm_bi(rbm)->bi_bh);
+   gfs2_setbit(rbm, true, GFS2_BLKST_USED);
(*n)++;
-   block++;
}
 }
 
@@ -2322,7 +2320,8 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 /**
  * gfs2_adjust_reservation - Adjust (or remove) a reservation after allocation
  * @ip: The inode we have just allocated blocks for
- * @rbm: The start of the allocated blocks
+ * @rgd: The resource group
+ * @start: The start of the reservation
  * @len: The extent length
  *
  * Adjusts a reservation after an allocation has taken place. If the
@@ -2331,15 +2330,13 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
  */
 
 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
-   const struct gfs2_rbm *rbm, unsigned len)
+   struct gfs2_rgrpd *rgd,
+   u64 start, unsigned len)
 {
 struct gfs2_blkreserv *rs = &ip->i_res;
-   struct gfs2_rgrpd *rgd = rbm->rgd;
 
 spin_lock(&rgd->rd_rsspin);
if (gfs2_rs_active(rs)) {
-   u64 start = gfs2_rbm_to_block(rbm);
-
if (rs->rs_start == start) {
unsigned int rlen;
 
@@ -2429,11 +2426,11 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
goto rgrp_error;
}
 
-   gfs2_alloc_extent(&rbm, dinode, nblocks);
 block = gfs2_rbm_to_block(&rbm);
+   gfs2_alloc_extent(&rbm, dinode, nblocks);
rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
 if (gfs2_rs_active(&ip->i_res))
-   gfs2_adjust_reservation(ip, &rbm, *nblocks);
+   gfs2_adjust_reservation(ip, rbm.rgd, block, *nblocks);
ndata = *nblocks;
if (dinode)
ndata--;
-- 
2.20.1



[Cluster-devel] [PATCH 09/11] gfs2: Read bitmap buffers on demand

2019-01-11 Thread Andreas Gruenbacher
Before this patch, when locking a resource group, gfs2 would read in the
resource group header and all the bitmap buffers of the resource group.
Those buffers would then be locked into memory until the resource group
is unlocked, which will happen when the filesystem is unmounted or when
transferring the resource group lock to another node, but not due to
memory pressure.  Larger resource groups lock more buffers into memory,
and cause more unnecessary I/O when resource group locks are transferred
between nodes.

With this patch, when locking a resource group, only the resource group
header is read in.  The other bitmap buffers (the resource group header
contains part of the bitmap) are only read in on demand.

It would probably make sense to read in the resource group header on
demand as well, namely when the resource group is first modified, but
since the header contains the number of free blocks in the resource
group, there is a stronger incentive to keep the header cached in
memory once it has been read.
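
Editorial aside (not part of the patch): the caller-side pattern this
implies, sketched under the assumption that a release helper exists
alongside the gfs2_rbm_get()/__gfs2_rbm_get() helpers visible in the
diff below; gfs2_rbm_put() is an illustrative name:

  struct gfs2_rbm rbm = { .rgd = rgd, };
  int error;

  /* reads the bitmap block on demand unless @block falls in the header */
  error = gfs2_rbm_get(&rbm, rgd, block);
  if (error)
          return error;
  /* ... inspect or modify the bitmap through rbm.bh->b_data ... */
  gfs2_rbm_put(&rbm);  /* assumed: brelse() any non-header buffer */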

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/incore.h |   1 -
 fs/gfs2/rgrp.c   | 382 ++-
 2 files changed, 241 insertions(+), 142 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 51eb41484e9af..d39c26b950121 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -87,7 +87,6 @@ struct gfs2_log_operations {
  * be reallocated in that same transaction.
  */
 struct gfs2_bitmap {
-   struct buffer_head *bi_bh;
char *bi_clone;
unsigned long bi_flags;
u32 bi_offset;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 339d6b064f1fc..503ea6f18ed74 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -62,6 +62,7 @@
 
 struct gfs2_rbm {
struct gfs2_rgrpd *rgd;
+   struct buffer_head *bh;
u32 offset; /* The offset is bitmap relative */
int bii;/* Bitmap index */
 };
@@ -112,8 +113,8 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
unsigned int buflen = bi->bi_bytes;
const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
 
-   byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
-   end = bi->bi_bh->b_data + bi->bi_offset + buflen;
+   byte1 = rbm->bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
+   end = rbm->bh->b_data + bi->bi_offset + buflen;
 
BUG_ON(byte1 >= end);
 
@@ -126,7 +127,7 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
rbm->offset, cur_state, new_state);
fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
(unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
-   (unsigned long long)bi->bi_bh->b_blocknr);
+   (unsigned long long)rbm->bh->b_blocknr);
fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
bi->bi_offset, bi->bi_bytes,
(unsigned long long)gfs2_rbm_to_block(rbm));
@@ -164,7 +165,7 @@ static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
if (use_clone && bi->bi_clone)
buffer = bi->bi_clone;
else
-   buffer = bi->bi_bh->b_data;
+   buffer = rbm->bh->b_data;
buffer += bi->bi_offset;
byte = buffer + (rbm->offset / GFS2_NBBY);
bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
@@ -276,62 +277,152 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
 }
 
 /**
- * gfs2_rbm_from_block - Set the rbm based upon rgd and block number
- * @rbm: The rbm with rgd already set correctly
+ * __gfs2_rbm_get - Get the buffer head of @rbm
+ * @rbm: The rbm
+ *
+ * Get the buffer head of the bitmap block the rbm points at.
+ *
+ * Returns: 0 on success, or an error code
+ */
+static int __gfs2_rbm_get(struct gfs2_rbm *rbm) {
+   struct gfs2_rgrpd *rgd = rbm->rgd;
+   struct buffer_head *bh;
+   int error;
+
+   if (rbm->bii == 0) {
+   rbm->bh = rgd->rd_bh;
+   return 0;
+   }
+
+   /* FIXME: Might want to do read-ahead here. */
+   error = gfs2_meta_read(rgd->rd_gl, rgd->rd_addr + rbm->bii,
+  DIO_WAIT, 0, &bh);
+   if (error)
+   goto out;
+   if (gfs2_metatype_check(rgd->rd_sbd, bh, GFS2_METATYPE_RB)) {
+   brelse(bh);
+   error = -EIO;
+   goto out;
+   }
+   rbm->bh = bh;
+out:
+   return error;
+}
+
+/**
+ * gfs2_rbm_get - Set up @rbm to point at @block
+ * @rbm: The rbm
+ * @rgd: The resource group of @block
  * @block: The block number (filesystem relative)
  *
- * This sets the bi and offset members of an rbm based on a
- * resource group and a filesystem relative block number. The
- * resource group must be set in the rbm on entry, the bi and
- * offset members will be set by this function.
+ * This sets the 

[Cluster-devel] [PATCH 08/11] gfs2: Add buffer head to struct gfs2_rgrpd

2019-01-11 Thread Andreas Gruenbacher
Prepare for removing the buffer head from struct gfs2_bitmap by adding a
buffer head to struct gfs2_rgrpd for the resource group header.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/incore.h |  1 +
 fs/gfs2/rgrp.c   | 46 +++---
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1dad2c54e7d8a..51eb41484e9af 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -99,6 +99,7 @@ struct gfs2_bitmap {
 struct gfs2_rgrpd {
struct rb_node rd_node; /* Link with superblock */
struct gfs2_glock *rd_gl;   /* Glock for this rgrp */
+   struct buffer_head *rd_bh;
u64 rd_addr;/* grp block disk address */
u64 rd_data0;   /* first data location */
u32 rd_length;  /* length of rgrp header in fs blocks */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index a86358afb33e7..339d6b064f1fc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1139,7 +1139,7 @@ static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
 {
struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
-   struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
+   struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bh->b_data;
int valid = 1;
 
if (rgl->rl_flags != str->rg_flags) {
@@ -1212,12 +1212,20 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
struct gfs2_glock *gl = rgd->rd_gl;
unsigned int length = rgd->rd_length;
struct gfs2_bitmap *bi;
-   unsigned int x, y;
+   unsigned int x = 0, y;
int error;
 
-   if (rgd->rd_bits[0].bi_bh != NULL)
+   if (rgd->rd_bh != NULL)
return 0;
 
+   error = gfs2_meta_read(gl, rgd->rd_addr, DIO_WAIT, 0, &rgd->rd_bh);
+   if (error)
+   return error;
+   if (gfs2_metatype_check(sdp, rgd->rd_bh, GFS2_METATYPE_RG)) {
+   error = -EIO;
+   goto fail;
+   }
+
for (x = 0; x < length; x++) {
bi = rgd->rd_bits + x;
 error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
@@ -1240,7 +1248,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
for (x = 0; x < length; x++)
 clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
-   gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
+   gfs2_rgrp_in(rgd, rgd->rd_bh->b_data);
rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
rgd->rd_free_clone = rgd->rd_free;
/* max out the rgrp allocation failure point */
@@ -1248,8 +1256,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
}
if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
-   gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
-rgd->rd_bits[0].bi_bh->b_data);
+   gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bh->b_data);
}
else if (sdp->sd_args.ar_rgrplvb) {
if (!gfs2_rgrp_lvb_valid(rgd)){
@@ -1269,6 +1276,8 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
gfs2_assert_warn(sdp, !bi->bi_clone);
}
+   brelse(rgd->rd_bh);
+   rgd->rd_bh = NULL;
 
return error;
 }
@@ -1323,7 +1332,8 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
bi->bi_bh = NULL;
}
}
-
+   brelse(rgd->rd_bh);
+   rgd->rd_bh = NULL;
 }
 
 /**
@@ -1423,7 +1433,6 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
struct inode *inode = file_inode(filp);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev);
-   struct buffer_head *bh;
struct gfs2_rgrpd *rgd;
struct gfs2_rgrpd *rgd_end;
struct gfs2_holder gh;
@@ -1485,10 +1494,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
/* Mark rgrp as having been trimmed */
ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0);
if (ret == 0) {
-   bh = rgd->rd_bits[0].bi_bh;
rgd->rd_flags |= GFS2_RGF_TRIMMED;
-   gfs2_trans_add_meta(rgd->rd_gl, bh);
-   gfs2_rgrp_out(rgd, bh->b_data);
+   gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bh);
+   gfs2_rgrp_out(rgd, rgd->rd_bh->b_data);
gfs2_trans_end(sdp);
}
}
@@ -2459,8 +2467,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 

[Cluster-devel] [PATCH 11/11] gfs2: Skip gfs2_metatype_check for cached buffers

2019-01-11 Thread Andreas Gruenbacher
When reading in buffers from disk, set a new BH_Verify buffer head flag.  In
gfs2_metatype_check, skip the check if BH_Verify is cleared, and clear the
flag when checking.  That way, the metatype is only checked once after a
buffer is read from disk, and not again when the buffer is found in the page
cache.

While touching this code, convert two 'be32_to_cpu(magic) == GFS2_MAGIC' checks
into 'magic == cpu_to_be32(GFS2_MAGIC)'.
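
Editorial aside (not part of the patch): the point of that conversion is
that the byte swap moves onto the constant side, where the compiler folds
it at build time, instead of being applied at run time to every value
loaded from disk.  A minimal sketch:

  static bool magic_ok(const struct gfs2_meta_header *mh)
  {
          /* the constant is byte-swapped once at compile time; the
           * on-disk value is compared as-is, so no runtime swab */
          return mh->mh_magic == cpu_to_be32(GFS2_MAGIC);
  }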

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/incore.h  |  5 -
 fs/gfs2/meta_io.c | 10 --
 fs/gfs2/util.c| 31 ---
 fs/gfs2/util.h| 24 ++--
 4 files changed, 42 insertions(+), 28 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index d39c26b950121..40644e1207969 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -126,13 +126,16 @@ struct gfs2_rgrpd {
 
 enum gfs2_state_bits {
BH_Pinned = BH_PrivateStart,
-   BH_Escaped = BH_PrivateStart + 1,
+   BH_Escaped,
+   BH_Verify
 };
 
 BUFFER_FNS(Pinned, pinned)
 TAS_BUFFER_FNS(Pinned, pinned)
 BUFFER_FNS(Escaped, escaped)
 TAS_BUFFER_FNS(Escaped, escaped)
+BUFFER_FNS(Verify, verify)
+TAS_BUFFER_FNS(Verify, verify)
 
 struct gfs2_bufdata {
struct buffer_head *bd_bh;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index be9c0bf697fe6..a111e12a5e1b6 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -268,6 +268,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
} else {
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
+   set_buffer_verify(bh);
bhs[num++] = bh;
}
 
@@ -280,6 +281,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
brelse(bh);
} else {
bh->b_end_io = end_buffer_read_sync;
+   set_buffer_verify(bh);
bhs[num++] = bh;
}
}
@@ -452,8 +454,10 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 
if (buffer_uptodate(first_bh))
goto out;
-   if (!buffer_locked(first_bh))
+   if (!buffer_locked(first_bh)) {
+   set_buffer_verify(first_bh);
 ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh);
+   }
 
dblock++;
extlen--;
@@ -461,10 +465,12 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
while (extlen) {
bh = gfs2_getbuf(gl, dblock, CREATE);
 
-   if (!buffer_uptodate(bh) && !buffer_locked(bh))
+   if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
+   set_buffer_verify(bh);
ll_rw_block(REQ_OP_READ,
REQ_RAHEAD | REQ_META | REQ_PRIO,
 1, &bh);
+   }
brelse(bh);
dblock++;
extlen--;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index b4c72bb799052..2fb4e83bd3efa 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -202,20 +202,37 @@ int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
 }
 
 /**
- * gfs2_metatype_check_ii - Flag a metadata type consistency error and withdraw
+ * gfs2_metatype_check_ii - Check the metadata type of a block
+ *
+ * Report and withdraw on inconsistencies.
+ *
  * Returns: -1 if this call withdrew the machine,
  *  -2 if it was already withdrawn
  */
 
 int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh,
-  u16 type, u16 t, void *caller)
+  u16 type, void *caller)
 {
+   struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
int me;
-   me = gfs2_lm_withdraw(sdp,
- "fatal: invalid metadata block at %pS\n"
- "  bh = %llu (type: exp=%u, found=%u)\n",
- caller,
- (unsigned long long)bh->b_blocknr, type, t);
+
+   clear_buffer_verify(bh);
+
+   if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
+   me = gfs2_lm_withdraw(sdp,
+ "fatal: invalid metadata block at %pS\n"
+ "  bh = %llu (magic number)\n",
+ caller,
+ (unsigned long long)bh->b_blocknr);
+   } else if (unlikely(be32_to_cpu(mh->mh_type) != type)) {
+   me = gfs2_lm_withdraw(sdp,
+ "fatal: invalid metadata block at %pS\n"
+ "  bh = %llu (type: exp=%u, found=%u)\n",
+ caller,
+ (unsigned long long)bh->b_blocknr, type,
+ be32_to_cpu(mh->mh_type));
+ 

[Cluster-devel] [PATCH 04/11] gfs2: Minor gfs2_free_extlen cleanup

2019-01-11 Thread Andreas Gruenbacher
Limit the n_unaligned argument of gfs2_unaligned_extlen to the number of
blocks left to scan so that the len argument can never underflow.

Slightly simplify the unaligned block logic in gfs2_free_extlen.
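
Editorial aside (not part of the patch): GFS2 packs four blocks into each
bitmap byte (GFS2_NBBY == 4, two bits per block), so rbm.offset & 3 is
the block's position within its byte.  The clamp against len in the new
code is what prevents the underflow; a worked sketch:

  /* example: offset 6, only one block left to scan */
  u32 offset = 6, len = 1;
  u32 in_byte = offset & 3;                 /* 2: third block in its byte */
  u32 n_unaligned = min(4 - in_byte, len);  /* min(2, 1) = 1, never > len */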

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/rgrp.c | 30 +-
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2ce46e372b020..f06755dae951e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -335,8 +335,6 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
if (res != GFS2_BLKST_FREE)
return true;
(*len)--;
-   if (*len == 0)
-   return true;
if (gfs2_rbm_incr(rbm))
return true;
}
@@ -362,7 +360,6 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len)
 static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
 {
struct gfs2_rbm rbm = *rrbm;
-   u32 n_unaligned = rbm.offset & 3;
u32 size = len;
u32 bytes;
u32 chunk_size;
@@ -370,11 +367,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
u64 block;
struct gfs2_bitmap *bi;
 
-   if (n_unaligned &&
-   gfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
-   goto out;
+   /* Deal with unaligned bits at the front */
+   if (rbm.offset & 3) {
+   u32 n_unaligned = min(4 - (rbm.offset & 3), len);
+
+   if (gfs2_unaligned_extlen(&rbm, n_unaligned, &len))
+   goto out;
+   }
 
-   n_unaligned = len & 3;
/* Start is now byte aligned */
while (len > 3) {
 bi = rbm_bi(&rbm);
@@ -392,20 +392,16 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
BUG_ON(len < chunk_size);
len -= chunk_size;
 block = gfs2_rbm_to_block(&rbm);
-   if (gfs2_rbm_from_block(&rbm, block + chunk_size)) {
-   n_unaligned = 0;
-   break;
-   }
-   if (ptr) {
-   n_unaligned = 3;
+   if (gfs2_rbm_from_block(&rbm, block + chunk_size))
+   goto out;
+   if (ptr)
break;
-   }
-   n_unaligned = len & 3;
}
 
/* Deal with any bits left over at the end */
-   if (n_unaligned)
-   gfs2_unaligned_extlen(&rbm, n_unaligned, &len);
+   if (len)
+   gfs2_unaligned_extlen(&rbm, len, &len);
+
 out:
return size - len;
 }
-- 
2.20.1



[Cluster-devel] [PATCH 05/11] gfs2: Only use struct gfs2_rbm for bitmap manipulations

2019-01-11 Thread Andreas Gruenbacher
GFS2 uses struct gfs2_rbm to represent a filesystem block number as a
bit position within a resource group.  This representation is used in
the bitmap manipulation code to prevent excessive conversions between
block numbers and bit positions, but also in struct gfs2_blkreserv which
is part of struct gfs2_inode, to mark the start of a reservation.  In
the inode, the bit position representation makes less sense: first, the
start position is used as a block number about as often as a bit
position; second, the bit position representation makes the code
unnecessarily complicated and difficult to read.

Change struct gfs2_blkreserv to represent the start of a reservation as
a block number instead of a bit position.  This requires keeping track
of the resource group in gfs2_blkreserv separately.  With that, struct
gfs2_rbm can be moved to rgrp.c.
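
Editorial aside (not part of the patch): the two representations stay
interconvertible through gfs2_rbm_to_block(), which the diff below keeps
as a static inline in rgrp.c; storing rs_start as a block number just
means the reservation code no longer performs this conversion on every
comparison:

  /* block number of the bit at (rbm->bii, rbm->offset); bi_start is the
   * bitmap's byte offset within the rgrp, GFS2_NBBY is blocks per byte */
  u64 block = rbm->rgd->rd_data0 +
              rbm_bi(rbm)->bi_start * GFS2_NBBY + rbm->offset;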

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/bmap.c   |   2 +-
 fs/gfs2/incore.h |  30 +
 fs/gfs2/rgrp.c   | 153 +--
 fs/gfs2/trace_gfs2.h |  10 +--
 fs/gfs2/trans.h  |   2 +-
 5 files changed, 100 insertions(+), 97 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 02b2646d84b3a..e338182f74efd 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1498,7 +1498,7 @@ static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
 
/* Must be done with the rgrp glock held: */
 if (gfs2_rs_active(&ip->i_res) &&
-   rgd == ip->i_res.rs_rbm.rgd)
+   rgd == ip->i_res.rs_rgd)
gfs2_rs_deltree(>i_res);
}
 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index e10e0b0a7cd58..1dad2c54e7d8a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -124,31 +124,6 @@ struct gfs2_rgrpd {
struct rb_root rd_rstree;   /* multi-block reservation tree */
 };
 
-struct gfs2_rbm {
-   struct gfs2_rgrpd *rgd;
-   u32 offset; /* The offset is bitmap relative */
-   int bii;/* Bitmap index */
-};
-
-static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
-{
-   return rbm->rgd->rd_bits + rbm->bii;
-}
-
-static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
-{
-   BUG_ON(rbm->offset >= rbm->rgd->rd_data);
-   return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
-   rbm->offset;
-}
-
-static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1,
-  const struct gfs2_rbm *rbm2)
-{
-   return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) &&
-  (rbm1->offset == rbm2->offset);
-}
-
 enum gfs2_state_bits {
BH_Pinned = BH_PrivateStart,
BH_Escaped = BH_PrivateStart + 1,
@@ -308,8 +283,9 @@ struct gfs2_qadata { /* quota allocation data */
 */
 
 struct gfs2_blkreserv {
-   struct rb_node rs_node;   /* link to other block reservations */
-   struct gfs2_rbm rs_rbm;   /* Start of reservation */
+   struct rb_node rs_node;   /* node within rd_rstree */
+   struct gfs2_rgrpd *rs_rgd;
+   u64 rs_start; /* start of reservation */
u32 rs_free;  /* how many blocks are still free */
 };
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f06755dae951e..14ffa86b6b1c5 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -60,6 +60,12 @@
  * 3 = Used (metadata)
  */
 
+struct gfs2_rbm {
+   struct gfs2_rgrpd *rgd;
+   u32 offset; /* The offset is bitmap relative */
+   int bii;/* Bitmap index */
+};
+
 struct gfs2_extent {
int bii;
u32 offset;
@@ -74,6 +80,18 @@ static const char valid_change[16] = {
1, 0, 0, 0
 };
 
+static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm)
+{
+   return rbm->rgd->rd_bits + rbm->bii;
+}
+
+static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm)
+{
+   BUG_ON(rbm->offset >= rbm->rgd->rd_data);
+   return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) +
+   rbm->offset;
+}
+
 static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minlen,
 const struct gfs2_inode *ip, bool nowrap);
 
@@ -189,7 +207,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
 
 /**
  * rs_cmp - multi-block reservation range compare
- * @blk: absolute file system block number of the new reservation
+ * @start: start of the new reservation
  * @len: number of blocks in the new reservation
  * @rs: existing reservation to compare against
  *
@@ -197,13 +215,11 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
  * -1 if the block range is before the start of the reservation
  *  0 if the block range overlaps with the reservation
  */
-static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
+static 

[Cluster-devel] [PATCH 07/11] gfs2: Minor gfs2_rgrp_send_discards cleanup

2019-01-11 Thread Andreas Gruenbacher
Pass the resource group and bitmap index to gfs2_rgrp_send_discards
separately: one of the next patches will remove bi_bh from struct
gfs2_bitmap, so we'll need to get it from the resource group in the
future.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/lops.c |  3 ++-
 fs/gfs2/rgrp.c | 11 +--
 fs/gfs2/rgrp.h |  3 ++-
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 94dcab655bc02..2db478768b229 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -81,7 +81,8 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
if (bi->bi_clone == NULL)
return;
if (sdp->sd_args.ar_discard)
-   gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
+   gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, rgd,
+   index, 1, NULL);
memcpy(bi->bi_clone + bi->bi_offset,
   bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes);
 clear_bit(GBF_FULL, &bi->bi_flags);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7a93568fdc709..a86358afb33e7 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1343,10 +1343,10 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
 }
 
 int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
-struct buffer_head *bh,
-const struct gfs2_bitmap *bi, unsigned minlen,
-u64 *ptrimmed)
+   struct buffer_head *bh, struct gfs2_rgrpd *rgd,
+   unsigned int bii, unsigned minlen, u64 *ptrimmed)
 {
+   struct gfs2_bitmap *bi = rgd->rd_bits + bii;
struct super_block *sb = sdp->sd_vfs;
u64 blk;
sector_t start = 0;
@@ -1472,10 +1472,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) {
/* Trim each bitmap in the rgrp */
for (x = 0; x < rgd->rd_length; x++) {
-   struct gfs2_bitmap *bi = rgd->rd_bits + x;
ret = gfs2_rgrp_send_discards(sdp,
-   rgd->rd_data0, NULL, bi, minlen,
-   );
+   rgd->rd_data0, NULL, rgd, x,
+   minlen, &amt);
if (ret) {
 gfs2_glock_dq_uninit(&gh);
goto out;
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 499079a9dbbed..4af09a548c358 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -75,7 +75,8 @@ extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
 extern void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl);
 extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
   struct buffer_head *bh,
-  const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
+  struct gfs2_rgrpd *rgd, unsigned int bii,
+  unsigned minlen, u64 *ptrimmed);
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
 /* This is how to tell if a reservation is in the rgrp tree: */
-- 
2.20.1



[Cluster-devel] [PATCH V13 17/19] block: document usage of bio iterator helpers

2019-01-11 Thread Ming Lei
Now that multi-page bvecs are supported, some helpers return data page by
page while others return it segment by segment; this patch documents
their usage.
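
Editorial aside (not part of the patch): what the two flavours look like
at a call site.  bio_for_each_bvec() yields full multi-page segments,
bio_for_each_segment() yields single-page vectors; process_segment() and
process_page() are made-up names for illustration:

  struct bio_vec bv;
  struct bvec_iter iter;

  /* each bv may span several physically contiguous pages */
  bio_for_each_bvec(bv, bio, iter)
          process_segment(bv.bv_page, bv.bv_offset, bv.bv_len);

  /* each bv is guaranteed to fit within a single page */
  bio_for_each_segment(bv, bio, iter)
          process_page(bv.bv_page, bv.bv_offset, bv.bv_len);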

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 Documentation/block/biovecs.txt | 25 +
 1 file changed, 25 insertions(+)

diff --git a/Documentation/block/biovecs.txt b/Documentation/block/biovecs.txt
index 25689584e6e0..ce6eccaf5df7 100644
--- a/Documentation/block/biovecs.txt
+++ b/Documentation/block/biovecs.txt
@@ -117,3 +117,28 @@ Other implications:
size limitations and the limitations of the underlying devices. Thus
there's no need to define ->merge_bvec_fn() callbacks for individual block
drivers.
+
+Usage of helpers:
+=
+
+* The following helpers whose names have the suffix of "_all" can only be used
+on non-BIO_CLONED bio. They are usually used by filesystem code. Drivers
+shouldn't use them because the bio may have been split before it reached the
+driver.
+
+   bio_for_each_segment_all()
+   bio_first_bvec_all()
+   bio_first_page_all()
+   bio_last_bvec_all()
+
+* The following helpers iterate over single-page segment. The passed 'struct
+bio_vec' will contain a single-page IO vector during the iteration
+
+   bio_for_each_segment()
+   bio_for_each_segment_all()
+
+* The following helpers iterate over multi-page bvec. The passed 'struct
+bio_vec' will contain a multi-page IO vector during the iteration
+
+   bio_for_each_bvec()
+   rq_for_each_bvec()
-- 
2.9.5



[Cluster-devel] [PATCH V13 16/19] block: always define BIO_MAX_PAGES as 256

2019-01-11 Thread Ming Lei
Now that multi-page bvecs can cover CONFIG_THP_SWAP, we no longer need
to increase BIO_MAX_PAGES for it.

CONFIG_THP_SWAP needs to split one THP into normal pages and add them
all to one bio.  With multi-page bvecs, it takes just one bvec to
hold them all.
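
Editorial aside (not part of the patch): with 4 KiB pages, a PMD-sized
THP is HPAGE_PMD_NR = 512 pages, i.e. 2 MiB.  Since a bvec's bv_len is a
32-bit byte count, a single multi-page bvec holds the whole THP, which
is why the HPAGE_PMD_NR special case above can go.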

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h | 8 
 1 file changed, 8 deletions(-)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1ece9f30294b..54ef81f11f83 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -34,15 +34,7 @@
 #define BIO_BUG_ON
 #endif
 
-#ifdef CONFIG_THP_SWAP
-#if HPAGE_PMD_NR > 256
-#define BIO_MAX_PAGES  HPAGE_PMD_NR
-#else
 #define BIO_MAX_PAGES  256
-#endif
-#else
-#define BIO_MAX_PAGES  256
-#endif
 
 #define bio_prio(bio)  (bio)->bi_ioprio
 #define bio_set_prio(bio, prio)((bio)->bi_ioprio = prio)
-- 
2.9.5



[Cluster-devel] [PATCH V13 18/19] block: kill QUEUE_FLAG_NO_SG_MERGE

2019-01-11 Thread Ming Lei
Since bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting"),
the physical segment count is mainly computed in blk_queue_split() for the
fast path, and the BIO_SEG_VALID flag is set there too.

Now only blk_recount_segments() and blk_recalc_rq_segments() use this
flag.

Basically blk_recount_segments() is bypassed in the fast path given that
BIO_SEG_VALID is set in blk_queue_split().

As for the other user, blk_recalc_rq_segments():

- it runs in the partial completion branch of blk_update_request(), which
is an unusual case

- it runs in blk_cloned_rq_check_limits(), which is still not a big problem
if the flag is killed, since dm-rq is the only user

Multi-page bvecs are enabled now; not doing S/G merging is rather pointless
with the current setup of the I/O path, as it isn't going to save a
significant amount of cycles.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c  | 31 ++-
 block/blk-mq-debugfs.c |  1 -
 block/blk-mq.c |  3 ---
 drivers/md/dm-table.c  | 13 -
 include/linux/blkdev.h |  1 -
 5 files changed, 6 insertions(+), 43 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index bf736d2b3710..dc4877eaf9f9 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -354,8 +354,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
 EXPORT_SYMBOL(blk_queue_split);
 
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
-struct bio *bio,
-bool no_sg_merge)
+struct bio *bio)
 {
struct bio_vec bv, bvprv = { NULL };
int prev = 0;
@@ -381,13 +380,6 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
nr_phys_segs = 0;
for_each_bio(bio) {
bio_for_each_bvec(bv, bio, iter) {
-   /*
-* If SG merging is disabled, each bio vector is
-* a segment
-*/
-   if (no_sg_merge)
-   goto new_segment;
-
if (prev) {
if (seg_size + bv.bv_len
> queue_max_segment_size(q))
@@ -417,27 +409,16 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 
 void blk_recalc_rq_segments(struct request *rq)
 {
-   bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-   &rq->q->queue_flags);
-
-   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
-   no_sg_merge);
+   rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
 }
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt = bio_segments(bio);
-
-   if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
-   (seg_cnt < queue_max_segments(q)))
-   bio->bi_phys_segments = seg_cnt;
-   else {
-   struct bio *nxt = bio->bi_next;
+   struct bio *nxt = bio->bi_next;
 
-   bio->bi_next = NULL;
-   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
-   bio->bi_next = nxt;
-   }
+   bio->bi_next = NULL;
+   bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
+   bio->bi_next = nxt;
 
bio_set_flag(bio, BIO_SEG_VALID);
 }
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 90d68760af08..2f9a11ef5bad 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -128,7 +128,6 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
-   QUEUE_FLAG_NAME(NO_SG_MERGE),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3ba37b9e15e9..fa45817a7e62 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2829,9 +2829,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
set->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
 
-   if (!(set->flags & BLK_MQ_F_SG_MERGE))
-   blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q);
-
q->sg_reserved_size = INT_MAX;
 
INIT_DELAYED_WORK(>requeue_work, blk_mq_requeue_work);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4b1be754cc41..ba9481f1bf3c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1698,14 +1698,6 @@ static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
return q && !blk_queue_add_random(q);
 }
 
-static int queue_supports_sg_merge(struct dm_target *ti, struct dm_dev *dev,
-  sector_t start, sector_t len, 

[Cluster-devel] [PATCH V13 15/19] block: enable multipage bvecs

2019-01-11 Thread Ming Lei
This patch pulls the trigger for multi-page bvecs.
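
Editorial aside (not part of the patch): the enabling change is in
__bio_try_merge_page() below: a page is now merged into the last bvec
whenever it is physically contiguous with its end.  Callers that need
page-granular accounting pass same_page=true and fall back to
bio_add_page() on failure, as the iomap and xfs hunks do (full-bio
handling omitted here):

  if (!__bio_try_merge_page(bio, page, len, off, true)) {
          /* could not extend the last bvec within the same page;
           * bio_add_page() may still merge across pages or start
           * a new bvec */
          bio_add_page(bio, page, len, off);
  }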

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/bio.c | 22 +++---
 fs/iomap.c  |  4 ++--
 fs/xfs/xfs_aops.c   |  4 ++--
 include/linux/bio.h |  2 +-
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 968b12fea564..83a2dfa417ca 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -753,6 +753,8 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * @page: page to add
  * @len: length of the data to add
  * @off: offset of the data in @page
+ * @same_page: if %true only merge if the new data is in the same physical
+ * page as the last segment of the bio.
  *
  * Try to add the data at @page + @off to the last bvec of @bio.  This is a
  * a useful optimisation for file systems with a block size smaller than the
@@ -761,19 +763,25 @@ EXPORT_SYMBOL(bio_add_pc_page);
  * Return %true on success or %false on failure.
  */
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off)
+   unsigned int len, unsigned int off, bool same_page)
 {
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
 
if (bio->bi_vcnt > 0) {
 struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+   phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
+   bv->bv_offset + bv->bv_len - 1;
+   phys_addr_t page_addr = page_to_phys(page);
 
-   if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) {
-   bv->bv_len += len;
-   bio->bi_iter.bi_size += len;
-   return true;
-   }
+   if (vec_end_addr + 1 != page_addr + off)
+   return false;
+   if (same_page && (vec_end_addr & PAGE_MASK) != page_addr)
+   return false;
+
+   bv->bv_len += len;
+   bio->bi_iter.bi_size += len;
+   return true;
}
return false;
 }
@@ -819,7 +827,7 @@ EXPORT_SYMBOL_GPL(__bio_add_page);
 int bio_add_page(struct bio *bio, struct page *page,
 unsigned int len, unsigned int offset)
 {
-   if (!__bio_try_merge_page(bio, page, len, offset)) {
+   if (!__bio_try_merge_page(bio, page, len, offset, false)) {
if (bio_full(bio))
return 0;
__bio_add_page(bio, page, len, offset);
diff --git a/fs/iomap.c b/fs/iomap.c
index af736acd9006..0c350e658b7f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -318,7 +318,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
 */
sector = iomap_sector(iomap, pos);
if (ctx->bio && bio_end_sector(ctx->bio) == sector) {
-   if (__bio_try_merge_page(ctx->bio, page, plen, poff))
+   if (__bio_try_merge_page(ctx->bio, page, plen, poff, true))
goto done;
is_contig = true;
}
@@ -349,7 +349,7 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
ctx->bio->bi_end_io = iomap_read_end_io;
}
 
-   __bio_add_page(ctx->bio, page, plen, poff);
+   bio_add_page(ctx->bio, page, plen, poff);
 done:
/*
 * Move the caller beyond our range so that it keeps making progress.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 1f1829e506e8..b9fd44168f61 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -616,12 +616,12 @@ xfs_add_to_ioend(
bdev, sector);
}
 
-   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+   if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, true)) {
if (iop)
 atomic_inc(&iop->write_count);
if (bio_full(wpc->ioend->io_bio))
xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
-   __bio_add_page(wpc->ioend->io_bio, page, len, poff);
+   bio_add_page(wpc->ioend->io_bio, page, len, poff);
}
 
wpc->ioend->io_size += len;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c5231e5c7e85..1ece9f30294b 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -441,7 +441,7 @@ extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
   unsigned int, unsigned int);
 bool __bio_try_merge_page(struct bio *bio, struct page *page,
-   unsigned int len, unsigned int off);
+   unsigned int len, unsigned int off, bool same_page);
 void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
 int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
-- 
2.9.5



[Cluster-devel] [PATCH V13 19/19] block: kill BLK_MQ_F_SG_MERGE

2019-01-11 Thread Ming Lei
QUEUE_FLAG_NO_SG_MERGE has been killed, so kill BLK_MQ_F_SG_MERGE too.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-mq-debugfs.c   | 1 -
 drivers/block/loop.c | 2 +-
 drivers/block/nbd.c  | 2 +-
 drivers/block/rbd.c  | 2 +-
 drivers/block/skd_main.c | 1 -
 drivers/block/xen-blkfront.c | 2 +-
 drivers/md/dm-rq.c   | 2 +-
 drivers/mmc/core/queue.c | 3 +--
 drivers/scsi/scsi_lib.c  | 2 +-
 include/linux/blk-mq.h   | 1 -
 10 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 2f9a11ef5bad..2ba0aa05ce13 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -250,7 +250,6 @@ static const char *const alloc_policy_name[] = {
 static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED),
-   HCTX_FLAG_NAME(SG_MERGE),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
 };
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 28dd22c6f83f..e3b9212ec7a1 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1906,7 +1906,7 @@ static int loop_add(struct loop_device **l, int i)
lo->tag_set.queue_depth = 128;
lo->tag_set.numa_node = NUMA_NO_NODE;
lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
lo->tag_set.driver_data = lo;
 
 err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 08696f5f00bb..999c94de78e5 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1570,7 +1570,7 @@ static int nbd_dev_add(int index)
nbd->tag_set.numa_node = NUMA_NO_NODE;
nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-   BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING;
+   BLK_MQ_F_BLOCKING;
nbd->tag_set.driver_data = nbd;
 
 err = blk_mq_alloc_tag_set(&nbd->tag_set);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 8e5140bbf241..3dfd300b5283 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3988,7 +3988,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
 rbd_dev->tag_set.ops = &rbd_mq_ops;
rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
-   rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
rbd_dev->tag_set.nr_hw_queues = 1;
rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
 
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
index a10d5736d8f7..a7040f9a1b1b 100644
--- a/drivers/block/skd_main.c
+++ b/drivers/block/skd_main.c
@@ -2843,7 +2843,6 @@ static int skd_cons_disk(struct skd_device *skdev)
skdev->sgs_per_request * sizeof(struct scatterlist);
skdev->tag_set.numa_node = NUMA_NO_NODE;
skdev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
-   BLK_MQ_F_SG_MERGE |
BLK_ALLOC_POLICY_TO_MQ_FLAG(BLK_TAG_ALLOC_FIFO);
skdev->tag_set.driver_data = skdev;
 rc = blk_mq_alloc_tag_set(&skdev->tag_set);
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 0ed4b200fa58..d43a5677ccbc 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -977,7 +977,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
} else
info->tag_set.queue_depth = BLK_RING_SIZE(info);
info->tag_set.numa_node = NUMA_NO_NODE;
-   info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
info->tag_set.cmd_size = sizeof(struct blkif_req);
info->tag_set.driver_data = info;
 
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 4eb5f8c56535..b2f8eb2365ee 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -527,7 +527,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 md->tag_set->ops = &dm_mq_ops;
md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
md->tag_set->numa_node = md->numa_node_id;
-   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+   md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
md->tag_set->driver_data = md;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 35cc138b096d..cc19e71c71d4 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -410,8 +410,7 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
else
mq->tag_set.queue_depth = MMC_QUEUE_DEPTH;
mq->tag_set.numa_node = NUMA_NO_NODE;
-  

[Cluster-devel] [PATCH V13 14/19] block: allow bio_for_each_segment_all() to iterate over multi-page bvec

2019-01-11 Thread Ming Lei
This patch introduces one extra iterator variable to bio_for_each_segment_all(),
so that bio_for_each_segment_all() can iterate over multi-page bvecs.

Given it is just a mechanical and simple change for all
bio_for_each_segment_all() users, this patch makes the tree-wide change in
one single patch, so that we can avoid using a temporary helper for this
conversion.
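
Editorial aside (not part of the patch): what the conversion looks like
at a call site; the extra struct bvec_iter_all variable carries the
per-page position within each multi-page bvec, and do_something() is a
made-up name:

  struct bio_vec *bvec;
  struct bvec_iter_all iter_all;
  int i;

  /* before: bio_for_each_segment_all(bvec, bio, i) { ... } */
  bio_for_each_segment_all(bvec, bio, i, iter_all) {
          /* bvec still describes exactly one page per iteration */
          do_something(bvec->bv_page);
  }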

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/bio.c   | 27 ++-
 block/bounce.c|  6 --
 drivers/md/bcache/btree.c |  3 ++-
 drivers/md/dm-crypt.c |  3 ++-
 drivers/md/raid1.c|  3 ++-
 drivers/staging/erofs/data.c  |  3 ++-
 drivers/staging/erofs/unzip_vle.c |  3 ++-
 fs/block_dev.c|  6 --
 fs/btrfs/compression.c|  3 ++-
 fs/btrfs/disk-io.c|  3 ++-
 fs/btrfs/extent_io.c  |  9 ++---
 fs/btrfs/inode.c  |  6 --
 fs/btrfs/raid56.c |  3 ++-
 fs/crypto/bio.c   |  3 ++-
 fs/direct-io.c|  4 +++-
 fs/exofs/ore.c|  3 ++-
 fs/exofs/ore_raid.c   |  3 ++-
 fs/ext4/page-io.c |  3 ++-
 fs/ext4/readpage.c|  3 ++-
 fs/f2fs/data.c|  9 ++---
 fs/gfs2/lops.c|  6 --
 fs/gfs2/meta_io.c |  3 ++-
 fs/iomap.c|  6 --
 fs/mpage.c|  3 ++-
 fs/xfs/xfs_aops.c |  5 +++--
 include/linux/bio.h   | 11 +--
 include/linux/bvec.h  | 30 ++
 27 files changed, 125 insertions(+), 45 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 4db1008309ed..968b12fea564 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1072,8 +1072,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1104,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1128,9 @@ void bio_free_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1295,6 +1298,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1368,7 +1372,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
 
  out_unmap:
-   bio_for_each_segment_all(bvec, bio, j) {
+   bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1379,11 +1383,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
/*
 * make sure we dirty pages we wrote to
 */
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
 
@@ -1475,8 +1480,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@@ -1585,8 +1591,9 @@ void bio_set_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (!PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
}
@@ -1596,8 +1603,9 @@ static void bio_release_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, 

[Cluster-devel] [PATCH V13 13/19] bcache: avoid to use bio_for_each_segment_all() in bch_bio_alloc_pages()

2019-01-11 Thread Ming Lei
bch_bio_alloc_pages() is always called on a freshly allocated bio, so it is
safe to access the bvec table directly. Since this is the only such case,
open-code the bvec table access: bio_for_each_segment_all() will be changed
to iterate over multi-page bvecs, and this helper assigns bv->bv_page, which
requires writing to the real bvec table rather than to an in-flight copy.

Acked-by: Coly Li 
Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 drivers/md/bcache/util.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 20eddeac1531..62fb917f7a4f 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -270,7 +270,11 @@ int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
int i;
struct bio_vec *bv;
 
-   bio_for_each_segment_all(bv, bio, i) {
+   /*
+    * This is called on a freshly allocated bio, so it is safe to
+    * access the bvec table directly.
+    */
+   for (i = 0, bv = bio->bi_io_vec; i < bio->bi_vcnt; bv++, i++) {
bv->bv_page = alloc_page(gfp_mask);
if (!bv->bv_page) {
while (--bv >= bio->bi_io_vec)
-- 
2.9.5



[Cluster-devel] [PATCH V13 12/19] block: loop: pass multi-page bvec to iov_iter

2019-01-11 Thread Ming Lei
iov_iter is implemented on top of the bvec iterator helpers, so it is safe to
pass multi-page bvecs to it; this is much more efficient than passing one
page per bvec.
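
For intuition, a minimal counting sketch (hypothetical caller context; the
numbers in the comment are examples only):

	struct req_iterator rq_iter;
	struct bio_vec tmp;
	int nr_bvec = 0;

	/*
	 * With multi-page bvecs, physically contiguous pages collapse into
	 * a single bvec, so nr_bvec can be much smaller than the old
	 * bio_segments() count; e.g. a 1 MiB physically contiguous request
	 * may be a handful of bvecs instead of 256 single-page segments.
	 * That shrinks the kmalloc_array() in the diff below and gives
	 * iov_iter fewer, larger vectors to walk.
	 */
	rq_for_each_bvec(tmp, rq, rq_iter)
		nr_bvec++;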

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 drivers/block/loop.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b8a0720d3653..28dd22c6f83f 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -511,21 +511,22 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 loff_t pos, bool rw)
 {
struct iov_iter iter;
+   struct req_iterator rq_iter;
struct bio_vec *bvec;
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct bio *bio = rq->bio;
struct file *file = lo->lo_backing_file;
+   struct bio_vec tmp;
unsigned int offset;
-   int segments = 0;
+   int nr_bvec = 0;
int ret;
 
+   rq_for_each_bvec(tmp, rq, rq_iter)
+   nr_bvec++;
+
if (rq->bio != rq->biotail) {
-   struct req_iterator iter;
-   struct bio_vec tmp;
 
-   __rq_for_each_bio(bio, rq)
-   segments += bio_segments(bio);
-   bvec = kmalloc_array(segments, sizeof(struct bio_vec),
+   bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
 GFP_NOIO);
if (!bvec)
return -EIO;
@@ -534,10 +535,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
/*
 * The bios of the request may be started from the middle of
 * the 'bvec' because of bio splitting, so we can't directly
-* copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
+* copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
 * API will take care of all details for us.
 */
-   rq_for_each_segment(tmp, rq, iter) {
+   rq_for_each_bvec(tmp, rq, rq_iter) {
*bvec = tmp;
bvec++;
}
@@ -551,11 +552,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 */
offset = bio->bi_iter.bi_bvec_done;
bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-   segments = bio_segments(bio);
}
atomic_set(&cmd->ref, 2);
 
-   iov_iter_bvec(&iter, rw, bvec, segments, blk_rq_bytes(rq));
+   iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
iter.iov_offset = offset;
 
cmd->iocb.ki_pos = pos;
-- 
2.9.5



[Cluster-devel] [PATCH V13 11/19] btrfs: use bvec_last_segment to get bio's last page

2019-01-11 Thread Ming Lei
Prepare for supporting multi-page bvecs.

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 fs/btrfs/extent_io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index dc8ba3ee515d..c092f88700bd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2697,11 +2697,12 @@ static int __must_check submit_one_bio(struct bio *bio, int mirror_num,
 {
blk_status_t ret = 0;
struct bio_vec *bvec = bio_last_bvec_all(bio);
-   struct page *page = bvec->bv_page;
+   struct bio_vec bv;
struct extent_io_tree *tree = bio->bi_private;
u64 start;
 
-   start = page_offset(page) + bvec->bv_offset;
+   bvec_last_segment(bvec, &bv);
+   start = page_offset(bv.bv_page) + bv.bv_offset;
 
bio->bi_private = NULL;
 
-- 
2.9.5



[Cluster-devel] [PATCH V13 09/19] block: introduce bvec_last_segment()

2019-01-11 Thread Ming Lei
BTRFS and guard_bio_eod() need to get the last single-page segment from a
multi-page bvec, so introduce this helper for them.
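
A quick worked example may help; it is purely illustrative and assumes
PAGE_SIZE == 4096 (the bvec values are hypothetical):

	/*
	 * A multi-page bvec { .bv_page = p, .bv_offset = 512,
	 * .bv_len = 8192 } ends at byte 512 + 8192 = 8704, i.e. inside
	 * its third page (index 2).
	 */
	struct bio_vec seg;

	bvec_last_segment(&bvec, &seg);
	/* seg == { .bv_page = nth_page(p, 2), .bv_offset = 0, .bv_len = 512 } */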

Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index d441486db605..ca6e630f88ab 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -131,4 +131,26 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
.bi_bvec_done   = 0,\
 }
 
+/*
+ * Get the last single-page segment from the multi-page bvec and store it
+ * in @seg
+ */
+static inline void bvec_last_segment(const struct bio_vec *bvec,
+struct bio_vec *seg)
+{
+   unsigned total = bvec->bv_offset + bvec->bv_len;
+   unsigned last_page = (total - 1) / PAGE_SIZE;
+
+   seg->bv_page = nth_page(bvec->bv_page, last_page);
+
+   /* the whole segment is inside the last page */
+   if (bvec->bv_offset >= last_page * PAGE_SIZE) {
+   seg->bv_offset = bvec->bv_offset % PAGE_SIZE;
+   seg->bv_len = bvec->bv_len;
+   } else {
+   seg->bv_offset = 0;
+   seg->bv_len = total - last_page * PAGE_SIZE;
+   }
+}
+
 #endif /* __LINUX_BVEC_ITER_H */
-- 
2.9.5



[Cluster-devel] [PATCH V13 08/19] block: use bio_for_each_bvec() to map sg

2019-01-11 Thread Ming Lei
It is more efficient to use bio_for_each_bvec() to map the sg list; meanwhile
we have to consider splitting multi-page bvecs as done in
blk_bio_segment_split().
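
To illustrate the splitting, a hypothetical walkthrough (the limits are
example values, not taken from this patch):

	/*
	 * Assume queue_max_segment_size(q) == 4096 and a permissive
	 * segment boundary. An 8192-byte bvec starting at page p,
	 * offset 0, is emitted by blk_bvec_map_sg() as two scatterlist
	 * entries:
	 *
	 *   sg_set_page(sg0, p,              4096, 0);
	 *   sg_set_page(sg1, nth_page(p, 1), 4096, 0);
	 *
	 * and the function returns nsegs == 2.
	 */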

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 70 +++
 1 file changed, 50 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index abe1c89c1253..bf736d2b3710 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -460,6 +460,54 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
return biovec_phys_mergeable(q, &end_bv, &nxt_bv);
 }
 
+static struct scatterlist *blk_next_sg(struct scatterlist **sg,
+   struct scatterlist *sglist)
+{
+   if (!*sg)
+   return sglist;
+
+   /*
+* If the driver previously mapped a shorter list, we could see a
+* termination bit prematurely unless it fully inits the sg table
+* on each mapping. We KNOW that there must be more entries here
+* or the driver would be buggy, so force clear the termination bit
+* to avoid doing a full sg_init_table() in drivers for each command.
+*/
+   sg_unmark_end(*sg);
+   return sg_next(*sg);
+}
+
+static unsigned blk_bvec_map_sg(struct request_queue *q,
+   struct bio_vec *bvec, struct scatterlist *sglist,
+   struct scatterlist **sg)
+{
+   unsigned nbytes = bvec->bv_len;
+   unsigned nsegs = 0, total = 0, offset = 0;
+
+   while (nbytes > 0) {
+   unsigned seg_size;
+   struct page *pg;
+   unsigned idx;
+
+   *sg = blk_next_sg(sg, sglist);
+
+   seg_size = get_max_segment_size(q, bvec->bv_offset + total);
+   seg_size = min(nbytes, seg_size);
+
+   offset = (total + bvec->bv_offset) % PAGE_SIZE;
+   idx = (total + bvec->bv_offset) / PAGE_SIZE;
+   pg = nth_page(bvec->bv_page, idx);
+
+   sg_set_page(*sg, pg, seg_size, offset);
+
+   total += seg_size;
+   nbytes -= seg_size;
+   nsegs++;
+   }
+
+   return nsegs;
+}
+
 static inline void
 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
 struct scatterlist *sglist, struct bio_vec *bvprv,
@@ -477,25 +525,7 @@ __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
(*sg)->length += nbytes;
} else {
 new_segment:
-   if (!*sg)
-   *sg = sglist;
-   else {
-   /*
-* If the driver previously mapped a shorter
-* list, we could see a termination bit
-* prematurely unless it fully inits the sg
-* table on each mapping. We KNOW that there
-* must be more entries here or the driver
-* would be buggy, so force clear the
-* termination bit to avoid doing a full
-* sg_init_table() in drivers for each command.
-*/
-   sg_unmark_end(*sg);
-   *sg = sg_next(*sg);
-   }
-
-   sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
-   (*nsegs)++;
+   (*nsegs) += blk_bvec_map_sg(q, bvec, sglist, sg);
}
*bvprv = *bvec;
 }
@@ -517,7 +547,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
int nsegs = 0;
 
for_each_bio(bio)
-   bio_for_each_segment(bvec, bio, iter)
+   bio_for_each_bvec(bvec, bio, iter)
__blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
 &nsegs);
 
-- 
2.9.5



[Cluster-devel] [PATCH V13 10/19] fs/buffer.c: use bvec iterator to truncate the bio

2019-01-11 Thread Ming Lei
Once multi-page bvecs are enabled, the last bvec may include more than one
page; use bvec_last_segment() to truncate the bio.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 fs/buffer.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 52d024bfdbc1..fb72ac21f2b1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3032,7 +3032,10 @@ void guard_bio_eod(int op, struct bio *bio)
 
/* ..and clear the end of the buffer for reads */
if (op == REQ_OP_READ) {
-   zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+   struct bio_vec bv;
+
+   bvec_last_segment(bvec, &bv);
+   zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
truncated_bytes);
}
 }
-- 
2.9.5



[Cluster-devel] [PATCH V13 05/19] block: introduce multi-page bvec helpers

2019-01-11 Thread Ming Lei
This patch introduces 'bvec_iter_*' helpers for multi-page bvec support.

The introduced helpers treat one bvec as a real multi-page segment, which
may include more than one page.

The pre-existing helpers (now named segment_iter_*) remain the interfaces
for the current bvec iterator, which drivers, filesystems, dm, etc. treat
as single-page. They build single-page bvecs in flight, so current bio/bvec
users won't be broken and need no changes.

Follows some multi-page bvec background:

- bvecs stored in bio->bi_io_vec is always multi-page style

- a bvec (struct bio_vec) represents one physically contiguous I/O
  buffer; after multi-page bvec support, the buffer may include more
  than one page, and all pages represented by one bvec are physically
  contiguous. Before multi-page bvec support, at most one page was
  included in one bvec; we call that a single-page bvec.

- .bv_page of the bvec points to the 1st page in the multi-page bvec

- .bv_offset of the bvec is the offset of the buffer in the bvec

The effect on the current drivers/filesystem/dm/bcache/...:

- almost all users assume that one bvec includes only a single page,
  so the single-page interfaces are kept unchanged; for example,
  bio_for_each_segment() still returns single-page bvecs

- bio_for_each_segment_all() will return single-page bvecs too

- during iteration, the iterator variable (struct bvec_iter) is always
  updated in multi-page bvec style, and bvec_iter_advance() is kept
  unchanged

- the returned (copied) single-page bvec is built in flight by the bvec
  helpers from the stored multi-page bvec
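
To make the single-page vs multi-page contrast concrete, a purely
illustrative example (assuming PAGE_SIZE == 4096 and iter.bi_size >= 4096):

	/*
	 * For a stored multi-page bvec { .bv_page = p, .bv_offset = 0,
	 * .bv_len = 8192 } at iter.bi_idx, with iter.bi_bvec_done == 4096:
	 *
	 *   bvec_iter_page(bvec, iter)      == p
	 *   bvec_iter_offset(bvec, iter)    == 4096  (may reach past page 0)
	 *   bvec_iter_len(bvec, iter)       == 4096
	 *
	 *   segment_iter_page(bvec, iter)   == nth_page(p, 1)
	 *   segment_iter_offset(bvec, iter) == 0     (always < PAGE_SIZE)
	 *   segment_iter_len(bvec, iter)    == 4096
	 */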

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 716a87b26a6a..babc6316c117 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -23,6 +23,7 @@
#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/errno.h>
+#include <linux/mm.h>
 
 /*
  * was unsigned short, but we might as well be ready for > 64kB I/O pages
@@ -50,16 +51,32 @@ struct bvec_iter {
  */
 #define __bvec_iter_bvec(bvec, iter)   (&(bvec)[(iter).bi_idx])
 
-#define segment_iter_page(bvec, iter)  \
+/* multi-page (segment) helpers */
+#define bvec_iter_page(bvec, iter) \
(__bvec_iter_bvec((bvec), (iter))->bv_page)
 
-#define segment_iter_len(bvec, iter)   \
+#define bvec_iter_len(bvec, iter)  \
min((iter).bi_size, \
__bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
 
-#define segment_iter_offset(bvec, iter)\
+#define bvec_iter_offset(bvec, iter)   \
(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
 
+#define bvec_iter_page_idx(bvec, iter) \
+   (bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
+
+/* For building single-page bvec(segment) in flight */
+ #define segment_iter_offset(bvec, iter)   \
+   (bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
+
+#define segment_iter_len(bvec, iter)   \
+   min_t(unsigned, bvec_iter_len((bvec), (iter)),  \
+ PAGE_SIZE - segment_iter_offset((bvec), (iter)))
+
+#define segment_iter_page(bvec, iter)  \
+   nth_page(bvec_iter_page((bvec), (iter)),\
+bvec_iter_page_idx((bvec), (iter)))
+
 #define segment_iter_bvec(bvec, iter)  \
 ((struct bio_vec) {\
.bv_page= segment_iter_page((bvec), (iter)),\
@@ -67,8 +84,6 @@ struct bvec_iter {
.bv_offset  = segment_iter_offset((bvec), (iter)),  \
 })
 
-#define bvec_iter_len  segment_iter_len
-
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
struct bvec_iter *iter, unsigned bytes)
 {
-- 
2.9.5



[Cluster-devel] [PATCH V13 06/19] block: introduce bio_for_each_bvec() and rq_for_each_bvec()

2019-01-11 Thread Ming Lei
bio_for_each_bvec() is used for iterating over multi-page bvecs in the bio
split & merge code.

rq_for_each_bvec() can be used by drivers that handle multi-page bvecs
directly; so far the loop driver is a perfect use case.
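
A minimal usage sketch, mirroring how the loop driver (patch 12) counts the
bvecs of a request (caller context hypothetical):

	struct req_iterator rq_iter;
	struct bio_vec bv;
	unsigned int nr_bvec = 0;

	/* each bv may span several physically contiguous pages */
	rq_for_each_bvec(bv, rq, rq_iter)
		nr_bvec++;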

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 include/linux/bio.h| 10 ++
 include/linux/blkdev.h |  4 
 include/linux/bvec.h   |  7 +++
 3 files changed, 21 insertions(+)

diff --git a/include/linux/bio.h b/include/linux/bio.h
index 16a65361535f..06888d45beb4 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -156,6 +156,16 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter,
 #define bio_for_each_segment(bvl, bio, iter)   \
__bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter)
 
+#define __bio_for_each_bvec(bvl, bio, iter, start) \
+   for (iter = (start);\
+(iter).bi_size &&  \
+   ((bvl = bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
+bio_advance_iter((bio), &(iter), (bvl).bv_len))
+
+/* iterate over multi-page bvec */
+#define bio_for_each_bvec(bvl, bio, iter)  \
+   __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)
+
 #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len)
 
 static inline unsigned bio_segments(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 338604dff7d0..7f4ca073e2f3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -797,6 +797,10 @@ struct req_iterator {
__rq_for_each_bio(_iter.bio, _rq)   \
bio_for_each_segment(bvl, _iter.bio, _iter.iter)
 
+#define rq_for_each_bvec(bvl, _rq, _iter)  \
+   __rq_for_each_bio(_iter.bio, _rq)   \
+   bio_for_each_bvec(bvl, _iter.bio, _iter.iter)
+
 #define rq_iter_last(bvec, _iter)  \
(_iter.bio->bi_next == NULL &&  \
 bio_iter_last(bvec, _iter.iter))
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index babc6316c117..d441486db605 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -65,6 +65,13 @@ struct bvec_iter {
 #define bvec_iter_page_idx(bvec, iter) \
(bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
 
+#define bvec_iter_bvec(bvec, iter) \
+((struct bio_vec) {\
+   .bv_page= bvec_iter_page((bvec), (iter)),   \
+   .bv_len = bvec_iter_len((bvec), (iter)),\
+   .bv_offset  = bvec_iter_offset((bvec), (iter)), \
+})
+
 /* For building single-page bvec(segment) in flight */
  #define segment_iter_offset(bvec, iter)   \
(bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
-- 
2.9.5



[Cluster-devel] [PATCH V13 07/19] block: use bio_for_each_bvec() to compute multi-page bvec count

2019-01-11 Thread Ming Lei
First, it is more efficient to use bio_for_each_bvec() in both
blk_bio_segment_split() and __blk_recalc_rq_segments() to compute how
many multi-page bvecs there are in the bio.

Second, once bio_for_each_bvec() is used, a bvec may need to be split
because its length can be much longer than the max segment size, so we have
to split the big bvec into several segments.

Third, when splitting a multi-page bvec into segments, the max segment
limit may be reached, so the bio split needs to be considered in this
situation too.
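
A hypothetical walkthrough of bvec_split_segs() (all limits are example
values):

	/*
	 * Assume queue_max_segment_size(q) == 32768, a permissive segment
	 * boundary, and plenty of room in queue_max_segments(q). For a
	 * 65536-byte bvec at offset 0, the loop produces two 32768-byte
	 * segments, so the helper adds 2 to *nsegs, adds 128 to *sectors
	 * (65536 >> 9), and returns false: no bio split is needed in the
	 * middle of this bvec.
	 */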

Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 99 ---
 1 file changed, 79 insertions(+), 20 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index f85d878f313d..abe1c89c1253 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -161,6 +161,69 @@ static inline unsigned get_max_io_size(struct request_queue *q,
return sectors;
 }
 
+static unsigned get_max_segment_size(struct request_queue *q,
+unsigned offset)
+{
+   unsigned long mask = queue_segment_boundary(q);
+
+   return min_t(unsigned long, mask - (mask & offset) + 1,
+queue_max_segment_size(q));
+}
+
+/*
+ * Split the bvec @bv into segments, and update all kinds of
+ * variables.
+ */
+static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
+   unsigned *nsegs, unsigned *last_seg_size,
+   unsigned *front_seg_size, unsigned *sectors)
+{
+   unsigned len = bv->bv_len;
+   unsigned total_len = 0;
+   unsigned new_nsegs = 0, seg_size = 0;
+
+   /*
+    * A multi-page bvec may be too big to fit in one segment, so the
+    * current bvec has to be split into multiple segments.
+    */
+   while (len && new_nsegs + *nsegs < queue_max_segments(q)) {
+   seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+   seg_size = min(seg_size, len);
+
+   new_nsegs++;
+   total_len += seg_size;
+   len -= seg_size;
+
+   if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+   break;
+   }
+
+   if (!new_nsegs)
+   return !!len;
+
+   /* update front segment size */
+   if (!*nsegs) {
+   unsigned first_seg_size;
+
+   if (new_nsegs == 1)
+   first_seg_size = get_max_segment_size(q, bv->bv_offset);
+   else
+   first_seg_size = queue_max_segment_size(q);
+
+   if (*front_seg_size < first_seg_size)
+   *front_seg_size = first_seg_size;
+   }
+
+   /* update other variables */
+   *last_seg_size = seg_size;
+   *nsegs += new_nsegs;
+   if (sectors)
+   *sectors += total_len >> 9;
+
+   /* split in the middle of the bvec if len != 0 */
+   return !!len;
+}
+
 static struct bio *blk_bio_segment_split(struct request_queue *q,
 struct bio *bio,
 struct bio_set *bs,
@@ -174,7 +237,7 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
 
-   bio_for_each_segment(bv, bio, iter) {
+   bio_for_each_bvec(bv, bio, iter) {
/*
 * If the queue doesn't support SG gaps and adding this
 * offset would create a gap, disallow it.
@@ -189,8 +252,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
 */
if (nsegs < queue_max_segments(q) &&
sectors < max_sectors) {
-   nsegs++;
-   sectors = max_sectors;
+   /* split in the middle of bvec */
+   bv.bv_len = (max_sectors - sectors) << 9;
+   bvec_split_segs(q, &bv, &nsegs,
+   &seg_size,
+   &front_seg_size,
+   &sectors);
}
goto split;
}
@@ -212,14 +279,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (nsegs == queue_max_segments(q))
goto split;
 
-   if (nsegs == 1 && seg_size > front_seg_size)
-   front_seg_size = seg_size;
-
-   nsegs++;
bvprv = bv;
bvprvp = &bv;
-   seg_size = bv.bv_len;
-   sectors += bv.bv_len >> 9;
+
+   if (bvec_split_segs(q, &bv, &nsegs, &seg_size,
+   &front_seg_size, &sectors))
+   goto split;
 
}
 

[Cluster-devel] [PATCH V13 03/19] block: remove bvec_iter_rewind()

2019-01-11 Thread Ming Lei
Commit 7759eb23fd980 ("block: remove bio_rewind_iter()") removed
bio_rewind_iter(); since then, no one uses bvec_iter_rewind() any more,
so remove it.

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 include/linux/bvec.h | 24 
 1 file changed, 24 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 02c73c6aa805..ba0ae40e77c9 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -92,30 +92,6 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
return true;
 }
 
-static inline bool bvec_iter_rewind(const struct bio_vec *bv,
-struct bvec_iter *iter,
-unsigned int bytes)
-{
-   while (bytes) {
-   unsigned len = min(bytes, iter->bi_bvec_done);
-
-   if (iter->bi_bvec_done == 0) {
-   if (WARN_ONCE(iter->bi_idx == 0,
- "Attempted to rewind iter beyond "
- "bvec's boundaries\n")) {
-   return false;
-   }
-   iter->bi_idx--;
-   iter->bi_bvec_done = __bvec_iter_bvec(bv, *iter)->bv_len;
-   continue;
-   }
-   bytes -= len;
-   iter->bi_size += len;
-   iter->bi_bvec_done -= len;
-   }
-   return true;
-}
-
 #define for_each_bvec(bvl, bio_vec, iter, start)   \
for (iter = (start);\
 (iter).bi_size &&  \
-- 
2.9.5



[Cluster-devel] [PATCH V13 04/19] block: rename bvec helpers

2019-01-11 Thread Ming Lei
We will support multi-page bvec soon, and have to deal with
single-page vs multi-page bvec. This patch follows Christoph's
suggestion to rename all the following helpers:

for_each_bvec
bvec_iter_bvec
bvec_iter_len
bvec_iter_page
bvec_iter_offset

into:
for_each_segment
segment_iter_bvec
segment_iter_len
segment_iter_page
segment_iter_offset

so that the helpers named with 'segment' only deal with single-page bvecs,
also called segments. We will introduce helpers named with 'bvec' for
multi-page bvecs.

bvec_iter_advance() isn't renamed because this helper always operates on the
real bvec even when multi-page bvecs are supported.

Acked-by: Miguel Ojeda 
Reviewed-by: Christoph Hellwig 
Reviewed-by: Omar Sandoval 
Suggested-by: Christoph Hellwig 
Signed-off-by: Ming Lei 
---
 .clang-format  |  2 +-
 drivers/md/dm-integrity.c  |  2 +-
 drivers/md/dm-io.c |  4 ++--
 drivers/nvdimm/blk.c   |  4 ++--
 drivers/nvdimm/btt.c   |  4 ++--
 include/linux/bio.h| 10 +-
 include/linux/bvec.h   | 20 +++-
 include/linux/ceph/messenger.h |  2 +-
 lib/iov_iter.c |  2 +-
 net/ceph/messenger.c   | 14 +++---
 10 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/.clang-format b/.clang-format
index e6080f5834a3..049200fbab94 100644
--- a/.clang-format
+++ b/.clang-format
@@ -120,7 +120,7 @@ ForEachMacros:
   - 'for_each_available_child_of_node'
   - 'for_each_bio'
   - 'for_each_board_func_rsrc'
-  - 'for_each_bvec'
+  - 'for_each_segment'
   - 'for_each_child_of_node'
   - 'for_each_clear_bit'
   - 'for_each_clear_bit_from'
diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c
index 457200ca6287..046b7785e3f6 100644
--- a/drivers/md/dm-integrity.c
+++ b/drivers/md/dm-integrity.c
@@ -1574,7 +1574,7 @@ static bool __journal_read_write(struct dm_integrity_io *dio, struct bio *bio,
char *tag_ptr = journal_entry_tag(ic, je);
 
if (bip) do {
-   struct bio_vec biv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+   struct bio_vec biv = segment_iter_bvec(bip->bip_vec, bip->bip_iter);
unsigned tag_now = min(biv.bv_len, tag_todo);
char *tag_addr;
BUG_ON(PageHighMem(biv.bv_page));
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 81ffc59d05c9..d72ec2bdd333 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -208,8 +208,8 @@ static void list_dp_init(struct dpages *dp, struct page_list *pl, unsigned offse
 static void bio_get_page(struct dpages *dp, struct page **p,
 unsigned long *len, unsigned *offset)
 {
-   struct bio_vec bvec = bvec_iter_bvec((struct bio_vec *)dp->context_ptr,
-dp->context_bi);
+   struct bio_vec bvec = segment_iter_bvec((struct bio_vec *)dp->context_ptr,
+   dp->context_bi);
 
*p = bvec.bv_page;
*len = bvec.bv_len;
diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c
index db45c6bbb7bb..dfae945216bb 100644
--- a/drivers/nvdimm/blk.c
+++ b/drivers/nvdimm/blk.c
@@ -89,9 +89,9 @@ static int nd_blk_rw_integrity(struct nd_namespace_blk *nsblk,
struct bio_vec bv;
void *iobuf;
 
-   bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+   bv = segment_iter_bvec(bip->bip_vec, bip->bip_iter);
/*
-* The 'bv' obtained from bvec_iter_bvec has its .bv_len and
+* The 'bv' obtained from segment_iter_bvec has its .bv_len and
 * .bv_offset already adjusted for iter->bi_bvec_done, and we
 * can use those directly
 */
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index b123b0dcf274..2bbbc90c7b91 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1154,9 +1154,9 @@ static int btt_rw_integrity(struct btt *btt, struct bio_integrity_payload *bip,
struct bio_vec bv;
void *mem;
 
-   bv = bvec_iter_bvec(bip->bip_vec, bip->bip_iter);
+   bv = segment_iter_bvec(bip->bip_vec, bip->bip_iter);
/*
-* The 'bv' obtained from bvec_iter_bvec has its .bv_len and
+* The 'bv' obtained from segment_iter_bvec has its .bv_len and
 * .bv_offset already adjusted for iter->bi_bvec_done, and we
 * can use those directly
 */
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 72b4f7be2106..16a65361535f 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -48,14 +48,14 @@
 #define 

[Cluster-devel] [PATCH V13 02/19] block: don't use bio->bi_vcnt to figure out segment number

2019-01-11 Thread Ming Lei
It is wrong to use bio->bi_vcnt to figure out how many segments there are in
the bio, even when the CLONED flag isn't set on it, because the bio may have
been split or advanced.

So always use bio_segments() in blk_recount_segments(). This shouldn't cause
any performance loss now, because the physical segment number is figured out
in blk_queue_split() and BIO_SEG_VALID has been set there since
bdced438acd83ad83a6c ("block: setup bi_phys_segments after splitting").
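
A sketch of why bi_vcnt goes stale (hypothetical caller context; fs_bio_set
is used only as an example bio_set):

	/*
	 * Splitting clones the front part and advances the original bio's
	 * iterator; the bvec table, and thus bi_vcnt, is left untouched.
	 */
	struct bio *front = bio_split(bio, sectors, GFP_NOIO, &fs_bio_set);

	/*
	 * Here bio->bi_vcnt still describes the full table, while
	 * bio_segments(bio) walks bio->bi_iter and counts only the
	 * remaining part, which is what blk_recount_segments() needs.
	 */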

Reviewed-by: Omar Sandoval 
Reviewed-by: Christoph Hellwig 
Fixes: 76d8137a3113 ("blk-merge: recaculate segment if it isn't less than max 
segments")
Signed-off-by: Ming Lei 
---
 block/blk-merge.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 71e9ac03f621..f85d878f313d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -367,13 +367,7 @@ void blk_recalc_rq_segments(struct request *rq)
 
 void blk_recount_segments(struct request_queue *q, struct bio *bio)
 {
-   unsigned short seg_cnt;
-
-   /* estimate segment number by bi_vcnt for non-cloned bio */
-   if (bio_flagged(bio, BIO_CLONED))
-   seg_cnt = bio_segments(bio);
-   else
-   seg_cnt = bio->bi_vcnt;
+   unsigned short seg_cnt = bio_segments(bio);
 
if (test_bit(QUEUE_FLAG_NO_SG_MERGE, >queue_flags) &&
(seg_cnt < queue_max_segments(q)))
-- 
2.9.5



[Cluster-devel] [PATCH V13 00/19] block: support multi-page bvec

2019-01-11 Thread Ming Lei
Hi,

This patchset brings multi-page bvec into block layer:

1) what is multi-page bvec?

A multi-page bvec means that one 'struct bio_vec' can hold multiple
physically contiguous pages, instead of the single page the Linux kernel
has used for a long time.

2) why is multi-page bvec introduced?

Kent proposed the idea[1] first. 

As systems' RAM becomes much bigger than before, and huge pages, transparent
huge pages and memory compaction are widely used, it is now fairly common
to see physically contiguous pages coming from filesystems in I/O. On the
other hand, from the block layer's view, it isn't necessary to store the
intermediate pages in the bvec; it is enough to store just the physically
contiguous 'segment' in each io vector.

Also, huge pages are being brought to filesystems and swap [2][6], and we
can do I/O on a huge page each time [3], which requires that one bio be able
to transfer at least one huge page at a time. It turns out that simply
changing BIO_MAX_PAGES isn't flexible [3][5]; multi-page bvecs fit this case
very well. As we saw, if CONFIG_THP_SWAP is enabled, BIO_MAX_PAGES can be
configured much bigger, such as 512, which requires at least two 4K pages
just for holding the bvec table.

With multi-page bvec:

- Inside the block layer, both bio splitting and sg mapping become more
efficient than before, by traversing the physically contiguous 'segment'
instead of each page.

- segment handling in the block layer can be improved much in the future,
since it should be quite easy to convert a multi-page bvec into a segment.
For example, we might just store the segment in each bvec directly.

- bio size can be increased, which should improve some high-bandwidth I/O
cases in theory [4].

- there is an opportunity to improve the memory footprint of bvecs in the
future.

3) how is multi-page bvec implemented in this patchset?

Patches 1 ~ 4 prepare for supporting multi-page bvecs.

Patches 5 ~ 15 implement multi-page bvecs in the block layer:

- put all tricks into the bvec/bio/rq iterators; as long as drivers and
filesystems use these standard iterators, they are happy with multi-page
bvecs

- introduce bio_for_each_bvec() to iterate over multi-page bvecs for
splitting bios and mapping sg

- keep the current bio_for_each_segment*() helpers to iterate over
single-page bvecs and make sure current users won't be broken; especially,
the conversion to the new helper prototype is done in one single patch,
given it is basically a mechanical conversion

- deal with iomap & xfs's sub-pagesize io vec in patch 13

- enable multi-page bvecs in patch 14

Patch 16 redefines BIO_MAX_PAGES as 256.

Patch 17 documents the usage of the bio iterator helpers.

Patches 18~19 kill NO_SG_MERGE.

These patches can be found in the following git tree:

git:  https://github.com/ming1/linux.git  for-4.21-block-mp-bvec-V12

Lots of tests (blktest, xfstests, ltp io, ...) have been run with this
patchset, and no regression was observed.

Thanks to Christoph for reviewing the early versions and providing very good
suggestions, such as introducing bio_init_with_vec_table(), removing other
unnecessary helpers for cleanup, and so on.

Thanks to Christoph and Omar for reviewing V10/V11/V12 and providing lots of
helpful comments.

V13:
- rebase on v5.0-rc2
- address Omar's comment on patch 1 of V12 by using V11's approach
- rename one local variable in patch 15 as suggested by Christoph

V12:
- deal with non-cluster by max segment size & segment boundary limit
- rename bvec helper's name
- revert new change on bvec_iter_advance() in V11
- introduce rq_for_each_bvec()
- use simpler check on enabling multi-page bvec
- fix documentation change

V11:
- address most of the reviews from Omar and Christoph
- rename mp_bvec_* as segment_* helpers
- remove 'mp' parameter from bvec_iter_advance() and related helpers
- cleanup patch on bvec_split_segs() and blk_bio_segment_split(),
  remove unnecessary checks
- simplify bvec_last_segment()
- drop bio_pages_all()
- introduce dedicated functions/file for handling non-cluster bios, to
avoid checking queue cluster before adding a page to a bio
- introduce bio_try_merge_segment() for simplifying iomap/xfs page
  accounting code
- fix documentation change

V10:
- no any code change, just add more guys and list into patch's CC list,
as suggested by Christoph and Dave Chinner
V9:
- fix regression on iomap's sub-pagesize io vec, covered by patch 13
V8:
- remove prepare patches which all are merged to linus tree
- rebase on for-4.21/block
- address comments on V7
- add patches of killing NO_SG_MERGE

V7:
- include Christoph and Mike's bio_clone_bioset() patches, which is
  actually prepare patches for multipage bvec
- address Christoph's comments

V6:
- avoid to introduce lots of renaming, follow Jen's 

[Cluster-devel] [PATCH V13 01/19] btrfs: look at bi_size for repair decisions

2019-01-11 Thread Ming Lei
From: Christoph Hellwig 

bio_readpage_error currently uses bi_vcnt to decide if it is worth
retrying an I/O.  But the vector count is mostly an implementation
artifact - it really should figure out if there is more than a
single sector worth retrying.  Use bi_size for that and shift by
PAGE_SHIFT.  This really should be blocks/sectors, but given that
btrfs doesn't support a sector size different from the PAGE_SIZE
using the page size keeps the changes to a minimum.

Reviewed-by: Omar Sandoval 
Reviewed-by: David Sterba 
Signed-off-by: Christoph Hellwig 
---
 fs/btrfs/extent_io.c | 2 +-
 include/linux/bio.h  | 6 --
 2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 52abe4082680..dc8ba3ee515d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2350,7 +2350,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
int read_mode = 0;
blk_status_t status;
int ret;
-   unsigned failed_bio_pages = bio_pages_all(failed_bio);
+   unsigned failed_bio_pages = failed_bio->bi_iter.bi_size >> PAGE_SHIFT;
 
BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7380b094dcca..72b4f7be2106 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -263,12 +263,6 @@ static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
bv->bv_len = iter.bi_bvec_done;
 }
 
-static inline unsigned bio_pages_all(struct bio *bio)
-{
-   WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-   return bio->bi_vcnt;
-}
-
 static inline struct bio_vec *bio_first_bvec_all(struct bio *bio)
 {
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
-- 
2.9.5