[Cluster-devel] [PATCH V8 12/18] block: allow bio_for_each_segment_all() to iterate over multi-page bvec

2018-11-09 Thread Ming Lei
This patch introduces an extra iterator variable to bio_for_each_segment_all(),
so that bio_for_each_segment_all() can iterate over multi-page bvecs.

Given that it is just a mechanical and simple change to every
bio_for_each_segment_all() user, this patch does the tree-wide conversion in
one single patch, so that we can avoid introducing a temporary helper for the
conversion.
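
For illustration, each caller conversion follows the pattern below (a minimal
sketch, assuming the struct bvec_iter_all type and the updated macro signature
introduced elsewhere in this series; put_page() is just an example loop body):

	struct bio_vec *bvec;
	int i;
	struct bvec_iter_all iter_all;	/* new: on-stack per-page iterator state */

	/* Old form: an integer index was enough for single-page bvecs. */
	bio_for_each_segment_all(bvec, bio, i)
		put_page(bvec->bv_page);

	/* New form: the extra iterator lets the macro expand a multi-page
	 * bvec into per-page segments as the loop advances. */
	bio_for_each_segment_all(bvec, bio, i, iter_all)
		put_page(bvec->bv_page);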

Cc: linux-fsdevel@vger.kernel.org
Cc: Alexander Viro 
Cc: Shaohua Li 
Cc: linux-raid@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-btrfs@vger.kernel.org
Cc: David Sterba 
Cc: Darrick J. Wong 
Cc: Gao Xiang 
Cc: Christoph Hellwig 
Cc: Theodore Ts'o 
Cc: linux-ext4@vger.kernel.org
Cc: Coly Li 
Cc: linux-bcache@vger.kernel.org
Cc: Boaz Harrosh 
Cc: Bob Peterson 
Cc: cluster-devel@redhat.com
Signed-off-by: Ming Lei 
---
 block/bio.c   | 27 ++-
 block/blk-zoned.c |  1 +
 block/bounce.c|  6 --
 drivers/md/bcache/btree.c |  3 ++-
 drivers/md/dm-crypt.c |  3 ++-
 drivers/md/raid1.c|  3 ++-
 drivers/staging/erofs/data.c  |  3 ++-
 drivers/staging/erofs/unzip_vle.c |  3 ++-
 fs/block_dev.c|  6 --
 fs/btrfs/compression.c|  3 ++-
 fs/btrfs/disk-io.c|  3 ++-
 fs/btrfs/extent_io.c  | 12 
 fs/btrfs/inode.c  |  6 --
 fs/btrfs/raid56.c |  3 ++-
 fs/crypto/bio.c   |  3 ++-
 fs/direct-io.c|  4 +++-
 fs/exofs/ore.c|  3 ++-
 fs/exofs/ore_raid.c   |  3 ++-
 fs/ext4/page-io.c |  3 ++-
 fs/ext4/readpage.c|  3 ++-
 fs/f2fs/data.c|  9 ++---
 fs/gfs2/lops.c|  6 --
 fs/gfs2/meta_io.c |  3 ++-
 fs/iomap.c|  6 --
 fs/mpage.c|  3 ++-
 fs/xfs/xfs_aops.c |  5 +++--
 include/linux/bio.h   | 11 +--
 include/linux/bvec.h  | 31 +++
 28 files changed, 129 insertions(+), 46 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index d5368a445561..6486722d4d4b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1072,8 +1073,9 @@ static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_from_iter(bvec->bv_page,
@@ -1103,8 +1104,9 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
 {
int i;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
ssize_t ret;
 
ret = copy_page_to_iter(bvec->bv_page,
@@ -1126,8 +1128,9 @@ void bio_free_pages(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i)
+   bio_for_each_segment_all(bvec, bio, i, iter_all)
__free_page(bvec->bv_page);
 }
 EXPORT_SYMBOL(bio_free_pages);
@@ -1293,6 +1296,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
struct bio *bio;
int ret;
struct bio_vec *bvec;
+   struct bvec_iter_all iter_all;
 
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1366,7 +1370,7 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
 
  out_unmap:
-   bio_for_each_segment_all(bvec, bio, j) {
+   bio_for_each_segment_all(bvec, bio, j, iter_all) {
put_page(bvec->bv_page);
}
bio_put(bio);
@@ -1377,11 +1381,12 @@ static void __bio_unmap_user(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
/*
 * make sure we dirty pages we wrote to
 */
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
if (bio_data_dir(bio) == READ)
set_page_dirty_lock(bvec->bv_page);
 
@@ -1473,8 +1478,9 @@ static void bio_copy_kern_endio_read(struct bio *bio)
char *p = bio->bi_private;
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, bio, i) {
+   bio_for_each_segment_all(bvec, bio, i, iter_all) {
memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
p += bvec->bv_len;
}
@@ -1583,8 +1589,9 @@ void bio_set_pages_dirty(struct bio *bio)
 {
struct bio_vec *bvec;
int i;
+   struct bvec_iter_all iter_all;
 
-   bio_for_each_segment_all(bvec, 

Re: [Cluster-devel] [PATCH 0/3] dlm: fix various incorrect behaviors

2018-11-09 Thread Tycho Andersen
On Wed, Nov 07, 2018 at 04:20:42PM -0600, David Teigland wrote:
> On Fri, Nov 02, 2018 at 02:18:19PM -0600, Tycho Andersen wrote:
> > Hi,
> > 
> > here's a series to fix some bugs I noticed in the DLM. The third patch
> > in the series and maybe the first should probably go to stable, assuming
> > everyone agrees they're indeed bugs.
> > 
> > Thanks,
> > 
> > Tycho
> > 
> > Tycho Andersen (3):
> >   dlm: fix invalid free
> >   dlm: don't allow zero length names
> >   dlm: don't leak kernel pointer to userspace
> > 
> >  fs/dlm/lockspace.c | 2 +-
> >  fs/dlm/member.c| 5 +++--
> >  fs/dlm/user.c  | 2 +-
> >  3 files changed, 5 insertions(+), 4 deletions(-)
> 
> I've pushed these to linux-dlm next.

Great, thanks! Should we send 1 and 3 to stable?

Tycho



[Cluster-devel] [PATCH] gfs2: Fix metadata read-ahead during truncate (2)

2018-11-09 Thread Andreas Gruenbacher
The previous attempt to fix metadata read-ahead during truncate was
incorrect: for files with a height > 2 (i.e., larger than 1006989312 bytes
with a block size of 4096 bytes), read-ahead requests were not being issued
for some of the indirect blocks discovered while walking the metadata tree,
leading to significant slow-downs when deleting large files.  Fix that.
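
For reference, the height-2 threshold quoted above can be reproduced with the
quick check below (a back-of-the-envelope sketch, assuming the gfs2 on-disk
sizes of 232 bytes for the dinode and 24 bytes for the metadata header at a
4096-byte block size):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long bsize = 4096;
		unsigned long long diptrs = (bsize - 232) / 8;	/* 483 pointers in the dinode */
		unsigned long long inptrs = (bsize - 24) / 8;	/* 509 pointers per indirect block */

		/* Largest file addressable at height 2: dinode -> indirect -> data. */
		printf("%llu\n", diptrs * inptrs * bsize);	/* prints 1006989312 */
		return 0;
	}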

In addition, only issue read-ahead requests in the first pass through
the metadata tree, while deallocating data blocks.

Fixes: c3ce5aa9b0 ("gfs2: Fix metadata read-ahead during truncate")
Cc: stable@vger.kernel.org # v4.16+
Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/bmap.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5f3ea07ef5e2..38d88fcb6988 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1908,10 +1908,16 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
 			if (ret < 0)
 				goto out;
 
-			/* issue read-ahead on metadata */
-			if (mp.mp_aheight > 1) {
-				for (; ret > 1; ret--) {
-					metapointer_range(&mp, mp.mp_aheight - ret,
+			/* On the first pass, issue read-ahead on metadata. */
+			if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
+				unsigned int height = mp.mp_aheight - 1;
+
+				/* No read-ahead for data blocks. */
+				if (mp.mp_aheight - 1 == strip_h)
+					height--;
+
+				for (; height >= mp.mp_aheight - ret; height--) {
+					metapointer_range(&mp, height,
 							  start_list, start_aligned,
 							  end_list, end_aligned,
 							  &start, &end);
-- 
2.19.1.546.g028f9c799.dirty



Re: [Cluster-devel] [DLM PATCH] dlm: Don't swamp the CPU with callbacks queued during recovery

2018-11-09 Thread Steven Whitehouse

Hi,


On 08/11/18 19:04, Bob Peterson wrote:

Hi,

Before this patch, recovery would cause all callbacks to be delayed and
put on a queue, and afterward they were all queued to the callback
work queue. This patch does the same thing, but takes a break after
every 25 of them so that it won't swamp the CPU at the expense of
other RT processes like corosync.

Signed-off-by: Bob Peterson 
---
  fs/dlm/ast.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 562fa8c3edff..47ee66d70109 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -292,6 +292,8 @@ void dlm_callback_suspend(struct dlm_ls *ls)
 		flush_workqueue(ls->ls_callback_wq);
 }
 
+#define MAX_CB_QUEUE 25
+
 void dlm_callback_resume(struct dlm_ls *ls)
 {
 	struct dlm_lkb *lkb, *safe;
@@ -302,15 +304,23 @@ void dlm_callback_resume(struct dlm_ls *ls)
 	if (!ls->ls_callback_wq)
 		return;
 
+more:
 	mutex_lock(&ls->ls_cb_mutex);
 	list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
 		list_del_init(&lkb->lkb_cb_list);
 		queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
 		count++;
+		if (count == MAX_CB_QUEUE)
+			break;
 	}
 	mutex_unlock(&ls->ls_cb_mutex);
 
 	if (count)
 		log_rinfo(ls, "dlm_callback_resume %d", count);
+	if (count == MAX_CB_QUEUE) {
+		count = 0;
+		cond_resched();
+		goto more;
+	}
 }
  



While that is a good thing to do, it looks like the real culprit here
might be elsewhere. Look at what this is doing... it queues a large
number of work items while holding ls_cb_mutex, and the first thing
each work item does is take the lkb_cb_mutex. On a multi-core system
that creates a large number of work items all fighting each other (and
the thread that is trying to add new items) for the lock, so it is no
wonder it doesn't work efficiently.

If we called the callbacks directly here, then we would avoid all that
fighting for the mutex and also remove the need to schedule the work
items in the first place. That should greatly decrease the amount of
CPU time required and reduce both latency and contention on the mutex.
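
For illustration, a rough, untested sketch of that suggestion might look
something like the following, where dlm_callback_work() is assumed to be the
existing static work handler in fs/dlm/ast.c (this only shows the shape of the
idea, not a proposed patch):

	void dlm_callback_resume(struct dlm_ls *ls)
	{
		struct dlm_lkb *lkb, *safe;
		int count = 0;

		clear_bit(LSFL_CB_DELAY, &ls->ls_flags);

		if (!ls->ls_callback_wq)
			return;

		mutex_lock(&ls->ls_cb_mutex);
		list_for_each_entry_safe(lkb, safe, &ls->ls_cb_delay, lkb_cb_list) {
			list_del_init(&lkb->lkb_cb_list);
			/* Run the callback inline instead of queueing a work item. */
			dlm_callback_work(&lkb->lkb_cb_work);
			count++;
			/* Still yield occasionally so RT tasks such as corosync can run. */
			if ((count % 25) == 0)
				cond_resched();
		}
		mutex_unlock(&ls->ls_cb_mutex);

		if (count)
			log_rinfo(ls, "dlm_callback_resume %d", count);
	}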


Steve.