Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-16 Thread Bob Peterson
- Original Message -
> On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse 
> wrote:
> > That looks good to me, and I assume that it should be faster too?
> 
> I did some tests with a directory tree with 3439 directories and 51556
> files in it.
> 
> In that tree, 47313 or 86% of the 54995 files and directories ended up
> with readahead enabled, while 7682 didn't.
> 
> On a slow SATA magnetic disc, I compared the time required for running
> "find", then "find | xargs stat", then "find | xargs getfattr". I
> booted with selinux=0 for these measurements, so "find" and "find |
> xargs stat" didn't use the xattrs.
> 
> The results are attached as a spreadsheet. They show that xattr
> readahead can save quite some time (here, about -75%). The optimized
> version of xattr readahead is about 3% faster than the simple version
> in this test, so I'll leave it up to you whether that's worth it for
> you.
> 
> It makes no difference whether the scheduler is "noop" or "deadline"
> here; the "noop" scheduler still seems to merge adjacent I/O requests
> (which makes sense).
> 
> Andreas
> 
Hi,

Thanks. Both patches are now applied to the for-next branch of the linux-gfs2 
tree:
https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next&id=c8d577038449a718ad0027d1790b6ef4441715d4
https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next&id=843396ad9886458bddbc7d07137b2975876b029e

Regards,

Bob Peterson
Red Hat File Systems



Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-13 Thread Andreas Gruenbacher
On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse  wrote:
> That looks good to me, and I assume that it should be faster too?

I did some tests with a directory tree with 3439 directories and 51556
files in it.

In that tree, 47313 or 86% of the 54995 files and directories ended up
with readahead enabled, while 7682 didn't.

On a slow SATA magnetic disc, I compared the time required for running
"find", then "find | xargs stat", then "find | xargs getfattr". I
booted with selinux=0 for these measurements, so "find" and "find |
xargs stat" didn't use the xattrs.

The results are attached as a spreadsheet. They show that xattr
readahead can save quite some time (here, about -75%). The optimized
version of xattr readahead is about 3% faster than the simple version
in this test, so I'll leave it up to you whether that's worth it for
you.

It makes no difference whether the scheduler is "noop" or "deadline"
here; the "noop" scheduler still seems to merge adjacent I/O requests
(which makes sense).

Andreas


gfs2-readahead.ods
Description: application/vnd.oasis.opendocument.spreadsheet


Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-13 Thread Bob Peterson
- Original Message -
> Here is an updated version of this patch, please review.
> 
> Thanks,
> Andreas
> 
> --
> 
> Instead of submitting a READ_SYNC bio for the inode and a READA bio for
> the inode's extended attributes through submit_bh, submit a single READ
> bio for both through submit_bio when possible.  This can be more
> efficient on some kinds of block devices.
> 
> Signed-off-by: Andreas Gruenbacher 
> ---
>  fs/gfs2/meta_io.c | 81
>  ++-
>  1 file changed, 63 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
> index 0f24828..e137d96 100644
> --- a/fs/gfs2/meta_io.c
> +++ b/fs/gfs2/meta_io.c
> @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock
> *gl, u64 blkno)
>   return bh;
>  }
>  
> -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
> +static void gfs2_meta_read_endio(struct bio *bio)
>  {
> - struct buffer_head *bh;
> + struct bio_vec *bvec;
> + int i;
> +
> + bio_for_each_segment_all(bvec, bio, i) {
> + struct page *page = bvec->bv_page;
> + struct buffer_head *bh = page_buffers(page);
> + unsigned int len = bvec->bv_len;
> +
> + while (bh_offset(bh) < bvec->bv_offset)
> + bh = bh->b_this_page;
> + do {
> + struct buffer_head *next = bh->b_this_page;
> + len -= bh->b_size;
> + bh->b_end_io(bh, !bio->bi_error);
> + bh = next;
> + } while (bh && len);
> + }
> + bio_put(bio);
> +}
>  
> - bh = gfs2_getbuf(gl, blkno, 1);
> - lock_buffer(bh);
> - if (buffer_uptodate(bh)) {
> - unlock_buffer(bh);
> - brelse(bh);
> +/*
> + * Submit several consecutive buffer head I/O requests as a single bio I/O
> + * request.  (See submit_bh_wbc.)
> + */
> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
> +{
> + struct buffer_head *bh = bhs[0];
> + struct bio *bio;
> + int i;
> +
> + if (!num)
>   return;
> +
> + bio = bio_alloc(GFP_NOIO, num);
> + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
> + bio->bi_bdev = bh->b_bdev;
> + for (i = 0; i < num; i++) {
> + bh = bhs[i];
> + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
>   }
> - bh->b_end_io = end_buffer_read_sync;
> - submit_bh(READA | REQ_META | REQ_PRIO, bh);
> + bio->bi_end_io = gfs2_meta_read_endio;
> + submit_bio(rw, bio);
>  }
>  
>  /**
> @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int
> flags,
>  int rahead, struct buffer_head **bhp)
>  {
>   struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
> - struct buffer_head *bh;
> + struct buffer_head *bh, *bhs[2];
> + int num = 0;
>  
>   if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
>   *bhp = NULL;
> @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno,
> int flags,
>   lock_buffer(bh);
>   if (buffer_uptodate(bh)) {
>   unlock_buffer(bh);
> - if (rahead)
> - gfs2_meta_readahead(gl, blkno + 1);
> - return 0;
> + flags &= ~DIO_WAIT;
> + } else {
> + bh->b_end_io = end_buffer_read_sync;
> + get_bh(bh);
> + bhs[num++] = bh;
>   }
> - bh->b_end_io = end_buffer_read_sync;
> - get_bh(bh);
> - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
> - if (rahead)
> - gfs2_meta_readahead(gl, blkno + 1);
> +
> + if (rahead) {
> + bh = gfs2_getbuf(gl, blkno + 1, CREATE);
> +
> + lock_buffer(bh);
> + if (buffer_uptodate(bh)) {
> + unlock_buffer(bh);
> + brelse(bh);
> + } else {
> + bh->b_end_io = end_buffer_read_sync;
> + bhs[num++] = bh;
> + }
> + }
> +
> + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
>   if (!(flags & DIO_WAIT))
>   return 0;
>  
> + bh = *bhp;
>   wait_on_buffer(bh);
>   if (unlikely(!buffer_uptodate(bh))) {
>   struct gfs2_trans *tr = current->journal_info;
> --
> 2.5.0
> 
> 
Hi,

ACK to both patches

Looks good. I'll hold onto them until this merge window closes.

Regards,

Bob Peterson
Red Hat File Systems



Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-12 Thread Andreas Gruenbacher
Here is an updated version of this patch, please review.

Thanks,
Andreas

--

Instead of submitting a READ_SYNC bio for the inode and a READA bio for
the inode's extended attributes through submit_bh, submit a single READ
bio for both through submit_bio when possible.  This can be more
efficient on some kinds of block devices.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/meta_io.c | 81 ++-
 1 file changed, 63 insertions(+), 18 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0f24828..e137d96 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, 
u64 blkno)
return bh;
 }
 
-static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
+static void gfs2_meta_read_endio(struct bio *bio)
 {
-   struct buffer_head *bh;
+   struct bio_vec *bvec;
+   int i;
+
+   bio_for_each_segment_all(bvec, bio, i) {
+   struct page *page = bvec->bv_page;
+   struct buffer_head *bh = page_buffers(page);
+   unsigned int len = bvec->bv_len;
+
+   while (bh_offset(bh) < bvec->bv_offset)
+   bh = bh->b_this_page;
+   do {
+   struct buffer_head *next = bh->b_this_page;
+   len -= bh->b_size;
+   bh->b_end_io(bh, !bio->bi_error);
+   bh = next;
+   } while (bh && len);
+   }
+   bio_put(bio);
+}
 
-   bh = gfs2_getbuf(gl, blkno, 1);
-   lock_buffer(bh);
-   if (buffer_uptodate(bh)) {
-   unlock_buffer(bh);
-   brelse(bh);
+/*
+ * Submit several consecutive buffer head I/O requests as a single bio I/O
+ * request.  (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+   struct buffer_head *bh = bhs[0];
+   struct bio *bio;
+   int i;
+
+   if (!num)
return;
+
+   bio = bio_alloc(GFP_NOIO, num);
+   bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+   bio->bi_bdev = bh->b_bdev;
+   for (i = 0; i < num; i++) {
+   bh = bhs[i];
+   bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}
-   bh->b_end_io = end_buffer_read_sync;
-   submit_bh(READA | REQ_META | REQ_PRIO, bh);
+   bio->bi_end_io = gfs2_meta_read_endio;
+   submit_bio(rw, bio);
 }
 
 /**
@@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int 
flags,
   int rahead, struct buffer_head **bhp)
 {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-   struct buffer_head *bh;
+   struct buffer_head *bh, *bhs[2];
+   int num = 0;
 
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
*bhp = NULL;
@@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int 
flags,
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
-   if (rahead)
-   gfs2_meta_readahead(gl, blkno + 1);
-   return 0;
+   flags &= ~DIO_WAIT;
+   } else {
+   bh->b_end_io = end_buffer_read_sync;
+   get_bh(bh);
+   bhs[num++] = bh;
}
-   bh->b_end_io = end_buffer_read_sync;
-   get_bh(bh);
-   submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
-   if (rahead)
-   gfs2_meta_readahead(gl, blkno + 1);
+
+   if (rahead) {
+   bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+   lock_buffer(bh);
+   if (buffer_uptodate(bh)) {
+   unlock_buffer(bh);
+   brelse(bh);
+   } else {
+   bh->b_end_io = end_buffer_read_sync;
+   bhs[num++] = bh;
+   }
+   }
+
+   gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
 
+   bh = *bhp;
wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh))) {
struct gfs2_trans *tr = current->journal_info;
-- 
2.5.0



Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-12 Thread Steven Whitehouse

Hi,

On 12/11/15 20:15, Andreas Gruenbacher wrote:

Here is an updated version of this patch, please review.

Thanks,
Andreas


That looks good to me, and I assume that it should be faster too?

Steve.



--

Instead of submitting a READ_SYNC bio for the inode and a READA bio for
the inode's extended attributes through submit_bh, submit a single READ
bio for both through submit_bio when possible.  This can be more
efficient on some kinds of block devices.

Signed-off-by: Andreas Gruenbacher 
---
  fs/gfs2/meta_io.c | 81 ++-
  1 file changed, 63 insertions(+), 18 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0f24828..e137d96 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, 
u64 blkno)
return bh;
  }
  
-static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)

+static void gfs2_meta_read_endio(struct bio *bio)
  {
-   struct buffer_head *bh;
+   struct bio_vec *bvec;
+   int i;
+
+   bio_for_each_segment_all(bvec, bio, i) {
+   struct page *page = bvec->bv_page;
+   struct buffer_head *bh = page_buffers(page);
+   unsigned int len = bvec->bv_len;
+
+   while (bh_offset(bh) < bvec->bv_offset)
+   bh = bh->b_this_page;
+   do {
+   struct buffer_head *next = bh->b_this_page;
+   len -= bh->b_size;
+   bh->b_end_io(bh, !bio->bi_error);
+   bh = next;
+   } while (bh && len);
+   }
+   bio_put(bio);
+}
  
-	bh = gfs2_getbuf(gl, blkno, 1);

-   lock_buffer(bh);
-   if (buffer_uptodate(bh)) {
-   unlock_buffer(bh);
-   brelse(bh);
+/*
+ * Submit several consecutive buffer head I/O requests as a single bio I/O
+ * request.  (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+   struct buffer_head *bh = bhs[0];
+   struct bio *bio;
+   int i;
+
+   if (!num)
return;
+
+   bio = bio_alloc(GFP_NOIO, num);
+   bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+   bio->bi_bdev = bh->b_bdev;
+   for (i = 0; i < num; i++) {
+   bh = bhs[i];
+   bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
}
-   bh->b_end_io = end_buffer_read_sync;
-   submit_bh(READA | REQ_META | REQ_PRIO, bh);
+   bio->bi_end_io = gfs2_meta_read_endio;
+   submit_bio(rw, bio);
  }
  
  /**

@@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int 
flags,
   int rahead, struct buffer_head **bhp)
  {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-   struct buffer_head *bh;
+   struct buffer_head *bh, *bhs[2];
+   int num = 0;
  
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {

*bhp = NULL;
@@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int 
flags,
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
-   if (rahead)
-   gfs2_meta_readahead(gl, blkno + 1);
-   return 0;
+   flags &= ~DIO_WAIT;
+   } else {
+   bh->b_end_io = end_buffer_read_sync;
+   get_bh(bh);
+   bhs[num++] = bh;
}
-   bh->b_end_io = end_buffer_read_sync;
-   get_bh(bh);
-   submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
-   if (rahead)
-   gfs2_meta_readahead(gl, blkno + 1);
+
+   if (rahead) {
+   bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+   lock_buffer(bh);
+   if (buffer_uptodate(bh)) {
+   unlock_buffer(bh);
+   brelse(bh);
+   } else {
+   bh->b_end_io = end_buffer_read_sync;
+   bhs[num++] = bh;
+   }
+   }
+
+   gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
  
+	bh = *bhp;

wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh))) {
struct gfs2_trans *tr = current->journal_info;




Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-12 Thread Steven Whitehouse

Hi,

On 01/11/15 19:02, Andreas Gruenbacher wrote:

Instead of submitting separate bio for the inode and its extended
attributes, submit a single bio for both when possible.  The entire
request becomes a normal read, not a readahead.

To keep track of the buffer heads that make up the bio, we allocate
temporary buffer head arrays: in the endio handler, it would also be
possible to compute the buffer head block numbers from the bio and to
grab the buffer heads with gfs2_getbuf, but the code would become even
messier.
---
  fs/gfs2/meta_io.c | 94 ---
  1 file changed, 76 insertions(+), 18 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0f24828..e650127 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, 
u64 blkno)
return bh;
  }
  
-static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)

+struct gfs2_meta_read {
+   int num;
+   struct buffer_head *bhs[0];
+};
+
+static void gfs2_meta_read_endio(struct bio *bio) {
+   struct gfs2_meta_read *r = bio->bi_private;
+   int i;
+
+   for (i = 0; i < r->num; i++) {
+   struct buffer_head *bh = r->bhs[i];
+
+   if (unlikely(bio_flagged(bio, BIO_QUIET)))
+   set_bit(BH_Quiet, &bh->b_state);
+
+   bh->b_end_io(bh, !bio->bi_error);
+   }
+   bio_put(bio);
+   kfree(r);
+}
+
+/*
+ * (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
  {
-   struct buffer_head *bh;
+   struct gfs2_meta_read *r;
+   struct buffer_head *bh = bhs[0];
+   struct bio *bio;
+   int i;
  
-	bh = gfs2_getbuf(gl, blkno, 1);

-   lock_buffer(bh);
-   if (buffer_uptodate(bh)) {
-   unlock_buffer(bh);
-   brelse(bh);
+   if (!num)
+   return;
+
+   if (num == 1) {
+   bh->b_end_io = end_buffer_read_sync;
+   submit_bh(rw, bh);
return;
}
-   bh->b_end_io = end_buffer_read_sync;
-   submit_bh(READA | REQ_META | REQ_PRIO, bh);
+
+   r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]),
+   GFP_NOIO | __GFP_NOFAIL);
Can we avoid doing this I wonder? I would like to avoid adding 
GFP_NOFAIL allocations, and I think it should be possible to figure out 
where the buffers are using the bio itself (i.e. iterating over it, 
similar to gfs2_end_log_write() in lops.c) so that we don't need to 
allocate this additional memory. Otherwise this looks like the right 
approach I think,


Steve.


+   r->num = num;
+   for (i = 0; i < num; i++)
+   r->bhs[i] = bhs[i];
+
+   bio = bio_alloc(GFP_NOIO, num);
+   bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+   bio->bi_bdev = bh->b_bdev;
+   for (i = 0; i < num; i++) {
+   bh = bhs[i];
+   bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+   }
+   bio->bi_end_io = gfs2_meta_read_endio;
+   bio->bi_private = r;
+
+   submit_bio(rw, bio);
  }
  
  /**

@@ -216,7 +260,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int 
flags,
   int rahead, struct buffer_head **bhp)
  {
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-   struct buffer_head *bh;
+   struct buffer_head *bh, *bhs[2];
+   int num = 0;
  
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {

*bhp = NULL;
@@ -228,18 +273,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int 
flags,
lock_buffer(bh);
if (buffer_uptodate(bh)) {
unlock_buffer(bh);
-   if (rahead)
-   gfs2_meta_readahead(gl, blkno + 1);
-   return 0;
+   flags &= ~DIO_WAIT;
+   } else {
+   bh->b_end_io = end_buffer_read_sync;
+   get_bh(bh);
+   bhs[num++] = bh;
}
-   bh->b_end_io = end_buffer_read_sync;
-   get_bh(bh);
-   submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
-   if (rahead)
-   gfs2_meta_readahead(gl, blkno + 1);
+
+   if (rahead) {
+   bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+   lock_buffer(bh);
+   if (buffer_uptodate(bh)) {
+   unlock_buffer(bh);
+   brelse(bh);
+   } else {
+   bh->b_end_io = end_buffer_read_sync;
+   bhs[num++] = bh;
+   }
+   }
+
+   gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
if (!(flags & DIO_WAIT))
return 0;
  
+	bh = *bhp;

wait_on_buffer(bh);
if (unlikely(!buffer_uptodate(bh))) {
struct gfs2_trans *tr = current->journal_info;




Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization

2015-11-12 Thread Andreas Gruenbacher
On Thu, Nov 12, 2015 at 2:44 PM, Steven Whitehouse  wrote:
> Hi,
>
>
> On 01/11/15 19:02, Andreas Gruenbacher wrote:
>>
>> Instead of submitting separate bio for the inode and its extended
>> attributes, submit a single bio for both when possible.  The entire
>> request becomes a normal read, not a readahead.
>>
>> To keep track of the buffer heads that make up the bio, we allocate
>> temporary buffer head arrays: in the endio handler, it would also be
>> possible to compute the buffer head block numbers from the bio and to
>> grab the buffer heads with gfs2_getbuf, but the code would become even
>> messier.
>> ---
>>   fs/gfs2/meta_io.c | 94
>> ---
>>   1 file changed, 76 insertions(+), 18 deletions(-)
>>
>> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
>> index 0f24828..e650127 100644
>> --- a/fs/gfs2/meta_io.c
>> +++ b/fs/gfs2/meta_io.c
>> @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock
>> *gl, u64 blkno)
>> return bh;
>>   }
>>   -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno)
>> +struct gfs2_meta_read {
>> +   int num;
>> +   struct buffer_head *bhs[0];
>> +};
>> +
>> +static void gfs2_meta_read_endio(struct bio *bio) {
>> +   struct gfs2_meta_read *r = bio->bi_private;
>> +   int i;
>> +
>> +   for (i = 0; i < r->num; i++) {
>> +   struct buffer_head *bh = r->bhs[i];
>> +
>> +   if (unlikely(bio_flagged(bio, BIO_QUIET)))
>> +   set_bit(BH_Quiet, &bh->b_state);
>> +
>> +   bh->b_end_io(bh, !bio->bi_error);
>> +   }
>> +   bio_put(bio);
>> +   kfree(r);
>> +}
>> +
>> +/*
>> + * (See submit_bh_wbc.)
>> + */
>> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
>>   {
>> -   struct buffer_head *bh;
>> +   struct gfs2_meta_read *r;
>> +   struct buffer_head *bh = bhs[0];
>> +   struct bio *bio;
>> +   int i;
>>   - bh = gfs2_getbuf(gl, blkno, 1);
>> -   lock_buffer(bh);
>> -   if (buffer_uptodate(bh)) {
>> -   unlock_buffer(bh);
>> -   brelse(bh);
>> +   if (!num)
>> +   return;
>> +
>> +   if (num == 1) {
>> +   bh->b_end_io = end_buffer_read_sync;
>> +   submit_bh(rw, bh);
>> return;
>> }
>> -   bh->b_end_io = end_buffer_read_sync;
>> -   submit_bh(READA | REQ_META | REQ_PRIO, bh);
>> +
>> +   r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]),
>> +   GFP_NOIO | __GFP_NOFAIL);
>
> Can we avoid doing this I wonder? I would like to avoid adding GFP_NOFAIL
> allocations, and I think it should be possible to figure out where the
> buffers are using the bio itself (i.e. iterating over it, similar to
> gfs2_end_log_write() in lops.c) so that we don't need to allocate this
> additional memory.

Yes, that's better than going through gfs2_getbuf() again. I'll change that.

Thanks,
Andreas