Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
- Original Message - > On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse> wrote: > > That looks good to me, and I assume that it should be faster too? > > I did some tests with a directory tree with 3439 directories and 51556 > files in it. > > In that tree, 47313 or 86% of the 54995 files and directories ended up > with readahead enabled, while 7682 didn't. > > On a slow SATA magnetic disc, I compared the time required for running > "find", then "find | xargs stat", then "find | xargs getfattr". I > booted with selinux=0 for these measurements, so "find" and "find | > xargs stat" didn't use the xattrs. > > The results are attached as a spreadsheet. They show that xattr > readahead can save quite some time (here, about -75%). The optimized > version of xattr readahead is about 3% faster than the simple version > in this test, so I'll leave it up to you whether that's worth it for > you. > > It makes no difference whether the scheduler is "noop" or "deadline" > here; the "noop" scheduler still seems to merge adjacent I/O requests > (which makes sense). > > Andreas > Hi, Thanks. Both patches are now applied to the for-next branch of the linux-gfs2 tree: https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next=c8d577038449a718ad0027d1790b6ef4441715d4 https://git.kernel.org/cgit/linux/kernel/git/gfs2/linux-gfs2.git/commit/fs/gfs2?h=for-next=843396ad9886458bddbc7d07137b2975876b029e Regards, Bob Peterson Red Hat File Systems
Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
On Thu, Nov 12, 2015 at 9:33 PM, Steven Whitehouse wrote: > That looks good to me, and I assume that it should be faster too? I did some tests with a directory tree with 3439 directories and 51556 files in it. In that tree, 47313 or 86% of the 54995 files and directories ended up with readahead enabled, while 7682 didn't. On a slow SATA magnetic disc, I compared the time required for running "find", then "find | xargs stat", then "find | xargs getfattr". I booted with selinux=0 for these measurements, so "find" and "find | xargs stat" didn't use the xattrs. The results are attached as a spreadsheet. They show that xattr readahead can save quite some time (here, about -75%). The optimized version of xattr readahead is about 3% faster than the simple version in this test, so I'll leave it up to you whether that's worth it for you. It makes no difference whether the scheduler is "noop" or "deadline" here; the "noop" scheduler still seems to merge adjacent I/O requests (which makes sense). Andreas gfs2-readahead.ods Description: application/vnd.oasis.opendocument.spreadsheet
Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
- Original Message - > Here is an updated version of this patch, please review. > > Thanks, > Andreas > > -- > > Instead of submitting a READ_SYNC bio for the inode and a READA bio for > the inode's extended attributes through submit_bh, submit a single READ > bio for both strough submit_bio when possible. This can be more > efficient on some kinds of block devices. > > Signed-off-by: Andreas Gruenbacher> --- > fs/gfs2/meta_io.c | 81 > ++- > 1 file changed, 63 insertions(+), 18 deletions(-) > > diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c > index 0f24828..e137d96 100644 > --- a/fs/gfs2/meta_io.c > +++ b/fs/gfs2/meta_io.c > @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock > *gl, u64 blkno) > return bh; > } > > -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) > +static void gfs2_meta_read_endio(struct bio *bio) > { > - struct buffer_head *bh; > + struct bio_vec *bvec; > + int i; > + > + bio_for_each_segment_all(bvec, bio, i) { > + struct page *page = bvec->bv_page; > + struct buffer_head *bh = page_buffers(page); > + unsigned int len = bvec->bv_len; > + > + while (bh_offset(bh) < bvec->bv_offset) > + bh = bh->b_this_page; > + do { > + struct buffer_head *next = bh->b_this_page; > + len -= bh->b_size; > + bh->b_end_io(bh, !bio->bi_error); > + bh = next; > + } while (bh && len); > + } > + bio_put(bio); > +} > > - bh = gfs2_getbuf(gl, blkno, 1); > - lock_buffer(bh); > - if (buffer_uptodate(bh)) { > - unlock_buffer(bh); > - brelse(bh); > +/* > + * Submit several consecutive buffer head I/O requests as a single bio I/O > + * request. (See submit_bh_wbc.) 
> + */ > +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) > +{ > + struct buffer_head *bh = bhs[0]; > + struct bio *bio; > + int i; > + > + if (!num) > return; > + > + bio = bio_alloc(GFP_NOIO, num); > + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); > + bio->bi_bdev = bh->b_bdev; > + for (i = 0; i < num; i++) { > + bh = bhs[i]; > + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); > } > - bh->b_end_io = end_buffer_read_sync; > - submit_bh(READA | REQ_META | REQ_PRIO, bh); > + bio->bi_end_io = gfs2_meta_read_endio; > + submit_bio(rw, bio); > } > > /** > @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int > flags, > int rahead, struct buffer_head **bhp) > { > struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; > - struct buffer_head *bh; > + struct buffer_head *bh, *bhs[2]; > + int num = 0; > > if (unlikely(test_bit(SDF_SHUTDOWN, >sd_flags))) { > *bhp = NULL; > @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, > int flags, > lock_buffer(bh); > if (buffer_uptodate(bh)) { > unlock_buffer(bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > - return 0; > + flags &= ~DIO_WAIT; > + } else { > + bh->b_end_io = end_buffer_read_sync; > + get_bh(bh); > + bhs[num++] = bh; > } > - bh->b_end_io = end_buffer_read_sync; > - get_bh(bh); > - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); > - if (rahead) > - gfs2_meta_readahead(gl, blkno + 1); > + > + if (rahead) { > + bh = gfs2_getbuf(gl, blkno + 1, CREATE); > + > + lock_buffer(bh); > + if (buffer_uptodate(bh)) { > + unlock_buffer(bh); > + brelse(bh); > + } else { > + bh->b_end_io = end_buffer_read_sync; > + bhs[num++] = bh; > + } > + } > + > + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); > if (!(flags & DIO_WAIT)) > return 0; > > + bh = *bhp; > wait_on_buffer(bh); > if (unlikely(!buffer_uptodate(bh))) { > struct gfs2_trans *tr = current->journal_info; > -- > 2.5.0 > > Hi, ACK to both patches Looks good. 
I'll hold onto them until this merge window closes. Regards, Bob Peterson Red Hat File Systems
Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
Here is an updated version of this patch, please review. Thanks, Andreas -- Instead of submitting a READ_SYNC bio for the inode and a READA bio for the inode's extended attributes through submit_bh, submit a single READ bio for both strough submit_bio when possible. This can be more efficient on some kinds of block devices. Signed-off-by: Andreas Gruenbacher--- fs/gfs2/meta_io.c | 81 ++- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0f24828..e137d96 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +static void gfs2_meta_read_endio(struct bio *bio) { - struct buffer_head *bh; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh = page_buffers(page); + unsigned int len = bvec->bv_len; + + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + struct buffer_head *next = bh->b_this_page; + len -= bh->b_size; + bh->b_end_io(bh, !bio->bi_error); + bh = next; + } while (bh && len); + } + bio_put(bio); +} - bh = gfs2_getbuf(gl, blkno, 1); - lock_buffer(bh); - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - brelse(bh); +/* + * Submit several consecutive buffer head I/O requests as a single bio I/O + * request. (See submit_bh_wbc.) 
+ */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +{ + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; + + if (!num) return; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } - bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META | REQ_PRIO, bh); + bio->bi_end_io = gfs2_meta_read_endio; + submit_bio(rw, bio); } /** @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, >sd_flags))) { *bhp = NULL; @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; -- 2.5.0
Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
Hi, On 12/11/15 20:15, Andreas Gruenbacher wrote: Here is an updated version of this patch, please review. Thanks, Andreas That looks good to me, and I assume that it should be faster too? Steve. -- Instead of submitting a READ_SYNC bio for the inode and a READA bio for the inode's extended attributes through submit_bh, submit a single READ bio for both strough submit_bio when possible. This can be more efficient on some kinds of block devices. Signed-off-by: Andreas Gruenbacher--- fs/gfs2/meta_io.c | 81 ++- 1 file changed, 63 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0f24828..e137d96 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,19 +187,50 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +static void gfs2_meta_read_endio(struct bio *bio) { - struct buffer_head *bh; + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh = page_buffers(page); + unsigned int len = bvec->bv_len; + + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + struct buffer_head *next = bh->b_this_page; + len -= bh->b_size; + bh->b_end_io(bh, !bio->bi_error); + bh = next; + } while (bh && len); + } + bio_put(bio); +} - bh = gfs2_getbuf(gl, blkno, 1); - lock_buffer(bh); - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - brelse(bh); +/* + * Submit several consecutive buffer head I/O requests as a single bio I/O + * request. (See submit_bh_wbc.) 
+ */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +{ + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; + + if (!num) return; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); } - bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META | REQ_PRIO, bh); + bio->bi_end_io = gfs2_meta_read_endio; + submit_bio(rw, bio); } /** @@ -216,7 +247,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, >sd_flags))) { *bhp = NULL; @@ -228,18 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info;
Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
Hi, On 01/11/15 19:02, Andreas Gruenbacher wrote: Instead of submitting separate bio for the inode and its extended attributes, submit a single bio for both when possible. The entire request becomes a normal read, not a readahead. To keep track of the buffer heads that make up the bio, we allocate temporary buffer head arrays: in the endio handler, it would also be possible to compute the buffer head block numbers from the bio and to grab the buffer heads with gfs2_getbuf, but the code would become even messier. --- fs/gfs2/meta_io.c | 94 --- 1 file changed, 76 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0f24828..e650127 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) +struct gfs2_meta_read { + int num; + struct buffer_head *bhs[0]; +}; + +static void gfs2_meta_read_endio(struct bio *bio) { + struct gfs2_meta_read *r = bio->bi_private; + int i; + + for (i = 0; i < r->num; i++) { + struct buffer_head *bh = r->bhs[i]; + + if (unlikely(bio_flagged(bio, BIO_QUIET))) + set_bit(BH_Quiet, >b_state); + + bh->b_end_io(bh, !bio->bi_error); + } + bio_put(bio); + kfree(r); +} + +/* + * (See submit_bh_wbc.) + */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) { - struct buffer_head *bh; + struct gfs2_meta_read *r; + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; - bh = gfs2_getbuf(gl, blkno, 1); - lock_buffer(bh); - if (buffer_uptodate(bh)) { - unlock_buffer(bh); - brelse(bh); + if (!num) + return; + + if (num == 1) { + bh->b_end_io = end_buffer_read_sync; + submit_bh(rw, bh); return; } - bh->b_end_io = end_buffer_read_sync; - submit_bh(READA | REQ_META | REQ_PRIO, bh); + + r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]), + GFP_NOIO | __GFP_NOFAIL); Can we avoid doing this I wonder? 
I would like to avoid adding GFP_NOFAIL allocations, and I think it should be possible to figure out where the buffers are using the bio itself (i.e. iterating over it, similar to gfs2_end_log_write() in lops.c) so that we don't need to allocate this additional memory. Otherwise this looks like the right approach I think, Steve. + r->num = num; + for (i = 0; i < num; i++) + r->bhs[i] = bhs[i]; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + } + bio->bi_end_io = gfs2_meta_read_endio; + bio->bi_private = r; + + submit_bio(rw, bio); } /** @@ -216,7 +260,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, >sd_flags))) { *bhp = NULL; @@ -228,18 +273,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); - if (rahead) - gfs2_meta_readahead(gl, blkno + 1); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info;
Re: [Cluster-devel] [PATCH 2/2] gfs2: Extended attribute readahead optimization
On Thu, Nov 12, 2015 at 2:44 PM, Steven Whitehousewrote: > Hi, > > > On 01/11/15 19:02, Andreas Gruenbacher wrote: >> >> Instead of submitting separate bio for the inode and its extended >> attributes, submit a single bio for both when possible. The entire >> request becomes a normal read, not a readahead. >> >> To keep track of the buffer heads that make up the bio, we allocate >> temporary buffer head arrays: in the endio handler, it would also be >> possible to compute the buffer head block numbers from the bio and to >> grab the buffer heads with gfs2_getbuf, but the code would become even >> messier. >> --- >> fs/gfs2/meta_io.c | 94 >> --- >> 1 file changed, 76 insertions(+), 18 deletions(-) >> >> diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c >> index 0f24828..e650127 100644 >> --- a/fs/gfs2/meta_io.c >> +++ b/fs/gfs2/meta_io.c >> @@ -187,19 +187,63 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock >> *gl, u64 blkno) >> return bh; >> } >> -static void gfs2_meta_readahead(struct gfs2_glock *gl, u64 blkno) >> +struct gfs2_meta_read { >> + int num; >> + struct buffer_head *bhs[0]; >> +}; >> + >> +static void gfs2_meta_read_endio(struct bio *bio) { >> + struct gfs2_meta_read *r = bio->bi_private; >> + int i; >> + >> + for (i = 0; i < r->num; i++) { >> + struct buffer_head *bh = r->bhs[i]; >> + >> + if (unlikely(bio_flagged(bio, BIO_QUIET))) >> + set_bit(BH_Quiet, >b_state); >> + >> + bh->b_end_io(bh, !bio->bi_error); >> + } >> + bio_put(bio); >> + kfree(r); >> +} >> + >> +/* >> + * (See submit_bh_wbc.) 
>> + */ >> +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) >> { >> - struct buffer_head *bh; >> + struct gfs2_meta_read *r; >> + struct buffer_head *bh = bhs[0]; >> + struct bio *bio; >> + int i; >> - bh = gfs2_getbuf(gl, blkno, 1); >> - lock_buffer(bh); >> - if (buffer_uptodate(bh)) { >> - unlock_buffer(bh); >> - brelse(bh); >> + if (!num) >> + return; >> + >> + if (num == 1) { >> + bh->b_end_io = end_buffer_read_sync; >> + submit_bh(rw, bh); >> return; >> } >> - bh->b_end_io = end_buffer_read_sync; >> - submit_bh(READA | REQ_META | REQ_PRIO, bh); >> + >> + r = kmalloc(sizeof(*r) + num * sizeof(r->bhs[0]), >> + GFP_NOIO | __GFP_NOFAIL); > > Can we avoid doing this I wonder? I would like to avoid adding GFP_NOFAIL > allocations, and I think it should be possible to figure out where the > buffers are using the bio itself (i.e. iterating over it, similar to > gfs2_end_log_write() in lops.c) so that we don't need to allocate this > additional memory. Yes, that's better than going through gfs2_getbuf() again. I'll change that. Thanks, Andreas