[Cluster-devel] GFS2: Pre-pull patch posting (fixes)
Hi, here are a couple of patches which between them fix a problem where occasionally it was possible for the GFS2 module to be unloaded before all the glocks were deallocated, which, needless to say, made the slab allocator unhappy. Steve.
[Cluster-devel] [PATCH 1/2] GFS2: Wait for unlock completion on umount
This patch adds a wait on umount between the point at which we dispose of all glocks and the point at which we unmount the lock protocol. This ensures that we've received all the replies to our unlock requests before we stop the locking. Signed-off-by: Steven Whitehouse swhit...@redhat.com Reported-by: Fabio M. Di Nitto fdini...@redhat.com --- fs/gfs2/incore.h |2 ++ fs/gfs2/lock_dlm.c |7 ++- fs/gfs2/ops_fstype.c |2 ++ fs/gfs2/super.c |3 +++ 4 files changed, 13 insertions(+), 1 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4792200..bc0ad15 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -544,6 +544,8 @@ struct gfs2_sbd { struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_trans_gl; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; /* Inode Stuff */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 46df988..cdd0755 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -21,6 +21,7 @@ static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; unsigned ret = gl-gl_state; + struct gfs2_sbd *sdp = gl-gl_sbd; BUG_ON(gl-gl_lksb.sb_flags DLM_SBF_DEMOTED); @@ -30,6 +31,8 @@ static void gdlm_ast(void *arg) switch (gl-gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ kmem_cache_free(gfs2_glock_cachep, gl); + if (atomic_dec_and_test(sdp-sd_glock_disposal)) + wake_up(sdp-sd_glock_wait); return; case -DLM_ECANCEL: /* Cancel while getting lock */ ret |= LM_OUT_CANCELED; @@ -167,7 +170,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) { struct gfs2_glock *gl = ptr; - struct lm_lockstruct *ls = gl-gl_sbd-sd_lockstruct; + struct gfs2_sbd *sdp = gl-gl_sbd; + struct lm_lockstruct *ls = sdp-sd_lockstruct; int error; if (gl-gl_lksb.sb_lkid == 0) { @@ -183,6 +187,7 @@ static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) (unsigned long long)gl-gl_name.ln_number, error); return; } + 
atomic_inc(sdp-sd_glock_disposal); } static void gdlm_cancel(struct gfs2_glock *gl) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index edfee24..9390fc7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -82,6 +82,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(sdp-sd_tune); + init_waitqueue_head(sdp-sd_glock_wait); + atomic_set(sdp-sd_glock_disposal, 0); spin_lock_init(sdp-sd_statfs_spin); spin_lock_init(sdp-sd_rindex_spin); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c282ad4..66242b3 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -21,6 +21,7 @@ #include linux/gfs2_ondisk.h #include linux/crc32.h #include linux/time.h +#include linux/wait.h #include gfs2.h #include incore.h @@ -860,6 +861,8 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + /* Wait for dlm to reply to all our unlock requests */ + wait_event(sdp-sd_glock_wait, atomic_read(sdp-sd_glock_disposal) == 0); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); -- 1.6.2.5
[Cluster-devel] [PATCH 2/2] GFS2: Extend umount wait coverage to full glock lifetime
Although all glocks are, by the time of the umount glock wait, scheduled for demotion, some of them haven't made it far enough through the process for the original set of waiting code to wait for them. This extends the ref count to the whole glock lifetime in order to ensure that the waiting does catch all glocks. It does make it a bit more invasive, but it seems the only sensible solution at the moment. Signed-off-by: Steven Whitehouse swhit...@redhat.com --- fs/gfs2/glock.c |4 fs/gfs2/glock.h |2 +- fs/gfs2/lock_dlm.c |6 +++--- fs/gfs2/ops_fstype.c | 10 +- fs/gfs2/super.c |2 -- 5 files changed, 17 insertions(+), 7 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f455a03..f426633 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -769,6 +769,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!gl) return -ENOMEM; + atomic_inc(sdp-sd_glock_disposal); gl-gl_flags = 0; gl-gl_name = name; atomic_set(gl-gl_ref, 1); @@ -1538,6 +1539,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) up_write(gfs2_umount_flush_sem); msleep(10); } + flush_workqueue(glock_workqueue); + wait_event(sdp-sd_glock_wait, atomic_read(sdp-sd_glock_disposal) == 0); + gfs2_dump_lockstate(sdp); } void gfs2_glock_finish_truncate(struct gfs2_inode *ip) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 13f0bd2..c0262fa 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -123,7 +123,7 @@ struct lm_lockops { int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); - void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); + void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); unsigned int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index cdd0755..0e5e0e7 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -167,15 +167,16 @@ 
static unsigned int gdlm_lock(struct gfs2_glock *gl, return LM_OUT_ASYNC; } -static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) +static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) { - struct gfs2_glock *gl = ptr; struct gfs2_sbd *sdp = gl-gl_sbd; struct lm_lockstruct *ls = sdp-sd_lockstruct; int error; if (gl-gl_lksb.sb_lkid == 0) { kmem_cache_free(cachep, gl); + if (atomic_dec_and_test(sdp-sd_glock_disposal)) + wake_up(sdp-sd_glock_wait); return; } @@ -187,7 +188,6 @@ static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) (unsigned long long)gl-gl_name.ln_number, error); return; } - atomic_inc(sdp-sd_glock_disposal); } static void gdlm_cancel(struct gfs2_glock *gl) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 9390fc7..8a102f7 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -985,9 +985,17 @@ static const match_table_t nolock_tokens = { { Opt_err, NULL }, }; +static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl-gl_sbd; + kmem_cache_free(cachep, gl); + if (atomic_dec_and_test(sdp-sd_glock_disposal)) + wake_up(sdp-sd_glock_wait); +} + static const struct lm_lockops nolock_ops = { .lm_proto_name = lock_nolock, - .lm_put_lock = kmem_cache_free, + .lm_put_lock = nolock_put_lock, .lm_tokens = nolock_tokens, }; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 66242b3..b9dd3da 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -861,8 +861,6 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); - /* Wait for dlm to reply to all our unlock requests */ - wait_event(sdp-sd_glock_wait, atomic_read(sdp-sd_glock_disposal) == 0); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); -- 1.6.2.5
[Cluster-devel] GFS2: Pull request (fixes)
Hi, Please consider pulling the following two changes, Steve. The following changes since commit 1a45dcfe2525e9432cb4aba461d4994fc2befe42: Linus Torvalds (1): Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block are available in the git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-fixes.git master Steven Whitehouse (2): GFS2: Wait for unlock completion on umount GFS2: Extend umount wait coverage to full glock lifetime fs/gfs2/glock.c |4 fs/gfs2/glock.h |2 +- fs/gfs2/incore.h |2 ++ fs/gfs2/lock_dlm.c | 11 --- fs/gfs2/ops_fstype.c | 12 +++- fs/gfs2/super.c |1 + 6 files changed, 27 insertions(+), 5 deletions(-)
[Cluster-devel] conga/luci/init.d luci
CVSROOT:/cvs/cluster Module name:conga Branch: RHEL5 Changes by: rmcc...@sourceware.org 2010-02-04 23:36:35 Modified files: luci/init.d: luci Log message: conga: fix bz469881 Give luci more time to start up. This should fix problems with the luci init script when luci is running on a heavily loaded (or just very slow) machine. Patches: http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/init.d/luci.diff?cvsroot=clusteronly_with_tag=RHEL5r1=1.11.2.5r2=1.11.2.6 --- conga/luci/init.d/luci 2008/04/11 06:50:32 1.11.2.5 +++ conga/luci/init.d/luci 2010/02/04 23:36:34 1.11.2.6 @@ -138,7 +138,7 @@ fi stop_luci - max_wait=10 + max_wait=25 cur_wait=0 while [ $cur_wait -lt $max_wait ]; do sleep 1 @@ -150,7 +150,11 @@ done if [ $? -ne 0 ]; then - errmsg='Failed to stop luci' + if [ $cur_wait -eq $max_wait ]; then + errmsg=Luci did not stop after $max_wait seconds. + else + errmsg='Failed to stop luci' + fi return 1 fi return 0 @@ -170,7 +174,7 @@ sh $LUCID /dev/null cur_wait=0 - max_wait=10 + max_wait=25 luci_running ret=$? while [ $ret -ne 1 ] [ $cur_wait -lt $max_wait ]; do @@ -181,7 +185,11 @@ done if [ $ret -ne 1 ]; then - errmsg='An error occurred while starting luci' + if [ $cur_wait -eq $max_wait ]; then + errmsg=Luci did not start after $max_wait seconds. + else + errmsg='An error occurred while starting luci' + fi stop_luci return 1 fi
[Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync
Currently gfs2 ordered buffer writes use WRITE_SYNC_PLUG as the IO type being dispatched. They aren't sync writes; we issue all the IO pending, then wait for it all. In other words, this is async IO with a bulk wait on the end. We should use normal WRITE tagging for this, and before we start waiting make sure that all the IO is issued by unplugging the device. The use of normal WRITEs for these buffers should significantly reduce the overhead of processing in the cfq elevator and enable the disk subsystem to get much closer to disk bandwidth for large sequential writes. Signed-off-by: Dave Chinner dchin...@redhat.com --- fs/gfs2/aops.c |3 +++ fs/gfs2/log.c | 11 +++ fs/gfs2/lops.c | 18 ++ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 7b8da94..b75784c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,6 +20,7 @@ #include linux/swap.h #include linux/gfs2_ondisk.h #include linux/backing-dev.h +#include linux/blkdev.h #include gfs2.h #include incore.h @@ -34,6 +35,7 @@ #include super.h #include util.h #include glops.h +#include trace_gfs2.h static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, @@ -52,6 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, if (gfs2_is_jdata(ip)) set_buffer_uptodate(bh); gfs2_trans_add_bh(ip-i_gl, bh, 0); + trace_gfs2_submit_bh(bh, WRITE, __func__); } } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index bd26dff..a9797be 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -18,6 +18,7 @@ #include linux/kthread.h #include linux/freezer.h #include linux/bio.h +#include linux/blkdev.h #include gfs2.h #include incore.h @@ -121,8 +122,8 @@ __acquires(sdp-sd_log_lock) lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh-b_end_io = end_buffer_write_sync; - trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__); - submit_bh(WRITE_SYNC_PLUG, bh); + trace_gfs2_submit_bh(bh, WRITE, __func__); + submit_bh(WRITE, bh); } else { unlock_buffer(bh); 
brelse(bh); @@ -675,8 +676,8 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) lock_buffer(bh); if (buffer_mapped(bh) test_clear_buffer_dirty(bh)) { bh-b_end_io = end_buffer_write_sync; - trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__); - submit_bh(WRITE_SYNC_PLUG, bh); + trace_gfs2_submit_bh(bh, WRITE, __func__); + submit_bh(WRITE, bh); } else { unlock_buffer(bh); brelse(bh); @@ -692,6 +693,8 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) struct gfs2_bufdata *bd; struct buffer_head *bh; + blk_run_backing_dev(blk_get_backing_dev_info(sdp-sd_vfs-s_bdev), NULL); + gfs2_log_lock(sdp); while (!list_empty(sdp-sd_log_le_ordered)) { bd = list_entry(sdp-sd_log_le_ordered.prev, struct gfs2_bufdata, bd_le.le_list); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 7278cf0..0fe2f3c 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -15,6 +15,7 @@ #include linux/gfs2_ondisk.h #include linux/bio.h #include linux/fs.h +#include linux/blkdev.h #include gfs2.h #include incore.h @@ -198,8 +199,8 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); - trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__); - submit_bh(WRITE_SYNC_PLUG, bh); + trace_gfs2_submit_bh(bh, WRITE, __func__); + submit_bh(WRITE, bh); gfs2_log_lock(sdp); n = 0; @@ -209,8 +210,8 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); lock_buffer(bd2-bd_bh); bh = gfs2_log_fake_buf(sdp, bd2-bd_bh); - trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__); - submit_bh(WRITE_SYNC_PLUG, bh); + trace_gfs2_submit_bh(bh, WRITE, __func__); + submit_bh(WRITE, bh); gfs2_log_lock(sdp); if (++n = num) break; @@ -220,6 +221,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) total -= num; } gfs2_log_unlock(sdp); + blk_run_backing_dev(blk_get_backing_dev_info(sdp-sd_vfs-s_bdev), NULL); } static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct
[Cluster-devel] (no subject)
These patches improve sequential write IO patterns and reduce ordered write log contention. The first patch is simply for diagnosis purposes - it enabled me to see where IO was being dispatched from, and led directly to the fix in the second patch. The third patch removes the use of WRITE_SYNC_PLUG for async writes (data, metadata and log), and the fourth moves the AIL pushing out from under the log lock so that incoming writes can still proceed while the log is being flushed. The difference: on a local disk where XFS can do 85MB/s sequential write, gfs2 can do (cfq / noop): vanilla: 38MB/s / 48MB/s; +2: 48MB/s / 65MB/s; +3: 48MB/s / 65MB/s; +4: 51MB/s / 75MB/s. The improvement is due to the IO patterns resulting in the disk being IO bound, and the subsequent improvements in IO patterns directly translate into more throughput. On a faster 4-disk dm stripe array on the same machine, where XFS can do 265MB/s (@ 550iop/s) sequential write, gfs2 can do (cfq / noop): vanilla: 135MB/s @ 400iop/s / 130MB/s @ 800iop/s; +4: 135MB/s @ 400iop/s / 130MB/s @ 500iop/s. No improvement or degradation in throughput is seen here as the disks never get to being IO bound - the write is cpu bound. However, there is an improvement in iops seen on the noop scheduler as a result of the improvement in IO dispatch patterns. The patches have not seen much testing, so this is really just a posting for comments/feedback at this point.