[Cluster-devel] GFS2: Pre-pull patch posting (fixes)

2010-02-04 Thread Steven Whitehouse
Hi,

Here are a couple of patches which between them fix a problem where
occasionally it was possible for the GFS2 module to be unloaded
before all the glocks were deallocated, which, needless to say, made
the slab allocator unhappy.

Steve.



[Cluster-devel] [PATCH 1/2] GFS2: Wait for unlock completion on umount

2010-02-04 Thread Steven Whitehouse
This patch adds a wait on umount between the point at which we
dispose of all glocks and the point at which we unmount the
lock protocol. This ensures that we've received all the replies
to our unlock requests before we stop the locking.

Signed-off-by: Steven Whitehouse swhit...@redhat.com
Reported-by: Fabio M. Di Nitto fdini...@redhat.com
---
 fs/gfs2/incore.h |2 ++
 fs/gfs2/lock_dlm.c   |7 ++-
 fs/gfs2/ops_fstype.c |2 ++
 fs/gfs2/super.c  |3 +++
 4 files changed, 13 insertions(+), 1 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 4792200..bc0ad15 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -544,6 +544,8 @@ struct gfs2_sbd {
struct gfs2_holder sd_live_gh;
struct gfs2_glock *sd_rename_gl;
struct gfs2_glock *sd_trans_gl;
+   wait_queue_head_t sd_glock_wait;
+   atomic_t sd_glock_disposal;
 
/* Inode Stuff */
 
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 46df988..cdd0755 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -21,6 +21,7 @@ static void gdlm_ast(void *arg)
 {
struct gfs2_glock *gl = arg;
unsigned ret = gl-gl_state;
+   struct gfs2_sbd *sdp = gl-gl_sbd;
 
BUG_ON(gl-gl_lksb.sb_flags  DLM_SBF_DEMOTED);
 
@@ -30,6 +31,8 @@ static void gdlm_ast(void *arg)
switch (gl-gl_lksb.sb_status) {
case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */
kmem_cache_free(gfs2_glock_cachep, gl);
+   if (atomic_dec_and_test(sdp-sd_glock_disposal))
+   wake_up(sdp-sd_glock_wait);
return;
case -DLM_ECANCEL: /* Cancel while getting lock */
ret |= LM_OUT_CANCELED;
@@ -167,7 +170,8 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
 static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
 {
struct gfs2_glock *gl = ptr;
-   struct lm_lockstruct *ls = gl-gl_sbd-sd_lockstruct;
+   struct gfs2_sbd *sdp = gl-gl_sbd;
+   struct lm_lockstruct *ls = sdp-sd_lockstruct;
int error;
 
if (gl-gl_lksb.sb_lkid == 0) {
@@ -183,6 +187,7 @@ static void gdlm_put_lock(struct kmem_cache *cachep, void 
*ptr)
   (unsigned long long)gl-gl_name.ln_number, error);
return;
}
+   atomic_inc(sdp-sd_glock_disposal);
 }
 
 static void gdlm_cancel(struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index edfee24..9390fc7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -82,6 +82,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 
gfs2_tune_init(sdp-sd_tune);
 
+   init_waitqueue_head(sdp-sd_glock_wait);
+   atomic_set(sdp-sd_glock_disposal, 0);
spin_lock_init(sdp-sd_statfs_spin);
 
spin_lock_init(sdp-sd_rindex_spin);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c282ad4..66242b3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -21,6 +21,7 @@
 #include linux/gfs2_ondisk.h
 #include linux/crc32.h
 #include linux/time.h
+#include linux/wait.h
 
 #include gfs2.h
 #include incore.h
@@ -860,6 +861,8 @@ restart:
gfs2_jindex_free(sdp);
/*  Take apart glock structures and buffer lists  */
gfs2_gl_hash_clear(sdp);
+   /* Wait for dlm to reply to all our unlock requests */
+   wait_event(sdp-sd_glock_wait, atomic_read(sdp-sd_glock_disposal) == 
0);
/*  Unmount the locking protocol  */
gfs2_lm_unmount(sdp);
 
-- 
1.6.2.5



[Cluster-devel] [PATCH 2/2] GFS2: Extend umount wait coverage to full glock lifetime

2010-02-04 Thread Steven Whitehouse
Although all glocks are, by the time of the umount glock wait,
scheduled for demotion, some of them haven't made it far
enough through the process for the original set of waiting
code to wait for them.

This extends the ref count to the whole glock lifetime in order
to ensure that the waiting does catch all glocks. It does make
it a bit more invasive, but it seems the only sensible solution
at the moment.

Signed-off-by: Steven Whitehouse swhit...@redhat.com
---
 fs/gfs2/glock.c  |4 
 fs/gfs2/glock.h  |2 +-
 fs/gfs2/lock_dlm.c   |6 +++---
 fs/gfs2/ops_fstype.c |   10 +-
 fs/gfs2/super.c  |2 --
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f455a03..f426633 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -769,6 +769,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
if (!gl)
return -ENOMEM;
 
+   atomic_inc(sdp-sd_glock_disposal);
gl-gl_flags = 0;
gl-gl_name = name;
atomic_set(gl-gl_ref, 1);
@@ -1538,6 +1539,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
up_write(gfs2_umount_flush_sem);
msleep(10);
}
+   flush_workqueue(glock_workqueue);
+   wait_event(sdp-sd_glock_wait, atomic_read(sdp-sd_glock_disposal) == 
0);
+   gfs2_dump_lockstate(sdp);
 }
 
 void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 13f0bd2..c0262fa 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -123,7 +123,7 @@ struct lm_lockops {
int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
-   void (*lm_put_lock) (struct kmem_cache *cachep, void *gl);
+   void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl);
unsigned int (*lm_lock) (struct gfs2_glock *gl,
 unsigned int req_state, unsigned int flags);
void (*lm_cancel) (struct gfs2_glock *gl);
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index cdd0755..0e5e0e7 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -167,15 +167,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl,
return LM_OUT_ASYNC;
 }
 
-static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr)
+static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
 {
-   struct gfs2_glock *gl = ptr;
struct gfs2_sbd *sdp = gl-gl_sbd;
struct lm_lockstruct *ls = sdp-sd_lockstruct;
int error;
 
if (gl-gl_lksb.sb_lkid == 0) {
kmem_cache_free(cachep, gl);
+   if (atomic_dec_and_test(sdp-sd_glock_disposal))
+   wake_up(sdp-sd_glock_wait);
return;
}
 
@@ -187,7 +188,6 @@ static void gdlm_put_lock(struct kmem_cache *cachep, void 
*ptr)
   (unsigned long long)gl-gl_name.ln_number, error);
return;
}
-   atomic_inc(sdp-sd_glock_disposal);
 }
 
 static void gdlm_cancel(struct gfs2_glock *gl)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 9390fc7..8a102f7 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -985,9 +985,17 @@ static const match_table_t nolock_tokens = {
{ Opt_err, NULL },
 };
 
+static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl)
+{
+   struct gfs2_sbd *sdp = gl-gl_sbd;
+   kmem_cache_free(cachep, gl);
+   if (atomic_dec_and_test(sdp-sd_glock_disposal))
+   wake_up(sdp-sd_glock_wait);
+}
+
 static const struct lm_lockops nolock_ops = {
.lm_proto_name = lock_nolock,
-   .lm_put_lock = kmem_cache_free,
+   .lm_put_lock = nolock_put_lock,
.lm_tokens = nolock_tokens,
 };
 
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 66242b3..b9dd3da 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -861,8 +861,6 @@ restart:
gfs2_jindex_free(sdp);
/*  Take apart glock structures and buffer lists  */
gfs2_gl_hash_clear(sdp);
-   /* Wait for dlm to reply to all our unlock requests */
-   wait_event(sdp-sd_glock_wait, atomic_read(sdp-sd_glock_disposal) == 
0);
/*  Unmount the locking protocol  */
gfs2_lm_unmount(sdp);
 
-- 
1.6.2.5



[Cluster-devel] GFS2: Pull request (fixes)

2010-02-04 Thread Steven Whitehouse
Hi,

Please consider pulling the following two changes,

Steve.


The following changes since commit 1a45dcfe2525e9432cb4aba461d4994fc2befe42:
  Linus Torvalds (1):
Merge branch 'for-linus' of git://git.kernel.dk/linux-2.6-block

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-fixes.git master

Steven Whitehouse (2):
  GFS2: Wait for unlock completion on umount
  GFS2: Extend umount wait coverage to full glock lifetime

 fs/gfs2/glock.c  |4 
 fs/gfs2/glock.h  |2 +-
 fs/gfs2/incore.h |2 ++
 fs/gfs2/lock_dlm.c   |   11 ---
 fs/gfs2/ops_fstype.c |   12 +++-
 fs/gfs2/super.c  |1 +
 6 files changed, 27 insertions(+), 5 deletions(-)




[Cluster-devel] conga/luci/init.d luci

2010-02-04 Thread rmccabe
CVSROOT:/cvs/cluster
Module name:conga
Branch: RHEL5
Changes by: rmcc...@sourceware.org  2010-02-04 23:36:35

Modified files:
luci/init.d: luci 

Log message:
conga: fix bz469881

Give luci more time to start up. This should fix problems with the luci 
init script when luci is running on a heavily loaded (or just very slow) 
machine.

Patches:
http://sourceware.org/cgi-bin/cvsweb.cgi/conga/luci/init.d/luci.diff?cvsroot=clusteronly_with_tag=RHEL5r1=1.11.2.5r2=1.11.2.6

--- conga/luci/init.d/luci  2008/04/11 06:50:32 1.11.2.5
+++ conga/luci/init.d/luci  2010/02/04 23:36:34 1.11.2.6
@@ -138,7 +138,7 @@
fi
 
stop_luci
-   max_wait=10
+   max_wait=25
cur_wait=0
while [ $cur_wait -lt $max_wait ]; do
sleep 1
@@ -150,7 +150,11 @@
done
 
if [ $? -ne 0 ]; then
-   errmsg='Failed to stop luci'
+   if [ $cur_wait -eq $max_wait ]; then
+   errmsg=Luci did not stop after $max_wait seconds.
+   else
+   errmsg='Failed to stop luci'
+   fi
return 1
fi
return 0
@@ -170,7 +174,7 @@
sh $LUCID /dev/null 
 
cur_wait=0
-   max_wait=10
+   max_wait=25
luci_running
ret=$?
while [ $ret -ne 1 ]  [ $cur_wait -lt $max_wait ]; do
@@ -181,7 +185,11 @@
done
 
if [ $ret -ne 1 ]; then
-   errmsg='An error occurred while starting luci'
+   if [ $cur_wait -eq $max_wait ]; then
+   errmsg=Luci did not start after $max_wait seconds.
+   else
+   errmsg='An error occurred while starting luci'
+   fi
stop_luci
return 1
fi



[Cluster-devel] [PATCH 3/4] gfs2: ordered buffer writes are not sync

2010-02-04 Thread Dave Chinner
Currently gfs2 ordered buffer writes use WRITE_SYNC_PLUG as the IO
type being dispatched. They aren't sync writes; we issue all the IO
pending, then wait for it all. IOWs, this is async IO with a bulk
wait on the end.

We should use normal WRITE tagging for this, and before we start
waiting make sure that all the IO is issued by unplugging the
device. The use of normal WRITEs for these buffers should
significantly reduce the overhead of processing in the cfq elevator
and enable the disk subsystem to get much closer to disk bandwidth
for large sequential writes.

Signed-off-by: Dave Chinner dchin...@redhat.com
---
 fs/gfs2/aops.c |3 +++
 fs/gfs2/log.c  |   11 +++
 fs/gfs2/lops.c |   18 ++
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 7b8da94..b75784c 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -20,6 +20,7 @@
 #include linux/swap.h
 #include linux/gfs2_ondisk.h
 #include linux/backing-dev.h
+#include linux/blkdev.h
 
 #include gfs2.h
 #include incore.h
@@ -34,6 +35,7 @@
 #include super.h
 #include util.h
 #include glops.h
+#include trace_gfs2.h
 
 
 static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
@@ -52,6 +54,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, 
struct page *page,
if (gfs2_is_jdata(ip))
set_buffer_uptodate(bh);
gfs2_trans_add_bh(ip-i_gl, bh, 0);
+   trace_gfs2_submit_bh(bh, WRITE, __func__);
}
 }
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index bd26dff..a9797be 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -18,6 +18,7 @@
 #include linux/kthread.h
 #include linux/freezer.h
 #include linux/bio.h
+#include linux/blkdev.h
 
 #include gfs2.h
 #include incore.h
@@ -121,8 +122,8 @@ __acquires(sdp-sd_log_lock)
lock_buffer(bh);
if (test_clear_buffer_dirty(bh)) {
bh-b_end_io = end_buffer_write_sync;
-   trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, 
__func__);
-   submit_bh(WRITE_SYNC_PLUG, bh);
+   trace_gfs2_submit_bh(bh, WRITE, __func__);
+   submit_bh(WRITE, bh);
} else {
unlock_buffer(bh);
brelse(bh);
@@ -675,8 +676,8 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
lock_buffer(bh);
if (buffer_mapped(bh)  test_clear_buffer_dirty(bh)) {
bh-b_end_io = end_buffer_write_sync;
-   trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
-   submit_bh(WRITE_SYNC_PLUG, bh);
+   trace_gfs2_submit_bh(bh, WRITE, __func__);
+   submit_bh(WRITE, bh);
} else {
unlock_buffer(bh);
brelse(bh);
@@ -692,6 +693,8 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
struct gfs2_bufdata *bd;
struct buffer_head *bh;
 
+   blk_run_backing_dev(blk_get_backing_dev_info(sdp-sd_vfs-s_bdev), 
NULL);
+
gfs2_log_lock(sdp);
while (!list_empty(sdp-sd_log_le_ordered)) {
bd = list_entry(sdp-sd_log_le_ordered.prev, struct 
gfs2_bufdata, bd_le.le_list);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 7278cf0..0fe2f3c 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -15,6 +15,7 @@
 #include linux/gfs2_ondisk.h
 #include linux/bio.h
 #include linux/fs.h
+#include linux/blkdev.h
 
 #include gfs2.h
 #include incore.h
@@ -198,8 +199,8 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
}
 
gfs2_log_unlock(sdp);
-   trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
-   submit_bh(WRITE_SYNC_PLUG, bh);
+   trace_gfs2_submit_bh(bh, WRITE, __func__);
+   submit_bh(WRITE, bh);
gfs2_log_lock(sdp);
 
n = 0;
@@ -209,8 +210,8 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
gfs2_log_unlock(sdp);
lock_buffer(bd2-bd_bh);
bh = gfs2_log_fake_buf(sdp, bd2-bd_bh);
-   trace_gfs2_submit_bh(bh, WRITE_SYNC_PLUG, __func__);
-   submit_bh(WRITE_SYNC_PLUG, bh);
+   trace_gfs2_submit_bh(bh, WRITE, __func__);
+   submit_bh(WRITE, bh);
gfs2_log_lock(sdp);
if (++n = num)
break;
@@ -220,6 +221,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
total -= num;
}
gfs2_log_unlock(sdp);
+   blk_run_backing_dev(blk_get_backing_dev_info(sdp-sd_vfs-s_bdev), 
NULL);
 }
 
 static void buf_lo_after_commit(struct gfs2_sbd *sdp, struct 

[Cluster-devel] (no subject)

2010-02-04 Thread Dave Chinner
These patches improve sequential write IO patterns and reduce ordered
write log contention.

The first patch is simply for diagnosis purposes - it enabled me to
see where IO was being dispatched from, and led directly to the fix
in the second patch. The third patch removes the use of WRITE_SYNC_PLUG for
async writes (data, metadata and log), and the fourth moves the AIL pushing out
from under the log lock so that incoming writes can still proceed while
the log is being flushed.

The difference is on a local disk that XFS can do 85MB/s sequential
write, gfs2 can do:
cfq noop
vanilla 38MB/s  48MB/s
+2  48MB/s  65MB/s
+3  48MB/s  65MB/s
+4  51MB/s  75MB/s

The improvement is due to the IO patterns resulting in the disk being IO bound,
and the subsequent improvements in IO patterns directly translate into more
throughput.

On a faster 4-disk dm stripe array on the same machine that XFS can do 265MB/s
(@ 550iop/s) sequential write, gfs2 can do:

cfq  noop
vanilla 135MB/s @ 400iop/s  130MB/s @ 800iop/s
+4  135MB/s @ 400iop/s  130MB/s @ 500iop/s

No improvement or degradation in throughput is seen here as the disks never get
to being IO bound - the write is cpu bound. However, there is an improvement in
iops seen on no-op scheduler as a result of the improvement in IO dispatch
patterns.

The patches have not seen much testing, so this is really just a posting
for comments/feedback at this point.