Because s_vfs_rename_mutex is not cluster-wide, multiple nodes can reverse the roles of which directories are "old" and which are "new" for the purposes of rename. This can cause deadlocks where two nodes can lock old-then-new but since their roles are reversed, they wait for each other.
This patch fixes the problem by acquiring all gfs2_rename's inode glocks asychronously and waits for all glocks to be acquired. That way all inodes are locked regardless of the order. Signed-off-by: Bob Peterson <rpete...@redhat.com> --- fs/gfs2/glock.c | 40 ++++++++++++++++++++++++++++++++++++++++ fs/gfs2/glock.h | 1 + fs/gfs2/incore.h | 1 + fs/gfs2/inode.c | 13 +++++++++---- fs/gfs2/ops_fstype.c | 1 + 5 files changed, 52 insertions(+), 4 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index a27dbd3dec01..3334101c9921 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -305,6 +305,11 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) clear_bit(HIF_WAIT, &gh->gh_iflags); smp_mb__after_atomic(); wake_up_bit(&gh->gh_iflags, HIF_WAIT); + if (gh->gh_flags & GL_ASYNC) { + struct gfs2_sbd *sdp = gh->gh_gl->gl_name.ln_sbd; + + wake_up(&sdp->sd_async_glock_wait); + } } /** @@ -952,6 +957,41 @@ int gfs2_glock_wait(struct gfs2_holder *gh) return gh->gh_error; } +static int all_glocks_held(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_glock *gl = ghs[0].gh_gl; + int i; + + for (i = 0; i < num_gh; i++) { + if (test_bit(HIF_WAIT, &ghs[i].gh_iflags) || + !test_bit(HIF_HOLDER, &ghs[i].gh_iflags) || + gl->gl_state != ghs[i].gh_state) + return 0; + if (ghs[i].gh_error) + return ghs[i].gh_error; + } + return 1; +} + +/** + * gfs2_glock_async_wait - wait on multiple asynchronous glock acquisitions + * @gh: the glock holder + * + * Returns: 0 on success + */ + +int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs) +{ + struct gfs2_glock *gl = ghs[0].gh_gl; + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + might_sleep(); + if (!wait_event_timeout(sdp->sd_async_glock_wait, + all_glocks_held(num_gh, ghs), HZ * 2)) + return -ESTALE; + return 0; +} + /** * handle_callback - process a demote request * @gl: the glock diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index e4e0bed5257c..a997c42726a4 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -190,6 +190,7 @@ extern void gfs2_holder_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq(struct gfs2_holder *gh); extern int gfs2_glock_poll(struct gfs2_holder *gh); extern int gfs2_glock_wait(struct gfs2_holder *gh); +extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq(struct gfs2_holder *gh); extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a993d7c022e..6b450065b9d5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -725,6 +725,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_freeze_gl; struct work_struct sd_freeze_work; wait_queue_head_t sd_glock_wait; + wait_queue_head_t sd_async_glock_wait; atomic_t sd_glock_disposal; struct completion sd_locking_init; struct completion sd_wdack; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1ced4d0a3b04..cb184d6bed9b 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1388,16 +1388,18 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } num_gh = 1; - gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs); if (odip != ndip) { - gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE,GL_ASYNC, + ghs + num_gh); num_gh++; } - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, ghs + num_gh); num_gh++; if (nip) { - gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, GL_ASYNC, + ghs + num_gh); num_gh++; /* grab the resource lock for unlink flag twiddling * this is the case of the target file already existing @@ -1414,6 +1416,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) goto out_gunlock; } + error = gfs2_glock_async_wait(num_gh, ghs); + if (error) + goto out_gunlock; if (gfs2_holder_initialized(&rd_gh)) { error = gfs2_glock_nq(&rd_gh); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 4a8e5a7310f0..f3fd5cd9d43f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -87,6 +87,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); init_waitqueue_head(&sdp->sd_glock_wait); + init_waitqueue_head(&sdp->sd_async_glock_wait); atomic_set(&sdp->sd_glock_disposal, 0); init_completion(&sdp->sd_locking_init); init_completion(&sdp->sd_wdack); -- 2.21.0